From e3e0c7ed7a285e4a4872cd4b59e432fe5bae5bda Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 26 Apr 2026 13:19:43 -0400 Subject: [PATCH 1/5] Add public transfer artifact manifest --- .github/CONTRIBUTING.md | 3 +- .../upload-public-transfer-dataset.yaml | 26 ++ Makefile | 6 + README.md | 10 + docs/myst.yml | 1 + docs/public_transfer_dataset.md | 48 ++++ policyengine_uk_data/datasets/__init__.py | 2 + policyengine_uk_data/datasets/enhanced_cps.py | 1 + .../datasets/policybench_transfer.py | 8 +- .../storage/enhanced_cps_manifest_2025.json | 90 +++++++ .../storage/upload_public_transfer_dataset.py | 29 ++ .../storage/write_enhanced_cps_manifest.py | 7 + .../test_enhanced_cps_artifact_manifest.py | 137 ++++++++++ .../utils/enhanced_cps_manifest.py | 248 ++++++++++++++++++ uv.lock | 2 +- 15 files changed, 615 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/upload-public-transfer-dataset.yaml create mode 100644 docs/public_transfer_dataset.md create mode 100644 policyengine_uk_data/storage/enhanced_cps_manifest_2025.json create mode 100644 policyengine_uk_data/storage/upload_public_transfer_dataset.py create mode 100644 policyengine_uk_data/storage/write_enhanced_cps_manifest.py create mode 100644 policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py create mode 100644 policyengine_uk_data/utils/enhanced_cps_manifest.py diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index ffc952586..0705841ec 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -30,7 +30,8 @@ This repo builds the `.h5` files that feed `policyengine-uk`: The enhanced FRS dataset is licensed under strict UK Data Service terms. Violating them risks losing access, which would end PolicyEngine UK. -- **Never upload data to any public location.** The HuggingFace repo `policyengine/policyengine-uk-data-private` is private and authenticated. 
+- **Never upload FRS-derived or UKDS-licensed data to any public location.** The HuggingFace repo `policyengine/policyengine-uk-data-private` is private and authenticated. +- The public transfer artifacts documented in `docs/public_transfer_dataset.md` are the narrow exception. Upload them only through `make upload-public-transfer`, which targets the public repo intentionally. - **Never modify `upload_completed_datasets.py` or `utils/data_upload.py`** to change upload destinations without explicit confirmation from the data controller (currently Nikhil Woodruff). - **Never print, log, or output individual-level records.** Aggregates (sums, means, counts, weighted totals) are fine; individual rows are not. - **If you see a private/public repo split, assume it is intentional** — ask why before changing it. diff --git a/.github/workflows/upload-public-transfer-dataset.yaml b/.github/workflows/upload-public-transfer-dataset.yaml new file mode 100644 index 000000000..79d9827f5 --- /dev/null +++ b/.github/workflows/upload-public-transfer-dataset.yaml @@ -0,0 +1,26 @@ +name: Upload public transfer dataset + +on: + workflow_dispatch: + +jobs: + upload-public-transfer: + runs-on: ubuntu-latest + env: + HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }} + steps: + - uses: actions/checkout@v4 + - name: Install uv + uses: astral-sh/setup-uv@v5 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.13 + - name: Install package + run: uv pip install -e ".[dev]" --system + - name: Verify public transfer artifact contract + run: | + pytest policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py + pytest policyengine_uk_data/tests/test_policybench_transfer.py + - name: Upload public transfer artifacts + run: make upload-public-transfer diff --git a/Makefile b/Makefile index 30955d252..2e2bb699d 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,12 @@ download: upload: python policyengine_uk_data/storage/upload_completed_datasets.py 
+enhanced-cps-manifest: + python policyengine_uk_data/storage/write_enhanced_cps_manifest.py + +upload-public-transfer: + python policyengine_uk_data/storage/upload_public_transfer_dataset.py + documentation: pip install --pre "jupyter-book>=2" jb clean docs && jb build docs diff --git a/README.md b/README.md index 4f6de4a32..d0f8b2e07 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ This repo now also includes a public calibrated microdata file: - `policyengine_uk_data/storage/enhanced_cps_2025.h5` - source manifest: `policyengine_uk_data/storage/enhanced_cps_source_2025.csv` +- artifact manifest: `policyengine_uk_data/storage/enhanced_cps_manifest_2025.json` The public UK calibrated transfer dataset starts from a public export of eligible households from PolicyEngine-US Enhanced CPS. In the current build that source manifest contains @@ -47,11 +48,20 @@ This is a public calibrated dataset, not a replacement for the FRS or enhanced FRS. It is intended as the first step in a broader cross-country public-microdata strategy. +The legacy `policybench_transfer_2025.h5` and +`policybench_transfer_source_2025.csv` files remain the original +1,000-household proof-of-method artifacts. The Python +`create_policybench_transfer` and `save_policybench_transfer` entry points are +backward-compatible aliases for the current `enhanced_cps` builder, not a +request to regenerate the legacy 1,000-household files. 
+ Programmatic entrypoints: - `policyengine_uk_data.datasets.create_enhanced_cps` - `policyengine_uk_data.datasets.export_enhanced_cps_source` - `policyengine_uk_data.datasets.save_enhanced_cps` +- `make enhanced-cps-manifest` +- `make upload-public-transfer` Backward-compatible aliases remain available: diff --git a/docs/myst.yml b/docs/myst.yml index 29f21f0a3..1fd8ceb55 100644 --- a/docs/myst.yml +++ b/docs/myst.yml @@ -9,6 +9,7 @@ project: github: policyengine/policyengine-uk-data toc: - file: intro.md + - file: public_transfer_dataset.md - file: methodology.ipynb - file: imputations.md - file: validation/index.md diff --git a/docs/public_transfer_dataset.md b/docs/public_transfer_dataset.md new file mode 100644 index 000000000..9612ae6ca --- /dev/null +++ b/docs/public_transfer_dataset.md @@ -0,0 +1,48 @@ +# Public UK transfer dataset + +The public UK transfer dataset is an openly distributable benchmark artifact. +It is not native UK survey microdata. + +The 2025 artifact starts from a public export of benchmark-compatible +PolicyEngine US Enhanced CPS households. The builder maps those households into +UK-facing PolicyEngine inputs, assigns synthetic UK geography, populates input +leaves such as council tax bands, vehicle ownership, pensions, disability/PIP, +consumption, and capital gains, and recalibrates household weights to selected +UK national, regional, and country targets. + +The current public artifact is: + +- `policyengine_uk_data/storage/enhanced_cps_2025.h5` +- `policyengine_uk_data/storage/enhanced_cps_source_2025.csv` +- `policyengine_uk_data/storage/enhanced_cps_manifest_2025.json` + +The artifact manifest is the source of record for row counts, checksums, build +assumptions, weight diagnostics, and loss diagnostics. The checked-in 2025 +manifest reports 28,532 households in both the source CSV and H5 file, 58,848 +people in the H5 file, an effective sample size of about 11,197 households, and +a top-10 household-weight share of about 0.52%. 
+ +## Intended use + +Use this dataset for public demos, reproducible examples, and public benchmark +analysis where restricted UK microdata cannot be redistributed. + +Do not use this dataset as a substitute for FRS or enhanced FRS, as evidence of +the UK joint household distribution, or as administrative ground truth. Aggregate +calibration can improve target fit without recovering the native UK joint +distribution. + +## Versioning + +The public artifact should be cited by path, manifest, package version, and +checksum. The 2025 artifact uses a pinned USD-to-GBP exchange rate of 0.759 from +the IRS 2025 yearly average exchange-rate table. The builder deliberately does +not call a live foreign-exchange API. + +## Legacy files + +The `policybench_transfer_2025.h5` and `policybench_transfer_source_2025.csv` +files are retained as the original 1,000-household proof-of-method artifacts. +Current Python entry points named `create_policybench_transfer` and +`save_policybench_transfer` are aliases for the current 28,532-household +`enhanced_cps` builder. 
diff --git a/policyengine_uk_data/datasets/__init__.py b/policyengine_uk_data/datasets/__init__.py index c7db400b3..84d8add24 100644 --- a/policyengine_uk_data/datasets/__init__.py +++ b/policyengine_uk_data/datasets/__init__.py @@ -1,5 +1,6 @@ from .enhanced_cps import ( ENHANCED_CPS_FILE, + ENHANCED_CPS_MANIFEST_FILE, ENHANCED_CPS_SOURCE_FILE, create_enhanced_cps, export_enhanced_cps_source, @@ -15,6 +16,7 @@ __all__ = [ "ENHANCED_CPS_FILE", + "ENHANCED_CPS_MANIFEST_FILE", "ENHANCED_CPS_SOURCE_FILE", "create_enhanced_cps", "export_enhanced_cps_source", diff --git a/policyengine_uk_data/datasets/enhanced_cps.py b/policyengine_uk_data/datasets/enhanced_cps.py index 5819573d4..8ba026511 100644 --- a/policyengine_uk_data/datasets/enhanced_cps.py +++ b/policyengine_uk_data/datasets/enhanced_cps.py @@ -23,6 +23,7 @@ ENHANCED_CPS_SOURCE_FILE = STORAGE_FOLDER / "enhanced_cps_source_2025.csv" ENHANCED_CPS_FILE = STORAGE_FOLDER / "enhanced_cps_2025.h5" +ENHANCED_CPS_MANIFEST_FILE = STORAGE_FOLDER / "enhanced_cps_manifest_2025.json" COUNCIL_TAX_BANDS_FILE = STORAGE_FOLDER / "council_tax_bands_2024.csv" # Build assumptions are pinned so the checked-in H5 is reproducible. Do not diff --git a/policyengine_uk_data/datasets/policybench_transfer.py b/policyengine_uk_data/datasets/policybench_transfer.py index 360202de6..baa144c28 100644 --- a/policyengine_uk_data/datasets/policybench_transfer.py +++ b/policyengine_uk_data/datasets/policybench_transfer.py @@ -1,4 +1,10 @@ -"""Backward-compatible aliases for the public UK enhanced CPS dataset.""" +"""Backward-compatible aliases for the current public UK transfer builder. + +The checked-in ``policybench_transfer_2025`` artifacts remain the original +1,000-household proof-of-method files for historical comparison. The Python +entry points below intentionally alias the current 28,532-household +``enhanced_cps_2025`` builder instead of recreating those legacy artifacts. 
+""" from policyengine_uk_data.datasets.enhanced_cps import ( ENHANCED_CPS_SOURCE_FILE as POLICYBENCH_TRANSFER_SOURCE_FILE, diff --git a/policyengine_uk_data/storage/enhanced_cps_manifest_2025.json b/policyengine_uk_data/storage/enhanced_cps_manifest_2025.json new file mode 100644 index 000000000..89a6d0878 --- /dev/null +++ b/policyengine_uk_data/storage/enhanced_cps_manifest_2025.json @@ -0,0 +1,90 @@ +{ + "artifact": "enhanced_cps_2025", + "build": { + "build_command": "uv run --python 3.13 python -m policyengine_uk_data.storage.write_enhanced_cps_manifest", + "builder": "policyengine_uk_data.datasets.save_enhanced_cps", + "calibrated": true, + "calibration_target_year": 2025, + "exchange_rate": { + "live_api_called": false, + "source_url": "https://www.irs.gov/individuals/international-taxpayers/yearly-average-currency-exchange-rates", + "usd_to_gbp": 0.759 + }, + "fiscal_year": 2025, + "source_dataset": "PolicyEngine US Enhanced CPS public export", + "source_scope": "benchmark-compatible households" + }, + "description": "Public UK calibrated transfer dataset derived from a public export of benchmark-compatible PolicyEngine US Enhanced CPS households.", + "files": { + "artifact": { + "git_blob_sha": "6184d619b15497b00fea2d349a4af041f85d17a0", + "last_modified_commit": "9514dfb7ec607897c9f7122a2e073b922c9fd8b6", + "path": "policyengine_uk_data/storage/enhanced_cps_2025.h5", + "sha256": "199ebc61d29231b4799ad337a95393765b5fb5aede1834b93ff2acecceded866", + "size_bytes": 23379906 + }, + "source_manifest": { + "git_blob_sha": "b94400dd8d729bcf7807ddc59a920c6f48bee2dd", + "last_modified_commit": "9514dfb7ec607897c9f7122a2e073b922c9fd8b6", + "path": "policyengine_uk_data/storage/enhanced_cps_source_2025.csv", + "sha256": "ab69ba5cec8c079a7cacd1494bc6766139e3c6e07e779e3719a57be60e039892", + "size_bytes": 58751828 + } + }, + "generated_at": "2026-04-26T17:15:02.755838Z", + "intended_uses": [ + "public demos", + "reproducible examples", + "public benchmark analysis" + 
], + "loss_diagnostics": { + "calibrated": { + "excludes_zero_target_relative_errors": true, + "mean_abs_relative_error": 0.330212, + "median_abs_relative_error": 0.191874, + "nonfinite_relative_error_count": 1, + "nonzero_finite_target_count": 590, + "p90_abs_relative_error": 1.0, + "share_within_10pct": 0.284746, + "share_within_25pct": 0.608475, + "target_count": 591, + "zero_target_count": 1 + }, + "raw_transfer_weights": { + "excludes_zero_target_relative_errors": true, + "mean_abs_relative_error": 3.418089, + "median_abs_relative_error": 1.575202, + "nonfinite_relative_error_count": 1, + "nonzero_finite_target_count": 590, + "p90_abs_relative_error": 5.829189, + "share_within_10pct": 0.022034, + "share_within_25pct": 0.052542, + "target_count": 591, + "zero_target_count": 1 + }, + "target_year": 2025 + }, + "not_intended_uses": [ + "substitution for FRS or enhanced FRS", + "precise UK distributional analysis", + "administrative truth validation" + ], + "row_counts": { + "h5_benunits": 28532, + "h5_households": 28532, + "h5_people": 58848, + "source_households": 28532 + }, + "schema_version": 1, + "weight_diagnostics": { + "effective_sample_size": 11196.873445, + "max_household_weight": 20155.812507, + "min_household_weight": 1.0000000000000003e-09, + "near_zero_weight_count": 1408, + "near_zero_weight_threshold": 1e-06, + "top_100_share": 0.033223773, + "top_10_share": 0.005239509, + "top_1_share": 0.000714643, + "total_household_weight": 28204012.49725 + } +} diff --git a/policyengine_uk_data/storage/upload_public_transfer_dataset.py b/policyengine_uk_data/storage/upload_public_transfer_dataset.py new file mode 100644 index 000000000..ab5f507da --- /dev/null +++ b/policyengine_uk_data/storage/upload_public_transfer_dataset.py @@ -0,0 +1,29 @@ +"""Upload the explicitly public UK calibrated transfer dataset artifacts.""" + +from policyengine_uk_data.datasets import ( + ENHANCED_CPS_FILE, + ENHANCED_CPS_MANIFEST_FILE, + ENHANCED_CPS_SOURCE_FILE, +) +from 
def upload_public_transfer_dataset() -> None:
    """Upload the public enhanced CPS artifact, source CSV and manifest.

    All three files are checked for existence before the upload call, so a
    missing file aborts the run without uploading anything.

    Raises:
        ValueError: For the first file (in upload order) that does not exist
            on disk.
    """
    files = [ENHANCED_CPS_FILE, ENHANCED_CPS_SOURCE_FILE, ENHANCED_CPS_MANIFEST_FILE]

    # Find the first absent file, if any; ``None`` means everything is present.
    file_path = next((candidate for candidate in files if not candidate.exists()), None)
    if file_path is not None:
        raise ValueError(f"File {file_path} does not exist.")

    # Keyword arguments are kept: the accompanying test inspects ``call_args.kwargs``.
    upload_files_to_hf(files=files, hf_repo_name=PUBLIC_REPO)
def test_enhanced_cps_manifest_matches_docs_and_weight_diagnostics():
    """Check the manifest against the README row count and the H5 weights.

    The README must quote the manifest's source household count (formatted
    with thousands separators), and every recomputable weight diagnostic in
    the manifest must match the value derived from the committed H5 file.
    """
    manifest = json.loads(ENHANCED_CPS_MANIFEST_FILE.read_text(encoding="utf-8"))

    # Anchor the README to this test file (policyengine_uk_data/tests/...)
    # instead of the current working directory, so the test also passes when
    # pytest is invoked from a subdirectory or through an IDE runner.
    readme_path = Path(__file__).resolve().parents[2] / "README.md"
    readme = readme_path.read_text(encoding="utf-8")
    source_rows = manifest["row_counts"]["source_households"]

    assert f"{source_rows:,}" in readme

    weights = _household_weights(ENHANCED_CPS_FILE)
    sorted_weights = np.sort(weights)[::-1]
    total_weight = weights.sum()
    diagnostics = manifest["weight_diagnostics"]

    assert np.isclose(diagnostics["total_household_weight"], total_weight)
    assert np.isclose(
        diagnostics["effective_sample_size"],
        total_weight**2 / np.square(weights).sum(),
    )
    assert np.isclose(diagnostics["max_household_weight"], weights.max())
    assert np.isclose(
        diagnostics["top_10_share"],
        sorted_weights[:10].sum() / total_weight,
    )
    # The threshold is read back from the manifest so the count is checked
    # against the same cut-off that produced it.
    assert diagnostics["near_zero_weight_count"] == int(
        (weights <= diagnostics["near_zero_weight_threshold"]).sum()
    )
def _relative_to_repo(path: Path) -> str | None:
    """Return ``path`` relative to the git work-tree root as a POSIX string.

    The result is used both as the ``path`` recorded in the manifest and as
    the ``<path>`` part of ``HEAD:<path>`` / ``git log -- <path>`` lookups, so
    it is always rendered with forward slashes. ``str(Path)`` would produce
    backslashes on Windows, which git does not accept inside ``HEAD:<path>``
    and which would make the manifest differ between platforms.

    Args:
        path: File whose repository-relative location is wanted.

    Returns:
        The forward-slash relative path, or ``None`` when the repository root
        cannot be determined or ``path`` lies outside it.
    """
    root = _repo_root()
    if root is None:
        return None
    try:
        # Resolve both sides so a symlinked checkout directory does not make
        # ``relative_to`` fail spuriously.
        return path.resolve().relative_to(root.resolve()).as_posix()
    except ValueError:
        # ``path`` is not located under the repository root.
        return None
"nonzero_finite_target_count": int(include.sum()), + "zero_target_count": int((loss.target == 0).sum()), + "nonfinite_relative_error_count": int((~np.isfinite(abs_relative_error)).sum()), + "mean_abs_relative_error": round(float(included_errors.mean()), 6), + "median_abs_relative_error": round(float(included_errors.median()), 6), + "p90_abs_relative_error": round( + float(included_errors.quantile(0.9)), + 6, + ), + "share_within_10pct": round(float((included_errors <= 0.10).mean()), 6), + "share_within_25pct": round(float((included_errors <= 0.25).mean()), 6), + "excludes_zero_target_relative_errors": True, + } + + +def _weight_diagnostics(weights: np.ndarray) -> dict[str, Any]: + sorted_weights = np.sort(weights)[::-1] + total = weights.sum() + threshold = 1e-6 + return { + "total_household_weight": round(float(total), 6), + "effective_sample_size": round( + float(total**2 / np.square(weights).sum()), + 6, + ), + "min_household_weight": float(weights.min()), + "max_household_weight": round(float(weights.max()), 6), + "top_1_share": round(float(sorted_weights[:1].sum() / total), 9), + "top_10_share": round(float(sorted_weights[:10].sum() / total), 9), + "top_100_share": round(float(sorted_weights[:100].sum() / total), 9), + "near_zero_weight_threshold": threshold, + "near_zero_weight_count": int((weights <= threshold).sum()), + } + + +def build_enhanced_cps_manifest( + *, + source_file_path: str | Path = ENHANCED_CPS_SOURCE_FILE, + artifact_file_path: str | Path = ENHANCED_CPS_FILE, + fiscal_year: int = 2025, + include_loss: bool = True, + include_raw_loss: bool = True, +) -> dict[str, Any]: + """Build a JSON-serializable manifest for the committed public artifact.""" + source_file_path = Path(source_file_path) + artifact_file_path = Path(artifact_file_path) + + source = pd.read_csv(source_file_path, usecols=["scenario_id"]) + dataset = UKSingleYearDataset(file_path=str(artifact_file_path)) + values = dataset.load() + household_weights = 
np.asarray(values["household_weight"], dtype=float) + + manifest: dict[str, Any] = { + "schema_version": ENHANCED_CPS_MANIFEST_SCHEMA_VERSION, + "artifact": "enhanced_cps_2025", + "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "description": ( + "Public UK calibrated transfer dataset derived from a public export " + "of benchmark-compatible PolicyEngine US Enhanced CPS households." + ), + "intended_uses": [ + "public demos", + "reproducible examples", + "public benchmark analysis", + ], + "not_intended_uses": [ + "substitution for FRS or enhanced FRS", + "precise UK distributional analysis", + "administrative truth validation", + ], + "files": { + "artifact": { + "path": _relative_to_repo(artifact_file_path) + or str(artifact_file_path), + "sha256": _sha256(artifact_file_path), + "git_blob_sha": _git_blob_sha(artifact_file_path), + "last_modified_commit": _last_commit(artifact_file_path), + "size_bytes": artifact_file_path.stat().st_size, + }, + "source_manifest": { + "path": _relative_to_repo(source_file_path) or str(source_file_path), + "sha256": _sha256(source_file_path), + "git_blob_sha": _git_blob_sha(source_file_path), + "last_modified_commit": _last_commit(source_file_path), + "size_bytes": source_file_path.stat().st_size, + }, + }, + "build": { + "builder": "policyengine_uk_data.datasets.save_enhanced_cps", + "build_command": ( + "uv run --python 3.13 python -m " + "policyengine_uk_data.storage.write_enhanced_cps_manifest" + ), + "fiscal_year": fiscal_year, + "source_dataset": "PolicyEngine US Enhanced CPS public export", + "source_scope": "benchmark-compatible households", + "calibrated": True, + "calibration_target_year": fiscal_year, + "exchange_rate": { + "usd_to_gbp": USD_TO_GBP, + "source_url": USD_TO_GBP_SOURCE_URL, + "live_api_called": False, + }, + }, + "row_counts": { + "source_households": int(len(source)), + "h5_households": int(len(values["household_id"])), + "h5_people": int(len(values["person_id"])), + 
def write_enhanced_cps_manifest(
    output_file_path: str | Path = ENHANCED_CPS_MANIFEST_FILE,
    **kwargs: Any,
) -> dict[str, Any]:
    """Build the public enhanced CPS artifact manifest and write it to disk.

    Args:
        output_file_path: Destination JSON file. Missing parent directories
            are created.
        **kwargs: Forwarded unchanged to ``build_enhanced_cps_manifest``.

    Returns:
        The manifest dict that was written.

    Raises:
        ValueError: If the manifest contains NaN or infinite floats. These
            have no representation in strict JSON, so the write is refused
            rather than emitting a file other JSON parsers would reject.
    """
    manifest = build_enhanced_cps_manifest(**kwargs)

    # Serialise first: if the manifest is not valid JSON nothing on disk is
    # touched. Keys are sorted so regenerated manifests diff cleanly.
    payload = json.dumps(manifest, indent=2, sort_keys=True, allow_nan=False) + "\n"

    output_file_path = Path(output_file_path)
    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    output_file_path.write_text(payload, encoding="utf-8")
    return manifest
def _is_valid_leaf_input(column: str, entity: str, system: CountryTaxBenefitSystem):
    """Return whether ``column`` may be written to the ``entity`` table.

    A column is rejected when the tax-benefit system has no variable of that
    name or the variable belongs to a different entity. Names listed in
    ``ALLOWED_REPORTED_DATA_INPUTS`` are then accepted outright. Any other
    column must be an input variable whose ``defined_for`` attribute is
    absent or ``None``.
    """
    variable = system.variables.get(column)

    # Unknown variable, or a variable attached to another entity.
    if variable is None or variable.entity.key != entity:
        return False

    # Explicitly allowed names skip the input-variable checks below.
    if column in ALLOWED_REPORTED_DATA_INPUTS:
        return True

    is_unconditional_input = (
        variable.is_input_variable()
        and getattr(variable, "defined_for", None) is None
    )
    return bool(is_unconditional_input)
.../datasets/imputations/frs_only.py | 48 ++++++++-- .../storage/uprating_factors.csv | 2 - .../storage/uprating_growth_factors.csv | 2 - .../tests/test_frs_only_imputation.py | 15 ++- .../tests/test_policybench_transfer.py | 6 ++ 7 files changed, 128 insertions(+), 72 deletions(-) diff --git a/policyengine_uk_data/datasets/enhanced_cps.py b/policyengine_uk_data/datasets/enhanced_cps.py index 8ba026511..61db8e62d 100644 --- a/policyengine_uk_data/datasets/enhanced_cps.py +++ b/policyengine_uk_data/datasets/enhanced_cps.py @@ -36,26 +36,13 @@ "yearly-average-currency-exchange-rates" ) -# 2025/26 reported-benefit mapping assumptions used only to populate UK input -# leaves from U.S. source records. PolicyEngine UK applies its own parameters -# when calculating derived tax and benefit outputs. +# 2025/26 transfer assumptions used only to populate UK input leaves from U.S. +# source records. PolicyEngine UK applies its own parameters when calculating +# derived tax and benefit outputs. NEW_STATE_PENSION_2025 = 224.96 * 52 DIVIDEND_YIELD_FOR_WEALTH_IMPUTATION = 0.03 RENTAL_YIELD_FOR_WEALTH_IMPUTATION = 0.04 -PIP_2025_WEEKLY_RATES = { - "daily_living": { - "NONE": 0.0, - "STANDARD": 73.89, - "ENHANCED": 110.40, - }, - "mobility": { - "NONE": 0.0, - "STANDARD": 29.19, - "ENHANCED": 77.04, - }, -} - REGION_SHARES = ( ("NORTH_EAST", 0.04), ("NORTH_WEST", 0.11), @@ -249,11 +236,6 @@ def _pip_category(person: dict) -> str: return "ENHANCED" if severe_signal or low_earnings else "STANDARD" -def _pip_reported_amount(category: str, component: str) -> float: - weekly = PIP_2025_WEEKLY_RATES[component][category] - return round(weekly * 52, 2) - - def _household_cash_income(people: list[dict], exchange_rate: float) -> float: total = 0.0 for person in people: @@ -689,14 +671,8 @@ def _build_base_dataset( if bool(inputs.get("is_blind", False)) else 0.0, "is_disabled_for_benefits": bool(inputs.get("is_disabled", False)), - "pip_dl_reported": _pip_reported_amount( - pip_category, - 
"daily_living", - ), - "pip_m_reported": _pip_reported_amount( - pip_category, - "mobility", - ), + "pip_dl_category": pip_category, + "pip_m_category": pip_category, "hours_worked": float( inputs.get( "weekly_hours_worked", diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 170f11b51..37deb0a14 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -56,6 +56,25 @@ "disabled_students_allowance_course_eligible", "disabled_students_allowance_has_qualifying_condition", ) +PIP_CATEGORY_SAFETY_MARGIN = 0.1 + + +def _pip_category_from_reported( + reported, + standard_rate: float, + enhanced_rate: float, +) -> np.ndarray: + """Convert annual reported PIP amounts to PE-UK PIP category inputs.""" + + reported_weekly = pd.Series(reported).fillna(0).astype(float) / WEEKS_IN_YEAR + return np.select( + [ + reported_weekly >= enhanced_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN), + reported_weekly >= standard_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN), + ], + ["ENHANCED", "STANDARD"], + default="NONE", + ) @lru_cache(maxsize=None) @@ -1099,6 +1118,49 @@ def determine_education_level(fted_val, typeed2_val, age_val): household.index, ) + benefit = CountryTaxBenefitSystem().parameters(year).gov.dwp + + pe_person["pip_dl_category"] = _pip_category_from_reported( + pe_person["pip_dl_reported"], + benefit.pip.daily_living.standard, + benefit.pip.daily_living.enhanced, + ) + pe_person["pip_m_category"] = _pip_category_from_reported( + pe_person["pip_m_reported"], + benefit.pip.mobility.standard, + benefit.pip.mobility.enhanced, + ) + + has_pip = (pe_person["pip_dl_category"] != "NONE") | ( + pe_person["pip_m_category"] != "NONE" + ) + pe_person["is_disabled_for_benefits"] = ( + pe_person.dla_sc_reported + pe_person.dla_m_reported > 0 + ) | has_pip + + THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR + + pe_person["is_enhanced_disabled_for_benefits"] = ( + pe_person.dla_sc_reported + > benefit.dla.self_care.higher * 
WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP + ) + + # Child Tax Credit Regulations 2002 s. 8 + paragraph_3 = ( + pe_person.dla_sc_reported + >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP + ) + paragraph_4 = pe_person["pip_dl_category"] == "ENHANCED" + paragraph_5 = pe_person.afcs_reported > 0 + pe_person["is_severely_disabled_for_benefits"] = ( + paragraph_3 | paragraph_4 | paragraph_5 + ) + + pe_person = pe_person.drop( + columns=["pip_dl_reported", "pip_m_reported"], + errors="ignore", + ) + dataset = UKSingleYearDataset( person=pe_person, benunit=pe_benunit, @@ -1144,37 +1206,6 @@ def determine_education_level(fted_val, typeed2_val, age_val): pe_household["brma"] = brmas - parameters = sim.tax_benefit_system.parameters - benefit = parameters(year).gov.dwp - - pe_person["is_disabled_for_benefits"] = ( - pe_person.dla_sc_reported - + pe_person.dla_m_reported - + pe_person.pip_m_reported - + pe_person.pip_dl_reported - ) > 0 - - THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR - - pe_person["is_enhanced_disabled_for_benefits"] = ( - pe_person.dla_sc_reported - > benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) - - # Child Tax Credit Regulations 2002 s. 8 - paragraph_3 = ( - pe_person.dla_sc_reported - >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) - paragraph_4 = ( - pe_person.pip_dl_reported - >= benefit.pip.daily_living.enhanced * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) - paragraph_5 = pe_person.afcs_reported > 0 - pe_person["is_severely_disabled_for_benefits"] = ( - paragraph_3 | paragraph_4 | paragraph_5 - ) - # Dataset-side claimant-state approximations for future legacy ESA/JSA # modelling. These are explicit proxies based on observed survey # conditions, not legislative determinations. 
diff --git a/policyengine_uk_data/datasets/imputations/frs_only.py b/policyengine_uk_data/datasets/imputations/frs_only.py index 1730bc257..0ec4541e2 100644 --- a/policyengine_uk_data/datasets/imputations/frs_only.py +++ b/policyengine_uk_data/datasets/imputations/frs_only.py @@ -81,8 +81,6 @@ "state_pension_reported", "dla_sc_reported", "dla_m_reported", - "pip_m_reported", - "pip_dl_reported", "sda_reported", "carers_allowance_reported", "iidb_reported", @@ -98,6 +96,22 @@ "esa_income_reported", ] +FRS_ONLY_PERSON_CATEGORY_VARIABLES = [ + "pip_m_category", + "pip_dl_category", +] + +PIP_CATEGORY_TO_CODE = { + "NONE": 0.0, + "STANDARD": 1.0, + "ENHANCED": 2.0, +} +PIP_CODE_TO_CATEGORY = { + 0: "NONE", + 1: "STANDARD", + 2: "ENHANCED", +} + def _one_hot_encode(df: pd.DataFrame, columns: list[str]) -> pd.DataFrame: """Return ``df`` with object-typed ``columns`` one-hot encoded. @@ -157,6 +171,11 @@ def _build_predictor_frame(dataset: UKSingleYearDataset) -> pd.DataFrame: return frame +def _category_codes(series: pd.Series) -> pd.Series: + categories = series.fillna("NONE").astype(str).str.upper() + return categories.map(PIP_CATEGORY_TO_CODE).fillna(0.0) + + def impute_frs_only_variables( train_dataset: UKSingleYearDataset, target_dataset: UKSingleYearDataset, @@ -183,12 +202,20 @@ def impute_frs_only_variables( target_person = target_dataset.person # Use only variables present in both frames. 
- outputs = [ + numeric_outputs = [ v for v in FRS_ONLY_PERSON_VARIABLES if v in train_person.columns and v in target_person.columns ] - missing = set(FRS_ONLY_PERSON_VARIABLES) - set(outputs) + category_outputs = [ + v + for v in FRS_ONLY_PERSON_CATEGORY_VARIABLES + if v in train_person.columns and v in target_person.columns + ] + outputs = numeric_outputs + category_outputs + missing = ( + set(FRS_ONLY_PERSON_VARIABLES) | set(FRS_ONLY_PERSON_CATEGORY_VARIABLES) + ) - set(outputs) if missing: logger.warning( "Stage-2 FRS-only imputation: %d variables absent from " @@ -213,7 +240,9 @@ def impute_frs_only_variables( # Replace NaNs in outputs with 0 so the QRF trains on clean targets; # FRS-only variables are almost all zero-heavy "amount if eligible" # columns that default to zero when unreported. - train_outputs = train_person[outputs].fillna(0).astype(float) + train_outputs = train_person[numeric_outputs].fillna(0).astype(float) + for column in category_outputs: + train_outputs[column] = _category_codes(train_person[column]) logger.info( "Stage-2 FRS-only imputation: %d outputs, training on %d FRS " @@ -231,10 +260,17 @@ def impute_frs_only_variables( # clamp to zero (the population-typical value for these variables). predictions = predictions.fillna(0.0) - for column in outputs: + for column in numeric_outputs: # Clamp negative predictions — these columns represent receipted # amounts or contributions and are non-negative by construction. 
values = np.maximum(predictions[column].values, 0.0) target_dataset.person[column] = values + for column in category_outputs: + values = np.rint(predictions[column].fillna(0.0).values).astype(int) + values = np.clip(values, 0, 2) + target_dataset.person[column] = [ + PIP_CODE_TO_CATEGORY[value] for value in values + ] + return target_dataset diff --git a/policyengine_uk_data/storage/uprating_factors.csv b/policyengine_uk_data/storage/uprating_factors.csv index 5a7e54df4..73f18973b 100644 --- a/policyengine_uk_data/storage/uprating_factors.csv +++ b/policyengine_uk_data/storage/uprating_factors.csv @@ -60,8 +60,6 @@ pension_credit_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38, pension_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 personal_pension_contributions,1.0,1.059,1.127,1.205,1.261,1.308,1.337,1.365,1.396,1.431,1.431,1.431,1.431,1.431,1.431 petrol_spending,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -pip_dl_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -pip_m_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 private_pension_income,1.0,1.003,1.053,1.106,1.161,1.216,1.261,1.288,1.315,1.346,1.346,1.346,1.346,1.346,1.346 private_transfer_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 property_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 diff --git a/policyengine_uk_data/storage/uprating_growth_factors.csv b/policyengine_uk_data/storage/uprating_growth_factors.csv index eb8b7fb6d..8a4a924fa 100644 --- a/policyengine_uk_data/storage/uprating_growth_factors.csv +++ b/policyengine_uk_data/storage/uprating_growth_factors.csv @@ -60,8 +60,6 @@ pension_credit_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0. 
pension_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 personal_pension_contributions,0,0.059,0.064,0.069,0.046,0.037,0.022,0.021,0.023,0.025,0.0,0.0,0.0,0.0,0.0 petrol_spending,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -pip_dl_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -pip_m_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 private_pension_income,0,0.003,0.05,0.05,0.05,0.047,0.037,0.021,0.021,0.024,0.0,0.0,0.0,0.0,0.0 private_transfer_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 property_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 diff --git a/policyengine_uk_data/tests/test_frs_only_imputation.py b/policyengine_uk_data/tests/test_frs_only_imputation.py index 9393c7436..265105ab7 100644 --- a/policyengine_uk_data/tests/test_frs_only_imputation.py +++ b/policyengine_uk_data/tests/test_frs_only_imputation.py @@ -72,8 +72,16 @@ def _fake_dataset(person_rows: int, seed: int = 0): "state_pension_reported": 0.0, "dla_sc_reported": 0.0, "dla_m_reported": 0.0, - "pip_m_reported": 0.0, - "pip_dl_reported": 0.0, + "pip_m_category": np.where( + rng.random(person_rows) < 0.05, + "STANDARD", + "NONE", + ), + "pip_dl_category": np.where( + rng.random(person_rows) < 0.05, + "ENHANCED", + "NONE", + ), "sda_reported": 0.0, "carers_allowance_reported": 0.0, "iidb_reported": 0.0, @@ -126,6 +134,9 @@ def test_frs_only_outputs_are_non_negative(): assert np.all(values >= 0), f"{column} has negative predictions" assert np.isfinite(values).all(), f"{column} has NaN / inf predictions" + for column in ("pip_m_category", "pip_dl_category"): + assert result.person[column].isin(["NONE", "STANDARD", "ENHANCED"]).all() + def test_frs_only_does_not_touch_non_output_columns(): """Stage-2 must only rewrite the curated output list, nothing else.""" diff --git 
a/policyengine_uk_data/tests/test_policybench_transfer.py b/policyengine_uk_data/tests/test_policybench_transfer.py index 970c16395..ac75231c3 100644 --- a/policyengine_uk_data/tests/test_policybench_transfer.py +++ b/policyengine_uk_data/tests/test_policybench_transfer.py @@ -79,6 +79,12 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path): assert "household_wealth" not in dataset.household.columns assert "total_wealth" not in dataset.household.columns + assert "pip_dl_reported" not in dataset.person.columns + assert "pip_m_reported" not in dataset.person.columns + assert ( + dataset.person["pip_dl_category"].isin(["NONE", "STANDARD", "ENHANCED"]).all() + ) + assert dataset.person["pip_m_category"].isin(["NONE", "STANDARD", "ENHANCED"]).all() for column in ( "savings", "main_residence_value", From 8be858ff588449705728f1fc48e91675ed6c7722 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 30 Apr 2026 10:33:14 -0400 Subject: [PATCH 4/5] Move disability benefit category mapping into UK data --- policyengine_uk_data/datasets/frs.py | 76 +++++++++++++------ .../datasets/imputations/frs_only.py | 46 +++++------ .../storage/uprating_factors.csv | 3 - .../storage/uprating_growth_factors.csv | 3 - .../tests/test_frs_only_imputation.py | 20 +++-- .../tests/test_policybench_transfer.py | 3 + 6 files changed, 89 insertions(+), 62 deletions(-) diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 18e9535b2..9aa11f8e2 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -59,20 +59,19 @@ PIP_CATEGORY_SAFETY_MARGIN = 0.1 -def _pip_category_from_reported( +def _category_from_reported( reported, - standard_rate: float, - enhanced_rate: float, + thresholds: tuple[tuple[str, float], ...], ) -> np.ndarray: - """Convert annual reported PIP amounts to PE-UK PIP category inputs.""" + """Convert annual reported amounts to PE-UK category inputs.""" reported_weekly = 
pd.Series(reported).fillna(0).astype(float) / WEEKS_IN_YEAR return np.select( [ - reported_weekly >= enhanced_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN), - reported_weekly >= standard_rate * (1 - PIP_CATEGORY_SAFETY_MARGIN), + reported_weekly >= rate * (1 - PIP_CATEGORY_SAFETY_MARGIN) + for _, rate in thresholds ], - ["ENHANCED", "STANDARD"], + [category for category, _ in thresholds], default="NONE", ) @@ -1244,36 +1243,57 @@ def determine_education_level(fted_val, typeed2_val, age_val): benefit = CountryTaxBenefitSystem().parameters(year).gov.dwp - pe_person["pip_dl_category"] = _pip_category_from_reported( + pe_person["aa_category"] = _category_from_reported( + pe_person["attendance_allowance_reported"], + ( + ("HIGHER", benefit.attendance_allowance.higher), + ("LOWER", benefit.attendance_allowance.lower), + ), + ) + pe_person["dla_sc_category"] = _category_from_reported( + pe_person["dla_sc_reported"], + ( + ("HIGHER", benefit.dla.self_care.higher), + ("MIDDLE", benefit.dla.self_care.middle), + ("LOWER", benefit.dla.self_care.lower), + ), + ) + pe_person["dla_m_category"] = _category_from_reported( + pe_person["dla_m_reported"], + ( + ("HIGHER", benefit.dla.mobility.higher), + ("LOWER", benefit.dla.mobility.lower), + ), + ) + pe_person["pip_dl_category"] = _category_from_reported( pe_person["pip_dl_reported"], - benefit.pip.daily_living.standard, - benefit.pip.daily_living.enhanced, + ( + ("ENHANCED", benefit.pip.daily_living.enhanced), + ("STANDARD", benefit.pip.daily_living.standard), + ), ) - pe_person["pip_m_category"] = _pip_category_from_reported( + pe_person["pip_m_category"] = _category_from_reported( pe_person["pip_m_reported"], - benefit.pip.mobility.standard, - benefit.pip.mobility.enhanced, + ( + ("ENHANCED", benefit.pip.mobility.enhanced), + ("STANDARD", benefit.pip.mobility.standard), + ), ) has_pip = (pe_person["pip_dl_category"] != "NONE") | ( pe_person["pip_m_category"] != "NONE" ) - pe_person["is_disabled_for_benefits"] = ( - 
pe_person.dla_sc_reported + pe_person.dla_m_reported > 0 - ) | has_pip - - THRESHOLD_SAFETY_GAP = 1 * WEEKS_IN_YEAR + has_dla = (pe_person["dla_sc_category"] != "NONE") | ( + pe_person["dla_m_category"] != "NONE" + ) + pe_person["is_disabled_for_benefits"] = has_dla | has_pip pe_person["is_enhanced_disabled_for_benefits"] = ( - pe_person.dla_sc_reported - > benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP + pe_person["dla_sc_category"] == "HIGHER" ) # Child Tax Credit Regulations 2002 s. 8 - paragraph_3 = ( - pe_person.dla_sc_reported - >= benefit.dla.self_care.higher * WEEKS_IN_YEAR - THRESHOLD_SAFETY_GAP - ) + paragraph_3 = pe_person["dla_sc_category"] == "HIGHER" paragraph_4 = pe_person["pip_dl_category"] == "ENHANCED" paragraph_5 = pe_person.afcs_reported > 0 pe_person["is_severely_disabled_for_benefits"] = ( @@ -1281,7 +1301,13 @@ def determine_education_level(fted_val, typeed2_val, age_val): ) pe_person = pe_person.drop( - columns=["pip_dl_reported", "pip_m_reported"], + columns=[ + "attendance_allowance_reported", + "dla_sc_reported", + "dla_m_reported", + "pip_dl_reported", + "pip_m_reported", + ], errors="ignore", ) diff --git a/policyengine_uk_data/datasets/imputations/frs_only.py b/policyengine_uk_data/datasets/imputations/frs_only.py index 0ec4541e2..55c46d095 100644 --- a/policyengine_uk_data/datasets/imputations/frs_only.py +++ b/policyengine_uk_data/datasets/imputations/frs_only.py @@ -77,10 +77,7 @@ "income_support_reported", "working_tax_credit_reported", "child_tax_credit_reported", - "attendance_allowance_reported", "state_pension_reported", - "dla_sc_reported", - "dla_m_reported", "sda_reported", "carers_allowance_reported", "iidb_reported", @@ -96,20 +93,12 @@ "esa_income_reported", ] -FRS_ONLY_PERSON_CATEGORY_VARIABLES = [ - "pip_m_category", - "pip_dl_category", -] - -PIP_CATEGORY_TO_CODE = { - "NONE": 0.0, - "STANDARD": 1.0, - "ENHANCED": 2.0, -} -PIP_CODE_TO_CATEGORY = { - 0: "NONE", - 1: "STANDARD", - 2: "ENHANCED", 
+FRS_ONLY_PERSON_CATEGORY_VARIABLES = { + "aa_category": ["NONE", "LOWER", "HIGHER"], + "dla_m_category": ["NONE", "LOWER", "HIGHER"], + "dla_sc_category": ["NONE", "LOWER", "MIDDLE", "HIGHER"], + "pip_m_category": ["NONE", "STANDARD", "ENHANCED"], + "pip_dl_category": ["NONE", "STANDARD", "ENHANCED"], } @@ -171,9 +160,12 @@ def _build_predictor_frame(dataset: UKSingleYearDataset) -> pd.DataFrame: return frame -def _category_codes(series: pd.Series) -> pd.Series: +def _category_codes(series: pd.Series, allowed_categories: list[str]) -> pd.Series: + category_to_code = { + category: float(index) for index, category in enumerate(allowed_categories) + } categories = series.fillna("NONE").astype(str).str.upper() - return categories.map(PIP_CATEGORY_TO_CODE).fillna(0.0) + return categories.map(category_to_code).fillna(0.0) def impute_frs_only_variables( @@ -209,12 +201,12 @@ def impute_frs_only_variables( ] category_outputs = [ v - for v in FRS_ONLY_PERSON_CATEGORY_VARIABLES + for v in FRS_ONLY_PERSON_CATEGORY_VARIABLES.keys() if v in train_person.columns and v in target_person.columns ] outputs = numeric_outputs + category_outputs missing = ( - set(FRS_ONLY_PERSON_VARIABLES) | set(FRS_ONLY_PERSON_CATEGORY_VARIABLES) + set(FRS_ONLY_PERSON_VARIABLES) | set(FRS_ONLY_PERSON_CATEGORY_VARIABLES.keys()) ) - set(outputs) if missing: logger.warning( @@ -242,7 +234,10 @@ def impute_frs_only_variables( # columns that default to zero when unreported. 
train_outputs = train_person[numeric_outputs].fillna(0).astype(float) for column in category_outputs: - train_outputs[column] = _category_codes(train_person[column]) + train_outputs[column] = _category_codes( + train_person[column], + FRS_ONLY_PERSON_CATEGORY_VARIABLES[column], + ) logger.info( "Stage-2 FRS-only imputation: %d outputs, training on %d FRS " @@ -267,10 +262,9 @@ def impute_frs_only_variables( target_dataset.person[column] = values for column in category_outputs: + allowed_categories = FRS_ONLY_PERSON_CATEGORY_VARIABLES[column] values = np.rint(predictions[column].fillna(0.0).values).astype(int) - values = np.clip(values, 0, 2) - target_dataset.person[column] = [ - PIP_CODE_TO_CATEGORY[value] for value in values - ] + values = np.clip(values, 0, len(allowed_categories) - 1) + target_dataset.person[column] = [allowed_categories[value] for value in values] return target_dataset diff --git a/policyengine_uk_data/storage/uprating_factors.csv b/policyengine_uk_data/storage/uprating_factors.csv index 73f18973b..c5f6f638e 100644 --- a/policyengine_uk_data/storage/uprating_factors.csv +++ b/policyengine_uk_data/storage/uprating_factors.csv @@ -1,7 +1,6 @@ Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034 afcs_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 alcohol_and_tobacco_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -attendance_allowance_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 benunit_rent,1.0,1.0,1.0,1.11,1.184,1.223,1.275,1.312,1.351,1.392,1.392,1.392,1.392,1.392,1.392 bsp_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 capital_gains,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 @@ -15,8 +14,6 @@ communication_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.3 
corporate_wealth,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 diesel_spending,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 dividend_income,1.0,1.0,1.092,1.147,1.19,1.223,1.258,1.297,1.34,1.384,1.384,1.384,1.384,1.384,1.384 -dla_m_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 -dla_sc_reported,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 domestic_energy_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 education_consumption,1.0,1.04,1.144,1.209,1.237,1.277,1.301,1.327,1.353,1.38,1.38,1.38,1.38,1.38,1.38 employee_pension_contributions,1.0,1.059,1.127,1.205,1.261,1.308,1.337,1.365,1.396,1.431,1.431,1.431,1.431,1.431,1.431 diff --git a/policyengine_uk_data/storage/uprating_growth_factors.csv b/policyengine_uk_data/storage/uprating_growth_factors.csv index 8a4a924fa..7d330d17a 100644 --- a/policyengine_uk_data/storage/uprating_growth_factors.csv +++ b/policyengine_uk_data/storage/uprating_growth_factors.csv @@ -1,7 +1,6 @@ Variable,2020,2021,2022,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034 afcs_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 alcohol_and_tobacco_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -attendance_allowance_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 benunit_rent,0,0.0,0.0,0.11,0.067,0.033,0.043,0.029,0.03,0.03,0.0,0.0,0.0,0.0,0.0 bsp_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 capital_gains,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 @@ -15,8 +14,6 @@ communication_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0, corporate_wealth,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 
diesel_spending,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 dividend_income,0,0.0,0.092,0.05,0.037,0.028,0.029,0.031,0.033,0.033,0.0,0.0,0.0,0.0,0.0 -dla_m_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 -dla_sc_reported,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 domestic_energy_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 education_consumption,0,0.04,0.1,0.057,0.023,0.032,0.019,0.02,0.02,0.02,0.0,0.0,0.0,0.0,0.0 employee_pension_contributions,0,0.059,0.064,0.069,0.046,0.037,0.022,0.021,0.023,0.025,0.0,0.0,0.0,0.0,0.0 diff --git a/policyengine_uk_data/tests/test_frs_only_imputation.py b/policyengine_uk_data/tests/test_frs_only_imputation.py index 265105ab7..cb2554950 100644 --- a/policyengine_uk_data/tests/test_frs_only_imputation.py +++ b/policyengine_uk_data/tests/test_frs_only_imputation.py @@ -68,10 +68,10 @@ def _fake_dataset(person_rows: int, seed: int = 0): "income_support_reported": 0.0, "working_tax_credit_reported": 0.0, "child_tax_credit_reported": 0.0, - "attendance_allowance_reported": 0.0, + "aa_category": "NONE", "state_pension_reported": 0.0, - "dla_sc_reported": 0.0, - "dla_m_reported": 0.0, + "dla_sc_category": "NONE", + "dla_m_category": "NONE", "pip_m_category": np.where( rng.random(person_rows) < 0.05, "STANDARD", @@ -134,8 +134,18 @@ def test_frs_only_outputs_are_non_negative(): assert np.all(values >= 0), f"{column} has negative predictions" assert np.isfinite(values).all(), f"{column} has NaN / inf predictions" - for column in ("pip_m_category", "pip_dl_category"): - assert result.person[column].isin(["NONE", "STANDARD", "ENHANCED"]).all() + for column in ( + "aa_category", + "dla_sc_category", + "dla_m_category", + "pip_m_category", + "pip_dl_category", + ): + assert ( + result.person[column] + .isin(["NONE", "LOWER", "MIDDLE", "HIGHER", "STANDARD", "ENHANCED"]) + .all() + ) def 
test_frs_only_does_not_touch_non_output_columns(): diff --git a/policyengine_uk_data/tests/test_policybench_transfer.py b/policyengine_uk_data/tests/test_policybench_transfer.py index ac75231c3..7dec319e8 100644 --- a/policyengine_uk_data/tests/test_policybench_transfer.py +++ b/policyengine_uk_data/tests/test_policybench_transfer.py @@ -79,6 +79,9 @@ def test_policybench_transfer_writes_only_valid_leaf_inputs(tmp_path: Path): assert "household_wealth" not in dataset.household.columns assert "total_wealth" not in dataset.household.columns + assert "attendance_allowance_reported" not in dataset.person.columns + assert "dla_sc_reported" not in dataset.person.columns + assert "dla_m_reported" not in dataset.person.columns assert "pip_dl_reported" not in dataset.person.columns assert "pip_m_reported" not in dataset.person.columns assert ( From 3621fcbc6613d79e18f11a09ecccb2d2e4ec57de Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 2 May 2026 08:59:59 -0400 Subject: [PATCH 5/5] Fix public transfer upload version --- .../storage/upload_public_transfer_dataset.py | 8 +++++++- .../tests/test_enhanced_cps_artifact_manifest.py | 6 ++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/policyengine_uk_data/storage/upload_public_transfer_dataset.py b/policyengine_uk_data/storage/upload_public_transfer_dataset.py index ab5f507da..89e3f7ad5 100644 --- a/policyengine_uk_data/storage/upload_public_transfer_dataset.py +++ b/policyengine_uk_data/storage/upload_public_transfer_dataset.py @@ -1,5 +1,7 @@ """Upload the explicitly public UK calibrated transfer dataset artifacts.""" +from importlib import metadata + from policyengine_uk_data.datasets import ( ENHANCED_CPS_FILE, ENHANCED_CPS_MANIFEST_FILE, @@ -9,7 +11,7 @@ from policyengine_uk_data.utils.hf_destinations import PUBLIC_REPO -def upload_public_transfer_dataset() -> None: +def upload_public_transfer_dataset(version: str | None = None) -> None: files = [ ENHANCED_CPS_FILE, ENHANCED_CPS_SOURCE_FILE, @@ -19,8 
+21,12 @@ def upload_public_transfer_dataset() -> None: if not file_path.exists(): raise ValueError(f"File {file_path} does not exist.") + if version is None: + version = metadata.version("policyengine-uk-data") + upload_files_to_hf( files=files, + version=version, hf_repo_name=PUBLIC_REPO, ) diff --git a/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py b/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py index a9922311c..d1d2c0622 100644 --- a/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py +++ b/policyengine_uk_data/tests/test_enhanced_cps_artifact_manifest.py @@ -123,12 +123,14 @@ def test_legacy_policybench_transfer_artifacts_are_explicitly_legacy(): def test_public_transfer_upload_targets_public_hf_repo(): with patch( - "policyengine_uk_data.storage.upload_public_transfer_dataset.upload_files_to_hf" + "policyengine_uk_data.storage.upload_public_transfer_dataset.upload_files_to_hf", + autospec=True, ) as upload_files_to_hf: - upload_public_transfer_dataset() + upload_public_transfer_dataset(version="1.55.3") upload_files_to_hf.assert_called_once() kwargs = upload_files_to_hf.call_args.kwargs + assert kwargs["version"] == "1.55.3" assert kwargs["hf_repo_name"] == PUBLIC_REPO assert kwargs["files"] == [ ENHANCED_CPS_FILE,