diff --git a/changelog.d/1150.fixed.md b/changelog.d/1150.fixed.md new file mode 100644 index 000000000..2a615567b --- /dev/null +++ b/changelog.d/1150.fixed.md @@ -0,0 +1 @@ +Fail Enhanced CPS calibration when final household weights drift from the source household count. diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index de0d0b58e..2a5f2c142 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -42,6 +42,9 @@ torch = None +HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE = 0.02 + + def initialize_weight_priors( original_weights: np.ndarray, seed: int = 1456, @@ -81,6 +84,48 @@ def initialize_weight_priors( return priors +def validate_household_weight_total( + weights: np.ndarray, + *, + source_total: float, + year: int, + rel_tolerance: float = HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE, +) -> float: + """Validate calibrated household weights against the source total.""" + + weights = np.asarray(weights) + if np.any(np.isnan(weights)): + raise ValueError(f"Year {year}: household_weight contains NaN values") + if np.any(weights < 0): + raise ValueError(f"Year {year}: household_weight contains negative values") + + weighted_hh_count = float(np.sum(weights)) + if not (1e8 <= weighted_hh_count <= 2e8): + raise ValueError( + f"Year {year}: weighted household count " + f"{weighted_hh_count:,.0f} outside expected range " + f"[100M, 200M]" + ) + + source_total = float(source_total) + if not np.isfinite(source_total) or source_total <= 0: + raise ValueError( + f"Year {year}: source household count total must be positive; " + f"got {source_total:,.0f}" + ) + + rel_error = abs(weighted_hh_count - source_total) / source_total + if rel_error > rel_tolerance: + raise ValueError( + f"Year {year}: weighted household count " + f"{weighted_hh_count:,.0f} differs from source household count " + f"{source_total:,.0f} by {rel_error:.2%}, exceeding " + f"{rel_tolerance:.2%} tolerance" + ) + + return weighted_hh_count + + def _to_numpy(value) -> np.ndarray: return np.asarray(getattr(value, "values", value)) @@ -639,6 +684,7 @@ def generate(self): data["household_weight"] = {} original_weights = sim.calculate("household_weight") original_weights = initialize_weight_priors(original_weights.values) + source_household_count = float(np.sum(original_weights)) bad_targets = [ "nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household", @@ -688,25 +734,16 @@ def generate(self): ) data["household_weight"][year] = optimised_weights - # Validate dense weights - w = optimised_weights - if np.any(np.isnan(w)): - raise ValueError(f"Year {year}: household_weight contains NaN values") - if np.any(w < 0): - raise ValueError( - f"Year {year}: household_weight contains negative values" - ) - weighted_hh_count = float(np.sum(w)) - if not (1e8 <= weighted_hh_count <= 2e8): - raise ValueError( - f"Year {year}: weighted household count " - f"{weighted_hh_count:,.0f} outside expected range " - f"[100M, 200M]" - ) + weighted_hh_count = validate_household_weight_total( + optimised_weights, + source_total=source_household_count, + year=year, + ) logging.info( f"Year {year}: weights validated — " - f"{weighted_hh_count:,.0f} weighted households, " - f"{int(np.sum(w > 0))} non-zero" + f"{weighted_hh_count:,.0f} weighted households " + f"vs {source_household_count:,.0f} source households, " + f"{int(np.sum(optimised_weights > 0))} non-zero" ) if 2025 in ACA_POST_CALIBRATION_PERSON_TARGETS: @@ -824,9 +861,15 @@ def generate(self): data = sim.dataset.load_dataset() original_weights = sim.calculate("household_weight") original_weights = initialize_weight_priors(original_weights.values) + source_household_count = float(np.sum(original_weights)) for year in [2024]: loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) optimised_weights = reweight(original_weights, loss_matrix, targets_array) + validate_household_weight_total( + optimised_weights, + source_total=source_household_count, + year=year, + ) data["household_weight"] = optimised_weights self.save_dataset(data) diff --git a/pyproject.toml b/pyproject.toml index 975e75829..4e3259399 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.711.0", + "policyengine-us==1.712.0", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/unit/datasets/test_enhanced_cps_seeding.py b/tests/unit/datasets/test_enhanced_cps_seeding.py index 11e7e457d..2e386f942 100644 --- a/tests/unit/datasets/test_enhanced_cps_seeding.py +++ b/tests/unit/datasets/test_enhanced_cps_seeding.py @@ -8,6 +8,7 @@ """ import numpy as np +import pytest from policyengine_us_data.utils.seed import set_seeds @@ -56,3 +57,30 @@ def test_initialize_weight_priors_preserves_source_weight_total(): np.testing.assert_allclose(priors.sum(), 100.0) np.testing.assert_allclose(priors, np.array([40.0, 10.0, 25.0, 25.0])) + + +def test_validate_household_weight_total_accepts_close_total(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_household_weight_total, + ) + + total = validate_household_weight_total( + np.array([50_000_000.0, 96_000_000.0]), + source_total=145_000_000.0, + year=2024, + ) + + assert total == 146_000_000.0 + + +def test_validate_household_weight_total_rejects_inflated_total(): + from policyengine_us_data.datasets.cps.enhanced_cps import ( + validate_household_weight_total, + ) + + with pytest.raises(ValueError, match="differs from source household count"): + validate_household_weight_total( + np.array([100_000_000.0, 86_900_000.0]), + source_total=145_000_000.0, + year=2024, + ) diff --git a/uv.lock b/uv.lock index 71f4a0242..891d69404 100644 --- a/uv.lock +++ b/uv.lock @@ -2164,7 +2164,7 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.711.0" +version = "1.712.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2174,9 +2174,9 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/48/ed/8825980a62e009610d6fa36f55f6c8a32deb0fb770d1f3513e2df9c7f7fe/policyengine_us-1.711.0.tar.gz", hash = "sha256:c52c8e68f3a01ee5935320175e841459503e67f84c41899f9768f4a5b300b4a3", size = 9956103, upload-time = "2026-05-27T21:31:17.868Z" } +sdist = { url = "https://files.pythonhosted.org/packages/94/de/1c6ed33b769f12a29ee148eaf73399fa11b48d726f8920b513657c4ef2f5/policyengine_us-1.712.0.tar.gz", hash = "sha256:821f9d25fa1893d1e95b090868983f56281a448eec82bb7b11ec5d08814c8e39", size = 9957228, upload-time = "2026-05-27T22:52:32.555Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ff/aa/3e8471c852c75ecc7c2cbbdaedf79b70a8d207df7f689abfd2b3b570bd7a/policyengine_us-1.711.0-py3-none-any.whl", hash = "sha256:e37d7ee5926954ecf9e03d91ccd190a1609e6322426c12fd6cdd867a913ee2d9", size = 10887738, upload-time = "2026-05-27T21:31:14.859Z" }, + { url = "https://files.pythonhosted.org/packages/03/b0/0073cf07946c52ab7c7098e4570362bfa4c285070014a530aee02bfd452d/policyengine_us-1.712.0-py3-none-any.whl", hash = "sha256:618450b2eca7b15ff73530d0f67fa4d1064104c46d155fc3ecc7f40b1c178956", size = 10887739, upload-time = "2026-05-27T22:52:27.786Z" }, ] [[package]] @@ -2246,7 +2246,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.711.0" }, + { name = "policyengine-us", specifier = "==1.712.0" }, { name = "requests", specifier = ">=2.25.0" }, { name = "scipy", specifier = ">=1.15.3" }, { name = "setuptools", specifier = ">=60" },