Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/1150.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fail Enhanced CPS calibration when final household weights drift from the source household count.
77 changes: 60 additions & 17 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
torch = None


# Largest relative gap allowed between the calibrated household-weight total
# and the source household count (0.02 == 2%). Used as the default
# ``rel_tolerance`` of ``validate_household_weight_total``.
HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE = 0.02


def initialize_weight_priors(
original_weights: np.ndarray,
seed: int = 1456,
Expand Down Expand Up @@ -81,6 +84,48 @@ def initialize_weight_priors(
return priors


def validate_household_weight_total(
    weights: np.ndarray,
    *,
    source_total: float,
    year: int,
    rel_tolerance: float = HOUSEHOLD_WEIGHT_TOTAL_REL_TOLERANCE,
) -> float:
    """Check calibrated household weights and return their sum.

    The checks run in a fixed order and the first one that fails raises:

    1. no NaN entries in ``weights``;
    2. no negative entries in ``weights``;
    3. the weighted total lies within ``[1e8, 2e8]``;
    4. ``source_total`` is finite and strictly positive;
    5. ``|total - source_total| / source_total`` does not exceed
       ``rel_tolerance``.

    Args:
        weights: Calibrated household weights; coerced with ``np.asarray``.
        source_total: Household count the weighted total is compared against.
        year: Only used to label the error messages.
        rel_tolerance: Largest accepted relative gap to ``source_total``.

    Returns:
        The sum of ``weights`` as a Python ``float``.

    Raises:
        ValueError: If any of the checks above fails.
    """

    values = np.asarray(weights)

    # Element-level sanity checks come first so that a corrupt array is
    # reported as such rather than as an out-of-range total.
    if np.isnan(values).any():
        raise ValueError(f"Year {year}: household_weight contains NaN values")
    if (values < 0).any():
        raise ValueError(f"Year {year}: household_weight contains negative values")

    lower_bound, upper_bound = 1e8, 2e8
    weighted_hh_count = float(values.sum())
    if not (lower_bound <= weighted_hh_count <= upper_bound):
        message = (
            f"Year {year}: weighted household count "
            f"{weighted_hh_count:,.0f} outside expected range "
            f"[100M, 200M]"
        )
        raise ValueError(message)

    # The source total is the denominator below, so it must be a usable
    # positive number before the relative gap is computed.
    source_total = float(source_total)
    if not (np.isfinite(source_total) and source_total > 0):
        message = (
            f"Year {year}: source household count total must be positive; "
            f"got {source_total:,.0f}"
        )
        raise ValueError(message)

    rel_error = abs(weighted_hh_count - source_total) / source_total
    if rel_error > rel_tolerance:
        message = (
            f"Year {year}: weighted household count "
            f"{weighted_hh_count:,.0f} differs from source household count "
            f"{source_total:,.0f} by {rel_error:.2%}, exceeding "
            f"{rel_tolerance:.2%} tolerance"
        )
        raise ValueError(message)

    return weighted_hh_count


def _to_numpy(value) -> np.ndarray:
    """Return ``value`` as a NumPy array.

    If ``value`` exposes a ``values`` attribute, that attribute is converted;
    otherwise ``value`` itself is passed to ``np.asarray``.
    """
    payload = getattr(value, "values", value)
    return np.asarray(payload)

Expand Down Expand Up @@ -639,6 +684,7 @@ def generate(self):
data["household_weight"] = {}
original_weights = sim.calculate("household_weight")
original_weights = initialize_weight_priors(original_weights.values)
source_household_count = float(np.sum(original_weights))

bad_targets = [
"nation/irs/adjusted gross income/total/AGI in 10k-15k/taxable/Head of Household",
Expand Down Expand Up @@ -688,25 +734,16 @@ def generate(self):
)
data["household_weight"][year] = optimised_weights

# Validate dense weights
w = optimised_weights
if np.any(np.isnan(w)):
raise ValueError(f"Year {year}: household_weight contains NaN values")
if np.any(w < 0):
raise ValueError(
f"Year {year}: household_weight contains negative values"
)
weighted_hh_count = float(np.sum(w))
if not (1e8 <= weighted_hh_count <= 2e8):
raise ValueError(
f"Year {year}: weighted household count "
f"{weighted_hh_count:,.0f} outside expected range "
f"[100M, 200M]"
)
weighted_hh_count = validate_household_weight_total(
optimised_weights,
source_total=source_household_count,
year=year,
)
logging.info(
f"Year {year}: weights validated — "
f"{weighted_hh_count:,.0f} weighted households, "
f"{int(np.sum(w > 0))} non-zero"
f"{weighted_hh_count:,.0f} weighted households "
f"vs {source_household_count:,.0f} source households, "
f"{int(np.sum(optimised_weights > 0))} non-zero"
)

if 2025 in ACA_POST_CALIBRATION_PERSON_TARGETS:
Expand Down Expand Up @@ -824,9 +861,15 @@ def generate(self):
data = sim.dataset.load_dataset()
original_weights = sim.calculate("household_weight")
original_weights = initialize_weight_priors(original_weights.values)
source_household_count = float(np.sum(original_weights))
for year in [2024]:
loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
optimised_weights = reweight(original_weights, loss_matrix, targets_array)
validate_household_weight_total(
optimised_weights,
source_total=source_household_count,
year=year,
)
data["household_weight"] = optimised_weights

self.save_dataset(data)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ classifiers = [
"Programming Language :: Python :: 3.14",
]
dependencies = [
"policyengine-us==1.711.0",
"policyengine-us==1.712.0",
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.
Expand Down
28 changes: 28 additions & 0 deletions tests/unit/datasets/test_enhanced_cps_seeding.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""

import numpy as np
import pytest

from policyengine_us_data.utils.seed import set_seeds

Expand Down Expand Up @@ -56,3 +57,30 @@ def test_initialize_weight_priors_preserves_source_weight_total():

np.testing.assert_allclose(priors.sum(), 100.0)
np.testing.assert_allclose(priors, np.array([40.0, 10.0, 25.0, 25.0]))


def test_validate_household_weight_total_accepts_close_total():
    """A total within tolerance of the source count is accepted and returned."""
    from policyengine_us_data.datasets.cps.enhanced_cps import (
        validate_household_weight_total,
    )

    # 146M weighted households against a 145M source count is roughly 0.7% off.
    calibrated_weights = np.array([50_000_000.0, 96_000_000.0])
    expected_total = 146_000_000.0

    returned_total = validate_household_weight_total(
        calibrated_weights,
        source_total=145_000_000.0,
        year=2024,
    )

    assert returned_total == expected_total


def test_validate_household_weight_total_rejects_inflated_total():
    """A total far above the source count raises the drift error."""
    from policyengine_us_data.datasets.cps.enhanced_cps import (
        validate_household_weight_total,
    )

    # 186.9M stays inside the [100M, 200M] range check, so the failure must
    # come from the comparison against the 145M source count.
    inflated_weights = np.array([100_000_000.0, 86_900_000.0])
    expected_message = "differs from source household count"

    with pytest.raises(ValueError, match=expected_message):
        validate_household_weight_total(
            inflated_weights,
            source_total=145_000_000.0,
            year=2024,
        )
8 changes: 4 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading