From baf2a2447c049ba2d6872ac114d26708335b796a Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 7 Apr 2026 09:48:50 -0400 Subject: [PATCH 01/14] Align SPM threshold recalculation --- .../calibration/calibration_utils.py | 44 ++- .../calibration/publish_local_area.py | 263 ++++++++---------- .../test_spm_thresholds.py | 51 ++++ policyengine_us_data/utils/spm.py | 144 +++++++++- 4 files changed, 344 insertions(+), 158 deletions(-) create mode 100644 policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py diff --git a/policyengine_us_data/calibration/calibration_utils.py b/policyengine_us_data/calibration/calibration_utils.py index 8af1bab7a..a4c489bab 100644 --- a/policyengine_us_data/calibration/calibration_utils.py +++ b/policyengine_us_data/calibration/calibration_utils.py @@ -7,10 +7,12 @@ import numpy as np import pandas as pd -from spm_calculator import SPMCalculator, spm_equivalence_scale -from spm_calculator.geoadj import calculate_geoadj_from_rent - -from policyengine_us_data.utils.spm import TENURE_CODE_MAP +from policyengine_us_data.utils.spm import ( + TENURE_CODE_MAP, + calculate_geoadj_from_rent, + get_spm_reference_thresholds, + spm_equivalence_scale, +) from policyengine_us.variables.household.demographic.geographic.state_name import ( StateName, ) @@ -491,6 +493,7 @@ def get_cd_index_mapping(db_uri: str = None): tuple: (cd_to_index dict, index_to_cd dict, cds_ordered list) """ from sqlalchemy import create_engine, text + from pathlib import Path from policyengine_us_data.storage import STORAGE_FOLDER if db_uri is None: @@ -531,7 +534,7 @@ def load_geo_labels(path) -> List[str]: def load_cd_geoadj_values( cds_to_calibrate: List[str], -) -> Dict[str, float]: +) -> Dict[str, Dict[str, float]]: """ Load geographic adjustment factors from rent data CSV. Uses median 2BR rent by CD vs national median to compute geoadj. @@ -550,11 +553,18 @@ def load_cd_geoadj_values( # Build lookup from rent data rent_lookup = {} for _, row in rent_df.iterrows(): - geoadj = calculate_geoadj_from_rent( - local_rent=row["median_2br_rent"], - national_rent=row["national_median_2br_rent"], - ) - rent_lookup[row["cd_geoid"]] = geoadj + rent_lookup[row["cd_geoid"]] = { + tenure: calculate_geoadj_from_rent( + local_rent=row["median_2br_rent"], + national_rent=row["national_median_2br_rent"], + tenure=tenure, + ) + for tenure in ( + "owner_with_mortgage", + "owner_without_mortgage", + "renter", + ) + } geoadj_dict = {} for cd in cds_to_calibrate: @@ -562,7 +572,11 @@ def load_cd_geoadj_values( geoadj_dict[cd] = rent_lookup[cd] else: print(f"Warning: No rent data for CD {cd}, using geoadj=1.0") - geoadj_dict[cd] = 1.0 + geoadj_dict[cd] = { + "owner_with_mortgage": 1.0, + "owner_without_mortgage": 1.0, + "renter": 1.0, + } return geoadj_dict @@ -593,6 +607,11 @@ def calculate_spm_thresholds_vectorized( Returns: Float32 array of SPM thresholds, one per SPM unit. """ + person_ages = np.asarray(person_ages) + person_spm_unit_ids = np.asarray(person_spm_unit_ids) + spm_unit_tenure_types = np.asarray(spm_unit_tenure_types) + spm_unit_geoadj = np.asarray(spm_unit_geoadj, dtype=np.float64) + n_units = len(spm_unit_tenure_types) # Count adults and children per SPM unit @@ -614,8 +633,7 @@ def calculate_spm_thresholds_vectorized( tenure_codes[mask] = code # Look up base thresholds - calc = SPMCalculator(year=year) - base_thresholds = calc.get_base_thresholds() + base_thresholds = get_spm_reference_thresholds(year) thresholds = np.zeros(n_units, dtype=np.float32) for i in range(n_units): diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index b3e6085a9..4a8bb8a16 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -8,11 +8,6 @@ python publish_local_area.py [--skip-download] [--states-only] [--upload] """ -import hashlib -import json -import shutil - - import numpy as np from pathlib import Path from typing import List @@ -30,6 +25,7 @@ ) from policyengine_us_data.calibration.block_assignment import ( derive_geography_from_blocks, + get_county_filter_probability, ) from policyengine_us_data.calibration.clone_and_assign import ( GeographyAssignment, @@ -45,76 +41,28 @@ CHECKPOINT_FILE_CITIES = Path("completed_cities.txt") WORK_DIR = Path("local_area_build") -NYC_COUNTY_FIPS = {"36005", "36047", "36061", "36081", "36085"} - - -META_FILE = WORK_DIR / "checkpoint_meta.json" - - -def compute_input_fingerprint( - weights_path: Path, dataset_path: Path, n_clones: int, seed: int -) -> str: - h = hashlib.sha256() - for p in [weights_path, dataset_path]: - with open(p, "rb") as f: - while chunk := f.read(8192): - h.update(chunk) - h.update(f"{n_clones}:{seed}".encode()) - return h.hexdigest()[:16] - - -def validate_or_clear_checkpoints(fingerprint: str): - if META_FILE.exists(): - stored = json.loads(META_FILE.read_text()) - if stored.get("fingerprint") == fingerprint: - print(f"Inputs unchanged ({fingerprint}), resuming...") - return - print( - f"Inputs changed " - f"({stored.get('fingerprint')} -> {fingerprint}), " - f"clearing..." - ) - else: - print(f"No checkpoint metadata, starting fresh ({fingerprint})") - h5_count = sum( - 1 - for subdir in ["states", "districts", "cities"] - if (WORK_DIR / subdir).exists() - for _ in (WORK_DIR / subdir).rglob("*.h5") - ) - if h5_count > 0: - print( - f"WARNING: {h5_count} H5 files exist. " - f"Clearing only checkpoint files, preserving H5s." - ) - for cp in [ - CHECKPOINT_FILE, - CHECKPOINT_FILE_DISTRICTS, - CHECKPOINT_FILE_CITIES, - ]: - if cp.exists(): - cp.unlink() - else: - for cp in [ - CHECKPOINT_FILE, - CHECKPOINT_FILE_DISTRICTS, - CHECKPOINT_FILE_CITIES, - ]: - if cp.exists(): - cp.unlink() - for subdir in ["states", "districts", "cities"]: - d = WORK_DIR / subdir - if d.exists(): - shutil.rmtree(d) - META_FILE.parent.mkdir(parents=True, exist_ok=True) - META_FILE.write_text(json.dumps({"fingerprint": fingerprint})) - - -SUB_ENTITIES = [ - "tax_unit", - "spm_unit", - "family", - "marital_unit", +NYC_COUNTIES = { + "QUEENS_COUNTY_NY", + "BRONX_COUNTY_NY", + "RICHMOND_COUNTY_NY", + "NEW_YORK_COUNTY_NY", + "KINGS_COUNTY_NY", +} + +NYC_CDS = [ + "3603", + "3605", + "3606", + "3607", + "3608", + "3609", + "3610", + "3611", + "3612", + "3613", + "3614", + "3615", + "3616", ] @@ -163,7 +111,7 @@ def build_h5( dataset_path: Path, output_path: Path, cd_subset: List[str] = None, - county_fips_filter: set = None, + county_filter: set = None, takeup_filter: List[str] = None, ) -> Path: """Build an H5 file by cloning records for each nonzero weight. @@ -174,8 +122,8 @@ def build_h5( dataset_path: Path to base dataset H5 file. output_path: Where to write the output H5 file. cd_subset: If provided, only include clones for these CDs. - county_fips_filter: If provided, zero out weights for clones - whose county FIPS is not in this set. + county_filter: If provided, scale weights by P(target|CD) + for city datasets. takeup_filter: List of takeup vars to apply. Returns: @@ -216,11 +164,17 @@ def build_h5( cd_mask = np.vectorize(lambda cd: cd in cd_subset_set)(clone_cds_matrix) W[~cd_mask] = 0 - # County FIPS filtering: zero out clones not in target counties - if county_fips_filter is not None: - fips_array = np.asarray(geography.county_fips).reshape(n_clones_total, n_hh) - fips_mask = np.isin(fips_array, list(county_fips_filter)) - W[~fips_mask] = 0 + # County filtering: scale weights by P(target_counties | CD) + if county_filter is not None: + unique_cds = np.unique(clone_cds_matrix) + cd_prob = { + cd: get_county_filter_probability(cd, county_filter) for cd in unique_cds + } + p_matrix = np.vectorize( + cd_prob.__getitem__, + otypes=[float], + )(clone_cds_matrix) + W *= p_matrix label = ( f"CD subset {cd_subset}" @@ -237,7 +191,7 @@ def build_h5( if n_clones == 0: raise ValueError( f"No active clones after filtering. " - f"cd_subset={cd_subset}, county_fips_filter={county_fips_filter}" + f"cd_subset={cd_subset}, county_filter={county_filter}" ) clone_weights = W[active_geo, active_hh] active_blocks = blocks.reshape(n_clones_total, n_hh)[active_geo, active_hh] @@ -258,6 +212,12 @@ def build_h5( for p_idx, p_hh_id in enumerate(person_hh_ids): hh_to_persons[hh_id_to_idx[int(p_hh_id)]].append(p_idx) + SUB_ENTITIES = [ + "tax_unit", + "spm_unit", + "family", + "marital_unit", + ] hh_to_entity = {} entity_id_arrays = {} person_entity_id_arrays = {} @@ -351,6 +311,22 @@ def build_h5( unique_geo = derive_geography_from_blocks(unique_blocks) clone_geo = {k: v[block_inv] for k, v in unique_geo.items()} + # === Calculate weights for all entity levels === + person_weights = np.repeat(clone_weights, persons_per_clone) + per_person_wt = clone_weights / np.maximum(persons_per_clone, 1) + + entity_weights = {} + for ek in SUB_ENTITIES: + n_ents = len(entity_clone_idx[ek]) + ent_person_counts = np.zeros(n_ents, dtype=np.int32) + np.add.at( + ent_person_counts, + new_person_entity_ids[ek], + 1, + ) + clone_ids_e = np.repeat(np.arange(n_clones), entities_per_clone[ek]) + entity_weights[ek] = per_person_wt[clone_ids_e] * ent_person_counts + # === Determine variables to save === vars_to_save = set(sim.input_variables) vars_to_save.add("county") @@ -400,6 +376,7 @@ def build_h5( for period in periods: values = holder.get_array(period) + # Convert Arrow-backed arrays to numpy before indexing if hasattr(values, "_pa_array") or hasattr(values, "_ndarray"): values = np.asarray(values) @@ -436,12 +413,16 @@ def build_h5( } # === Override weights === - # Only write household_weight; sub-entity weights (tax_unit_weight, - # spm_unit_weight, person_weight, etc.) are formula variables in - # policyengine-us that derive from household_weight at runtime. data["household_weight"] = { time_period: clone_weights.astype(np.float32), } + data["person_weight"] = { + time_period: person_weights.astype(np.float32), + } + for ek in SUB_ENTITIES: + data[f"{ek}_weight"] = { + time_period: entity_weights[ek].astype(np.float32), + } # === Override geography === data["state_fips"] = { @@ -470,13 +451,6 @@ def build_h5( time_period: clone_geo[gv].astype("S"), } - # === Set zip_code for LA County clones (ACA rating area fix) === - la_mask = clone_geo["county_fips"].astype(str) == "06037" - if la_mask.any(): - zip_codes = np.full(len(la_mask), "UNKNOWN") - zip_codes[la_mask] = "90001" - data["zip_code"] = {time_period: zip_codes.astype("S")} - # === Gap 4: Congressional district GEOID === clone_cd_geoids = np.array([int(cd) for cd in active_clone_cds], dtype=np.int32) data["congressional_district_geoid"] = { @@ -487,19 +461,11 @@ def build_h5( print("Recalculating SPM thresholds...") unique_cds_list = sorted(set(active_clone_cds)) cd_geoadj_values = load_cd_geoadj_values(unique_cds_list) - # Build per-SPM-unit geoadj from clone's CD - spm_clone_ids = np.repeat( - np.arange(n_clones, dtype=np.int64), - entities_per_clone["spm_unit"], - ) - spm_unit_geoadj = np.array( - [cd_geoadj_values[active_clone_cds[c]] for c in spm_clone_ids], - dtype=np.float64, - ) - # Get cloned person ages and SPM tenure types + # Get cloned person ages and SPM unit IDs person_ages = sim.calculate("age", map_to="person").values[person_clone_idx] + # Get cloned tenure types spm_tenure_holder = sim.get_holder("spm_unit_tenure_type") spm_tenure_periods = spm_tenure_holder.get_known_periods() if spm_tenure_periods: @@ -516,6 +482,25 @@ def build_h5( dtype="S30", ) + # Build per-SPM-unit geoadj from the clone's CD and tenure. + spm_clone_ids = np.repeat( + np.arange(n_clones, dtype=np.int64), + entities_per_clone["spm_unit"], + ) + spm_unit_geoadj = np.array( + [ + cd_geoadj_values[active_clone_cds[clone_id]][ + ( + spm_tenure_cloned[spm_unit_index] + .decode() + .lower() + ) + ] + for spm_unit_index, clone_id in enumerate(spm_clone_ids) + ], + dtype=np.float64, + ) + new_spm_thresholds = calculate_spm_thresholds_vectorized( person_ages=person_ages, person_spm_unit_ids=new_person_entity_ids["spm_unit"], @@ -556,7 +541,6 @@ def build_h5( hh_blocks=active_blocks, hh_state_fips=hh_state_fips, hh_ids=original_hh_ids, - hh_clone_indices=active_geo.astype(np.int64), entity_hh_indices=entity_hh_indices, entity_counts=entity_counts, time_period=time_period, @@ -754,6 +738,8 @@ def build_cities( """Build city H5 files with checkpointing, optionally uploading.""" w = np.load(weights_path) + all_cds = sorted(set(geography.cd_geoid.astype(str))) + cities_dir = output_dir / "cities" cities_dir.mkdir(parents=True, exist_ok=True) @@ -763,29 +749,34 @@ def build_cities( if "NYC" in completed_cities: print("Skipping NYC (already completed)") else: - output_path = cities_dir / "NYC.h5" - - try: - build_h5( - weights=w, - geography=geography, - dataset_path=dataset_path, - output_path=output_path, - county_fips_filter=NYC_COUNTY_FIPS, - takeup_filter=takeup_filter, - ) - - if upload: - print("Uploading NYC.h5 to GCP...") - upload_local_area_file(str(output_path), "cities", skip_hf=True) - hf_queue.append((str(output_path), "cities")) - - record_completed_city("NYC") - print("Completed NYC") - - except Exception as e: - print(f"ERROR building NYC: {e}") - raise + cd_subset = [cd for cd in all_cds if cd in NYC_CDS] + if not cd_subset: + print("No NYC-related CDs found, skipping") + else: + output_path = cities_dir / "NYC.h5" + + try: + build_h5( + weights=w, + geography=geography, + dataset_path=dataset_path, + output_path=output_path, + cd_subset=cd_subset, + county_filter=NYC_COUNTIES, + takeup_filter=takeup_filter, + ) + + if upload: + print("Uploading NYC.h5 to GCP...") + upload_local_area_file(str(output_path), "cities", skip_hf=True) + hf_queue.append((str(output_path), "cities")) + + record_completed_city("NYC") + print("Completed NYC") + + except Exception as e: + print(f"ERROR building NYC: {e}") + raise if upload and hf_queue: print(f"\nUploading batch of {len(hf_queue)} city files to HuggingFace...") @@ -880,19 +871,9 @@ def main(): print(f"Using dataset: {inputs['dataset']}") - print("Computing input fingerprint...") - fingerprint = compute_input_fingerprint( - inputs["weights"], - inputs["dataset"], - args.n_clones, - args.seed, - ) - validate_or_clear_checkpoints(fingerprint) - - print("Loading base simulation to get household count...") - _sim = Microsimulation(dataset=str(inputs["dataset"])) - n_hh = len(_sim.calculate("household_id", map_to="household").values) - del _sim + sim = Microsimulation(dataset=str(inputs["dataset"])) + n_hh = sim.calculate("household_id", map_to="household").shape[0] + del sim print(f"\nBase dataset has {n_hh:,} households") geo_cache = WORK_DIR / f"geography_{n_hh}x{args.n_clones}_s{args.seed}.npz" diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py new file mode 100644 index 000000000..8cd2a06b0 --- /dev/null +++ b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from policyengine_us import CountryTaxBenefitSystem +from policyengine_us_data.calibration.calibration_utils import ( + calculate_spm_thresholds_vectorized, + load_cd_geoadj_values, +) + +SYSTEM = CountryTaxBenefitSystem() +CPI_U = SYSTEM.parameters.gov.bls.cpi.cpi_u + + +def test_load_cd_geoadj_values_returns_tenure_specific_lookup(monkeypatch): + rent_df = pd.DataFrame( + { + "cd_id": ["0101"], + "median_2br_rent": [1_500.0], + "national_median_2br_rent": [1_000.0], + } + ) + monkeypatch.setattr( + "policyengine_us_data.calibration.calibration_utils.pd.read_csv", + lambda *args, **kwargs: rent_df, + ) + + geoadj_lookup = load_cd_geoadj_values(["101"]) + + assert geoadj_lookup["101"]["renter"] == pytest.approx(1.2215) + assert geoadj_lookup["101"]["owner_with_mortgage"] == pytest.approx(1.217) + assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx( + 1.1615 + ) + + +def test_calculate_spm_thresholds_vectorized_matches_policyengine_us_future_path(): + thresholds = calculate_spm_thresholds_vectorized( + person_ages=np.array([40, 35, 10, 8]), + person_spm_unit_ids=np.array([0, 0, 0, 0]), + spm_unit_tenure_types=np.array([b"RENTER"]), + spm_unit_geoadj=np.array([1.1]), + year=2027, + ) + + cpi_ratio = float(CPI_U("2027-02-01") / CPI_U("2024-02-01")) + expected = 39_430.0 * cpi_ratio * 1.1 + + assert thresholds[0] == pytest.approx(expected) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index ad3c9e9fb..3aba81693 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -1,7 +1,69 @@ -"""SPM threshold calculation utilities using the spm-calculator package.""" +"""SPM threshold calculation utilities aligned with PolicyEngine US.""" +from functools import lru_cache import numpy as np -from spm_calculator import SPMCalculator, spm_equivalence_scale +from policyengine_us import CountryTaxBenefitSystem + +PUBLISHED_SPM_REFERENCE_THRESHOLDS = { + 2015: { + "renter": 25_155.0, + "owner_with_mortgage": 24_859.0, + "owner_without_mortgage": 20_639.0, + }, + 2016: { + "renter": 25_558.0, + "owner_with_mortgage": 25_248.0, + "owner_without_mortgage": 20_943.0, + }, + 2017: { + "renter": 26_213.0, + "owner_with_mortgage": 25_897.0, + "owner_without_mortgage": 21_527.0, + }, + 2018: { + "renter": 26_905.0, + "owner_with_mortgage": 26_565.0, + "owner_without_mortgage": 22_095.0, + }, + 2019: { + "renter": 27_515.0, + "owner_with_mortgage": 27_172.0, + "owner_without_mortgage": 22_600.0, + }, + 2020: { + "renter": 28_881.0, + "owner_with_mortgage": 28_533.0, + "owner_without_mortgage": 23_948.0, + }, + 2021: { + "renter": 31_453.0, + "owner_with_mortgage": 31_089.0, + "owner_without_mortgage": 26_022.0, + }, + 2022: { + "renter": 33_402.0, + "owner_with_mortgage": 32_949.0, + "owner_without_mortgage": 27_679.0, + }, + 2023: { + "renter": 36_606.0, + "owner_with_mortgage": 36_192.0, + "owner_without_mortgage": 30_347.0, + }, + 2024: { + "renter": 39_430.0, + "owner_with_mortgage": 39_068.0, + "owner_without_mortgage": 32_586.0, + }, +} + +LATEST_PUBLISHED_SPM_THRESHOLD_YEAR = max(PUBLISHED_SPM_REFERENCE_THRESHOLDS) +REFERENCE_RAW_SCALE = 3**0.7 +TENURE_HOUSING_SHARES = { + "owner_with_mortgage": 0.434, + "owner_without_mortgage": 0.323, + "renter": 0.443, +} TENURE_CODE_MAP = { 1: "owner_with_mortgage", @@ -10,6 +72,81 @@ } +@lru_cache(maxsize=1) +def _get_cpi_u_parameter(): + system = CountryTaxBenefitSystem() + return system.parameters.gov.bls.cpi.cpi_u + + +def spm_equivalence_scale(num_adults, num_children): + adults, children = np.broadcast_arrays( + np.asarray(num_adults, dtype=float), + np.asarray(num_children, dtype=float), + ) + + raw = np.zeros_like(adults, dtype=float) + has_people = (adults + children) > 0 + with_children = has_people & (children > 0) + + single_adult_with_children = with_children & (adults <= 1) + raw[single_adult_with_children] = ( + 1.0 + + 0.8 + + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0) + ) ** 0.7 + + multi_adult_with_children = with_children & ~single_adult_with_children + raw[multi_adult_with_children] = ( + adults[multi_adult_with_children] + + 0.5 * children[multi_adult_with_children] + ) ** 0.7 + + no_children = has_people & ~with_children + one_adult = no_children & (adults <= 1) + two_adults = no_children & (adults == 2) + larger_adult_units = no_children & (adults > 2) + + raw[one_adult] = 1.0 + raw[two_adults] = 1.41 + raw[larger_adult_units] = adults[larger_adult_units] ** 0.7 + + return raw / REFERENCE_RAW_SCALE + + +def get_spm_reference_thresholds(year: int) -> dict[str, float]: + if year in PUBLISHED_SPM_REFERENCE_THRESHOLDS: + return PUBLISHED_SPM_REFERENCE_THRESHOLDS[year].copy() + + if year < min(PUBLISHED_SPM_REFERENCE_THRESHOLDS): + raise ValueError( + f"No published SPM reference thresholds for {year}. " + f"Earliest available year is {min(PUBLISHED_SPM_REFERENCE_THRESHOLDS)}." + ) + + cpi_u = _get_cpi_u_parameter() + factor = float( + cpi_u(f"{year}-02-01") + / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01") + ) + latest_thresholds = PUBLISHED_SPM_REFERENCE_THRESHOLDS[ + LATEST_PUBLISHED_SPM_THRESHOLD_YEAR + ] + return { + tenure: value * factor + for tenure, value in latest_thresholds.items() + } + + +def calculate_geoadj_from_rent( + local_rent, + national_rent: float, + tenure: str = "renter", +): + share = TENURE_HOUSING_SHARES[tenure] + rent_ratio = np.asarray(local_rent, dtype=float) / float(national_rent) + return rent_ratio * share + (1.0 - share) + + def calculate_spm_thresholds_with_geoadj( num_adults: np.ndarray, num_children: np.ndarray, @@ -35,8 +172,7 @@ def calculate_spm_thresholds_with_geoadj( Returns: Array of SPM threshold values. """ - calc = SPMCalculator(year=year) - base_thresholds = calc.get_base_thresholds() + base_thresholds = get_spm_reference_thresholds(year) n = len(num_adults) thresholds = np.zeros(n) From 418d73c3c70e46e5c4d89f864cc1a241eeb1783b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 7 Apr 2026 15:59:44 -0400 Subject: [PATCH 02/14] Split housing benchmarks by Census and HUD concepts --- .../db/etl_national_targets.py | 11 +- .../pull_hardcoded_targets.py | 82 +++++---- policyengine_us_data/utils/census_spm.py | 65 +++++++ policyengine_us_data/utils/hud_housing.py | 49 ++++++ policyengine_us_data/utils/loss.py | 7 +- tests/unit/test_census_spm_housing_targets.py | 161 ++++++++++++++++++ validation/benefit_validation.py | 95 ++++++++--- 7 files changed, 403 insertions(+), 67 deletions(-) create mode 100644 policyengine_us_data/utils/census_spm.py create mode 100644 policyengine_us_data/utils/hud_housing.py create mode 100644 tests/unit/test_census_spm_housing_targets.py diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 773411dc5..d1c459656 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -12,6 +12,9 @@ from policyengine_us_data.storage.calibration_targets.soi_metadata import ( RETIREMENT_CONTRIBUTION_TARGETS, ) +from policyengine_us_data.utils.census_spm import ( + build_census_spm_capped_housing_subsidy_target, +) from policyengine_us_data.utils.db import ( DEFAULT_DATASET, etl_argparser, @@ -187,13 +190,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "Work and childcare expenses for SPM", "year": HARDCODED_YEAR, }, - { - "variable": "spm_unit_capped_housing_subsidy", - "value": 35e9, - "source": "HUD/Census", - "notes": "Housing subsidies", - "year": HARDCODED_YEAR, - }, + build_census_spm_capped_housing_subsidy_target(HARDCODED_YEAR), { "variable": "tanf", "value": 9e9, diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index 3199a56a2..c1478d957 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -1,52 +1,66 @@ import pandas as pd import numpy as np from policyengine_us_data.storage import CALIBRATION_FOLDER +from policyengine_us_data.utils.census_spm import ( + get_census_spm_capped_housing_subsidy_total, +) """ -Hardcoded targets for the year 2024 from CPS-derived statistics and other sources. Include medical expenses, sum of SPM thresholds, and child support expenses. +Hardcoded targets for the year 2024 from CPS-derived statistics and other +sources. Housing uses the Census SPM capped subsidy concept, not HUD spending. """ -HARD_CODED_TOTALS = { - "health_insurance_premiums_without_medicare_part_b": 385e9, - "other_medical_expenses": 278e9, - "medicare_part_b_premiums": 112e9, - "over_the_counter_health_expenses": 72e9, - "spm_unit_spm_threshold": 3_945e9, - "child_support_expense": 33e9, - "child_support_received": 33e9, - "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, - "tanf": 9e9, - # Alimony could be targeted via SOI - "alimony_income": 13e9, - "alimony_expense": 13e9, - # Rough estimate, not CPS derived - "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections - "rent": 735e9, # ACS total uprated by CPI - # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics - # shows $38,316,190,000 in Box 7: Social security tips (2018) - # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC - # Assume 40% through 2024 - "tip_income": 38e9 * 1.4, -} +def get_hard_coded_totals( + year: int = 2024, + storage_folder=None, +) -> dict[str, float]: + return { + "health_insurance_premiums_without_medicare_part_b": 385e9, + "other_medical_expenses": 278e9, + "medicare_part_b_premiums": 112e9, + "over_the_counter_health_expenses": 72e9, + "spm_unit_spm_threshold": 3_945e9, + "child_support_expense": 33e9, + "child_support_received": 33e9, + "spm_unit_capped_work_childcare_expenses": 348e9, + "spm_unit_capped_housing_subsidy": get_census_spm_capped_housing_subsidy_total( + year, storage_folder=storage_folder + ), + "tanf": 9e9, + # Alimony could be targeted via SOI + "alimony_income": 13e9, + "alimony_expense": 13e9, + # Rough estimate, not CPS derived + "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections + "rent": 735e9, # ACS total uprated by CPI + # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics + # shows $38,316,190,000 in Box 7: Social security tips (2018) + # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC + # Assume 40% through 2024 + "tip_income": 38e9 * 1.4, + } -def pull_hardcoded_targets(): +def pull_hardcoded_targets(year: int = 2024, storage_folder=None): """ Returns a DataFrame with hardcoded targets for various CPS-derived statistics and other sources. """ + hard_coded_totals = get_hard_coded_totals( + year=year, + storage_folder=storage_folder, + ) data = { - "DATA_SOURCE": ["hardcoded"] * len(HARD_CODED_TOTALS), - "GEO_ID": ["0000000US"] * len(HARD_CODED_TOTALS), - "GEO_NAME": ["national"] * len(HARD_CODED_TOTALS), - "VARIABLE": list(HARD_CODED_TOTALS.keys()), - "VALUE": list(HARD_CODED_TOTALS.values()), + "DATA_SOURCE": ["hardcoded"] * len(hard_coded_totals), + "GEO_ID": ["0000000US"] * len(hard_coded_totals), + "GEO_NAME": ["national"] * len(hard_coded_totals), + "VARIABLE": list(hard_coded_totals.keys()), + "VALUE": list(hard_coded_totals.values()), "IS_COUNT": [0.0] - * len(HARD_CODED_TOTALS), # All values are monetary amounts, not counts + * len(hard_coded_totals), # All values are monetary amounts, not counts "BREAKDOWN_VARIABLE": [np.nan] - * len(HARD_CODED_TOTALS), # No breakdown variable for hardcoded targets - "LOWER_BOUND": [np.nan] * len(HARD_CODED_TOTALS), - "UPPER_BOUND": [np.nan] * len(HARD_CODED_TOTALS), + * len(hard_coded_totals), # No breakdown variable for hardcoded targets + "LOWER_BOUND": [np.nan] * len(hard_coded_totals), + "UPPER_BOUND": [np.nan] * len(hard_coded_totals), } df = pd.DataFrame(data) diff --git a/policyengine_us_data/utils/census_spm.py b/policyengine_us_data/utils/census_spm.py new file mode 100644 index 000000000..e7a9c32b4 --- /dev/null +++ b/policyengine_us_data/utils/census_spm.py @@ -0,0 +1,65 @@ +from pathlib import Path + +import pandas as pd + +from policyengine_us_data.storage import STORAGE_FOLDER + +# These totals are derived from raw public-use Census CPS ASEC files by +# summing SPM_CAPHOUSESUB * SPM_WEIGHT / 100 at the SPM-unit level. +# They represent the Census SPM capped housing subsidy concept, not HUD +# spending or outlays. +CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS = { + 2022: 29_549_204_420.92, + 2023: 31_844_144_470.85, + 2024: 33_649_114_150.37, +} + + +def get_census_spm_capped_housing_subsidy_total( + year: int, + storage_folder: Path | str | None = None, +) -> float: + """ + Return the Census CPS ASEC total for SPM_CAPHOUSESUB. + + If a storage folder is provided, recompute directly from the local + raw Census CPS HDF file. Otherwise, use the checked-in year-specific + totals above so callers do not depend on local data files at import + time. + """ + + if storage_folder is None: + if year not in CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS: + raise ValueError( + f"No published Census SPM capped housing subsidy total for {year}." + ) + return CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS[year] + + storage_path = Path(storage_folder) / f"census_cps_{year}.h5" + if not storage_path.exists(): + raise FileNotFoundError( + f"Missing raw Census CPS file for {year}: {storage_path}" + ) + + with pd.HDFStore(storage_path, mode="r") as store: + spm_unit = store["spm_unit"][["SPM_CAPHOUSESUB", "SPM_WEIGHT"]] + + return float((spm_unit.SPM_CAPHOUSESUB * spm_unit.SPM_WEIGHT).sum() / 100) + + +def build_census_spm_capped_housing_subsidy_target( + year: int, + storage_folder: Path | str | None = None, +) -> dict: + return { + "variable": "spm_unit_capped_housing_subsidy", + "value": get_census_spm_capped_housing_subsidy_total( + year, storage_folder=storage_folder + ), + "source": "Census CPS ASEC public-use SPM_CAPHOUSESUB", + "notes": ( + "Capped SPM housing subsidy total from raw Census CPS ASEC " + "SPM_CAPHOUSESUB; not HUD spending or outlays." + ), + "year": year, + } diff --git a/policyengine_us_data/utils/hud_housing.py b/policyengine_us_data/utils/hud_housing.py new file mode 100644 index 000000000..1b7620e5d --- /dev/null +++ b/policyengine_us_data/utils/hud_housing.py @@ -0,0 +1,49 @@ +HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS = { + 2022: { + "reported_households": 4_537_614, + "average_monthly_spending_per_unit": 899, + }, + 2023: { + "reported_households": 4_569_973, + "average_monthly_spending_per_unit": 989, + }, + 2024: { + "reported_households": 4_584_691, + "average_monthly_spending_per_unit": 1_067, + }, + 2025: { + "reported_households": 4_519_561, + "average_monthly_spending_per_unit": 1_135, + }, +} + + +def get_hud_user_housing_benchmark(year: int) -> dict: + if year not in HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS: + raise ValueError(f"No HUD USER housing benchmark for {year}.") + + benchmark = dict(HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS[year]) + benchmark["annual_spending_total"] = ( + benchmark["reported_households"] + * benchmark["average_monthly_spending_per_unit"] + * 12 + ) + return benchmark + + +def build_hud_user_housing_assistance_benchmark(year: int) -> dict: + benchmark = get_hud_user_housing_benchmark(year) + return { + "variable": "housing_assistance", + "annual_spending_total": benchmark["annual_spending_total"], + "reported_households": benchmark["reported_households"], + "average_monthly_spending_per_unit": benchmark[ + "average_monthly_spending_per_unit" + ], + "source": "HUD USER Picture of Subsidized Households", + "notes": ( + "Annual federal spending and occupied assisted households from " + "HUD USER; not Census SPM capped subsidy." + ), + "year": year, + } diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index e141cdf43..6d8b35f46 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -12,6 +12,9 @@ from policyengine_us_data.storage.calibration_targets.soi_metadata import ( RETIREMENT_CONTRIBUTION_TARGETS, ) +from policyengine_us_data.utils.census_spm import ( + get_census_spm_capped_housing_subsidy_total, +) from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi @@ -31,7 +34,9 @@ "child_support_expense": 33e9, "child_support_received": 33e9, "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, + "spm_unit_capped_housing_subsidy": get_census_spm_capped_housing_subsidy_total( + 2024 + ), "tanf": 9e9, # Alimony could be targeted via SOI "alimony_income": 13e9, diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py new file mode 100644 index 000000000..6693ffe9a --- /dev/null +++ b/tests/unit/test_census_spm_housing_targets.py @@ -0,0 +1,161 @@ +import sys +import types + +import pandas as pd +import validation.benefit_validation as benefit_validation + +from policyengine_us_data.db import etl_national_targets +from policyengine_us_data.storage.calibration_targets.pull_hardcoded_targets import ( + pull_hardcoded_targets, +) +from policyengine_us_data.utils.census_spm import ( + build_census_spm_capped_housing_subsidy_target, + get_census_spm_capped_housing_subsidy_total, +) +from policyengine_us_data.utils.hud_housing import ( + build_hud_user_housing_assistance_benchmark, + get_hud_user_housing_benchmark, +) + + +def _write_census_cps(storage_dir, year=2024): + path = storage_dir / f"census_cps_{year}.h5" + with pd.HDFStore(path, mode="w") as store: + store["spm_unit"] = pd.DataFrame( + { + "SPM_CAPHOUSESUB": [1_000.0, 0.0, 2_000.0], + "SPM_WEIGHT": [150, 250, 100], + } + ) + return path + + +def test_get_census_spm_capped_housing_subsidy_total_uses_raw_cps_spm_values( + tmp_path, +): + _write_census_cps(tmp_path) + + total = get_census_spm_capped_housing_subsidy_total( + 2024, storage_folder=tmp_path + ) + + expected = (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 + assert total == expected + + +def test_pull_hardcoded_targets_uses_census_spm_housing_total(tmp_path): + _write_census_cps(tmp_path) + + targets = pull_hardcoded_targets(year=2024, storage_folder=tmp_path) + housing = targets.loc[ + targets.VARIABLE == "spm_unit_capped_housing_subsidy", "VALUE" + ].iat[0] + + assert housing == (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 + + +def test_extract_national_targets_uses_census_spm_housing_target(monkeypatch): + class DummyMicrosimulation: + def __init__(self, dataset): + self.default_calculation_period = 2024 + + monkeypatch.setitem( + sys.modules, + "policyengine_us", + types.SimpleNamespace(Microsimulation=DummyMicrosimulation), + ) + monkeypatch.setattr( + etl_national_targets, + "build_census_spm_capped_housing_subsidy_target", + lambda year, storage_folder=None: { + "variable": "spm_unit_capped_housing_subsidy", + "value": 123.0, + "source": "Census CPS ASEC public-use SPM_CAPHOUSESUB", + "notes": "Capped SPM housing subsidy total from raw Census CPS ASEC.", + "year": year, + }, + ) + + targets = etl_national_targets.extract_national_targets("dummy") + housing = next( + target + for target in targets["direct_sum_targets"] + if target["variable"] == "spm_unit_capped_housing_subsidy" + ) + + assert housing["value"] == 123.0 + assert housing["source"] == "Census CPS ASEC public-use SPM_CAPHOUSESUB" + assert "Capped SPM housing subsidy" in housing["notes"] + + +def test_build_census_spm_capped_housing_subsidy_target_labels_concept(tmp_path): + _write_census_cps(tmp_path) + + target = build_census_spm_capped_housing_subsidy_target( + 2024, storage_folder=tmp_path + ) + + assert target["variable"] == "spm_unit_capped_housing_subsidy" + assert target["value"] == (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 + assert target["source"] == "Census CPS ASEC public-use SPM_CAPHOUSESUB" + assert "not HUD spending" in target["notes"] + + +def test_get_hud_user_housing_benchmark_uses_annual_spending_concept(): + benchmark = get_hud_user_housing_benchmark(2022) + + assert benchmark["reported_households"] == 4_537_614 + assert benchmark["average_monthly_spending_per_unit"] == 899 + assert benchmark["annual_spending_total"] == 48_951_779_832 + + +def test_build_hud_user_housing_assistance_benchmark_labels_admin_concept(): + benchmark = build_hud_user_housing_assistance_benchmark(2022) + + assert benchmark["variable"] == "housing_assistance" + assert benchmark["source"] == "HUD USER Picture of Subsidized Households" + assert benchmark["reported_households"] == 4_537_614 + assert benchmark["annual_spending_total"] == 48_951_779_832 + assert "not Census SPM capped subsidy" in benchmark["notes"] + + +def test_get_program_benchmarks_keeps_housing_concepts_separate(monkeypatch): + monkeypatch.setattr( + benefit_validation, + "build_census_spm_capped_housing_subsidy_target", + lambda year: { + "variable": "spm_unit_capped_housing_subsidy", + "value": 111e9, + "source": "Census benchmark", + "notes": "Census concept", + "year": year, + }, + ) + monkeypatch.setattr( + benefit_validation, + "build_hud_user_housing_assistance_benchmark", + lambda year: { + "variable": "housing_assistance", + "annual_spending_total": 222e9, + "reported_households": 3_000_000, + "average_monthly_spending_per_unit": 999, + "source": "HUD USER benchmark", + "notes": "HUD concept", + "year": year, + }, + ) + + benchmarks = benefit_validation.get_program_benchmarks(2022) + + assert benchmarks["housing_spm_capped"]["variable"] == "spm_unit_capped_housing_subsidy" + assert benchmarks["housing_spm_capped"]["benchmark_total"] == 111 + assert benchmarks["housing_spm_capped"]["benchmark_source"] == "Census benchmark" + + assert benchmarks["housing_assistance_hud_user"]["variable"] == "housing_assistance" + assert benchmarks["housing_assistance_hud_user"]["benchmark_total"] == 222 + assert benchmarks["housing_assistance_hud_user"]["benchmark_source"] == "HUD USER benchmark" + assert ( + benchmarks["housing_assistance_hud_user"]["benchmark_participants_millions"] + == 3 + ) + assert benchmarks["housing_assistance_hud_user"]["map_to"] == "household" diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py index cf4689720..723fe6a08 100644 --- a/validation/benefit_validation.py +++ b/validation/benefit_validation.py @@ -9,41 +9,75 @@ import numpy as np from policyengine_us import Microsimulation from policyengine_us_data.datasets.cps.enhanced_cps import EnhancedCPS - - -def analyze_benefit_underreporting(): - """Analyze impact of CPS benefit underreporting on results.""" - - sim = Microsimulation(dataset=EnhancedCPS, dataset_year=2022) - - programs = { +from policyengine_us_data.utils.census_spm import ( + build_census_spm_capped_housing_subsidy_target, +) +from policyengine_us_data.utils.hud_housing import ( + build_hud_user_housing_assistance_benchmark, +) + + +def get_program_benchmarks(year: int): + housing_spm_target = build_census_spm_capped_housing_subsidy_target(year) + housing_hud_target = build_hud_user_housing_assistance_benchmark(year) + return { "snap": { "variable": "snap", - "admin_total": 114.1, # billions, from USDA + "benchmark_total": 114.1, # billions, from USDA + "benchmark_source": "USDA", "participation_rate": 0.82, # from USDA + "weight_variable": "spm_unit_weight", }, "ssi": { "variable": "ssi", - "admin_total": 56.7, # from SSA + "benchmark_total": 56.7, # from SSA + "benchmark_source": "SSA", "participation_rate": 0.93, + "weight_variable": "spm_unit_weight", }, "tanf": { "variable": "tanf", - "admin_total": 7.1, # from HHS + "benchmark_total": 7.1, # from HHS + "benchmark_source": "HHS", "participation_rate": 0.23, + "weight_variable": "spm_unit_weight", }, - "housing": { - "variable": "housing_benefit", - "admin_total": 50.3, # from HUD - "participation_rate": 0.76, + "housing_spm_capped": { + "variable": "spm_unit_capped_housing_subsidy", + "benchmark_total": housing_spm_target["value"] / 1e9, + "benchmark_source": housing_spm_target["source"], + "participation_rate": np.nan, + "weight_variable": "spm_unit_weight", + }, + "housing_assistance_hud_user": { + "variable": "housing_assistance", + "benchmark_total": housing_hud_target["annual_spending_total"] / 1e9, + "benchmark_source": housing_hud_target["source"], + "benchmark_participants_millions": ( + housing_hud_target["reported_households"] / 1e6 + ), + "participation_rate": np.nan, + "weight_variable": "household_weight", + "map_to": "household", }, } + +def analyze_benefit_underreporting(): + """Analyze impact of CPS benefit underreporting on results.""" + + year = 2022 + sim = Microsimulation(dataset=EnhancedCPS, dataset_year=year) + programs = get_program_benchmarks(year) + results = [] for program, info in programs.items(): # Calculate totals - benefit = sim.calculate(info["variable"], 2022) - weight = sim.calculate("household_weight", 2022) + benefit_kwargs = {} + if info.get("map_to"): + benefit_kwargs["map_to"] = info["map_to"] + benefit = sim.calculate(info["variable"], year, **benefit_kwargs) + weight = sim.calculate(info["weight_variable"], year) # Total benefits total = (benefit * weight).sum() / 1e9 # billions @@ -53,15 +87,26 @@ def analyze_benefit_underreporting(): weighted_participants = ((benefit > 0) * weight).sum() / 1e6 # millions # Underreporting factor - underreporting = info["admin_total"] / total if total > 0 else np.inf + benchmark_ratio = ( + info["benchmark_total"] / total if total > 0 else np.inf + ) + benchmark_participants = info.get("benchmark_participants_millions") + participant_ratio = ( + weighted_participants / benchmark_participants + if benchmark_participants not in (None, 0) + else np.nan + ) results.append( { "program": program, "enhanced_cps_total": total, - "admin_total": info["admin_total"], - "underreporting_factor": underreporting, + "benchmark_total": info["benchmark_total"], + "benchmark_source": info["benchmark_source"], + "benchmark_ratio": benchmark_ratio, "participants_millions": weighted_participants, + "benchmark_participants_millions": benchmark_participants, + "participant_ratio": participant_ratio, "mean_benefit": ( benefit[benefit > 0].mean() if (benefit > 0).any() else 0 ), @@ -77,11 +122,11 @@ def validate_program_interactions(): sim = Microsimulation(dataset=EnhancedCPS, dataset_year=2022) # Get program benefits - snap = sim.calculate("snap", 2022) > 0 - medicaid = sim.calculate("medicaid", 2022) > 0 - ssi = sim.calculate("ssi", 2022) > 0 - tanf = sim.calculate("tanf", 2022) > 0 - housing = sim.calculate("housing_benefit", 2022) > 0 + snap = sim.calculate("snap", 2022, map_to="household") > 0 + medicaid = sim.calculate("medicaid", 2022, map_to="household") > 0 + ssi = sim.calculate("ssi", 2022, map_to="household") > 0 + tanf = sim.calculate("tanf", 2022, map_to="household") > 0 + housing = sim.calculate("housing_assistance", 2022, map_to="household") > 0 weight = sim.calculate("household_weight", 2022) From c71e53aaab58a06950faa20f02856e93969df382 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 09:28:37 -0400 Subject: [PATCH 03/14] Stabilize clone-half SPM thresholds and weight priors --- .../datasets/cps/enhanced_cps.py | 48 ++++++++-- .../datasets/cps/extended_cps.py | 89 ++++++++++++++++++- tests/unit/test_extended_cps.py | 46 ++++++++++ 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 275d71e2b..e0ce2c208 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,42 @@ torch = None +def initialize_weight_priors( + original_weights: np.ndarray, + seed: int = 1456, + epsilon: float = 1e-6, + positive_jitter_scale: float = 0.01, +) -> np.ndarray: + """Build deterministic positive priors for sparse reweighting. + + Original CPS households should keep priors close to their survey + weights. Clone-half households start with zero weight on purpose, so + they should receive only a tiny positive epsilon to keep the log + optimization well-defined without giving them a meaningful head start. + """ + + weights = np.asarray(original_weights, dtype=np.float64) + if np.any(weights < 0): + raise ValueError("original_weights must be non-negative") + + rng = np.random.default_rng(seed) + priors = np.empty_like(weights, dtype=np.float64) + + positive_mask = weights > 0 + if positive_mask.any(): + jitter = np.maximum( + rng.normal(loc=1.0, scale=positive_jitter_scale, size=positive_mask.sum()), + 0.5, + ) + priors[positive_mask] = np.maximum(weights[positive_mask] * jitter, epsilon) + + zero_mask = ~positive_mask + if zero_mask.any(): + priors[zero_mask] = epsilon * rng.uniform(1.0, 2.0, size=zero_mask.sum()) + + return priors + + def _get_period_array(period_values: dict, period: int) -> np.ndarray: """Get a period array from a TIME_PERIOD_ARRAYS variable dict.""" value = period_values.get(period) @@ -191,9 +227,9 @@ def generate(self): data = sim.dataset.load_dataset() base_year = int(sim.default_calculation_period) data["household_weight"] = {} - original_weights = sim.calculate("household_weight") - original_weights = original_weights.values + np.random.normal( - 1, 0.1, len(original_weights) + original_weights = initialize_weight_priors( + sim.calculate("household_weight").values, + seed=1456, ) bad_targets = [ @@ -328,9 +364,9 @@ def generate(self): sim = Microsimulation(dataset=self.input_dataset) data = sim.dataset.load_dataset() - original_weights = sim.calculate("household_weight") - original_weights = original_weights.values + np.random.normal( - 1, 0.1, len(original_weights) + original_weights = initialize_weight_priors( + sim.calculate("household_weight").values, + seed=1456, ) for year in [2024]: loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 8b9041c0c..b110b536c 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -18,6 +18,10 @@ convert_mortgage_interest_to_structural_inputs, impute_tax_unit_mortgage_balance_hints, ) +from policyengine_us_data.utils.spm import ( + get_spm_reference_thresholds, + spm_equivalence_scale, +) from policyengine_us_data.utils.policyengine import has_policyengine_us_variables from policyengine_us_data.utils.retirement_limits import ( get_retirement_limits, @@ -77,7 +81,6 @@ def _supports_structural_mortgage_inputs() -> bool: "spm_unit_federal_tax_reported", "spm_unit_state_tax_reported", "spm_unit_capped_work_childcare_expenses", - "spm_unit_spm_threshold", "spm_unit_net_income_reported", "spm_unit_pre_subsidy_childcare_expenses", # Medical expenses @@ -325,6 +328,15 @@ def reconcile_ss_subcomponents(predictions, total_ss): "social_security_survivors", } +_SPM_TENURE_TO_REFERENCE_KEY = { + b"OWNER_WITH_MORTGAGE": "owner_with_mortgage", + b"OWNER_WITHOUT_MORTGAGE": "owner_without_mortgage", + b"RENTER": "renter", + "OWNER_WITH_MORTGAGE": "owner_with_mortgage", + "OWNER_WITHOUT_MORTGAGE": "owner_without_mortgage", + "RENTER": "renter", +} + def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" @@ -369,6 +381,73 @@ def _apply_post_processing(predictions, X_test, time_period, data): return predictions +def _index_into_spm_units( + person_spm_unit_ids: np.ndarray, + spm_unit_ids: np.ndarray, +) -> np.ndarray: + if np.all(spm_unit_ids[:-1] <= spm_unit_ids[1:]): + indices = np.searchsorted(spm_unit_ids, person_spm_unit_ids) + valid = (indices >= 0) & (indices < len(spm_unit_ids)) + if valid.all() and np.array_equal(spm_unit_ids[indices], person_spm_unit_ids): + return indices + + indexer = pd.Index(spm_unit_ids).get_indexer(person_spm_unit_ids) + if (indexer < 0).any(): + raise ValueError("person_spm_unit_id contains values missing from spm_unit_id") + return indexer + + +def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: + """Rebuild cloned SPM thresholds from donor-half geography. + + The clone half should keep the donor household's geography but use the + current threshold formula rather than a learned QRF output. We infer the + donor-half geographic adjustment from the stored first-half thresholds and + apply that same geography to the corresponding clone-half SPM units. + """ + + person_ages = np.asarray(data["age"][time_period]) + person_spm_unit_ids = np.asarray(data["person_spm_unit_id"][time_period]) + spm_unit_ids = np.asarray(data["spm_unit_id"][time_period]) + tenure_types = np.asarray(data["spm_unit_tenure_type"][time_period]) + stored_thresholds = np.asarray(data["spm_unit_spm_threshold"][time_period], dtype=float) + + n_units = len(spm_unit_ids) + if n_units % 2 != 0: + raise ValueError("Expected cloned SPM units to have an even-length spm_unit_id array") + + unit_indices = _index_into_spm_units(person_spm_unit_ids, spm_unit_ids) + is_adult = person_ages >= 18 + adult_counts = np.zeros(n_units, dtype=np.int32) + child_counts = np.zeros(n_units, dtype=np.int32) + np.add.at(adult_counts, unit_indices, is_adult.astype(np.int32)) + np.add.at(child_counts, unit_indices, (~is_adult).astype(np.int32)) + + thresholds_by_tenure = get_spm_reference_thresholds(time_period) + reference_base = np.array( + [ + thresholds_by_tenure[ + _SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter") + ] + for tenure in tenure_types + ], + dtype=float, + ) + equivalence_scale = spm_equivalence_scale(adult_counts, child_counts) + + n_half = n_units // 2 + donor_denominator = reference_base[:n_half] * equivalence_scale[:n_half] + donor_geoadj = np.divide( + stored_thresholds[:n_half], + donor_denominator, + out=np.ones(n_half, dtype=float), + where=donor_denominator > 0, + ) + geoadj = np.concatenate([donor_geoadj, donor_geoadj]) + rebuilt = reference_base * equivalence_scale * geoadj + return rebuilt.astype(np.float32) + + def _splice_cps_only_predictions( data: dict, predictions: pd.DataFrame, @@ -489,6 +568,14 @@ def generate(self): time_period=self.time_period, dataset_path=str(self.cps.file_path), ) + if "spm_unit_spm_threshold" in new_data: + logger.info("Rebuilding cloned SPM thresholds from donor-half geography") + new_data["spm_unit_spm_threshold"] = { + self.time_period: rebuild_cloned_spm_thresholds( + new_data, + self.time_period, + ) + } new_data = self._rename_imputed_to_inputs(new_data) if _supports_structural_mortgage_inputs(): diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index a44977bdc..75bb66101 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -11,6 +11,7 @@ import pandas as pd import pytest +from policyengine_us_data.datasets.cps.enhanced_cps import initialize_weight_priors from policyengine_us_data.calibration.puf_impute import ( IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, @@ -19,6 +20,7 @@ CPS_ONLY_IMPUTED_VARIABLES, CPS_STAGE2_INCOME_PREDICTORS, apply_retirement_constraints, + rebuild_cloned_spm_thresholds, reconcile_ss_subcomponents, ) from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES @@ -110,6 +112,7 @@ def test_pension_income_not_in_cps_only(self): should_not_be_here = { "taxable_private_pension_income", "tax_exempt_private_pension_income", + "spm_unit_spm_threshold", } present = should_not_be_here & set(CPS_ONLY_IMPUTED_VARIABLES) assert present == set(), ( @@ -117,6 +120,49 @@ def test_pension_income_not_in_cps_only(self): ) +class TestDeterministicSpmThresholdRebuild: + def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self): + data = { + "age": {2024: np.array([40, 12, 70, 40, 12, 70], dtype=np.int32)}, + "person_spm_unit_id": {2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)}, + "spm_unit_id": {2024: np.array([10, 20, 30, 40], dtype=np.int32)}, + "spm_unit_tenure_type": { + 2024: np.array( + [b"RENTER", b"OWNER_WITHOUT_MORTGAGE", b"RENTER", b"OWNER_WITHOUT_MORTGAGE"] + ) + }, + "spm_unit_spm_threshold": { + 2024: np.array([20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64) + }, + } + + rebuilt = rebuild_cloned_spm_thresholds(data, 2024) + + np.testing.assert_allclose(rebuilt[:2], np.array([20_000.0, 15_000.0])) + np.testing.assert_allclose(rebuilt[2:], np.array([20_000.0, 15_000.0])) + + +class TestWeightPriorInitialization: + def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self): + weights = np.array([1_500.0, 0.0, 625.0, 0.0], dtype=np.float64) + + priors = initialize_weight_priors(weights, seed=123) + + assert np.all(priors > 0) + assert priors[1] < 1e-4 + assert priors[3] < 1e-4 + assert priors[0] == pytest.approx(1_500.0, rel=0.05) + assert priors[2] == pytest.approx(625.0, rel=0.05) + + def test_initialize_weight_priors_is_reproducible(self): + weights = np.array([400.0, 0.0, 100.0], dtype=np.float64) + + priors_a = initialize_weight_priors(weights, seed=77) + priors_b = initialize_weight_priors(weights, seed=77) + + np.testing.assert_allclose(priors_a, priors_b) + + class TestRetirementConstraints: """Post-processing retirement constraints enforce IRS caps.""" From 84e425991c01166ff1406401ce89b21974c1637f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 09:52:55 -0400 Subject: [PATCH 04/14] Format clone-half threshold fixes --- policyengine_us_data/datasets/cps/extended_cps.py | 12 +++++++----- tests/unit/test_extended_cps.py | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index b110b536c..22a443804 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -410,11 +410,15 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: person_spm_unit_ids = np.asarray(data["person_spm_unit_id"][time_period]) spm_unit_ids = np.asarray(data["spm_unit_id"][time_period]) tenure_types = np.asarray(data["spm_unit_tenure_type"][time_period]) - stored_thresholds = np.asarray(data["spm_unit_spm_threshold"][time_period], dtype=float) + stored_thresholds = np.asarray( + data["spm_unit_spm_threshold"][time_period], dtype=float + ) n_units = len(spm_unit_ids) if n_units % 2 != 0: - raise ValueError("Expected cloned SPM units to have an even-length spm_unit_id array") + raise ValueError( + "Expected cloned SPM units to have an even-length spm_unit_id array" + ) unit_indices = _index_into_spm_units(person_spm_unit_ids, spm_unit_ids) is_adult = person_ages >= 18 @@ -426,9 +430,7 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: thresholds_by_tenure = get_spm_reference_thresholds(time_period) reference_base = np.array( [ - thresholds_by_tenure[ - _SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter") - ] + thresholds_by_tenure[_SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")] for tenure in tenure_types ], dtype=float, diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index 75bb66101..c278d0ea1 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -124,15 +124,24 @@ class TestDeterministicSpmThresholdRebuild: def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self): data = { "age": {2024: np.array([40, 12, 70, 40, 12, 70], dtype=np.int32)}, - "person_spm_unit_id": {2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)}, + "person_spm_unit_id": { + 2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32) + }, "spm_unit_id": {2024: np.array([10, 20, 30, 40], dtype=np.int32)}, "spm_unit_tenure_type": { 2024: np.array( - [b"RENTER", b"OWNER_WITHOUT_MORTGAGE", b"RENTER", b"OWNER_WITHOUT_MORTGAGE"] + [ + b"RENTER", + b"OWNER_WITHOUT_MORTGAGE", + b"RENTER", + b"OWNER_WITHOUT_MORTGAGE", + ] ) }, "spm_unit_spm_threshold": { - 2024: np.array([20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64) + 2024: np.array( + [20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64 + ) }, } From 289bbc7137ac68534ec61250d1323bbb1e91f446 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 09:54:20 -0400 Subject: [PATCH 05/14] Format SPM benchmark and calibration files --- .../calibration/publish_local_area.py | 6 +----- .../calibration_targets/pull_hardcoded_targets.py | 1 + .../test_spm_thresholds.py | 4 +--- policyengine_us_data/utils/spm.py | 15 ++++----------- tests/unit/test_census_spm_housing_targets.py | 14 +++++++++----- validation/benefit_validation.py | 4 +--- 6 files changed, 17 insertions(+), 27 deletions(-) diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index 4a8bb8a16..bc9f409f0 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -490,11 +490,7 @@ def build_h5( spm_unit_geoadj = np.array( [ cd_geoadj_values[active_clone_cds[clone_id]][ - ( - spm_tenure_cloned[spm_unit_index] - .decode() - .lower() - ) + (spm_tenure_cloned[spm_unit_index].decode().lower()) ] for spm_unit_index, clone_id in enumerate(spm_clone_ids) ], diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index c1478d957..0503c7b59 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -10,6 +10,7 @@ sources. Housing uses the Census SPM capped subsidy concept, not HUD spending. """ + def get_hard_coded_totals( year: int = 2024, storage_folder=None, diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py index 8cd2a06b0..3e5b8bd34 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py @@ -31,9 +31,7 @@ def test_load_cd_geoadj_values_returns_tenure_specific_lookup(monkeypatch): assert geoadj_lookup["101"]["renter"] == pytest.approx(1.2215) assert geoadj_lookup["101"]["owner_with_mortgage"] == pytest.approx(1.217) - assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx( - 1.1615 - ) + assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx(1.1615) def test_calculate_spm_thresholds_vectorized_matches_policyengine_us_future_path(): diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index 3aba81693..a2220339b 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -90,15 +90,12 @@ def spm_equivalence_scale(num_adults, num_children): single_adult_with_children = with_children & (adults <= 1) raw[single_adult_with_children] = ( - 1.0 - + 0.8 - + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0) + 1.0 + 0.8 + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0) ) ** 0.7 multi_adult_with_children = with_children & ~single_adult_with_children raw[multi_adult_with_children] = ( - adults[multi_adult_with_children] - + 0.5 * children[multi_adult_with_children] + adults[multi_adult_with_children] + 0.5 * children[multi_adult_with_children] ) ** 0.7 no_children = has_people & ~with_children @@ -125,16 +122,12 @@ def get_spm_reference_thresholds(year: int) -> dict[str, float]: cpi_u = _get_cpi_u_parameter() factor = float( - cpi_u(f"{year}-02-01") - / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01") + cpi_u(f"{year}-02-01") / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01") ) latest_thresholds = PUBLISHED_SPM_REFERENCE_THRESHOLDS[ LATEST_PUBLISHED_SPM_THRESHOLD_YEAR ] - return { - tenure: value * factor - for tenure, value in latest_thresholds.items() - } + return {tenure: value * factor for tenure, value in latest_thresholds.items()} def calculate_geoadj_from_rent( diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py index 6693ffe9a..be0098d29 100644 --- a/tests/unit/test_census_spm_housing_targets.py +++ b/tests/unit/test_census_spm_housing_targets.py @@ -35,9 +35,7 @@ def test_get_census_spm_capped_housing_subsidy_total_uses_raw_cps_spm_values( ): _write_census_cps(tmp_path) - total = get_census_spm_capped_housing_subsidy_total( - 2024, storage_folder=tmp_path - ) + total = get_census_spm_capped_housing_subsidy_total(2024, storage_folder=tmp_path) expected = (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 assert total == expected @@ -147,13 +145,19 @@ def test_get_program_benchmarks_keeps_housing_concepts_separate(monkeypatch): benchmarks = benefit_validation.get_program_benchmarks(2022) - assert benchmarks["housing_spm_capped"]["variable"] == "spm_unit_capped_housing_subsidy" + assert ( + benchmarks["housing_spm_capped"]["variable"] + == "spm_unit_capped_housing_subsidy" + ) assert benchmarks["housing_spm_capped"]["benchmark_total"] == 111 assert benchmarks["housing_spm_capped"]["benchmark_source"] == "Census benchmark" assert benchmarks["housing_assistance_hud_user"]["variable"] == "housing_assistance" assert benchmarks["housing_assistance_hud_user"]["benchmark_total"] == 222 - assert benchmarks["housing_assistance_hud_user"]["benchmark_source"] == "HUD USER benchmark" + assert ( + benchmarks["housing_assistance_hud_user"]["benchmark_source"] + == "HUD USER benchmark" + ) assert ( benchmarks["housing_assistance_hud_user"]["benchmark_participants_millions"] == 3 diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py index 723fe6a08..14360782d 100644 --- a/validation/benefit_validation.py +++ b/validation/benefit_validation.py @@ -87,9 +87,7 @@ def analyze_benefit_underreporting(): weighted_participants = ((benefit > 0) * weight).sum() / 1e6 # millions # Underreporting factor - benchmark_ratio = ( - info["benchmark_total"] / total if total > 0 else np.inf - ) + benchmark_ratio = info["benchmark_total"] / total if total > 0 else np.inf benchmark_participants = info.get("benchmark_participants_millions") participant_ratio = ( weighted_participants / benchmark_participants From 9b7222874afde3f6e1f088442a5679f622e49e21 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:44:13 -0400 Subject: [PATCH 06/14] Add changelog fragment for clone-half SPM fixes --- changelog.d/fixed/702.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/fixed/702.md diff --git a/changelog.d/fixed/702.md b/changelog.d/fixed/702.md new file mode 100644 index 000000000..6e2bafc59 --- /dev/null +++ b/changelog.d/fixed/702.md @@ -0,0 +1 @@ +Aligned clone-half enhanced CPS generation with the modeled SPM pipeline by rebuilding cloned SPM thresholds deterministically and keeping zero-weight synthetic households near zero in sparse reweighting priors. From 1151faacf82f338150c69baae3120ac9dcc8adcc Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:45:54 -0400 Subject: [PATCH 07/14] Retrigger PR checks From 9c01bc787abb8d55bc1591241b9107fda612a81e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:46:50 -0400 Subject: [PATCH 08/14] Retrigger PR checks after cancel From 0caed7938e9ee4ff18a5aa16dfcc208fd0038600 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:57:01 -0400 Subject: [PATCH 09/14] Fix Towncrier fragment layout --- changelog.d/{fixed/702.md => 702.fixed} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename changelog.d/{fixed/702.md => 702.fixed} (100%) diff --git a/changelog.d/fixed/702.md b/changelog.d/702.fixed similarity index 100% rename from changelog.d/fixed/702.md rename to changelog.d/702.fixed From 98a4e4701a2b8c3267e2e20538ad1303d999db62 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 13:40:18 -0400 Subject: [PATCH 10/14] Tighten clone prior and tenure validation --- .../datasets/cps/enhanced_cps.py | 13 +++------ .../datasets/cps/extended_cps.py | 12 ++++++++- tests/unit/test_extended_cps.py | 27 +++++++++++++++++-- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index e0ce2c208..82bf9cc06 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -32,13 +32,12 @@ def initialize_weight_priors( original_weights: np.ndarray, seed: int = 1456, epsilon: float = 1e-6, - positive_jitter_scale: float = 0.01, ) -> np.ndarray: """Build deterministic positive priors for sparse reweighting. - Original CPS households should keep priors close to their survey - weights. Clone-half households start with zero weight on purpose, so - they should receive only a tiny positive epsilon to keep the log + Original CPS households should keep their survey weights unchanged. + Clone-half households start with zero weight on purpose, so they + should receive only a tiny positive epsilon to keep the log optimization well-defined without giving them a meaningful head start. """ @@ -51,11 +50,7 @@ def initialize_weight_priors( positive_mask = weights > 0 if positive_mask.any(): - jitter = np.maximum( - rng.normal(loc=1.0, scale=positive_jitter_scale, size=positive_mask.sum()), - 0.5, - ) - priors[positive_mask] = np.maximum(weights[positive_mask] * jitter, epsilon) + priors[positive_mask] = weights[positive_mask] zero_mask = ~positive_mask if zero_mask.any(): diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 22a443804..b7867f795 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -338,6 +338,16 @@ def reconcile_ss_subcomponents(predictions, total_ss): } +def _reference_threshold_key_for_tenure(tenure_type) -> str: + reference_key = _SPM_TENURE_TO_REFERENCE_KEY.get(tenure_type) + if reference_key is None: + raise ValueError( + "Unsupported spm_unit_tenure_type for cloned threshold rebuild: " + f"{tenure_type!r}" + ) + return reference_key + + def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS] @@ -430,7 +440,7 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: thresholds_by_tenure = get_spm_reference_thresholds(time_period) reference_base = np.array( [ - thresholds_by_tenure[_SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")] + thresholds_by_tenure[_reference_threshold_key_for_tenure(tenure)] for tenure in tenure_types ], dtype=float, diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index c278d0ea1..1c86a3bee 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -150,6 +150,22 @@ def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self): np.testing.assert_allclose(rebuilt[:2], np.array([20_000.0, 15_000.0])) np.testing.assert_allclose(rebuilt[2:], np.array([20_000.0, 15_000.0])) + def test_rebuild_cloned_spm_thresholds_rejects_unknown_tenure(self): + data = { + "age": {2024: np.array([40, 12, 40, 12], dtype=np.int32)}, + "person_spm_unit_id": {2024: np.array([10, 10, 20, 20], dtype=np.int32)}, + "spm_unit_id": {2024: np.array([10, 20], dtype=np.int32)}, + "spm_unit_tenure_type": { + 2024: np.array([b"RENTER", b"UNRECOGNIZED"], dtype="|S12") + }, + "spm_unit_spm_threshold": { + 2024: np.array([20_000.0, 20_000.0], dtype=np.float64) + }, + } + + with pytest.raises(ValueError, match="Unsupported spm_unit_tenure_type"): + rebuild_cloned_spm_thresholds(data, 2024) + class TestWeightPriorInitialization: def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self): @@ -160,8 +176,15 @@ def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self): assert np.all(priors > 0) assert priors[1] < 1e-4 assert priors[3] < 1e-4 - assert priors[0] == pytest.approx(1_500.0, rel=0.05) - assert priors[2] == pytest.approx(625.0, rel=0.05) + assert priors[0] == pytest.approx(1_500.0) + assert priors[2] == pytest.approx(625.0) + + def test_initialize_weight_priors_preserves_positive_weights_exactly(self): + weights = np.array([1_500.0, 625.0, 42.0], dtype=np.float64) + + priors = initialize_weight_priors(weights, seed=123) + + np.testing.assert_array_equal(priors, weights) def test_initialize_weight_priors_is_reproducible(self): weights = np.array([400.0, 0.0, 100.0], dtype=np.float64) From aed202ae5dfccbcc05130b31999877057cde9c66 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:13:05 -0400 Subject: [PATCH 11/14] Fix local-area calibration imports and takeup args --- policyengine_us_data/calibration/publish_local_area.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index b08c59232..0429c933f 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -25,6 +25,8 @@ ) from policyengine_us_data.calibration.block_assignment import ( derive_geography_from_blocks, +) +from policyengine_us_data.calibration.county_assignment import ( get_county_filter_probability, ) from policyengine_us_data.calibration.clone_and_assign import ( @@ -544,6 +546,7 @@ def build_h5( hh_blocks=active_blocks, hh_state_fips=hh_state_fips, hh_ids=original_hh_ids, + hh_clone_indices=active_geo.astype(np.int64), entity_hh_indices=entity_hh_indices, entity_counts=entity_counts, time_period=time_period, From e2ae174debe40c7709480ed2180aeb70f1aadbf2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:46:56 -0400 Subject: [PATCH 12/14] Harden housing target test stub --- tests/unit/test_census_spm_housing_targets.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py index be0098d29..6befed4ed 100644 --- a/tests/unit/test_census_spm_housing_targets.py +++ b/tests/unit/test_census_spm_housing_targets.py @@ -1,3 +1,4 @@ +import importlib import sys import types @@ -57,10 +58,18 @@ class DummyMicrosimulation: def __init__(self, dataset): self.default_calculation_period = 2024 + policyengine_us = importlib.import_module("policyengine_us") monkeypatch.setitem( sys.modules, "policyengine_us", - types.SimpleNamespace(Microsimulation=DummyMicrosimulation), + types.SimpleNamespace( + **{ + name: getattr(policyengine_us, name) + for name in dir(policyengine_us) + if not name.startswith("__") and name != "Microsimulation" + }, + Microsimulation=DummyMicrosimulation, + ), ) monkeypatch.setattr( etl_national_targets, From f5750e3215d2292596e97f0be7e0577f5811f291 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:51:26 -0400 Subject: [PATCH 13/14] Trigger PR checks From 17e77b3938f07937e77bb28ff7f44277e25e1c6e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 10:20:39 -0400 Subject: [PATCH 14/14] Use valid year in housing target test --- tests/unit/test_census_spm_housing_targets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py index 6befed4ed..e595848d1 100644 --- a/tests/unit/test_census_spm_housing_targets.py +++ b/tests/unit/test_census_spm_housing_targets.py @@ -83,7 +83,7 @@ def __init__(self, dataset): }, ) - targets = etl_national_targets.extract_national_targets("dummy") + targets = etl_national_targets.extract_national_targets(2024) housing = next( target for target in targets["direct_sum_targets"]