From baf2a2447c049ba2d6872ac114d26708335b796a Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 7 Apr 2026 09:48:50 -0400
Subject: [PATCH 01/14] Align SPM threshold recalculation

---
 .../calibration/calibration_utils.py          |  44 ++-
 .../calibration/publish_local_area.py         | 263 ++++++++----------
 .../test_spm_thresholds.py                    |  51 ++++
 policyengine_us_data/utils/spm.py             | 144 +++++++++-
 4 files changed, 344 insertions(+), 158 deletions(-)
 create mode 100644 policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py

diff --git a/policyengine_us_data/calibration/calibration_utils.py b/policyengine_us_data/calibration/calibration_utils.py
index 8af1bab7a..a4c489bab 100644
--- a/policyengine_us_data/calibration/calibration_utils.py
+++ b/policyengine_us_data/calibration/calibration_utils.py
@@ -7,10 +7,12 @@
 import numpy as np
 import pandas as pd
 
-from spm_calculator import SPMCalculator, spm_equivalence_scale
-from spm_calculator.geoadj import calculate_geoadj_from_rent
-
-from policyengine_us_data.utils.spm import TENURE_CODE_MAP
+from policyengine_us_data.utils.spm import (
+    TENURE_CODE_MAP,
+    calculate_geoadj_from_rent,
+    get_spm_reference_thresholds,
+    spm_equivalence_scale,
+)
 from policyengine_us.variables.household.demographic.geographic.state_name import (
     StateName,
 )
@@ -491,6 +493,7 @@ def get_cd_index_mapping(db_uri: str = None):
         tuple: (cd_to_index dict, index_to_cd dict, cds_ordered list)
     """
     from sqlalchemy import create_engine, text
+    from pathlib import Path
     from policyengine_us_data.storage import STORAGE_FOLDER
 
     if db_uri is None:
@@ -531,7 +534,7 @@ def load_geo_labels(path) -> List[str]:
 
 def load_cd_geoadj_values(
     cds_to_calibrate: List[str],
-) -> Dict[str, float]:
+) -> Dict[str, Dict[str, float]]:
     """
     Load geographic adjustment factors from rent data CSV.
     Uses median 2BR rent by CD vs national median to compute geoadj.
@@ -550,11 +553,18 @@ def load_cd_geoadj_values(
     # Build lookup from rent data
     rent_lookup = {}
     for _, row in rent_df.iterrows():
-        geoadj = calculate_geoadj_from_rent(
-            local_rent=row["median_2br_rent"],
-            national_rent=row["national_median_2br_rent"],
-        )
-        rent_lookup[row["cd_geoid"]] = geoadj
+        rent_lookup[row["cd_geoid"]] = {
+            tenure: calculate_geoadj_from_rent(
+                local_rent=row["median_2br_rent"],
+                national_rent=row["national_median_2br_rent"],
+                tenure=tenure,
+            )
+            for tenure in (
+                "owner_with_mortgage",
+                "owner_without_mortgage",
+                "renter",
+            )
+        }
 
     geoadj_dict = {}
     for cd in cds_to_calibrate:
@@ -562,7 +572,11 @@ def load_cd_geoadj_values(
             geoadj_dict[cd] = rent_lookup[cd]
         else:
             print(f"Warning: No rent data for CD {cd}, using geoadj=1.0")
-            geoadj_dict[cd] = 1.0
+            geoadj_dict[cd] = {
+                "owner_with_mortgage": 1.0,
+                "owner_without_mortgage": 1.0,
+                "renter": 1.0,
+            }
 
     return geoadj_dict
 
@@ -593,6 +607,11 @@ def calculate_spm_thresholds_vectorized(
     Returns:
         Float32 array of SPM thresholds, one per SPM unit.
     """
+    person_ages = np.asarray(person_ages)
+    person_spm_unit_ids = np.asarray(person_spm_unit_ids)
+    spm_unit_tenure_types = np.asarray(spm_unit_tenure_types)
+    spm_unit_geoadj = np.asarray(spm_unit_geoadj, dtype=np.float64)
+
     n_units = len(spm_unit_tenure_types)
 
     # Count adults and children per SPM unit
@@ -614,8 +633,7 @@ def calculate_spm_thresholds_vectorized(
         tenure_codes[mask] = code
 
     # Look up base thresholds
-    calc = SPMCalculator(year=year)
-    base_thresholds = calc.get_base_thresholds()
+    base_thresholds = get_spm_reference_thresholds(year)
 
     thresholds = np.zeros(n_units, dtype=np.float32)
     for i in range(n_units):
diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py
index b3e6085a9..4a8bb8a16 100644
--- a/policyengine_us_data/calibration/publish_local_area.py
+++ b/policyengine_us_data/calibration/publish_local_area.py
@@ -8,11 +8,6 @@
     python publish_local_area.py [--skip-download] [--states-only] [--upload]
 """
 
-import hashlib
-import json
-import shutil
-
-
 import numpy as np
 from pathlib import Path
 from typing import List
@@ -30,6 +25,7 @@
 )
 from policyengine_us_data.calibration.block_assignment import (
     derive_geography_from_blocks,
+    get_county_filter_probability,
 )
 from policyengine_us_data.calibration.clone_and_assign import (
     GeographyAssignment,
@@ -45,76 +41,28 @@
 CHECKPOINT_FILE_CITIES = Path("completed_cities.txt")
 WORK_DIR = Path("local_area_build")
 
-NYC_COUNTY_FIPS = {"36005", "36047", "36061", "36081", "36085"}
-
-
-META_FILE = WORK_DIR / "checkpoint_meta.json"
-
-
-def compute_input_fingerprint(
-    weights_path: Path, dataset_path: Path, n_clones: int, seed: int
-) -> str:
-    h = hashlib.sha256()
-    for p in [weights_path, dataset_path]:
-        with open(p, "rb") as f:
-            while chunk := f.read(8192):
-                h.update(chunk)
-    h.update(f"{n_clones}:{seed}".encode())
-    return h.hexdigest()[:16]
-
-
-def validate_or_clear_checkpoints(fingerprint: str):
-    if META_FILE.exists():
-        stored = json.loads(META_FILE.read_text())
-        if stored.get("fingerprint") == fingerprint:
-            print(f"Inputs unchanged ({fingerprint}), resuming...")
-            return
-        print(
-            f"Inputs changed "
-            f"({stored.get('fingerprint')} -> {fingerprint}), "
-            f"clearing..."
-        )
-    else:
-        print(f"No checkpoint metadata, starting fresh ({fingerprint})")
-    h5_count = sum(
-        1
-        for subdir in ["states", "districts", "cities"]
-        if (WORK_DIR / subdir).exists()
-        for _ in (WORK_DIR / subdir).rglob("*.h5")
-    )
-    if h5_count > 0:
-        print(
-            f"WARNING: {h5_count} H5 files exist. "
-            f"Clearing only checkpoint files, preserving H5s."
-        )
-        for cp in [
-            CHECKPOINT_FILE,
-            CHECKPOINT_FILE_DISTRICTS,
-            CHECKPOINT_FILE_CITIES,
-        ]:
-            if cp.exists():
-                cp.unlink()
-    else:
-        for cp in [
-            CHECKPOINT_FILE,
-            CHECKPOINT_FILE_DISTRICTS,
-            CHECKPOINT_FILE_CITIES,
-        ]:
-            if cp.exists():
-                cp.unlink()
-        for subdir in ["states", "districts", "cities"]:
-            d = WORK_DIR / subdir
-            if d.exists():
-                shutil.rmtree(d)
-    META_FILE.parent.mkdir(parents=True, exist_ok=True)
-    META_FILE.write_text(json.dumps({"fingerprint": fingerprint}))
-
-
-SUB_ENTITIES = [
-    "tax_unit",
-    "spm_unit",
-    "family",
-    "marital_unit",
+NYC_COUNTIES = {
+    "QUEENS_COUNTY_NY",
+    "BRONX_COUNTY_NY",
+    "RICHMOND_COUNTY_NY",
+    "NEW_YORK_COUNTY_NY",
+    "KINGS_COUNTY_NY",
+}
+
+NYC_CDS = [
+    "3603",
+    "3605",
+    "3606",
+    "3607",
+    "3608",
+    "3609",
+    "3610",
+    "3611",
+    "3612",
+    "3613",
+    "3614",
+    "3615",
+    "3616",
 ]
 
 
@@ -163,7 +111,7 @@ def build_h5(
     dataset_path: Path,
     output_path: Path,
     cd_subset: List[str] = None,
-    county_fips_filter: set = None,
+    county_filter: set = None,
     takeup_filter: List[str] = None,
 ) -> Path:
     """Build an H5 file by cloning records for each nonzero weight.
@@ -174,8 +122,8 @@ def build_h5(
         dataset_path: Path to base dataset H5 file.
         output_path: Where to write the output H5 file.
         cd_subset: If provided, only include clones for these CDs.
-        county_fips_filter: If provided, zero out weights for clones
-            whose county FIPS is not in this set.
+        county_filter: If provided, scale weights by P(target|CD)
+            for city datasets.
         takeup_filter: List of takeup vars to apply.
 
     Returns:
@@ -216,11 +164,17 @@ def build_h5(
         cd_mask = np.vectorize(lambda cd: cd in cd_subset_set)(clone_cds_matrix)
         W[~cd_mask] = 0
 
-    # County FIPS filtering: zero out clones not in target counties
-    if county_fips_filter is not None:
-        fips_array = np.asarray(geography.county_fips).reshape(n_clones_total, n_hh)
-        fips_mask = np.isin(fips_array, list(county_fips_filter))
-        W[~fips_mask] = 0
+    # County filtering: scale weights by P(target_counties | CD)
+    if county_filter is not None:
+        unique_cds = np.unique(clone_cds_matrix)
+        cd_prob = {
+            cd: get_county_filter_probability(cd, county_filter) for cd in unique_cds
+        }
+        p_matrix = np.vectorize(
+            cd_prob.__getitem__,
+            otypes=[float],
+        )(clone_cds_matrix)
+        W *= p_matrix
 
     label = (
         f"CD subset {cd_subset}"
@@ -237,7 +191,7 @@ def build_h5(
     if n_clones == 0:
         raise ValueError(
             f"No active clones after filtering. "
-            f"cd_subset={cd_subset}, county_fips_filter={county_fips_filter}"
+            f"cd_subset={cd_subset}, county_filter={county_filter}"
         )
     clone_weights = W[active_geo, active_hh]
     active_blocks = blocks.reshape(n_clones_total, n_hh)[active_geo, active_hh]
@@ -258,6 +212,12 @@ def build_h5(
     for p_idx, p_hh_id in enumerate(person_hh_ids):
         hh_to_persons[hh_id_to_idx[int(p_hh_id)]].append(p_idx)
 
+    SUB_ENTITIES = [
+        "tax_unit",
+        "spm_unit",
+        "family",
+        "marital_unit",
+    ]
     hh_to_entity = {}
     entity_id_arrays = {}
     person_entity_id_arrays = {}
@@ -351,6 +311,22 @@ def build_h5(
     unique_geo = derive_geography_from_blocks(unique_blocks)
     clone_geo = {k: v[block_inv] for k, v in unique_geo.items()}
 
+    # === Calculate weights for all entity levels ===
+    person_weights = np.repeat(clone_weights, persons_per_clone)
+    per_person_wt = clone_weights / np.maximum(persons_per_clone, 1)
+
+    entity_weights = {}
+    for ek in SUB_ENTITIES:
+        n_ents = len(entity_clone_idx[ek])
+        ent_person_counts = np.zeros(n_ents, dtype=np.int32)
+        np.add.at(
+            ent_person_counts,
+            new_person_entity_ids[ek],
+            1,
+        )
+        clone_ids_e = np.repeat(np.arange(n_clones), entities_per_clone[ek])
+        entity_weights[ek] = per_person_wt[clone_ids_e] * ent_person_counts
+
     # === Determine variables to save ===
     vars_to_save = set(sim.input_variables)
     vars_to_save.add("county")
@@ -400,6 +376,7 @@ def build_h5(
         for period in periods:
             values = holder.get_array(period)
 
+            # Convert Arrow-backed arrays to numpy before indexing
             if hasattr(values, "_pa_array") or hasattr(values, "_ndarray"):
                 values = np.asarray(values)
 
@@ -436,12 +413,16 @@ def build_h5(
         }
 
     # === Override weights ===
-    # Only write household_weight; sub-entity weights (tax_unit_weight,
-    # spm_unit_weight, person_weight, etc.) are formula variables in
-    # policyengine-us that derive from household_weight at runtime.
     data["household_weight"] = {
         time_period: clone_weights.astype(np.float32),
     }
+    data["person_weight"] = {
+        time_period: person_weights.astype(np.float32),
+    }
+    for ek in SUB_ENTITIES:
+        data[f"{ek}_weight"] = {
+            time_period: entity_weights[ek].astype(np.float32),
+        }
 
     # === Override geography ===
     data["state_fips"] = {
@@ -470,13 +451,6 @@ def build_h5(
                 time_period: clone_geo[gv].astype("S"),
             }
 
-    # === Set zip_code for LA County clones (ACA rating area fix) ===
-    la_mask = clone_geo["county_fips"].astype(str) == "06037"
-    if la_mask.any():
-        zip_codes = np.full(len(la_mask), "UNKNOWN")
-        zip_codes[la_mask] = "90001"
-        data["zip_code"] = {time_period: zip_codes.astype("S")}
-
     # === Gap 4: Congressional district GEOID ===
     clone_cd_geoids = np.array([int(cd) for cd in active_clone_cds], dtype=np.int32)
     data["congressional_district_geoid"] = {
@@ -487,19 +461,11 @@ def build_h5(
     print("Recalculating SPM thresholds...")
     unique_cds_list = sorted(set(active_clone_cds))
     cd_geoadj_values = load_cd_geoadj_values(unique_cds_list)
-    # Build per-SPM-unit geoadj from clone's CD
-    spm_clone_ids = np.repeat(
-        np.arange(n_clones, dtype=np.int64),
-        entities_per_clone["spm_unit"],
-    )
-    spm_unit_geoadj = np.array(
-        [cd_geoadj_values[active_clone_cds[c]] for c in spm_clone_ids],
-        dtype=np.float64,
-    )
 
-    # Get cloned person ages and SPM tenure types
+    # Get cloned person ages and SPM unit IDs
     person_ages = sim.calculate("age", map_to="person").values[person_clone_idx]
 
+    # Get cloned tenure types
     spm_tenure_holder = sim.get_holder("spm_unit_tenure_type")
     spm_tenure_periods = spm_tenure_holder.get_known_periods()
     if spm_tenure_periods:
@@ -516,6 +482,25 @@ def build_h5(
             dtype="S30",
         )
 
+    # Build per-SPM-unit geoadj from the clone's CD and tenure.
+    spm_clone_ids = np.repeat(
+        np.arange(n_clones, dtype=np.int64),
+        entities_per_clone["spm_unit"],
+    )
+    spm_unit_geoadj = np.array(
+        [
+            cd_geoadj_values[active_clone_cds[clone_id]][
+                (
+                    spm_tenure_cloned[spm_unit_index]
+                    .decode()
+                    .lower()
+                )
+            ]
+            for spm_unit_index, clone_id in enumerate(spm_clone_ids)
+        ],
+        dtype=np.float64,
+    )
+
     new_spm_thresholds = calculate_spm_thresholds_vectorized(
         person_ages=person_ages,
         person_spm_unit_ids=new_person_entity_ids["spm_unit"],
@@ -556,7 +541,6 @@ def build_h5(
             hh_blocks=active_blocks,
             hh_state_fips=hh_state_fips,
             hh_ids=original_hh_ids,
-            hh_clone_indices=active_geo.astype(np.int64),
             entity_hh_indices=entity_hh_indices,
             entity_counts=entity_counts,
             time_period=time_period,
@@ -754,6 +738,8 @@ def build_cities(
     """Build city H5 files with checkpointing, optionally uploading."""
     w = np.load(weights_path)
 
+    all_cds = sorted(set(geography.cd_geoid.astype(str)))
+
     cities_dir = output_dir / "cities"
     cities_dir.mkdir(parents=True, exist_ok=True)
 
@@ -763,29 +749,34 @@ def build_cities(
     if "NYC" in completed_cities:
         print("Skipping NYC (already completed)")
     else:
-        output_path = cities_dir / "NYC.h5"
-
-        try:
-            build_h5(
-                weights=w,
-                geography=geography,
-                dataset_path=dataset_path,
-                output_path=output_path,
-                county_fips_filter=NYC_COUNTY_FIPS,
-                takeup_filter=takeup_filter,
-            )
-
-            if upload:
-                print("Uploading NYC.h5 to GCP...")
-                upload_local_area_file(str(output_path), "cities", skip_hf=True)
-                hf_queue.append((str(output_path), "cities"))
-
-            record_completed_city("NYC")
-            print("Completed NYC")
-
-        except Exception as e:
-            print(f"ERROR building NYC: {e}")
-            raise
+        cd_subset = [cd for cd in all_cds if cd in NYC_CDS]
+        if not cd_subset:
+            print("No NYC-related CDs found, skipping")
+        else:
+            output_path = cities_dir / "NYC.h5"
+
+            try:
+                build_h5(
+                    weights=w,
+                    geography=geography,
+                    dataset_path=dataset_path,
+                    output_path=output_path,
+                    cd_subset=cd_subset,
+                    county_filter=NYC_COUNTIES,
+                    takeup_filter=takeup_filter,
+                )
+
+                if upload:
+                    print("Uploading NYC.h5 to GCP...")
+                    upload_local_area_file(str(output_path), "cities", skip_hf=True)
+                    hf_queue.append((str(output_path), "cities"))
+
+                record_completed_city("NYC")
+                print("Completed NYC")
+
+            except Exception as e:
+                print(f"ERROR building NYC: {e}")
+                raise
 
     if upload and hf_queue:
         print(f"\nUploading batch of {len(hf_queue)} city files to HuggingFace...")
@@ -880,19 +871,9 @@ def main():
 
     print(f"Using dataset: {inputs['dataset']}")
 
-    print("Computing input fingerprint...")
-    fingerprint = compute_input_fingerprint(
-        inputs["weights"],
-        inputs["dataset"],
-        args.n_clones,
-        args.seed,
-    )
-    validate_or_clear_checkpoints(fingerprint)
-
-    print("Loading base simulation to get household count...")
-    _sim = Microsimulation(dataset=str(inputs["dataset"]))
-    n_hh = len(_sim.calculate("household_id", map_to="household").values)
-    del _sim
+    sim = Microsimulation(dataset=str(inputs["dataset"]))
+    n_hh = sim.calculate("household_id", map_to="household").shape[0]
+    del sim
     print(f"\nBase dataset has {n_hh:,} households")
 
     geo_cache = WORK_DIR / f"geography_{n_hh}x{args.n_clones}_s{args.seed}.npz"
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py
new file mode 100644
index 000000000..8cd2a06b0
--- /dev/null
+++ b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from policyengine_us import CountryTaxBenefitSystem
+from policyengine_us_data.calibration.calibration_utils import (
+    calculate_spm_thresholds_vectorized,
+    load_cd_geoadj_values,
+)
+
+SYSTEM = CountryTaxBenefitSystem()
+CPI_U = SYSTEM.parameters.gov.bls.cpi.cpi_u
+
+
+def test_load_cd_geoadj_values_returns_tenure_specific_lookup(monkeypatch):
+    rent_df = pd.DataFrame(
+        {
+            "cd_id": ["0101"],
+            "median_2br_rent": [1_500.0],
+            "national_median_2br_rent": [1_000.0],
+        }
+    )
+    monkeypatch.setattr(
+        "policyengine_us_data.calibration.calibration_utils.pd.read_csv",
+        lambda *args, **kwargs: rent_df,
+    )
+
+    geoadj_lookup = load_cd_geoadj_values(["101"])
+
+    assert geoadj_lookup["101"]["renter"] == pytest.approx(1.2215)
+    assert geoadj_lookup["101"]["owner_with_mortgage"] == pytest.approx(1.217)
+    assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx(
+        1.1615
+    )
+
+
+def test_calculate_spm_thresholds_vectorized_matches_policyengine_us_future_path():
+    thresholds = calculate_spm_thresholds_vectorized(
+        person_ages=np.array([40, 35, 10, 8]),
+        person_spm_unit_ids=np.array([0, 0, 0, 0]),
+        spm_unit_tenure_types=np.array([b"RENTER"]),
+        spm_unit_geoadj=np.array([1.1]),
+        year=2027,
+    )
+
+    cpi_ratio = float(CPI_U("2027-02-01") / CPI_U("2024-02-01"))
+    expected = 39_430.0 * cpi_ratio * 1.1
+
+    assert thresholds[0] == pytest.approx(expected)
diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py
index ad3c9e9fb..3aba81693 100644
--- a/policyengine_us_data/utils/spm.py
+++ b/policyengine_us_data/utils/spm.py
@@ -1,7 +1,69 @@
-"""SPM threshold calculation utilities using the spm-calculator package."""
+"""SPM threshold calculation utilities aligned with PolicyEngine US."""
 
+from functools import lru_cache
 import numpy as np
-from spm_calculator import SPMCalculator, spm_equivalence_scale
+from policyengine_us import CountryTaxBenefitSystem
+
+PUBLISHED_SPM_REFERENCE_THRESHOLDS = {
+    2015: {
+        "renter": 25_155.0,
+        "owner_with_mortgage": 24_859.0,
+        "owner_without_mortgage": 20_639.0,
+    },
+    2016: {
+        "renter": 25_558.0,
+        "owner_with_mortgage": 25_248.0,
+        "owner_without_mortgage": 20_943.0,
+    },
+    2017: {
+        "renter": 26_213.0,
+        "owner_with_mortgage": 25_897.0,
+        "owner_without_mortgage": 21_527.0,
+    },
+    2018: {
+        "renter": 26_905.0,
+        "owner_with_mortgage": 26_565.0,
+        "owner_without_mortgage": 22_095.0,
+    },
+    2019: {
+        "renter": 27_515.0,
+        "owner_with_mortgage": 27_172.0,
+        "owner_without_mortgage": 22_600.0,
+    },
+    2020: {
+        "renter": 28_881.0,
+        "owner_with_mortgage": 28_533.0,
+        "owner_without_mortgage": 23_948.0,
+    },
+    2021: {
+        "renter": 31_453.0,
+        "owner_with_mortgage": 31_089.0,
+        "owner_without_mortgage": 26_022.0,
+    },
+    2022: {
+        "renter": 33_402.0,
+        "owner_with_mortgage": 32_949.0,
+        "owner_without_mortgage": 27_679.0,
+    },
+    2023: {
+        "renter": 36_606.0,
+        "owner_with_mortgage": 36_192.0,
+        "owner_without_mortgage": 30_347.0,
+    },
+    2024: {
+        "renter": 39_430.0,
+        "owner_with_mortgage": 39_068.0,
+        "owner_without_mortgage": 32_586.0,
+    },
+}
+
+LATEST_PUBLISHED_SPM_THRESHOLD_YEAR = max(PUBLISHED_SPM_REFERENCE_THRESHOLDS)
+REFERENCE_RAW_SCALE = 3**0.7
+TENURE_HOUSING_SHARES = {
+    "owner_with_mortgage": 0.434,
+    "owner_without_mortgage": 0.323,
+    "renter": 0.443,
+}
 
 TENURE_CODE_MAP = {
     1: "owner_with_mortgage",
@@ -10,6 +72,81 @@
 }
 
 
+@lru_cache(maxsize=1)
+def _get_cpi_u_parameter():
+    system = CountryTaxBenefitSystem()
+    return system.parameters.gov.bls.cpi.cpi_u
+
+
+def spm_equivalence_scale(num_adults, num_children):
+    adults, children = np.broadcast_arrays(
+        np.asarray(num_adults, dtype=float),
+        np.asarray(num_children, dtype=float),
+    )
+
+    raw = np.zeros_like(adults, dtype=float)
+    has_people = (adults + children) > 0
+    with_children = has_people & (children > 0)
+
+    single_adult_with_children = with_children & (adults <= 1)
+    raw[single_adult_with_children] = (
+        1.0
+        + 0.8
+        + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0)
+    ) ** 0.7
+
+    multi_adult_with_children = with_children & ~single_adult_with_children
+    raw[multi_adult_with_children] = (
+        adults[multi_adult_with_children]
+        + 0.5 * children[multi_adult_with_children]
+    ) ** 0.7
+
+    no_children = has_people & ~with_children
+    one_adult = no_children & (adults <= 1)
+    two_adults = no_children & (adults == 2)
+    larger_adult_units = no_children & (adults > 2)
+
+    raw[one_adult] = 1.0
+    raw[two_adults] = 1.41
+    raw[larger_adult_units] = adults[larger_adult_units] ** 0.7
+
+    return raw / REFERENCE_RAW_SCALE
+
+
+def get_spm_reference_thresholds(year: int) -> dict[str, float]:
+    if year in PUBLISHED_SPM_REFERENCE_THRESHOLDS:
+        return PUBLISHED_SPM_REFERENCE_THRESHOLDS[year].copy()
+
+    if year < min(PUBLISHED_SPM_REFERENCE_THRESHOLDS):
+        raise ValueError(
+            f"No published SPM reference thresholds for {year}. "
+            f"Earliest available year is {min(PUBLISHED_SPM_REFERENCE_THRESHOLDS)}."
+        )
+
+    cpi_u = _get_cpi_u_parameter()
+    factor = float(
+        cpi_u(f"{year}-02-01")
+        / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01")
+    )
+    latest_thresholds = PUBLISHED_SPM_REFERENCE_THRESHOLDS[
+        LATEST_PUBLISHED_SPM_THRESHOLD_YEAR
+    ]
+    return {
+        tenure: value * factor
+        for tenure, value in latest_thresholds.items()
+    }
+
+
+def calculate_geoadj_from_rent(
+    local_rent,
+    national_rent: float,
+    tenure: str = "renter",
+):
+    share = TENURE_HOUSING_SHARES[tenure]
+    rent_ratio = np.asarray(local_rent, dtype=float) / float(national_rent)
+    return rent_ratio * share + (1.0 - share)
+
+
 def calculate_spm_thresholds_with_geoadj(
     num_adults: np.ndarray,
     num_children: np.ndarray,
@@ -35,8 +172,7 @@ def calculate_spm_thresholds_with_geoadj(
     Returns:
         Array of SPM threshold values.
     """
-    calc = SPMCalculator(year=year)
-    base_thresholds = calc.get_base_thresholds()
+    base_thresholds = get_spm_reference_thresholds(year)
 
     n = len(num_adults)
     thresholds = np.zeros(n)

From 418d73c3c70e46e5c4d89f864cc1a241eeb1783b Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Tue, 7 Apr 2026 15:59:44 -0400
Subject: [PATCH 02/14] Split housing benchmarks by Census and HUD concepts

---
 .../db/etl_national_targets.py                |  11 +-
 .../pull_hardcoded_targets.py                 |  82 +++++----
 policyengine_us_data/utils/census_spm.py      |  65 +++++++
 policyengine_us_data/utils/hud_housing.py     |  49 ++++++
 policyengine_us_data/utils/loss.py            |   7 +-
 tests/unit/test_census_spm_housing_targets.py | 161 ++++++++++++++++++
 validation/benefit_validation.py              |  95 ++++++++---
 7 files changed, 403 insertions(+), 67 deletions(-)
 create mode 100644 policyengine_us_data/utils/census_spm.py
 create mode 100644 policyengine_us_data/utils/hud_housing.py
 create mode 100644 tests/unit/test_census_spm_housing_targets.py

diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py
index 773411dc5..d1c459656 100644
--- a/policyengine_us_data/db/etl_national_targets.py
+++ b/policyengine_us_data/db/etl_national_targets.py
@@ -12,6 +12,9 @@
 from policyengine_us_data.storage.calibration_targets.soi_metadata import (
     RETIREMENT_CONTRIBUTION_TARGETS,
 )
+from policyengine_us_data.utils.census_spm import (
+    build_census_spm_capped_housing_subsidy_target,
+)
 from policyengine_us_data.utils.db import (
     DEFAULT_DATASET,
     etl_argparser,
@@ -187,13 +190,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET):
             "notes": "Work and childcare expenses for SPM",
             "year": HARDCODED_YEAR,
         },
-        {
-            "variable": "spm_unit_capped_housing_subsidy",
-            "value": 35e9,
-            "source": "HUD/Census",
-            "notes": "Housing subsidies",
-            "year": HARDCODED_YEAR,
-        },
+        build_census_spm_capped_housing_subsidy_target(HARDCODED_YEAR),
         {
             "variable": "tanf",
             "value": 9e9,
diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
index 3199a56a2..c1478d957 100644
--- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
+++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
@@ -1,52 +1,66 @@
 import pandas as pd
 import numpy as np
 from policyengine_us_data.storage import CALIBRATION_FOLDER
+from policyengine_us_data.utils.census_spm import (
+    get_census_spm_capped_housing_subsidy_total,
+)
 
 """
-Hardcoded targets for the year 2024 from CPS-derived statistics and other sources. Include medical expenses, sum of SPM thresholds, and child support expenses.
+Hardcoded targets for the year 2024 from CPS-derived statistics and other
+sources. Housing uses the Census SPM capped subsidy concept, not HUD spending.
 """
 
-HARD_CODED_TOTALS = {
-    "health_insurance_premiums_without_medicare_part_b": 385e9,
-    "other_medical_expenses": 278e9,
-    "medicare_part_b_premiums": 112e9,
-    "over_the_counter_health_expenses": 72e9,
-    "spm_unit_spm_threshold": 3_945e9,
-    "child_support_expense": 33e9,
-    "child_support_received": 33e9,
-    "spm_unit_capped_work_childcare_expenses": 348e9,
-    "spm_unit_capped_housing_subsidy": 35e9,
-    "tanf": 9e9,
-    # Alimony could be targeted via SOI
-    "alimony_income": 13e9,
-    "alimony_expense": 13e9,
-    # Rough estimate, not CPS derived
-    "real_estate_taxes": 500e9,  # Rough estimate between 350bn and 600bn total property tax collections
-    "rent": 735e9,  # ACS total uprated by CPI
-    # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics
-    # shows $38,316,190,000 in Box 7: Social security tips (2018)
-    # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
-    # Assume 40% through 2024
-    "tip_income": 38e9 * 1.4,
-}
+def get_hard_coded_totals(
+    year: int = 2024,
+    storage_folder=None,
+) -> dict[str, float]:
+    return {
+        "health_insurance_premiums_without_medicare_part_b": 385e9,
+        "other_medical_expenses": 278e9,
+        "medicare_part_b_premiums": 112e9,
+        "over_the_counter_health_expenses": 72e9,
+        "spm_unit_spm_threshold": 3_945e9,
+        "child_support_expense": 33e9,
+        "child_support_received": 33e9,
+        "spm_unit_capped_work_childcare_expenses": 348e9,
+        "spm_unit_capped_housing_subsidy": get_census_spm_capped_housing_subsidy_total(
+            year, storage_folder=storage_folder
+        ),
+        "tanf": 9e9,
+        # Alimony could be targeted via SOI
+        "alimony_income": 13e9,
+        "alimony_expense": 13e9,
+        # Rough estimate, not CPS derived
+        "real_estate_taxes": 500e9,  # Rough estimate between 350bn and 600bn total property tax collections
+        "rent": 735e9,  # ACS total uprated by CPI
+        # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics
+        # shows $38,316,190,000 in Box 7: Social security tips (2018)
+        # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC
+        # Assume 40% through 2024
+        "tip_income": 38e9 * 1.4,
+    }
 
 
-def pull_hardcoded_targets():
+def pull_hardcoded_targets(year: int = 2024, storage_folder=None):
     """
     Returns a DataFrame with hardcoded targets for various CPS-derived statistics and other sources.
     """
+    hard_coded_totals = get_hard_coded_totals(
+        year=year,
+        storage_folder=storage_folder,
+    )
     data = {
-        "DATA_SOURCE": ["hardcoded"] * len(HARD_CODED_TOTALS),
-        "GEO_ID": ["0000000US"] * len(HARD_CODED_TOTALS),
-        "GEO_NAME": ["national"] * len(HARD_CODED_TOTALS),
-        "VARIABLE": list(HARD_CODED_TOTALS.keys()),
-        "VALUE": list(HARD_CODED_TOTALS.values()),
+        "DATA_SOURCE": ["hardcoded"] * len(hard_coded_totals),
+        "GEO_ID": ["0000000US"] * len(hard_coded_totals),
+        "GEO_NAME": ["national"] * len(hard_coded_totals),
+        "VARIABLE": list(hard_coded_totals.keys()),
+        "VALUE": list(hard_coded_totals.values()),
         "IS_COUNT": [0.0]
-        * len(HARD_CODED_TOTALS),  # All values are monetary amounts, not counts
+        * len(hard_coded_totals),  # All values are monetary amounts, not counts
         "BREAKDOWN_VARIABLE": [np.nan]
-        * len(HARD_CODED_TOTALS),  # No breakdown variable for hardcoded targets
-        "LOWER_BOUND": [np.nan] * len(HARD_CODED_TOTALS),
-        "UPPER_BOUND": [np.nan] * len(HARD_CODED_TOTALS),
+        * len(hard_coded_totals),  # No breakdown variable for hardcoded targets
+        "LOWER_BOUND": [np.nan] * len(hard_coded_totals),
+        "UPPER_BOUND": [np.nan] * len(hard_coded_totals),
     }
 
     df = pd.DataFrame(data)
diff --git a/policyengine_us_data/utils/census_spm.py b/policyengine_us_data/utils/census_spm.py
new file mode 100644
index 000000000..e7a9c32b4
--- /dev/null
+++ b/policyengine_us_data/utils/census_spm.py
@@ -0,0 +1,65 @@
+from pathlib import Path
+
+import pandas as pd
+
+from policyengine_us_data.storage import STORAGE_FOLDER
+
+# These totals are derived from raw public-use Census CPS ASEC files by
+# summing SPM_CAPHOUSESUB * SPM_WEIGHT / 100 at the SPM-unit level.
+# They represent the Census SPM capped housing subsidy concept, not HUD
+# spending or outlays.
+CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS = {
+    2022: 29_549_204_420.92,
+    2023: 31_844_144_470.85,
+    2024: 33_649_114_150.37,
+}
+
+
+def get_census_spm_capped_housing_subsidy_total(
+    year: int,
+    storage_folder: Path | str | None = None,
+) -> float:
+    """
+    Return the Census CPS ASEC total for SPM_CAPHOUSESUB.
+
+    If a storage folder is provided, recompute directly from the local
+    raw Census CPS HDF file. Otherwise, use the checked-in year-specific
+    totals above so callers do not depend on local data files at import
+    time.
+    """
+
+    if storage_folder is None:
+        if year not in CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS:
+            raise ValueError(
+                f"No published Census SPM capped housing subsidy total for {year}."
+            )
+        return CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS[year]
+
+    storage_path = Path(storage_folder) / f"census_cps_{year}.h5"
+    if not storage_path.exists():
+        raise FileNotFoundError(
+            f"Missing raw Census CPS file for {year}: {storage_path}"
+        )
+
+    with pd.HDFStore(storage_path, mode="r") as store:
+        spm_unit = store["spm_unit"][["SPM_CAPHOUSESUB", "SPM_WEIGHT"]]
+
+    return float((spm_unit.SPM_CAPHOUSESUB * spm_unit.SPM_WEIGHT).sum() / 100)
+
+
+def build_census_spm_capped_housing_subsidy_target(
+    year: int,
+    storage_folder: Path | str | None = None,
+) -> dict:
+    return {
+        "variable": "spm_unit_capped_housing_subsidy",
+        "value": get_census_spm_capped_housing_subsidy_total(
+            year, storage_folder=storage_folder
+        ),
+        "source": "Census CPS ASEC public-use SPM_CAPHOUSESUB",
+        "notes": (
+            "Capped SPM housing subsidy total from raw Census CPS ASEC "
+            "SPM_CAPHOUSESUB; not HUD spending or outlays."
+        ),
+        "year": year,
+    }
diff --git a/policyengine_us_data/utils/hud_housing.py b/policyengine_us_data/utils/hud_housing.py
new file mode 100644
index 000000000..1b7620e5d
--- /dev/null
+++ b/policyengine_us_data/utils/hud_housing.py
@@ -0,0 +1,49 @@
+HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS = {
+    2022: {
+        "reported_households": 4_537_614,
+        "average_monthly_spending_per_unit": 899,
+    },
+    2023: {
+        "reported_households": 4_569_973,
+        "average_monthly_spending_per_unit": 989,
+    },
+    2024: {
+        "reported_households": 4_584_691,
+        "average_monthly_spending_per_unit": 1_067,
+    },
+    2025: {
+        "reported_households": 4_519_561,
+        "average_monthly_spending_per_unit": 1_135,
+    },
+}
+
+
+def get_hud_user_housing_benchmark(year: int) -> dict:
+    if year not in HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS:
+        raise ValueError(f"No HUD USER housing benchmark for {year}.")
+
+    benchmark = dict(HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS[year])
+    benchmark["annual_spending_total"] = (
+        benchmark["reported_households"]
+        * benchmark["average_monthly_spending_per_unit"]
+        * 12
+    )
+    return benchmark
+
+
+def build_hud_user_housing_assistance_benchmark(year: int) -> dict:
+    benchmark = get_hud_user_housing_benchmark(year)
+    return {
+        "variable": "housing_assistance",
+        "annual_spending_total": benchmark["annual_spending_total"],
+        "reported_households": benchmark["reported_households"],
+        "average_monthly_spending_per_unit": benchmark[
+            "average_monthly_spending_per_unit"
+        ],
+        "source": "HUD USER Picture of Subsidized Households",
+        "notes": (
+            "Annual federal spending and occupied assisted households from "
+            "HUD USER; not Census SPM capped subsidy."
+        ),
+        "year": year,
+    }
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index e141cdf43..6d8b35f46 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -12,6 +12,9 @@
 from policyengine_us_data.storage.calibration_targets.soi_metadata import (
     RETIREMENT_CONTRIBUTION_TARGETS,
 )
+from policyengine_us_data.utils.census_spm import (
+    get_census_spm_capped_housing_subsidy_total,
+)
 from policyengine_core.reforms import Reform
 from policyengine_us_data.utils.soi import pe_to_soi, get_soi
 
@@ -31,7 +34,9 @@
     "child_support_expense": 33e9,
     "child_support_received": 33e9,
     "spm_unit_capped_work_childcare_expenses": 348e9,
-    "spm_unit_capped_housing_subsidy": 35e9,
+    "spm_unit_capped_housing_subsidy": get_census_spm_capped_housing_subsidy_total(
+        2024
+    ),
     "tanf": 9e9,
     # Alimony could be targeted via SOI
     "alimony_income": 13e9,
diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py
new file mode 100644
index 000000000..6693ffe9a
--- /dev/null
+++ b/tests/unit/test_census_spm_housing_targets.py
@@ -0,0 +1,161 @@
+import sys
+import types
+
+import pandas as pd
+import validation.benefit_validation as benefit_validation
+
+from policyengine_us_data.db import etl_national_targets
+from policyengine_us_data.storage.calibration_targets.pull_hardcoded_targets import (
+    pull_hardcoded_targets,
+)
+from policyengine_us_data.utils.census_spm import (
+    build_census_spm_capped_housing_subsidy_target,
+    get_census_spm_capped_housing_subsidy_total,
+)
+from policyengine_us_data.utils.hud_housing import (
+    build_hud_user_housing_assistance_benchmark,
+    get_hud_user_housing_benchmark,
+)
+
+
+def _write_census_cps(storage_dir, year=2024):
+    path = storage_dir / f"census_cps_{year}.h5"
+    with pd.HDFStore(path, mode="w") as store:
+        store["spm_unit"] = pd.DataFrame(
+            {
+                "SPM_CAPHOUSESUB": [1_000.0, 0.0, 2_000.0],
+                "SPM_WEIGHT": [150, 250, 100],
+            }
+        )
+    return path
+
+
+def test_get_census_spm_capped_housing_subsidy_total_uses_raw_cps_spm_values(
+    tmp_path,
+):
+    _write_census_cps(tmp_path)
+
+    total = get_census_spm_capped_housing_subsidy_total(
+        2024, storage_folder=tmp_path
+    )
+
+    expected = (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100
+    assert total == expected
+
+
+def test_pull_hardcoded_targets_uses_census_spm_housing_total(tmp_path):
+    _write_census_cps(tmp_path)
+
+    targets = pull_hardcoded_targets(year=2024, storage_folder=tmp_path)
+    housing = targets.loc[
+        targets.VARIABLE == "spm_unit_capped_housing_subsidy", "VALUE"
+    ].iat[0]
+
+    assert housing == (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100
+
+
+def test_extract_national_targets_uses_census_spm_housing_target(monkeypatch):
+    class DummyMicrosimulation:
+        def __init__(self, dataset):
+            self.default_calculation_period = 2024
+
+    monkeypatch.setitem(
+        sys.modules,
+        "policyengine_us",
+        types.SimpleNamespace(Microsimulation=DummyMicrosimulation),
+    )
+    monkeypatch.setattr(
+        etl_national_targets,
+        "build_census_spm_capped_housing_subsidy_target",
+        lambda year, storage_folder=None: {
+            "variable": "spm_unit_capped_housing_subsidy",
+            "value": 123.0,
+            "source": "Census CPS ASEC public-use SPM_CAPHOUSESUB",
+            "notes": "Capped SPM housing subsidy total from raw Census CPS ASEC.",
+            "year": year,
+        },
+    )
+
+    targets = etl_national_targets.extract_national_targets("dummy")
+    housing = next(
+        target
+        for target in targets["direct_sum_targets"]
+        if target["variable"] == "spm_unit_capped_housing_subsidy"
+    )
+
+    assert housing["value"] == 123.0
+    assert housing["source"] == "Census CPS ASEC public-use SPM_CAPHOUSESUB"
+    assert "Capped SPM housing subsidy" in housing["notes"]
+
+
+def test_build_census_spm_capped_housing_subsidy_target_labels_concept(tmp_path):
+    _write_census_cps(tmp_path)
+
+    target = build_census_spm_capped_housing_subsidy_target(
+        2024, storage_folder=tmp_path
+    )
+
+    assert target["variable"] == "spm_unit_capped_housing_subsidy"
+    assert target["value"] == (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100
+    assert target["source"] == "Census CPS ASEC public-use SPM_CAPHOUSESUB"
+    assert "not HUD spending" in target["notes"]
+
+
+def test_get_hud_user_housing_benchmark_uses_annual_spending_concept():
+    benchmark = get_hud_user_housing_benchmark(2022)
+
+    assert benchmark["reported_households"] == 4_537_614
+    assert benchmark["average_monthly_spending_per_unit"] == 899
+    assert benchmark["annual_spending_total"] == 48_951_779_832
+
+
+def test_build_hud_user_housing_assistance_benchmark_labels_admin_concept():
+    benchmark = build_hud_user_housing_assistance_benchmark(2022)
+
+    assert benchmark["variable"] == "housing_assistance"
+    assert benchmark["source"] == "HUD USER Picture of Subsidized Households"
+    assert benchmark["reported_households"] == 4_537_614
+    assert benchmark["annual_spending_total"] == 48_951_779_832
+    assert "not Census SPM capped subsidy" in benchmark["notes"]
+
+
+def test_get_program_benchmarks_keeps_housing_concepts_separate(monkeypatch):
+    monkeypatch.setattr(
+        benefit_validation,
+        "build_census_spm_capped_housing_subsidy_target",
+        lambda year: {
+            "variable": "spm_unit_capped_housing_subsidy",
+            "value": 111e9,
+            "source": "Census benchmark",
+            "notes": "Census concept",
+            "year": year,
+        },
+    )
+    monkeypatch.setattr(
+        benefit_validation,
+        "build_hud_user_housing_assistance_benchmark",
+        lambda year: {
+            "variable": "housing_assistance",
+            "annual_spending_total": 222e9,
+            "reported_households": 3_000_000,
+            "average_monthly_spending_per_unit": 999,
+            "source": "HUD USER benchmark",
+            "notes": "HUD concept",
+            "year": year,
+        },
+    )
+
+    benchmarks = benefit_validation.get_program_benchmarks(2022)
+
+    assert benchmarks["housing_spm_capped"]["variable"] == "spm_unit_capped_housing_subsidy"
+    assert benchmarks["housing_spm_capped"]["benchmark_total"] == 111
+    assert benchmarks["housing_spm_capped"]["benchmark_source"] == "Census benchmark"
+
+    assert benchmarks["housing_assistance_hud_user"]["variable"] == "housing_assistance"
+    assert benchmarks["housing_assistance_hud_user"]["benchmark_total"] == 222
+    assert benchmarks["housing_assistance_hud_user"]["benchmark_source"] == "HUD USER benchmark"
+    assert (
+        benchmarks["housing_assistance_hud_user"]["benchmark_participants_millions"]
+        == 3
+    )
+    assert benchmarks["housing_assistance_hud_user"]["map_to"] == "household"
diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py
index cf4689720..723fe6a08 100644
--- a/validation/benefit_validation.py
+++ b/validation/benefit_validation.py
@@ -9,41 +9,75 @@
 import numpy as np
 from policyengine_us import Microsimulation
 from policyengine_us_data.datasets.cps.enhanced_cps import EnhancedCPS
-
-
-def analyze_benefit_underreporting():
-    """Analyze impact of CPS benefit underreporting on results."""
-
-    sim = Microsimulation(dataset=EnhancedCPS, dataset_year=2022)
-
-    programs = {
+from policyengine_us_data.utils.census_spm import (
+    build_census_spm_capped_housing_subsidy_target,
+)
+from policyengine_us_data.utils.hud_housing import (
+    build_hud_user_housing_assistance_benchmark,
+)
+
+
+def get_program_benchmarks(year: int):
+    housing_spm_target = build_census_spm_capped_housing_subsidy_target(year)
+    housing_hud_target = build_hud_user_housing_assistance_benchmark(year)
+    return {
         "snap": {
             "variable": "snap",
-            "admin_total": 114.1,  # billions, from USDA
+            "benchmark_total": 114.1,  # billions, from USDA
+            "benchmark_source": "USDA",
             "participation_rate": 0.82,  # from USDA
+            "weight_variable": "spm_unit_weight",
         },
         "ssi": {
             "variable": "ssi",
-            "admin_total": 56.7,  # from SSA
+            "benchmark_total": 56.7,  # from SSA
+            "benchmark_source": "SSA",
             "participation_rate": 0.93,
+            "weight_variable": "spm_unit_weight",
         },
         "tanf": {
             "variable": "tanf",
-            "admin_total": 7.1,  # from HHS
+            "benchmark_total": 7.1,  # from HHS
+            "benchmark_source": "HHS",
             "participation_rate": 0.23,
+            "weight_variable": "spm_unit_weight",
         },
-        "housing": {
-            "variable": "housing_benefit",
-            "admin_total": 50.3,  # from HUD
-            "participation_rate": 0.76,
+        "housing_spm_capped": {
+            "variable": "spm_unit_capped_housing_subsidy",
+            "benchmark_total": housing_spm_target["value"] / 1e9,
+            "benchmark_source": housing_spm_target["source"],
+            "participation_rate": np.nan,
+            "weight_variable": "spm_unit_weight",
+        },
+        "housing_assistance_hud_user": {
+            "variable": "housing_assistance",
+            "benchmark_total": housing_hud_target["annual_spending_total"] / 1e9,
+            "benchmark_source": housing_hud_target["source"],
+            "benchmark_participants_millions": (
+                housing_hud_target["reported_households"] / 1e6
+            ),
+            "participation_rate": np.nan,
+            "weight_variable": "household_weight",
+            "map_to": "household",
         },
     }
 
+
+def analyze_benefit_underreporting():
+    """Analyze impact of CPS benefit underreporting on results."""
+
+    year = 2022
+    sim = Microsimulation(dataset=EnhancedCPS, dataset_year=year)
+    programs = get_program_benchmarks(year)
+
     results = []
     for program, info in programs.items():
         # Calculate totals
-        benefit = sim.calculate(info["variable"], 2022)
-        weight = sim.calculate("household_weight", 2022)
+        benefit_kwargs = {}
+        if info.get("map_to"):
+            benefit_kwargs["map_to"] = info["map_to"]
+        benefit = sim.calculate(info["variable"], year, **benefit_kwargs)
+        weight = sim.calculate(info["weight_variable"], year)
 
         # Total benefits
         total = (benefit * weight).sum() / 1e9  # billions
@@ -53,15 +87,26 @@ def analyze_benefit_underreporting():
         weighted_participants = ((benefit > 0) * weight).sum() / 1e6  # millions
 
         # Underreporting factor
-        underreporting = info["admin_total"] / total if total > 0 else np.inf
+        benchmark_ratio = (
+            info["benchmark_total"] / total if total > 0 else np.inf
+        )
+        benchmark_participants = info.get("benchmark_participants_millions")
+        participant_ratio = (
+            weighted_participants / benchmark_participants
+            if benchmark_participants not in (None, 0)
+            else np.nan
+        )
 
         results.append(
             {
                 "program": program,
                 "enhanced_cps_total": total,
-                "admin_total": info["admin_total"],
-                "underreporting_factor": underreporting,
+                "benchmark_total": info["benchmark_total"],
+                "benchmark_source": info["benchmark_source"],
+                "benchmark_ratio": benchmark_ratio,
                 "participants_millions": weighted_participants,
+                "benchmark_participants_millions": benchmark_participants,
+                "participant_ratio": participant_ratio,
                 "mean_benefit": (
                     benefit[benefit > 0].mean() if (benefit > 0).any() else 0
                 ),
@@ -77,11 +122,11 @@ def validate_program_interactions():
     sim = Microsimulation(dataset=EnhancedCPS, dataset_year=2022)
 
     # Get program benefits
-    snap = sim.calculate("snap", 2022) > 0
-    medicaid = sim.calculate("medicaid", 2022) > 0
-    ssi = sim.calculate("ssi", 2022) > 0
-    tanf = sim.calculate("tanf", 2022) > 0
-    housing = sim.calculate("housing_benefit", 2022) > 0
+    snap = sim.calculate("snap", 2022, map_to="household") > 0
+    medicaid = sim.calculate("medicaid", 2022, map_to="household") > 0
+    ssi = sim.calculate("ssi", 2022, map_to="household") > 0
+    tanf = sim.calculate("tanf", 2022, map_to="household") > 0
+    housing = sim.calculate("housing_assistance", 2022, map_to="household") > 0
 
     weight = sim.calculate("household_weight", 2022)
 

From c71e53aaab58a06950faa20f02856e93969df382 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 09:28:37 -0400
Subject: [PATCH 03/14] Stabilize clone-half SPM thresholds and weight priors

---
 .../datasets/cps/enhanced_cps.py              | 48 ++++++++--
 .../datasets/cps/extended_cps.py              | 89 ++++++++++++++++++-
 tests/unit/test_extended_cps.py               | 46 ++++++++++
 3 files changed, 176 insertions(+), 7 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
index 275d71e2b..e0ce2c208 100644
--- a/policyengine_us_data/datasets/cps/enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -28,6 +28,42 @@
     torch = None
 
 
+def initialize_weight_priors(
+    original_weights: np.ndarray,
+    seed: int = 1456,
+    epsilon: float = 1e-6,
+    positive_jitter_scale: float = 0.01,
+) -> np.ndarray:
+    """Build deterministic positive priors for sparse reweighting.
+
+    Original CPS households should keep priors close to their survey
+    weights. Clone-half households start with zero weight on purpose, so
+    they should receive only a tiny positive epsilon to keep the log
+    optimization well-defined without giving them a meaningful head start.
+    """
+
+    weights = np.asarray(original_weights, dtype=np.float64)
+    if np.any(weights < 0):
+        raise ValueError("original_weights must be non-negative")
+
+    rng = np.random.default_rng(seed)
+    priors = np.empty_like(weights, dtype=np.float64)
+
+    positive_mask = weights > 0
+    if positive_mask.any():
+        jitter = np.maximum(
+            rng.normal(loc=1.0, scale=positive_jitter_scale, size=positive_mask.sum()),
+            0.5,
+        )
+        priors[positive_mask] = np.maximum(weights[positive_mask] * jitter, epsilon)
+
+    zero_mask = ~positive_mask
+    if zero_mask.any():
+        priors[zero_mask] = epsilon * rng.uniform(1.0, 2.0, size=zero_mask.sum())
+
+    return priors
+
+
 def _get_period_array(period_values: dict, period: int) -> np.ndarray:
     """Get a period array from a TIME_PERIOD_ARRAYS variable dict."""
     value = period_values.get(period)
@@ -191,9 +227,9 @@ def generate(self):
         data = sim.dataset.load_dataset()
         base_year = int(sim.default_calculation_period)
         data["household_weight"] = {}
-        original_weights = sim.calculate("household_weight")
-        original_weights = original_weights.values + np.random.normal(
-            1, 0.1, len(original_weights)
+        original_weights = initialize_weight_priors(
+            sim.calculate("household_weight").values,
+            seed=1456,
         )
 
         bad_targets = [
@@ -328,9 +364,9 @@ def generate(self):
 
         sim = Microsimulation(dataset=self.input_dataset)
         data = sim.dataset.load_dataset()
-        original_weights = sim.calculate("household_weight")
-        original_weights = original_weights.values + np.random.normal(
-            1, 0.1, len(original_weights)
+        original_weights = initialize_weight_priors(
+            sim.calculate("household_weight").values,
+            seed=1456,
         )
         for year in [2024]:
             loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 8b9041c0c..b110b536c 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -18,6 +18,10 @@
     convert_mortgage_interest_to_structural_inputs,
     impute_tax_unit_mortgage_balance_hints,
 )
+from policyengine_us_data.utils.spm import (
+    get_spm_reference_thresholds,
+    spm_equivalence_scale,
+)
 from policyengine_us_data.utils.policyengine import has_policyengine_us_variables
 from policyengine_us_data.utils.retirement_limits import (
     get_retirement_limits,
@@ -77,7 +81,6 @@ def _supports_structural_mortgage_inputs() -> bool:
     "spm_unit_federal_tax_reported",
     "spm_unit_state_tax_reported",
     "spm_unit_capped_work_childcare_expenses",
-    "spm_unit_spm_threshold",
     "spm_unit_net_income_reported",
     "spm_unit_pre_subsidy_childcare_expenses",
     # Medical expenses
@@ -325,6 +328,15 @@ def reconcile_ss_subcomponents(predictions, total_ss):
     "social_security_survivors",
 }
 
+_SPM_TENURE_TO_REFERENCE_KEY = {
+    b"OWNER_WITH_MORTGAGE": "owner_with_mortgage",
+    b"OWNER_WITHOUT_MORTGAGE": "owner_without_mortgage",
+    b"RENTER": "renter",
+    "OWNER_WITH_MORTGAGE": "owner_with_mortgage",
+    "OWNER_WITHOUT_MORTGAGE": "owner_without_mortgage",
+    "RENTER": "renter",
+}
+
 
 def _apply_post_processing(predictions, X_test, time_period, data):
     """Apply retirement constraints and SS reconciliation."""
@@ -369,6 +381,73 @@ def _apply_post_processing(predictions, X_test, time_period, data):
     return predictions
 
 
+def _index_into_spm_units(
+    person_spm_unit_ids: np.ndarray,
+    spm_unit_ids: np.ndarray,
+) -> np.ndarray:
+    if np.all(spm_unit_ids[:-1] <= spm_unit_ids[1:]):
+        indices = np.searchsorted(spm_unit_ids, person_spm_unit_ids)
+        valid = (indices >= 0) & (indices < len(spm_unit_ids))
+        if valid.all() and np.array_equal(spm_unit_ids[indices], person_spm_unit_ids):
+            return indices
+
+    indexer = pd.Index(spm_unit_ids).get_indexer(person_spm_unit_ids)
+    if (indexer < 0).any():
+        raise ValueError("person_spm_unit_id contains values missing from spm_unit_id")
+    return indexer
+
+
+def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray:
+    """Rebuild cloned SPM thresholds from donor-half geography.
+
+    The clone half should keep the donor household's geography but use the
+    current threshold formula rather than a learned QRF output. We infer the
+    donor-half geographic adjustment from the stored first-half thresholds and
+    apply that same geography to the corresponding clone-half SPM units.
+    """
+
+    person_ages = np.asarray(data["age"][time_period])
+    person_spm_unit_ids = np.asarray(data["person_spm_unit_id"][time_period])
+    spm_unit_ids = np.asarray(data["spm_unit_id"][time_period])
+    tenure_types = np.asarray(data["spm_unit_tenure_type"][time_period])
+    stored_thresholds = np.asarray(data["spm_unit_spm_threshold"][time_period], dtype=float)
+
+    n_units = len(spm_unit_ids)
+    if n_units % 2 != 0:
+        raise ValueError("Expected cloned SPM units to have an even-length spm_unit_id array")
+
+    unit_indices = _index_into_spm_units(person_spm_unit_ids, spm_unit_ids)
+    is_adult = person_ages >= 18
+    adult_counts = np.zeros(n_units, dtype=np.int32)
+    child_counts = np.zeros(n_units, dtype=np.int32)
+    np.add.at(adult_counts, unit_indices, is_adult.astype(np.int32))
+    np.add.at(child_counts, unit_indices, (~is_adult).astype(np.int32))
+
+    thresholds_by_tenure = get_spm_reference_thresholds(time_period)
+    reference_base = np.array(
+        [
+            thresholds_by_tenure[
+                _SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")
+            ]
+            for tenure in tenure_types
+        ],
+        dtype=float,
+    )
+    equivalence_scale = spm_equivalence_scale(adult_counts, child_counts)
+
+    n_half = n_units // 2
+    donor_denominator = reference_base[:n_half] * equivalence_scale[:n_half]
+    donor_geoadj = np.divide(
+        stored_thresholds[:n_half],
+        donor_denominator,
+        out=np.ones(n_half, dtype=float),
+        where=donor_denominator > 0,
+    )
+    geoadj = np.concatenate([donor_geoadj, donor_geoadj])
+    rebuilt = reference_base * equivalence_scale * geoadj
+    return rebuilt.astype(np.float32)
+
+
 def _splice_cps_only_predictions(
     data: dict,
     predictions: pd.DataFrame,
@@ -489,6 +568,14 @@ def generate(self):
             time_period=self.time_period,
             dataset_path=str(self.cps.file_path),
         )
+        if "spm_unit_spm_threshold" in new_data:
+            logger.info("Rebuilding cloned SPM thresholds from donor-half geography")
+            new_data["spm_unit_spm_threshold"] = {
+                self.time_period: rebuild_cloned_spm_thresholds(
+                    new_data,
+                    self.time_period,
+                )
+            }
 
         new_data = self._rename_imputed_to_inputs(new_data)
         if _supports_structural_mortgage_inputs():
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
index a44977bdc..75bb66101 100644
--- a/tests/unit/test_extended_cps.py
+++ b/tests/unit/test_extended_cps.py
@@ -11,6 +11,7 @@
 import pandas as pd
 import pytest
 
+from policyengine_us_data.datasets.cps.enhanced_cps import initialize_weight_priors
 from policyengine_us_data.calibration.puf_impute import (
     IMPUTED_VARIABLES,
     OVERRIDDEN_IMPUTED_VARIABLES,
@@ -19,6 +20,7 @@
     CPS_ONLY_IMPUTED_VARIABLES,
     CPS_STAGE2_INCOME_PREDICTORS,
     apply_retirement_constraints,
+    rebuild_cloned_spm_thresholds,
     reconcile_ss_subcomponents,
 )
 from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES
@@ -110,6 +112,7 @@ def test_pension_income_not_in_cps_only(self):
         should_not_be_here = {
             "taxable_private_pension_income",
             "tax_exempt_private_pension_income",
+            "spm_unit_spm_threshold",
         }
         present = should_not_be_here & set(CPS_ONLY_IMPUTED_VARIABLES)
         assert present == set(), (
@@ -117,6 +120,49 @@ def test_pension_income_not_in_cps_only(self):
         )
 
 
+class TestDeterministicSpmThresholdRebuild:
+    def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self):
+        data = {
+            "age": {2024: np.array([40, 12, 70, 40, 12, 70], dtype=np.int32)},
+            "person_spm_unit_id": {2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)},
+            "spm_unit_id": {2024: np.array([10, 20, 30, 40], dtype=np.int32)},
+            "spm_unit_tenure_type": {
+                2024: np.array(
+                    [b"RENTER", b"OWNER_WITHOUT_MORTGAGE", b"RENTER", b"OWNER_WITHOUT_MORTGAGE"]
+                )
+            },
+            "spm_unit_spm_threshold": {
+                2024: np.array([20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64)
+            },
+        }
+
+        rebuilt = rebuild_cloned_spm_thresholds(data, 2024)
+
+        np.testing.assert_allclose(rebuilt[:2], np.array([20_000.0, 15_000.0]))
+        np.testing.assert_allclose(rebuilt[2:], np.array([20_000.0, 15_000.0]))
+
+
+class TestWeightPriorInitialization:
+    def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self):
+        weights = np.array([1_500.0, 0.0, 625.0, 0.0], dtype=np.float64)
+
+        priors = initialize_weight_priors(weights, seed=123)
+
+        assert np.all(priors > 0)
+        assert priors[1] < 1e-4
+        assert priors[3] < 1e-4
+        assert priors[0] == pytest.approx(1_500.0, rel=0.05)
+        assert priors[2] == pytest.approx(625.0, rel=0.05)
+
+    def test_initialize_weight_priors_is_reproducible(self):
+        weights = np.array([400.0, 0.0, 100.0], dtype=np.float64)
+
+        priors_a = initialize_weight_priors(weights, seed=77)
+        priors_b = initialize_weight_priors(weights, seed=77)
+
+        np.testing.assert_allclose(priors_a, priors_b)
+
+
 class TestRetirementConstraints:
     """Post-processing retirement constraints enforce IRS caps."""
 

From 84e425991c01166ff1406401ce89b21974c1637f Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 09:52:55 -0400
Subject: [PATCH 04/14] Format clone-half threshold fixes

---
 policyengine_us_data/datasets/cps/extended_cps.py | 12 +++++++-----
 tests/unit/test_extended_cps.py                   | 15 ++++++++++++---
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index b110b536c..22a443804 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -410,11 +410,15 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray:
     person_spm_unit_ids = np.asarray(data["person_spm_unit_id"][time_period])
     spm_unit_ids = np.asarray(data["spm_unit_id"][time_period])
     tenure_types = np.asarray(data["spm_unit_tenure_type"][time_period])
-    stored_thresholds = np.asarray(data["spm_unit_spm_threshold"][time_period], dtype=float)
+    stored_thresholds = np.asarray(
+        data["spm_unit_spm_threshold"][time_period], dtype=float
+    )
 
     n_units = len(spm_unit_ids)
     if n_units % 2 != 0:
-        raise ValueError("Expected cloned SPM units to have an even-length spm_unit_id array")
+        raise ValueError(
+            "Expected cloned SPM units to have an even-length spm_unit_id array"
+        )
 
     unit_indices = _index_into_spm_units(person_spm_unit_ids, spm_unit_ids)
     is_adult = person_ages >= 18
@@ -426,9 +430,7 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray:
     thresholds_by_tenure = get_spm_reference_thresholds(time_period)
     reference_base = np.array(
         [
-            thresholds_by_tenure[
-                _SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")
-            ]
+            thresholds_by_tenure[_SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")]
             for tenure in tenure_types
         ],
         dtype=float,
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
index 75bb66101..c278d0ea1 100644
--- a/tests/unit/test_extended_cps.py
+++ b/tests/unit/test_extended_cps.py
@@ -124,15 +124,24 @@ class TestDeterministicSpmThresholdRebuild:
     def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self):
         data = {
             "age": {2024: np.array([40, 12, 70, 40, 12, 70], dtype=np.int32)},
-            "person_spm_unit_id": {2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)},
+            "person_spm_unit_id": {
+                2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)
+            },
             "spm_unit_id": {2024: np.array([10, 20, 30, 40], dtype=np.int32)},
             "spm_unit_tenure_type": {
                 2024: np.array(
-                    [b"RENTER", b"OWNER_WITHOUT_MORTGAGE", b"RENTER", b"OWNER_WITHOUT_MORTGAGE"]
+                    [
+                        b"RENTER",
+                        b"OWNER_WITHOUT_MORTGAGE",
+                        b"RENTER",
+                        b"OWNER_WITHOUT_MORTGAGE",
+                    ]
                 )
             },
             "spm_unit_spm_threshold": {
-                2024: np.array([20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64)
+                2024: np.array(
+                    [20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64
+                )
             },
         }
 

From 289bbc7137ac68534ec61250d1323bbb1e91f446 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 09:54:20 -0400
Subject: [PATCH 05/14] Format SPM benchmark and calibration files

---
 .../calibration/publish_local_area.py             |  6 +-----
 .../calibration_targets/pull_hardcoded_targets.py |  1 +
 .../test_spm_thresholds.py                        |  4 +---
 policyengine_us_data/utils/spm.py                 | 15 ++++-----------
 tests/unit/test_census_spm_housing_targets.py     | 14 +++++++++-----
 validation/benefit_validation.py                  |  4 +---
 6 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py
index 4a8bb8a16..bc9f409f0 100644
--- a/policyengine_us_data/calibration/publish_local_area.py
+++ b/policyengine_us_data/calibration/publish_local_area.py
@@ -490,11 +490,7 @@ def build_h5(
     spm_unit_geoadj = np.array(
         [
             cd_geoadj_values[active_clone_cds[clone_id]][
-                (
-                    spm_tenure_cloned[spm_unit_index]
-                    .decode()
-                    .lower()
-                )
+                (spm_tenure_cloned[spm_unit_index].decode().lower())
             ]
             for spm_unit_index, clone_id in enumerate(spm_clone_ids)
         ],
diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
index c1478d957..0503c7b59 100644
--- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
+++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py
@@ -10,6 +10,7 @@
 sources. Housing uses the Census SPM capped subsidy concept, not HUD spending.
 """
 
+
 def get_hard_coded_totals(
     year: int = 2024,
     storage_folder=None,
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py
index 8cd2a06b0..3e5b8bd34 100644
--- a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py
+++ b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py
@@ -31,9 +31,7 @@ def test_load_cd_geoadj_values_returns_tenure_specific_lookup(monkeypatch):
 
     assert geoadj_lookup["101"]["renter"] == pytest.approx(1.2215)
     assert geoadj_lookup["101"]["owner_with_mortgage"] == pytest.approx(1.217)
-    assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx(
-        1.1615
-    )
+    assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx(1.1615)
 
 
 def test_calculate_spm_thresholds_vectorized_matches_policyengine_us_future_path():
diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py
index 3aba81693..a2220339b 100644
--- a/policyengine_us_data/utils/spm.py
+++ b/policyengine_us_data/utils/spm.py
@@ -90,15 +90,12 @@ def spm_equivalence_scale(num_adults, num_children):
 
     single_adult_with_children = with_children & (adults <= 1)
     raw[single_adult_with_children] = (
-        1.0
-        + 0.8
-        + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0)
+        1.0 + 0.8 + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0)
     ) ** 0.7
 
     multi_adult_with_children = with_children & ~single_adult_with_children
     raw[multi_adult_with_children] = (
-        adults[multi_adult_with_children]
-        + 0.5 * children[multi_adult_with_children]
+        adults[multi_adult_with_children] + 0.5 * children[multi_adult_with_children]
     ) ** 0.7
 
     no_children = has_people & ~with_children
@@ -125,16 +122,12 @@ def get_spm_reference_thresholds(year: int) -> dict[str, float]:
 
     cpi_u = _get_cpi_u_parameter()
     factor = float(
-        cpi_u(f"{year}-02-01")
-        / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01")
+        cpi_u(f"{year}-02-01") / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01")
     )
     latest_thresholds = PUBLISHED_SPM_REFERENCE_THRESHOLDS[
         LATEST_PUBLISHED_SPM_THRESHOLD_YEAR
     ]
-    return {
-        tenure: value * factor
-        for tenure, value in latest_thresholds.items()
-    }
+    return {tenure: value * factor for tenure, value in latest_thresholds.items()}
 
 
 def calculate_geoadj_from_rent(
diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py
index 6693ffe9a..be0098d29 100644
--- a/tests/unit/test_census_spm_housing_targets.py
+++ b/tests/unit/test_census_spm_housing_targets.py
@@ -35,9 +35,7 @@ def test_get_census_spm_capped_housing_subsidy_total_uses_raw_cps_spm_values(
 ):
     _write_census_cps(tmp_path)
 
-    total = get_census_spm_capped_housing_subsidy_total(
-        2024, storage_folder=tmp_path
-    )
+    total = get_census_spm_capped_housing_subsidy_total(2024, storage_folder=tmp_path)
 
     expected = (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100
     assert total == expected
@@ -147,13 +145,19 @@ def test_get_program_benchmarks_keeps_housing_concepts_separate(monkeypatch):
 
     benchmarks = benefit_validation.get_program_benchmarks(2022)
 
-    assert benchmarks["housing_spm_capped"]["variable"] == "spm_unit_capped_housing_subsidy"
+    assert (
+        benchmarks["housing_spm_capped"]["variable"]
+        == "spm_unit_capped_housing_subsidy"
+    )
     assert benchmarks["housing_spm_capped"]["benchmark_total"] == 111
     assert benchmarks["housing_spm_capped"]["benchmark_source"] == "Census benchmark"
 
     assert benchmarks["housing_assistance_hud_user"]["variable"] == "housing_assistance"
     assert benchmarks["housing_assistance_hud_user"]["benchmark_total"] == 222
-    assert benchmarks["housing_assistance_hud_user"]["benchmark_source"] == "HUD USER benchmark"
+    assert (
+        benchmarks["housing_assistance_hud_user"]["benchmark_source"]
+        == "HUD USER benchmark"
+    )
     assert (
         benchmarks["housing_assistance_hud_user"]["benchmark_participants_millions"]
         == 3
diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py
index 723fe6a08..14360782d 100644
--- a/validation/benefit_validation.py
+++ b/validation/benefit_validation.py
@@ -87,9 +87,7 @@ def analyze_benefit_underreporting():
         weighted_participants = ((benefit > 0) * weight).sum() / 1e6  # millions
 
         # Underreporting factor
-        benchmark_ratio = (
-            info["benchmark_total"] / total if total > 0 else np.inf
-        )
+        benchmark_ratio = info["benchmark_total"] / total if total > 0 else np.inf
         benchmark_participants = info.get("benchmark_participants_millions")
         participant_ratio = (
             weighted_participants / benchmark_participants

From 9b7222874afde3f6e1f088442a5679f622e49e21 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 10:44:13 -0400
Subject: [PATCH 06/14] Add changelog fragment for clone-half SPM fixes

---
 changelog.d/fixed/702.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/fixed/702.md

diff --git a/changelog.d/fixed/702.md b/changelog.d/fixed/702.md
new file mode 100644
index 000000000..6e2bafc59
--- /dev/null
+++ b/changelog.d/fixed/702.md
@@ -0,0 +1 @@
+Aligned clone-half enhanced CPS generation with the modeled SPM pipeline by rebuilding cloned SPM thresholds deterministically and keeping zero-weight synthetic households near zero in sparse reweighting priors.

From 1151faacf82f338150c69baae3120ac9dcc8adcc Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 10:45:54 -0400
Subject: [PATCH 07/14] Retrigger PR checks


From 9c01bc787abb8d55bc1591241b9107fda612a81e Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 10:46:50 -0400
Subject: [PATCH 08/14] Retrigger PR checks after cancel


From 0caed7938e9ee4ff18a5aa16dfcc208fd0038600 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 10:57:01 -0400
Subject: [PATCH 09/14] Fix Towncrier fragment layout

---
 changelog.d/{fixed/702.md => 702.fixed} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename changelog.d/{fixed/702.md => 702.fixed} (100%)

diff --git a/changelog.d/fixed/702.md b/changelog.d/702.fixed
similarity index 100%
rename from changelog.d/fixed/702.md
rename to changelog.d/702.fixed

From 98a4e4701a2b8c3267e2e20538ad1303d999db62 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 13:40:18 -0400
Subject: [PATCH 10/14] Tighten clone prior and tenure validation

---
 .../datasets/cps/enhanced_cps.py              | 13 +++------
 .../datasets/cps/extended_cps.py              | 12 ++++++++-
 tests/unit/test_extended_cps.py               | 27 +++++++++++++++++--
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py
index e0ce2c208..82bf9cc06 100644
--- a/policyengine_us_data/datasets/cps/enhanced_cps.py
+++ b/policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -32,13 +32,12 @@ def initialize_weight_priors(
     original_weights: np.ndarray,
     seed: int = 1456,
     epsilon: float = 1e-6,
-    positive_jitter_scale: float = 0.01,
 ) -> np.ndarray:
     """Build deterministic positive priors for sparse reweighting.
 
-    Original CPS households should keep priors close to their survey
-    weights. Clone-half households start with zero weight on purpose, so
-    they should receive only a tiny positive epsilon to keep the log
+    Original CPS households should keep their survey weights unchanged.
+    Clone-half households start with zero weight on purpose, so they
+    should receive only a tiny positive epsilon to keep the log
     optimization well-defined without giving them a meaningful head start.
     """
 
@@ -51,11 +50,7 @@ def initialize_weight_priors(
 
     positive_mask = weights > 0
     if positive_mask.any():
-        jitter = np.maximum(
-            rng.normal(loc=1.0, scale=positive_jitter_scale, size=positive_mask.sum()),
-            0.5,
-        )
-        priors[positive_mask] = np.maximum(weights[positive_mask] * jitter, epsilon)
+        priors[positive_mask] = weights[positive_mask]
 
     zero_mask = ~positive_mask
     if zero_mask.any():
diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py
index 22a443804..b7867f795 100644
--- a/policyengine_us_data/datasets/cps/extended_cps.py
+++ b/policyengine_us_data/datasets/cps/extended_cps.py
@@ -338,6 +338,16 @@ def reconcile_ss_subcomponents(predictions, total_ss):
 }
 
 
+def _reference_threshold_key_for_tenure(tenure_type) -> str:
+    reference_key = _SPM_TENURE_TO_REFERENCE_KEY.get(tenure_type)
+    if reference_key is None:
+        raise ValueError(
+            "Unsupported spm_unit_tenure_type for cloned threshold rebuild: "
+            f"{tenure_type!r}"
+        )
+    return reference_key
+
+
 def _apply_post_processing(predictions, X_test, time_period, data):
     """Apply retirement constraints and SS reconciliation."""
     ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS]
@@ -430,7 +440,7 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray:
     thresholds_by_tenure = get_spm_reference_thresholds(time_period)
     reference_base = np.array(
         [
-            thresholds_by_tenure[_SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")]
+            thresholds_by_tenure[_reference_threshold_key_for_tenure(tenure)]
             for tenure in tenure_types
         ],
         dtype=float,
diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py
index c278d0ea1..1c86a3bee 100644
--- a/tests/unit/test_extended_cps.py
+++ b/tests/unit/test_extended_cps.py
@@ -150,6 +150,22 @@ def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self):
         np.testing.assert_allclose(rebuilt[:2], np.array([20_000.0, 15_000.0]))
         np.testing.assert_allclose(rebuilt[2:], np.array([20_000.0, 15_000.0]))
 
+    def test_rebuild_cloned_spm_thresholds_rejects_unknown_tenure(self):
+        data = {
+            "age": {2024: np.array([40, 12, 40, 12], dtype=np.int32)},
+            "person_spm_unit_id": {2024: np.array([10, 10, 20, 20], dtype=np.int32)},
+            "spm_unit_id": {2024: np.array([10, 20], dtype=np.int32)},
+            "spm_unit_tenure_type": {
+                2024: np.array([b"RENTER", b"UNRECOGNIZED"], dtype="|S12")
+            },
+            "spm_unit_spm_threshold": {
+                2024: np.array([20_000.0, 20_000.0], dtype=np.float64)
+            },
+        }
+
+        with pytest.raises(ValueError, match="Unsupported spm_unit_tenure_type"):
+            rebuild_cloned_spm_thresholds(data, 2024)
+
 
 class TestWeightPriorInitialization:
     def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self):
@@ -160,8 +176,15 @@ def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self):
         assert np.all(priors > 0)
         assert priors[1] < 1e-4
         assert priors[3] < 1e-4
-        assert priors[0] == pytest.approx(1_500.0, rel=0.05)
-        assert priors[2] == pytest.approx(625.0, rel=0.05)
+        assert priors[0] == pytest.approx(1_500.0)
+        assert priors[2] == pytest.approx(625.0)
+
+    def test_initialize_weight_priors_preserves_positive_weights_exactly(self):
+        weights = np.array([1_500.0, 625.0, 42.0], dtype=np.float64)
+
+        priors = initialize_weight_priors(weights, seed=123)
+
+        np.testing.assert_array_equal(priors, weights)
 
     def test_initialize_weight_priors_is_reproducible(self):
         weights = np.array([400.0, 0.0, 100.0], dtype=np.float64)

From aed202ae5dfccbcc05130b31999877057cde9c66 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 21:13:05 -0400
Subject: [PATCH 11/14] Fix local-area calibration imports and takeup args

---
 policyengine_us_data/calibration/publish_local_area.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py
index b08c59232..0429c933f 100644
--- a/policyengine_us_data/calibration/publish_local_area.py
+++ b/policyengine_us_data/calibration/publish_local_area.py
@@ -25,6 +25,8 @@
 )
 from policyengine_us_data.calibration.block_assignment import (
     derive_geography_from_blocks,
+)
+from policyengine_us_data.calibration.county_assignment import (
     get_county_filter_probability,
 )
 from policyengine_us_data.calibration.clone_and_assign import (
@@ -544,6 +546,7 @@ def build_h5(
             hh_blocks=active_blocks,
             hh_state_fips=hh_state_fips,
             hh_ids=original_hh_ids,
+            hh_clone_indices=active_geo.astype(np.int64),
             entity_hh_indices=entity_hh_indices,
             entity_counts=entity_counts,
             time_period=time_period,

From e2ae174debe40c7709480ed2180aeb70f1aadbf2 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 21:46:56 -0400
Subject: [PATCH 12/14] Harden housing target test stub

---
 tests/unit/test_census_spm_housing_targets.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py
index be0098d29..6befed4ed 100644
--- a/tests/unit/test_census_spm_housing_targets.py
+++ b/tests/unit/test_census_spm_housing_targets.py
@@ -1,3 +1,4 @@
+import importlib
 import sys
 import types
 
@@ -57,10 +58,18 @@ class DummyMicrosimulation:
         def __init__(self, dataset):
             self.default_calculation_period = 2024
 
+    policyengine_us = importlib.import_module("policyengine_us")
     monkeypatch.setitem(
         sys.modules,
         "policyengine_us",
-        types.SimpleNamespace(Microsimulation=DummyMicrosimulation),
+        types.SimpleNamespace(
+            **{
+                name: getattr(policyengine_us, name)
+                for name in dir(policyengine_us)
+                if not name.startswith("__") and name != "Microsimulation"
+            },
+            Microsimulation=DummyMicrosimulation,
+        ),
     )
     monkeypatch.setattr(
         etl_national_targets,

From f5750e3215d2292596e97f0be7e0577f5811f291 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Wed, 8 Apr 2026 21:51:26 -0400
Subject: [PATCH 13/14] Trigger PR checks


From 17e77b3938f07937e77bb28ff7f44277e25e1c6e Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Thu, 9 Apr 2026 10:20:39 -0400
Subject: [PATCH 14/14] Use valid year in housing target test

---
 tests/unit/test_census_spm_housing_targets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py
index 6befed4ed..e595848d1 100644
--- a/tests/unit/test_census_spm_housing_targets.py
+++ b/tests/unit/test_census_spm_housing_targets.py
@@ -83,7 +83,7 @@ def __init__(self, dataset):
         },
     )
 
-    targets = etl_national_targets.extract_national_targets("dummy")
+    targets = etl_national_targets.extract_national_targets(2024)
     housing = next(
         target
         for target in targets["direct_sum_targets"]