From f9162bd0846e2e8be15c55d4e309efb48426206e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 22:15:41 -0400 Subject: [PATCH 1/8] Use Census childcare capping formula --- policyengine_us_data/datasets/cps/cps.py | 1 + .../datasets/cps/extended_cps.py | 151 +++++++++++------- tests/unit/test_extended_cps.py | 43 ++--- 3 files changed, 116 insertions(+), 79 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 5d5774eea..08301f167 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -494,6 +494,7 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): cps["weekly_hours_worked"] = person.HRSWK cps["hours_worked_last_week"] = person.A_HRS1 + cps["weeks_worked"] = np.clip(person.WKSWORK, 0, 52) cps["taxable_interest_income"] = person.INT_VAL * (p["taxable_interest_fraction"]) cps["tax_exempt_interest_income"] = person.INT_VAL * ( diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index c840f29af..760ac5df9 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -26,6 +26,12 @@ logger = logging.getLogger(__name__) +# Census SPM technical documentation, "SPM Work Expense Values". +# These are weekly work expense amounts applied to each adult earner. +SPM_WEEKLY_WORK_EXPENSE_BY_YEAR = { + 2024: 41.17, +} + def _supports_structural_mortgage_inputs() -> bool: return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES) @@ -325,69 +331,101 @@ def reconcile_ss_subcomponents(predictions, total_ss): } +def _get_spm_weekly_work_expense(year: int) -> float: + try: + return SPM_WEEKLY_WORK_EXPENSE_BY_YEAR[year] + except KeyError as exc: + raise ValueError( + f"No Census SPM weekly work expense value configured for {year}" + ) from exc + + +def _calculate_clone_work_expenses( + clone_person_data: pd.DataFrame, + clone_spm_unit_ids: np.ndarray, +) -> np.ndarray: + clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) + if clone_person_data.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + adult_earners = clone_person_data.loc[ + (clone_person_data["age"] >= 18) & (clone_person_data["earnings"] > 0), + ["spm_unit_id", "weeks_worked"], + ].copy() + if adult_earners.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + adult_earners["weeks_worked"] = adult_earners["weeks_worked"].clip( + lower=0, upper=52 + ) + return ( + adult_earners.groupby("spm_unit_id")["weeks_worked"] + .sum() + .reindex( + clone_spm_unit_ids, + fill_value=0.0, + ) + .to_numpy(dtype=float) + ) + + +def _calculate_clone_lower_earner_caps( + clone_person_data: pd.DataFrame, + clone_spm_unit_ids: np.ndarray, +) -> np.ndarray: + clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) + if clone_person_data.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + head_or_spouse = clone_person_data.loc[ + clone_person_data["is_parent_proxy"].astype(bool), + ["spm_unit_id", "earnings"], + ].copy() + if head_or_spouse.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + head_or_spouse["earnings"] = head_or_spouse["earnings"].clip(lower=0.0) + lower_earner_caps = head_or_spouse.groupby("spm_unit_id")["earnings"].agg( + lambda values: float(values.min()) if len(values) > 1 else float(values.iloc[0]) + ) + return lower_earner_caps.reindex( + clone_spm_unit_ids, + fill_value=0.0, + ).to_numpy(dtype=float) + + def derive_clone_capped_childcare_expenses( - donor_pre_subsidy: np.ndarray, - donor_capped: np.ndarray, clone_pre_subsidy: np.ndarray, clone_person_data: pd.DataFrame, clone_spm_unit_ids: np.ndarray, + time_period: int, ) -> np.ndarray: - """Derive clone-half capped childcare from clone inputs. + """Derive clone-half capped work and childcare expenses from clone inputs. The CPS provides both pre-subsidy childcare and the SPM-specific - capped childcare deduction. For the clone half, we impute only the - pre-subsidy amount, then deterministically rebuild the capped amount - instead of letting a second QRF predict it independently. - - We preserve the donor's observed capping share while also respecting - the clone's own earnings cap. This keeps the clone-half value - consistent with pre-subsidy childcare and avoids impossible outputs - such as capped childcare exceeding pre-subsidy childcare. + capped work-and-childcare deduction. For the clone half, we impute + only the pre-subsidy childcare amount, then deterministically rebuild + the capped value using the Census SPM rule: + work expenses plus childcare, capped at the lower earner's earnings + for the reference person and spouse/partner. """ - donor_pre_subsidy = np.asarray(donor_pre_subsidy, dtype=float) - donor_capped = np.asarray(donor_capped, dtype=float) clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float) - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - - donor_cap_share = np.divide( - donor_capped, - donor_pre_subsidy, - out=np.zeros_like(donor_capped, dtype=float), - where=donor_pre_subsidy > 0, + weekly_work_expense = _get_spm_weekly_work_expense(time_period) + annual_work_expenses = ( + _calculate_clone_work_expenses( + clone_person_data=clone_person_data, + clone_spm_unit_ids=clone_spm_unit_ids, + ) + * weekly_work_expense + ) + lower_earner_cap = _calculate_clone_lower_earner_caps( + clone_person_data=clone_person_data, + clone_spm_unit_ids=clone_spm_unit_ids, ) - donor_cap_share = np.clip(donor_cap_share, 0.0, 1.0) - capped_from_share = np.maximum(clone_pre_subsidy, 0.0) * donor_cap_share - - if clone_person_data.empty: - earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float) - else: - eligible = clone_person_data["is_parent_proxy"].astype(bool) - parent_rows = clone_person_data.loc[ - eligible, ["spm_unit_id", "age", "earnings"] - ].copy() - if parent_rows.empty: - earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float) - else: - parent_rows["earnings"] = parent_rows["earnings"].clip(lower=0.0) - parent_rows["age_rank"] = parent_rows.groupby("spm_unit_id")["age"].rank( - method="first", ascending=False - ) - top_two = parent_rows[parent_rows["age_rank"] <= 2].sort_values( - ["spm_unit_id", "age_rank"] - ) - earnings_cap_by_unit = top_two.groupby("spm_unit_id")["earnings"].agg( - lambda values: ( - float(values.iloc[0]) - if len(values) == 1 - else float(np.minimum(values.iloc[0], values.iloc[1])) - ) - ) - earnings_cap = earnings_cap_by_unit.reindex( - clone_spm_unit_ids, fill_value=0.0 - ).to_numpy(dtype=float) - return np.minimum(capped_from_share, earnings_cap) + combined_expenses = np.maximum(clone_pre_subsidy, 0.0) + annual_work_expenses + return np.minimum(combined_expenses, lower_earner_cap) def _rebuild_clone_capped_childcare_expenses( @@ -421,26 +459,19 @@ def _rebuild_clone_capped_childcare_expenses( data["employment_income"][time_period][n_persons_half:] + data["self_employment_income"][time_period][n_persons_half:] ), + "weeks_worked": data["weeks_worked"][time_period][n_persons_half:], } ) - - donor_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ - :n_spm_units_half - ] - donor_capped = data["spm_unit_capped_work_childcare_expenses"][time_period][ - :n_spm_units_half - ] clone_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ n_spm_units_half: ] clone_spm_unit_ids = data["spm_unit_id"][time_period][n_spm_units_half:] return derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=clone_person_data, clone_spm_unit_ids=clone_spm_unit_ids, + time_period=time_period, ) diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index e32172db2..a6ea8b654 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -126,57 +126,62 @@ def test_capped_childcare_not_in_cps_only(self): class TestCloneChildcareDerivation: - """Clone-half capped childcare should be derived deterministically.""" + """Clone-half capped work-and-childcare expenses should be deterministic.""" - def test_caps_at_pre_subsidy_and_clone_earnings(self): - donor_pre_subsidy = np.array([10000.0, 4000.0, 6000.0]) - donor_capped = np.array([4000.0, 4000.0, 0.0]) - clone_pre_subsidy = np.array([12000.0, 5000.0, 3000.0]) + def test_caps_combined_work_and_childcare_at_lower_earner(self): + clone_pre_subsidy = np.array([1200.0, 5000.0, 3000.0]) person_data = pd.DataFrame( { "spm_unit_id": [1, 1, 2, 2, 3], "age": [40, 38, 35, 33, 29], "is_parent_proxy": [True, True, True, True, True], "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0], + "weeks_worked": [10.0, 20.0, 30.0, 5.0, 15.0], } ) result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=person_data, clone_spm_unit_ids=np.array([1, 2, 3]), + time_period=2024, ) - np.testing.assert_allclose(result, np.array([3000.0, 0.0, 0.0])) + np.testing.assert_allclose( + result, + np.array( + [ + 2435.1, # 1200 childcare + (10 + 20) * 41.17 work expenses + 0.0, # Two-parent unit capped by the lower earner's zero earnings + 2000.0, # Single proxy unit capped at the proxy's earnings + ] + ), + rtol=0, + atol=1e-6, + ) - def test_uses_single_parent_earnings_cap_for_single_proxy_units(self): - donor_pre_subsidy = np.array([4000.0]) - donor_capped = np.array([4000.0]) - clone_pre_subsidy = np.array([6000.0]) + def test_includes_work_expenses_even_without_childcare(self): + clone_pre_subsidy = np.array([0.0]) person_data = pd.DataFrame( { "spm_unit_id": [10], "age": [31], "is_parent_proxy": [True], "earnings": [2500.0], + "weeks_worked": [12.0], } ) result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=person_data, clone_spm_unit_ids=np.array([10]), + time_period=2024, ) - np.testing.assert_allclose(result, np.array([2500.0])) + np.testing.assert_allclose(result, np.array([494.04]), rtol=0, atol=1e-6) def test_falls_back_to_zero_without_parent_proxies(self): - donor_pre_subsidy = np.array([3000.0]) - donor_capped = np.array([2000.0]) clone_pre_subsidy = np.array([3000.0]) person_data = pd.DataFrame( { @@ -184,15 +189,15 @@ def test_falls_back_to_zero_without_parent_proxies(self): "age": [12, 9], "is_parent_proxy": [False, False], "earnings": [0.0, 0.0], + "weeks_worked": [0.0, 0.0], } ) result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=person_data, clone_spm_unit_ids=np.array([20]), + time_period=2024, ) np.testing.assert_allclose(result, np.array([0.0])) From fe77557976f369b505db2ce3d98ae40bed7f8255 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 22:56:15 -0400 Subject: [PATCH 2/8] Add childcare formula changelog fragment --- changelog.d/705.fixed | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/705.fixed diff --git a/changelog.d/705.fixed b/changelog.d/705.fixed new file mode 100644 index 000000000..4c60c9bff --- /dev/null +++ b/changelog.d/705.fixed @@ -0,0 +1 @@ +Use Census work-and-childcare capping inputs for clone-half SPM childcare expenses instead of donor capping shares. From dd75f1d97eecb1f193e95a6feb7d366ba99a34cb Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 06:59:25 -0400 Subject: [PATCH 3/8] Populate childcare formula inputs from CPS --- .../datasets/cps/extended_cps.py | 168 ------------------ tests/unit/test_extended_cps.py | 82 +-------- tests/unit/test_weeks_worked.py | 41 +++++ 3 files changed, 42 insertions(+), 249 deletions(-) create mode 100644 tests/unit/test_weeks_worked.py diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 760ac5df9..310d0a072 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -26,12 +26,6 @@ logger = logging.getLogger(__name__) -# Census SPM technical documentation, "SPM Work Expense Values". -# These are weekly work expense amounts applied to each adult earner. -SPM_WEEKLY_WORK_EXPENSE_BY_YEAR = { - 2024: 41.17, -} - def _supports_structural_mortgage_inputs() -> bool: return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES) @@ -331,150 +325,6 @@ def reconcile_ss_subcomponents(predictions, total_ss): } -def _get_spm_weekly_work_expense(year: int) -> float: - try: - return SPM_WEEKLY_WORK_EXPENSE_BY_YEAR[year] - except KeyError as exc: - raise ValueError( - f"No Census SPM weekly work expense value configured for {year}" - ) from exc - - -def _calculate_clone_work_expenses( - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, -) -> np.ndarray: - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - if clone_person_data.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - adult_earners = clone_person_data.loc[ - (clone_person_data["age"] >= 18) & (clone_person_data["earnings"] > 0), - ["spm_unit_id", "weeks_worked"], - ].copy() - if adult_earners.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - adult_earners["weeks_worked"] = adult_earners["weeks_worked"].clip( - lower=0, upper=52 - ) - return ( - adult_earners.groupby("spm_unit_id")["weeks_worked"] - .sum() - .reindex( - clone_spm_unit_ids, - fill_value=0.0, - ) - .to_numpy(dtype=float) - ) - - -def _calculate_clone_lower_earner_caps( - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, -) -> np.ndarray: - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - if clone_person_data.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - head_or_spouse = clone_person_data.loc[ - clone_person_data["is_parent_proxy"].astype(bool), - ["spm_unit_id", "earnings"], - ].copy() - if head_or_spouse.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - head_or_spouse["earnings"] = head_or_spouse["earnings"].clip(lower=0.0) - lower_earner_caps = head_or_spouse.groupby("spm_unit_id")["earnings"].agg( - lambda values: float(values.min()) if len(values) > 1 else float(values.iloc[0]) - ) - return lower_earner_caps.reindex( - clone_spm_unit_ids, - fill_value=0.0, - ).to_numpy(dtype=float) - - -def derive_clone_capped_childcare_expenses( - clone_pre_subsidy: np.ndarray, - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, - time_period: int, -) -> np.ndarray: - """Derive clone-half capped work and childcare expenses from clone inputs. - - The CPS provides both pre-subsidy childcare and the SPM-specific - capped work-and-childcare deduction. For the clone half, we impute - only the pre-subsidy childcare amount, then deterministically rebuild - the capped value using the Census SPM rule: - work expenses plus childcare, capped at the lower earner's earnings - for the reference person and spouse/partner. - """ - - clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float) - weekly_work_expense = _get_spm_weekly_work_expense(time_period) - annual_work_expenses = ( - _calculate_clone_work_expenses( - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - ) - * weekly_work_expense - ) - lower_earner_cap = _calculate_clone_lower_earner_caps( - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - ) - - combined_expenses = np.maximum(clone_pre_subsidy, 0.0) + annual_work_expenses - return np.minimum(combined_expenses, lower_earner_cap) - - -def _rebuild_clone_capped_childcare_expenses( - data: dict, - time_period: int, - cps_sim, -) -> np.ndarray: - """Rebuild clone-half capped childcare expenses after stage-2 imputation.""" - - n_persons_half = len(data["person_id"][time_period]) // 2 - n_spm_units_half = len(data["spm_unit_id"][time_period]) // 2 - - person_roles = cps_sim.calculate_dataframe( - ["age", "is_tax_unit_head", "is_tax_unit_spouse"] - ) - if len(person_roles) != n_persons_half: - raise ValueError( - "Unexpected person role frame length while rebuilding clone childcare " - f"expenses: got {len(person_roles)}, expected {n_persons_half}" - ) - - clone_person_data = pd.DataFrame( - { - "spm_unit_id": data["person_spm_unit_id"][time_period][n_persons_half:], - "age": person_roles["age"].values, - "is_parent_proxy": ( - person_roles["is_tax_unit_head"].values - | person_roles["is_tax_unit_spouse"].values - ), - "earnings": ( - data["employment_income"][time_period][n_persons_half:] - + data["self_employment_income"][time_period][n_persons_half:] - ), - "weeks_worked": data["weeks_worked"][time_period][n_persons_half:], - } - ) - clone_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ - n_spm_units_half: - ] - clone_spm_unit_ids = data["spm_unit_id"][time_period][n_spm_units_half:] - - return derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - time_period=time_period, - ) - - def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS] @@ -579,24 +429,6 @@ def _splice_cps_only_predictions( new_values = np.concatenate([cps_half, pred_values]) data[var] = {time_period: new_values} - if ( - "spm_unit_capped_work_childcare_expenses" in data - and "spm_unit_pre_subsidy_childcare_expenses" in data - ): - n_half = entity_half_lengths.get( - "spm_unit", - len(data["spm_unit_capped_work_childcare_expenses"][time_period]) // 2, - ) - cps_half = data["spm_unit_capped_work_childcare_expenses"][time_period][:n_half] - clone_half = _rebuild_clone_capped_childcare_expenses( - data=data, - time_period=time_period, - cps_sim=cps_sim, - ) - data["spm_unit_capped_work_childcare_expenses"] = { - time_period: np.concatenate([cps_half, clone_half]) - } - del cps_sim return data diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index a6ea8b654..3bbd98a2d 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -19,7 +19,6 @@ CPS_ONLY_IMPUTED_VARIABLES, CPS_STAGE2_INCOME_PREDICTORS, apply_retirement_constraints, - derive_clone_capped_childcare_expenses, reconcile_ss_subcomponents, ) from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES @@ -118,91 +117,12 @@ def test_pension_income_not_in_cps_only(self): ) def test_capped_childcare_not_in_cps_only(self): - """Capped childcare should be derived from clone-half inputs, not - independently QRF-imputed.""" + """Capped childcare should not be independently QRF-imputed.""" assert "spm_unit_capped_work_childcare_expenses" not in set( CPS_ONLY_IMPUTED_VARIABLES ) -class TestCloneChildcareDerivation: - """Clone-half capped work-and-childcare expenses should be deterministic.""" - - def test_caps_combined_work_and_childcare_at_lower_earner(self): - clone_pre_subsidy = np.array([1200.0, 5000.0, 3000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [1, 1, 2, 2, 3], - "age": [40, 38, 35, 33, 29], - "is_parent_proxy": [True, True, True, True, True], - "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0], - "weeks_worked": [10.0, 20.0, 30.0, 5.0, 15.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([1, 2, 3]), - time_period=2024, - ) - - np.testing.assert_allclose( - result, - np.array( - [ - 2435.1, # 1200 childcare + (10 + 20) * 41.17 work expenses - 0.0, # Two-parent unit capped by the lower earner's zero earnings - 2000.0, # Single proxy unit capped at the proxy's earnings - ] - ), - rtol=0, - atol=1e-6, - ) - - def test_includes_work_expenses_even_without_childcare(self): - clone_pre_subsidy = np.array([0.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [10], - "age": [31], - "is_parent_proxy": [True], - "earnings": [2500.0], - "weeks_worked": [12.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([10]), - time_period=2024, - ) - - np.testing.assert_allclose(result, np.array([494.04]), rtol=0, atol=1e-6) - - def test_falls_back_to_zero_without_parent_proxies(self): - clone_pre_subsidy = np.array([3000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [20, 20], - "age": [12, 9], - "is_parent_proxy": [False, False], - "earnings": [0.0, 0.0], - "weeks_worked": [0.0, 0.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([20]), - time_period=2024, - ) - - np.testing.assert_allclose(result, np.array([0.0])) - - class TestRetirementConstraints: """Post-processing retirement constraints enforce IRS caps.""" diff --git a/tests/unit/test_weeks_worked.py b/tests/unit/test_weeks_worked.py new file mode 100644 index 000000000..ad5f801af --- /dev/null +++ b/tests/unit/test_weeks_worked.py @@ -0,0 +1,41 @@ +""" +Tests for weeks_worked extraction from CPS ASEC. + +The Census CPS ASEC exposes WKSWORK directly, which we now carry through as +the model input for future-year SPM work-expense calculations. +""" + +import numpy as np +from pathlib import Path + + +class TestWeeksWorked: + """Test suite for weeks_worked variable extraction.""" + + def test_census_cps_includes_wkswork(self): + census_cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/census_cps.py" + ) + content = census_cps_path.read_text() + + assert '"WKSWORK"' in content, "WKSWORK should be in PERSON_COLUMNS" + + def test_cps_maps_weeks_worked_from_wkswork(self): + cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/cps.py" + ) + content = cps_path.read_text() + + assert 'cps["weeks_worked"]' in content + assert "person.WKSWORK" in content + assert "np.clip(person.WKSWORK, 0, 52)" in content + + def test_weeks_worked_value_range(self): + raw_values = np.array([-4, 0, 1, 26, 52, 60]) + processed = np.clip(raw_values, 0, 52) + + assert processed.min() >= 0, "Minimum should be >= 0" + assert processed.max() <= 52, "Maximum should be <= 52" + assert processed[0] == 0, "Negative values should clip to 0" + assert processed[3] == 26, "Valid weeks should be preserved" + assert processed[5] == 52, "Values above 52 should clip to 52" From 18f8ebd4a44b174180df43fdb13daefe2890cdf2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 07:40:25 -0400 Subject: [PATCH 4/8] Carry CPS partner input for SPM childcare --- .../datasets/cps/census_cps.py | 1 + policyengine_us_data/datasets/cps/cps.py | 3 ++ tests/unit/test_reference_partner.py | 32 +++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tests/unit/test_reference_partner.py diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 042fefe56..6faed88fe 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -233,6 +233,7 @@ class CensusCPS_2018(CensusCPS): "A_FNLWGT", "A_LINENO", "A_SPOUSE", + "PERRP", "A_AGE", "A_SEX", "PEDISEYE", diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 08301f167..6fe4d2f35 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -462,6 +462,9 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 + cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin( + [43, 44, 46, 47] + ) # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 diff --git a/tests/unit/test_reference_partner.py b/tests/unit/test_reference_partner.py new file mode 100644 index 000000000..83ee6bd28 --- /dev/null +++ b/tests/unit/test_reference_partner.py @@ -0,0 +1,32 @@ +""" +Tests for reference-person partner extraction from CPS ASEC. + +The public CPS ASEC relationship-to-reference-person variable PERRP identifies +unmarried partners of the household head/reference person. We carry that +through so the SPM childcare cap can distinguish the reference person's partner +from unrelated adults in the same SPM unit. +""" + +from pathlib import Path + + +class TestReferencePartner: + """Test suite for CPS relationship-to-reference-person extraction.""" + + def test_census_cps_includes_perrp(self): + census_cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/census_cps.py" + ) + content = census_cps_path.read_text() + + assert '"PERRP"' in content, "PERRP should be in PERSON_COLUMNS" + + def test_cps_maps_unmarried_partner_from_perrp(self): + cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/cps.py" + ) + content = cps_path.read_text() + + assert 'cps["is_unmarried_partner_of_household_head"]' in content + for code in ("43", "44", "46", "47"): + assert code in content From e5a488e0bc35207b9c19f94b853949d4733e2750 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 10:16:50 -0400 Subject: [PATCH 5/8] Format CPS PERRP mapping --- policyengine_us_data/datasets/cps/cps.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 6fe4d2f35..f0e2c756a 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -462,9 +462,7 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 - cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin( - [43, 44, 46, 47] - ) + cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin([43, 44, 46, 47]) # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 From 27169f8f4f51cda5b2454178f84ef53b22e5e3fa Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 10 Apr 2026 10:31:35 -0400 Subject: [PATCH 6/8] Address childcare CPS review comments --- policyengine_us_data/datasets/cps/cps.py | 13 ++++++- tests/unit/test_reference_partner.py | 49 +++++++++++++++++------- tests/unit/test_weeks_worked.py | 45 +++++++++++----------- 3 files changed, 70 insertions(+), 37 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index dcdab86e3..8c31d10ce 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -90,6 +90,15 @@ ), } +# Census CPS ASEC 2024 technical documentation, PERRP: +# https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar24.pdf +PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES = { + 43: "Opposite Sex Unmarried Partner with Relatives", + 44: "Opposite Sex Unmarried Partner without Relatives", + 46: "Same Sex Unmarried Partner with Relatives", + 47: "Same Sex Unmarried Partner without Relatives", +} + class CPS(Dataset): name = "cps" @@ -572,7 +581,9 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 - cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin([43, 44, 46, 47]) + cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin( + PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES.keys() + ) # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 diff --git a/tests/unit/test_reference_partner.py b/tests/unit/test_reference_partner.py index 83ee6bd28..d579b080a 100644 --- a/tests/unit/test_reference_partner.py +++ b/tests/unit/test_reference_partner.py @@ -7,26 +7,49 @@ from unrelated adults in the same SPM unit. """ -from pathlib import Path +import numpy as np +import pandas as pd + +from policyengine_us_data.datasets.cps.census_cps import PERSON_COLUMNS +from policyengine_us_data.datasets.cps.cps import ( + PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES, + add_personal_variables, +) + + +def _person_frame(**columns): + n_persons = len(next(iter(columns.values()))) + data = {column: np.zeros(n_persons, dtype=int) for column in PERSON_COLUMNS} + data.update(columns) + return pd.DataFrame(data) class TestReferencePartner: """Test suite for CPS relationship-to-reference-person extraction.""" - def test_census_cps_includes_perrp(self): - census_cps_path = Path(__file__).parent.parent.parent / ( - "policyengine_us_data/datasets/cps/census_cps.py" - ) - content = census_cps_path.read_text() + def test_census_cps_loads_perrp(self): + assert "PERRP" in PERSON_COLUMNS - assert '"PERRP"' in content, "PERRP should be in PERSON_COLUMNS" + def test_unmarried_partner_perrp_code_table_matches_census_labels(self): + assert PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES == { + 43: "Opposite Sex Unmarried Partner with Relatives", + 44: "Opposite Sex Unmarried Partner without Relatives", + 46: "Same Sex Unmarried Partner with Relatives", + 47: "Same Sex Unmarried Partner without Relatives", + } def test_cps_maps_unmarried_partner_from_perrp(self): - cps_path = Path(__file__).parent.parent.parent / ( - "policyengine_us_data/datasets/cps/cps.py" + person = _person_frame( + PH_SEQ=np.arange(7) + 1, + A_LINENO=np.ones(7), + A_AGE=np.full(7, 35), + PERRP=np.array([40, 43, 44, 45, 46, 47, 48]), ) - content = cps_path.read_text() - assert 'cps["is_unmarried_partner_of_household_head"]' in content - for code in ("43", "44", "46", "47"): - assert code in content + cps = {} + add_personal_variables(cps, person) + + np.testing.assert_array_equal( + cps["is_unmarried_partner_of_household_head"], + np.array([False, True, True, False, True, True, False]), + ) diff --git a/tests/unit/test_weeks_worked.py b/tests/unit/test_weeks_worked.py index ad5f801af..0c3db2f76 100644 --- a/tests/unit/test_weeks_worked.py +++ b/tests/unit/test_weeks_worked.py @@ -6,36 +6,35 @@ """ import numpy as np -from pathlib import Path +import pandas as pd + +from policyengine_us_data.datasets.cps.census_cps import PERSON_COLUMNS +from policyengine_us_data.datasets.cps.cps import add_personal_income_variables + + +def _person_frame(**columns): + n_persons = len(next(iter(columns.values()))) + data = {column: np.zeros(n_persons, dtype=int) for column in PERSON_COLUMNS} + data.update(columns) + return pd.DataFrame(data) class TestWeeksWorked: """Test suite for weeks_worked variable extraction.""" - def test_census_cps_includes_wkswork(self): - census_cps_path = Path(__file__).parent.parent.parent / ( - "policyengine_us_data/datasets/cps/census_cps.py" - ) - content = census_cps_path.read_text() - - assert '"WKSWORK"' in content, "WKSWORK should be in PERSON_COLUMNS" + def test_census_cps_loads_wkswork(self): + assert "WKSWORK" in PERSON_COLUMNS def test_cps_maps_weeks_worked_from_wkswork(self): - cps_path = Path(__file__).parent.parent.parent / ( - "policyengine_us_data/datasets/cps/cps.py" + person = _person_frame( + A_AGE=np.full(6, 35), + WKSWORK=np.array([-4, 0, 1, 26, 52, 60]), ) - content = cps_path.read_text() - - assert 'cps["weeks_worked"]' in content - assert "person.WKSWORK" in content - assert "np.clip(person.WKSWORK, 0, 52)" in content - def test_weeks_worked_value_range(self): - raw_values = np.array([-4, 0, 1, 26, 52, 60]) - processed = np.clip(raw_values, 0, 52) + cps = {} + add_personal_income_variables(cps, person, 2024) - assert processed.min() >= 0, "Minimum should be >= 0" - assert processed.max() <= 52, "Maximum should be <= 52" - assert processed[0] == 0, "Negative values should clip to 0" - assert processed[3] == 26, "Valid weeks should be preserved" - assert processed[5] == 52, "Values above 52 should clip to 52" + np.testing.assert_array_equal( + cps["weeks_worked"], + np.array([0, 0, 1, 26, 52, 52]), + ) From 89b58ac6b99b47c6d5581562f4228c3882d37c46 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 10 Apr 2026 10:41:00 -0400 Subject: [PATCH 7/8] Fix weeks worked unit test isolation --- policyengine_us_data/datasets/cps/cps.py | 6 +++++- tests/unit/test_weeks_worked.py | 22 +++------------------- 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 8c31d10ce..2b328456b 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -594,6 +594,10 @@ def children_per_parent(col: str) -> pd.DataFrame: add_overtime_occupation(cps, person) +def derive_weeks_worked(weeks_worked: Series | np.ndarray) -> Series | np.ndarray: + return np.clip(weeks_worked, 0, 52) + + def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): """Add income variables. @@ -619,7 +623,7 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): cps["weekly_hours_worked"] = person.HRSWK cps["hours_worked_last_week"] = person.A_HRS1 - cps["weeks_worked"] = np.clip(person.WKSWORK, 0, 52) + cps["weeks_worked"] = derive_weeks_worked(person.WKSWORK) cps["taxable_interest_income"] = person.INT_VAL * (p["taxable_interest_fraction"]) cps["tax_exempt_interest_income"] = person.INT_VAL * ( diff --git a/tests/unit/test_weeks_worked.py b/tests/unit/test_weeks_worked.py index 0c3db2f76..7f1bc6959 100644 --- a/tests/unit/test_weeks_worked.py +++ b/tests/unit/test_weeks_worked.py @@ -6,17 +6,9 @@ """ import numpy as np -import pandas as pd from policyengine_us_data.datasets.cps.census_cps import PERSON_COLUMNS -from policyengine_us_data.datasets.cps.cps import add_personal_income_variables - - -def _person_frame(**columns): - n_persons = len(next(iter(columns.values()))) - data = {column: np.zeros(n_persons, dtype=int) for column in PERSON_COLUMNS} - data.update(columns) - return pd.DataFrame(data) +from policyengine_us_data.datasets.cps.cps import derive_weeks_worked class TestWeeksWorked: @@ -25,16 +17,8 @@ class TestWeeksWorked: def test_census_cps_loads_wkswork(self): assert "WKSWORK" in PERSON_COLUMNS - def test_cps_maps_weeks_worked_from_wkswork(self): - person = _person_frame( - A_AGE=np.full(6, 35), - WKSWORK=np.array([-4, 0, 1, 26, 52, 60]), - ) - - cps = {} - add_personal_income_variables(cps, person, 2024) - + def test_cps_derives_weeks_worked_from_wkswork(self): np.testing.assert_array_equal( - cps["weeks_worked"], + derive_weeks_worked(np.array([-4, 0, 1, 26, 52, 60])), np.array([0, 0, 1, 26, 52, 52]), ) From 7f64138e118ec81afc4494bc9381fdd0f12cc938 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Fri, 10 Apr 2026 20:57:01 -0400 Subject: [PATCH 8/8] Handle missing PERRP in synthetic CPS frames --- policyengine_us_data/datasets/cps/cps.py | 7 ++++++- tests/unit/test_reference_partner.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 2b328456b..1244be4e7 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -581,7 +581,12 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 - cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin( + perrp = ( + person.PERRP + if "PERRP" in person + else pd.Series(0, index=person.index, dtype=np.int16) + ) + cps["is_unmarried_partner_of_household_head"] = perrp.isin( PERRP_UNMARRIED_PARTNER_OF_HOUSEHOLD_HEAD_CODES.keys() ) # High school or college/university enrollment status. diff --git a/tests/unit/test_reference_partner.py b/tests/unit/test_reference_partner.py index d579b080a..7d18ce25b 100644 --- a/tests/unit/test_reference_partner.py +++ b/tests/unit/test_reference_partner.py @@ -53,3 +53,18 @@ def test_cps_maps_unmarried_partner_from_perrp(self): cps["is_unmarried_partner_of_household_head"], np.array([False, True, True, False, True, True, False]), ) + + def test_missing_perrp_defaults_to_false(self): + person = _person_frame( + PH_SEQ=np.arange(3) + 1, + A_LINENO=np.ones(3), + A_AGE=np.full(3, 35), + ).drop(columns="PERRP") + + cps = {} + add_personal_variables(cps, person) + + np.testing.assert_array_equal( + cps["is_unmarried_partner_of_household_head"], + np.array([False, False, False]), + )