diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 9c6225ccd..c097ea0bd 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -45,21 +45,37 @@ def initialize_weight_priors( original_weights: np.ndarray, seed: int = 1456, epsilon: float = 1e-6, + zero_weight_total_share: float = 0.5, ) -> np.ndarray: - """Build deterministic positive priors for sparse reweighting.""" + """Build deterministic positive priors for sparse reweighting. + + PUF clone households enter the extended CPS with zero household weight. + Giving those records near-zero priors leaves them effectively unusable in + log-space optimization. When zero-weight rows are present, preserve the + relative distribution of positive survey weights but reserve a fixed share + of the original total household mass for uniform zero-weight-row priors. + """ weights = np.asarray(original_weights, dtype=np.float64) if np.any(weights < 0): raise ValueError("original_weights must be non-negative") + if weights.size == 0: + return weights.copy() + if not 0 < zero_weight_total_share < 1: + raise ValueError("zero_weight_total_share must be between 0 and 1") priors = np.empty_like(weights, dtype=np.float64) positive_mask = weights > 0 - priors[positive_mask] = weights[positive_mask] - zero_mask = ~positive_mask - if zero_mask.any(): - rng = np.random.default_rng(seed) - priors[zero_mask] = epsilon * rng.uniform(1.0, 2.0, size=zero_mask.sum()) + if not zero_mask.any(): + return weights.copy() + + positive_total = float(weights[positive_mask].sum()) + if positive_total <= 0: + return np.full_like(weights, 1.0, dtype=np.float64) + + priors[positive_mask] = weights[positive_mask] * (1 - zero_weight_total_share) + priors[zero_mask] = positive_total * zero_weight_total_share / zero_mask.sum() return priors diff --git a/policyengine_us_data/storage/calibration_targets/soi_targets.csv b/policyengine_us_data/storage/calibration_targets/soi_targets.csv index b3f22d1e7..dcab8435d 100644 --- a/policyengine_us_data/storage/calibration_targets/soi_targets.csv +++ b/policyengine_us_data/storage/calibration_targets/soi_targets.csv @@ -11929,3 +11929,41 @@ Year,SOI table,XLSX column,XLSX row,Variable,Filing status,AGI lower bound,AGI u 2022,Table 3.3,AP,10,refundable_american_opportunity_credit,All,-inf,inf,False,False,True,5184485000 2023,Table 3.3,AO,10,refundable_american_opportunity_credit,All,-inf,inf,True,False,True,5821688 2023,Table 3.3,AP,10,refundable_american_opportunity_credit,All,-inf,inf,False,False,True,5090364000 +2023,Table 1.4A,BK,11,long_term_capital_gains,All,-inf,1.0,False,False,False,11981913000 +2023,Table 1.4A,BJ,11,long_term_capital_gains,All,-inf,1.0,True,False,False,137016 +2023,Table 1.4A,BK,12,long_term_capital_gains,All,1.0,5000.0,False,False,False,390046000 +2023,Table 1.4A,BJ,12,long_term_capital_gains,All,1.0,5000.0,True,False,False,171586 +2023,Table 1.4A,BK,13,long_term_capital_gains,All,5000.0,10000.0,False,False,False,740521000 +2023,Table 1.4A,BJ,13,long_term_capital_gains,All,5000.0,10000.0,True,False,False,181415 +2023,Table 1.4A,BK,14,long_term_capital_gains,All,10000.0,15000.0,False,False,False,1139960000 +2023,Table 1.4A,BJ,14,long_term_capital_gains,All,10000.0,15000.0,True,False,False,208487 +2023,Table 1.4A,BK,15,long_term_capital_gains,All,15000.0,20000.0,False,False,False,1222242000 +2023,Table 1.4A,BJ,15,long_term_capital_gains,All,15000.0,20000.0,True,False,False,231243 +2023,Table 1.4A,BK,16,long_term_capital_gains,All,20000.0,25000.0,False,False,False,1618072000 +2023,Table 1.4A,BJ,16,long_term_capital_gains,All,20000.0,25000.0,True,False,False,184713 +2023,Table 1.4A,BK,17,long_term_capital_gains,All,25000.0,30000.0,False,False,False,1627983000 +2023,Table 1.4A,BJ,17,long_term_capital_gains,All,25000.0,30000.0,True,False,False,184226 +2023,Table 1.4A,BK,18,long_term_capital_gains,All,30000.0,40000.0,False,False,False,2752465000 +2023,Table 1.4A,BJ,18,long_term_capital_gains,All,30000.0,40000.0,True,False,False,374807 +2023,Table 1.4A,BK,19,long_term_capital_gains,All,40000.0,50000.0,False,False,False,3402047000 +2023,Table 1.4A,BJ,19,long_term_capital_gains,All,40000.0,50000.0,True,False,False,401340 +2023,Table 1.4A,BK,20,long_term_capital_gains,All,50000.0,75000.0,False,False,False,9470818000 +2023,Table 1.4A,BJ,20,long_term_capital_gains,All,50000.0,75000.0,True,False,False,1138440 +2023,Table 1.4A,BK,21,long_term_capital_gains,All,75000.0,100000.0,False,False,False,12715937000 +2023,Table 1.4A,BJ,21,long_term_capital_gains,All,75000.0,100000.0,True,False,False,1185823 +2023,Table 1.4A,BK,22,long_term_capital_gains,All,100000.0,200000.0,False,False,False,63046717000 +2023,Table 1.4A,BJ,22,long_term_capital_gains,All,100000.0,200000.0,True,False,False,3470815 +2023,Table 1.4A,BK,23,long_term_capital_gains,All,200000.0,500000.0,False,False,False,127187338000 +2023,Table 1.4A,BJ,23,long_term_capital_gains,All,200000.0,500000.0,True,False,False,2793458 +2023,Table 1.4A,BK,24,long_term_capital_gains,All,500000.0,1000000.0,False,False,False,100228422000 +2023,Table 1.4A,BJ,24,long_term_capital_gains,All,500000.0,1000000.0,True,False,False,767767 +2023,Table 1.4A,BK,25,long_term_capital_gains,All,1000000.0,1500000.0,False,False,False,56098627000 +2023,Table 1.4A,BJ,25,long_term_capital_gains,All,1000000.0,1500000.0,True,False,False,196019 +2023,Table 1.4A,BK,26,long_term_capital_gains,All,1500000.0,2000000.0,False,False,False,37572096000 +2023,Table 1.4A,BJ,26,long_term_capital_gains,All,1500000.0,2000000.0,True,False,False,83388 +2023,Table 1.4A,BK,27,long_term_capital_gains,All,2000000.0,5000000.0,False,False,False,111769225000 +2023,Table 1.4A,BJ,27,long_term_capital_gains,All,2000000.0,5000000.0,True,False,False,123009 +2023,Table 1.4A,BK,28,long_term_capital_gains,All,5000000.0,10000000.0,False,False,False,82043062000 +2023,Table 1.4A,BJ,28,long_term_capital_gains,All,5000000.0,10000000.0,True,False,False,32657 +2023,Table 1.4A,BK,29,long_term_capital_gains,All,10000000.0,inf,False,False,False,346272458000 +2023,Table 1.4A,BJ,29,long_term_capital_gains,All,10000000.0,inf,True,False,False,22309 diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index d088ac516..d3b8785be 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -281,6 +281,7 @@ def _cbo_program_target_value(sim, variable_name: str, time_period): LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES = { "capital_gains_gross", + "long_term_capital_gains", "ordinary_dividends", "qualified_dividends", "taxable_interest_income", @@ -292,6 +293,7 @@ def _cbo_program_target_value(sim, variable_name: str, time_period): "employment_income", "business_net_profits", "capital_gains_gross", + "long_term_capital_gains", "ordinary_dividends", "partnership_and_s_corp_income", "qualified_dividends", diff --git a/policyengine_us_data/utils/soi.py b/policyengine_us_data/utils/soi.py index 2baca400b..daad17959 100644 --- a/policyengine_us_data/utils/soi.py +++ b/policyengine_us_data/utils/soi.py @@ -8,7 +8,8 @@ "count": "population", "employment_income": "employment_income_before_lsr", "business_net_profits": "total_self_employment_income", - "capital_gains_gross": "long_term_capital_gains", + "capital_gains_gross": "long_term_capital_gains_basis", + "long_term_capital_gains": "long_term_capital_gains_basis", "ordinary_dividends": "non_qualified_dividend_income", "partnership_and_s_corp_income": "partnership_s_corp_income", "qualified_dividends": "qualified_dividend_income", @@ -21,8 +22,8 @@ "total_pension_income": "pension_income", "total_social_security": "social_security", "business_net_losses": "total_self_employment_income", - "capital_gains_distributions": "long_term_capital_gains", - "capital_gains_losses": "long_term_capital_gains", + "capital_gains_distributions": "long_term_capital_gains_basis", + "capital_gains_losses": "long_term_capital_gains_basis", "estate_income": "estate_income", "estate_losses": "estate_income", "exempt_interest": "tax_exempt_interest_income", @@ -89,6 +90,8 @@ def pe(variable): df["capital_gains_losses"] = -pe("loss_limited_net_capital_gains") * ( pe("loss_limited_net_capital_gains") < 0 ) + ltcg = pe("long_term_capital_gains") + df["long_term_capital_gains"] = ltcg * (ltcg > 0) df["estate_income"] = pe("estate_income") * (pe("estate_income") > 0) df["estate_losses"] = -pe("estate_income") * (pe("estate_income") < 0) df["exempt_interest"] = pe("tax_exempt_interest_income") @@ -146,6 +149,12 @@ def puf_to_soi(puf, year): df["capital_gains_distributions"] = puf.E01100 df["capital_gains_gross"] = puf["E01000"] * (puf["E01000"] > 0) df["capital_gains_losses"] = -puf["E01000"] * (puf["E01000"] < 0) + ltcg = ( + puf["long_term_capital_gains"] + if "long_term_capital_gains" in puf + else puf.P23250 + ) + df["long_term_capital_gains"] = ltcg * (ltcg > 0) df["estate_income"] = puf.E26390 df["estate_losses"] = puf.E26400 df["exempt_interest"] = puf.E00400 diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index 3e7ee8baf..cd408ef43 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -15,6 +15,7 @@ BEA_WAGES_AND_SALARIES_LOSS_WEIGHT, BLS_CE_TOTALS, HARD_CODED_TOTALS, + LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES, TRANSFER_BALANCE_TARGETS, _add_bea_state_wage_targets, _add_agi_metric_columns, @@ -39,6 +40,7 @@ get_target_error_normalisation, get_target_loss_weights, ) +from policyengine_us_data.storage import CALIBRATION_FOLDER from policyengine_us_data.db import etl_national_targets from policyengine_us_data.utils.ssi_targets import ( SSI_RECIPIENT_TARGETS_2024, @@ -53,6 +55,29 @@ def test_legacy_loss_targets_include_aggregate_qbi_deduction(): assert "qualified_business_income_deduction" not in AGI_LEVEL_TARGETED_VARIABLES +def test_legacy_loss_targets_include_ltcg_agi_grid(): + assert "long_term_capital_gains" in AGI_LEVEL_TARGETED_VARIABLES + assert "long_term_capital_gains" in LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES + + soi = pd.read_csv(CALIBRATION_FOLDER / "soi_targets.csv") + ltcg = soi[ + (soi["Variable"] == "long_term_capital_gains") + & (soi["SOI table"] == "Table 1.4A") + & (soi["Filing status"] == "All") + & (~soi["Taxable only"]) + & (~soi["Full population"]) + ] + + assert ltcg.groupby("Count").size().to_dict() == {False: 19, True: 19} + assert ltcg["Value"].gt(0).all() + top_bracket = ltcg[ + (~ltcg["Count"]) + & (ltcg["AGI lower bound"] == 10_000_000.0) + & np.isinf(ltcg["AGI upper bound"]) + ] + assert top_bracket["Value"].iat[0] == 346_272_458_000 + + def test_bea_nipa_direct_sum_targets_match_targets_db(): loss_targets_by_variable = { variable: target for _, variable, target in BEA_NIPA_DIRECT_SUM_TARGETS @@ -790,12 +815,16 @@ def test_low_agi_soi_skip_keeps_investment_income_targets(): capital_income_low_agi_row = pd.Series( {"Variable": "capital_gains_gross", "AGI upper bound": 10_000.0} ) + ltcg_low_agi_row = pd.Series( + {"Variable": "long_term_capital_gains", "AGI upper bound": 10_000.0} + ) ordinary_higher_agi_row = pd.Series( {"Variable": "employment_income", "AGI upper bound": 25_000.0} ) assert _should_skip_soi_agi_row(ordinary_low_agi_row) assert not _should_skip_soi_agi_row(capital_income_low_agi_row) + assert not _should_skip_soi_agi_row(ltcg_low_agi_row) assert not _should_skip_soi_agi_row(ordinary_higher_agi_row) @@ -806,6 +835,9 @@ def test_all_return_soi_skip_keeps_investment_income_targets(): capital_income_all_return_row = pd.Series( {"Variable": "capital_gains_gross", "Taxable only": False} ) + ltcg_all_return_row = pd.Series( + {"Variable": "long_term_capital_gains", "Taxable only": False} + ) ordinary_taxable_row = pd.Series( {"Variable": "employment_income", "Taxable only": True} ) @@ -818,12 +850,17 @@ def test_all_return_soi_skip_keeps_investment_income_targets(): capital_income_taxable_row = pd.Series( {"Variable": "capital_gains_gross", "Taxable only": True} ) + ltcg_taxable_row = pd.Series( + {"Variable": "long_term_capital_gains", "Taxable only": True} + ) assert _should_skip_soi_taxability_row(ordinary_all_return_row) assert not _should_skip_soi_taxability_row(capital_income_all_return_row) + assert not _should_skip_soi_taxability_row(ltcg_all_return_row) assert not _should_skip_soi_taxability_row(ordinary_taxable_row) assert not _should_skip_soi_taxability_row(qbi_taxable_row) assert _should_skip_soi_taxability_row(capital_income_taxable_row) + assert _should_skip_soi_taxability_row(ltcg_taxable_row) def test_tanf_hardcoded_target_uses_fy2024_basic_assistance_total(): diff --git a/tests/unit/datasets/test_enhanced_cps_seeding.py b/tests/unit/datasets/test_enhanced_cps_seeding.py index 1f7ffc1aa..60bba54d6 100644 --- a/tests/unit/datasets/test_enhanced_cps_seeding.py +++ b/tests/unit/datasets/test_enhanced_cps_seeding.py @@ -3,7 +3,8 @@ Earlier versions used global ``np.random.normal(1, 0.1, ...)`` jitter before ``reweight()`` reseeded the optimizer. Current code routes both dense CPS weighting paths through ``initialize_weight_priors()``, which preserves positive -survey weights and gives zero-weight clone records deterministic tiny priors. +survey weight shape and gives zero-weight clone records deterministic uniform +prior mass. """ import numpy as np diff --git a/tests/unit/test_enhanced_cps_clone_diagnostics.py b/tests/unit/test_enhanced_cps_clone_diagnostics.py index bd1676120..f8ecfd865 100644 --- a/tests/unit/test_enhanced_cps_clone_diagnostics.py +++ b/tests/unit/test_enhanced_cps_clone_diagnostics.py @@ -14,16 +14,17 @@ ) -def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(): +def test_initialize_weight_priors_gives_zero_weight_records_balanced_mass(): weights = np.array([1_500.0, 0.0, 625.0, 0.0], dtype=np.float64) priors = initialize_weight_priors(weights, seed=123) assert np.all(priors > 0) - assert priors[1] < 1e-4 - assert priors[3] < 1e-4 - assert priors[0] == pytest.approx(1_500.0) - assert priors[2] == pytest.approx(625.0) + assert priors.sum() == pytest.approx(weights.sum()) + assert priors[[0, 2]].sum() == pytest.approx(weights.sum() / 2) + assert priors[[1, 3]].sum() == pytest.approx(weights.sum() / 2) + assert priors[1] == pytest.approx(priors[3]) + assert priors[0] / priors[2] == pytest.approx(weights[0] / weights[2]) def test_initialize_weight_priors_preserves_positive_weights_exactly(): diff --git a/tests/unit/test_soi_utils.py b/tests/unit/test_soi_utils.py index 889fb3566..35d2209c2 100644 --- a/tests/unit/test_soi_utils.py +++ b/tests/unit/test_soi_utils.py @@ -105,6 +105,7 @@ def calculate(self, variable, map_to=None): values = { "self_employment_income": np.array([100.0, -10.0]), "sstb_self_employment_income": np.array([50.0, -25.0]), + "long_term_capital_gains": np.array([25.0, -5.0]), "miscellaneous_income": np.array([12.0, -5.0]), "filing_status": np.array(["SINGLE", "SINGLE"]), "tax_unit_weight": np.ones(n), @@ -124,6 +125,9 @@ def calculate(self, variable, map_to=None): np.testing.assert_array_equal( soi["business_net_losses"].to_numpy(), np.array([0.0, 35.0]) ) + np.testing.assert_array_equal( + soi["long_term_capital_gains"].to_numpy(), np.array([25.0, 0.0]) + ) np.testing.assert_array_equal(soi["other_income"].to_numpy(), np.array([12.0, 0.0])) @@ -199,6 +203,56 @@ def test_get_soi_uses_best_available_year_per_variable(monkeypatch): assert np.isclose(taxable_interest_value, 266.6666666667) +def test_get_soi_uses_ltcg_basis_uprating_for_capital_gains(monkeypatch): + soi_module = load_soi_module() + fake_soi = pd.DataFrame( + [ + { + "Year": 2023, + "Variable": "capital_gains_gross", + "Value": 100.0, + }, + { + "Year": 2023, + "Variable": "long_term_capital_gains", + "Value": 200.0, + }, + ] + ) + for column, default in { + "SOI table": "Table 1.4A", + "XLSX column": "BK", + "XLSX row": 10, + "Filing status": "All", + "AGI lower bound": float("-inf"), + "AGI upper bound": float("inf"), + "Count": False, + "Taxable only": False, + "Full population": True, + }.items(): + fake_soi[column] = default + + uprating = pd.DataFrame( + { + 2023: [1.0, 1.0], + 2024: [2.0, 10.0], + }, + index=["long_term_capital_gains_basis", "employment_income_before_lsr"], + ) + + monkeypatch.setattr(soi_module, "load_tracked_soi_targets", lambda: fake_soi.copy()) + monkeypatch.setattr( + soi_module, + "create_policyengine_uprating_factors_table", + lambda: uprating, + ) + + soi = soi_module.get_soi(2024) + + assert soi.set_index("Variable").loc["capital_gains_gross", "Value"] == 200.0 + assert soi.set_index("Variable").loc["long_term_capital_gains", "Value"] == 400.0 + + def test_get_soi_uses_current_employment_income_uprating_without_legacy_row( monkeypatch, ):