diff --git a/changelog.d/719.added.md b/changelog.d/719.added.md new file mode 100644 index 000000000..649dc5b94 --- /dev/null +++ b/changelog.d/719.added.md @@ -0,0 +1 @@ +Added richer national CTC calibration and validation coverage by loading AGI-split refundable and nonrefundable CTC targets from IRS geography data, expanding CTC diagnostics to AGI-by-filing-status and child-composition tables, and reporting a canonical ARPA-style CTC reform in national H5 validation. diff --git a/policyengine_us_data/calibration/ctc_diagnostics.py b/policyengine_us_data/calibration/ctc_diagnostics.py index cee364c67..f1ee6d73b 100644 --- a/policyengine_us_data/calibration/ctc_diagnostics.py +++ b/policyengine_us_data/calibration/ctc_diagnostics.py @@ -40,6 +40,28 @@ "non_refundable_ctc", ] +CHILD_AGE_GROUP_COLUMNS = [ + "tax_unit_count", + "ctc_qualifying_children", + "ctc_recipient_count", + "refundable_ctc_recipient_count", + "non_refundable_ctc_recipient_count", +] + +COUNT_FORMAT_COLUMNS = { + "tax_unit_count", + "ctc_qualifying_children", + "ctc_recipient_count", + "refundable_ctc_recipient_count", + "non_refundable_ctc_recipient_count", +} + +AMOUNT_FORMAT_COLUMNS = { + "ctc", + "refundable_ctc", + "non_refundable_ctc", +} + def _assign_agi_bands(adjusted_gross_income: np.ndarray) -> pd.Categorical: labels = [label for _, _, label in IRS_AGI_BANDS] @@ -58,15 +80,19 @@ def _normalize_filing_status(filing_status: pd.Series) -> pd.Categorical: return pd.Categorical(labels, categories=FILING_STATUS_ORDER, ordered=True) -def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]: - """Aggregate weighted CTC diagnostics by AGI band and filing status.""" - work = frame.copy() - weights = work["tax_unit_weight"].astype(float).to_numpy() +def _assign_ctc_child_count_buckets( + ctc_qualifying_children: np.ndarray, +) -> pd.Categorical: + labels = ["0", "1", "2", "3+"] + bucket = np.full(len(ctc_qualifying_children), labels[-1], dtype=object) + bucket[ctc_qualifying_children <= 0] = "0" + bucket[ctc_qualifying_children == 1] = "1" + bucket[ctc_qualifying_children == 2] = "2" + return pd.Categorical(bucket, categories=labels, ordered=True) - work["agi_band"] = _assign_agi_bands( - work["adjusted_gross_income"].astype(float).to_numpy() - ) - work["filing_status_group"] = _normalize_filing_status(work["filing_status"]) + +def _add_weighted_ctc_columns(work: pd.DataFrame) -> pd.DataFrame: + weights = work["tax_unit_weight"].astype(float).to_numpy() work["tax_unit_count"] = weights work["ctc_qualifying_children"] = ( @@ -87,6 +113,67 @@ def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]: work["non_refundable_ctc"].astype(float).to_numpy() * weights ) + return work + + +def _build_child_age_table(work: pd.DataFrame) -> pd.DataFrame | None: + if ( + "ctc_qualifying_children_under_6" not in work + or "ctc_qualifying_children_6_to_17" not in work + ): + return None + + weights = work["tax_unit_weight"].astype(float).to_numpy() + ctc_positive = work["ctc"].astype(float).to_numpy() > 0 + refundable_positive = work["refundable_ctc"].astype(float).to_numpy() > 0 + non_refundable_positive = work["non_refundable_ctc"].astype(float).to_numpy() > 0 + + rows = [] + for label, child_counts in ( + ( + "Under 6", + work["ctc_qualifying_children_under_6"].astype(float).to_numpy(), + ), + ( + "Age 6-17", + work["ctc_qualifying_children_6_to_17"].astype(float).to_numpy(), + ), + ): + has_children = child_counts > 0 + rows.append( + { + "group": label, + "tax_unit_count": float((has_children.astype(float) * weights).sum()), + "ctc_qualifying_children": float((child_counts * weights).sum()), + "ctc_recipient_count": float( + ((ctc_positive & has_children).astype(float) * weights).sum() + ), + "refundable_ctc_recipient_count": float( + ((refundable_positive & has_children).astype(float) * weights).sum() + ), + "non_refundable_ctc_recipient_count": float( + ( + (non_refundable_positive & has_children).astype(float) * weights + ).sum() + ), + } + ) + + return pd.DataFrame(rows, columns=["group"] + CHILD_AGE_GROUP_COLUMNS) + + +def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]: + """Aggregate weighted CTC diagnostics by AGI band and filing status.""" + work = frame.copy() + child_counts = work["ctc_qualifying_children"].astype(float).to_numpy() + + work["agi_band"] = _assign_agi_bands( + work["adjusted_gross_income"].astype(float).to_numpy() + ) + work["filing_status_group"] = _normalize_filing_status(work["filing_status"]) + work["child_count_group"] = _assign_ctc_child_count_buckets(child_counts) + work = _add_weighted_ctc_columns(work) + by_agi = ( work.groupby("agi_band", observed=False)[CTC_GROUP_COLUMNS] .sum() @@ -99,26 +186,73 @@ def build_ctc_diagnostic_tables(frame: pd.DataFrame) -> dict[str, pd.DataFrame]: .reset_index() .rename(columns={"filing_status_group": "group"}) ) + by_agi_band_and_filing_status = ( + work.groupby(["agi_band", "filing_status_group"], observed=False)[ + CTC_GROUP_COLUMNS + ] + .sum() + .reset_index() + .rename(columns={"filing_status_group": "filing_status"}) + ) + by_child_count = ( + work.groupby("child_count_group", observed=False)[CTC_GROUP_COLUMNS] + .sum() + .reset_index() + .rename(columns={"child_count_group": "group"}) + ) + by_child_age = _build_child_age_table(frame) - return { + tables = { "by_agi_band": by_agi, "by_filing_status": by_filing_status, + "by_agi_band_and_filing_status": by_agi_band_and_filing_status, + "by_child_count": by_child_count, } + if by_child_age is not None: + tables["by_child_age"] = by_child_age + return tables -def create_ctc_diagnostic_tables(sim) -> dict[str, pd.DataFrame]: +def create_ctc_diagnostic_tables(sim, period=None) -> dict[str, pd.DataFrame]: """Calculate weighted CTC diagnostic tables from a microsimulation.""" frame = pd.DataFrame( { - "adjusted_gross_income": sim.calculate("adjusted_gross_income").values, - "filing_status": sim.calculate("filing_status").values, - "tax_unit_weight": sim.calculate("tax_unit_weight").values, - "ctc_qualifying_children": sim.calculate("ctc_qualifying_children").values, - "ctc": sim.calculate("ctc").values, - "refundable_ctc": sim.calculate("refundable_ctc").values, - "non_refundable_ctc": sim.calculate("non_refundable_ctc").values, + "adjusted_gross_income": sim.calculate( + "adjusted_gross_income", period=period + ).values, + "filing_status": sim.calculate("filing_status", period=period).values, + "tax_unit_weight": sim.calculate("tax_unit_weight", period=period).values, + "ctc_qualifying_children": sim.calculate( + "ctc_qualifying_children", period=period + ).values, + "ctc": sim.calculate("ctc", period=period).values, + "refundable_ctc": sim.calculate("refundable_ctc", period=period).values, + "non_refundable_ctc": sim.calculate( + "non_refundable_ctc", period=period + ).values, } ) + + try: + ctc_qualifying_child = sim.calculate( + "ctc_qualifying_child", + map_to="person", + period=period, + ).values.astype(bool) + age = sim.calculate("age", map_to="person", period=period).values.astype(float) + frame["ctc_qualifying_children_under_6"] = sim.map_result( + (ctc_qualifying_child & (age < 6)).astype(float), + "person", + "tax_unit", + ) + frame["ctc_qualifying_children_6_to_17"] = sim.map_result( + (ctc_qualifying_child & (age >= 6) & (age < 18)).astype(float), + "person", + "tax_unit", + ) + except Exception: + pass + return build_ctc_diagnostic_tables(frame) @@ -132,14 +266,9 @@ def _format_amount(value: float) -> str: def format_ctc_diagnostic_table(table: pd.DataFrame) -> str: display = table.copy() - for column in [ - "tax_unit_count", - "ctc_qualifying_children", - "ctc_recipient_count", - "refundable_ctc_recipient_count", - "non_refundable_ctc_recipient_count", - ]: - display[column] = display[column].map(_format_count) - for column in ["ctc", "refundable_ctc", "non_refundable_ctc"]: - display[column] = display[column].map(_format_amount) + for column in display.columns: + if column in COUNT_FORMAT_COLUMNS: + display[column] = display[column].map(_format_count) + elif column in AMOUNT_FORMAT_COLUMNS: + display[column] = display[column].map(_format_amount) return display.to_string(index=False) diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml index 8cd182ec0..41c7474dd 100644 --- a/policyengine_us_data/calibration/target_config.yaml +++ b/policyengine_us_data/calibration/target_config.yaml @@ -154,9 +154,15 @@ include: - variable: refundable_ctc geo_level: national domain_variable: refundable_ctc + - variable: refundable_ctc + geo_level: national + domain_variable: adjusted_gross_income,refundable_ctc - variable: non_refundable_ctc geo_level: national domain_variable: non_refundable_ctc + - variable: non_refundable_ctc + geo_level: national + domain_variable: adjusted_gross_income,non_refundable_ctc - variable: self_employment_income geo_level: national domain_variable: self_employment_income @@ -181,9 +187,15 @@ include: - variable: tax_unit_count geo_level: national domain_variable: refundable_ctc + - variable: tax_unit_count + geo_level: national + domain_variable: adjusted_gross_income,refundable_ctc - variable: tax_unit_count geo_level: national domain_variable: non_refundable_ctc + - variable: tax_unit_count + geo_level: national + domain_variable: adjusted_gross_income,non_refundable_ctc # Restore old loss.py's self-employment return-count target. - variable: tax_unit_count geo_level: national diff --git a/policyengine_us_data/calibration/validate_national_h5.py b/policyengine_us_data/calibration/validate_national_h5.py index 3f79f7aef..e4097ae23 100644 --- a/policyengine_us_data/calibration/validate_national_h5.py +++ b/policyengine_us_data/calibration/validate_national_h5.py @@ -14,6 +14,8 @@ import argparse import os +import pandas as pd + from policyengine_us_data.calibration.ctc_diagnostics import ( create_ctc_diagnostic_tables, format_ctc_diagnostic_table, @@ -67,6 +69,88 @@ "ctc_qualifying_children", } +CANONICAL_CTC_REFORM_VARIABLES = [ + "ctc_value", + "ctc", + "refundable_ctc", + "non_refundable_ctc", + "eitc", + "household_net_income", +] + +CANONICAL_CTC_REFORM_DICT = { + "gov.irs.credits.eitc.max[0].amount": {"2025-01-01.2100-12-31": 2_000}, + "gov.irs.credits.eitc.max[1].amount": {"2025-01-01.2100-12-31": 2_000}, + "gov.irs.credits.eitc.max[2].amount": {"2025-01-01.2100-12-31": 2_000}, + "gov.irs.credits.eitc.max[3].amount": {"2025-01-01.2100-12-31": 2_000}, + "gov.irs.credits.ctc.phase_out.amount": {"2025-01-01.2100-12-31": 25}, + "gov.irs.credits.ctc.amount.arpa[0].amount": {"2025-01-01.2100-12-31": 4_800}, + "gov.irs.credits.ctc.amount.arpa[1].amount": {"2025-01-01.2100-12-31": 4_800}, + "gov.irs.credits.ctc.phase_out.arpa.amount": {"2025-01-01.2100-12-31": 25}, + "gov.contrib.ctc.minimum_refundable.in_effect": {"2025-01-01.2100-12-31": True}, + "gov.contrib.ctc.per_child_phase_in.in_effect": {"2025-01-01.2100-12-31": True}, + "gov.irs.credits.ctc.phase_out.arpa.in_effect": {"2025-01-01.2100-12-31": True}, + "gov.irs.credits.ctc.refundable.phase_in.rate": {"2025-01-01.2100-12-31": 0.2}, + "gov.irs.credits.eitc.phase_in_rate[0].amount": {"2025-01-01.2100-12-31": 0.2}, + "gov.irs.credits.eitc.phase_in_rate[1].amount": {"2025-01-01.2100-12-31": 0.2}, + "gov.irs.credits.eitc.phase_in_rate[2].amount": {"2025-01-01.2100-12-31": 0.2}, + "gov.irs.credits.eitc.phase_in_rate[3].amount": {"2025-01-01.2100-12-31": 0.2}, + "gov.contrib.ctc.per_child_phase_out.in_effect": {"2025-01-01.2100-12-31": True}, + "gov.irs.credits.ctc.phase_out.threshold.JOINT": {"2025-01-01.2100-12-31": 200_000}, + "gov.irs.credits.ctc.refundable.individual_max": {"2025-01-01.2100-12-31": 4_800}, + "gov.irs.credits.eitc.phase_out.rate[0].amount": {"2025-01-01.2100-12-31": 0.1}, + "gov.irs.credits.eitc.phase_out.rate[1].amount": {"2025-01-01.2100-12-31": 0.1}, + "gov.irs.credits.eitc.phase_out.rate[2].amount": {"2025-01-01.2100-12-31": 0.1}, + "gov.irs.credits.eitc.phase_out.rate[3].amount": {"2025-01-01.2100-12-31": 0.1}, + "gov.irs.credits.ctc.phase_out.threshold.SINGLE": { + "2025-01-01.2100-12-31": 100_000 + }, + "gov.irs.credits.eitc.phase_out.start[0].amount": {"2025-01-01.2100-12-31": 20_000}, + "gov.irs.credits.eitc.phase_out.start[1].amount": {"2025-01-01.2100-12-31": 20_000}, + "gov.irs.credits.eitc.phase_out.start[2].amount": {"2025-01-01.2100-12-31": 20_000}, + "gov.irs.credits.eitc.phase_out.start[3].amount": {"2025-01-01.2100-12-31": 20_000}, + "gov.irs.credits.ctc.phase_out.threshold.SEPARATE": { + "2025-01-01.2100-12-31": 100_000 + }, + "gov.contrib.ctc.per_child_phase_out.avoid_overlap": { + "2025-01-01.2100-12-31": True + }, + "gov.irs.credits.ctc.refundable.phase_in.threshold": {"2025-01-01.2100-12-31": 0}, + "gov.irs.credits.ctc.phase_out.arpa.threshold.JOINT": { + "2025-01-01.2100-12-31": 35_000 + }, + "gov.contrib.ctc.minimum_refundable.amount[0].amount": { + "2025-01-01.2100-12-31": 2_400 + }, + "gov.contrib.ctc.minimum_refundable.amount[1].amount": { + "2025-01-01.2100-12-31": 2_400 + }, + "gov.irs.credits.ctc.phase_out.arpa.threshold.SINGLE": { + "2025-01-01.2100-12-31": 25_000 + }, + "gov.irs.credits.eitc.phase_out.joint_bonus[0].amount": { + "2025-01-01.2100-12-31": 7_000 + }, + "gov.irs.credits.eitc.phase_out.joint_bonus[1].amount": { + "2025-01-01.2100-12-31": 7_000 + }, + "gov.irs.credits.ctc.phase_out.arpa.threshold.SEPARATE": { + "2025-01-01.2100-12-31": 25_000 + }, + "gov.irs.credits.ctc.phase_out.threshold.SURVIVING_SPOUSE": { + "2025-01-01.2100-12-31": 100_000 + }, + "gov.irs.credits.ctc.phase_out.threshold.HEAD_OF_HOUSEHOLD": { + "2025-01-01.2100-12-31": 100_000 + }, + "gov.irs.credits.ctc.phase_out.arpa.threshold.SURVIVING_SPOUSE": { + "2025-01-01.2100-12-31": 25_000 + }, + "gov.irs.credits.ctc.phase_out.arpa.threshold.HEAD_OF_HOUSEHOLD": { + "2025-01-01.2100-12-31": 25_000 + }, +} + def get_reference_values(reference_year: int = 2024): """Return national validation references for the current production year.""" @@ -86,7 +170,7 @@ def get_reference_values(reference_year: int = 2024): def get_ctc_diagnostic_outputs(sim) -> dict[str, str]: """Return formatted CTC diagnostics for human-readable validation output.""" tables = create_ctc_diagnostic_tables(sim) - return { + outputs = { "CTC DIAGNOSTICS BY AGI BAND": format_ctc_diagnostic_table( tables["by_agi_band"] ), @@ -94,6 +178,132 @@ def get_ctc_diagnostic_outputs(sim) -> dict[str, str]: tables["by_filing_status"] ), } + if "by_agi_band_and_filing_status" in tables: + outputs["CTC DIAGNOSTICS BY AGI BAND AND FILING STATUS"] = ( + format_ctc_diagnostic_table(tables["by_agi_band_and_filing_status"]) + ) + if "by_child_count" in tables: + outputs["CTC DIAGNOSTICS BY QUALIFYING-CHILD COUNT"] = ( + format_ctc_diagnostic_table(tables["by_child_count"]) + ) + if "by_child_age" in tables: + outputs["CTC DIAGNOSTICS BY QUALIFYING-CHILD AGE"] = ( + format_ctc_diagnostic_table(tables["by_child_age"]) + ) + return outputs + + +def build_canonical_ctc_reform_summary( + baseline_sim, + reformed_sim, + *, + period: int = 2025, +) -> pd.DataFrame: + rows = [] + for variable in CANONICAL_CTC_REFORM_VARIABLES: + baseline = float(baseline_sim.calculate(variable, period=period).sum()) + reformed = float(reformed_sim.calculate(variable, period=period).sum()) + rows.append( + { + "variable": variable, + "baseline": baseline, + "reformed": reformed, + "delta": reformed - baseline, + } + ) + return pd.DataFrame(rows) + + +def _format_canonical_ctc_reform_summary(table: pd.DataFrame) -> str: + display = table.copy() + for column in ("baseline", "reformed", "delta"): + display[column] = display[column].map(lambda value: f"${value / 1e9:,.1f}B") + return display.to_string(index=False) + + +def _subtract_diagnostic_tables( + baseline_tables: dict[str, pd.DataFrame], + reformed_tables: dict[str, pd.DataFrame], +) -> dict[str, pd.DataFrame]: + delta_tables = {} + for name, baseline in baseline_tables.items(): + if name not in reformed_tables: + continue + reformed = reformed_tables[name] + numeric_columns = [ + column + for column in baseline.columns + if column in reformed.columns + and pd.api.types.is_numeric_dtype(baseline[column]) + and pd.api.types.is_numeric_dtype(reformed[column]) + ] + id_columns = [ + column + for column in baseline.columns + if column in reformed.columns and column not in numeric_columns + ] + merged = baseline.merge( + reformed, + on=id_columns, + suffixes=("_baseline", "_reformed"), + ) + delta = merged[id_columns].copy() + for column in numeric_columns: + delta[column] = merged[f"{column}_reformed"] - merged[f"{column}_baseline"] + delta_tables[name] = delta + return delta_tables + + +def _create_canonical_ctc_reform(): + from policyengine_core.reforms import Reform + + return Reform.from_dict(CANONICAL_CTC_REFORM_DICT, country_id="us") + + +def get_canonical_ctc_reform_outputs( + dataset_path: str, + *, + baseline_sim=None, + period: int = 2025, +) -> dict[str, str]: + from policyengine_us import Microsimulation + + if baseline_sim is None: + baseline_sim = Microsimulation(dataset=dataset_path) + + reformed_sim = Microsimulation( + dataset=dataset_path, + reform=_create_canonical_ctc_reform(), + ) + + outputs = { + "CANONICAL CTC REFORM NATIONAL DELTAS": _format_canonical_ctc_reform_summary( + build_canonical_ctc_reform_summary( + baseline_sim, + reformed_sim, + period=period, + ) + ) + } + + delta_tables = _subtract_diagnostic_tables( + create_ctc_diagnostic_tables(baseline_sim, period=period), + create_ctc_diagnostic_tables(reformed_sim, period=period), + ) + section_names = { + "by_agi_band": "CANONICAL CTC REFORM DELTAS BY AGI BAND", + "by_filing_status": "CANONICAL CTC REFORM DELTAS BY FILING STATUS", + "by_agi_band_and_filing_status": ( + "CANONICAL CTC REFORM DELTAS BY AGI BAND AND FILING STATUS" + ), + "by_child_count": "CANONICAL CTC REFORM DELTAS BY QUALIFYING-CHILD COUNT", + "by_child_age": "CANONICAL CTC REFORM DELTAS BY QUALIFYING-CHILD AGE", + } + for name, table in delta_tables.items(): + if name in section_names: + outputs[section_names[name]] = format_ctc_diagnostic_table(table) + + return outputs def resolve_dataset_path(dataset_path: str) -> str: @@ -198,6 +408,15 @@ def main(argv=None): print("=" * 70) print(section_output) + for section_name, section_output in get_canonical_ctc_reform_outputs( + resolved_dataset_path, + baseline_sim=sim, + ).items(): + print("\n" + "=" * 70) + print(section_name) + print("=" * 70) + print(section_output) + print("\n" + "=" * 70) print("STRUCTURAL CHECKS") print("=" * 70) diff --git a/policyengine_us_data/db/etl_irs_soi.py b/policyengine_us_data/db/etl_irs_soi.py index 8f25dc636..aeed698e7 100644 --- a/policyengine_us_data/db/etl_irs_soi.py +++ b/policyengine_us_data/db/etl_irs_soi.py @@ -157,6 +157,8 @@ def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) -> "unemployment_compensation": "unemployment_compensation", } +CTC_GEOGRAPHY_TARGET_VARIABLES = ("refundable_ctc", "non_refundable_ctc") + def create_records(df, breakdown_variable, target_variable): """Transforms a DataFrame subset into a standardized list of records.""" @@ -384,6 +386,54 @@ def _get_national_geography_soi_target_from_year( } +def _get_national_geography_soi_agi_targets_from_year( + variable: str, + geography_year: int, +) -> list[dict]: + spec = _get_geography_file_aggregate_target_spec(variable) + code = spec["code"] + + raw_df = extract_soi_data(geography_year) + if "CONG_DISTRICT" in raw_df.columns: + district_mask = raw_df["CONG_DISTRICT"] == 0 + else: + district_mask = True + state_rows = raw_df[ + (raw_df["STATE"] != "US") + & district_mask + & raw_df["agi_stub"].isin(AGI_STUB_TO_INCOME_RANGE) + ] + if state_rows.empty: + raise ValueError( + f"IRS geography SOI file for {geography_year} is missing state AGI rows " + f"for {variable}" + ) + + grouped = ( + state_rows.groupby("agi_stub", sort=True)[[f"N{code}", f"A{code}"]] + .sum() + .reset_index() + ) + + targets = [] + for row in grouped.itertuples(index=False): + agi_stub = int(row.agi_stub) + agi_lower_bound, agi_upper_bound = AGI_STUB_TO_INCOME_RANGE[agi_stub] + targets.append( + { + "variable": variable, + "source_year": geography_year, + "agi_stub": agi_stub, + "agi_lower_bound": float(agi_lower_bound), + "agi_upper_bound": float(agi_upper_bound), + "count": float(getattr(row, f"N{code}")), + "amount": float(getattr(row, f"A{code}")) * 1_000, + } + ) + + return targets + + def get_national_geography_soi_target( variable: str, dataset_year: int, @@ -395,6 +445,17 @@ def get_national_geography_soi_target( return _get_national_geography_soi_target_from_year(variable, geography_year) +def get_national_geography_soi_agi_targets( + variable: str, + dataset_year: int, + *, + lag: int = IRS_SOI_LAG_YEARS, +) -> list[dict]: + """Return national AGI-band count and amount targets from the geography file.""" + geography_year = get_geography_soi_year(dataset_year, lag=lag) + return _get_national_geography_soi_agi_targets_from_year(variable, geography_year) + + def _upsert_target( session: Session, *, @@ -469,11 +530,64 @@ def _get_or_create_national_domain_stratum( return stratum +def _get_or_create_national_agi_domain_stratum( + session: Session, + national_filer_stratum_id: int, + variable: str, + agi_lower_bound: float, + agi_upper_bound: float, +) -> Stratum: + note = ( + "National filers, AGI >= " + f"{agi_lower_bound}, AGI < {agi_upper_bound}, {variable} > 0" + ) + stratum = session.exec( + select(Stratum).where( + Stratum.parent_stratum_id == national_filer_stratum_id, + Stratum.notes == note, + ) + ).first() + if stratum: + return stratum + + stratum = Stratum( + parent_stratum_id=national_filer_stratum_id, + notes=note, + ) + stratum.constraints_rel.extend( + [ + StratumConstraint( + constraint_variable="tax_unit_is_filer", + operation="==", + value="1", + ), + StratumConstraint( + constraint_variable="adjusted_gross_income", + operation=">=", + value=str(agi_lower_bound), + ), + StratumConstraint( + constraint_variable="adjusted_gross_income", + operation="<", + value=str(agi_upper_bound), + ), + StratumConstraint( + constraint_variable=variable, + operation=">", + value="0", + ), + ] + ) + session.add(stratum) + session.flush() + return stratum + + def load_national_geography_ctc_targets( session: Session, national_filer_stratum_id: int, geography_year: int ) -> None: """Create national aggregate CTC targets from the IRS geography file.""" - for variable in ("refundable_ctc", "non_refundable_ctc"): + for variable in CTC_GEOGRAPHY_TARGET_VARIABLES: target = _get_national_geography_soi_target_from_year(variable, geography_year) stratum = _get_or_create_national_domain_stratum( session, @@ -504,6 +618,47 @@ def load_national_geography_ctc_targets( ) +def load_national_geography_ctc_agi_targets( + session: Session, + national_filer_stratum_id: int, + geography_year: int, +) -> None: + """Create national AGI-split CTC targets from the IRS geography file.""" + for variable in CTC_GEOGRAPHY_TARGET_VARIABLES: + for target in _get_national_geography_soi_agi_targets_from_year( + variable, geography_year + ): + stratum = _get_or_create_national_agi_domain_stratum( + session, + national_filer_stratum_id, + variable, + target["agi_lower_bound"], + target["agi_upper_bound"], + ) + notes = ( + f"IRS geography-file national AGI target " + f"(source year {target['source_year']}, agi_stub {target['agi_stub']})" + ) + _upsert_target( + session, + stratum_id=stratum.stratum_id, + variable="tax_unit_count", + period=geography_year, + value=target["count"], + source="IRS SOI", + notes=notes, + ) + _upsert_target( + session, + stratum_id=stratum.stratum_id, + variable=variable, + period=geography_year, + value=target["amount"], + source="IRS SOI", + notes=notes, + ) + + def load_national_workbook_soi_targets( session: Session, national_filer_stratum_id: int, target_year: int ) -> None: @@ -924,6 +1079,7 @@ def load_soi_data(long_dfs, year, national_year: Optional[int] = None): filer_strata["district"][district_geoid] = district_filer_stratum.stratum_id load_national_geography_ctc_targets(session, filer_strata["national"], year) + load_national_geography_ctc_agi_targets(session, filer_strata["national"], year) if national_year is not None: load_national_workbook_soi_targets( diff --git a/tests/unit/calibration/test_ctc_diagnostics.py b/tests/unit/calibration/test_ctc_diagnostics.py index f483efb0d..33731cc97 100644 --- a/tests/unit/calibration/test_ctc_diagnostics.py +++ b/tests/unit/calibration/test_ctc_diagnostics.py @@ -186,3 +186,99 @@ def test_build_ctc_diagnostic_tables_aggregates_weights_by_group(): assert ( by_filing_status.loc["Joint / surviving spouse", "non_refundable_ctc"] == 75.0 ) + + +def test_build_ctc_diagnostic_tables_adds_ctc_composition_breakdowns(): + frame = pd.DataFrame( + { + "adjusted_gross_income": [ + 12_000.0, + 12_000.0, + 80_000.0, + 250_000.0, + ], + "filing_status": [ + "SINGLE", + "HEAD_OF_HOUSEHOLD", + "JOINT", + "JOINT", + ], + "tax_unit_weight": [ + 1.0, + 2.0, + 3.0, + 4.0, + ], + "ctc_qualifying_children": [ + 0.0, + 1.0, + 2.0, + 3.0, + ], + "ctc_qualifying_children_under_6": [ + 0.0, + 1.0, + 1.0, + 2.0, + ], + "ctc_qualifying_children_6_to_17": [ + 0.0, + 0.0, + 1.0, + 1.0, + ], + "ctc": [ + 0.0, + 1_000.0, + 4_000.0, + 6_000.0, + ], + "refundable_ctc": [ + 0.0, + 600.0, + 2_500.0, + 3_000.0, + ], + "non_refundable_ctc": [ + 0.0, + 400.0, + 1_500.0, + 3_000.0, + ], + } + ) + + tables = build_ctc_diagnostic_tables(frame) + + by_agi_and_status = tables["by_agi_band_and_filing_status"].set_index( + ["agi_band", "filing_status"] + ) + assert by_agi_and_status.loc[("$10k-$25k", "Single"), "tax_unit_count"] == 1.0 + assert ( + by_agi_and_status.loc[ + ("$10k-$25k", "Head of household"), + "ctc_recipient_count", + ] + == 2.0 + ) + assert ( + by_agi_and_status.loc[ + ("$75k-$100k", "Joint / surviving spouse"), + "ctc", + ] + == 12_000.0 + ) + + by_child_count = tables["by_child_count"].set_index("group") + assert by_child_count.loc["0", "tax_unit_count"] == 1.0 + assert by_child_count.loc["1", "ctc"] == 2_000.0 + assert by_child_count.loc["2", "refundable_ctc"] == 7_500.0 + assert by_child_count.loc["3+", "non_refundable_ctc"] == 12_000.0 + + by_child_age = tables["by_child_age"].set_index("group") + assert by_child_age.loc["Under 6", "tax_unit_count"] == 9.0 + assert by_child_age.loc["Under 6", "ctc_qualifying_children"] == 13.0 + assert by_child_age.loc["Under 6", "ctc_recipient_count"] == 9.0 + assert by_child_age.loc["Age 6-17", "tax_unit_count"] == 7.0 + assert by_child_age.loc["Age 6-17", "ctc_qualifying_children"] == 7.0 + assert by_child_age.loc["Age 6-17", "non_refundable_ctc_recipient_count"] == 7.0 diff --git a/tests/unit/calibration/test_target_config.py b/tests/unit/calibration/test_target_config.py index 5b702f613..995649a66 100644 --- a/tests/unit/calibration/test_target_config.py +++ b/tests/unit/calibration/test_target_config.py @@ -158,6 +158,38 @@ def test_training_config_includes_national_non_refundable_ctc_targets(self): "domain_variable": "non_refundable_ctc", } in include_rules + def test_training_config_includes_national_ctc_agi_targets(self): + config = load_target_config( + str( + Path(__file__).resolve().parents[3] + / "policyengine_us_data" + / "calibration" + / "target_config.yaml" + ) + ) + + include_rules = config["include"] + assert { + "variable": "refundable_ctc", + "geo_level": "national", + "domain_variable": "adjusted_gross_income,refundable_ctc", + } in include_rules + assert { + "variable": "tax_unit_count", + "geo_level": "national", + "domain_variable": "adjusted_gross_income,refundable_ctc", + } in include_rules + assert { + "variable": "non_refundable_ctc", + "geo_level": "national", + "domain_variable": "adjusted_gross_income,non_refundable_ctc", + } in include_rules + assert { + "variable": "tax_unit_count", + "geo_level": "national", + "domain_variable": "adjusted_gross_income,non_refundable_ctc", + } in include_rules + def test_training_config_includes_district_non_refundable_ctc_target(self): config = load_target_config( str( diff --git a/tests/unit/calibration/test_validate_national_h5.py b/tests/unit/calibration/test_validate_national_h5.py index 771f331fb..219943774 100644 --- a/tests/unit/calibration/test_validate_national_h5.py +++ b/tests/unit/calibration/test_validate_national_h5.py @@ -1,6 +1,9 @@ import os +import pandas as pd + from policyengine_us_data.calibration.validate_national_h5 import ( + build_canonical_ctc_reform_summary, get_ctc_diagnostic_outputs, get_reference_values, resolve_dataset_path, @@ -34,12 +37,15 @@ def test_reference_values_use_irs_ctc_component_targets(monkeypatch): ) -def test_ctc_diagnostic_outputs_format_both_sections(monkeypatch): +def test_ctc_diagnostic_outputs_format_all_sections(monkeypatch): monkeypatch.setattr( "policyengine_us_data.calibration.validate_national_h5.create_ctc_diagnostic_tables", lambda sim: { "by_agi_band": "agi_table", "by_filing_status": "filing_status_table", + "by_agi_band_and_filing_status": "agi_filing_table", + "by_child_count": "child_count_table", + "by_child_age": "child_age_table", }, ) monkeypatch.setattr( @@ -52,6 +58,9 @@ def test_ctc_diagnostic_outputs_format_both_sections(monkeypatch): assert outputs == { "CTC DIAGNOSTICS BY AGI BAND": "formatted:agi_table", "CTC DIAGNOSTICS BY FILING STATUS": "formatted:filing_status_table", + "CTC DIAGNOSTICS BY AGI BAND AND FILING STATUS": "formatted:agi_filing_table", + "CTC DIAGNOSTICS BY QUALIFYING-CHILD COUNT": "formatted:child_count_table", + "CTC DIAGNOSTICS BY QUALIFYING-CHILD AGE": "formatted:child_age_table", } @@ -84,3 +93,60 @@ def fake_download(**kwargs): "token": os.environ.get("HUGGING_FACE_TOKEN"), } ] + + +class _FakeArrayResult: + def __init__(self, values): + self._values = values + + @property + def values(self): + return self._values + + def sum(self): + return self._values.sum() + + +class _FakeSummarySim: + def __init__(self, values_by_variable): + self.values_by_variable = values_by_variable + + def calculate(self, variable, period=None, map_to=None): + assert map_to is None + return _FakeArrayResult(self.values_by_variable[variable]) + + +def test_build_canonical_ctc_reform_summary_reports_level_and_delta(): + baseline = _FakeSummarySim( + { + "ctc_value": pd.Series([100.0, 50.0]).to_numpy(), + "ctc": pd.Series([90.0]).to_numpy(), + "refundable_ctc": pd.Series([40.0]).to_numpy(), + "non_refundable_ctc": pd.Series([50.0]).to_numpy(), + "eitc": pd.Series([20.0]).to_numpy(), + "household_net_income": pd.Series([500.0, 200.0]).to_numpy(), + } + ) + reformed = _FakeSummarySim( + { + "ctc_value": pd.Series([130.0, 70.0]).to_numpy(), + "ctc": pd.Series([120.0]).to_numpy(), + "refundable_ctc": pd.Series([70.0]).to_numpy(), + "non_refundable_ctc": pd.Series([50.0]).to_numpy(), + "eitc": pd.Series([35.0]).to_numpy(), + "household_net_income": pd.Series([540.0, 215.0]).to_numpy(), + } + ) + + summary = build_canonical_ctc_reform_summary( + baseline, + reformed, + period=2025, + ).set_index("variable") + + assert summary.loc["ctc_value", "baseline"] == 150.0 + assert summary.loc["ctc_value", "reformed"] == 200.0 + assert summary.loc["ctc_value", "delta"] == 50.0 + assert summary.loc["refundable_ctc", "delta"] == 30.0 + assert summary.loc["non_refundable_ctc", "delta"] == 0.0 + assert summary.loc["household_net_income", "delta"] == 55.0 diff --git a/tests/unit/test_etl_irs_soi_overlay.py b/tests/unit/test_etl_irs_soi_overlay.py index 7626b92db..230438aa1 100644 --- a/tests/unit/test_etl_irs_soi_overlay.py +++ b/tests/unit/test_etl_irs_soi_overlay.py @@ -1,4 +1,5 @@ import pandas as pd +from sqlalchemy import text from sqlmodel import Session, select from policyengine_us_data.calibration.unified_matrix_builder import ( @@ -13,11 +14,13 @@ from policyengine_us_data.db.etl_irs_soi import ( GEOGRAPHY_FILE_TARGET_SPECS, get_geography_soi_year, + get_national_geography_soi_agi_targets, get_national_geography_soi_target, _get_geography_file_aggregate_target_spec, _skip_coarse_state_agi_person_count_target, _get_or_create_national_domain_stratum, _upsert_target, + load_national_geography_ctc_agi_targets, load_national_geography_ctc_targets, load_national_workbook_soi_targets, ) @@ -246,6 +249,76 @@ def test_get_national_geography_soi_target_reads_amount_and_count(monkeypatch): assert non_refundable_target["amount"] == 81_000.0 +def test_get_national_geography_soi_agi_targets_aggregates_state_rows(monkeypatch): + fake_raw = pd.DataFrame( + [ + { + "STATE": "US", + "CONG_DISTRICT": 0, + "agi_stub": 0, + "N11070": 99.0, + "A11070": 999.0, + }, + { + "STATE": "CA", + "CONG_DISTRICT": 0, + "agi_stub": 1, + "N11070": 10.0, + "A11070": 20.0, + }, + { + "STATE": "NY", + "CONG_DISTRICT": 0, + "agi_stub": 1, + "N11070": 3.0, + "A11070": 7.0, + }, + { + "STATE": "CA", + "CONG_DISTRICT": 0, + "agi_stub": 2, + "N11070": 8.0, + "A11070": 11.0, + }, + { + "STATE": "CA", + "CONG_DISTRICT": 12, + "agi_stub": 2, + "N11070": 100.0, + "A11070": 100.0, + }, + ] + ) + + monkeypatch.setattr( + "policyengine_us_data.db.etl_irs_soi.extract_soi_data", + lambda year: fake_raw, + ) + + targets = get_national_geography_soi_agi_targets("refundable_ctc", 2024) + + assert targets == [ + { + "variable": "refundable_ctc", + "source_year": 2022, + "agi_stub": 1, + "agi_lower_bound": float("-inf"), + "agi_upper_bound": 1.0, + "count": 13.0, + "amount": 27_000.0, + }, + { + "variable": "refundable_ctc", + "source_year": 2022, + "agi_stub": 2, + "agi_lower_bound": 1.0, + "agi_upper_bound": 10_000.0, + "count": 8.0, + "amount": 11_000.0, + }, + ] + + def test_load_national_geography_ctc_targets_uses_geography_year_for_ctc_periods( monkeypatch, tmp_path ): @@ -299,3 +372,64 @@ def test_load_national_geography_ctc_targets_uses_geography_year_for_ctc_periods "tax_unit_count": expected["count"], variable: expected["amount"], } + + +def test_load_national_geography_ctc_agi_targets_creates_agi_domain_strata( + monkeypatch, tmp_path +): + db_uri, engine = _create_test_engine(tmp_path) + + monkeypatch.setattr( + "policyengine_us_data.db.etl_irs_soi._get_national_geography_soi_agi_targets_from_year", + lambda variable, geography_year: [ + { + "variable": variable, + "source_year": geography_year, + "agi_stub": 7, + "agi_lower_bound": 100_000.0, + "agi_upper_bound": 200_000.0, + "count": 12.0, + "amount": 34_000.0, + } + ], + ) + + with Session(engine) as session: + national_filer_stratum = _create_national_filer_stratum(session) + load_national_geography_ctc_agi_targets( + session, + national_filer_stratum.stratum_id, + 2022, + ) + session.commit() + + builder = UnifiedMatrixBuilder(db_uri=db_uri, time_period=2024) + rows = builder._query_targets( + { + "geo_level": "national", + "variables": ["tax_unit_count", "refundable_ctc", "non_refundable_ctc"], + "domain_variables": ["adjusted_gross_income,refundable_ctc"], + } + ) + + assert set(rows["variable"]) == {"tax_unit_count", "refundable_ctc"} + assert set(rows["period"].astype(int)) == {2022} + assert set(rows["value"].astype(float)) == {12.0, 34_000.0} + + with engine.connect() as conn: + overview_rows = conn.execute( + text( + """ + SELECT domain_variable, geographic_id + FROM target_overview + WHERE geo_level = 'national' + AND period = 2022 + AND variable IN ('tax_unit_count', 'refundable_ctc') + AND domain_variable LIKE '%refundable_ctc%' + AND domain_variable LIKE '%adjusted_gross_income%' + """ + ) + ).fetchall() + + assert overview_rows + assert all(row.geographic_id == "US" for row in overview_rows)