Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/1141.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Replace manually curated negative income calibration targets with source-backed SOI negative AGI and loss-component controls.
10 changes: 5 additions & 5 deletions paper/sections/methodology/loss_matrix.tex
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@ \subsubsection{CPS-Derived Statistics}
\item Rent: \$735B
\end{itemize}

\subsubsection{Market Income Targets}
\subsubsection{Negative AGI and Loss Component Targets}

From IRS SOI PUF estimates:
From IRS SOI Publication 1304 tables:
\begin{itemize}
\item Total negative household market income: -\$138B
\item Count of households with negative market income: 3M
\item All-return negative AGI amount and return count
\item Taxable-return AGI-bin targets for positive-valued business, capital gains, estate, partnership/S-corp, and rent/royalty loss components
\end{itemize}

\subsubsection{Healthcare Spending by Age}
Expand Down Expand Up @@ -150,4 +150,4 @@ \subsubsection{Target Validation}
\item Consistent uprating factors applied across related targets
\end{itemize}

The resulting 7,000+ targets provide comprehensive coverage of income distributions, program participation, demographic patterns, and tax expenditure utilization, ensuring the enhanced dataset accurately reflects the complexity of the US tax and benefit system. The majority of targets come from IRS Statistics of Income data (over 5,300 targets), supplemented by state-level demographic and program participation data (over 1,700 targets).
The resulting 7,000+ targets provide comprehensive coverage of income distributions, program participation, demographic patterns, and tax expenditure utilization, ensuring the enhanced dataset accurately reflects the complexity of the US tax and benefit system. The majority of targets come from IRS Statistics of Income data (over 5,300 targets), supplemented by state-level demographic and program participation data (over 1,700 targets).
22 changes: 22 additions & 0 deletions policyengine_us_data/calibration/target_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,28 @@ include:
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,taxable_interest_income
# SOI AGI-binned loss-component targets. These replace rough manually
# curated negative-income controls with source-backed component constraints.
# Include the loss components that have tax-unit-level PolicyEngine
# variables, so the DB matrix matches SOI return-level netting.
- variable: loss_limited_net_capital_gains
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits,loss_limited_net_capital_gains
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits,loss_limited_net_capital_gains
- variable: tax_unit_partnership_s_corp_income
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_partnership_s_corp_income
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_partnership_s_corp_income
- variable: tax_unit_rental_income
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_rental_income
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits,tax_unit_rental_income
- variable: tax_exempt_interest_income
geo_level: national
domain_variable: tax_exempt_interest_income
Expand Down
200 changes: 200 additions & 0 deletions policyengine_us_data/db/etl_irs_soi.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,11 +204,17 @@ def _skip_coarse_state_agi_person_count_target(geo_type: str, agi_stub: int) ->
"adjusted_gross_income": "adjusted_gross_income",
"count": "tax_unit_count",
}
# SOI "Variable" name -> target variable for the all-return negative-AGI
# targets. Built as a separate (shallow) copy of
# SOI_TAXABLE_AGI_TARGET_VARIABLES, so it starts with the same entries.
SOI_NEGATIVE_AGI_TARGET_VARIABLES = dict(SOI_TAXABLE_AGI_TARGET_VARIABLES)
# SOI "Variable" name -> target variable for taxable AGI domain targets.
# NOTE(review): the consumer of this mapping is not visible in this hunk.
SOI_TAXABLE_AGI_DOMAIN_TARGET_VARIABLES = {
"employment_income": "irs_employment_income",
"total_pension_income": "pension_income",
"total_social_security": "social_security",
}
# SOI loss-component "Variable" name -> target variable. The mapped variable
# is used both as the amount target and as the stratum's "< 0" domain
# constraint when the taxable loss targets are loaded.
SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES = {
"capital_gains_losses": "loss_limited_net_capital_gains",
"partnership_and_s_corp_losses": "tax_unit_partnership_s_corp_income",
"rent_and_royalty_net_losses": "tax_unit_rental_income",
}
SOI_FILING_STATUS_CONSTRAINTS = {
"Single": ("==", "SINGLE"),
"Head of Household": ("==", "HEAD_OF_HOUSEHOLD"),
Expand Down Expand Up @@ -694,6 +700,110 @@ def _get_or_create_national_agi_domain_stratum(
return stratum


def _get_or_create_national_agi_stratum(
    session: Session,
    national_filer_stratum_id: int,
    *,
    agi_lower_bound: float,
    agi_upper_bound: float,
) -> Stratum:
    """Return the national filer stratum for one AGI band, creating it if needed.

    An existing stratum is found by matching the parent stratum id and the
    generated ``notes`` label. When none exists, a new stratum is built with
    three constraints (filer flag, AGI lower bound inclusive, AGI upper bound
    exclusive), added to the session and flushed.

    Args:
        session: Open database session used for the lookup and the insert.
        national_filer_stratum_id: Id of the parent national filer stratum.
        agi_lower_bound: Inclusive lower AGI bound of the band.
        agi_upper_bound: Exclusive upper AGI bound of the band.

    Returns:
        The matching or newly created ``Stratum``.
    """
    label = f"National filers, AGI >= {agi_lower_bound}, AGI < {agi_upper_bound}"

    lookup = select(Stratum).where(
        Stratum.parent_stratum_id == national_filer_stratum_id,
        Stratum.notes == label,
    )
    existing = session.exec(lookup).first()
    if existing:
        return existing

    # (constraint_variable, operation, value) triples, in insertion order.
    constraint_specs = (
        ("tax_unit_is_filer", "==", "1"),
        ("adjusted_gross_income", ">=", str(agi_lower_bound)),
        ("adjusted_gross_income", "<", str(agi_upper_bound)),
    )

    created = Stratum(
        parent_stratum_id=national_filer_stratum_id,
        notes=label,
    )
    created.constraints_rel.extend(
        [
            StratumConstraint(
                constraint_variable=variable,
                operation=operation,
                value=value,
            )
            for variable, operation, value in constraint_specs
        ]
    )
    session.add(created)
    # Flush before returning; callers read ``stratum_id`` from the result
    # (presumably assigned by the database on flush).
    session.flush()
    return created


def _get_or_create_national_taxable_agi_negative_domain_stratum(
    session: Session,
    national_filer_stratum_id: int,
    *,
    domain_variable: str,
    agi_lower_bound: float,
    agi_upper_bound: float,
) -> Stratum:
    """Return the taxable-filer AGI-band stratum where a variable is negative.

    An existing stratum is found by matching the parent stratum id and the
    generated ``notes`` label. Otherwise a new stratum is created with five
    constraints: filer flag, ``income_tax_before_credits > 0``, the AGI band
    (lower bound inclusive, upper bound exclusive) and
    ``domain_variable < 0``. The new stratum is added to the session and
    flushed.

    Args:
        session: Open database session used for the lookup and the insert.
        national_filer_stratum_id: Id of the parent national filer stratum.
        domain_variable: Variable that must be negative inside the stratum.
        agi_lower_bound: Inclusive lower AGI bound of the band.
        agi_upper_bound: Exclusive upper AGI bound of the band.

    Returns:
        The matching or newly created ``Stratum``.
    """
    label = (
        "National taxable filers, AGI >= "
        f"{agi_lower_bound}, AGI < {agi_upper_bound}, {domain_variable} < 0"
    )

    lookup = select(Stratum).where(
        Stratum.parent_stratum_id == national_filer_stratum_id,
        Stratum.notes == label,
    )
    existing = session.exec(lookup).first()
    if existing:
        return existing

    # (constraint_variable, operation, value) triples, in insertion order.
    constraint_specs = (
        ("tax_unit_is_filer", "==", "1"),
        ("income_tax_before_credits", ">", "0"),
        ("adjusted_gross_income", ">=", str(agi_lower_bound)),
        ("adjusted_gross_income", "<", str(agi_upper_bound)),
        (domain_variable, "<", "0"),
    )

    created = Stratum(
        parent_stratum_id=national_filer_stratum_id,
        notes=label,
    )
    created.constraints_rel.extend(
        [
            StratumConstraint(
                constraint_variable=variable,
                operation=operation,
                value=value,
            )
            for variable, operation, value in constraint_specs
        ]
    )
    session.add(created)
    # Flush before returning; callers read ``stratum_id`` from the result
    # (presumably assigned by the database on flush).
    session.flush()
    return created


def _get_or_create_national_eitc_agi_child_stratum(
session: Session,
national_filer_stratum_id: int,
Expand Down Expand Up @@ -1122,6 +1232,86 @@ def load_national_taxable_agi_domain_filing_status_targets(
)


def load_national_negative_agi_targets(
    session: Session,
    national_filer_stratum_id: int,
    target_year: int,
) -> None:
    """Upsert the all-return negative-AGI amount and count targets.

    Selects the SOI rows whose variable is a key of
    ``SOI_NEGATIVE_AGI_TARGET_VARIABLES``, whose filing status is ``"All"``,
    whose AGI band is ``[-inf, 0)`` and that are not taxable-only. Each row
    is upserted as a target on the matching national AGI stratum.

    Args:
        session: Open database session.
        national_filer_stratum_id: Id of the parent national filer stratum.
        target_year: Year passed to ``get_soi`` and stored as the target period.
    """
    soi = get_soi(target_year)

    is_negative_agi_all_returns = (
        soi["Variable"].isin(SOI_NEGATIVE_AGI_TARGET_VARIABLES)
        & (soi["Filing status"] == "All")
        & (soi["AGI lower bound"] == -np.inf)
        & (soi["AGI upper bound"] == 0)
        & (~soi["Taxable only"])
    )
    selected = soi[is_negative_agi_all_returns].copy()

    for _, record in selected.iterrows():
        mapped_variable = SOI_NEGATIVE_AGI_TARGET_VARIABLES[record["Variable"]]
        band_stratum = _get_or_create_national_agi_stratum(
            session,
            national_filer_stratum_id,
            agi_lower_bound=float(record["AGI lower bound"]),
            agi_upper_bound=float(record["AGI upper bound"]),
        )
        # Record where the figure came from: SOI table, source year, sheet row.
        provenance = (
            f"Publication 1304 {record['SOI table']} all-return negative-AGI "
            f"target (source year {int(record['Year'])}, "
            f"row {int(record['XLSX row'])})"
        )
        _upsert_target(
            session,
            stratum_id=band_stratum.stratum_id,
            variable=mapped_variable,
            period=int(target_year),
            value=float(record["Value"]),
            source="IRS SOI",
            notes=provenance,
        )


def load_national_taxable_loss_agi_targets(
    session: Session,
    national_filer_stratum_id: int,
    target_year: int,
) -> None:
    """Upsert taxable-return loss-component targets for each AGI band.

    Selects SOI rows whose variable is a key of
    ``SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES``, with filing status ``"All"``,
    taxable-only, not full-population, and a strictly positive value. Each
    row is attached to the stratum that constrains the mapped variable to be
    negative within the row's AGI band.

    Count rows are stored under ``tax_unit_count`` with their value unchanged.
    Amount rows are stored under the mapped variable with the sign flipped
    (presumably because SOI publishes losses as positive magnitudes while the
    mapped variable is negative for a loss).

    Args:
        session: Open database session.
        national_filer_stratum_id: Id of the parent national filer stratum.
        target_year: Year passed to ``get_soi`` and stored as the target period.
    """
    soi = get_soi(target_year)

    is_taxable_loss_band_row = (
        soi["Variable"].isin(SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES)
        & (soi["Filing status"] == "All")
        & (soi["Taxable only"])
        & (~soi["Full population"])
        & (soi["Value"] > 0)
    )
    selected = soi[is_taxable_loss_band_row].copy()

    for _, record in selected.iterrows():
        source_variable = record["Variable"]
        mapped_variable = SOI_TAXABLE_LOSS_AGI_TARGET_VARIABLES[source_variable]
        loss_stratum = _get_or_create_national_taxable_agi_negative_domain_stratum(
            session,
            national_filer_stratum_id,
            domain_variable=mapped_variable,
            agi_lower_bound=float(record["AGI lower bound"]),
            agi_upper_bound=float(record["AGI upper bound"]),
        )
        # Record where the figure came from: SOI table, source year, sheet row.
        provenance = (
            f"Publication 1304 {record['SOI table']} taxable AGI-band "
            f"{source_variable} target "
            f"(source year {int(record['Year'])}, row {int(record['XLSX row'])})"
        )

        magnitude = float(record["Value"])
        if bool(record["Count"]):
            # Return counts keep their sign.
            stored_variable, stored_value = "tax_unit_count", magnitude
        else:
            # Amounts are negated before being stored.
            stored_variable, stored_value = mapped_variable, -magnitude

        _upsert_target(
            session,
            stratum_id=loss_stratum.stratum_id,
            variable=stored_variable,
            period=int(target_year),
            value=stored_value,
            source="IRS SOI",
            notes=provenance,
        )


def load_national_workbook_soi_targets(
session: Session, national_filer_stratum_id: int, target_year: int
) -> None:
Expand Down Expand Up @@ -1721,6 +1911,16 @@ def load_soi_data(
filer_strata["national"],
target_year or national_year,
)
load_national_negative_agi_targets(
session,
filer_strata["national"],
target_year or national_year,
)
load_national_taxable_loss_agi_targets(
session,
filer_strata["national"],
target_year or national_year,
)
load_national_fine_agi_targets(session, filer_strata["national"], national_year)
load_national_ltcg_agi_targets(session, filer_strata["national"], national_year)

Expand Down
53 changes: 39 additions & 14 deletions policyengine_us_data/utils/loss.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,19 @@ def _cbo_program_target_value(sim, variable_name: str, time_period):
"taxable_interest_income",
}

# SOI variables that qualify a row as an "all-return negative-AGI" row in
# _is_negative_agi_all_returns_row (together with the filing-status, AGI-band
# and taxable-only checks made there).
SOI_NEGATIVE_AGI_TARGETED_VARIABLES = (
"adjusted_gross_income",
"count",
)

# SOI loss-component variables targeted per AGI band. Rows carrying one of
# these variables are never dropped by _should_skip_soi_agi_row.
AGI_LEVEL_LOSS_TARGETED_VARIABLES = (
"business_net_losses",
"capital_gains_losses",
"estate_losses",
"partnership_and_s_corp_losses",
"rent_and_royalty_net_losses",
)

AGI_LEVEL_TARGETED_VARIABLES = (
"adjusted_gross_income",
"count",
Expand Down Expand Up @@ -1217,15 +1230,31 @@ def get_target_loss_weights(target_names):
return weights


def _is_negative_agi_all_returns_row(row) -> bool:
    """Tell whether an SOI row is an all-return negative-AGI target row.

    The row qualifies when its variable is in
    ``SOI_NEGATIVE_AGI_TARGETED_VARIABLES``, its filing status is ``"All"``,
    its AGI band is ``[-inf, 0)`` and it is not restricted to taxable returns.

    Args:
        row: Mapping-like SOI record indexed by column name.

    Returns:
        A truthy value when every condition holds, otherwise a falsy one.
    """
    if row["Variable"] not in SOI_NEGATIVE_AGI_TARGETED_VARIABLES:
        return False
    if row["Filing status"] != "All":
        return False
    spans_all_negative_agi = (
        row["AGI lower bound"] == -np.inf and row["AGI upper bound"] == 0
    )
    return spans_all_negative_agi and not row["Taxable only"]


def _should_skip_soi_agi_row(row) -> bool:
    """Decide whether a low-AGI SOI row is dropped from the targets.

    A row is always kept when it is an all-return negative-AGI row, when its
    variable is in ``AGI_LEVEL_LOSS_TARGETED_VARIABLES``, or when its AGI
    upper bound exceeds 10,000. Any other row is skipped unless its variable
    is in ``LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES``.

    Args:
        row: Mapping-like SOI record indexed by column name.

    Returns:
        ``True`` when the row should be skipped.
    """
    variable = row["Variable"]
    always_kept = (
        _is_negative_agi_all_returns_row(row)
        or variable in AGI_LEVEL_LOSS_TARGETED_VARIABLES
        or row["AGI upper bound"] > 10_000
    )
    if always_kept:
        return False
    return variable not in LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES


def _should_skip_soi_taxability_row(row) -> bool:
    """Decide whether an SOI row is dropped because of its taxable-only flag.

    All-return negative-AGI rows are never skipped. For variables in
    ``LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES`` the taxable-only rows are
    skipped (the all-return rows are used). For every other variable the
    all-return rows are skipped (the taxable-only rows are used).

    Args:
        row: Mapping-like SOI record indexed by column name.

    Returns:
        A truthy value when the row should be skipped.
    """
    if _is_negative_agi_all_returns_row(row):
        return False
    uses_all_return_rows = row["Variable"] in LOW_AGI_INVESTMENT_INCOME_SOI_VARIABLES
    taxable_only = row["Taxable only"]
    return taxable_only if uses_all_return_rows else not taxable_only
Expand All @@ -1244,8 +1273,14 @@ def build_loss_matrix(dataset: type, time_period):
for variable in AGGREGATE_LEVEL_TARGETED_VARIABLES
if variable in df.columns
]
agi_level_loss_targeted_variables = [
variable
for variable in AGI_LEVEL_LOSS_TARGETED_VARIABLES
if variable in df.columns
]
soi_subset = soi_subset[
soi_subset.Variable.isin(AGI_LEVEL_TARGETED_VARIABLES)
| soi_subset.Variable.isin(agi_level_loss_targeted_variables)
| (
soi_subset.Variable.isin(aggregate_level_targeted_variables)
& (soi_subset["AGI lower bound"] == -np.inf)
Expand All @@ -1259,6 +1294,9 @@ def build_loss_matrix(dataset: type, time_period):
if _should_skip_soi_agi_row(row):
continue

if row["Variable"] in AGI_LEVEL_LOSS_TARGETED_VARIABLES and row["Value"] <= 0:
continue

mask = (
(agi >= row["AGI lower bound"]) * (agi < row["AGI upper bound"]) * filer
) > 0
Expand Down Expand Up @@ -1587,19 +1625,6 @@ def build_loss_matrix(dataset: type, time_period):
time_period,
)

# Negative household market income total rough estimate from the IRS SOI PUF

market_income = sim.calculate("household_market_income").values
loss_matrix["nation/irs/negative_household_market_income_total"] = market_income * (
market_income < 0
)
targets_array.append(-138e9)

loss_matrix["nation/irs/negative_household_market_income_count"] = (
market_income < 0
).astype(float)
targets_array.append(3e6)

# Healthcare spending by age.
# Each row targets a decade of ages (lower_bound to lower_bound + 9).
# The top row is treated as unbounded (age >= lower_bound) so the
Expand Down
6 changes: 1 addition & 5 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,7 @@ classifiers = [
"Programming Language :: Python :: 3.14",
]
dependencies = [
# Temporary GitHub pin: policyengine-us 1.706.14 is blocked from PyPI by
# the project-size limit, but us-data needs the merged desired retirement
# contribution variables, FLSA overtime constants, and data-backed
# Medicaid cost input before the next PyPI release is available.
"policyengine-us @ git+https://github.com/PolicyEngine/policyengine-us.git@1da04a64dcdce26834b063d68daa835765a5d8ed",
"policyengine-us==1.709.1",
# policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
# PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
# after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.
Expand Down
Loading