From 822b5e3465bc6fb5b3c105516ad23a6244dd903f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 09:15:00 -0400 Subject: [PATCH 1/5] Add collapsed capital gains basis imputation --- .../calibration/puf_impute.py | 105 +++- policyengine_us_data/datasets/puf/puf.py | 64 +++ .../utils/capital_gains_basis.py | 499 ++++++++++++++++++ .../test_calibration_puf_impute.py | 43 ++ tests/unit/datasets/test_irs_puf.py | 25 + tests/unit/test_capital_gains_basis.py | 99 ++++ 6 files changed, 825 insertions(+), 10 deletions(-) create mode 100644 policyengine_us_data/utils/capital_gains_basis.py create mode 100644 tests/unit/test_capital_gains_basis.py diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index f8dd58fa9..e0d7c5895 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -27,6 +27,15 @@ ) from policyengine_us_data.pipeline_metadata import pipeline_node from policyengine_us_data.pipeline_schema import PipelineNode +from policyengine_us_data.utils.capital_gains_basis import ( + CAPITAL_GAINS_BASIS_VARIABLES, + LONG_TERM_CAPITAL_GAINS_BASIS, + LONG_TERM_CAPITAL_GAINS_YEARS_HELD, + impute_person_level_long_term_capital_gains_basis, +) +from policyengine_us_data.utils.policyengine import ( + has_policyengine_us_variables, +) logger = logging.getLogger(__name__) @@ -59,6 +68,8 @@ "interest_deduction", "tax_exempt_pension_income", "long_term_capital_gains", + "long_term_capital_gains_basis", + "long_term_capital_gains_years_held", "unreimbursed_business_employee_expenses", "pre_tax_contributions", "taxable_ira_distributions", @@ -111,6 +122,8 @@ "self_employment_income_would_be_qualified", ] +DETERMINISTIC_IMPUTED_VARIABLES = list(CAPITAL_GAINS_BASIS_VARIABLES) + SS_SUBCOMPONENTS = [ "social_security_retirement", "social_security_disability", @@ -190,6 +203,68 @@ RETIREMENT_PREDICTORS = RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS +def _person_weights_from_household_weights( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, +) -> Optional[np.ndarray]: + household_weight = data.get("household_weight", {}).get(time_period) + if household_weight is None: + return None + + person_id = data.get("person_id", {}).get(time_period) + if person_id is not None and len(household_weight) == len(person_id): + return np.asarray(household_weight, dtype=float) + + person_household_id = data.get("person_household_id", {}).get(time_period) + household_id = data.get("household_id", {}).get(time_period) + if person_household_id is None or household_id is None: + return None + if len(household_weight) != len(household_id): + return None + + household_weight_by_id = dict(zip(household_id.tolist(), household_weight)) + return np.asarray( + [ + household_weight_by_id.get(household_id, 1.0) + for household_id in person_household_id + ], + dtype=float, + ) + + +def _impute_long_term_capital_gains_basis( + data: Dict[str, Dict[int, np.ndarray]], + time_period: int, +) -> None: + """Add deterministic basis and holding period fields to cloned data.""" + + if not has_policyengine_us_variables(*CAPITAL_GAINS_BASIS_VARIABLES): + return + if ( + "long_term_capital_gains" not in data + or "person_tax_unit_id" not in data + or "person_id" not in data + ): + return + + imputation = impute_person_level_long_term_capital_gains_basis( + data["long_term_capital_gains"][time_period], + person_tax_unit_ids=data["person_tax_unit_id"][time_period], + person_ids=data["person_id"][time_period], + person_sample_weight=_person_weights_from_household_weights( + data, + time_period, + ), + tax_year=time_period, + ) + data[LONG_TERM_CAPITAL_GAINS_BASIS] = { + time_period: imputation.basis.astype(np.float32) + } + data[LONG_TERM_CAPITAL_GAINS_YEARS_HELD] = { + time_period: imputation.years_held.astype(np.float32) + } + + def _get_retirement_limits(year: int) -> dict: """Return contribution limits for the given tax year. @@ -517,8 +592,8 @@ def puf_clone_dataset( person_count, ) - y_full = None - y_override = None + y_full = {} + y_override = {} if not skip_qrf and puf_dataset is not None: y_full, y_override = _run_qrf_imputation( data, @@ -548,12 +623,12 @@ def _map_to_entity(pred_values, variable_name): # Impute weeks_unemployed for PUF half puf_weeks = None - if y_full is not None and dataset_path is not None: + if y_full and dataset_path is not None: puf_weeks = _impute_weeks_unemployed(data, y_full, time_period, dataset_path) # Impute retirement contributions for PUF half puf_retirement = None - if y_full is not None and dataset_path is not None: + if y_full and dataset_path is not None: puf_retirement = _impute_retirement_contributions( data, y_full, time_period, dataset_path ) @@ -566,10 +641,10 @@ def _map_to_entity(pred_values, variable_name): values = time_dict[time_period] - if variable in OVERRIDDEN_IMPUTED_VARIABLES and y_override: + if variable in y_override: pred = _map_to_entity(y_override[variable], variable) new_data[variable] = {time_period: np.concatenate([pred, pred])} - elif variable in IMPUTED_VARIABLES and y_full: + elif variable in y_full: pred = _map_to_entity(y_full[variable], variable) new_data[variable] = {time_period: np.concatenate([values, pred])} elif "_id" in variable and np.issubdtype(values.dtype, np.number): @@ -624,7 +699,9 @@ def _map_to_entity(pred_values, variable_name): } if y_full: - for var in IMPUTED_VARIABLES: + for var in y_full: + if var in PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES: + continue if var not in data: pred = _map_to_entity(y_full[var], var) new_data[var] = {time_period: np.concatenate([pred, pred])} @@ -632,6 +709,8 @@ def _map_to_entity(pred_values, variable_name): if cps_sim is not None: del cps_sim + _impute_long_term_capital_gains_basis(new_data, time_period) + # Ensure SS sub-components match the (possibly imputed) total. reconcile_ss_subcomponents(new_data, person_count, time_period) @@ -942,8 +1021,14 @@ def _run_qrf_imputation( puf_agi = puf_sim.calculate("adjusted_gross_income", map_to="person").values + qrf_imputed_variables = [ + variable + for variable in IMPUTED_VARIABLES + if variable not in DETERMINISTIC_IMPUTED_VARIABLES + ] + X_train_full = puf_sim.calculate_dataframe( - DEMOGRAPHIC_PREDICTORS + IMPUTED_VARIABLES + DEMOGRAPHIC_PREDICTORS + qrf_imputed_variables ) X_train_override = puf_sim.calculate_dataframe( @@ -972,9 +1057,9 @@ def _run_qrf_imputation( if pred in data: X_test[pred] = data[pred][time_period].astype(np.float32) - logger.info("Imputing %d PUF variables (full)", len(IMPUTED_VARIABLES)) + logger.info("Imputing %d PUF variables (full)", len(qrf_imputed_variables)) y_full = _sequential_qrf( - X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES + X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, qrf_imputed_variables ) logger.info( diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index ff390420e..60a344442 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -21,6 +21,12 @@ STRUCTURAL_MORTGAGE_VARIABLES, convert_mortgage_interest_to_structural_inputs, ) +from policyengine_us_data.utils.capital_gains_basis import ( + CAPITAL_GAINS_BASIS_VARIABLES, + LONG_TERM_CAPITAL_GAINS_YEARS_HELD, + add_long_term_capital_gains_basis_to_puf_frame, + impute_person_level_long_term_capital_gains_basis, +) from policyengine_us_data.utils.policyengine import ( has_policyengine_us_variables, ) @@ -614,9 +620,59 @@ def _with_lifetime_learning_credit_inputs( def _person_financial_value_from_puf_row(variable: str, row, share: float): if variable in PUF_LLC_ELIGIBILITY_INPUTS: return bool(row[variable]) and row["qualified_tuition_expenses"] * share > 0 + if variable == LONG_TERM_CAPITAL_GAINS_YEARS_HELD: + return row[variable] if row["long_term_capital_gains"] * share != 0 else 0 return row[variable] * share +def _person_weights_from_household_weights(arrays: dict[str, np.ndarray]): + household_weight = arrays.get("household_weight") + if household_weight is None: + return None + person_ids = arrays.get("person_id") + if person_ids is not None and len(household_weight) == len(person_ids): + return np.asarray(household_weight, dtype=float) + person_household_id = arrays.get("person_household_id") + household_id = arrays.get("household_id") + if person_household_id is None or household_id is None: + return None + if len(household_weight) != len(household_id): + return None + household_weight_by_id = dict(zip(household_id.tolist(), household_weight)) + return np.asarray( + [ + household_weight_by_id.get(household_id, 1.0) + for household_id in person_household_id + ], + dtype=float, + ) + + +def _with_capital_gains_basis_inputs( + arrays: dict[str, np.ndarray], + time_period: int, +) -> dict[str, np.ndarray]: + """Populate capital-gains basis inputs when PE-US supports them.""" + + if not has_policyengine_us_variables(*CAPITAL_GAINS_BASIS_VARIABLES): + return arrays + if "long_term_capital_gains" not in arrays or "person_tax_unit_id" not in arrays: + return arrays + if all(variable in arrays for variable in CAPITAL_GAINS_BASIS_VARIABLES): + return arrays + + imputation = impute_person_level_long_term_capital_gains_basis( + arrays["long_term_capital_gains"], + person_tax_unit_ids=arrays["person_tax_unit_id"], + person_ids=arrays.get("person_id"), + person_sample_weight=_person_weights_from_household_weights(arrays), + tax_year=time_period, + ) + arrays.setdefault("long_term_capital_gains_basis", imputation.basis) + arrays.setdefault("long_term_capital_gains_years_held", imputation.years_held) + return arrays + + @pipeline_node( PipelineNode( id="preprocess_puf", @@ -650,6 +706,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: puf["health_savings_account_ald"] = puf.E03290 puf["interest_deduction"] = puf.E19200 puf["long_term_capital_gains"] = puf.P23250 + puf = add_long_term_capital_gains_basis_to_puf_frame(puf) puf["long_term_capital_gains_on_collectibles"] = puf.E24518 # Split medical expenses using CPS fractions for ( @@ -814,6 +871,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame: "health_savings_account_ald", "interest_deduction", "long_term_capital_gains", + "long_term_capital_gains_basis", + "long_term_capital_gains_years_held", "long_term_capital_gains_on_collectibles", "unreimbursed_business_employee_expenses", "non_qualified_dividend_income", @@ -1372,6 +1431,7 @@ def generate(self): growth = current_index / start_index arrays[variable] = arrays[variable] * growth arrays = _with_lifetime_learning_credit_inputs(arrays) + arrays = _with_capital_gains_basis_inputs(arrays, self.time_period) self._save_current_qbi_dataset(arrays) return @@ -1478,6 +1538,10 @@ def generate(self): variable: values[self.time_period] for variable, values in holder_tp.items() } self.holder = _with_lifetime_learning_credit_inputs(self.holder) + self.holder = _with_capital_gains_basis_inputs( + self.holder, + self.time_period, + ) self._save_current_qbi_dataset(self.holder) def add_tax_unit(self, row, tax_unit_id): diff --git a/policyengine_us_data/utils/capital_gains_basis.py b/policyengine_us_data/utils/capital_gains_basis.py new file mode 100644 index 000000000..dabbf32b7 --- /dev/null +++ b/policyengine_us_data/utils/capital_gains_basis.py @@ -0,0 +1,499 @@ +"""Collapsed SOCA-style basis and holding-period imputation. + +This module creates one representative long-term capital-gains holding +period and cost basis per tax unit, then stores the result on people. +""" + +from __future__ import annotations + +from dataclasses import dataclass +import hashlib + +import numpy as np +import pandas as pd + + +LONG_TERM_CAPITAL_GAINS_BASIS = "long_term_capital_gains_basis" +LONG_TERM_CAPITAL_GAINS_YEARS_HELD = "long_term_capital_gains_years_held" +CAPITAL_GAINS_BASIS_VARIABLES = ( + LONG_TERM_CAPITAL_GAINS_BASIS, + LONG_TERM_CAPITAL_GAINS_YEARS_HELD, +) + + +@dataclass(frozen=True) +class CapitalGainsBasisResource: + bucket_names: tuple[str, ...] + bucket_lower_years: tuple[float, ...] + bucket_upper_years: tuple[float, ...] + bucket_midpoint_years: tuple[float, ...] + gain_dollar_shares: tuple[float, ...] + loss_dollar_shares: tuple[float, ...] + gain_basis_sales_ratios: tuple[float, ...] + loss_basis_sales_ratios: tuple[float, ...] + weibull_shape: float = 0.7711 + weibull_scale: float = 9.1458 + gain_bsr_floor: float = 0.001 + gain_bsr_ceiling: float = 0.999 + loss_bsr_floor: float = 1.001 + loss_bsr_ceiling: float = 100.0 + + +DEFAULT_SOCA_RESOURCE = CapitalGainsBasisResource( + bucket_names=( + "Under 18 months", + "18 months under 2 years", + "2 years under 3 years", + "3 years under 4 years", + "4 years under 5 years", + "5 years under 10 years", + "10 years under 15 years", + "15 years under 20 years", + "20 years or more", + ), + bucket_lower_years=(1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0), + bucket_upper_years=(1.5, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0, np.inf), + bucket_midpoint_years=(1.25, 1.75, 2.5, 3.5, 4.5, 7.5, 12.5, 17.5, 27.5), + # IRS SOI Sales of Capital Assets, 2013-2015, as compacted by holding + # period. Gain and loss shares are dollar-weighted within sign. + gain_dollar_shares=( + 0.09265227810410295, + 0.07010752237381986, + 0.10520812743077781, + 0.08298825059272091, + 0.0743411463263887, + 0.20575715295741653, + 0.11737305049024667, + 0.0755711716188086, + 0.17600130010571793, + ), + loss_dollar_shares=( + 0.1482795960468827, + 0.10517359565806916, + 0.13462839298735996, + 0.09369439202769471, + 0.07359794203326578, + 0.29944369490176603, + 0.09122874216922709, + 0.03107240651916826, + 0.0228812376565664, + ), + gain_basis_sales_ratios=( + 0.8478328330650043, + 0.8160574582327029, + 0.8021013607528408, + 0.8060933128473603, + 0.7693845730205952, + 0.770253613744043, + 0.6358599451460517, + 0.5146618879371708, + 0.41336120762839623, + ), + loss_basis_sales_ratios=( + 1.1483448499553495, + 1.1815679561321597, + 1.2064462486658172, + 1.261228512659838, + 1.329629488793004, + 1.395990775722507, + 1.4840458650441617, + 1.636674383986131, + 1.63034354751029, + ), +) + + +@dataclass(frozen=True) +class CapitalGainsBasisImputation: + basis: np.ndarray + years_held: np.ndarray + holding_period_bucket: np.ndarray + + +def impute_tax_unit_long_term_capital_gains_basis( + gains: np.ndarray, + *, + tax_unit_ids: np.ndarray, + sample_weight: np.ndarray | None = None, + tax_year: int = 2017, + resource: CapitalGainsBasisResource = DEFAULT_SOCA_RESOURCE, + imputation_version: str = "soca_collapsed_v1", +) -> CapitalGainsBasisImputation: + """Impute collapsed basis and holding period for tax-unit gains. + + Args: + gains: Net long-term capital gains by tax unit. + tax_unit_ids: Stable tax-unit identifiers. + sample_weight: Optional tax-unit weights for dollar-share quotas. + tax_year: Sale year used only in deterministic keys for now. + resource: Holding-period and basis-to-sales resource. + imputation_version: Stable key salt. + + Returns: + Basis, years held, and zero-based holding-period bucket arrays. + """ + + gains = np.asarray(gains, dtype=float) + tax_unit_ids = np.asarray(tax_unit_ids) + if gains.shape[0] != tax_unit_ids.shape[0]: + raise ValueError("gains and tax_unit_ids must have the same length") + + if sample_weight is None: + weights = np.ones_like(gains, dtype=float) + else: + weights = np.asarray(sample_weight, dtype=float) + if weights.shape[0] != gains.shape[0]: + raise ValueError("sample_weight must match gains length") + if not np.any(weights > 0): + weights = np.ones_like(gains, dtype=float) + + buckets = _assign_holding_period_buckets( + gains, + tax_unit_ids=tax_unit_ids, + sample_weight=weights, + tax_year=tax_year, + resource=resource, + imputation_version=imputation_version, + ) + years_held = _draw_years_held( + buckets, + tax_unit_ids=tax_unit_ids, + gains=gains, + tax_year=tax_year, + resource=resource, + imputation_version=imputation_version, + ) + basis = _basis_from_gains_and_years(gains, years_held, resource) + + zero_gain = gains == 0 + buckets = np.where(zero_gain, -1, buckets) + years_held = np.where(zero_gain, 0.0, years_held) + basis = np.where(zero_gain, 0.0, basis) + return CapitalGainsBasisImputation( + basis=basis, + years_held=years_held, + holding_period_bucket=buckets, + ) + + +def impute_person_level_long_term_capital_gains_basis( + person_gains: np.ndarray, + *, + person_tax_unit_ids: np.ndarray, + person_ids: np.ndarray | None = None, + person_sample_weight: np.ndarray | None = None, + tax_year: int = 2017, + resource: CapitalGainsBasisResource = DEFAULT_SOCA_RESOURCE, + imputation_version: str = "soca_collapsed_v1", +) -> CapitalGainsBasisImputation: + """Impute tax-unit-collapsed basis and allocate it to people. + + The representative holding period is shared by every person with + nonzero long-term gains in the tax unit. Basis is allocated by each + person's absolute long-term gain so aggregation reproduces the + collapsed tax-unit basis exactly. + """ + + person_gains = np.asarray(person_gains, dtype=float) + person_tax_unit_ids = np.asarray(person_tax_unit_ids) + if person_gains.shape[0] != person_tax_unit_ids.shape[0]: + raise ValueError("person_gains and person_tax_unit_ids must match") + + if person_ids is None: + person_ids = np.arange(person_gains.shape[0]) + else: + person_ids = np.asarray(person_ids) + if person_ids.shape[0] != person_gains.shape[0]: + raise ValueError("person_ids must match person_gains length") + + frame = pd.DataFrame( + { + "person_gain": person_gains, + "tax_unit_id": person_tax_unit_ids, + } + ) + if person_sample_weight is not None: + sample_weight = np.asarray(person_sample_weight, dtype=float) + if sample_weight.shape[0] != person_gains.shape[0]: + raise ValueError("person_sample_weight must match person_gains length") + frame["sample_weight"] = sample_weight + else: + frame["sample_weight"] = 1.0 + + grouped = frame.groupby("tax_unit_id", sort=False).agg( + gain=("person_gain", "sum"), + sample_weight=("sample_weight", "max"), + ) + tax_unit_imputation = impute_tax_unit_long_term_capital_gains_basis( + grouped["gain"].to_numpy(), + tax_unit_ids=grouped.index.to_numpy(), + sample_weight=grouped["sample_weight"].to_numpy(), + tax_year=tax_year, + resource=resource, + imputation_version=imputation_version, + ) + + basis_by_tax_unit = pd.Series(tax_unit_imputation.basis, index=grouped.index) + years_by_tax_unit = pd.Series(tax_unit_imputation.years_held, index=grouped.index) + bucket_by_tax_unit = pd.Series( + tax_unit_imputation.holding_period_bucket, + index=grouped.index, + ) + abs_gain_sum = ( + frame.assign(abs_gain=np.abs(person_gains)) + .groupby("tax_unit_id", sort=False)["abs_gain"] + .transform("sum") + ) + tax_unit_basis = frame["tax_unit_id"].map(basis_by_tax_unit).to_numpy() + tax_unit_years = frame["tax_unit_id"].map(years_by_tax_unit).to_numpy() + tax_unit_buckets = frame["tax_unit_id"].map(bucket_by_tax_unit).to_numpy() + + abs_person_gain = np.abs(person_gains) + basis = np.divide( + tax_unit_basis * abs_person_gain, + abs_gain_sum.to_numpy(), + out=np.zeros_like(person_gains, dtype=float), + where=abs_gain_sum.to_numpy() > 0, + ) + years_held = np.where(abs_person_gain > 0, tax_unit_years, 0.0) + buckets = np.where(abs_person_gain > 0, tax_unit_buckets, -1) + return CapitalGainsBasisImputation( + basis=basis, + years_held=years_held, + holding_period_bucket=buckets, + ) + + +def add_long_term_capital_gains_basis_to_puf_frame( + puf: pd.DataFrame, + *, + tax_year: int = 2017, + resource: CapitalGainsBasisResource = DEFAULT_SOCA_RESOURCE, +) -> pd.DataFrame: + """Add collapsed basis and holding period columns to a PUF frame.""" + + if "long_term_capital_gains" not in puf: + return puf + record_ids = puf["RECID"].to_numpy() if "RECID" in puf else puf.index.to_numpy() + weights = puf["S006"].to_numpy() if "S006" in puf else None + imputation = impute_tax_unit_long_term_capital_gains_basis( + puf["long_term_capital_gains"].to_numpy(), + tax_unit_ids=record_ids, + sample_weight=weights, + tax_year=tax_year, + resource=resource, + ) + puf[LONG_TERM_CAPITAL_GAINS_BASIS] = imputation.basis + puf[LONG_TERM_CAPITAL_GAINS_YEARS_HELD] = imputation.years_held + return puf + + +def _assign_holding_period_buckets( + gains: np.ndarray, + *, + tax_unit_ids: np.ndarray, + sample_weight: np.ndarray, + tax_year: int, + resource: CapitalGainsBasisResource, + imputation_version: str, +) -> np.ndarray: + buckets = np.full(gains.shape[0], -1, dtype=int) + for sign, probabilities, label in ( + (1, resource.gain_dollar_shares, "gain"), + (-1, resource.loss_dollar_shares, "loss"), + ): + mask = gains * sign > 0 + if not np.any(mask): + continue + + probabilities_array = _normalise_probabilities(probabilities) + masked_indices = np.flatnonzero(mask) + dollar_weights = np.abs(gains[mask]) * sample_weight[mask] + keys = _stable_uniforms( + tax_unit_ids[mask], + salt=f"{imputation_version}|{tax_year}|bucket|{label}", + ) + if dollar_weights.sum() <= 0: + assigned = np.searchsorted( + np.cumsum(probabilities_array), + keys, + side="right", + ) + buckets[masked_indices] = np.minimum( + assigned, + len(probabilities_array) - 1, + ) + continue + + order = np.argsort(keys, kind="mergesort") + sorted_indices = masked_indices[order] + sorted_weights = dollar_weights[order] + weighted_midpoints = ( + np.cumsum(sorted_weights) - 0.5 * sorted_weights + ) / sorted_weights.sum() + assigned = np.searchsorted( + np.cumsum(probabilities_array), + weighted_midpoints, + side="right", + ) + buckets[sorted_indices] = np.minimum(assigned, len(probabilities_array) - 1) + return buckets + + +def _draw_years_held( + buckets: np.ndarray, + *, + tax_unit_ids: np.ndarray, + gains: np.ndarray, + tax_year: int, + resource: CapitalGainsBasisResource, + imputation_version: str, +) -> np.ndarray: + years = np.zeros_like(gains, dtype=float) + for bucket in range(len(resource.bucket_names)): + mask = buckets == bucket + if not np.any(mask): + continue + signs = np.where(gains[mask] > 0, "gain", "loss") + salts = [ + f"{imputation_version}|{tax_year}|years|{sign}|{bucket}" for sign in signs + ] + uniforms = np.array( + [ + _stable_uniform(record_id, salt=salt) + for record_id, salt in zip(tax_unit_ids[mask], salts) + ], + dtype=float, + ) + years[mask] = _draw_years_in_bucket(bucket, uniforms, resource) + return years + + +def _draw_years_in_bucket( + bucket: int, + uniforms: np.ndarray, + resource: CapitalGainsBasisResource, +) -> np.ndarray: + lo = resource.bucket_lower_years[bucket] + hi = resource.bucket_upper_years[bucket] + if lo >= 20: + return 20 + (-np.log1p(-uniforms) / _top_bucket_exponential_rate(resource)) + + x_lo = lo - 1 + x_hi = hi - 1 + f_lo = _weibull_cdf(x_lo, resource.weibull_shape, resource.weibull_scale) + f_hi = _weibull_cdf(x_hi, resource.weibull_shape, resource.weibull_scale) + u = f_lo + uniforms * (f_hi - f_lo) + return 1 + _weibull_quantile(u, resource.weibull_shape, resource.weibull_scale) + + +def _basis_from_gains_and_years( + gains: np.ndarray, + years_held: np.ndarray, + resource: CapitalGainsBasisResource, +) -> np.ndarray: + basis = np.zeros_like(gains, dtype=float) + positive = gains > 0 + negative = gains < 0 + + gain_bsr = _gain_basis_sales_ratio(years_held[positive], resource) + basis[positive] = np.abs(gains[positive]) * gain_bsr / (1 - gain_bsr) + + loss_bsr = _loss_basis_sales_ratio(years_held[negative], resource) + basis[negative] = np.abs(gains[negative]) * loss_bsr / (loss_bsr - 1) + return basis + + +def _gain_basis_sales_ratio( + years_held: np.ndarray, + resource: CapitalGainsBasisResource, +) -> np.ndarray: + h_top = _top_bucket_mean(resource) + h_knots = np.asarray(resource.bucket_midpoint_years, dtype=float) + h_knots[-1] = h_top + ratio_knots = np.asarray(resource.gain_basis_sales_ratios, dtype=float) + interpolated = np.interp( + np.minimum(years_held, h_top), + h_knots, + ratio_knots, + left=ratio_knots[0], + right=ratio_knots[-1], + ) + g_extrap = (1 / ratio_knots[-1]) ** (1 / h_top) - 1 + extrapolated = 1 / (1 + g_extrap) ** years_held + ratio = np.where(years_held <= h_top, interpolated, extrapolated) + return np.clip(ratio, resource.gain_bsr_floor, resource.gain_bsr_ceiling) + + +def _loss_basis_sales_ratio( + years_held: np.ndarray, + resource: CapitalGainsBasisResource, +) -> np.ndarray: + h_knots = np.asarray(resource.bucket_midpoint_years, dtype=float) + h_knots[-1] = _top_bucket_mean(resource) + ratio_knots = np.asarray(resource.loss_basis_sales_ratios, dtype=float) + ratio = np.interp( + years_held, + h_knots, + ratio_knots, + left=ratio_knots[0], + right=ratio_knots[-1], + ) + return np.clip(ratio, resource.loss_bsr_floor, resource.loss_bsr_ceiling) + + +def _top_bucket_mean(resource: CapitalGainsBasisResource) -> float: + return 20 + 1 / _top_bucket_exponential_rate(resource) + + +def _top_bucket_exponential_rate(resource: CapitalGainsBasisResource) -> float: + density_at_boundary = _weibull_pdf( + 19, + resource.weibull_shape, + resource.weibull_scale, + ) + bucket_8_mass = _weibull_cdf( + 19, + resource.weibull_shape, + resource.weibull_scale, + ) - _weibull_cdf(14, resource.weibull_shape, resource.weibull_scale) + gain_shares = np.asarray(resource.gain_dollar_shares, dtype=float) + return density_at_boundary / bucket_8_mass * gain_shares[7] / gain_shares[8] + + +def _weibull_cdf(x: float, shape: float, scale: float) -> float: + return 1 - np.exp(-((x / scale) ** shape)) + + +def _weibull_pdf(x: float, shape: float, scale: float) -> float: + return ( + (shape / scale) * ((x / scale) ** (shape - 1)) * np.exp(-((x / scale) ** shape)) + ) + + +def _weibull_quantile(u: np.ndarray, shape: float, scale: float) -> np.ndarray: + u = np.clip(u, np.finfo(float).tiny, np.nextafter(1.0, 0.0)) + return scale * (-np.log1p(-u)) ** (1 / shape) + + +def _normalise_probabilities(probabilities: tuple[float, ...]) -> np.ndarray: + probabilities_array = np.asarray(probabilities, dtype=float) + total = probabilities_array.sum() + if total <= 0: + raise ValueError("holding-period probabilities must sum to a positive value") + return probabilities_array / total + + +def _stable_uniforms(values: np.ndarray, *, salt: str) -> np.ndarray: + return np.array( + [_stable_uniform(value, salt=salt) for value in values], dtype=float + ) + + +def _stable_uniform(value, *, salt: str) -> float: + digest = hashlib.blake2b( + f"{salt}|{value}".encode("utf-8"), + digest_size=8, + ).digest() + integer = int.from_bytes(digest, byteorder="big", signed=False) + return (integer + 0.5) / 2**64 diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py index b29914ca3..68d8a59a9 100644 --- a/tests/unit/calibration/test_calibration_puf_impute.py +++ b/tests/unit/calibration/test_calibration_puf_impute.py @@ -10,6 +10,7 @@ from policyengine_us_data.calibration import puf_impute as puf_impute_module from policyengine_us_data.calibration.puf_impute import ( DEMOGRAPHIC_PREDICTORS, + DETERMINISTIC_IMPUTED_VARIABLES, IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, _impute_retirement_contributions, @@ -207,6 +208,14 @@ def test_demographic_predictors_excludes_state(self): def test_imputed_variables_not_empty(self): assert len(IMPUTED_VARIABLES) > 0 + def test_capital_gains_basis_fields_are_stage_one_outputs(self): + expected = { + "long_term_capital_gains_basis", + "long_term_capital_gains_years_held", + } + assert expected <= set(IMPUTED_VARIABLES) + assert expected <= set(DETERMINISTIC_IMPUTED_VARIABLES) + def test_overridden_subset_of_imputed(self): for var in OVERRIDDEN_IMPUTED_VARIABLES: assert var in IMPUTED_VARIABLES @@ -301,6 +310,40 @@ def fake_run_qrf_imputation(*args, **kwargs): np.testing.assert_array_equal(employment[:20], data["employment_income"][2024]) np.testing.assert_array_equal(employment[20:], y_full["employment_income"]) + def test_capital_gains_basis_is_deterministically_imputed(self, monkeypatch): + data = _make_mock_data(n_persons=4, n_households=2) + data["person_tax_unit_id"] = {2024: np.array([1, 1, 2, 2])} + data["person_household_id"] = {2024: np.array([1, 1, 2, 2])} + data["long_term_capital_gains"] = { + 2024: np.array([100.0, -40.0, 0.0, 200.0], dtype=np.float32) + } + + monkeypatch.setattr( + puf_impute_module, + "has_policyengine_us_variables", + lambda *variables: True, + ) + + result = puf_clone_dataset( + data=data, + state_fips=np.array([1, 2]), + time_period=2024, + skip_qrf=True, + ) + + basis = result["long_term_capital_gains_basis"][2024] + years = result["long_term_capital_gains_years_held"][2024] + gains = result["long_term_capital_gains"][2024] + tax_unit_ids = result["person_tax_unit_id"][2024] + + assert np.all(basis[gains != 0] > 0) + assert np.all(years[gains != 0] > 0) + assert np.all(basis[gains == 0] == 0) + assert np.all(years[gains == 0] == 0) + for tax_unit_id in np.unique(tax_unit_ids[gains != 0]): + mask = (tax_unit_ids == tax_unit_id) & (gains != 0) + assert np.unique(years[mask]).size == 1 + def test_sstb_qbi_split_variables_imputed(self): expected = { "sstb_self_employment_income", diff --git a/tests/unit/datasets/test_irs_puf.py b/tests/unit/datasets/test_irs_puf.py index 8d1f8e380..31c62d1c4 100644 --- a/tests/unit/datasets/test_irs_puf.py +++ b/tests/unit/datasets/test_irs_puf.py @@ -6,6 +6,7 @@ PUF, QBI_SIMULATION_VERSION, QBI_SIMULATION_VERSION_ATTR, + _person_financial_value_from_puf_row, ) @@ -25,6 +26,30 @@ def test_irs_puf_generates(year: int): dataset_by_year[year](require=True) +def test_puf_person_split_keeps_capital_gains_holding_period_collapsed(): + row = { + "long_term_capital_gains": 1_000.0, + "long_term_capital_gains_years_held": 12.5, + } + + assert ( + _person_financial_value_from_puf_row( + "long_term_capital_gains_years_held", + row, + 0.25, + ) + == 12.5 + ) + assert ( + _person_financial_value_from_puf_row( + "long_term_capital_gains_years_held", + row, + 0.0, + ) + == 0 + ) + + def test_puf_load_dataset_backfills_sstb_split_inputs(tmp_path): class DummyPUF(PUF): label = "Dummy PUF" diff --git a/tests/unit/test_capital_gains_basis.py b/tests/unit/test_capital_gains_basis.py new file mode 100644 index 000000000..a102b4f95 --- /dev/null +++ b/tests/unit/test_capital_gains_basis.py @@ -0,0 +1,99 @@ +import numpy as np +import pandas as pd +import pytest + +from policyengine_us_data.utils.capital_gains_basis import ( + LONG_TERM_CAPITAL_GAINS_BASIS, + LONG_TERM_CAPITAL_GAINS_YEARS_HELD, + add_long_term_capital_gains_basis_to_puf_frame, + impute_person_level_long_term_capital_gains_basis, + impute_tax_unit_long_term_capital_gains_basis, +) + + +def test_tax_unit_imputation_is_record_stable_under_shuffle(): + gains = np.array([1_000, 20_000, -4_000, 0, 7_500, -12_000], dtype=float) + ids = np.array([101, 102, 103, 104, 105, 106]) + weights = np.array([10, 2, 5, 1, 8, 3], dtype=float) + + direct = impute_tax_unit_long_term_capital_gains_basis( + gains, + tax_unit_ids=ids, + sample_weight=weights, + tax_year=2026, + ) + + order = np.array([4, 2, 0, 5, 1, 3]) + shuffled = impute_tax_unit_long_term_capital_gains_basis( + gains[order], + tax_unit_ids=ids[order], + sample_weight=weights[order], + tax_year=2026, + ) + + reverse_order = np.argsort(order) + np.testing.assert_allclose(direct.basis, shuffled.basis[reverse_order]) + np.testing.assert_allclose(direct.years_held, shuffled.years_held[reverse_order]) + np.testing.assert_array_equal( + direct.holding_period_bucket, + shuffled.holding_period_bucket[reverse_order], + ) + + +def test_zero_gain_records_get_zero_basis_and_holding_period(): + imputation = impute_tax_unit_long_term_capital_gains_basis( + np.array([0.0]), + tax_unit_ids=np.array([1]), + tax_year=2026, + ) + + assert imputation.basis[0] == 0 + assert imputation.years_held[0] == 0 + assert imputation.holding_period_bucket[0] == -1 + + +def test_person_allocation_preserves_collapsed_tax_unit_basis(): + gains = np.array([100.0, -40.0, 0.0, -80.0]) + tax_unit_ids = np.array([1, 1, 2, 3]) + person_ids = np.array([11, 12, 21, 31]) + + person_imputation = impute_person_level_long_term_capital_gains_basis( + gains, + person_tax_unit_ids=tax_unit_ids, + person_ids=person_ids, + tax_year=2026, + ) + tax_unit_imputation = impute_tax_unit_long_term_capital_gains_basis( + np.array([60.0, 0.0, -80.0]), + tax_unit_ids=np.array([1, 2, 3]), + tax_year=2026, + ) + + assert person_imputation.years_held[0] == pytest.approx( + person_imputation.years_held[1] + ) + assert person_imputation.basis[:2].sum() == pytest.approx( + tax_unit_imputation.basis[0] + ) + assert person_imputation.basis[2] == 0 + assert person_imputation.years_held[2] == 0 + assert person_imputation.basis[3] == pytest.approx(tax_unit_imputation.basis[2]) + + +def test_puf_frame_helper_adds_basis_and_years(): + puf = pd.DataFrame( + { + "RECID": [10, 11, 12], + "S006": [100.0, 200.0, 300.0], + "long_term_capital_gains": [5_000.0, -2_000.0, 0.0], + } + ) + + result = add_long_term_capital_gains_basis_to_puf_frame(puf.copy(), tax_year=2026) + + assert LONG_TERM_CAPITAL_GAINS_BASIS in result + assert LONG_TERM_CAPITAL_GAINS_YEARS_HELD in result + assert result.loc[0, LONG_TERM_CAPITAL_GAINS_BASIS] > 0 + assert result.loc[1, LONG_TERM_CAPITAL_GAINS_BASIS] > 0 + assert result.loc[2, LONG_TERM_CAPITAL_GAINS_BASIS] == 0 + assert result.loc[2, LONG_TERM_CAPITAL_GAINS_YEARS_HELD] == 0 From a581a14f13496267b03894c1d485816ea85db1d4 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 09:19:42 -0400 Subject: [PATCH 2/5] Add changelog for capital gains basis imputation --- changelog.d/1128.added | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/1128.added diff --git a/changelog.d/1128.added b/changelog.d/1128.added new file mode 100644 index 000000000..10d7af0b2 --- /dev/null +++ b/changelog.d/1128.added @@ -0,0 +1 @@ +Added deterministic collapsed long-term capital gains basis and holding-period imputation. From d15956785bba3ecbd00aaf898fa58d27d673bc2f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sun, 24 May 2026 09:35:24 -0400 Subject: [PATCH 3/5] Fix capital gains basis read backfill --- policyengine_us_data/datasets/puf/puf.py | 86 ++++++++++++++++++- .../test_calibration_puf_impute.py | 53 ++++++++++++ tests/unit/datasets/test_irs_puf.py | 86 +++++++++++++++++++ 3 files changed, 223 insertions(+), 2 deletions(-) diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index 60a344442..6a2c0c3f0 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -1340,6 +1340,88 @@ def _ensure_sstb_split_inputs(self) -> dict[str, np.ndarray]: return overrides + def _capital_gains_basis_overrides( + self, + existing_overrides: dict[str, np.ndarray] | None = None, + ) -> dict[str, np.ndarray]: + if not has_policyengine_us_variables(*CAPITAL_GAINS_BASIS_VARIABLES): + return {} + if not self.file_path.exists(): + return {} + + existing_overrides = existing_overrides or {} + with h5py.File(self.file_path, "r") as file_handle: + keys = set(file_handle.keys()) | set(existing_overrides) + if all(variable in keys for variable in CAPITAL_GAINS_BASIS_VARIABLES): + return {} + if ( + "long_term_capital_gains" not in keys + or "person_tax_unit_id" not in keys + ): + return {} + + gains = self._values_from_file_or_overrides( + file_handle, + "long_term_capital_gains", + existing_overrides, + 0, + ) + length = len(gains) + arrays = { + "long_term_capital_gains": gains, + "person_tax_unit_id": self._values_from_file_or_overrides( + file_handle, + "person_tax_unit_id", + existing_overrides, + length, + ), + } + for variable in ( + "person_id", + "household_weight", + "person_household_id", + "household_id", + *CAPITAL_GAINS_BASIS_VARIABLES, + ): + if variable in keys: + arrays[variable] = self._values_from_file_or_overrides( + file_handle, + variable, + existing_overrides, + length, + ) + + arrays = _with_capital_gains_basis_inputs(arrays, self.time_period) + return { + variable: np.asarray(arrays[variable]) + for variable in CAPITAL_GAINS_BASIS_VARIABLES + if variable not in keys and variable in arrays + } + + def _ensure_capital_gains_basis_inputs( + self, + existing_overrides: dict[str, np.ndarray] | None = None, + ) -> dict[str, np.ndarray]: + overrides = self._capital_gains_basis_overrides(existing_overrides) + if not overrides: + return {} + + try: + with h5py.File(self.file_path, "r+") as file_handle: + for key, values in overrides.items(): + self._replace_array(file_handle, key, values) + except OSError: + pass + + return overrides + + def _ensure_read_overrides(self) -> dict[str, np.ndarray]: + sstb_overrides = self._ensure_sstb_split_inputs() + capital_gains_overrides = self._ensure_capital_gains_basis_inputs( + sstb_overrides + ) + return {**sstb_overrides, **capital_gains_overrides} + class _OverrideView: def __init__(self, backing, overrides: dict[str, np.ndarray]): self._backing = backing @@ -1393,7 +1475,7 @@ def __getattr__(self, name): def load(self, key=None, mode="r"): if mode == "r": - overrides = self._ensure_sstb_split_inputs() + overrides = self._ensure_read_overrides() if key in overrides: return overrides[key] if key is None and overrides: @@ -1401,7 +1483,7 @@ def load(self, key=None, mode="r"): return super().load(key=key, mode=mode) def load_dataset(self): - overrides = self._ensure_sstb_split_inputs() + overrides = self._ensure_read_overrides() arrays = super().load_dataset() arrays.update(overrides) return arrays diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py index 68d8a59a9..68f442bfe 100644 --- a/tests/unit/calibration/test_calibration_puf_impute.py +++ b/tests/unit/calibration/test_calibration_puf_impute.py @@ -216,6 +216,59 @@ def test_capital_gains_basis_fields_are_stage_one_outputs(self): assert expected <= set(IMPUTED_VARIABLES) assert expected <= set(DETERMINISTIC_IMPUTED_VARIABLES) + def test_qrf_excludes_deterministic_capital_gains_basis_outputs( + self, + monkeypatch, + ): + import policyengine_us + + data = _make_mock_data(n_persons=4, n_households=2) + + class FakeCalculation: + values = np.array([100.0, 200.0, 300.0, 400.0], dtype=np.float32) + + class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = dataset + + def calculate(self, variable, map_to=None): + return FakeCalculation() + + def calculate_dataframe(self, variables): + return pd.DataFrame( + {variable: np.arange(4, dtype=np.float32) for variable in variables} + ) + + captured_output_vars = [] + + def fake_sequential_qrf(X_train, X_test, predictors, output_vars): + captured_output_vars.append(tuple(output_vars)) + return { + variable: np.zeros(len(X_test), dtype=np.float32) + for variable in output_vars + } + + monkeypatch.setattr(policyengine_us, "Microsimulation", FakeMicrosimulation) + monkeypatch.setattr( + puf_impute_module, + "_sequential_qrf", + fake_sequential_qrf, + ) + + puf_impute_module._run_qrf_imputation( + data=data, + time_period=2024, + puf_dataset=object(), + ) + + deterministic_outputs = set(DETERMINISTIC_IMPUTED_VARIABLES) + assert captured_output_vars + for output_vars in captured_output_vars: + assert deterministic_outputs.isdisjoint(output_vars) + assert set(captured_output_vars[0]) == ( + set(IMPUTED_VARIABLES) - deterministic_outputs + ) + def test_overridden_subset_of_imputed(self): for var in OVERRIDDEN_IMPUTED_VARIABLES: assert var in IMPUTED_VARIABLES diff --git a/tests/unit/datasets/test_irs_puf.py b/tests/unit/datasets/test_irs_puf.py index 31c62d1c4..f19ec4fa8 100644 --- a/tests/unit/datasets/test_irs_puf.py +++ b/tests/unit/datasets/test_irs_puf.py @@ -2,6 +2,7 @@ import numpy as np import pytest +from policyengine_us_data.datasets.puf import puf as puf_module from policyengine_us_data.datasets.puf.puf import ( PUF, QBI_SIMULATION_VERSION, @@ -14,6 +15,19 @@ def _mark_current_qbi_simulation(file_handle): file_handle.attrs[QBI_SIMULATION_VERSION_ATTR] = QBI_SIMULATION_VERSION +def _write_capital_gains_basis_source_file(path): + with h5py.File(path, "w") as file_handle: + file_handle.create_dataset("person_id", data=np.array([1, 2, 3, 4])) + file_handle.create_dataset("person_tax_unit_id", data=np.array([1, 1, 2, 2])) + file_handle.create_dataset("person_household_id", data=np.array([1, 1, 2, 2])) + file_handle.create_dataset("household_id", data=np.array([1, 2])) + file_handle.create_dataset("household_weight", data=np.array([100.0, 200.0])) + file_handle.create_dataset( + "long_term_capital_gains", + data=np.array([100.0, -40.0, 0.0, 200.0]), + ) + + @pytest.mark.skip(reason="This test requires private data.") @pytest.mark.parametrize("year", [2015]) def test_irs_puf_generates(year: int): @@ -50,6 +64,78 @@ def test_puf_person_split_keeps_capital_gains_holding_period_collapsed(): ) +def test_puf_load_dataset_backfills_capital_gains_basis_inputs( + tmp_path, + monkeypatch, +): + monkeypatch.setattr( + puf_module, + "has_policyengine_us_variables", + lambda *variables: True, + ) + + class DummyPUF(PUF): + label = "Dummy PUF" + name = "dummy_puf" + time_period = 2024 + file_path = tmp_path / "dummy_puf.h5" + + _write_capital_gains_basis_source_file(DummyPUF.file_path) + + arrays = DummyPUF().load_dataset() + + basis = arrays["long_term_capital_gains_basis"] + years = arrays["long_term_capital_gains_years_held"] + gains = arrays["long_term_capital_gains"] + + assert np.all(basis[gains != 0] > 0) + assert np.all(years[gains != 0] > 0) + assert np.all(basis[gains == 0] == 0) + assert np.all(years[gains == 0] == 0) + + with h5py.File(DummyPUF.file_path, "r") as file_handle: + assert "long_term_capital_gains_basis" in file_handle + assert "long_term_capital_gains_years_held" in file_handle + + +def test_puf_load_key_backfills_read_only_capital_gains_basis_inputs( + tmp_path, + monkeypatch, +): + monkeypatch.setattr( + puf_module, + "has_policyengine_us_variables", + lambda *variables: True, + ) + + class DummyPUF(PUF): + label = "Dummy PUF" + name = "dummy_puf" + time_period = 2024 + file_path = tmp_path / "dummy_puf.h5" + + _write_capital_gains_basis_source_file(DummyPUF.file_path) + DummyPUF.file_path.chmod(0o444) + + dataset = DummyPUF() + try: + basis = dataset.load("long_term_capital_gains_basis") + years = dataset.load("long_term_capital_gains_years_held") + reader = dataset.load() + np.testing.assert_array_equal( + reader["long_term_capital_gains_basis"], + basis, + ) + reader.close() + finally: + DummyPUF.file_path.chmod(0o644) + + assert np.all(basis[[0, 1, 3]] > 0) + assert basis[2] == 0 + assert np.all(years[[0, 1, 3]] > 0) + assert years[2] == 0 + + def test_puf_load_dataset_backfills_sstb_split_inputs(tmp_path): class DummyPUF(PUF): label = "Dummy PUF" From 6c6922327e31bd832abedfc211ae7b46f35c966e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 25 May 2026 06:17:06 -0400 Subject: [PATCH 4/5] Use unreleased policyengine-us git ref --- .../check_policyengine_us_dependency.py | 54 ++++++++++++++++--- pyproject.toml | 2 +- tests/unit/test_publication_scripts.py | 28 ++++++++++ uv.lock | 10 ++-- 4 files changed, 78 insertions(+), 16 deletions(-) diff --git a/.github/scripts/check_policyengine_us_dependency.py b/.github/scripts/check_policyengine_us_dependency.py index 1fd574bf1..85fdf4bde 100644 --- a/.github/scripts/check_policyengine_us_dependency.py +++ b/.github/scripts/check_policyengine_us_dependency.py @@ -18,6 +18,8 @@ PYPI_JSON_TIMEOUT_SECONDS = 20 POLICYENGINE_US = "policyengine-us" STALE_LOCK_PREFIX = "uv.lock has policyengine-us " +LOCK_GIT_REF_PREFIX = "uv.lock resolves policyengine-us from a Git ref." +PROJECT_GIT_REF_PREFIX = "pyproject.toml pins policyengine-us to a Git ref." def _annotation(level: str, message: str) -> str: @@ -86,6 +88,8 @@ def _latest_pypi_version() -> str: def check_dependency(root: Path, latest_version: str | None = None) -> list[str]: locked_version, source = _locked_policyengine_us(root) project_dependency = _project_policyengine_us_dependency(root) + lock_uses_git_ref = "git" in source + project_uses_git_ref = "@" in project_dependency and "git+" in project_dependency violations: list[str] = [] if ( @@ -99,27 +103,40 @@ def check_dependency(root: Path, latest_version: str | None = None) -> list[str] ) expected_dependency = f"{POLICYENGINE_US}=={locked_version}" - if project_dependency != expected_dependency: + if not project_uses_git_ref and project_dependency != expected_dependency: violations.append( f"pyproject.toml must pin {expected_dependency} to match uv.lock; " f"found {project_dependency!r}." ) - if "git" in source: + if lock_uses_git_ref: violations.append( - "uv.lock resolves policyengine-us from a Git ref. Prefer an exact " + f"{LOCK_GIT_REF_PREFIX} Prefer an exact " f"PyPI release pin once policyengine-us {locked_version} is published." ) - if "@" in project_dependency and "git+" in project_dependency: + if project_uses_git_ref: violations.append( - "pyproject.toml pins policyengine-us to a Git ref. Prefer an exact " + f"{PROJECT_GIT_REF_PREFIX} Prefer an exact " "PyPI release pin for production data builds." ) return violations +def _is_unreleased_git_ref_violation( + violation: str, + locked_version: str, + latest_version: str | None, +) -> bool: + if latest_version is None: + return False + git_ref_violation = violation.startswith( + LOCK_GIT_REF_PREFIX + ) or violation.startswith(PROJECT_GIT_REF_PREFIX) + return git_ref_violation and _compare_versions(locked_version, latest_version) > 0 + + def main() -> int: parser = argparse.ArgumentParser() parser.add_argument( @@ -163,17 +180,30 @@ def main() -> int: print(f"policyengine-us dependency is current at {locked_version}.") return 0 + locked_version, _source = _locked_policyengine_us(REPO_ROOT) has_blocking_violation = False allowed_stale_version = False + allowed_unreleased_git_ref = False for violation in violations: stale_version_violation = violation.startswith(STALE_LOCK_PREFIX) allowed_by_override = allow_stale and stale_version_violation - level = "warning" if args.mode == "warn" or allowed_by_override else "error" + allowed_git_ref = _is_unreleased_git_ref_violation( + violation, + locked_version, + latest_version, + ) + level = ( + "warning" + if args.mode == "warn" or allowed_by_override or allowed_git_ref + else "error" + ) print(_annotation(level, violation)) - if args.mode == "fail" and not allowed_by_override: + if args.mode == "fail" and not allowed_by_override and not allowed_git_ref: has_blocking_violation = True if allowed_by_override: allowed_stale_version = True + if allowed_git_ref: + allowed_unreleased_git_ref = True if allowed_stale_version: print( @@ -183,10 +213,18 @@ def main() -> int: "policyengine-us lagging the latest PyPI release.", ) ) + if allowed_unreleased_git_ref: + print( + _annotation( + "warning", + "policyengine-us is pinned to an unreleased Git ref; switch to " + f"policyengine-us=={locked_version} once that PyPI release exists.", + ) + ) if has_blocking_violation: return 1 - if allowed_stale_version: + if allowed_stale_version or allowed_unreleased_git_ref: return 0 return 1 if args.mode == "fail" else 0 diff --git a/pyproject.toml b/pyproject.toml index 48c623b35..43825811c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ classifiers = [ "Programming Language :: Python :: 3.14", ] dependencies = [ - "policyengine-us==1.705.15", + "policyengine-us @ git+https://github.com/PolicyEngine/policyengine-us@0dcc69aa7a38be901141d38c8dbb7a9c870f2561", # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+. diff --git a/tests/unit/test_publication_scripts.py b/tests/unit/test_publication_scripts.py index d7df93b0c..8180847f4 100644 --- a/tests/unit/test_publication_scripts.py +++ b/tests/unit/test_publication_scripts.py @@ -282,6 +282,34 @@ def test_policyengine_us_dependency_check_flags_git_refs(tmp_path): assert any("Git ref" in violation for violation in violations) +def test_policyengine_us_dependency_check_allows_unreleased_git_refs( + tmp_path, + monkeypatch, +): + module = _load_script( + ".github/scripts/check_policyengine_us_dependency.py", + "check_policyengine_us_dependency_unreleased_git_test", + ) + _write_pyproject_with_policyengine_us( + tmp_path, + "policyengine-us @ git+https://github.com/PolicyEngine/policyengine-us@abc", + ) + _write_uv_lock_for_policyengine_us( + tmp_path, + "1.691.12", + source='{ git = "https://github.com/PolicyEngine/policyengine-us?rev=abc#abc" }', + ) + monkeypatch.setattr(module, "REPO_ROOT", tmp_path) + monkeypatch.setattr(module, "_latest_pypi_version", lambda: "1.691.11") + monkeypatch.setattr( + sys, + "argv", + ["check_policyengine_us_dependency.py", "--mode", "fail"], + ) + + assert module.main() == 0 + + def test_policyengine_us_dependency_check_flags_non_exact_pyproject_pin(tmp_path): module = _load_script( ".github/scripts/check_policyengine_us_dependency.py", diff --git a/uv.lock b/uv.lock index 5cda418f1..eb9abfb9e 100644 --- a/uv.lock +++ b/uv.lock @@ -2122,8 +2122,8 @@ wheels = [ [[package]] name = "policyengine-us" -version = "1.705.15" -source = { registry = "https://pypi.org/simple" } +version = "1.706.14" +source = { git = "https://github.com/PolicyEngine/policyengine-us?rev=0dcc69aa7a38be901141d38c8dbb7a9c870f2561#0dcc69aa7a38be901141d38c8dbb7a9c870f2561" } dependencies = [ { name = "microdf-python" }, { name = "pandas" }, @@ -2132,10 +2132,6 @@ dependencies = [ { name = "tables" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/54/cc/921f994e5c688be0f45dbe8d7bce209ee45225d328a9724cf363df225ce8/policyengine_us-1.705.15.tar.gz", hash = "sha256:559d79690cb1d79615479ed2c71a53510e8cfea56d2398a37120f6797548c26b", size = 9927111, upload-time = "2026-05-24T02:32:30.769Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/24/02/b8ae7ec50124bc4f442c8e3f662bf02d0f670d288c63e367dff75ed8c374/policyengine_us-1.705.15-py3-none-any.whl", hash = "sha256:aeafbbbef2a8de88cb73da8c234943784789023e7d32e05a9560e1a7dd713c70", size = 10758474, upload-time = "2026-05-24T02:32:26.588Z" }, -] [[package]] name = "policyengine-us-data" @@ -2204,7 +2200,7 @@ requires-dist = [ { name = "pandas", specifier = ">=2.3.1" }, { name = "pip-system-certs", specifier = ">=3.0" }, { name = "policyengine-core", specifier = ">=3.26.1,<3.27" }, - { name = "policyengine-us", specifier = "==1.705.15" }, + { name = "policyengine-us", git = "https://github.com/PolicyEngine/policyengine-us?rev=0dcc69aa7a38be901141d38c8dbb7a9c870f2561" }, { name = "requests", specifier = ">=2.25.0" }, { name = "samplics", marker = "extra == 'calibration'" }, { name = "scipy", specifier = ">=1.15.3" }, From 33326978c4d78f9e565d32673f242d3048a17243 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Mon, 25 May 2026 06:44:44 -0400 Subject: [PATCH 5/5] Handle mock variable metadata in PUF clone --- policyengine_us_data/calibration/puf_impute.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index e0d7c5895..19a86dd31 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -616,7 +616,9 @@ def _map_to_entity(pred_values, variable_name): var_meta = tbs.variables.get(variable_name) if var_meta is None: return pred_values - entity = var_meta.entity.key + entity = getattr(getattr(var_meta, "entity", None), "key", None) + if not isinstance(entity, str): + return pred_values if entity != "person": return cps_sim.populations[entity].value_from_first_person(pred_values) return pred_values