From 822b5e3465bc6fb5b3c105516ad23a6244dd903f Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 24 May 2026 09:15:00 -0400
Subject: [PATCH 1/5] Add collapsed capital gains basis imputation

---
 .../calibration/puf_impute.py                 | 105 +++-
 policyengine_us_data/datasets/puf/puf.py      |  64 +++
 .../utils/capital_gains_basis.py              | 499 ++++++++++++++++++
 .../test_calibration_puf_impute.py            |  43 ++
 tests/unit/datasets/test_irs_puf.py           |  25 +
 tests/unit/test_capital_gains_basis.py        |  99 ++++
 6 files changed, 825 insertions(+), 10 deletions(-)
 create mode 100644 policyengine_us_data/utils/capital_gains_basis.py
 create mode 100644 tests/unit/test_capital_gains_basis.py

diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py
index f8dd58fa9..e0d7c5895 100644
--- a/policyengine_us_data/calibration/puf_impute.py
+++ b/policyengine_us_data/calibration/puf_impute.py
@@ -27,6 +27,15 @@
 )
 from policyengine_us_data.pipeline_metadata import pipeline_node
 from policyengine_us_data.pipeline_schema import PipelineNode
+from policyengine_us_data.utils.capital_gains_basis import (
+    CAPITAL_GAINS_BASIS_VARIABLES,
+    LONG_TERM_CAPITAL_GAINS_BASIS,
+    LONG_TERM_CAPITAL_GAINS_YEARS_HELD,
+    impute_person_level_long_term_capital_gains_basis,
+)
+from policyengine_us_data.utils.policyengine import (
+    has_policyengine_us_variables,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -59,6 +68,8 @@
     "interest_deduction",
     "tax_exempt_pension_income",
     "long_term_capital_gains",
+    "long_term_capital_gains_basis",
+    "long_term_capital_gains_years_held",
     "unreimbursed_business_employee_expenses",
     "pre_tax_contributions",
     "taxable_ira_distributions",
@@ -111,6 +122,8 @@
     "self_employment_income_would_be_qualified",
 ]
 
+DETERMINISTIC_IMPUTED_VARIABLES = list(CAPITAL_GAINS_BASIS_VARIABLES)
+
 SS_SUBCOMPONENTS = [
     "social_security_retirement",
     "social_security_disability",
@@ -190,6 +203,68 @@
 RETIREMENT_PREDICTORS = RETIREMENT_DEMOGRAPHIC_PREDICTORS + RETIREMENT_INCOME_PREDICTORS
 
 
+def _person_weights_from_household_weights(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+) -> Optional[np.ndarray]:
+    """Return one household weight per person, or None when it cannot be built.
+
+    Weights already at person length are passed through; otherwise each
+    person's household is looked up by id, with 1.0 for unknown households.
+    """
+
+    def _lookup(name):
+        return data.get(name, {}).get(time_period)
+
+    weights = _lookup("household_weight")
+    if weights is None:
+        return None
+    people = _lookup("person_id")
+    if people is not None and len(weights) == len(people):
+        return np.asarray(weights, dtype=float)
+    person_households = _lookup("person_household_id")
+    households = _lookup("household_id")
+    if person_households is None or households is None:
+        return None
+    if len(weights) != len(households):
+        return None
+    weight_of = dict(zip(households.tolist(), weights))
+    return np.asarray([weight_of.get(h, 1.0) for h in person_households], dtype=float)
+
+
+def _impute_long_term_capital_gains_basis(
+    data: Dict[str, Dict[int, np.ndarray]],
+    time_period: int,
+) -> None:
+    """Write long-term capital gains basis and years-held arrays into ``data``.
+
+    Returns without changes when ``has_policyengine_us_variables`` is false
+    for the basis variables, or when the gains, tax-unit id, or person id
+    inputs are missing. Outputs are stored as float32 under ``time_period``.
+    """
+
+    required_inputs = ("long_term_capital_gains", "person_tax_unit_id", "person_id")
+    if not has_policyengine_us_variables(*CAPITAL_GAINS_BASIS_VARIABLES):
+        return
+    if any(name not in data for name in required_inputs):
+        return
+
+    person_weights = _person_weights_from_household_weights(data, time_period)
+    result = impute_person_level_long_term_capital_gains_basis(
+        data["long_term_capital_gains"][time_period],
+        person_tax_unit_ids=data["person_tax_unit_id"][time_period],
+        person_ids=data["person_id"][time_period],
+        person_sample_weight=person_weights,
+        tax_year=time_period,
+    )
+    outputs = {
+        LONG_TERM_CAPITAL_GAINS_BASIS: result.basis,
+        LONG_TERM_CAPITAL_GAINS_YEARS_HELD: result.years_held,
+    }
+    for name, values in outputs.items():
+        data[name] = {time_period: values.astype(np.float32)}
+
+
 def _get_retirement_limits(year: int) -> dict:
     """Return contribution limits for the given tax year.
 
@@ -517,8 +592,8 @@ def puf_clone_dataset(
         person_count,
     )
 
-    y_full = None
-    y_override = None
+    y_full = {}
+    y_override = {}
     if not skip_qrf and puf_dataset is not None:
         y_full, y_override = _run_qrf_imputation(
             data,
@@ -548,12 +623,12 @@ def _map_to_entity(pred_values, variable_name):
 
     # Impute weeks_unemployed for PUF half
     puf_weeks = None
-    if y_full is not None and dataset_path is not None:
+    if y_full and dataset_path is not None:
         puf_weeks = _impute_weeks_unemployed(data, y_full, time_period, dataset_path)
 
     # Impute retirement contributions for PUF half
     puf_retirement = None
-    if y_full is not None and dataset_path is not None:
+    if y_full and dataset_path is not None:
         puf_retirement = _impute_retirement_contributions(
             data, y_full, time_period, dataset_path
         )
@@ -566,10 +641,10 @@ def _map_to_entity(pred_values, variable_name):
 
         values = time_dict[time_period]
 
-        if variable in OVERRIDDEN_IMPUTED_VARIABLES and y_override:
+        if variable in y_override:
             pred = _map_to_entity(y_override[variable], variable)
             new_data[variable] = {time_period: np.concatenate([pred, pred])}
-        elif variable in IMPUTED_VARIABLES and y_full:
+        elif variable in y_full:
             pred = _map_to_entity(y_full[variable], variable)
             new_data[variable] = {time_period: np.concatenate([values, pred])}
         elif "_id" in variable and np.issubdtype(values.dtype, np.number):
@@ -624,7 +699,9 @@ def _map_to_entity(pred_values, variable_name):
         }
 
     if y_full:
-        for var in IMPUTED_VARIABLES:
+        for var in y_full:
+            if var in PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES:
+                continue
             if var not in data:
                 pred = _map_to_entity(y_full[var], var)
                 new_data[var] = {time_period: np.concatenate([pred, pred])}
@@ -632,6 +709,8 @@ def _map_to_entity(pred_values, variable_name):
     if cps_sim is not None:
         del cps_sim
 
+    _impute_long_term_capital_gains_basis(new_data, time_period)
+
     # Ensure SS sub-components match the (possibly imputed) total.
     reconcile_ss_subcomponents(new_data, person_count, time_period)
 
@@ -942,8 +1021,14 @@ def _run_qrf_imputation(
 
     puf_agi = puf_sim.calculate("adjusted_gross_income", map_to="person").values
 
+    qrf_imputed_variables = [
+        variable
+        for variable in IMPUTED_VARIABLES
+        if variable not in DETERMINISTIC_IMPUTED_VARIABLES
+    ]
+
     X_train_full = puf_sim.calculate_dataframe(
-        DEMOGRAPHIC_PREDICTORS + IMPUTED_VARIABLES
+        DEMOGRAPHIC_PREDICTORS + qrf_imputed_variables
     )
 
     X_train_override = puf_sim.calculate_dataframe(
@@ -972,9 +1057,9 @@ def _run_qrf_imputation(
             if pred in data:
                 X_test[pred] = data[pred][time_period].astype(np.float32)
 
-    logger.info("Imputing %d PUF variables (full)", len(IMPUTED_VARIABLES))
+    logger.info("Imputing %d PUF variables (full)", len(qrf_imputed_variables))
     y_full = _sequential_qrf(
-        X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, IMPUTED_VARIABLES
+        X_train_full, X_test, DEMOGRAPHIC_PREDICTORS, qrf_imputed_variables
     )
 
     logger.info(
diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index ff390420e..60a344442 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -21,6 +21,12 @@
     STRUCTURAL_MORTGAGE_VARIABLES,
     convert_mortgage_interest_to_structural_inputs,
 )
+from policyengine_us_data.utils.capital_gains_basis import (
+    CAPITAL_GAINS_BASIS_VARIABLES,
+    LONG_TERM_CAPITAL_GAINS_YEARS_HELD,
+    add_long_term_capital_gains_basis_to_puf_frame,
+    impute_person_level_long_term_capital_gains_basis,
+)
 from policyengine_us_data.utils.policyengine import (
     has_policyengine_us_variables,
 )
@@ -614,9 +620,59 @@ def _with_lifetime_learning_credit_inputs(
 def _person_financial_value_from_puf_row(variable: str, row, share: float):
     if variable in PUF_LLC_ELIGIBILITY_INPUTS:
         return bool(row[variable]) and row["qualified_tuition_expenses"] * share > 0
+    if variable == LONG_TERM_CAPITAL_GAINS_YEARS_HELD:
+        return row[variable] if row["long_term_capital_gains"] * share != 0 else 0
     return row[variable] * share
 
 
+def _person_weights_from_household_weights(arrays: dict[str, np.ndarray]):
+    """Return household weights aligned to persons, or None if unavailable.
+
+    Weights already at person length pass through unchanged; otherwise each
+    person's household id is looked up, with 1.0 for unmatched households.
+    """
+    weights = arrays.get("household_weight")
+    if weights is None:
+        return None
+    people = arrays.get("person_id")
+    if people is not None and len(weights) == len(people):
+        return np.asarray(weights, dtype=float)
+    person_households = arrays.get("person_household_id")
+    households = arrays.get("household_id")
+    if person_households is None or households is None:
+        return None
+    if len(weights) != len(households):
+        return None
+    weight_of = dict(zip(households.tolist(), weights))
+    per_person = [weight_of.get(household, 1.0) for household in person_households]
+    return np.asarray(per_person, dtype=float)
+
+
+def _with_capital_gains_basis_inputs(
+    arrays: dict[str, np.ndarray],
+    time_period: int,
+) -> dict[str, np.ndarray]:
+    """Fill in missing capital-gains basis inputs on ``arrays`` and return it.
+
+    Mutates ``arrays``; basis or years-held values already present are kept.
+    """
+    supported = has_policyengine_us_variables(*CAPITAL_GAINS_BASIS_VARIABLES)
+    has_inputs = "long_term_capital_gains" in arrays and "person_tax_unit_id" in arrays
+    already_filled = all(name in arrays for name in CAPITAL_GAINS_BASIS_VARIABLES)
+    if not supported or not has_inputs or already_filled:
+        return arrays
+    imputed = impute_person_level_long_term_capital_gains_basis(
+        arrays["long_term_capital_gains"],
+        person_tax_unit_ids=arrays["person_tax_unit_id"],
+        person_ids=arrays.get("person_id"),
+        person_sample_weight=_person_weights_from_household_weights(arrays),
+        tax_year=time_period,
+    )
+    arrays.setdefault("long_term_capital_gains_basis", imputed.basis)
+    arrays.setdefault("long_term_capital_gains_years_held", imputed.years_held)
+    return arrays
+
+
 @pipeline_node(
     PipelineNode(
         id="preprocess_puf",
@@ -650,6 +706,7 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     puf["health_savings_account_ald"] = puf.E03290
     puf["interest_deduction"] = puf.E19200
     puf["long_term_capital_gains"] = puf.P23250
+    puf = add_long_term_capital_gains_basis_to_puf_frame(puf)
     puf["long_term_capital_gains_on_collectibles"] = puf.E24518
     # Split medical expenses using CPS fractions
     for (
@@ -814,6 +871,8 @@ def preprocess_puf(puf: pd.DataFrame) -> pd.DataFrame:
     "health_savings_account_ald",
     "interest_deduction",
     "long_term_capital_gains",
+    "long_term_capital_gains_basis",
+    "long_term_capital_gains_years_held",
     "long_term_capital_gains_on_collectibles",
     "unreimbursed_business_employee_expenses",
     "non_qualified_dividend_income",
@@ -1372,6 +1431,7 @@ def generate(self):
                     growth = current_index / start_index
                     arrays[variable] = arrays[variable] * growth
             arrays = _with_lifetime_learning_credit_inputs(arrays)
+            arrays = _with_capital_gains_basis_inputs(arrays, self.time_period)
             self._save_current_qbi_dataset(arrays)
             return
 
@@ -1478,6 +1538,10 @@ def generate(self):
             variable: values[self.time_period] for variable, values in holder_tp.items()
         }
         self.holder = _with_lifetime_learning_credit_inputs(self.holder)
+        self.holder = _with_capital_gains_basis_inputs(
+            self.holder,
+            self.time_period,
+        )
         self._save_current_qbi_dataset(self.holder)
 
     def add_tax_unit(self, row, tax_unit_id):
diff --git a/policyengine_us_data/utils/capital_gains_basis.py b/policyengine_us_data/utils/capital_gains_basis.py
new file mode 100644
index 000000000..dabbf32b7
--- /dev/null
+++ b/policyengine_us_data/utils/capital_gains_basis.py
@@ -0,0 +1,499 @@
+"""Collapsed SOCA-style basis and holding-period imputation.
+
+This module creates one representative long-term capital-gains holding
+period and cost basis per tax unit, then stores the result on people.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+import hashlib
+
+import numpy as np
+import pandas as pd
+
+
+LONG_TERM_CAPITAL_GAINS_BASIS = "long_term_capital_gains_basis"
+LONG_TERM_CAPITAL_GAINS_YEARS_HELD = "long_term_capital_gains_years_held"
+CAPITAL_GAINS_BASIS_VARIABLES = (
+    LONG_TERM_CAPITAL_GAINS_BASIS,
+    LONG_TERM_CAPITAL_GAINS_YEARS_HELD,
+)
+
+
+@dataclass(frozen=True)
+class CapitalGainsBasisResource:
+    bucket_names: tuple[str, ...]  # one label per holding-period bucket
+    bucket_lower_years: tuple[float, ...]  # lower bucket bounds, in years held
+    bucket_upper_years: tuple[float, ...]  # upper bucket bounds, in years held
+    bucket_midpoint_years: tuple[float, ...]  # interpolation knots for the ratios
+    gain_dollar_shares: tuple[float, ...]  # bucket shares used for positive gains
+    loss_dollar_shares: tuple[float, ...]  # bucket shares used for negative gains
+    gain_basis_sales_ratios: tuple[float, ...]  # per-bucket ratio knots, gains
+    loss_basis_sales_ratios: tuple[float, ...]  # per-bucket ratio knots, losses
+    weibull_shape: float = 0.7711  # Weibull shape for within-bucket draws
+    weibull_scale: float = 9.1458  # Weibull scale for within-bucket draws
+    gain_bsr_floor: float = 0.001  # clip bounds for the gain basis-to-sales ratio
+    gain_bsr_ceiling: float = 0.999
+    loss_bsr_floor: float = 1.001  # clip bounds for the loss basis-to-sales ratio
+    loss_bsr_ceiling: float = 100.0
+
+
+DEFAULT_SOCA_RESOURCE = CapitalGainsBasisResource(
+    bucket_names=(
+        "Under 18 months",
+        "18 months under 2 years",
+        "2 years under 3 years",
+        "3 years under 4 years",
+        "4 years under 5 years",
+        "5 years under 10 years",
+        "10 years under 15 years",
+        "15 years under 20 years",
+        "20 years or more",
+    ),
+    bucket_lower_years=(1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0),
+    bucket_upper_years=(1.5, 2.0, 3.0, 4.0, 5.0, 10.0, 15.0, 20.0, np.inf),
+    bucket_midpoint_years=(1.25, 1.75, 2.5, 3.5, 4.5, 7.5, 12.5, 17.5, 27.5),
+    # IRS SOI Sales of Capital Assets, 2013-2015, as compacted by holding
+    # period. Gain and loss shares are dollar-weighted within sign.
+    gain_dollar_shares=(
+        0.09265227810410295,
+        0.07010752237381986,
+        0.10520812743077781,
+        0.08298825059272091,
+        0.0743411463263887,
+        0.20575715295741653,
+        0.11737305049024667,
+        0.0755711716188086,
+        0.17600130010571793,
+    ),
+    loss_dollar_shares=(
+        0.1482795960468827,
+        0.10517359565806916,
+        0.13462839298735996,
+        0.09369439202769471,
+        0.07359794203326578,
+        0.29944369490176603,
+        0.09122874216922709,
+        0.03107240651916826,
+        0.0228812376565664,
+    ),
+    gain_basis_sales_ratios=(
+        0.8478328330650043,
+        0.8160574582327029,
+        0.8021013607528408,
+        0.8060933128473603,
+        0.7693845730205952,
+        0.770253613744043,
+        0.6358599451460517,
+        0.5146618879371708,
+        0.41336120762839623,
+    ),
+    loss_basis_sales_ratios=(
+        1.1483448499553495,
+        1.1815679561321597,
+        1.2064462486658172,
+        1.261228512659838,
+        1.329629488793004,
+        1.395990775722507,
+        1.4840458650441617,
+        1.636674383986131,
+        1.63034354751029,
+    ),
+)
+
+
+@dataclass(frozen=True)
+class CapitalGainsBasisImputation:
+    basis: np.ndarray  # imputed cost basis (0.0 where the gain is zero)
+    years_held: np.ndarray  # imputed holding period in years (0.0 for zero gain)
+    holding_period_bucket: np.ndarray  # zero-based bucket index (-1 for zero gain)
+
+
+def impute_tax_unit_long_term_capital_gains_basis(
+    gains: np.ndarray,
+    *,
+    tax_unit_ids: np.ndarray,
+    sample_weight: np.ndarray | None = None,
+    tax_year: int = 2017,
+    resource: CapitalGainsBasisResource = DEFAULT_SOCA_RESOURCE,
+    imputation_version: str = "soca_collapsed_v1",
+) -> CapitalGainsBasisImputation:
+    """Impute one collapsed basis and holding period per tax unit.
+
+    Args:
+        gains: Net long-term capital gains, one entry per tax unit.
+        tax_unit_ids: Identifiers hashed to make every draw reproducible.
+        sample_weight: Optional tax-unit weights; unit weights are used when
+            omitted or when no weight is positive.
+        tax_year: Mixed into the hash salts only.
+        resource: Holding-period shares and basis-to-sales ratios.
+        imputation_version: Mixed into the hash salts.
+
+    Returns:
+        Basis, years held, and zero-based bucket index per tax unit. Units
+        with a zero gain get basis 0.0, years held 0.0, and bucket -1.
+
+    Raises:
+        ValueError: If ``tax_unit_ids`` or ``sample_weight`` differ in
+            length from ``gains``.
+    """
+
+    gains = np.asarray(gains, dtype=float)
+    tax_unit_ids = np.asarray(tax_unit_ids)
+    if gains.shape[0] != tax_unit_ids.shape[0]:
+        raise ValueError("gains and tax_unit_ids must have the same length")
+
+    weights = None if sample_weight is None else np.asarray(sample_weight, dtype=float)
+    if weights is not None and weights.shape[0] != gains.shape[0]:
+        raise ValueError("sample_weight must match gains length")
+    if weights is None or not np.any(weights > 0):
+        weights = np.ones_like(gains, dtype=float)
+
+    buckets = _assign_holding_period_buckets(
+        gains,
+        tax_unit_ids=tax_unit_ids,
+        sample_weight=weights,
+        tax_year=tax_year,
+        resource=resource,
+        imputation_version=imputation_version,
+    )
+    years_held = _draw_years_held(
+        buckets,
+        tax_unit_ids=tax_unit_ids,
+        gains=gains,
+        tax_year=tax_year,
+        resource=resource,
+        imputation_version=imputation_version,
+    )
+    basis = _basis_from_gains_and_years(gains, years_held, resource)
+
+    has_gain = gains != 0
+    return CapitalGainsBasisImputation(
+        basis=np.where(has_gain, basis, 0.0),
+        years_held=np.where(has_gain, years_held, 0.0),
+        holding_period_bucket=np.where(has_gain, buckets, -1),
+    )
+
+
+def impute_person_level_long_term_capital_gains_basis(
+    person_gains: np.ndarray,
+    *,
+    person_tax_unit_ids: np.ndarray,
+    person_ids: np.ndarray | None = None,
+    person_sample_weight: np.ndarray | None = None,
+    tax_year: int = 2017,
+    resource: CapitalGainsBasisResource = DEFAULT_SOCA_RESOURCE,
+    imputation_version: str = "soca_collapsed_v1",
+) -> CapitalGainsBasisImputation:
+    """Impute basis per tax unit, then spread the result over its members.
+
+    Person gains are summed within each tax unit and imputed once on that
+    net amount. The unit's basis is split in proportion to each person's
+    absolute gain; years held and the bucket index are copied to members
+    with a nonzero gain. Zero-gain members, and all members of a unit whose
+    gains net to zero, get basis 0.0, years held 0.0, and bucket -1.
+
+    ``person_ids`` is only length-checked. A tax unit is weighted by the
+    largest ``person_sample_weight`` among its members (1.0 if omitted).
+
+    Raises:
+        ValueError: If any per-person input differs in length from
+            ``person_gains``.
+    """
+
+    person_gains = np.asarray(person_gains, dtype=float)
+    person_tax_unit_ids = np.asarray(person_tax_unit_ids)
+    n_people = person_gains.shape[0]
+    if n_people != person_tax_unit_ids.shape[0]:
+        raise ValueError("person_gains and person_tax_unit_ids must match")
+    if person_ids is not None and np.asarray(person_ids).shape[0] != n_people:
+        raise ValueError("person_ids must match person_gains length")
+
+    if person_sample_weight is None:
+        person_weights = 1.0
+    else:
+        person_weights = np.asarray(person_sample_weight, dtype=float)
+        if person_weights.shape[0] != n_people:
+            raise ValueError("person_sample_weight must match person_gains length")
+
+    gain_size = np.abs(person_gains)
+    frame = pd.DataFrame(
+        {
+            "person_gain": person_gains,
+            "tax_unit_id": person_tax_unit_ids,
+            "sample_weight": person_weights,
+            "abs_gain": gain_size,
+        }
+    )
+
+    by_unit = frame.groupby("tax_unit_id", sort=False)
+    grouped = by_unit.agg(
+        gain=("person_gain", "sum"),
+        sample_weight=("sample_weight", "max"),
+    )
+    unit_result = impute_tax_unit_long_term_capital_gains_basis(
+        grouped["gain"].to_numpy(),
+        tax_unit_ids=grouped.index.to_numpy(),
+        sample_weight=grouped["sample_weight"].to_numpy(),
+        tax_year=tax_year,
+        resource=resource,
+        imputation_version=imputation_version,
+    )
+
+    def _to_people(unit_values: np.ndarray) -> np.ndarray:
+        # Broadcast one value per tax unit back onto that unit's members.
+        lookup = pd.Series(unit_values, index=grouped.index)
+        return frame["tax_unit_id"].map(lookup).to_numpy()
+
+    unit_gain_size = by_unit["abs_gain"].transform("sum").to_numpy()
+    basis = np.divide(
+        _to_people(unit_result.basis) * gain_size,
+        unit_gain_size,
+        out=np.zeros_like(person_gains, dtype=float),
+        where=unit_gain_size > 0,
+    )
+
+    has_gain = gain_size > 0
+    years_held = np.where(has_gain, _to_people(unit_result.years_held), 0.0)
+    buckets = np.where(has_gain, _to_people(unit_result.holding_period_bucket), -1)
+    return CapitalGainsBasisImputation(
+        basis=basis,
+        years_held=years_held,
+        holding_period_bucket=buckets,
+    )
+
+
+def add_long_term_capital_gains_basis_to_puf_frame(
+    puf: pd.DataFrame,
+    *,
+    tax_year: int = 2017,
+    resource: CapitalGainsBasisResource = DEFAULT_SOCA_RESOURCE,
+) -> pd.DataFrame:
+    """Attach imputed basis and years-held columns to ``puf`` in place.
+
+    Rows are keyed by ``RECID`` (else the index), weighted by ``S006`` if present.
+    """
+    if "long_term_capital_gains" not in puf:
+        return puf
+    result = impute_tax_unit_long_term_capital_gains_basis(
+        puf["long_term_capital_gains"].to_numpy(),
+        tax_unit_ids=(puf["RECID"] if "RECID" in puf else puf.index).to_numpy(),
+        sample_weight=puf["S006"].to_numpy() if "S006" in puf else None,
+        tax_year=tax_year,
+        resource=resource,
+    )
+    puf[LONG_TERM_CAPITAL_GAINS_BASIS] = result.basis
+    puf[LONG_TERM_CAPITAL_GAINS_YEARS_HELD] = result.years_held
+    return puf
+
+
+def _assign_holding_period_buckets(
+    gains: np.ndarray,
+    *,
+    tax_unit_ids: np.ndarray,
+    sample_weight: np.ndarray,
+    tax_year: int,
+    resource: CapitalGainsBasisResource,
+    imputation_version: str,
+) -> np.ndarray:
+    """Assign each nonzero-gain unit a holding-period bucket index.
+
+    Gains and losses are handled separately. Within a sign, units are put
+    in hashed-key order and their cumulative weighted-dollar midpoints are
+    cut at the cumulative bucket shares, so bucket dollar totals track the
+    resource shares. Units with a zero gain keep the index -1.
+    """
+
+    assigned_buckets = np.full(gains.shape[0], -1, dtype=int)
+    shares_by_label = {
+        "gain": (1, resource.gain_dollar_shares),
+        "loss": (-1, resource.loss_dollar_shares),
+    }
+
+    for label, (sign, shares) in shares_by_label.items():
+        rows = np.flatnonzero(gains * sign > 0)
+        if rows.size == 0:
+            continue
+
+        # Upper cumulative share of every bucket.
+        cut_points = np.cumsum(_normalise_probabilities(shares))
+        top_bucket = cut_points.size - 1
+        dollars = np.abs(gains[rows]) * sample_weight[rows]
+        keys = _stable_uniforms(
+            tax_unit_ids[rows],
+            salt=f"{imputation_version}|{tax_year}|bucket|{label}",
+        )
+        if dollars.sum() <= 0:
+            # No usable dollar mass: place each unit by its own hashed key.
+            positions = keys
+        else:
+            # Stable sort keeps the ordering reproducible when keys tie.
+            order = np.argsort(keys, kind="mergesort")
+            rows = rows[order]
+            dollars = dollars[order]
+            positions = (np.cumsum(dollars) - 0.5 * dollars) / dollars.sum()
+        drawn = np.searchsorted(cut_points, positions, side="right")
+        # side="right" can land one past the last bucket; clamp it back.
+        assigned_buckets[rows] = np.minimum(drawn, top_bucket)
+
+    return assigned_buckets
+
+
+def _draw_years_held(
+    buckets: np.ndarray,
+    *,
+    tax_unit_ids: np.ndarray,
+    gains: np.ndarray,
+    tax_year: int,
+    resource: CapitalGainsBasisResource,
+    imputation_version: str,
+) -> np.ndarray:
+    """Draw a holding period in years inside each unit's assigned bucket.
+
+    Draws hash the unit id with a salt of version, tax year, gain/loss sign,
+    and bucket. Units outside every resource bucket keep 0.0.
+    """
+    years = np.zeros_like(gains, dtype=float)
+    salt_prefix = f"{imputation_version}|{tax_year}|years"
+    for bucket in range(len(resource.bucket_names)):
+        rows = np.flatnonzero(buckets == bucket)
+        if rows.size == 0:
+            continue
+        uniforms = np.empty(rows.size, dtype=float)
+        for slot, row in enumerate(rows):
+            sign = "gain" if gains[row] > 0 else "loss"
+            salt = f"{salt_prefix}|{sign}|{bucket}"
+            uniforms[slot] = _stable_uniform(tax_unit_ids[row], salt=salt)
+        years[rows] = _draw_years_in_bucket(bucket, uniforms, resource)
+    return years
+
+
+def _draw_years_in_bucket(
+    bucket: int,
+    uniforms: np.ndarray,
+    resource: CapitalGainsBasisResource,
+) -> np.ndarray:
+    """Map uniforms to years held in ``bucket``; the open top bucket is exponential."""
+    lo = resource.bucket_lower_years[bucket]
+    hi = resource.bucket_upper_years[bucket]
+    if lo >= 20:
+        # Keep u below 1 so a uniform of exactly 1.0 cannot yield infinite years.
+        u = np.clip(uniforms, 0.0, np.nextafter(1.0, 0.0))
+        return 20 + (-np.log1p(-u) / _top_bucket_exponential_rate(resource))
+    shape, scale = resource.weibull_shape, resource.weibull_scale
+    # Other buckets: Weibull on (years - 1), truncated to the bucket's bounds.
+    f_lo, f_hi = _weibull_cdf(lo - 1, shape, scale), _weibull_cdf(hi - 1, shape, scale)
+    return 1 + _weibull_quantile(f_lo + uniforms * (f_hi - f_lo), shape, scale)
+
+
+def _basis_from_gains_and_years(
+    gains: np.ndarray,
+    years_held: np.ndarray,
+    resource: CapitalGainsBasisResource,
+) -> np.ndarray:
+    """Back out basis from each gain and its basis-to-sales ratio at years held."""
+    size = np.abs(gains)
+    basis = np.zeros_like(gains, dtype=float)
+    winners = gains > 0  # basis = gain * r / (1 - r)
+    r_gain = _gain_basis_sales_ratio(years_held[winners], resource)
+    basis[winners] = size[winners] * r_gain / (1 - r_gain)
+    losers = gains < 0  # basis = |loss| * r / (r - 1)
+    r_loss = _loss_basis_sales_ratio(years_held[losers], resource)
+    basis[losers] = size[losers] * r_loss / (r_loss - 1)
+    return basis
+
+
+def _gain_basis_sales_ratio(
+    years_held: np.ndarray,
+    resource: CapitalGainsBasisResource,
+) -> np.ndarray:
+    """Return the clipped basis-to-sales ratio for gains at ``years_held``.
+
+    Interpolates between bucket midpoints, then decays past the top-bucket mean.
+    """
+    h_top = _top_bucket_mean(resource)
+    # np.array copies, so array-valued resource fields are never modified.
+    h_knots = np.array(resource.bucket_midpoint_years, dtype=float)
+    h_knots[-1] = h_top
+    ratio_knots = np.asarray(resource.gain_basis_sales_ratios, dtype=float)
+    interpolated = np.interp(np.minimum(years_held, h_top), h_knots, ratio_knots)
+    # Constant growth rate that reproduces the last knot exactly at h_top.
+    g_extrap = (1 / ratio_knots[-1]) ** (1 / h_top) - 1
+    extrapolated = 1 / (1 + g_extrap) ** years_held
+    ratio = np.where(years_held <= h_top, interpolated, extrapolated)
+    return np.clip(ratio, resource.gain_bsr_floor, resource.gain_bsr_ceiling)
+
+
+def _loss_basis_sales_ratio(
+    years_held: np.ndarray,
+    resource: CapitalGainsBasisResource,
+) -> np.ndarray:
+    """Return the clipped basis-to-sales ratio for losses at ``years_held``.
+
+    Ratios are interpolated between bucket midpoints, with the last knot
+    moved to the top-bucket mean, and held flat outside the knot range.
+    """
+    # np.array copies, so array-valued resource fields are never modified.
+    h_knots = np.array(resource.bucket_midpoint_years, dtype=float)
+    h_knots[-1] = _top_bucket_mean(resource)
+    ratio_knots = np.asarray(resource.loss_basis_sales_ratios, dtype=float)
+    ratio = np.interp(years_held, h_knots, ratio_knots)
+    return np.clip(ratio, resource.loss_bsr_floor, resource.loss_bsr_ceiling)
+
+
+def _top_bucket_mean(resource: CapitalGainsBasisResource) -> float:
+    return 20 + 1 / _top_bucket_exponential_rate(resource)  # 20 + exponential mean
+
+
+def _top_bucket_exponential_rate(resource: CapitalGainsBasisResource) -> float:
+    """Return the exponential rate used for holding periods in the top bucket.
+
+    The Weibull density at 19 is divided by the Weibull mass on [14, 19]
+    and scaled by the ratio of gain dollar shares at indices 7 and 8.
+    """
+    shape, scale = resource.weibull_shape, resource.weibull_scale
+    boundary_density = _weibull_pdf(19, shape, scale)
+    mass_below_boundary = _weibull_cdf(19, shape, scale) - _weibull_cdf(
+        14, shape, scale
+    )
+    shares = np.asarray(resource.gain_dollar_shares, dtype=float)
+    return boundary_density / mass_below_boundary * shares[7] / shares[8]
+
+
+def _weibull_cdf(x: float, shape: float, scale: float) -> float:
+    return 1 - np.exp(-((x / scale) ** shape))  # P(X <= x) for Weibull(shape, scale)
+
+
+def _weibull_pdf(x: float, shape: float, scale: float) -> float:
+    """Return the Weibull(shape, scale) density at ``x``."""
+    z = x / scale
+    return (shape / scale) * (z ** (shape - 1)) * np.exp(-(z**shape))
+
+
+def _weibull_quantile(u: np.ndarray, shape: float, scale: float) -> np.ndarray:
+    u = np.clip(u, np.finfo(float).tiny, np.nextafter(1.0, 0.0))
+    return scale * (-np.log1p(-u)) ** (1 / shape)  # inverse CDF at clipped u
+
+
+def _normalise_probabilities(probabilities: tuple[float, ...]) -> np.ndarray:
+    """Scale shares to sum to 1; raise ValueError unless the sum is finite and > 0."""
+    shares = np.asarray(probabilities, dtype=float)
+    if not (np.isfinite(total := shares.sum()) and total > 0):
+        raise ValueError("holding-period probabilities must sum to a positive value")
+    return shares / total
+
+
+def _stable_uniforms(values: np.ndarray, *, salt: str) -> np.ndarray:
+    """Return one salted hash-based uniform per entry of ``values``."""
+    draws = (_stable_uniform(value, salt=salt) for value in values)
+    return np.fromiter(draws, dtype=float)
+
+
+def _stable_uniform(value, *, salt: str) -> float:
+    """Hash ``salt|value`` with BLAKE2b into a reproducible uniform below 1.0."""
+    payload = f"{salt}|{value}".encode("utf-8")
+    digest = hashlib.blake2b(payload, digest_size=8).digest()
+    integer = int.from_bytes(digest, byteorder="big", signed=False)
+    # Float rounding maps the largest 64-bit digests to exactly 1.0; cap below it.
+    return min((integer + 0.5) / 2**64, 1.0 - 2.0**-53)
diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py
index b29914ca3..68d8a59a9 100644
--- a/tests/unit/calibration/test_calibration_puf_impute.py
+++ b/tests/unit/calibration/test_calibration_puf_impute.py
@@ -10,6 +10,7 @@
 from policyengine_us_data.calibration import puf_impute as puf_impute_module
 from policyengine_us_data.calibration.puf_impute import (
     DEMOGRAPHIC_PREDICTORS,
+    DETERMINISTIC_IMPUTED_VARIABLES,
     IMPUTED_VARIABLES,
     OVERRIDDEN_IMPUTED_VARIABLES,
     _impute_retirement_contributions,
@@ -207,6 +208,14 @@ def test_demographic_predictors_excludes_state(self):
     def test_imputed_variables_not_empty(self):
         assert len(IMPUTED_VARIABLES) > 0
 
+    def test_capital_gains_basis_fields_are_stage_one_outputs(self):
+        """Both basis fields are imputed variables handled deterministically."""
+        expected = {
+            "long_term_capital_gains_basis",
+            "long_term_capital_gains_years_held",
+        }
+        assert expected <= set(IMPUTED_VARIABLES) & set(DETERMINISTIC_IMPUTED_VARIABLES)
+
     def test_overridden_subset_of_imputed(self):
         for var in OVERRIDDEN_IMPUTED_VARIABLES:
             assert var in IMPUTED_VARIABLES
@@ -301,6 +310,40 @@ def fake_run_qrf_imputation(*args, **kwargs):
         np.testing.assert_array_equal(employment[:20], data["employment_income"][2024])
         np.testing.assert_array_equal(employment[20:], y_full["employment_income"])
 
+    def test_capital_gains_basis_is_deterministically_imputed(self, monkeypatch):
+        data = _make_mock_data(n_persons=4, n_households=2)
+        data["person_tax_unit_id"] = {2024: np.array([1, 1, 2, 2])}
+        data["person_household_id"] = {2024: np.array([1, 1, 2, 2])}
+        data["long_term_capital_gains"] = {
+            2024: np.array([100.0, -40.0, 0.0, 200.0], dtype=np.float32)
+        }
+
+        monkeypatch.setattr(
+            puf_impute_module,
+            "has_policyengine_us_variables",
+            lambda *variables: True,
+        )
+
+        result = puf_clone_dataset(
+            data=data,
+            state_fips=np.array([1, 2]),
+            time_period=2024,
+            skip_qrf=True,
+        )
+
+        basis = result["long_term_capital_gains_basis"][2024]
+        years = result["long_term_capital_gains_years_held"][2024]
+        gains = result["long_term_capital_gains"][2024]
+        tax_unit_ids = result["person_tax_unit_id"][2024]
+
+        assert np.all(basis[gains != 0] > 0)
+        assert np.all(years[gains != 0] > 0)
+        assert np.all(basis[gains == 0] == 0)
+        assert np.all(years[gains == 0] == 0)
+        for tax_unit_id in np.unique(tax_unit_ids[gains != 0]):
+            mask = (tax_unit_ids == tax_unit_id) & (gains != 0)
+            assert np.unique(years[mask]).size == 1
+
     def test_sstb_qbi_split_variables_imputed(self):
         expected = {
             "sstb_self_employment_income",
diff --git a/tests/unit/datasets/test_irs_puf.py b/tests/unit/datasets/test_irs_puf.py
index 8d1f8e380..31c62d1c4 100644
--- a/tests/unit/datasets/test_irs_puf.py
+++ b/tests/unit/datasets/test_irs_puf.py
@@ -6,6 +6,7 @@
     PUF,
     QBI_SIMULATION_VERSION,
     QBI_SIMULATION_VERSION_ATTR,
+    _person_financial_value_from_puf_row,
 )
 
 
@@ -25,6 +26,30 @@ def test_irs_puf_generates(year: int):
     dataset_by_year[year](require=True)
 
 
+def test_puf_person_split_keeps_capital_gains_holding_period_collapsed():
+    row = {
+        "long_term_capital_gains": 1_000.0,
+        "long_term_capital_gains_years_held": 12.5,
+    }
+
+    assert (
+        _person_financial_value_from_puf_row(
+            "long_term_capital_gains_years_held",
+            row,
+            0.25,
+        )
+        == 12.5
+    )
+    assert (
+        _person_financial_value_from_puf_row(
+            "long_term_capital_gains_years_held",
+            row,
+            0.0,
+        )
+        == 0
+    )
+
+
 def test_puf_load_dataset_backfills_sstb_split_inputs(tmp_path):
     class DummyPUF(PUF):
         label = "Dummy PUF"
diff --git a/tests/unit/test_capital_gains_basis.py b/tests/unit/test_capital_gains_basis.py
new file mode 100644
index 000000000..a102b4f95
--- /dev/null
+++ b/tests/unit/test_capital_gains_basis.py
@@ -0,0 +1,99 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from policyengine_us_data.utils.capital_gains_basis import (
+    LONG_TERM_CAPITAL_GAINS_BASIS,
+    LONG_TERM_CAPITAL_GAINS_YEARS_HELD,
+    add_long_term_capital_gains_basis_to_puf_frame,
+    impute_person_level_long_term_capital_gains_basis,
+    impute_tax_unit_long_term_capital_gains_basis,
+)
+
+
+def test_tax_unit_imputation_is_record_stable_under_shuffle():
+    gains = np.array([1_000, 20_000, -4_000, 0, 7_500, -12_000], dtype=float)
+    ids = np.array([101, 102, 103, 104, 105, 106])
+    weights = np.array([10, 2, 5, 1, 8, 3], dtype=float)
+
+    direct = impute_tax_unit_long_term_capital_gains_basis(
+        gains,
+        tax_unit_ids=ids,
+        sample_weight=weights,
+        tax_year=2026,
+    )
+
+    order = np.array([4, 2, 0, 5, 1, 3])
+    shuffled = impute_tax_unit_long_term_capital_gains_basis(
+        gains[order],
+        tax_unit_ids=ids[order],
+        sample_weight=weights[order],
+        tax_year=2026,
+    )
+
+    reverse_order = np.argsort(order)
+    np.testing.assert_allclose(direct.basis, shuffled.basis[reverse_order])
+    np.testing.assert_allclose(direct.years_held, shuffled.years_held[reverse_order])
+    np.testing.assert_array_equal(
+        direct.holding_period_bucket,
+        shuffled.holding_period_bucket[reverse_order],
+    )
+
+
+def test_zero_gain_records_get_zero_basis_and_holding_period():
+    imputation = impute_tax_unit_long_term_capital_gains_basis(
+        np.array([0.0]),
+        tax_unit_ids=np.array([1]),
+        tax_year=2026,
+    )
+
+    assert imputation.basis[0] == 0
+    assert imputation.years_held[0] == 0
+    assert imputation.holding_period_bucket[0] == -1
+
+
+def test_person_allocation_preserves_collapsed_tax_unit_basis():
+    gains = np.array([100.0, -40.0, 0.0, -80.0])
+    tax_unit_ids = np.array([1, 1, 2, 3])
+    person_ids = np.array([11, 12, 21, 31])
+
+    person_imputation = impute_person_level_long_term_capital_gains_basis(
+        gains,
+        person_tax_unit_ids=tax_unit_ids,
+        person_ids=person_ids,
+        tax_year=2026,
+    )
+    tax_unit_imputation = impute_tax_unit_long_term_capital_gains_basis(
+        np.array([60.0, 0.0, -80.0]),
+        tax_unit_ids=np.array([1, 2, 3]),
+        tax_year=2026,
+    )
+
+    assert person_imputation.years_held[0] == pytest.approx(
+        person_imputation.years_held[1]
+    )
+    assert person_imputation.basis[:2].sum() == pytest.approx(
+        tax_unit_imputation.basis[0]
+    )
+    assert person_imputation.basis[2] == 0
+    assert person_imputation.years_held[2] == 0
+    assert person_imputation.basis[3] == pytest.approx(tax_unit_imputation.basis[2])
+
+
+def test_puf_frame_helper_adds_basis_and_years():
+    puf = pd.DataFrame(
+        {
+            "RECID": [10, 11, 12],
+            "S006": [100.0, 200.0, 300.0],
+            "long_term_capital_gains": [5_000.0, -2_000.0, 0.0],
+        }
+    )
+
+    result = add_long_term_capital_gains_basis_to_puf_frame(puf.copy(), tax_year=2026)
+
+    assert LONG_TERM_CAPITAL_GAINS_BASIS in result
+    assert LONG_TERM_CAPITAL_GAINS_YEARS_HELD in result
+    assert result.loc[0, LONG_TERM_CAPITAL_GAINS_BASIS] > 0
+    assert result.loc[1, LONG_TERM_CAPITAL_GAINS_BASIS] > 0
+    assert result.loc[2, LONG_TERM_CAPITAL_GAINS_BASIS] == 0
+    assert result.loc[2, LONG_TERM_CAPITAL_GAINS_YEARS_HELD] == 0

From a581a14f13496267b03894c1d485816ea85db1d4 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 24 May 2026 09:19:42 -0400
Subject: [PATCH 2/5] Add changelog for capital gains basis imputation

---
 changelog.d/1128.added | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 changelog.d/1128.added

diff --git a/changelog.d/1128.added b/changelog.d/1128.added
new file mode 100644
index 000000000..10d7af0b2
--- /dev/null
+++ b/changelog.d/1128.added
@@ -0,0 +1 @@
+Added deterministic collapsed long-term capital gains basis and holding-period imputation.

From d15956785bba3ecbd00aaf898fa58d27d673bc2f Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Sun, 24 May 2026 09:35:24 -0400
Subject: [PATCH 3/5] Fix capital gains basis read backfill

---
 policyengine_us_data/datasets/puf/puf.py      | 86 ++++++++++++++++++-
 .../test_calibration_puf_impute.py            | 53 ++++++++++++
 tests/unit/datasets/test_irs_puf.py           | 86 +++++++++++++++++++
 3 files changed, 223 insertions(+), 2 deletions(-)

diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py
index 60a344442..6a2c0c3f0 100644
--- a/policyengine_us_data/datasets/puf/puf.py
+++ b/policyengine_us_data/datasets/puf/puf.py
@@ -1340,6 +1340,88 @@ def _ensure_sstb_split_inputs(self) -> dict[str, np.ndarray]:
 
         return overrides
 
+    def _capital_gains_basis_overrides(
+        self,
+        existing_overrides: dict[str, np.ndarray] | None = None,
+    ) -> dict[str, np.ndarray]:
+        if not has_policyengine_us_variables(*CAPITAL_GAINS_BASIS_VARIABLES):
+            return {}
+        if not self.file_path.exists():
+            return {}
+
+        existing_overrides = existing_overrides or {}
+        with h5py.File(self.file_path, "r") as file_handle:
+            keys = set(file_handle.keys()) | set(existing_overrides)
+            if all(variable in keys for variable in CAPITAL_GAINS_BASIS_VARIABLES):
+                return {}
+            if (
+                "long_term_capital_gains" not in keys
+                or "person_tax_unit_id" not in keys
+            ):
+                return {}
+
+            gains = self._values_from_file_or_overrides(
+                file_handle,
+                "long_term_capital_gains",
+                existing_overrides,
+                0,
+            )
+            length = len(gains)
+            arrays = {
+                "long_term_capital_gains": gains,
+                "person_tax_unit_id": self._values_from_file_or_overrides(
+                    file_handle,
+                    "person_tax_unit_id",
+                    existing_overrides,
+                    length,
+                ),
+            }
+            for variable in (
+                "person_id",
+                "household_weight",
+                "person_household_id",
+                "household_id",
+                *CAPITAL_GAINS_BASIS_VARIABLES,
+            ):
+                if variable in keys:
+                    arrays[variable] = self._values_from_file_or_overrides(
+                        file_handle,
+                        variable,
+                        existing_overrides,
+                        length,
+                    )
+
+        arrays = _with_capital_gains_basis_inputs(arrays, self.time_period)
+        return {
+            variable: np.asarray(arrays[variable])
+            for variable in CAPITAL_GAINS_BASIS_VARIABLES
+            if variable not in keys and variable in arrays
+        }
+
+    def _ensure_capital_gains_basis_inputs(
+        self,
+        existing_overrides: dict[str, np.ndarray] | None = None,
+    ) -> dict[str, np.ndarray]:
+        overrides = self._capital_gains_basis_overrides(existing_overrides)
+        if not overrides:
+            return {}
+
+        try:  # best effort: store the backfilled arrays in the dataset file
+            with h5py.File(self.file_path, "r+") as file_handle:
+                for key, values in overrides.items():
+                    self._replace_array(file_handle, key, values)
+        except OSError:
+            pass  # file could not be opened/written (e.g. read-only); ignore
+
+        return overrides  # returned whether or not the write above succeeded
+
+    def _ensure_read_overrides(self) -> dict[str, np.ndarray]:
+        sstb_overrides = self._ensure_sstb_split_inputs()
+        capital_gains_overrides = self._ensure_capital_gains_basis_inputs(
+            sstb_overrides
+        )
+        return {**sstb_overrides, **capital_gains_overrides}
+
     class _OverrideView:
         def __init__(self, backing, overrides: dict[str, np.ndarray]):
             self._backing = backing
@@ -1393,7 +1475,7 @@ def __getattr__(self, name):
 
     def load(self, key=None, mode="r"):
         if mode == "r":
-            overrides = self._ensure_sstb_split_inputs()
+            overrides = self._ensure_read_overrides()
             if key in overrides:
                 return overrides[key]
             if key is None and overrides:
@@ -1401,7 +1483,7 @@ def load(self, key=None, mode="r"):
         return super().load(key=key, mode=mode)
 
     def load_dataset(self):
-        overrides = self._ensure_sstb_split_inputs()
+        overrides = self._ensure_read_overrides()
         arrays = super().load_dataset()
         arrays.update(overrides)
         return arrays
diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py
index 68d8a59a9..68f442bfe 100644
--- a/tests/unit/calibration/test_calibration_puf_impute.py
+++ b/tests/unit/calibration/test_calibration_puf_impute.py
@@ -216,6 +216,59 @@ def test_capital_gains_basis_fields_are_stage_one_outputs(self):
         assert expected <= set(IMPUTED_VARIABLES)
         assert expected <= set(DETERMINISTIC_IMPUTED_VARIABLES)
 
+    def test_qrf_excludes_deterministic_capital_gains_basis_outputs(
+        self,
+        monkeypatch,
+    ):
+        import policyengine_us
+
+        data = _make_mock_data(n_persons=4, n_households=2)
+
+        class FakeCalculation:
+            values = np.array([100.0, 200.0, 300.0, 400.0], dtype=np.float32)
+
+        class FakeMicrosimulation:
+            def __init__(self, dataset):
+                self.dataset = dataset
+
+            def calculate(self, variable, map_to=None):
+                return FakeCalculation()
+
+            def calculate_dataframe(self, variables):
+                return pd.DataFrame(
+                    {variable: np.arange(4, dtype=np.float32) for variable in variables}
+                )
+
+        captured_output_vars = []
+
+        def fake_sequential_qrf(X_train, X_test, predictors, output_vars):
+            captured_output_vars.append(tuple(output_vars))
+            return {
+                variable: np.zeros(len(X_test), dtype=np.float32)
+                for variable in output_vars
+            }
+
+        monkeypatch.setattr(policyengine_us, "Microsimulation", FakeMicrosimulation)
+        monkeypatch.setattr(
+            puf_impute_module,
+            "_sequential_qrf",
+            fake_sequential_qrf,
+        )
+
+        puf_impute_module._run_qrf_imputation(
+            data=data,
+            time_period=2024,
+            puf_dataset=object(),
+        )
+
+        deterministic_outputs = set(DETERMINISTIC_IMPUTED_VARIABLES)
+        assert captured_output_vars
+        for output_vars in captured_output_vars:
+            assert deterministic_outputs.isdisjoint(output_vars)
+        assert set(captured_output_vars[0]) == (
+            set(IMPUTED_VARIABLES) - deterministic_outputs
+        )
+
     def test_overridden_subset_of_imputed(self):
         for var in OVERRIDDEN_IMPUTED_VARIABLES:
             assert var in IMPUTED_VARIABLES
diff --git a/tests/unit/datasets/test_irs_puf.py b/tests/unit/datasets/test_irs_puf.py
index 31c62d1c4..f19ec4fa8 100644
--- a/tests/unit/datasets/test_irs_puf.py
+++ b/tests/unit/datasets/test_irs_puf.py
@@ -2,6 +2,7 @@
 import numpy as np
 import pytest
 
+from policyengine_us_data.datasets.puf import puf as puf_module
 from policyengine_us_data.datasets.puf.puf import (
     PUF,
     QBI_SIMULATION_VERSION,
@@ -14,6 +15,19 @@ def _mark_current_qbi_simulation(file_handle):
     file_handle.attrs[QBI_SIMULATION_VERSION_ATTR] = QBI_SIMULATION_VERSION
 
 
+def _write_capital_gains_basis_source_file(path):
+    with h5py.File(path, "w") as file_handle:
+        file_handle.create_dataset("person_id", data=np.array([1, 2, 3, 4]))
+        file_handle.create_dataset("person_tax_unit_id", data=np.array([1, 1, 2, 2]))
+        file_handle.create_dataset("person_household_id", data=np.array([1, 1, 2, 2]))
+        file_handle.create_dataset("household_id", data=np.array([1, 2]))
+        file_handle.create_dataset("household_weight", data=np.array([100.0, 200.0]))
+        file_handle.create_dataset(
+            "long_term_capital_gains",
+            data=np.array([100.0, -40.0, 0.0, 200.0]),
+        )
+
+
 @pytest.mark.skip(reason="This test requires private data.")
 @pytest.mark.parametrize("year", [2015])
 def test_irs_puf_generates(year: int):
@@ -50,6 +64,78 @@ def test_puf_person_split_keeps_capital_gains_holding_period_collapsed():
     )
 
 
+def test_puf_load_dataset_backfills_capital_gains_basis_inputs(
+    tmp_path,
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        puf_module,
+        "has_policyengine_us_variables",
+        lambda *variables: True,
+    )
+
+    class DummyPUF(PUF):
+        label = "Dummy PUF"
+        name = "dummy_puf"
+        time_period = 2024
+        file_path = tmp_path / "dummy_puf.h5"
+
+    _write_capital_gains_basis_source_file(DummyPUF.file_path)
+
+    arrays = DummyPUF().load_dataset()
+
+    basis = arrays["long_term_capital_gains_basis"]
+    years = arrays["long_term_capital_gains_years_held"]
+    gains = arrays["long_term_capital_gains"]
+
+    assert np.all(basis[gains != 0] > 0)
+    assert np.all(years[gains != 0] > 0)
+    assert np.all(basis[gains == 0] == 0)
+    assert np.all(years[gains == 0] == 0)
+
+    with h5py.File(DummyPUF.file_path, "r") as file_handle:
+        assert "long_term_capital_gains_basis" in file_handle
+        assert "long_term_capital_gains_years_held" in file_handle
+
+
+def test_puf_load_key_backfills_read_only_capital_gains_basis_inputs(
+    tmp_path,
+    monkeypatch,
+):
+    monkeypatch.setattr(
+        puf_module,
+        "has_policyengine_us_variables",
+        lambda *variables: True,
+    )
+
+    class DummyPUF(PUF):
+        label = "Dummy PUF"
+        name = "dummy_puf"
+        time_period = 2024
+        file_path = tmp_path / "dummy_puf.h5"
+
+    _write_capital_gains_basis_source_file(DummyPUF.file_path)
+    DummyPUF.file_path.chmod(0o444)
+
+    dataset = DummyPUF()
+    try:
+        basis = dataset.load("long_term_capital_gains_basis")
+        years = dataset.load("long_term_capital_gains_years_held")
+        reader = dataset.load()
+        np.testing.assert_array_equal(
+            reader["long_term_capital_gains_basis"],
+            basis,
+        )
+        reader.close()
+    finally:
+        DummyPUF.file_path.chmod(0o644)
+
+    assert np.all(basis[[0, 1, 3]] > 0)
+    assert basis[2] == 0
+    assert np.all(years[[0, 1, 3]] > 0)
+    assert years[2] == 0
+
+
 def test_puf_load_dataset_backfills_sstb_split_inputs(tmp_path):
     class DummyPUF(PUF):
         label = "Dummy PUF"

From 6c6922327e31bd832abedfc211ae7b46f35c966e Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Mon, 25 May 2026 06:17:06 -0400
Subject: [PATCH 4/5] Use unreleased policyengine-us git ref

---
 .../check_policyengine_us_dependency.py       | 54 ++++++++++++++++---
 pyproject.toml                                |  2 +-
 tests/unit/test_publication_scripts.py        | 28 ++++++++++
 uv.lock                                       | 10 ++--
 4 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/.github/scripts/check_policyengine_us_dependency.py b/.github/scripts/check_policyengine_us_dependency.py
index 1fd574bf1..85fdf4bde 100644
--- a/.github/scripts/check_policyengine_us_dependency.py
+++ b/.github/scripts/check_policyengine_us_dependency.py
@@ -18,6 +18,8 @@
 PYPI_JSON_TIMEOUT_SECONDS = 20
 POLICYENGINE_US = "policyengine-us"
 STALE_LOCK_PREFIX = "uv.lock has policyengine-us "
+LOCK_GIT_REF_PREFIX = "uv.lock resolves policyengine-us from a Git ref."
+PROJECT_GIT_REF_PREFIX = "pyproject.toml pins policyengine-us to a Git ref."
 
 
 def _annotation(level: str, message: str) -> str:
@@ -86,6 +88,8 @@ def _latest_pypi_version() -> str:
 def check_dependency(root: Path, latest_version: str | None = None) -> list[str]:
     locked_version, source = _locked_policyengine_us(root)
     project_dependency = _project_policyengine_us_dependency(root)
+    lock_uses_git_ref = "git" in source
+    project_uses_git_ref = "@" in project_dependency and "git+" in project_dependency
 
     violations: list[str] = []
     if (
@@ -99,27 +103,40 @@ def check_dependency(root: Path, latest_version: str | None = None) -> list[str]
         )
 
     expected_dependency = f"{POLICYENGINE_US}=={locked_version}"
-    if project_dependency != expected_dependency:
+    if not project_uses_git_ref and project_dependency != expected_dependency:
         violations.append(
             f"pyproject.toml must pin {expected_dependency} to match uv.lock; "
             f"found {project_dependency!r}."
         )
 
-    if "git" in source:
+    if lock_uses_git_ref:
         violations.append(
-            "uv.lock resolves policyengine-us from a Git ref. Prefer an exact "
+            f"{LOCK_GIT_REF_PREFIX} Prefer an exact "
             f"PyPI release pin once policyengine-us {locked_version} is published."
         )
 
-    if "@" in project_dependency and "git+" in project_dependency:
+    if project_uses_git_ref:
         violations.append(
-            "pyproject.toml pins policyengine-us to a Git ref. Prefer an exact "
+            f"{PROJECT_GIT_REF_PREFIX} Prefer an exact "
             "PyPI release pin for production data builds."
         )
 
     return violations
 
 
+def _is_unreleased_git_ref_violation(
+    violation: str,
+    locked_version: str,
+    latest_version: str | None,
+) -> bool:
+    """Return True for a Git-ref violation whose locked version is newer than PyPI."""
+    if latest_version is None:
+        return False
+    if not violation.startswith((LOCK_GIT_REF_PREFIX, PROJECT_GIT_REF_PREFIX)):
+        return False
+    return _compare_versions(locked_version, latest_version) > 0
+
+
 def main() -> int:
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -163,17 +180,30 @@ def main() -> int:
         print(f"policyengine-us dependency is current at {locked_version}.")
         return 0
 
+    locked_version, _source = _locked_policyengine_us(REPO_ROOT)
     has_blocking_violation = False
     allowed_stale_version = False
+    allowed_unreleased_git_ref = False
     for violation in violations:
         stale_version_violation = violation.startswith(STALE_LOCK_PREFIX)
         allowed_by_override = allow_stale and stale_version_violation
-        level = "warning" if args.mode == "warn" or allowed_by_override else "error"
+        allowed_git_ref = _is_unreleased_git_ref_violation(
+            violation,
+            locked_version,
+            latest_version,
+        )
+        level = (
+            "warning"
+            if args.mode == "warn" or allowed_by_override or allowed_git_ref
+            else "error"
+        )
         print(_annotation(level, violation))
-        if args.mode == "fail" and not allowed_by_override:
+        if args.mode == "fail" and not allowed_by_override and not allowed_git_ref:
             has_blocking_violation = True
         if allowed_by_override:
             allowed_stale_version = True
+        if allowed_git_ref:
+            allowed_unreleased_git_ref = True
 
     if allowed_stale_version:
         print(
@@ -183,10 +213,18 @@ def main() -> int:
                 "policyengine-us lagging the latest PyPI release.",
             )
         )
+    if allowed_unreleased_git_ref:
+        print(
+            _annotation(
+                "warning",
+                "policyengine-us is pinned to an unreleased Git ref; switch to "
+                f"policyengine-us=={locked_version} once that PyPI release exists.",
+            )
+        )
 
     if has_blocking_violation:
         return 1
-    if allowed_stale_version:
+    if allowed_stale_version or allowed_unreleased_git_ref:
         return 0
 
     return 1 if args.mode == "fail" else 0
diff --git a/pyproject.toml b/pyproject.toml
index 48c623b35..43825811c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,7 +22,7 @@ classifiers = [
     "Programming Language :: Python :: 3.14",
 ]
 dependencies = [
-    "policyengine-us==1.705.15",
+    "policyengine-us @ git+https://github.com/PolicyEngine/policyengine-us@0dcc69aa7a38be901141d38c8dbb7a9c870f2561",
     # policyengine-core 3.26.1 is the current 3.26.x runtime and includes the fix for
     # PolicyEngine/policyengine-core#482 (user-set ETERNITY inputs lost
     # after _invalidate_all_caches) and is required by policyengine-us 1.682.1+.
diff --git a/tests/unit/test_publication_scripts.py b/tests/unit/test_publication_scripts.py
index d7df93b0c..8180847f4 100644
--- a/tests/unit/test_publication_scripts.py
+++ b/tests/unit/test_publication_scripts.py
@@ -282,6 +282,34 @@ def test_policyengine_us_dependency_check_flags_git_refs(tmp_path):
     assert any("Git ref" in violation for violation in violations)
 
 
+def test_policyengine_us_dependency_check_allows_unreleased_git_refs(
+    tmp_path,
+    monkeypatch,
+):
+    module = _load_script(
+        ".github/scripts/check_policyengine_us_dependency.py",
+        "check_policyengine_us_dependency_unreleased_git_test",
+    )
+    _write_pyproject_with_policyengine_us(
+        tmp_path,
+        "policyengine-us @ git+https://github.com/PolicyEngine/policyengine-us@abc",
+    )
+    _write_uv_lock_for_policyengine_us(
+        tmp_path,
+        "1.691.12",
+        source='{ git = "https://github.com/PolicyEngine/policyengine-us?rev=abc#abc" }',
+    )
+    monkeypatch.setattr(module, "REPO_ROOT", tmp_path)
+    monkeypatch.setattr(module, "_latest_pypi_version", lambda: "1.691.11")
+    monkeypatch.setattr(
+        sys,
+        "argv",
+        ["check_policyengine_us_dependency.py", "--mode", "fail"],
+    )
+
+    assert module.main() == 0
+
+
 def test_policyengine_us_dependency_check_flags_non_exact_pyproject_pin(tmp_path):
     module = _load_script(
         ".github/scripts/check_policyengine_us_dependency.py",
diff --git a/uv.lock b/uv.lock
index 5cda418f1..eb9abfb9e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2122,8 +2122,8 @@ wheels = [
 
 [[package]]
 name = "policyengine-us"
-version = "1.705.15"
-source = { registry = "https://pypi.org/simple" }
+version = "1.706.14"
+source = { git = "https://github.com/PolicyEngine/policyengine-us?rev=0dcc69aa7a38be901141d38c8dbb7a9c870f2561#0dcc69aa7a38be901141d38c8dbb7a9c870f2561" }
 dependencies = [
     { name = "microdf-python" },
     { name = "pandas" },
@@ -2132,10 +2132,6 @@ dependencies = [
     { name = "tables" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/54/cc/921f994e5c688be0f45dbe8d7bce209ee45225d328a9724cf363df225ce8/policyengine_us-1.705.15.tar.gz", hash = "sha256:559d79690cb1d79615479ed2c71a53510e8cfea56d2398a37120f6797548c26b", size = 9927111, upload-time = "2026-05-24T02:32:30.769Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/24/02/b8ae7ec50124bc4f442c8e3f662bf02d0f670d288c63e367dff75ed8c374/policyengine_us-1.705.15-py3-none-any.whl", hash = "sha256:aeafbbbef2a8de88cb73da8c234943784789023e7d32e05a9560e1a7dd713c70", size = 10758474, upload-time = "2026-05-24T02:32:26.588Z" },
-]
 
 [[package]]
 name = "policyengine-us-data"
@@ -2204,7 +2200,7 @@ requires-dist = [
     { name = "pandas", specifier = ">=2.3.1" },
     { name = "pip-system-certs", specifier = ">=3.0" },
     { name = "policyengine-core", specifier = ">=3.26.1,<3.27" },
-    { name = "policyengine-us", specifier = "==1.705.15" },
+    { name = "policyengine-us", git = "https://github.com/PolicyEngine/policyengine-us?rev=0dcc69aa7a38be901141d38c8dbb7a9c870f2561" },
     { name = "requests", specifier = ">=2.25.0" },
     { name = "samplics", marker = "extra == 'calibration'" },
     { name = "scipy", specifier = ">=1.15.3" },

From 33326978c4d78f9e565d32673f242d3048a17243 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Mon, 25 May 2026 06:44:44 -0400
Subject: [PATCH 5/5] Handle mock variable metadata in PUF clone

---
 policyengine_us_data/calibration/puf_impute.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py
index e0d7c5895..19a86dd31 100644
--- a/policyengine_us_data/calibration/puf_impute.py
+++ b/policyengine_us_data/calibration/puf_impute.py
@@ -616,7 +616,9 @@ def _map_to_entity(pred_values, variable_name):
         var_meta = tbs.variables.get(variable_name)
         if var_meta is None:
             return pred_values
-        entity = var_meta.entity.key
+        entity = getattr(getattr(var_meta, "entity", None), "key", None)
+        if not isinstance(entity, str):
+            return pred_values
         if entity != "person":
             return cps_sim.populations[entity].value_from_first_person(pred_values)
         return pred_values