diff --git a/.gitignore b/.gitignore index 83e37d6a4..2ddecaa05 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,12 @@ **/__pycache__ **/.DS_STORE **/*.h5 +**/*.h5.lock **/*.npy **/*.csv **/*.csv.gz +**/pu*_csv.zip +**/*.clone_diagnostics.json **/_build **/*.pkl **/*.db diff --git a/changelog.d/1131.fixed b/changelog.d/1131.fixed new file mode 100644 index 000000000..18ab53c26 --- /dev/null +++ b/changelog.d/1131.fixed @@ -0,0 +1 @@ +Refine the SIPP SSI disability training candidate screen to use SGA and approximate SSI countable income, and remove the manual cache-version suffix. diff --git a/policyengine_us_data/calibration/chunked_matrix_assembler.py b/policyengine_us_data/calibration/chunked_matrix_assembler.py index 59b6699bb..fea94b247 100644 --- a/policyengine_us_data/calibration/chunked_matrix_assembler.py +++ b/policyengine_us_data/calibration/chunked_matrix_assembler.py @@ -380,7 +380,9 @@ def run_single_chunk(self, chunk_id: int) -> ChunkResult: continue try: hh_vars[variable] = chunk_sim.calculate( - variable, state.time_period, map_to="household" + variable, + state.time_period, + map_to="household", ).values.astype(np.float32) except Exception as exc: logger.warning( @@ -394,7 +396,9 @@ def run_single_chunk(self, chunk_id: int) -> ChunkResult: continue try: target_entity_vars[variable] = chunk_sim.calculate( - variable, state.time_period, map_to=entity_key + variable, + state.time_period, + map_to=entity_key, ).values.astype(np.float32) except Exception as exc: logger.warning( diff --git a/policyengine_us_data/calibration/sanity_checks.py b/policyengine_us_data/calibration/sanity_checks.py index 003c159a1..7b922ce90 100644 --- a/policyengine_us_data/calibration/sanity_checks.py +++ b/policyengine_us_data/calibration/sanity_checks.py @@ -36,9 +36,7 @@ "income_tax_before_credits", ] -COMPUTED_KEY_MONETARY_VARS = [ - "ssi_federal_fiscal_year_outlays", -] +COMPUTED_KEY_MONETARY_VARS = [] TAKEUP_VARS = [ "takes_up_snap_if_eligible", @@ -665,6 +663,9 @@ def _append_finite_check(var: str, vals) -> None: def _computed_key_monetary_values(h5_path: str, period: int) -> dict[str, np.ndarray]: + if not COMPUTED_KEY_MONETARY_VARS: + return {} + try: from policyengine_us import Microsimulation diff --git a/policyengine_us_data/calibration/source_impute.py b/policyengine_us_data/calibration/source_impute.py index 51d799b54..c7826dd3b 100644 --- a/policyengine_us_data/calibration/source_impute.py +++ b/policyengine_us_data/calibration/source_impute.py @@ -51,6 +51,7 @@ SSI_DISABILITY_EXPORT_VARIABLES, VEHICLE_MODEL_PREDICTORS, build_vehicle_training_frame, + ensure_sipp_file, get_ssi_disability_model, predict_ssi_disability_criteria, preserve_under_65_ssi_disability_criteria, @@ -663,16 +664,26 @@ def _impute_sipp( Returns: Updated data dict. """ - from huggingface_hub import hf_hub_download - from policyengine_us_data.storage import STORAGE_FOLDER - - hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023_slim.csv", - repo_type="model", - local_dir=STORAGE_FOLDER, + tip_cols = ( + [ + "SSUID", + "MONTHCODE", + "WPFINWGT", + "TAGE", + "TPTOTINC", + ] + + SIPP_JOB_OCCUPATION_COLUMNS + + SIPP_TIP_AMOUNT_COLUMNS + + [ + SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN[column] + for column in SIPP_TIP_AMOUNT_COLUMNS + ] + ) + sipp_df = pd.read_csv( + ensure_sipp_file(), + delimiter="|", + usecols=tip_cols, ) - sipp_df = pd.read_csv(STORAGE_FOLDER / "pu2023_slim.csv") tip_amount_columns = [ column for column in SIPP_TIP_AMOUNT_COLUMNS if column in sipp_df @@ -788,12 +799,6 @@ def _impute_sipp( # Asset imputation try: - hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023.csv", - repo_type="model", - local_dir=STORAGE_FOLDER, - ) asset_cols = ( [ "SSUID", @@ -817,7 +822,7 @@ def _impute_sipp( + SIPP_ASSET_ALLOCATION_COLUMNS ) asset_df = pd.read_csv( - STORAGE_FOLDER / "pu2023.csv", + ensure_sipp_file(), delimiter="|", usecols=asset_cols, ) diff --git a/policyengine_us_data/calibration/target_config.yaml b/policyengine_us_data/calibration/target_config.yaml index 51c282129..aae559ac8 100644 --- a/policyengine_us_data/calibration/target_config.yaml +++ b/policyengine_us_data/calibration/target_config.yaml @@ -205,7 +205,7 @@ include: geo_level: national - variable: social_security_survivors geo_level: national - - variable: ssi_federal_fiscal_year_outlays + - variable: ssi geo_level: national - variable: person_count geo_level: national diff --git a/policyengine_us_data/datasets/sipp/README.md b/policyengine_us_data/datasets/sipp/README.md index c30316ae7..a9ce0d3ce 100644 --- a/policyengine_us_data/datasets/sipp/README.md +++ b/policyengine_us_data/datasets/sipp/README.md @@ -18,8 +18,8 @@ SIPP panel wave. These are the canonical reference for every variable name, value code, and weighting construct used by the code in this folder: -- [SIPP 2023 public-use data dictionary (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/data-dictionaries/2023/2023_SIPP_Data_Dictionary.pdf) -- [SIPP 2023 users' guide (PDF, Aug 2026 revision)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/methodology/2023_SIPP_Users_Guide_AUG26.pdf) +- [SIPP 2024 public-use data dictionary (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/data-dictionaries/2024/2024_SIPP_Data_Dictionary.pdf) +- [SIPP 2024 users' guide (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/methodology/2024_SIPP_Users_Guide.pdf) See also: @@ -30,15 +30,16 @@ See also: ## Data products in this folder - `sipp.py` — trains and caches QRF imputation models (`get_tip_model`, - `get_asset_model`, `get_vehicle_model`) from SIPP 2023 person-month + `get_asset_model`, `get_vehicle_model`) from SIPP 2024 person-month data. The training frame is filtered to `MONTHCODE == 12` (December) so every row represents one person-year rather than twelve annualized months. -The raw SIPP CSVs (`pu2023.csv` and the slim variant `pu2023_slim.csv`) -are mirrored on the `PolicyEngine/policyengine-us-data` HuggingFace model -repo and downloaded on demand when a training run is needed. They are -not vendored in this Git repository. +The raw SIPP CSV (`pu2024.csv`) is downloaded on demand when a training +run is needed. The downloader first checks the +`PolicyEngine/policyengine-us-data` HuggingFace model repo for a cached +copy, then falls back to Census's public `pu2024_csv.zip` archive. The raw +file is not vendored in this Git repository. ## Licensing diff --git a/policyengine_us_data/datasets/sipp/sipp.py b/policyengine_us_data/datasets/sipp/sipp.py index 54f90f044..5e89baf82 100644 --- a/policyengine_us_data/datasets/sipp/sipp.py +++ b/policyengine_us_data/datasets/sipp/sipp.py @@ -1,14 +1,19 @@ -import pandas as pd -import numpy as np -from microimpute.models.qrf import QRF -from policyengine_us_data.storage import STORAGE_FOLDER -from policyengine_us_data.utils.randomness import seeded_rng import pickle +from urllib.error import HTTPError, URLError +from urllib.request import urlretrieve +from zipfile import ZipFile + from huggingface_hub import hf_hub_download +import numpy as np +import pandas as pd +from microimpute.models.qrf import QRF + from policyengine_us_data.datasets.cps.tipped_occupation import ( derive_any_treasury_tipped_occupation_code, derive_is_tipped_occupation, ) +from policyengine_us_data.storage import STORAGE_FOLDER +from policyengine_us_data.utils.randomness import seeded_rng from policyengine_us_data.utils.source_quality import ( cap_training_sample, filter_positive_finite_weight_rows, @@ -19,6 +24,15 @@ ) +SIPP_YEAR = 2024 +SIPP_REFERENCE_YEAR = 2023 +SIPP_FULL_FILE = f"pu{SIPP_YEAR}.csv" +SIPP_FULL_ZIP_FILE = f"pu{SIPP_YEAR}_csv.zip" +SIPP_FULL_ZIP_URL = ( + "https://www2.census.gov/programs-surveys/sipp/data/datasets/" + f"{SIPP_YEAR}/{SIPP_FULL_ZIP_FILE}" +) + SIPP_JOB_OCCUPATION_COLUMNS = [f"TJB{i}_OCC" for i in range(1, 8)] SIPP_TIP_AMOUNT_COLUMNS = [f"TJB{i}_TXAMT" for i in range(1, 8)] SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN = { @@ -48,7 +62,6 @@ SSI_DISABILITY_CRITERIA_VARIABLE = "meets_ssi_disability_criteria" SSI_DISABILITY_MODEL_VARIABLE = SSI_DISABILITY_CRITERIA_VARIABLE -SSI_DISABILITY_MODEL_VERSION = 6 SSI_DISABILITY_EXPORT_VARIABLES = (SSI_DISABILITY_CRITERIA_VARIABLE,) # These six CPS/SIPP difficulty items are construction-time predictors for the @@ -92,63 +105,88 @@ ] -def train_tip_model(): - DOWNLOAD_FULL_SIPP = False +def ensure_sipp_file(filename: str = SIPP_FULL_FILE): + """Return a local SIPP public-use file, downloading it if needed.""" + + local_path = STORAGE_FOLDER / filename + if local_path.exists(): + return local_path - if DOWNLOAD_FULL_SIPP: - hf_hub_download( + try: + downloaded_path = hf_hub_download( repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023.csv", + filename=filename, repo_type="model", local_dir=STORAGE_FOLDER, ) - cols = [ - "SSUID", - "PNUM", - "MONTHCODE", - "ERESIDENCEID", - "ERELRPE", - "SPANEL", - "SWAVE", - "WPFINWGT", - "ESEX", - "TAGE", - "TAGE_EHC", - "ERACE", - "EORIGIN", - "EEDUC", - "EDEPCLM", - "EMS", - "EFSTATUS", - "TJB1_TXAMT", - "TJB1_MSUM", - "TJB1_OCC", - "TJB1_IND", - "AJB1_TXAMT", - "TPTOTINC", - ] + if downloaded_path: + return downloaded_path + except Exception: + if filename != SIPP_FULL_FILE: + raise + _download_sipp_full_file_from_census() + + if not local_path.exists(): + raise FileNotFoundError(f"Could not download {filename}") + return local_path + + +def _download_sipp_full_file_from_census(): + zip_path = STORAGE_FOLDER / SIPP_FULL_ZIP_FILE + if not zip_path.exists(): + try: + urlretrieve(SIPP_FULL_ZIP_URL, zip_path) + except (HTTPError, URLError) as error: + raise FileNotFoundError( + f"Could not download {SIPP_FULL_FILE} from HuggingFace or " + f"Census at {SIPP_FULL_ZIP_URL}" + ) from error + + with ZipFile(zip_path) as archive: + if SIPP_FULL_FILE not in archive.namelist(): + raise FileNotFoundError( + f"{SIPP_FULL_ZIP_FILE} does not contain {SIPP_FULL_FILE}" + ) + archive.extract(SIPP_FULL_FILE, STORAGE_FOLDER) - for col in cols: - if "JB1" in col: - for i in range(2, 8): - cols.append(col.replace("JB1", f"JB{i}")) - df = pd.read_csv( - STORAGE_FOLDER / "pu2023.csv", - delimiter="|", - usecols=cols, - ) +def train_tip_model(): + cols = [ + "SSUID", + "PNUM", + "MONTHCODE", + "ERESIDENCEID", + "ERELRPE", + "SPANEL", + "SWAVE", + "WPFINWGT", + "ESEX", + "TAGE", + "TAGE_EHC", + "ERACE", + "EORIGIN", + "EEDUC", + "EDEPCLM", + "EMS", + "EFSTATUS", + "TJB1_TXAMT", + "TJB1_MSUM", + "TJB1_OCC", + "TJB1_IND", + "AJB1_TXAMT", + "TPTOTINC", + ] - else: - hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023_slim.csv", - repo_type="model", - local_dir=STORAGE_FOLDER, - ) - df = pd.read_csv( - STORAGE_FOLDER / "pu2023_slim.csv", - ) + for col in cols.copy(): + if "JB1" in col: + for i in range(2, 8): + cols.append(col.replace("JB1", f"JB{i}")) + + df = pd.read_csv( + ensure_sipp_file(), + delimiter="|", + usecols=cols, + ) # Sum tip dollar-amount columns (TJB*_TXAMT) across all jobs. # Previously used `str.contains("TXAMT")`, which also picked up # AJB*_TXAMT Census allocation flags (small ints 0/1/2 indicating @@ -256,7 +294,7 @@ def get_tip_model() -> QRF: return model -# Asset imputation from SIPP 2023 +# Asset imputation from the latest available SIPP public-use file # Imputes asset categories separately for policy flexibility ASSET_JOB_EARNINGS_COLUMNS = [f"TJB{i}_MSUM" for i in range(1, 8)] @@ -459,6 +497,42 @@ def _yes(df: pd.DataFrame, column: str) -> pd.Series: return values.fillna(0).astype(float).eq(1) +def _sipp_monthly_earned_income(df: pd.DataFrame) -> pd.Series: + """Approximate monthly earned income from SIPP job earnings columns.""" + job_cols = [col for col in ASSET_JOB_EARNINGS_COLUMNS if col in df] + if job_cols: + return df[job_cols].fillna(0).sum(axis=1) + return df["TPTOTINC"].fillna(0) + + +def _sipp_monthly_unearned_income( + df: pd.DataFrame, monthly_earned_income: pd.Series +) -> pd.Series: + """Approximate monthly unearned income as total income net of job earnings.""" + return (df["TPTOTINC"].fillna(0) - monthly_earned_income).clip(lower=0) + + +def _approximate_monthly_ssi_countable_income( + monthly_earned_income: pd.Series, + monthly_unearned_income: pd.Series, + *, + general_exclusion: float, + earned_exclusion: float, + earned_share_excluded: float, +) -> pd.Series: + """Apply standard SSI income exclusions to monthly SIPP income proxies.""" + applied_general = np.minimum(general_exclusion, monthly_unearned_income) + countable_unearned = monthly_unearned_income - applied_general + leftover_general = general_exclusion - applied_general + + earned_after_flat_exclusions = ( + monthly_earned_income - earned_exclusion - leftover_general + ).clip(lower=0) + countable_earned = earned_after_flat_exclusions * (1 - earned_share_excluded) + + return countable_unearned + countable_earned + + def _add_ssi_disability_difficulty_predictors(df: pd.DataFrame) -> None: for predictor, source_column in SIPP_SSI_DISABILITY_DIFFICULTY_COLUMNS.items(): df[predictor] = _yes(df, source_column) @@ -491,25 +565,35 @@ def _observed_ssi_disability_label_mask( def _ssi_financial_candidate_mask( df: pd.DataFrame, time_period: int = 2024 ) -> pd.Series: - """Approximate non-disability SSI financial eligibility in SIPP. + """Approximate non-disability SSI screening eligibility in SIPP. This is only a training-frame screen. It avoids treating people whose - resources or income make SSI receipt structurally unlikely as clean - non-disabled labels. + resources, countable income, or SGA-level earnings make SSI receipt + structurally unlikely as clean non-disabled labels. """ try: from policyengine_us import CountryTaxBenefitSystem - p = CountryTaxBenefitSystem().parameters(f"{time_period}-01-01").gov.ssa.ssi + parameters = CountryTaxBenefitSystem().parameters(f"{time_period}-01-01") + p = parameters.gov.ssa.ssi individual_resource_limit = float(p.eligibility.resources.limit.individual) couple_resource_limit = float(p.eligibility.resources.limit.couple) individual_fbr = float(p.amount.individual) couple_fbr = float(p.amount.couple) + income_exclusions = p.income.exclusions + general_exclusion = float(income_exclusions.general) + earned_exclusion = float(income_exclusions.earned) + earned_share_excluded = float(income_exclusions.earned_share) + non_blind_sga = float(parameters.gov.ssa.sga.non_blind) except Exception: individual_resource_limit = 2_000.0 couple_resource_limit = 3_000.0 individual_fbr = 943.0 couple_fbr = 1_415.0 + general_exclusion = 20.0 + earned_exclusion = 65.0 + earned_share_excluded = 0.5 + non_blind_sga = 1_550.0 resource_limit = np.where( df["is_married"].astype(bool), @@ -526,9 +610,23 @@ def _ssi_financial_candidate_mask( + df["stock_assets"].fillna(0) + df["bond_assets"].fillna(0) ) - monthly_income = df["TPTOTINC"].fillna(0) - return (liquid_resources <= resource_limit) & ( - monthly_income <= monthly_income_limit * 2 + monthly_earned_income = _sipp_monthly_earned_income(df) + monthly_unearned_income = _sipp_monthly_unearned_income(df, monthly_earned_income) + monthly_countable_income = _approximate_monthly_ssi_countable_income( + monthly_earned_income, + monthly_unearned_income, + general_exclusion=general_exclusion, + earned_exclusion=earned_exclusion, + earned_share_excluded=earned_share_excluded, + ) + difficulty_seeing = df.get("difficulty_seeing", _yes(df, "ESEEING")) + is_blind = pd.Series(difficulty_seeing, index=df.index).fillna(False).astype(bool) + passes_sga_gate = is_blind | monthly_earned_income.le(non_blind_sga) + + return ( + (liquid_resources <= resource_limit) + & monthly_countable_income.le(monthly_income_limit) + & passes_sga_gate ) @@ -544,7 +642,7 @@ def build_ssi_disability_training_frame( df["age"] = df.TAGE df["is_female"] = df.ESEX == 2 df["is_married"] = df.EMS == 1 - df["employment_income"] = df.TPTOTINC.fillna(0) * 12 + df["employment_income"] = _sipp_monthly_earned_income(df) * 12 df["interest_income"] = (df["TINC_BANK"].fillna(0) + df["TINC_BOND"].fillna(0)) * 12 df["dividend_income"] = df["TINC_STMF"].fillna(0) * 12 df["rental_income"] = df["TINC_RENT"].fillna(0) * 12 @@ -698,7 +796,7 @@ def predict_ssi_disability_criteria(model, receiver_df: pd.DataFrame) -> np.ndar def train_asset_model(): - """Train QRF model for liquid asset categories using SIPP 2023 data. + """Train QRF model for liquid asset categories using SIPP data. Imputes three asset categories separately: - bank_account_assets: checking, savings, money market (TVAL_BANK) @@ -707,15 +805,8 @@ def train_asset_model(): Policy models can then define countable resources based on rules. """ - hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023.csv", - repo_type="model", - local_dir=STORAGE_FOLDER, - ) - df = pd.read_csv( - STORAGE_FOLDER / "pu2023.csv", + ensure_sipp_file(), delimiter="|", usecols=ASSET_COLUMNS, ) @@ -784,7 +875,7 @@ def train_asset_model(): def get_asset_model() -> QRF: """Get or train the liquid asset imputation model.""" - model_path = STORAGE_FOLDER / "liquid_assets_v3.pkl" + model_path = STORAGE_FOLDER / f"liquid_assets_sipp_{SIPP_YEAR}.pkl" if not model_path.exists(): model = train_asset_model() @@ -800,15 +891,8 @@ def get_asset_model() -> QRF: def train_ssi_disability_model(time_period: int = 2024): """Train a boolean model for likely SSI disability criteria passage.""" - hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023.csv", - repo_type="model", - local_dir=STORAGE_FOLDER, - ) - df = pd.read_csv( - STORAGE_FOLDER / "pu2023.csv", + ensure_sipp_file(), delimiter="|", usecols=SSI_DISABILITY_COLUMNS, ) @@ -862,22 +946,14 @@ def get_ssi_disability_model(time_period: int = 2024) -> QRF: def _ssi_disability_model_path(time_period: int): return ( - STORAGE_FOLDER - / f"ssi_disability_criteria_v{SSI_DISABILITY_MODEL_VERSION}_{time_period}.pkl" + STORAGE_FOLDER / f"ssi_disability_criteria_{time_period}_sipp_{SIPP_YEAR}.pkl" ) def build_vehicle_training_frame() -> pd.DataFrame: """Build a household-level SIPP frame for vehicle asset imputation.""" - hf_hub_download( - repo_id="PolicyEngine/policyengine-us-data", - filename="pu2023.csv", - repo_type="model", - local_dir=STORAGE_FOLDER, - ) - df = pd.read_csv( - STORAGE_FOLDER / "pu2023.csv", + ensure_sipp_file(), delimiter="|", usecols=VEHICLE_COLUMNS, ) @@ -949,7 +1025,7 @@ def build_vehicle_training_frame() -> pd.DataFrame: def train_vehicle_model(): - """Train a household-level vehicle asset model from SIPP 2023.""" + """Train a household-level vehicle asset model from SIPP.""" sipp = build_vehicle_training_frame() sipp = sipp[~sipp.isna().any(axis=1)] vehicle_vars = [ @@ -986,7 +1062,7 @@ def train_vehicle_model(): def get_vehicle_model() -> QRF: """Get or train the household vehicle imputation model.""" - model_path = STORAGE_FOLDER / "household_vehicle_assets_v2.pkl" + model_path = STORAGE_FOLDER / f"household_vehicle_assets_sipp_{SIPP_YEAR}.pkl" if not model_path.exists(): model = train_vehicle_model() diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 56a6d0e92..d521928f9 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -27,13 +27,12 @@ get_geographic_strata, ) from policyengine_us_data.utils.ssi_targets import ( - SSI_PAYMENT_TARGET_SOURCE, + SSI_CBO_TARGET_SOURCE, SSI_RECIPIENT_TARGET_NOTES, SSI_RECIPIENT_TARGET_SOURCE, SSI_RECIPIENT_TARGET_YEAR, SSI_RECIPIENT_TARGETS_2024, - get_ssi_payment_target_notes, - scale_ssi_fiscal_year_target_for_single_year_data, + get_ssi_annual_payment_target, ) from policyengine_us_data.utils.target_variables import ( target_variable_components, @@ -781,14 +780,13 @@ def extract_national_targets(year: int = DEFAULT_YEAR): "income_tax_positive", "snap", "social_security", - "ssi_federal_fiscal_year_outlays", + "ssi", "unemployment_compensation", ] # Mapping from target variable to CBO parameter name (when different) cbo_param_name_map = { "income_tax_positive": "income_tax", # CBO param is income_tax - "ssi_federal_fiscal_year_outlays": "ssi", } cbo_targets = [] @@ -800,12 +798,14 @@ def extract_national_targets(year: int = DEFAULT_YEAR): ).calibration.gov.cbo._children[param_name] source = "CBO Budget Projections" notes = f"CBO projection for {variable_name}" - if variable_name == "ssi_federal_fiscal_year_outlays": - value = scale_ssi_fiscal_year_target_for_single_year_data( - value, time_period - ) - source = SSI_PAYMENT_TARGET_SOURCE - notes = get_ssi_payment_target_notes(time_period) + if variable_name == "ssi": + ssi_target = get_ssi_annual_payment_target(time_period) + if ssi_target is None: + source = SSI_CBO_TARGET_SOURCE + else: + value = ssi_target["value"] + source = ssi_target["source"] + notes = ssi_target["notes"] cbo_targets.append( { "variable": variable_name, @@ -951,14 +951,6 @@ def load_national_targets( for _, target_data in direct_targets_df.iterrows(): target_year = target_data["year"] _register_target_variable(session, target_data["variable"]) - if target_data["variable"] == "ssi_federal_fiscal_year_outlays": - _deactivate_replaced_national_target( - session, - stratum_id=us_stratum.stratum_id, - old_variable="ssi", - new_variable="ssi_federal_fiscal_year_outlays", - period=target_year, - ) # Check if target already exists existing_target = session.exec( select(Target).where( diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index d088ac516..6af3ae0d3 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -28,7 +28,7 @@ from policyengine_us_data.utils.soi import pe_to_soi, get_soi, get_tracked_soi_row from policyengine_us_data.utils.ssi_targets import ( SSI_RECIPIENT_TARGETS_2024, - scale_ssi_fiscal_year_target_for_single_year_data, + get_ssi_annual_payment_target, ) from policyengine_us_data.utils.target_variables import ( target_variable_components, @@ -100,13 +100,12 @@ "income_tax_positive", "snap", "social_security", - "ssi_federal_fiscal_year_outlays", + "ssi", "unemployment_compensation", ] CBO_PARAM_NAME_MAP = { "income_tax_positive": "income_tax", - "ssi_federal_fiscal_year_outlays": "ssi", } HARD_CODED_TOTALS = { @@ -250,13 +249,15 @@ def _add_ssi_recipient_targets(loss_matrix, targets_array, sim, time_period): def _cbo_program_target_value(sim, variable_name: str, time_period): + if variable_name == "ssi": + ssi_target = get_ssi_annual_payment_target(time_period) + if ssi_target is not None: + return ssi_target["value"] + param_name = CBO_PARAM_NAME_MAP.get(variable_name, variable_name) - value = sim.tax_benefit_system.parameters( - time_period - ).calibration.gov.cbo._children[param_name] - if variable_name == "ssi_federal_fiscal_year_outlays": - return scale_ssi_fiscal_year_target_for_single_year_data(value, time_period) - return value + return sim.tax_benefit_system.parameters(time_period).calibration.gov.cbo._children[ + param_name + ] ACA_SPENDING_TARGETS = { @@ -1344,7 +1345,11 @@ def build_loss_matrix(dataset: type, time_period): for variable_name in CBO_PROGRAMS: label = f"nation/cbo/{variable_name}" - loss_matrix[label] = sim.calculate(variable_name, map_to="household").values + loss_matrix[label] = sim.calculate( + variable_name, + time_period, + map_to="household", + ).values if any(loss_matrix[label].isna()): raise ValueError(f"Missing values for {label}") targets_array.append(_cbo_program_target_value(sim, variable_name, time_period)) diff --git a/policyengine_us_data/utils/ssi_targets.py b/policyengine_us_data/utils/ssi_targets.py index ec87c6e0b..22f95aa06 100644 --- a/policyengine_us_data/utils/ssi_targets.py +++ b/policyengine_us_data/utils/ssi_targets.py @@ -1,97 +1,41 @@ """Shared SSI calibration targets.""" -from datetime import date, timedelta - SSI_CBO_TARGET_SOURCE = ( "https://www.cbo.gov/system/files/2026-02/51313-2026-02-ssi.xlsx" ) -SSI_PAYMENT_TIMING_SOURCE = "https://www.ssa.gov/oact/ssir/SSI24/IV_C_Payments.html" -SSI_PAYMENT_RULE_SOURCE = "https://www.ssa.gov/OP_Home/cfr20/416/416-0502.htm" -SSI_PAYMENT_TARGET_SOURCE = ( - f"{SSI_CBO_TARGET_SOURCE}; {SSI_PAYMENT_TIMING_SOURCE}; {SSI_PAYMENT_RULE_SOURCE}" +SSI_ANNUAL_PAYMENT_TARGET_SOURCE = ( + "https://www.ssa.gov/policy/docs/statcomps/ssi_asr/2024/sect01.html" ) +SSI_OACT_PAYMENT_DATE_TARGET_SOURCE = ( + "https://www.ssa.gov/oact/ssir/SSI25/IV_C_Payments.html" +) +SSI_OACT_CY2024_PAYMENT_DATE_ALL = 63_080_000_000 +SSI_OACT_FY2024_PAYMENT_DATE_ALL = 57_600_000_000 +SSI_ANNUAL_PAYMENT_TARGET_NOTES = ( + "SSA SSI Annual Statistical Report, 2024, Table 2; Federal SSI total " + "annual payments for all recipients, excluding federally administered " + "state supplementation. ASR allocates payments to the month due, so this " + "target aligns with annual `ssi` over January-December benefit months. " + "Do not replace it with OACT payment-date accounting; OACT Table IV.C2 " + "reports FY2024 all Federal SSI payments of $57.600B because fiscal-year " + "payment-date totals can contain 11, 12, or 13 monthly payments. The " + "smaller gap between ASR CY2024 and OACT FY2024 is not a pure 12-vs-11 " + "month comparison: OACT Table IV.C1 reports CY2024 all Federal SSI " + "payments of $63.080B on a payment-date basis, $5.480B above OACT " + "FY2024, and OACT obligations are not reduced for certain recovered " + "overpayments remitted directly to Treasury that ASR nets out." +) +SSI_ANNUAL_PAYMENT_TARGETS = { + 2024: { + "value": 59_665_127_000, + "source": SSI_ANNUAL_PAYMENT_TARGET_SOURCE, + "notes": SSI_ANNUAL_PAYMENT_TARGET_NOTES, + }, +} -def _as_fiscal_year(year) -> int: - return int(str(year)[:4]) - - -def _is_new_years_day_observed(day: date) -> bool: - new_years_day = date(day.year, 1, 1) - next_new_years_day = date(day.year + 1, 1, 1) - return ( - day == new_years_day - or (new_years_day.weekday() == 6 and day == date(day.year, 1, 2)) - or (next_new_years_day.weekday() == 5 and day == date(day.year, 12, 31)) - ) - - -def _is_labor_day(day: date) -> bool: - return day.month == 9 and day.weekday() == 0 and day.day <= 7 - - -def _is_federal_holiday_affecting_ssi_payment(day: date) -> bool: - return _is_new_years_day_observed(day) or _is_labor_day(day) - - -def _ssi_payment_date(year: int, month: int) -> date: - payment_date = date(year, month, 1) - while payment_date.weekday() >= 5 or _is_federal_holiday_affecting_ssi_payment( - payment_date - ): - payment_date -= timedelta(days=1) - return payment_date - - -def _ssi_fiscal_year_benefit_months(year) -> list[date]: - fiscal_year = _as_fiscal_year(year) - fiscal_year_start = date(fiscal_year - 1, 10, 1) - fiscal_year_end = date(fiscal_year, 9, 30) - - benefit_months = [] - for calendar_year in (fiscal_year - 1, fiscal_year): - for month in range(1, 13): - payment_day = _ssi_payment_date(calendar_year, month) - if fiscal_year_start <= payment_day <= fiscal_year_end: - benefit_months.append(date(calendar_year, month, 1)) - return benefit_months - - -def get_ssi_fiscal_year_payment_count(year) -> int: - """Return SSI benefit months with payment dates in the federal fiscal year.""" - return len(_ssi_fiscal_year_benefit_months(year)) - - -def get_ssi_single_year_available_payment_count(year) -> int: - """Return fiscal-year SSI benefit months available from a single-year H5.""" - fiscal_year = _as_fiscal_year(year) - return sum( - benefit_month.year == fiscal_year - for benefit_month in _ssi_fiscal_year_benefit_months(year) - ) - - -def scale_ssi_fiscal_year_target_for_single_year_data(value, year) -> float: - """Scale full fiscal-year SSI outlays to months computable from one H5 year.""" - return ( - float(value) - * get_ssi_single_year_available_payment_count(year) - / get_ssi_fiscal_year_payment_count(year) - ) - - -def get_ssi_payment_target_notes(year) -> str: - fiscal_year = _as_fiscal_year(year) - available_count = get_ssi_single_year_available_payment_count(year) - payment_count = get_ssi_fiscal_year_payment_count(year) - return ( - "CBO SSI federal fiscal-year outlays scaled to the benefit months " - "computable from a single-year PolicyEngine-US-data H5 using " - "policyengine-us ssi_federal_fiscal_year_outlays; " - f"FY{fiscal_year} has {payment_count} SSI benefit months paid in the " - f"federal fiscal year, of which {available_count} are benefit months " - f"in calendar year {fiscal_year}" - ) +def get_ssi_annual_payment_target(year) -> dict | None: + return SSI_ANNUAL_PAYMENT_TARGETS.get(int(str(year)[:4])) SSI_RECIPIENT_TARGET_YEAR = 2024 diff --git a/tests/unit/calibration/test_hourly_wage_income_consistency.py b/tests/unit/calibration/test_hourly_wage_income_consistency.py index 235a9ecf3..68f19326b 100644 --- a/tests/unit/calibration/test_hourly_wage_income_consistency.py +++ b/tests/unit/calibration/test_hourly_wage_income_consistency.py @@ -62,23 +62,14 @@ def test_run_sanity_checks_adds_hourly_wage_income_consistency(tmp_path): assert by_check["hourly_wage_income_consistency_overtime"]["status"] == "WARN" -def test_run_sanity_checks_keeps_raw_ssi_and_checks_computed_outlays( - tmp_path, monkeypatch -): +def test_run_sanity_checks_keeps_raw_ssi_without_computed_outlays(tmp_path): h5_path = tmp_path / "sample.h5" with h5py.File(h5_path, "w") as h5: _write_period_dataset(h5, "household_weight", [1.0, 1.0]) _write_period_dataset(h5, "ssi", [100.0, 0.0]) - monkeypatch.setattr( - "policyengine_us_data.calibration.sanity_checks._computed_key_monetary_values", - lambda h5_path, period: { - "ssi_federal_fiscal_year_outlays": np.array([100.0, np.inf]) - }, - ) - diagnostics = run_sanity_checks(str(h5_path), period=2024) by_check = {diagnostic["check"]: diagnostic for diagnostic in diagnostics} assert by_check["no_nan_inf_ssi"]["status"] == "PASS" - assert by_check["no_nan_inf_ssi_federal_fiscal_year_outlays"]["status"] == "FAIL" + assert "no_nan_inf_ssi_federal_fiscal_year_outlays" not in by_check diff --git a/tests/unit/calibration/test_loss_targets.py b/tests/unit/calibration/test_loss_targets.py index 3e7ee8baf..521a09f01 100644 --- a/tests/unit/calibration/test_loss_targets.py +++ b/tests/unit/calibration/test_loss_targets.py @@ -42,9 +42,6 @@ from policyengine_us_data.db import etl_national_targets from policyengine_us_data.utils.ssi_targets import ( SSI_RECIPIENT_TARGETS_2024, - get_ssi_fiscal_year_payment_count, - get_ssi_single_year_available_payment_count, - scale_ssi_fiscal_year_target_for_single_year_data, ) @@ -381,28 +378,10 @@ def test_add_ssi_recipient_targets_adds_total_and_age_counts(): ) -def test_ssi_payment_targets_scale_to_single_year_fiscal_year_coverage(): - assert get_ssi_fiscal_year_payment_count(2024) == 11 - assert get_ssi_single_year_available_payment_count(2024) == 9 - assert get_ssi_fiscal_year_payment_count(2025) == 12 - assert get_ssi_single_year_available_payment_count(2025) == 9 - assert get_ssi_fiscal_year_payment_count(2028) == 13 - assert get_ssi_single_year_available_payment_count(2028) == 10 - - assert scale_ssi_fiscal_year_target_for_single_year_data( - 57_000_000_000, 2024 - ) == pytest.approx(57_000_000_000 * 9 / 11) - assert scale_ssi_fiscal_year_target_for_single_year_data( - 75_400_000_000, 2028 - ) == pytest.approx(75_400_000_000 * 10 / 13) - - -def test_legacy_cbo_ssi_target_uses_single_year_fiscal_year_coverage(): +def test_legacy_cbo_ssi_target_uses_ssa_actual_when_available(): sim = _FakeCBOProgramTargetSimulation() - assert _cbo_program_target_value( - sim, "ssi_federal_fiscal_year_outlays", 2024 - ) == pytest.approx(57_000_000_000 * 9 / 11) + assert _cbo_program_target_value(sim, "ssi", 2024) == 59_665_127_000 assert _cbo_program_target_value(sim, "snap", 2024) == 1_000.0 diff --git a/tests/unit/calibration/test_source_impute.py b/tests/unit/calibration/test_source_impute.py index df13ab4a3..f4a88fa07 100644 --- a/tests/unit/calibration/test_source_impute.py +++ b/tests/unit/calibration/test_source_impute.py @@ -479,9 +479,10 @@ def test_calibration_sipp_qrf_passes_target_filters(self, monkeypatch): ) def fake_read_csv(path, *args, **kwargs): - if str(path).endswith("pu2023_slim.csv"): + usecols = set(kwargs.get("usecols") or []) + if "TJB1_TXAMT" in usecols: return tip_source.copy() - if str(path).endswith("pu2023.csv"): + if "TVAL_BANK" in usecols: return asset_source.copy() raise AssertionError(f"Unexpected read_csv path: {path}") @@ -513,6 +514,11 @@ def predict(self, X_test): "hf_hub_download", lambda *args, **kwargs: None, ) + monkeypatch.setattr( + source_impute, + "ensure_sipp_file", + lambda: "pu2024.csv", + ) monkeypatch.setattr(source_impute.pd, "read_csv", fake_read_csv) monkeypatch.setattr(source_impute, "QRF", FakeQRF) monkeypatch.setattr( diff --git a/tests/unit/calibration/test_target_config.py b/tests/unit/calibration/test_target_config.py index 60f862a90..fe7b3055e 100644 --- a/tests/unit/calibration/test_target_config.py +++ b/tests/unit/calibration/test_target_config.py @@ -546,10 +546,7 @@ def test_training_config_includes_ssi_recipient_count_targets(self): ) include_rules = config["include"] - assert { - "variable": "ssi_federal_fiscal_year_outlays", - "geo_level": "national", - } in include_rules + assert {"variable": "ssi", "geo_level": "national"} in include_rules assert { "variable": "person_count", "geo_level": "national", diff --git a/tests/unit/datasets/test_sipp_ssi_disability.py b/tests/unit/datasets/test_sipp_ssi_disability.py index b8289643d..03bec8249 100644 --- a/tests/unit/datasets/test_sipp_ssi_disability.py +++ b/tests/unit/datasets/test_sipp_ssi_disability.py @@ -15,7 +15,6 @@ ) from policyengine_us_data.datasets.sipp.sipp import ( SSI_DISABILITY_COLUMNS, - SSI_DISABILITY_MODEL_VERSION, _ssi_disability_model_path, ) @@ -71,6 +70,37 @@ def test_build_ssi_disability_training_frame_screens_financially(): ) +def test_build_ssi_disability_training_frame_screens_nonblind_sga(): + frame = _base_sipp_frame().iloc[[2]].copy() + frame["TPTOTINC"] = 1_600.0 + frame["TJB1_MSUM"] = 1_600.0 + + result = build_ssi_disability_training_frame(frame) + + assert not result["ssi_disability_training_candidate"].iloc[0] + + +def test_build_ssi_disability_training_frame_does_not_sga_screen_blind_records(): + frame = _base_sipp_frame().iloc[[2]].copy() + frame["TPTOTINC"] = 1_600.0 + frame["TJB1_MSUM"] = 1_600.0 + frame["ESEEING"] = 1 + + result = build_ssi_disability_training_frame(frame) + + assert result["ssi_disability_training_candidate"].iloc[0] + + +def test_build_ssi_disability_training_frame_uses_countable_income_threshold(): + frame = _base_sipp_frame().iloc[[2]].copy() + frame["TPTOTINC"] = 1_500.0 + frame["TJB1_MSUM"] = 0.0 + + result = build_ssi_disability_training_frame(frame) + + assert not result["ssi_disability_training_candidate"].iloc[0] + + def test_build_ssi_disability_training_frame_uses_all_disability_amounts(): frame = _base_sipp_frame().iloc[[2]].copy() frame["TDIS6AMT"] = 100 @@ -81,7 +111,7 @@ def test_build_ssi_disability_training_frame_uses_all_disability_amounts(): def test_ssi_disability_training_usecols_include_label_and_income_columns(): - assert {"TPTOTINC", "RSSI_YRYN"} <= set(SSI_DISABILITY_COLUMNS) + assert {"TPTOTINC", "TJB1_MSUM", "RSSI_YRYN"} <= set(SSI_DISABILITY_COLUMNS) assert {"ASSI_YRYN", "ASSI_BRSN"} <= set(SSI_DISABILITY_COLUMNS) assert { "ESELFCARE", @@ -100,10 +130,10 @@ def test_ssi_disability_predictors_use_six_comparable_difficulty_items(): assert "is_disabled" not in SSI_DISABILITY_MODEL_PREDICTORS -def test_ssi_disability_model_cache_version_tracks_predictor_schema(): - assert SSI_DISABILITY_MODEL_VERSION == 6 - assert _ssi_disability_model_path(2024).name == ( - "ssi_disability_criteria_v6_2024.pkl" +def test_ssi_disability_model_cache_path_uses_training_period(): + assert ( + _ssi_disability_model_path(2024).name + == "ssi_disability_criteria_2024_sipp_2024.pkl" ) diff --git a/tests/unit/datasets/test_sipp_tip_columns.py b/tests/unit/datasets/test_sipp_tip_columns.py index de04dee1c..946ef83b8 100644 --- a/tests/unit/datasets/test_sipp_tip_columns.py +++ b/tests/unit/datasets/test_sipp_tip_columns.py @@ -61,7 +61,7 @@ def test_tip_sum_excludes_allocation_flags(): def test_train_tip_model_requires_allocation_flags_for_present_tip_columns( monkeypatch, ): - monkeypatch.setattr(sipp_module, "hf_hub_download", lambda *args, **kwargs: None) + monkeypatch.setattr(sipp_module, "ensure_sipp_file", lambda: "pu2024.csv") monkeypatch.setattr( sipp_module.pd, "read_csv", @@ -73,7 +73,7 @@ def test_train_tip_model_requires_allocation_flags_for_present_tip_columns( def test_train_tip_model_drops_non_positive_weights(monkeypatch): - monkeypatch.setattr(sipp_module, "hf_hub_download", lambda *args, **kwargs: None) + monkeypatch.setattr(sipp_module, "ensure_sipp_file", lambda: "pu2024.csv") data = { "SSUID": [1, 2, 3, 4], @@ -117,7 +117,7 @@ def fit( def test_train_tip_model_keeps_reported_sipp_status_flags(monkeypatch): - monkeypatch.setattr(sipp_module, "hf_hub_download", lambda *args, **kwargs: None) + monkeypatch.setattr(sipp_module, "ensure_sipp_file", lambda: "pu2024.csv") data = { "SSUID": [1, 2, 3, 4], diff --git a/tests/unit/test_etl_national_targets.py b/tests/unit/test_etl_national_targets.py index a72d21d34..6f0fe18b5 100644 --- a/tests/unit/test_etl_national_targets.py +++ b/tests/unit/test_etl_national_targets.py @@ -20,7 +20,7 @@ load_state_acs_rent_targets, ) from policyengine_us_data.utils.ssi_targets import ( - SSI_PAYMENT_TARGET_SOURCE, + SSI_ANNUAL_PAYMENT_TARGET_SOURCE, SSI_RECIPIENT_TARGETS_2024, ) @@ -442,7 +442,7 @@ def test_extract_national_targets_includes_ssi_count_targets(): } -def test_extract_national_targets_uses_ssi_fiscal_year_outlays_target(monkeypatch): +def test_extract_national_targets_uses_ssi_ssa_actual_when_available(monkeypatch): class FakeIncomeBySource: _children = { target["parameter"]: 0 @@ -485,72 +485,20 @@ class FakeTaxBenefitSystem: raw_targets = extract_national_targets(year=2024) ssi_target = next( - target - for target in raw_targets["cbo_targets"] - if target["variable"] == "ssi_federal_fiscal_year_outlays" - ) - - assert ssi_target["value"] == 57_000_000_000 * 9 / 11 - assert ssi_target["source"] == SSI_PAYMENT_TARGET_SOURCE - assert "single-year PolicyEngine-US-data H5" in ssi_target["notes"] - - -def test_load_national_targets_deactivates_legacy_ssi_dollar_target( - tmp_path, monkeypatch -): - calibration_dir = tmp_path / "calibration" - calibration_dir.mkdir() - db_uri = f"sqlite:///{calibration_dir / 'policy_data.db'}" - engine = create_database(db_uri) - - with Session(engine) as session: - national = _make_stratum(session, notes="United States") - session.add( - Target( - stratum_id=national.stratum_id, - variable="ssi", - period=2024, - value=57_000_000_000, - active=True, - notes="legacy SSI dollar target", - ) - ) - session.commit() - - monkeypatch.setattr( - "policyengine_us_data.db.etl_national_targets.STORAGE_FOLDER", - tmp_path, + target for target in raw_targets["cbo_targets"] if target["variable"] == "ssi" ) - load_national_targets( - direct_targets_df=pd.DataFrame( - [ - { - "variable": "ssi_federal_fiscal_year_outlays", - "value": 57_000_000_000 * 9 / 11, - "source": SSI_PAYMENT_TARGET_SOURCE, - "notes": "CBO SSI federal fiscal-year outlays", - "year": 2024, - } - ] - ), - tax_filer_df=pd.DataFrame(), - tax_expenditure_df=pd.DataFrame(), - conditional_targets=[], - ) - - with Session(engine) as session: - legacy_target = session.exec( - select(Target).where(Target.variable == "ssi") - ).one() - new_target = session.exec( - select(Target).where(Target.variable == "ssi_federal_fiscal_year_outlays") - ).one() - - assert legacy_target.active is False - assert "replaced this target concept" in legacy_target.notes - assert new_target.active is True - assert new_target.value == 57_000_000_000 * 9 / 11 + assert ssi_target["value"] == 59_665_127_000 + assert ssi_target["source"] == SSI_ANNUAL_PAYMENT_TARGET_SOURCE + assert "SSA SSI Annual Statistical Report, 2024, Table 2" in ssi_target["notes"] + assert "month due" in ssi_target["notes"] + assert "annual `ssi` over January-December benefit months" in ssi_target["notes"] + assert "OACT Table IV.C2" in ssi_target["notes"] + assert "$57.600B" in ssi_target["notes"] + assert "OACT Table IV.C1" in ssi_target["notes"] + assert "$63.080B" in ssi_target["notes"] + assert "$5.480B above OACT FY2024" in ssi_target["notes"] + assert "recovered overpayments" in ssi_target["notes"] def test_load_national_targets_uses_medicaid_enrolled_for_enrollment_counts( diff --git a/validation/stage_1/test_policy_data_db.py b/validation/stage_1/test_policy_data_db.py index 4d516c766..c0f682604 100644 --- a/validation/stage_1/test_policy_data_db.py +++ b/validation/stage_1/test_policy_data_db.py @@ -50,7 +50,7 @@ def test_national_targets_loaded(built_db): "long_term_capital_gains", "snap", "social_security", - "ssi_federal_fiscal_year_outlays", + "ssi", ]: assert expected in variables, ( f"National target '{expected}' missing. Found: {sorted(variables)}"