Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
**/__pycache__
**/.DS_STORE
**/*.h5
**/*.h5.lock
**/*.npy
**/*.csv
**/*.csv.gz
**/pu*_csv.zip
**/*.clone_diagnostics.json
**/_build
**/*.pkl
**/*.db
Expand Down
1 change: 1 addition & 0 deletions changelog.d/1131.fixed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refine the SIPP SSI disability training candidate screen to use substantial gainful activity (SGA) and approximate SSI countable income, and remove the manual cache-version suffix.
8 changes: 6 additions & 2 deletions policyengine_us_data/calibration/chunked_matrix_assembler.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,9 @@ def run_single_chunk(self, chunk_id: int) -> ChunkResult:
continue
try:
hh_vars[variable] = chunk_sim.calculate(
variable, state.time_period, map_to="household"
variable,
state.time_period,
map_to="household",
).values.astype(np.float32)
except Exception as exc:
logger.warning(
Expand All @@ -394,7 +396,9 @@ def run_single_chunk(self, chunk_id: int) -> ChunkResult:
continue
try:
target_entity_vars[variable] = chunk_sim.calculate(
variable, state.time_period, map_to=entity_key
variable,
state.time_period,
map_to=entity_key,
).values.astype(np.float32)
except Exception as exc:
logger.warning(
Expand Down
7 changes: 4 additions & 3 deletions policyengine_us_data/calibration/sanity_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@
"income_tax_before_credits",
]

COMPUTED_KEY_MONETARY_VARS = [
"ssi_federal_fiscal_year_outlays",
]
COMPUTED_KEY_MONETARY_VARS = []

TAKEUP_VARS = [
"takes_up_snap_if_eligible",
Expand Down Expand Up @@ -665,6 +663,9 @@ def _append_finite_check(var: str, vals) -> None:


def _computed_key_monetary_values(h5_path: str, period: int) -> dict[str, np.ndarray]:
if not COMPUTED_KEY_MONETARY_VARS:
return {}

try:
from policyengine_us import Microsimulation

Expand Down
37 changes: 21 additions & 16 deletions policyengine_us_data/calibration/source_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
SSI_DISABILITY_EXPORT_VARIABLES,
VEHICLE_MODEL_PREDICTORS,
build_vehicle_training_frame,
ensure_sipp_file,
get_ssi_disability_model,
predict_ssi_disability_criteria,
preserve_under_65_ssi_disability_criteria,
Expand Down Expand Up @@ -663,16 +664,26 @@ def _impute_sipp(
Returns:
Updated data dict.
"""
from huggingface_hub import hf_hub_download
from policyengine_us_data.storage import STORAGE_FOLDER

hf_hub_download(
repo_id="PolicyEngine/policyengine-us-data",
filename="pu2023_slim.csv",
repo_type="model",
local_dir=STORAGE_FOLDER,
tip_cols = (
[
"SSUID",
"MONTHCODE",
"WPFINWGT",
"TAGE",
"TPTOTINC",
]
+ SIPP_JOB_OCCUPATION_COLUMNS
+ SIPP_TIP_AMOUNT_COLUMNS
+ [
SIPP_TIP_AMOUNT_TO_ALLOCATION_COLUMN[column]
for column in SIPP_TIP_AMOUNT_COLUMNS
]
)
sipp_df = pd.read_csv(
ensure_sipp_file(),
delimiter="|",
usecols=tip_cols,
)
sipp_df = pd.read_csv(STORAGE_FOLDER / "pu2023_slim.csv")

tip_amount_columns = [
column for column in SIPP_TIP_AMOUNT_COLUMNS if column in sipp_df
Expand Down Expand Up @@ -788,12 +799,6 @@ def _impute_sipp(

# Asset imputation
try:
hf_hub_download(
repo_id="PolicyEngine/policyengine-us-data",
filename="pu2023.csv",
repo_type="model",
local_dir=STORAGE_FOLDER,
)
asset_cols = (
[
"SSUID",
Expand All @@ -817,7 +822,7 @@ def _impute_sipp(
+ SIPP_ASSET_ALLOCATION_COLUMNS
)
asset_df = pd.read_csv(
STORAGE_FOLDER / "pu2023.csv",
ensure_sipp_file(),
delimiter="|",
usecols=asset_cols,
)
Expand Down
2 changes: 1 addition & 1 deletion policyengine_us_data/calibration/target_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ include:
geo_level: national
- variable: social_security_survivors
geo_level: national
- variable: ssi_federal_fiscal_year_outlays
- variable: ssi
geo_level: national
- variable: person_count
geo_level: national
Expand Down
15 changes: 8 additions & 7 deletions policyengine_us_data/datasets/sipp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ SIPP panel wave. These are the canonical reference for every variable
name, value code, and weighting construct used by the code in this
folder:

- [SIPP 2023 public-use data dictionary (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/data-dictionaries/2023/2023_SIPP_Data_Dictionary.pdf)
- [SIPP 2023 users' guide (PDF, Aug 2026 revision)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/methodology/2023_SIPP_Users_Guide_AUG26.pdf)
- [SIPP 2024 public-use data dictionary (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/data-dictionaries/2024/2024_SIPP_Data_Dictionary.pdf)
- [SIPP 2024 users' guide (PDF)](https://www2.census.gov/programs-surveys/sipp/tech-documentation/methodology/2024_SIPP_Users_Guide.pdf)

See also:

Expand All @@ -30,15 +30,16 @@ See also:
## Data products in this folder

- `sipp.py` — trains and caches QRF imputation models (`get_tip_model`,
`get_asset_model`, `get_vehicle_model`) from SIPP 2023 person-month
`get_asset_model`, `get_vehicle_model`) from SIPP 2024 person-month
data. The training frame is filtered to `MONTHCODE == 12` (December)
so every row represents one person-year rather than twelve annualized
months.

The raw SIPP CSVs (`pu2023.csv` and the slim variant `pu2023_slim.csv`)
are mirrored on the `PolicyEngine/policyengine-us-data` HuggingFace model
repo and downloaded on demand when a training run is needed. They are
not vendored in this Git repository.
The raw SIPP CSV (`pu2024.csv`) is downloaded on demand when a training
run is needed. The downloader first checks the
`PolicyEngine/policyengine-us-data` HuggingFace model repo for a cached
copy, then falls back to the Census Bureau's public `pu2024_csv.zip`
archive. The raw file is not vendored in this Git repository.

## Licensing

Expand Down
Loading
Loading