Skip to content

Commit e45480a

Browse files
committed
Document mortgage pipeline steps
1 parent 408ce16 commit e45480a

4 files changed

Lines changed: 144 additions & 28 deletions

File tree

docs/pipeline-diagrams/app/pipeline.json

Lines changed: 94 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,14 @@
416416
"description": "QRF imputation for age, gender, and earnings split",
417417
"details": "Imputes AGEDP1-3, AGERANGE, EARNSPLIT, and GENDER from matched PUF demographic records",
418418
"source_file": "policyengine_us_data/datasets/puf/puf.py"
419+
},
420+
{
421+
"id": "mortgage_convert",
422+
"label": "Structural Mortgage Conversion",
423+
"node_type": "process",
424+
"description": "Convert deductible mortgage interest into structural mortgage balances, interest, and origination-year inputs",
425+
"details": "Preserves current-law deductible mortgage and total interest deductions while deriving first-lien, secondary acquisition-debt, and non-mortgage residual interest inputs",
426+
"source_file": "policyengine_us_data/utils/mortgage_interest.py"
419427
}
420428
],
421429
"edges": [
@@ -534,6 +542,11 @@
534542
},
535543
{
536544
"source": "impute_puf_demographics",
545+
"target": "mortgage_convert",
546+
"edge_type": "data_flow"
547+
},
548+
{
549+
"source": "mortgage_convert",
537550
"target": "out_puf",
538551
"edge_type": "produces_artifact",
539552
"label": "puf_2024.h5"
@@ -581,7 +594,7 @@
581594
"id": 2,
582595
"label": "Stage 2",
583596
"title": "Stage 2: Extended CPS (PUF Clone)",
584-
"description": "Merge CPS + PUF via cloning, impute 64 income vars + 51 override vars via QRF",
597+
"description": "Merge CPS + PUF via cloning, rematch clone features, QRF-impute incomes and CPS-only vars, then finalize Extended CPS inputs",
585598
"country": "us",
586599
"nodes": [
587600
{
@@ -602,6 +615,12 @@
602615
"node_type": "input",
603616
"description": "Census block populations"
604617
},
618+
{
619+
"id": "in_scf_s2",
620+
"label": "SCF_2022",
621+
"node_type": "input",
622+
"description": "From Stage 0 (mortgage-balance donor sample)"
623+
},
605624
{
606625
"id": "geo_assign_s2",
607626
"label": "Geography Assignment",
@@ -620,6 +639,12 @@
620639
"node_type": "utility",
621640
"description": "fit_predict() for sequential imputation"
622641
},
642+
{
643+
"id": "util_knn_s2",
644+
"label": "sklearn NearestNeighbors",
645+
"node_type": "utility",
646+
"description": "Role-aware donor matching on standardized clone predictors"
647+
},
623648
{
624649
"id": "record_double",
625650
"label": "Record Doubling",
@@ -634,14 +659,6 @@
634659
"description": "64 income variables \u2014 training on PUF ~20K records, 7 demographic predictors",
635660
"source_file": "policyengine_us_data/calibration/puf_impute.py"
636661
},
637-
{
638-
"id": "qrf_pass2",
639-
"label": "QRF Pass 2: Override Imputation",
640-
"node_type": "process",
641-
"description": "Replace the PUF clone half with second-stage CPS-only QRF outputs",
642-
"details": "Keeps original CPS donor values in the first half and maps person-level predictions onto each target entity before splicing",
643-
"source_file": "policyengine_us_data/datasets/cps/extended_cps.py"
644-
},
645662
{
646663
"id": "retire_impute",
647664
"label": "Retirement Contribution Imputation",
@@ -663,6 +680,14 @@
663680
"description": "Retirement/Disability/Survivors/Dependents \u2014 scaled to match PUF total",
664681
"source_file": "policyengine_us_data/calibration/puf_impute.py"
665682
},
683+
{
684+
"id": "clone_features",
685+
"label": "Clone Feature Rematching",
686+
"node_type": "process",
687+
"description": "kNN donor rematch of clone-half sex, race, Hispanic status, and occupation fields",
688+
"details": "Matches within tax-unit roles using demographics plus imputed income, then derives overtime and tipped-occupation inputs from donor occupations when available",
689+
"source_file": "policyengine_us_data/datasets/cps/extended_cps.py"
690+
},
666691
{
667692
"id": "cps_only",
668693
"label": "CPS-Only Variable Re-imputation",
@@ -671,6 +696,30 @@
671696
"details": "Trains on CPS persons and predicts clone-half values from demographics plus PUF-imputed income, then applies retirement and ORG domain constraints",
672697
"source_file": "policyengine_us_data/datasets/cps/extended_cps.py"
673698
},
699+
{
700+
"id": "qrf_pass2",
701+
"label": "QRF Pass 2: Override Imputation",
702+
"node_type": "process",
703+
"description": "Replace the PUF clone half with second-stage CPS-only QRF outputs",
704+
"details": "Keeps original CPS donor values in the first half, maps person-level predictions onto each target entity, and rebuilds capped childcare on the clone half",
705+
"source_file": "policyengine_us_data/datasets/cps/extended_cps.py"
706+
},
707+
{
708+
"id": "mortgage_hints",
709+
"label": "Mortgage Balance Hint Imputation",
710+
"node_type": "process",
711+
"description": "Impute tax-unit mortgage balance hints from SCF donor balances",
712+
"details": "Fits a weighted QRF on SCF mortgage holders, predicts first-lien and secondary acquisition-debt balance hints, and enforces conservative nonnegative ordering",
713+
"source_file": "policyengine_us_data/utils/mortgage_interest.py"
714+
},
715+
{
716+
"id": "mortgage_convert",
717+
"label": "Structural Mortgage Conversion",
718+
"node_type": "process",
719+
"description": "Convert deductible mortgage interest into structural mortgage balances, interest, and origination-year inputs",
720+
"details": "Preserves current-law deductible mortgage and total interest deductions while deriving first-lien, secondary acquisition-debt, and non-mortgage residual interest inputs",
721+
"source_file": "policyengine_us_data/utils/mortgage_interest.py"
722+
},
674723
{
675724
"id": "formula_drop",
676725
"label": "Formula Variable Dropping",
@@ -717,16 +766,11 @@
717766
},
718767
{
719768
"source": "qrf_pass1",
720-
"target": "qrf_pass2",
721-
"edge_type": "data_flow"
722-
},
723-
{
724-
"source": "qrf_pass2",
725769
"target": "retire_impute",
726770
"edge_type": "data_flow"
727771
},
728772
{
729-
"source": "qrf_pass2",
773+
"source": "qrf_pass1",
730774
"target": "weeks_impute",
731775
"edge_type": "data_flow"
732776
},
@@ -742,11 +786,37 @@
742786
},
743787
{
744788
"source": "ss_reconcile",
789+
"target": "clone_features",
790+
"edge_type": "data_flow"
791+
},
792+
{
793+
"source": "clone_features",
745794
"target": "cps_only",
746795
"edge_type": "data_flow"
747796
},
748797
{
749798
"source": "cps_only",
799+
"target": "qrf_pass2",
800+
"edge_type": "data_flow"
801+
},
802+
{
803+
"source": "qrf_pass2",
804+
"target": "mortgage_hints",
805+
"edge_type": "data_flow"
806+
},
807+
{
808+
"source": "in_scf_s2",
809+
"target": "mortgage_hints",
810+
"edge_type": "data_flow",
811+
"label": "SCF donor sample"
812+
},
813+
{
814+
"source": "mortgage_hints",
815+
"target": "mortgage_convert",
816+
"edge_type": "data_flow"
817+
},
818+
{
819+
"source": "mortgage_convert",
750820
"target": "formula_drop",
751821
"edge_type": "data_flow"
752822
},
@@ -762,12 +832,17 @@
762832
},
763833
{
764834
"source": "util_qrf_s2",
765-
"target": "qrf_pass2",
835+
"target": "cps_only",
766836
"edge_type": "uses_utility"
767837
},
768838
{
769839
"source": "util_qrf_s2",
770-
"target": "cps_only",
840+
"target": "mortgage_hints",
841+
"edge_type": "uses_utility"
842+
},
843+
{
844+
"source": "util_knn_s2",
845+
"target": "clone_features",
771846
"edge_type": "uses_utility"
772847
}
773848
],
@@ -2006,7 +2081,7 @@
20062081
}
20072082
],
20082083
"metadata": {
2009-
"total_nodes": 156,
2010-
"total_edges": 159
2084+
"total_nodes": 162,
2085+
"total_edges": 165
20112086
}
20122087
}

pipeline_stages.yaml

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,8 @@ stages:
236236
- { source: preprocess_puf, target: simulate_qbi, edge_type: data_flow }
237237
- { source: simulate_qbi, target: impute_puf_demographics, edge_type: data_flow }
238238
- { source: in_demographics, target: impute_puf_demographics, edge_type: data_flow, label: "demographics_2015.csv" }
239-
- { source: impute_puf_demographics, target: out_puf, edge_type: produces_artifact, label: "puf_2024.h5" }
239+
- { source: impute_puf_demographics, target: mortgage_convert, edge_type: data_flow }
240+
- { source: mortgage_convert, target: out_puf, edge_type: produces_artifact, label: "puf_2024.h5" }
240241
- { source: in_uprating, target: out_puf, edge_type: data_flow, label: "SOI growth rates" }
241242
# Utility edges
242243
- { source: util_seeded_rng, target: add_takeup, edge_type: uses_utility }
@@ -252,7 +253,7 @@ stages:
252253
- id: 2
253254
label: "Stage 2"
254255
title: "Stage 2: Extended CPS (PUF Clone)"
255-
description: "Merge CPS + PUF via cloning, impute 64 income vars + 51 override vars via QRF"
256+
description: "Merge CPS + PUF via cloning, rematch clone features, QRF-impute incomes and CPS-only vars, then finalize Extended CPS inputs"
256257
country: us
257258
extra_nodes:
258259
- id: in_cps_s2
@@ -267,6 +268,10 @@ stages:
267268
label: "block_cd_distributions.csv.gz"
268269
node_type: input
269270
description: "Census block populations"
271+
- id: in_scf_s2
272+
label: "SCF_2022"
273+
node_type: input
274+
description: "From Stage 0 (mortgage-balance donor sample)"
270275
- id: geo_assign_s2
271276
label: "Geography Assignment"
272277
node_type: process
@@ -279,24 +284,33 @@ stages:
279284
label: "microimpute QRF"
280285
node_type: utility
281286
description: "fit_predict() for sequential imputation"
287+
- id: util_knn_s2
288+
label: "sklearn NearestNeighbors"
289+
node_type: utility
290+
description: "Role-aware donor matching on standardized clone predictors"
282291
edges:
283292
- { source: in_cps_s2, target: geo_assign_s2, edge_type: data_flow, label: "CPS records" }
284293
- { source: in_blocks_s2, target: geo_assign_s2, edge_type: data_flow, label: "block populations" }
285294
- { source: in_puf_s2, target: record_double, edge_type: data_flow, label: "PUF records" }
286295
- { source: in_cps_s2, target: record_double, edge_type: data_flow, label: "CPS records" }
287296
- { source: geo_assign_s2, target: record_double, edge_type: data_flow }
288297
- { source: record_double, target: qrf_pass1, edge_type: data_flow }
289-
- { source: qrf_pass1, target: qrf_pass2, edge_type: data_flow }
290-
- { source: qrf_pass2, target: retire_impute, edge_type: data_flow }
291-
- { source: qrf_pass2, target: weeks_impute, edge_type: data_flow }
298+
- { source: qrf_pass1, target: retire_impute, edge_type: data_flow }
299+
- { source: qrf_pass1, target: weeks_impute, edge_type: data_flow }
292300
- { source: retire_impute, target: ss_reconcile, edge_type: data_flow }
293301
- { source: weeks_impute, target: ss_reconcile, edge_type: data_flow }
294-
- { source: ss_reconcile, target: cps_only, edge_type: data_flow }
295-
- { source: cps_only, target: formula_drop, edge_type: data_flow }
302+
- { source: ss_reconcile, target: clone_features, edge_type: data_flow }
303+
- { source: clone_features, target: cps_only, edge_type: data_flow }
304+
- { source: cps_only, target: qrf_pass2, edge_type: data_flow }
305+
- { source: qrf_pass2, target: mortgage_hints, edge_type: data_flow }
306+
- { source: in_scf_s2, target: mortgage_hints, edge_type: data_flow, label: "SCF donor sample" }
307+
- { source: mortgage_hints, target: mortgage_convert, edge_type: data_flow }
308+
- { source: mortgage_convert, target: formula_drop, edge_type: data_flow }
296309
- { source: formula_drop, target: out_ext, edge_type: produces_artifact }
297310
- { source: util_qrf_s2, target: qrf_pass1, edge_type: uses_utility }
298-
- { source: util_qrf_s2, target: qrf_pass2, edge_type: uses_utility }
299311
- { source: util_qrf_s2, target: cps_only, edge_type: uses_utility }
312+
- { source: util_qrf_s2, target: mortgage_hints, edge_type: uses_utility }
313+
- { source: util_knn_s2, target: clone_features, edge_type: uses_utility }
300314

301315
# ══════════════════════════════════════════════════════════════
302316
# Stage 3a: Enhanced CPS Reweighting (ECPS — deprecated)

policyengine_us_data/datasets/cps/extended_cps.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,14 @@ def _impute_clone_cps_features(
350350
return predictions
351351

352352

353+
@pipeline_node(PipelineNode(
354+
id="clone_features",
355+
label="Clone Feature Rematching",
356+
node_type="process",
357+
description="kNN donor rematch of clone-half sex, race, Hispanic status, and occupation fields",
358+
details="Matches within tax-unit roles using demographics plus imputed income, then derives overtime and tipped-occupation inputs from donor occupations when available",
359+
source_file="policyengine_us_data/datasets/cps/extended_cps.py",
360+
))
353361
def _splice_clone_feature_predictions(
354362
data: dict,
355363
predictions: pd.DataFrame,
@@ -755,7 +763,7 @@ def _apply_post_processing(predictions, X_test, time_period, data):
755763
label="QRF Pass 2: Override Imputation",
756764
node_type="process",
757765
description="Replace the PUF clone half with second-stage CPS-only QRF outputs",
758-
details="Keeps original CPS donor values in the first half and maps person-level predictions onto each target entity before splicing",
766+
details="Keeps original CPS donor values in the first half, maps person-level predictions onto each target entity, and rebuilds capped childcare on the clone half",
759767
source_file="policyengine_us_data/datasets/cps/extended_cps.py",
760768
))
761769
def _splice_cps_only_predictions(

policyengine_us_data/utils/mortgage_interest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
import numpy as np
66
import pandas as pd
77

8+
from policyengine_us_data.pipeline_metadata import pipeline_node
9+
from policyengine_us_data.pipeline_schema import PipelineNode
10+
811

912
STRUCTURAL_MORTGAGE_VARIABLES = (
1013
"first_home_mortgage_balance",
@@ -33,6 +36,14 @@
3336
]
3437

3538

39+
@pipeline_node(PipelineNode(
40+
id="mortgage_hints",
41+
label="Mortgage Balance Hint Imputation",
42+
node_type="process",
43+
description="Impute tax-unit mortgage balance hints from SCF donor balances",
44+
details="Fits a weighted QRF on SCF mortgage holders, predicts first-lien and secondary acquisition-debt balance hints, and enforces conservative nonnegative ordering",
45+
source_file="policyengine_us_data/utils/mortgage_interest.py",
46+
))
3647
def impute_tax_unit_mortgage_balance_hints(
3748
data: Dict[str, Dict[int, np.ndarray]],
3849
time_period: int,
@@ -99,6 +110,14 @@ def impute_tax_unit_mortgage_balance_hints(
99110
return data
100111

101112

113+
@pipeline_node(PipelineNode(
114+
id="mortgage_convert",
115+
label="Structural Mortgage Conversion",
116+
node_type="process",
117+
description="Convert deductible mortgage interest into structural mortgage balances, interest, and origination-year inputs",
118+
details="Preserves current-law deductible mortgage and total interest deductions while deriving first-lien, secondary acquisition-debt, and non-mortgage residual interest inputs",
119+
source_file="policyengine_us_data/utils/mortgage_interest.py",
120+
))
102121
def convert_mortgage_interest_to_structural_inputs(
103122
data: Dict[str, Dict[int, np.ndarray]],
104123
time_period: int,

0 commit comments

Comments
 (0)