From 2e7684372ecdd88327d391ed2793610eaf04aa4b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 11 Apr 2026 10:52:01 -0400 Subject: [PATCH 1/6] Add legacy ESA and JSA proxy fields --- policyengine_uk_data/datasets/frs.py | 130 ++++++++++++++++++ .../tests/test_legacy_benefit_proxies.py | 108 +++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 policyengine_uk_data/tests/test_legacy_benefit_proxies.py diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 7cc05cd3..0f75612e 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -22,6 +22,130 @@ from policyengine_uk_data.parameters import load_take_up_rate, load_parameter +LEGACY_JOBSEEKER_MIN_AGE = 18 +ESA_MIN_AGE = 16 +ESA_HEALTH_EMPLOYMENT_STATUSES = ( + "LONG_TERM_DISABLED", + "SHORT_TERM_DISABLED", +) + + +def derive_legacy_jobseeker_proxy( + age, + employment_status, + hours_worked, + state_pension_age, +) -> np.ndarray: + """Approximate legacy JSA claimant-state from observed survey data. + + This is intentionally a proxy, not a legislative determination. It + identifies person-level working-age adults who report being unemployed + and not currently working any hours. + """ + + age = np.asarray(age) + employment_status = np.asarray(employment_status) + hours_worked = np.asarray(hours_worked) + state_pension_age = np.asarray(state_pension_age) + + return ( + (age >= LEGACY_JOBSEEKER_MIN_AGE) + & (age < state_pension_age) + & (employment_status == "UNEMPLOYED") + & (hours_worked <= 0) + ) + + +def derive_esa_health_condition_proxy( + age, + employment_status, + state_pension_age, +) -> np.ndarray: + """Approximate working-age ESA health-related claimant-state. + + This proxy relies only on person-level labour market status, not on + current disability or incapacity benefit receipt. It is a dataset-side + approximation for future modelling, not a direct observation of ESA + legal entitlement or LCW/LCWRA status. + """ + + age = np.asarray(age) + employment_status = np.asarray(employment_status) + state_pension_age = np.asarray(state_pension_age) + disability_labour_market_state = np.isin( + employment_status, ESA_HEALTH_EMPLOYMENT_STATUSES + ) + + return ( + (age >= ESA_MIN_AGE) + & (age < state_pension_age) + & disability_labour_market_state + ) + + +def derive_esa_support_group_proxy( + age, + employment_status, + hours_worked, + esa_health_condition_proxy, + state_pension_age, +) -> np.ndarray: + """Approximate a severe-health ESA subgroup akin to support group. + + This is a stricter subset of ``esa_health_condition_proxy`` intended + for future legacy ESA approximation work. It uses only non-receipt + labour market signals already available in the survey. + """ + + age = np.asarray(age) + employment_status = np.asarray(employment_status) + hours_worked = np.asarray(hours_worked) + esa_health_condition_proxy = np.asarray(esa_health_condition_proxy) + state_pension_age = np.asarray(state_pension_age) + severe_health_evidence = (employment_status == "LONG_TERM_DISABLED") & ( + hours_worked <= 0 + ) + + return ( + (age >= ESA_MIN_AGE) + & (age < state_pension_age) + & esa_health_condition_proxy + & severe_health_evidence + ) + + +def add_legacy_benefit_proxies( + pe_person: pd.DataFrame, state_pension_age +) -> pd.DataFrame: + """Populate person-scoped ESA/JSA proxy columns on the person frame. + + These remain person-level by design because the claimant-state inputs + they approximate attach to individuals. Downstream benunit-level legacy + benefit models should aggregate them explicitly rather than assuming the + raw survey contains a benunit claimant-state field. + """ + + pe_person["legacy_jobseeker_proxy"] = derive_legacy_jobseeker_proxy( + age=pe_person.age, + employment_status=pe_person.employment_status, + hours_worked=pe_person.hours_worked, + state_pension_age=state_pension_age, + ) + pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy( + age=pe_person.age, + employment_status=pe_person.employment_status, + state_pension_age=state_pension_age, + ) + pe_person["esa_support_group_proxy"] = derive_esa_support_group_proxy( + age=pe_person.age, + employment_status=pe_person.employment_status, + hours_worked=pe_person.hours_worked, + esa_health_condition_proxy=pe_person.esa_health_condition_proxy, + state_pension_age=state_pension_age, + ) + return pe_person + + def create_frs( raw_frs_folder: str, year: int, @@ -744,6 +868,7 @@ def determine_education_level(fted_val, typeed2_val, age_val): sim = Microsimulation(dataset=dataset) region = sim.populations["benunit"].household("region", dataset.time_period) lha_category = sim.calculate("LHA_category", year) + state_pension_age = sim.calculate("state_pension_age", year).values brma = np.empty(len(region), dtype=object) @@ -808,6 +933,11 @@ def determine_education_level(fted_val, typeed2_val, age_val): paragraph_3 | paragraph_4 | paragraph_5 ) + # Dataset-side claimant-state approximations for future legacy ESA/JSA + # modelling. These are explicit proxies based on observed survey + # conditions, not legislative determinations. + pe_person = add_legacy_benefit_proxies(pe_person, state_pension_age) + # Generate stochastic take-up decisions # All randomness is generated here in the data package using take-up rates # stored in YAML parameter files. This keeps the country package purely diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py new file mode 100644 index 00000000..bd77c818 --- /dev/null +++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py @@ -0,0 +1,108 @@ +import inspect + +import numpy as np +import pandas as pd + +from policyengine_uk_data.datasets.frs import ( + add_legacy_benefit_proxies, + create_frs, + derive_esa_health_condition_proxy, + derive_esa_support_group_proxy, + derive_legacy_jobseeker_proxy, +) + + +def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_non_workers(): + result = derive_legacy_jobseeker_proxy( + age=np.array([18, 30, 66, 17, 25, 25, 66]), + employment_status=np.array( + [ + "UNEMPLOYED", + "UNEMPLOYED", + "UNEMPLOYED", + "UNEMPLOYED", + "STUDENT", + "CARER", + "UNEMPLOYED", + ] + ), + hours_worked=np.array([0, 12, 0, 0, 0, 0, 0]), + state_pension_age=np.array([66, 66, 66, 66, 66, 66, 67]), + ) + + assert result.tolist() == [True, False, False, False, False, False, True] + + +def test_esa_health_condition_proxy_uses_disabled_employment_states(): + result = derive_esa_health_condition_proxy( + age=np.array([16, 45, 45, 66]), + employment_status=np.array( + [ + "LONG_TERM_DISABLED", + "SHORT_TERM_DISABLED", + "FT_EMPLOYED", + "LONG_TERM_DISABLED", + ] + ), + state_pension_age=np.array([66, 66, 66, 66]), + ) + + assert result.tolist() == [True, True, False, False] + + +def test_esa_support_group_proxy_is_stricter_subset_of_health_proxy(): + health_proxy = np.array([True, True, True, False]) + result = derive_esa_support_group_proxy( + age=np.array([16, 45, 45, 66]), + employment_status=np.array( + [ + "LONG_TERM_DISABLED", + "SHORT_TERM_DISABLED", + "LONG_TERM_DISABLED", + "FT_EMPLOYED", + ] + ), + hours_worked=np.array([0, 0, 12, 0]), + esa_health_condition_proxy=health_proxy, + state_pension_age=np.array([66, 66, 66, 66]), + ) + + assert result.tolist() == [True, False, False, False] + + +def test_add_legacy_benefit_proxies_wires_all_three_columns(): + pe_person = pd.DataFrame( + { + "age": [18, 45, 45, 66], + "employment_status": [ + "UNEMPLOYED", + "LONG_TERM_DISABLED", + "SHORT_TERM_DISABLED", + "LONG_TERM_DISABLED", + ], + "hours_worked": [0, 0, 12, 0], + "is_disabled_for_benefits": [False, True, False, True], + "is_severely_disabled_for_benefits": [False, False, True, True], + "esa_income_reported": [0.0, 0.0, 100.0, 0.0], + "esa_contrib_reported": [0.0, 0.0, 0.0, 0.0], + "incapacity_benefit_reported": [0.0, 0.0, 0.0, 0.0], + "sda_reported": [0.0, 0.0, 0.0, 0.0], + } + ) + + result = add_legacy_benefit_proxies( + pe_person.copy(), state_pension_age=np.array([66, 66, 66, 66]) + ) + + assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False] + assert result["esa_health_condition_proxy"].tolist() == [False, True, True, False] + assert result["esa_support_group_proxy"].tolist() == [False, True, False, False] + + +def test_create_frs_calls_add_legacy_benefit_proxies(): + source = inspect.getsource(create_frs) + + assert ( + 'state_pension_age = sim.calculate("state_pension_age", year).values' in source + ) + assert "add_legacy_benefit_proxies(pe_person, state_pension_age)" in source From e568e787f3955a6f14467766ecf32fdb9c16c023 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 11 Apr 2026 11:31:26 -0400 Subject: [PATCH 2/6] Harden legacy ESA and JSA proxy coverage --- policyengine_uk_data/datasets/frs.py | 62 ++- .../tests/test_legacy_benefit_proxies.py | 373 +++++++++++++++++- 2 files changed, 405 insertions(+), 30 deletions(-) diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 0f75612e..8cebdd7f 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -23,6 +23,7 @@ LEGACY_JOBSEEKER_MIN_AGE = 18 +LEGACY_JOBSEEKER_MAX_HOURS = 16 ESA_MIN_AGE = 16 ESA_HEALTH_EMPLOYMENT_STATUSES = ( "LONG_TERM_DISABLED", @@ -34,31 +35,38 @@ def derive_legacy_jobseeker_proxy( age, employment_status, hours_worked, + current_education, + employment_status_reported, state_pension_age, ) -> np.ndarray: """Approximate legacy JSA claimant-state from observed survey data. This is intentionally a proxy, not a legislative determination. It identifies person-level working-age adults who report being unemployed - and not currently working any hours. + and working less than the legacy JSA 16-hour limit. """ age = np.asarray(age) employment_status = np.asarray(employment_status) hours_worked = np.asarray(hours_worked) + current_education = np.asarray(current_education) + employment_status_reported = np.asarray(employment_status_reported) state_pension_age = np.asarray(state_pension_age) return ( - (age >= LEGACY_JOBSEEKER_MIN_AGE) + employment_status_reported + & (age >= LEGACY_JOBSEEKER_MIN_AGE) & (age < state_pension_age) & (employment_status == "UNEMPLOYED") - & (hours_worked <= 0) + & (hours_worked < LEGACY_JOBSEEKER_MAX_HOURS) + & (current_education == "NOT_IN_EDUCATION") ) def derive_esa_health_condition_proxy( age, employment_status, + employment_status_reported, state_pension_age, ) -> np.ndarray: """Approximate working-age ESA health-related claimant-state. @@ -71,13 +79,15 @@ def derive_esa_health_condition_proxy( age = np.asarray(age) employment_status = np.asarray(employment_status) + employment_status_reported = np.asarray(employment_status_reported) state_pension_age = np.asarray(state_pension_age) disability_labour_market_state = np.isin( employment_status, ESA_HEALTH_EMPLOYMENT_STATUSES ) return ( - (age >= ESA_MIN_AGE) + employment_status_reported + & (age >= ESA_MIN_AGE) & (age < state_pension_age) & disability_labour_market_state ) @@ -88,6 +98,7 @@ def derive_esa_support_group_proxy( employment_status, hours_worked, esa_health_condition_proxy, + employment_status_reported, state_pension_age, ) -> np.ndarray: """Approximate a severe-health ESA subgroup akin to support group. @@ -101,13 +112,15 @@ def derive_esa_support_group_proxy( employment_status = np.asarray(employment_status) hours_worked = np.asarray(hours_worked) esa_health_condition_proxy = np.asarray(esa_health_condition_proxy) + employment_status_reported = np.asarray(employment_status_reported) state_pension_age = np.asarray(state_pension_age) severe_health_evidence = (employment_status == "LONG_TERM_DISABLED") & ( hours_worked <= 0 ) return ( - (age >= ESA_MIN_AGE) + employment_status_reported + & (age >= ESA_MIN_AGE) & (age < state_pension_age) & esa_health_condition_proxy & severe_health_evidence @@ -115,7 +128,7 @@ def derive_esa_support_group_proxy( def add_legacy_benefit_proxies( - pe_person: pd.DataFrame, state_pension_age + pe_person: pd.DataFrame, employment_status_reported, state_pension_age ) -> pd.DataFrame: """Populate person-scoped ESA/JSA proxy columns on the person frame. @@ -129,11 +142,14 @@ def add_legacy_benefit_proxies( age=pe_person.age, employment_status=pe_person.employment_status, hours_worked=pe_person.hours_worked, + current_education=pe_person.current_education, + employment_status_reported=employment_status_reported, state_pension_age=state_pension_age, ) pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy( age=pe_person.age, employment_status=pe_person.employment_status, + employment_status_reported=employment_status_reported, state_pension_age=state_pension_age, ) pe_person["esa_support_group_proxy"] = derive_esa_support_group_proxy( @@ -141,11 +157,39 @@ def add_legacy_benefit_proxies( employment_status=pe_person.employment_status, hours_worked=pe_person.hours_worked, esa_health_condition_proxy=pe_person.esa_health_condition_proxy, + employment_status_reported=employment_status_reported, state_pension_age=state_pension_age, ) return pe_person +def apply_legacy_benefit_proxies( + pe_person: pd.DataFrame, sim, year: int, employment_status_reported +) -> pd.DataFrame: + """Attach legacy ESA/JSA proxies using post-build simulation context.""" + + state_pension_age = sim.calculate("state_pension_age", year).values + return add_legacy_benefit_proxies( + pe_person, + employment_status_reported=employment_status_reported, + state_pension_age=state_pension_age, + ) + + +def attach_legacy_benefit_proxies_from_frs_person( + pe_person: pd.DataFrame, person: pd.DataFrame, sim, year: int +) -> pd.DataFrame: + """Bridge raw FRS person fields into the proxy derivation hook.""" + + employment_status_reported = person.empstati.fillna(0).to_numpy() > 0 + return apply_legacy_benefit_proxies( + pe_person, + sim, + year, + employment_status_reported=employment_status_reported, + ) + + def create_frs( raw_frs_folder: str, year: int, @@ -868,8 +912,6 @@ def determine_education_level(fted_val, typeed2_val, age_val): sim = Microsimulation(dataset=dataset) region = sim.populations["benunit"].household("region", dataset.time_period) lha_category = sim.calculate("LHA_category", year) - state_pension_age = sim.calculate("state_pension_age", year).values - brma = np.empty(len(region), dtype=object) # Sample from a random BRMA in the region, weighted by the number of observations in each BRMA @@ -936,7 +978,9 @@ def determine_education_level(fted_val, typeed2_val, age_val): # Dataset-side claimant-state approximations for future legacy ESA/JSA # modelling. These are explicit proxies based on observed survey # conditions, not legislative determinations. - pe_person = add_legacy_benefit_proxies(pe_person, state_pension_age) + pe_person = attach_legacy_benefit_proxies_from_frs_person( + pe_person, person, sim, year + ) # Generate stochastic take-up decisions # All randomness is generated here in the data package using take-up rates diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py index bd77c818..f8d6a146 100644 --- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py +++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py @@ -1,10 +1,12 @@ -import inspect - import numpy as np import pandas as pd +import policyengine_uk +import policyengine_uk_data.datasets.frs as frs_module from policyengine_uk_data.datasets.frs import ( add_legacy_benefit_proxies, + attach_legacy_benefit_proxies_from_frs_person, + apply_legacy_benefit_proxies, create_frs, derive_esa_health_condition_proxy, derive_esa_support_group_proxy, @@ -12,62 +14,106 @@ ) -def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_non_workers(): +class FakeSim: + def __init__(self, state_pension_age): + self._state_pension_age = np.asarray(state_pension_age) + + def calculate(self, variable, period): + assert variable == "state_pension_age" + return pd.Series(self._state_pension_age) + + +def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours(): result = derive_legacy_jobseeker_proxy( - age=np.array([18, 30, 66, 17, 25, 25, 66]), + age=np.array([18, 30, 30, 66, 17, 25, 25, 66, 30, 30]), employment_status=np.array( [ "UNEMPLOYED", "UNEMPLOYED", "UNEMPLOYED", "UNEMPLOYED", + "UNEMPLOYED", "STUDENT", "CARER", "UNEMPLOYED", + "UNEMPLOYED", + "UNEMPLOYED", ] ), - hours_worked=np.array([0, 12, 0, 0, 0, 0, 0]), - state_pension_age=np.array([66, 66, 66, 66, 66, 66, 67]), + hours_worked=np.array([0, 12, 16, 0, 0, 0, 0, 0, 0, 0]), + current_education=np.array( + [ + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "TERTIARY", + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "UPPER_SECONDARY", + "NOT_IN_EDUCATION", + ] + ), + employment_status_reported=np.array( + [True, True, True, True, True, True, True, True, True, False] + ), + state_pension_age=np.array([66, 66, 66, 66, 66, 66, 66, 67, 66, 66]), ) - assert result.tolist() == [True, False, False, False, False, False, True] + assert result.tolist() == [ + True, + True, + False, + False, + False, + False, + False, + True, + False, + False, + ] def test_esa_health_condition_proxy_uses_disabled_employment_states(): result = derive_esa_health_condition_proxy( - age=np.array([16, 45, 45, 66]), + age=np.array([16, 45, 45, 66, 45]), employment_status=np.array( [ "LONG_TERM_DISABLED", "SHORT_TERM_DISABLED", "FT_EMPLOYED", "LONG_TERM_DISABLED", + "LONG_TERM_DISABLED", ] ), - state_pension_age=np.array([66, 66, 66, 66]), + employment_status_reported=np.array([True, True, True, True, False]), + state_pension_age=np.array([66, 66, 66, 66, 66]), ) - assert result.tolist() == [True, True, False, False] + assert result.tolist() == [True, True, False, False, False] def test_esa_support_group_proxy_is_stricter_subset_of_health_proxy(): - health_proxy = np.array([True, True, True, False]) + health_proxy = np.array([True, True, True, False, True]) result = derive_esa_support_group_proxy( - age=np.array([16, 45, 45, 66]), + age=np.array([16, 45, 45, 66, 45]), employment_status=np.array( [ "LONG_TERM_DISABLED", "SHORT_TERM_DISABLED", "LONG_TERM_DISABLED", "FT_EMPLOYED", + "LONG_TERM_DISABLED", ] ), - hours_worked=np.array([0, 0, 12, 0]), + hours_worked=np.array([0, 0, 12, 0, 0]), esa_health_condition_proxy=health_proxy, - state_pension_age=np.array([66, 66, 66, 66]), + employment_status_reported=np.array([True, True, True, True, False]), + state_pension_age=np.array([66, 66, 66, 66, 66]), ) - assert result.tolist() == [True, False, False, False] + assert result.tolist() == [True, False, False, False, False] def test_add_legacy_benefit_proxies_wires_all_three_columns(): @@ -81,6 +127,12 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): "LONG_TERM_DISABLED", ], "hours_worked": [0, 0, 12, 0], + "current_education": [ + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + "NOT_IN_EDUCATION", + ], "is_disabled_for_benefits": [False, True, False, True], "is_severely_disabled_for_benefits": [False, False, True, True], "esa_income_reported": [0.0, 0.0, 100.0, 0.0], @@ -91,7 +143,9 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): ) result = add_legacy_benefit_proxies( - pe_person.copy(), state_pension_age=np.array([66, 66, 66, 66]) + pe_person.copy(), + employment_status_reported=np.array([True, True, True, False]), + state_pension_age=np.array([66, 66, 66, 66]), ) assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False] @@ -99,10 +153,287 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): assert result["esa_support_group_proxy"].tolist() == [False, True, False, False] -def test_create_frs_calls_add_legacy_benefit_proxies(): - source = inspect.getsource(create_frs) +def test_apply_legacy_benefit_proxies_uses_sim_state_pension_age(): + pe_person = pd.DataFrame( + { + "age": [66, 66], + "employment_status": ["UNEMPLOYED", "UNEMPLOYED"], + "hours_worked": [0, 0], + "current_education": ["NOT_IN_EDUCATION", "NOT_IN_EDUCATION"], + } + ) + + result = apply_legacy_benefit_proxies( + pe_person.copy(), + FakeSim([66, 67]), + 2025, + employment_status_reported=np.array([True, True]), + ) + + assert result["legacy_jobseeker_proxy"].tolist() == [False, True] + + +def test_attach_legacy_benefit_proxies_from_frs_person_uses_empstati_mask(): + pe_person = pd.DataFrame( + { + "age": [30, 30], + "employment_status": ["UNEMPLOYED", "LONG_TERM_DISABLED"], + "hours_worked": [12, 0], + "current_education": ["NOT_IN_EDUCATION", "NOT_IN_EDUCATION"], + } + ) + person = pd.DataFrame({"empstati": [1, np.nan]}) + + result = attach_legacy_benefit_proxies_from_frs_person( + pe_person.copy(), + person, + FakeSim([66, 66]), + 2025, + ) + + assert result["legacy_jobseeker_proxy"].tolist() == [True, False] + assert result["esa_health_condition_proxy"].tolist() == [False, False] + assert result["esa_support_group_proxy"].tolist() == [False, False] + + +class FakeBenunitPopulation: + def __init__(self, dataset): + self.dataset = dataset + + def household(self, variable, period): + if variable == "region": + return np.array(["LONDON"]) + if variable == "household_id": + return np.array([100]) + raise KeyError(variable) + - assert ( - 'state_pension_age = sim.calculate("state_pension_age", year).values' in source +class FakeMicrosimulation: + def __init__(self, dataset): + self.dataset = dataset + self.populations = {"benunit": FakeBenunitPopulation(dataset)} + self.tax_benefit_system = type( + "FakeTaxBenefitSystem", + (), + { + "parameters": lambda self, year: type( + "FakeParametersRoot", + (), + { + "gov": type( + "FakeGov", + (), + { + "dwp": type( + "FakeDwp", + (), + { + "dla": type( + "FakeDla", + (), + { + "self_care": type( + "FakeSelfCare", + (), + {"higher": 1}, + )() + }, + )(), + "pip": type( + "FakePip", + (), + { + "daily_living": type( + "FakeDailyLiving", + (), + {"enhanced": 1}, + )() + }, + )(), + }, + )() + }, + )() + }, + )() + }, + )() + + def calculate(self, variable, year=None): + if variable == "LHA_category": + return np.array(["A"]) + if variable == "household_id": + return np.array([100]) + if variable == "state_pension_age": + return pd.Series([66]) + raise KeyError(variable) + + +def test_create_frs_smoke_includes_legacy_proxy_columns(tmp_path, monkeypatch): + original_read_csv = frs_module.pd.read_csv + + def fake_read_csv(path, *args, **kwargs): + if str(path).endswith("lha_list_of_rents.csv.gz"): + return pd.DataFrame( + {"region": ["LONDON"], "lha_category": ["A"], "brma": ["BRMA1"]} + ) + return original_read_csv(path, *args, **kwargs) + + monkeypatch.setattr(policyengine_uk, "Microsimulation", FakeMicrosimulation) + monkeypatch.setattr(frs_module.pd, "read_csv", fake_read_csv) + monkeypatch.setattr(frs_module, "load_take_up_rate", lambda *args, **kwargs: 0.0) + monkeypatch.setattr(frs_module, "load_parameter", lambda *args, **kwargs: 0.0) + monkeypatch.setattr( + frs_module, "sum_to_entity", lambda values, ids, index: np.zeros(len(index)) + ) + monkeypatch.setattr( + frs_module, + "sum_from_positive_fields", + lambda table, fields: np.zeros(len(table)), ) - assert "add_legacy_benefit_proxies(pe_person, state_pension_age)" in source + monkeypatch.setattr( + frs_module, + "sum_positive_variables", + lambda variables: ( + np.sum(np.vstack([np.asarray(v) for v in variables]), axis=0) + if variables + else 0 + ), + ) + monkeypatch.setattr( + frs_module, + "fill_with_mean", + lambda table, indicator, amount: np.zeros(len(table)), + ) + + adult = pd.DataFrame( + [ + { + "sernum": 100, + "benunit": 1, + "person": 1, + "accssamt": 0, + "adema": 0, + "ademaamt": 0, + "age": 30, + "age80": 30, + "cvpay": 0, + "educft": 0, + "educqual": 0, + "eduma": 0, + "edumaamt": 0, + "empstati": 8, + "fsbval": 0, + "fsfvval": 0, + "fsmval": 0, + "fted": 0, + "heartval": 0, + "hrpid": 1, + "inearns": 0, + "marital": 0, + "mntamt1": 0, + "mntamt2": 0, + "mntus1": 0, + "mntusam1": 0, + "redamt": 0, + "royyr1": 0, + "seincam2": 0, + "sex": 1, + "slrepamt": 0, + "smpadj": 0, + "sspadj": 0, + "tothours": 0, + "tuborr": 0, + "typeed2": 0, + "uperson": 1, + "allpay2": 0, + "royyr2": 0, + "royyr3": 0, + "royyr4": 0, + "chamtern": 0, + "chamttst": 0, + "apamt": 0, + "apdamt": 0, + "pareamt": 0, + "allpay3": 0, + "allpay4": 0, + "grtdir1": 0, + "grtdir2": 0, + } + ] + ) + child = pd.DataFrame(columns=adult.columns) + benunit = pd.DataFrame([{"sernum": 100, "benunit": 1, "famtypb2": 1}]) + househol = pd.DataFrame( + [ + { + "sernum": 100, + "adulth": 1, + "bedroom6": 1, + "csewamt": 0, + "ctannual": 0, + "ctband": 1, + "ctrebamt": 0, + "cwatamtd": 0, + "gross4": 0, + "gvtregno": 1, + "hhrent": 0, + "mortint": 0, + "ptentyp2": 0, + "rt2rebam": 0, + "struins": 0, + "subrent": 0, + "tentyp2": 0, + "typeacc": 0, + "watsewrt": 0, + "niratlia": 0, + **{f"chrgamt{i}": 0 for i in range(1, 10)}, + } + ] + ) + raw_tables = { + "adult": adult, + "child": child, + "benunit": benunit, + "househol": househol, + "pension": pd.DataFrame( + columns=[ + "person", + "sernum", + "penoth", + "penpay", + "poamt", + "poinc", + "ptamt", + "ptinc", + ] + ), + "oddjob": pd.DataFrame(columns=["person", "sernum", "ojamt", "ojnow"]), + "accounts": pd.DataFrame( + columns=["person", "sernum", "accint", "acctax", "invtax", "account"] + ), + "job": pd.DataFrame(columns=["person", "sernum", "deduc1", "spnamt", "salsac"]), + "benefits": pd.DataFrame( + columns=["person", "sernum", "benamt", "benefit", "var2"] + ), + "maint": pd.DataFrame(columns=["person", "sernum", "mramt", "mruamt", "mrus"]), + "penprov": pd.DataFrame(columns=["person", "sernum", "penamt", "stemppen"]), + "chldcare": pd.DataFrame( + columns=["person", "sernum", "chamt", "cost", "registrd"] + ), + "extchild": pd.DataFrame(columns=["sernum", "nhhamt"]), + "mortgage": pd.DataFrame( + columns=["sernum", "borramt", "mortend", "rmamt", "rmort"] + ), + } + + for name, table in raw_tables.items(): + table.to_csv(tmp_path / f"{name}.tab", sep="\t", index=False) + + dataset = create_frs(tmp_path, 2025) + + assert { + "legacy_jobseeker_proxy", + "esa_health_condition_proxy", + "esa_support_group_proxy", + }.issubset(dataset.person.columns) From 3a709663b64751f995d6692c6c828c259a46c0ee Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 11 Apr 2026 11:34:11 -0400 Subject: [PATCH 3/6] Match JSA proxy hours to annualized units --- policyengine_uk_data/datasets/frs.py | 12 +++++++++--- .../tests/test_legacy_benefit_proxies.py | 8 ++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 8cebdd7f..0b2ef6ff 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -23,7 +23,11 @@ LEGACY_JOBSEEKER_MIN_AGE = 18 -LEGACY_JOBSEEKER_MAX_HOURS = 16 +HOURS_WORKED_WEEKS_PER_YEAR = 52 +LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS = 16 +LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS = ( + LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS * HOURS_WORKED_WEEKS_PER_YEAR +) ESA_MIN_AGE = 16 ESA_HEALTH_EMPLOYMENT_STATUSES = ( "LONG_TERM_DISABLED", @@ -43,7 +47,9 @@ def derive_legacy_jobseeker_proxy( This is intentionally a proxy, not a legislative determination. It identifies person-level working-age adults who report being unemployed - and working less than the legacy JSA 16-hour limit. + and working less than the legacy JSA 16-hour weekly limit. The + ``hours_worked`` input is the annualised FRS-derived measure used in the + dataset, so the threshold is converted to annual hours here. """ age = np.asarray(age) @@ -58,7 +64,7 @@ def derive_legacy_jobseeker_proxy( & (age >= LEGACY_JOBSEEKER_MIN_AGE) & (age < state_pension_age) & (employment_status == "UNEMPLOYED") - & (hours_worked < LEGACY_JOBSEEKER_MAX_HOURS) + & (hours_worked < LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS) & (current_education == "NOT_IN_EDUCATION") ) diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py index f8d6a146..29846f61 100644 --- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py +++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py @@ -40,7 +40,7 @@ def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours(): "UNEMPLOYED", ] ), - hours_worked=np.array([0, 12, 16, 0, 0, 0, 0, 0, 0, 0]), + hours_worked=np.array([0, 12 * 52, 16 * 52, 0, 0, 0, 0, 0, 0, 0]), current_education=np.array( [ "NOT_IN_EDUCATION", @@ -107,7 +107,7 @@ def test_esa_support_group_proxy_is_stricter_subset_of_health_proxy(): "LONG_TERM_DISABLED", ] ), - hours_worked=np.array([0, 0, 12, 0, 0]), + hours_worked=np.array([0, 0, 12 * 52, 0, 0]), esa_health_condition_proxy=health_proxy, employment_status_reported=np.array([True, True, True, True, False]), state_pension_age=np.array([66, 66, 66, 66, 66]), @@ -126,7 +126,7 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): "SHORT_TERM_DISABLED", "LONG_TERM_DISABLED", ], - "hours_worked": [0, 0, 12, 0], + "hours_worked": [0, 0, 12 * 52, 0], "current_education": [ "NOT_IN_EDUCATION", "NOT_IN_EDUCATION", @@ -178,7 +178,7 @@ def test_attach_legacy_benefit_proxies_from_frs_person_uses_empstati_mask(): { "age": [30, 30], "employment_status": ["UNEMPLOYED", "LONG_TERM_DISABLED"], - "hours_worked": [12, 0], + "hours_worked": [12 * 52, 0], "current_education": ["NOT_IN_EDUCATION", "NOT_IN_EDUCATION"], } ) From e13fcd0ff90a70c2176cb0b73f9632f83342e74c Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 11 Apr 2026 22:35:17 -0400 Subject: [PATCH 4/6] Source JSA proxy hours from policyengine-uk --- policyengine_uk_data/datasets/frs.py | 31 ++++++++++++++----- .../tests/test_legacy_benefit_proxies.py | 10 ++++++ 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 0b2ef6ff..8f20aa84 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -7,10 +7,13 @@ modelling and policy analysis. """ -from policyengine_uk.data import UKSingleYearDataset +from functools import lru_cache from pathlib import Path -import pandas as pd + import numpy as np +import pandas as pd +from policyengine_uk import CountryTaxBenefitSystem +from policyengine_uk.data import UKSingleYearDataset from policyengine_uk_data.utils.datasets import ( sum_to_entity, categorical, @@ -24,10 +27,6 @@ LEGACY_JOBSEEKER_MIN_AGE = 18 HOURS_WORKED_WEEKS_PER_YEAR = 52 -LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS = 16 -LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS = ( - LEGACY_JOBSEEKER_MAX_WEEKLY_HOURS * HOURS_WORKED_WEEKS_PER_YEAR -) ESA_MIN_AGE = 16 ESA_HEALTH_EMPLOYMENT_STATUSES = ( "LONG_TERM_DISABLED", @@ -35,6 +34,15 @@ ) +@lru_cache(maxsize=None) +def load_legacy_jobseeker_max_annual_hours(year: int) -> int: + """Read the JSA single-claimant hours rule from policyengine-uk.""" + + system = CountryTaxBenefitSystem() + max_weekly_hours = int(system.parameters.gov.dwp.JSA.hours.single(str(year))) + return max_weekly_hours * HOURS_WORKED_WEEKS_PER_YEAR + + def derive_legacy_jobseeker_proxy( age, employment_status, @@ -42,6 +50,7 @@ def derive_legacy_jobseeker_proxy( current_education, employment_status_reported, state_pension_age, + max_annual_hours, ) -> np.ndarray: """Approximate legacy JSA claimant-state from observed survey data. @@ -64,7 +73,7 @@ def derive_legacy_jobseeker_proxy( & (age >= LEGACY_JOBSEEKER_MIN_AGE) & (age < state_pension_age) & (employment_status == "UNEMPLOYED") - & (hours_worked < LEGACY_JOBSEEKER_MAX_ANNUAL_HOURS) + & (hours_worked < max_annual_hours) & (current_education == "NOT_IN_EDUCATION") ) @@ -134,7 +143,10 @@ def derive_esa_support_group_proxy( def add_legacy_benefit_proxies( - pe_person: pd.DataFrame, employment_status_reported, state_pension_age + pe_person: pd.DataFrame, + employment_status_reported, + state_pension_age, + legacy_jobseeker_max_annual_hours, ) -> pd.DataFrame: """Populate person-scoped ESA/JSA proxy columns on the person frame. @@ -151,6 +163,7 @@ def add_legacy_benefit_proxies( current_education=pe_person.current_education, employment_status_reported=employment_status_reported, state_pension_age=state_pension_age, + max_annual_hours=legacy_jobseeker_max_annual_hours, ) pe_person["esa_health_condition_proxy"] = derive_esa_health_condition_proxy( age=pe_person.age, @@ -175,10 +188,12 @@ def apply_legacy_benefit_proxies( """Attach legacy ESA/JSA proxies using post-build simulation context.""" state_pension_age = sim.calculate("state_pension_age", year).values + legacy_jobseeker_max_annual_hours = load_legacy_jobseeker_max_annual_hours(year) return add_legacy_benefit_proxies( pe_person, employment_status_reported=employment_status_reported, state_pension_age=state_pension_age, + legacy_jobseeker_max_annual_hours=legacy_jobseeker_max_annual_hours, ) diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py index 29846f61..4f754d54 100644 --- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py +++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py @@ -11,6 +11,7 @@ derive_esa_health_condition_proxy, derive_esa_support_group_proxy, derive_legacy_jobseeker_proxy, + load_legacy_jobseeker_max_annual_hours, ) @@ -24,6 +25,7 @@ def calculate(self, variable, period): def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours(): + max_annual_hours = load_legacy_jobseeker_max_annual_hours(2025) result = derive_legacy_jobseeker_proxy( age=np.array([18, 30, 30, 66, 17, 25, 25, 66, 30, 30]), employment_status=np.array( @@ -59,6 +61,7 @@ def test_legacy_jobseeker_proxy_tracks_unemployed_working_age_low_hours(): [True, True, True, True, True, True, True, True, True, False] ), state_pension_age=np.array([66, 66, 66, 66, 66, 66, 66, 67, 66, 66]), + max_annual_hours=max_annual_hours, ) assert result.tolist() == [ @@ -146,6 +149,9 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): pe_person.copy(), employment_status_reported=np.array([True, True, True, False]), state_pension_age=np.array([66, 66, 66, 66]), + legacy_jobseeker_max_annual_hours=load_legacy_jobseeker_max_annual_hours( + 2025 + ), ) assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False] @@ -153,6 +159,10 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): assert result["esa_support_group_proxy"].tolist() == [False, True, False, False] +def test_legacy_jobseeker_hours_limit_matches_policyengine_uk_parameter(): + assert load_legacy_jobseeker_max_annual_hours(2025) == 16 * 52 + + def test_apply_legacy_benefit_proxies_uses_sim_state_pension_age(): pe_person = pd.DataFrame( { From 29d3b1b1017d0796b3bfc11d783079e849e37790 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 11 Apr 2026 22:51:49 -0400 Subject: [PATCH 5/6] Use policyengine-uk employment status enum --- policyengine_uk_data/datasets/frs.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 8f20aa84..c1692475 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -14,6 +14,9 @@ import pandas as pd from policyengine_uk import CountryTaxBenefitSystem from policyengine_uk.data import UKSingleYearDataset +from policyengine_uk.variables.household.income.employment_status import ( + EmploymentStatus, +) from policyengine_uk_data.utils.datasets import ( sum_to_entity, categorical, @@ -29,8 +32,8 @@ HOURS_WORKED_WEEKS_PER_YEAR = 52 ESA_MIN_AGE = 16 ESA_HEALTH_EMPLOYMENT_STATUSES = ( - "LONG_TERM_DISABLED", - "SHORT_TERM_DISABLED", + EmploymentStatus.LONG_TERM_DISABLED.name, + EmploymentStatus.SHORT_TERM_DISABLED.name, ) From 49cb77a74cbac7d6b502adc51a1e3644df83bb00 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 11 Apr 2026 22:56:22 -0400 Subject: [PATCH 6/6] Format legacy benefit proxy files --- policyengine_uk_data/tests/test_legacy_benefit_proxies.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py index 4f754d54..07479277 100644 --- a/policyengine_uk_data/tests/test_legacy_benefit_proxies.py +++ b/policyengine_uk_data/tests/test_legacy_benefit_proxies.py @@ -149,9 +149,7 @@ def test_add_legacy_benefit_proxies_wires_all_three_columns(): pe_person.copy(), employment_status_reported=np.array([True, True, True, False]), state_pension_age=np.array([66, 66, 66, 66]), - legacy_jobseeker_max_annual_hours=load_legacy_jobseeker_max_annual_hours( - 2025 - ), + legacy_jobseeker_max_annual_hours=load_legacy_jobseeker_max_annual_hours(2025), ) assert result["legacy_jobseeker_proxy"].tolist() == [True, False, False, False]