diff --git a/policyengine_uk_data/datasets/imputations/income.py b/policyengine_uk_data/datasets/imputations/income.py index d39392b74..e961c3691 100644 --- a/policyengine_uk_data/datasets/imputations/income.py +++ b/policyengine_uk_data/datasets/imputations/income.py @@ -123,21 +123,6 @@ def generate_spi_table(spi: pd.DataFrame): INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl" -def _safe_rescale_factor(original: float, new: float) -> float: - """Return the rent/mortgage rescaling factor used after income imputation. - - Guards against a degenerate input where the seed dataset's imputation - columns sum to zero (e.g. the zero-weight synthetic copy used in - ``impute_income`` before incomes have been populated). In that case we - cannot compute a meaningful ratio, so leave housing costs untouched - (factor=1.0) rather than raising ``ZeroDivisionError`` or silently - propagating NaN / inf into downstream household tables. - """ - if original == 0: - return 1.0 - return new / original - - def save_imputation_models(): """ Train and save income imputation model. @@ -198,22 +183,20 @@ def impute_over_incomes( dataset = dataset.copy() sim = Microsimulation(dataset=dataset) input_df = sim.calculate_dataframe(["age", "gender", "region"]) - original_income_total = dataset.person[INCOME_COMPONENTS].copy().sum().sum() output_df = model.predict(input_df) for column in output_variables: dataset.person[column] = output_df[column].fillna(0).values - new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum() - adjustment_factor = _safe_rescale_factor(original_income_total, new_income_total) - # Adjust rent and mortgage interest and capital repayments proportionally - dataset.household["rent"] = dataset.household["rent"] * adjustment_factor - dataset.household["mortgage_interest_repayment"] = ( - dataset.household["mortgage_interest_repayment"] * adjustment_factor - ) - dataset.household["mortgage_capital_repayment"] = ( - dataset.household["mortgage_capital_repayment"] * adjustment_factor - ) + # Housing costs (rent, mortgage interest, mortgage capital) used to be + # rescaled here by new_income_total / original_income_total across + # INCOME_COMPONENTS. Because FRS dividend_income is near-zero and the + # SPI-trained QRF predicts materially larger dividends, the ratio + # inflated rent/mortgage by ~2.5× uniformly in the built enhanced FRS + # — pushing AHC poverty rates 10–18 pp above HBAI for non-pensioners + # (see issue #367). Housing costs now pass through unchanged; their + # year-on-year growth is handled by per-variable OBR uprating indices, + # not by income-imputation side-effects. return dataset diff --git a/policyengine_uk_data/tests/test_income_imputation_preserves_housing_costs.py b/policyengine_uk_data/tests/test_income_imputation_preserves_housing_costs.py new file mode 100644 index 000000000..97d7509be --- /dev/null +++ b/policyengine_uk_data/tests/test_income_imputation_preserves_housing_costs.py @@ -0,0 +1,156 @@ +"""Regression test for issue #367: housing-cost pass-through. + +Before this fix, ``impute_over_incomes`` multiplied ``rent``, +``mortgage_interest_repayment`` and ``mortgage_capital_repayment`` by +``new_income_total / original_income_total``. Because FRS under-reports +dividends while the SPI-trained QRF predicts realistic dividend values, +the ratio inflated housing costs ~2.5× in the built enhanced FRS, +pushing AHC poverty rates 10-18 pp above HBAI for non-pensioners. + +These tests guard against the rescaling coming back in any form. +""" + +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + + +class _FakeQRFModel: + """Minimal stub with the interface `impute_over_incomes` expects.""" + + def __init__(self, imputations, multiplier: float = 1.0): + self._imputations = list(imputations) + self._multiplier = multiplier + + def predict(self, X: pd.DataFrame) -> pd.DataFrame: + # Plausible non-negative predictions, scaled by a caller-controlled + # multiplier so we can simulate huge dividend imputations on FRS + # donor rows without needing a real SPI-trained QRF. + n = len(X) + rng = np.random.default_rng(0) + return pd.DataFrame( + { + col: rng.exponential(1_000 * self._multiplier, size=n) + for col in self._imputations + } + ) + + +def _tiny_frs_dataset(): + """Load the committed tiny FRS dataset (1 000 households) if available.""" + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.storage import STORAGE_FOLDER + + path = STORAGE_FOLDER / "frs_2023_24_tiny.h5" + if not path.exists(): + pytest.skip("Tiny FRS dataset not available") + return UKSingleYearDataset(path) + + +def test_housing_costs_pass_through_unchanged(): + """Housing-cost columns must be byte-identical on exit. + + Even when imputed dividends dwarf the FRS baseline (multiplier=100), + rent and mortgage values must not be rescaled. + """ + from policyengine_uk_data.datasets.imputations.income import ( + IMPUTATIONS, + impute_over_incomes, + ) + + ds = _tiny_frs_dataset() + rent_in = ds.household["rent"].to_numpy().copy() + mi_in = ds.household["mortgage_interest_repayment"].to_numpy().copy() + mc_in = ds.household["mortgage_capital_repayment"].to_numpy().copy() + + # If the old rescaling logic were still here, rent/mortgage would come + # out ~100× larger. + model = _FakeQRFModel(IMPUTATIONS, multiplier=100.0) + result = impute_over_incomes(ds, model, ["dividend_income"]) + + np.testing.assert_array_equal(result.household["rent"].to_numpy(), rent_in) + np.testing.assert_array_equal( + result.household["mortgage_interest_repayment"].to_numpy(), mi_in + ) + np.testing.assert_array_equal( + result.household["mortgage_capital_repayment"].to_numpy(), mc_in + ) + + +def test_only_listed_outputs_are_overwritten(): + """`output_variables` may be touched; other income columns must not be.""" + from policyengine_uk_data.datasets.imputations.income import ( + IMPUTATIONS, + impute_over_incomes, + ) + + ds = _tiny_frs_dataset() + employment_in = ds.person["employment_income"].to_numpy().copy() + dividend_in = ds.person["dividend_income"].to_numpy().copy() + + model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0) + result = impute_over_incomes(ds, model, ["dividend_income"]) + + np.testing.assert_array_equal( + result.person["employment_income"].to_numpy(), employment_in + ) + # dividend_income was listed — prediction output should differ from the + # near-zero FRS baseline for at least most rows. + assert not np.array_equal(result.person["dividend_income"].to_numpy(), dividend_in) + + +def test_housing_costs_preserved_when_income_baseline_is_zero(): + """Covers the zero-baseline shape the old `_safe_rescale_factor` guarded.""" + from policyengine_uk_data.datasets.imputations.income import ( + IMPUTATIONS, + INCOME_COMPONENTS, + impute_over_incomes, + ) + + ds = _tiny_frs_dataset() + for col in INCOME_COMPONENTS: + if col in ds.person.columns: + ds.person[col] = 0.0 + + rent_in = ds.household["rent"].to_numpy().copy() + model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0) + result = impute_over_incomes(ds, model, ["dividend_income"]) + + out = result.household["rent"].to_numpy() + assert np.all(np.isfinite(out)) + np.testing.assert_array_equal(out, rent_in) + + +def test_built_enhanced_frs_housing_costs_track_raw_frs(): + """Regression: after build, enhanced FRS per-renter rent should be close + to raw FRS per-renter rent (modulo small uprating / calibration effects). + + Pre-fix, the built enhanced FRS had rent values 2.5× raw FRS. The tight + tolerance here (30 %) will fail on any dataset rebuilt with the old + rescaling logic. + """ + from policyengine_uk.data import UKSingleYearDataset + from policyengine_uk_data.storage import STORAGE_FOLDER + + raw_path = STORAGE_FOLDER / "frs_2023_24.h5" + enh_path = STORAGE_FOLDER / "enhanced_frs_2023_24.h5" + if not (raw_path.exists() and enh_path.exists()): + pytest.skip("Full raw and enhanced FRS datasets not available") + + raw = UKSingleYearDataset(raw_path) + enh = UKSingleYearDataset(enh_path) + + for col in ("rent", "mortgage_interest_repayment", "mortgage_capital_repayment"): + r = raw.household[col].to_numpy() + e = enh.household[col].to_numpy() + r_med = float(np.median(r[r > 0])) if (r > 0).any() else 0.0 + e_med = float(np.median(e[e > 0])) if (e > 0).any() else 0.0 + assert r_med > 0, f"Raw FRS has no positive {col}" + ratio = e_med / r_med + assert 0.7 < ratio < 1.3, ( + f"Enhanced {col} median (£{e_med:,.0f}) diverges from raw " + f"(£{r_med:,.0f}) by ratio {ratio:.2f}x; expected near 1.0. " + "Housing-cost rescaling may have been reintroduced (see #367)." + ) diff --git a/policyengine_uk_data/tests/test_income_rescale_factor.py b/policyengine_uk_data/tests/test_income_rescale_factor.py deleted file mode 100644 index 30d7c7e86..000000000 --- a/policyengine_uk_data/tests/test_income_rescale_factor.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Unit tests for the rent/mortgage rescale factor helper in income.py. - -Guards the zero-division bug reported in the bug hunt (finding U3): -`impute_over_incomes` computed ``new_income_total / original_income_total`` -with no check for the degenerate case where the seed dataset had zero in -every imputation column — which is exactly the shape of the -`zero_weight_copy` branch inside `impute_income`. -""" - -from __future__ import annotations - -import math - -import pytest - - -def test_safe_rescale_factor_with_zero_original_returns_one(): - from policyengine_uk_data.datasets.imputations.income import ( - _safe_rescale_factor, - ) - - # The bug: dividing by zero raised ZeroDivisionError (or produced inf). - # The fix: leave housing costs untouched when we have no baseline. - assert _safe_rescale_factor(0, 123_456) == 1.0 - assert _safe_rescale_factor(0.0, 0.0) == 1.0 - - -def test_safe_rescale_factor_with_nonzero_original_returns_ratio(): - from policyengine_uk_data.datasets.imputations.income import ( - _safe_rescale_factor, - ) - - assert _safe_rescale_factor(1_000.0, 2_500.0) == pytest.approx(2.5) - assert _safe_rescale_factor(42.0, 42.0) == pytest.approx(1.0) - - -def test_safe_rescale_factor_preserves_finiteness(): - from policyengine_uk_data.datasets.imputations.income import ( - _safe_rescale_factor, - ) - - # Non-zero inputs must still return finite floats. - for original, new in [(1e9, 2e9), (1e-6, 1e-9), (100.0, 0.0)]: - factor = _safe_rescale_factor(original, new) - assert math.isfinite(factor), (original, new, factor)