Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 9 additions & 26 deletions policyengine_uk_data/datasets/imputations/income.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,21 +123,6 @@ def generate_spi_table(spi: pd.DataFrame):
INCOME_MODEL_PATH = STORAGE_FOLDER / "income.pkl"


def _safe_rescale_factor(original: float, new: float) -> float:
"""Return the rent/mortgage rescaling factor used after income imputation.

Guards against a degenerate input where the seed dataset's imputation
columns sum to zero (e.g. the zero-weight synthetic copy used in
``impute_income`` before incomes have been populated). In that case we
cannot compute a meaningful ratio, so leave housing costs untouched
(factor=1.0) rather than raising ``ZeroDivisionError`` or silently
propagating NaN / inf into downstream household tables.
"""
if original == 0:
return 1.0
return new / original


def save_imputation_models():
"""
Train and save income imputation model.
Expand Down Expand Up @@ -198,22 +183,20 @@ def impute_over_incomes(
dataset = dataset.copy()
sim = Microsimulation(dataset=dataset)
input_df = sim.calculate_dataframe(["age", "gender", "region"])
original_income_total = dataset.person[INCOME_COMPONENTS].copy().sum().sum()
output_df = model.predict(input_df)

for column in output_variables:
dataset.person[column] = output_df[column].fillna(0).values

new_income_total = dataset.person[INCOME_COMPONENTS].sum().sum()
adjustment_factor = _safe_rescale_factor(original_income_total, new_income_total)
# Adjust rent and mortgage interest and capital repayments proportionally
dataset.household["rent"] = dataset.household["rent"] * adjustment_factor
dataset.household["mortgage_interest_repayment"] = (
dataset.household["mortgage_interest_repayment"] * adjustment_factor
)
dataset.household["mortgage_capital_repayment"] = (
dataset.household["mortgage_capital_repayment"] * adjustment_factor
)
# Housing costs (rent, mortgage interest, mortgage capital) used to be
# rescaled here by new_income_total / original_income_total across
# INCOME_COMPONENTS. Because FRS dividend_income is near-zero and the
# SPI-trained QRF predicts materially larger dividends, the ratio
# inflated rent/mortgage by ~2.5× uniformly in the built enhanced FRS
# — pushing AHC poverty rates 10–18 pp above HBAI for non-pensioners
# (see issue #367). Housing costs now pass through unchanged; their
# year-on-year growth is handled by per-variable OBR uprating indices,
# not by income-imputation side-effects.

return dataset

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""Regression test for issue #367: housing-cost pass-through.

Before this fix, ``impute_over_incomes`` multiplied ``rent``,
``mortgage_interest_repayment`` and ``mortgage_capital_repayment`` by
``new_income_total / original_income_total``. Because FRS under-reports
dividends while the SPI-trained QRF predicts realistic dividend values,
the ratio inflated housing costs ~2.5× in the built enhanced FRS,
pushing AHC poverty rates 10-18 pp above HBAI for non-pensioners.

These tests guard against the rescaling coming back in any form.
"""

from __future__ import annotations

import numpy as np
import pandas as pd
import pytest


class _FakeQRFModel:
"""Minimal stub with the interface `impute_over_incomes` expects."""

def __init__(self, imputations, multiplier: float = 1.0):
self._imputations = list(imputations)
self._multiplier = multiplier

def predict(self, X: pd.DataFrame) -> pd.DataFrame:
# Plausible non-negative predictions, scaled by a caller-controlled
# multiplier so we can simulate huge dividend imputations on FRS
# donor rows without needing a real SPI-trained QRF.
n = len(X)
rng = np.random.default_rng(0)
return pd.DataFrame(
{
col: rng.exponential(1_000 * self._multiplier, size=n)
for col in self._imputations
}
)


def _tiny_frs_dataset():
"""Load the committed tiny FRS dataset (1 000 households) if available."""
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.storage import STORAGE_FOLDER

path = STORAGE_FOLDER / "frs_2023_24_tiny.h5"
if not path.exists():
pytest.skip("Tiny FRS dataset not available")
return UKSingleYearDataset(path)


def test_housing_costs_pass_through_unchanged():
"""Housing-cost columns must be byte-identical on exit.

Even when imputed dividends dwarf the FRS baseline (multiplier=100),
rent and mortgage values must not be rescaled.
"""
from policyengine_uk_data.datasets.imputations.income import (
IMPUTATIONS,
impute_over_incomes,
)

ds = _tiny_frs_dataset()
rent_in = ds.household["rent"].to_numpy().copy()
mi_in = ds.household["mortgage_interest_repayment"].to_numpy().copy()
mc_in = ds.household["mortgage_capital_repayment"].to_numpy().copy()

# If the old rescaling logic were still here, rent/mortgage would come
# out ~100× larger.
model = _FakeQRFModel(IMPUTATIONS, multiplier=100.0)
result = impute_over_incomes(ds, model, ["dividend_income"])

np.testing.assert_array_equal(result.household["rent"].to_numpy(), rent_in)
np.testing.assert_array_equal(
result.household["mortgage_interest_repayment"].to_numpy(), mi_in
)
np.testing.assert_array_equal(
result.household["mortgage_capital_repayment"].to_numpy(), mc_in
)


def test_only_listed_outputs_are_overwritten():
"""`output_variables` may be touched; other income columns must not be."""
from policyengine_uk_data.datasets.imputations.income import (
IMPUTATIONS,
impute_over_incomes,
)

ds = _tiny_frs_dataset()
employment_in = ds.person["employment_income"].to_numpy().copy()
dividend_in = ds.person["dividend_income"].to_numpy().copy()

model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
result = impute_over_incomes(ds, model, ["dividend_income"])

np.testing.assert_array_equal(
result.person["employment_income"].to_numpy(), employment_in
)
# dividend_income was listed — prediction output should differ from the
# near-zero FRS baseline for at least most rows.
assert not np.array_equal(result.person["dividend_income"].to_numpy(), dividend_in)


def test_housing_costs_preserved_when_income_baseline_is_zero():
"""Covers the zero-baseline shape the old `_safe_rescale_factor` guarded."""
from policyengine_uk_data.datasets.imputations.income import (
IMPUTATIONS,
INCOME_COMPONENTS,
impute_over_incomes,
)

ds = _tiny_frs_dataset()
for col in INCOME_COMPONENTS:
if col in ds.person.columns:
ds.person[col] = 0.0

rent_in = ds.household["rent"].to_numpy().copy()
model = _FakeQRFModel(IMPUTATIONS, multiplier=1.0)
result = impute_over_incomes(ds, model, ["dividend_income"])

out = result.household["rent"].to_numpy()
assert np.all(np.isfinite(out))
np.testing.assert_array_equal(out, rent_in)


def test_built_enhanced_frs_housing_costs_track_raw_frs():
"""Regression: after build, enhanced FRS per-renter rent should be close
to raw FRS per-renter rent (modulo small uprating / calibration effects).

Pre-fix, the built enhanced FRS had rent values 2.5× raw FRS. The tight
tolerance here (30 %) will fail on any dataset rebuilt with the old
rescaling logic.
"""
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk_data.storage import STORAGE_FOLDER

raw_path = STORAGE_FOLDER / "frs_2023_24.h5"
enh_path = STORAGE_FOLDER / "enhanced_frs_2023_24.h5"
if not (raw_path.exists() and enh_path.exists()):
pytest.skip("Full raw and enhanced FRS datasets not available")

raw = UKSingleYearDataset(raw_path)
enh = UKSingleYearDataset(enh_path)

for col in ("rent", "mortgage_interest_repayment", "mortgage_capital_repayment"):
r = raw.household[col].to_numpy()
e = enh.household[col].to_numpy()
r_med = float(np.median(r[r > 0])) if (r > 0).any() else 0.0
e_med = float(np.median(e[e > 0])) if (e > 0).any() else 0.0
assert r_med > 0, f"Raw FRS has no positive {col}"
ratio = e_med / r_med
assert 0.7 < ratio < 1.3, (
f"Enhanced {col} median (£{e_med:,.0f}) diverges from raw "
f"(£{r_med:,.0f}) by ratio {ratio:.2f}x; expected near 1.0. "
"Housing-cost rescaling may have been reintroduced (see #367)."
)
45 changes: 0 additions & 45 deletions policyengine_uk_data/tests/test_income_rescale_factor.py

This file was deleted.

Loading