From ee90926f33ea7067b5a0ec6e6ef49a69fe7f4f4f Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Mon, 20 Apr 2026 12:13:18 +0100 Subject: [PATCH 01/11] Add LA-level household land value calibration targets Generalises targets/sources/mhclg_regional_land.py to local-authority level. Each LA's share of national household land is proportional to households x avg_house_price, scaled to the ONS National Balance Sheet household-land series. Inputs (all already used elsewhere in the repo): - storage/la_land_values.csv: 360 LAs with households (from the existing local_authority_weights.h5 matrix) and avg_house_price (HM Land Registry UK HPI Dec 2025). - _land.HOUSEHOLD_LAND_VALUES for the national anchor. Tests cover CSV data quality, share/target aggregation, sensible ordering (K&C > Blackpool by >3x, London boroughs in top quintile), and registry integration. Updates test_regional_land_value_targets.py to filter by GeographicLevel.REGION now that LA targets share the same name prefix. Closes #370 Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/370.md | 1 + .../storage/la_land_values.csv | 361 ++++++++++++++++++ .../targets/sources/la_land.py | 90 +++++ .../tests/test_la_land_value_targets.py | 185 +++++++++ .../tests/test_regional_land_value_targets.py | 4 +- 5 files changed, 639 insertions(+), 2 deletions(-) create mode 100644 changelog.d/370.md create mode 100644 policyengine_uk_data/storage/la_land_values.csv create mode 100644 policyengine_uk_data/targets/sources/la_land.py create mode 100644 policyengine_uk_data/tests/test_la_land_value_targets.py diff --git a/changelog.d/370.md b/changelog.d/370.md new file mode 100644 index 000000000..3778bcf57 --- /dev/null +++ b/changelog.d/370.md @@ -0,0 +1 @@ +Add LA-level household land value calibration targets for all 360 UK local authorities, generalising the regional method to LA granularity using HM Land Registry UK HPI and PolicyEngine's LA household counts. 
diff --git a/policyengine_uk_data/storage/la_land_values.csv b/policyengine_uk_data/storage/la_land_values.csv new file mode 100644 index 000000000..e0e4842c6 --- /dev/null +++ b/policyengine_uk_data/storage/la_land_values.csv @@ -0,0 +1,361 @@ +code,name,households,avg_house_price +E06000001,Hartlepool,42687,132463 +E06000002,Middlesbrough,62873,141270 +E06000003,Redcar and Cleveland,64597,146735 +E06000004,Stockton-on-Tees,88187,170575 +E06000005,Darlington,50525,159542 +E06000006,Halton,58738,189414 +E06000007,Warrington,96224,251903 +E06000008,Blackburn with Darwen,61298,162893 +E06000009,Blackpool,69328,136485 +E06000010,"Kingston upon Hull, City of",119535,131323 +E06000011,East Riding of Yorkshire,160848,221028 +E06000012,North East Lincolnshire,72731,148043 +E06000013,North Lincolnshire,77080,180202 +E06000014,York,90598,306571 +E06000015,Derby,110196,205530 +E06000016,Leicester,134204,226491 +E06000017,Rutland,18295,318903 +E06000018,Nottingham,131657,193888 +E06000019,"Herefordshire, County of",86625,287302 +E06000020,Telford and Wrekin,79145,215752 +E06000021,Stoke-on-Trent,115950,147831 +E06000022,Bath and North East Somerset,84855,407049 +E06000023,"Bristol, City of",205211,353265 +E06000024,North Somerset,102261,312318 +E06000025,South Gloucestershire,129657,332736 +E06000026,Plymouth,126166,218085 +E06000027,Torbay,66499,232041 +E06000030,Swindon,100125,260905 +E06000031,Peterborough,88023,231757 +E06000032,Luton,84690,281149 +E06000033,Southend-on-Sea,80611,333356 +E06000034,Thurrock,70478,325936 +E06000035,Medway,118085,298520 +E06000036,Bracknell Forest,52392,394751 +E06000037,West Berkshire,70169,400144 +E06000038,Reading,70322,354805 +E06000039,Slough,54550,337206 +E06000040,Windsor and Maidenhead,64836,572852 +E06000041,Wokingham,72272,503052 +E06000042,Milton Keynes,119789,328697 +E06000043,Brighton and Hove,126374,410203 +E06000044,Portsmouth,93797,249460 +E06000045,Southampton,107827,233920 +E06000046,Isle of Wight,71945,247936 
+E06000047,County Durham,253106,143291 +E06000049,Cheshire East,185025,306039 +E06000050,Cheshire West and Chester,162209,265955 +E06000051,Shropshire,146609,281161 +E06000052,Cornwall,265515,277318 +E06000053,Isles of Scilly,2492115,308582 +E06000054,Wiltshire,230273,332483 +E06000055,Bedford,82624,331140 +E06000056,Central Bedfordshire,129846,358488 +E06000057,Northumberland,154547,214989 +E06000058,"Bournemouth, Christchurch and Poole",180181,309673 +E06000059,Dorset,180376,332378 +E06000060,Buckinghamshire,237680,487653 +E06000061,North Northamptonshire,164829,258515 +E06000062,West Northamptonshire,180295,294010 +E06000063,Cumberland,91136,174281 +E06000064,Westmorland and Furness,94053,227777 +E06000065,North Yorkshire,127165,272111 +E06000066,Somerset,107037,278440 +E07000008,Cambridge,55946,485985 +E07000009,East Cambridgeshire,39423,357866 +E07000010,Fenland,46560,234696 +E07000011,Huntingdonshire,81045,310990 +E07000012,South Cambridgeshire,71683,433729 +E07000032,Amber Valley,59550,234786 +E07000033,Bolsover,37016,185199 +E07000034,Chesterfield,49895,200389 +E07000035,Derbyshire Dales,34180,344946 +E07000036,Erewash,53328,219887 +E07000037,High Peak,42696,262663 +E07000038,North East Derbyshire,48876,249829 +E07000039,South Derbyshire,47323,257691 +E07000040,East Devon,69365,343715 +E07000041,Exeter,54901,283194 +E07000042,Mid Devon,37599,299716 +E07000043,North Devon,44797,286805 +E07000044,South Hams,41310,368805 +E07000045,Teignbridge,62939,297853 +E07000046,Torridge,31384,269713 +E07000047,West Devon,25342,308836 +E07000061,Eastbourne,48596,251894 +E07000062,Hastings,42726,240579 +E07000063,Lewes,45592,354787 +E07000064,Rother,44092,340936 +E07000065,Wealden,69502,399752 +E07000066,Basildon,80242,362898 +E07000067,Braintree,69538,332140 +E07000068,Brentwood,33376,520013 +E07000069,Castle Point,39330,365893 +E07000070,Chelmsford,77893,383278 +E07000071,Colchester,84604,300310 +E07000072,Epping Forest,57631,549606 +E07000073,Harlow,40148,314356 
+E07000074,Maldon,29084,395716 +E07000075,Rochford,37856,407373 +E07000076,Tendring,73646,268088 +E07000077,Uttlesford,38874,490567 +E07000078,Cheltenham,55163,336877 +E07000079,Cotswold,42623,415906 +E07000080,Forest of Dean,39023,299003 +E07000081,Gloucester,57916,236963 +E07000082,Stroud,54475,333064 +E07000083,Tewkesbury,43553,331405 +E07000084,Basingstoke and Deane,82589,373985 +E07000085,East Hampshire,55752,440091 +E07000086,Eastleigh,60138,311946 +E07000087,Fareham,52074,332945 +E07000088,Gosport,37662,230613 +E07000089,Hart,43057,495632 +E07000090,Havant,55546,316941 +E07000091,New Forest,88338,392680 +E07000092,Rushmoor,42222,332762 +E07000093,Test Valley,58786,370524 +E07000094,Winchester,56541,465183 +E07000095,Broxbourne,41490,407544 +E07000096,Dacorum,67995,467070 +E07000098,Hertsmere,46391,552787 +E07000099,North Hertfordshire,59481,419688 +E07000102,Three Rivers,38141,585189 +E07000103,Watford,42014,392605 +E07000105,Ashford,55344,347180 +E07000106,Canterbury,67957,335342 +E07000107,Dartford,47304,356253 +E07000108,Dover,52754,293719 +E07000109,Gravesham,44362,347503 +E07000110,Maidstone,76476,358161 +E07000111,Sevenoaks,52383,535114 +E07000112,Folkestone and Hythe,50991,310400 +E07000113,Swale,63290,289925 +E07000114,Thanet,65339,263646 +E07000115,Tonbridge and Malling,56505,406667 +E07000116,Tunbridge Wells,51447,465399 +E07000117,Burnley,41847,131476 +E07000118,Chorley,52608,208587 +E07000119,Fylde,39584,227834 +E07000120,Hyndburn,37173,135525 +E07000121,Lancaster,63938,200827 +E07000122,Pendle,39879,146756 +E07000123,Preston,63040,189448 +E07000124,Ribble Valley,28002,280677 +E07000125,Rossendale,31819,197028 +E07000126,South Ribble,51099,207382 +E07000127,West Lancashire,51070,229049 +E07000128,Wyre,53177,193044 +E07000129,Blaby,45683,295289 +E07000130,Charnwood,75849,275442 +E07000131,Harborough,42531,344277 +E07000132,Hinckley and Bosworth,52155,261427 +E07000133,Melton,23700,290884 +E07000134,North West Leicestershire,46330,287928 
+E07000135,Oadby and Wigston,23560,271073 +E07000136,Boston,30365,192397 +E07000137,East Lindsey,67863,218674 +E07000138,Lincoln,44186,186704 +E07000139,North Kesteven,53366,244502 +E07000140,South Holland,42889,227339 +E07000141,South Kesteven,66371,255822 +E07000142,West Lindsey,44831,212770 +E07000143,Breckland,63896,277510 +E07000144,Broadland,59286,315312 +E07000145,Great Yarmouth,46015,208770 +E07000146,King's Lynn and West Norfolk,72550,268435 +E07000147,North Norfolk,51309,288661 +E07000148,Norwich,66263,222723 +E07000149,South Norfolk,64294,312743 +E07000170,Ashfield,56709,190280 +E07000171,Bassetlaw,54577,207390 +E07000172,Broxtowe,51864,256619 +E07000173,Gedling,55718,249519 +E07000174,Mansfield,50295,190540 +E07000175,Newark and Sherwood,58096,240202 +E07000176,Rushcliffe,53065,333758 +E07000177,Cherwell,71480,355186 +E07000178,Oxford,58982,480531 +E07000179,South Oxfordshire,64446,462111 +E07000180,Vale of White Horse,62792,411832 +E07000181,West Oxfordshire,51488,422389 +E07000192,Cannock Chase,45724,234415 +E07000193,East Staffordshire,54623,223975 +E07000194,Lichfield,50572,326691 +E07000195,Newcastle-under-Lyme,57040,202139 +E07000196,South Staffordshire,50237,295861 +E07000197,Stafford,66115,265685 +E07000198,Staffordshire Moorlands,45583,216108 +E07000199,Tamworth,34257,233451 +E07000200,Babergh,42926,331953 +E07000202,Ipswich,64321,221494 +E07000203,Mid Suffolk,49895,319139 +E07000207,Elmbridge,61225,743009 +E07000208,Epsom and Ewell,33649,545227 +E07000209,Guildford,60529,523409 +E07000210,Mole Valley,40275,557926 +E07000211,Reigate and Banstead,64563,466628 +E07000212,Runnymede,36674,480402 +E07000213,Spelthorne,46050,440782 +E07000214,Surrey Heath,38463,462511 +E07000215,Tandridge,37661,495400 +E07000216,Waverley,55267,555732 +E07000217,Woking,43207,433269 +E07000218,North Warwickshire,29186,269540 +E07000219,Nuneaton and Bedworth,59777,234032 +E07000220,Rugby,49396,275985 +E07000221,Stratford-on-Avon,62587,390081 
+E07000222,Warwick,66686,366063 +E07000223,Adur,29062,370708 +E07000224,Arun,76206,325042 +E07000225,Chichester,57159,441151 +E07000226,Crawley,47401,336354 +E07000227,Horsham,64939,441285 +E07000228,Mid Sussex,66723,436743 +E07000229,Worthing,53504,308125 +E07000234,Bromsgrove,43061,333847 +E07000235,Malvern Hills,36187,338287 +E07000236,Redditch,38123,248001 +E07000237,Worcester,47138,249218 +E07000238,Wychavon,61914,333519 +E07000239,Wyre Forest,47963,234081 +E07000240,St Albans,61308,609646 +E07000241,Welwyn Hatfield,48609,443795 +E07000242,East Hertfordshire,64958,460270 +E07000243,Stevenage,38761,323392 +E07000244,East Suffolk,118154,283660 +E07000245,West Suffolk,81715,295650 +E08000001,Bolton,123006,200491 +E08000002,Bury,83538,237721 +E08000003,Manchester,230104,257630 +E08000004,Oldham,96814,213245 +E08000005,Rochdale,93421,209799 +E08000006,Salford,120347,226427 +E08000007,Stockport,134807,306235 +E08000008,Tameside,106293,211680 +E08000009,Trafford,99920,378514 +E08000010,Wigan,150180,191180 +E08000011,Knowsley,68335,190405 +E08000012,Liverpool,215816,184804 +E08000013,St. 
Helens,86307,179209 +E08000014,Sefton,130413,220702 +E08000015,Wirral,154379,218516 +E08000016,Barnsley,111728,172458 +E08000017,Doncaster,140435,173424 +E08000018,Rotherham,120506,191161 +E08000019,Sheffield,245451,220445 +E08000021,Newcastle upon Tyne,127262,207936 +E08000022,North Tyneside,100515,202840 +E08000023,South Tyneside,71355,165647 +E08000024,Sunderland,129173,146527 +E08000025,Birmingham,443632,232844 +E08000026,Coventry,141539,226361 +E08000027,Dudley,146542,227378 +E08000028,Sandwell,135966,200069 +E08000029,Solihull,93737,328744 +E08000030,Walsall,121137,215676 +E08000031,Wolverhampton,106933,213273 +E08000032,Bradford,218386,189396 +E08000033,Calderdale,95154,186573 +E08000034,Kirklees,185181,205944 +E08000035,Leeds,378060,246293 +E08000036,Wakefield,159785,199323 +E08000037,Gateshead,93642,151480 +E09000001,City of London,5133,740433 +E09000002,Barking and Dagenham,76891,353512 +E09000003,Barnet,156752,594093 +E09000004,Bexley,102241,410346 +E09000005,Brent,130863,568171 +E09000006,Bromley,142895,535306 +E09000007,Camden,94816,783812 +E09000008,Croydon,163059,402126 +E09000009,Ealing,145231,575503 +E09000010,Enfield,125214,471381 +E09000011,Greenwich,119526,474935 +E09000012,Hackney,114015,614552 +E09000013,Hammersmith and Fulham,83515,713773 +E09000014,Haringey,106436,626807 +E09000015,Harrow,92681,530409 +E09000016,Havering,108202,452231 +E09000017,Hillingdon,116295,477979 +E09000018,Hounslow,113871,519639 +E09000019,Islington,98568,699726 +E09000020,Kensington and Chelsea,70165,1178497 +E09000021,Kingston upon Thames,70208,573027 +E09000022,Lambeth,138311,538500 +E09000023,Lewisham,127327,493356 +E09000024,Merton,83414,601814 +E09000025,Newham,122280,405619 +E09000026,Redbridge,112886,495269 +E09000027,Richmond upon Thames,84981,777164 +E09000028,Southwark,134900,589636 +E09000029,Sutton,86112,453058 +E09000030,Tower Hamlets,123601,463527 +E09000031,Waltham Forest,109286,525738 +E09000032,Wandsworth,141843,689285 
+E09000033,Westminster,100112,880389 +N09000001,Antrim and Newtownabbey,83744,197918 +N09000002,"Armagh City, Banbridge and Craigavon",87066,179907 +N09000003,Belfast,87441,178459 +N09000004,Causeway Coast and Glens,85707,213957 +N09000005,Derry City and Strabane,86420,177589 +N09000006,Fermanagh and Omagh,87392,194970 +N09000007,Lisburn and Castlereagh,89622,231628 +N09000008,Mid and East Antrim,84384,173261 +N09000009,Mid Ulster,85900,189185 +N09000010,"Newry, Mourne and Down",88089,218595 +S12000005,Clackmannanshire,85677,171785 +S12000006,Dumfries and Galloway,85940,163620 +S12000008,East Ayrshire,82669,131065 +S12000010,East Lothian,82119,280390 +S12000011,East Renfrewshire,84440,297395 +S12000013,Na h-Eileanan Siar,86183,139148 +S12000014,Falkirk,86048,171236 +S12000017,Highland,90706,216711 +S12000018,Inverclyde,83811,113267 +S12000019,Midlothian,83766,286803 +S12000020,Moray,84248,197451 +S12000021,North Ayrshire,84155,134830 +S12000023,Orkney Islands,83272,229610 +S12000026,Scottish Borders,82091,182102 +S12000027,Shetland Islands,83113,201503 +S12000028,South Ayrshire,84785,173377 +S12000029,South Lanarkshire,95010,186880 +S12000030,Stirling,84552,228054 +S12000033,Aberdeen City,86478,133119 +S12000034,Aberdeenshire,90347,202362 +S12000035,Argyll and Bute,85250,186309 +S12000036,City of Edinburgh,112532,293243 +S12000038,Renfrewshire,83979,160277 +S12000039,West Dunbartonshire,84070,131097 +S12000040,West Lothian,85547,219317 +S12000041,Angus,84966,174680 +S12000042,Dundee City,95901,141246 +S12000045,East Dunbartonshire,84536,262223 +S12000047,Fife,100135,177750 +S12000048,Perth and Kinross,84587,228534 +S12000049,Glasgow City,131728,189093 +S12000050,North Lanarkshire,92029,158859 +W06000001,Isle of Anglesey,39750,242141 +W06000002,Gwynedd,64757,196260 +W06000003,Conwy,64876,211833 +W06000004,Denbighshire,51535,195430 +W06000005,Flintshire,81654,213990 +W06000006,Wrexham,70846,206799 +W06000008,Ceredigion,39495,233722 
+W06000009,Pembrokeshire,68590,213226 +W06000010,Carmarthenshire,98615,196607 +W06000011,Swansea,125018,208872 +W06000012,Neath Port Talbot,76913,160856 +W06000013,Bridgend,75306,208808 +W06000014,Vale of Glamorgan,72758,299757 +W06000015,Cardiff,162839,271273 +W06000016,Rhondda Cynon Taf,117872,162675 +W06000018,Caerphilly,93171,196048 +W06000019,Blaenau Gwent,38230,142090 +W06000020,Torfaen,50955,189702 +W06000021,Monmouthshire,52772,335746 +W06000022,Newport,80610,226573 +W06000023,Powys,76021,229762 +W06000024,Merthyr Tydfil,32578,143596 diff --git a/policyengine_uk_data/targets/sources/la_land.py b/policyengine_uk_data/targets/sources/la_land.py new file mode 100644 index 000000000..20d7a2211 --- /dev/null +++ b/policyengine_uk_data/targets/sources/la_land.py @@ -0,0 +1,90 @@ +"""LA-level household land value targets. + +Local authority generalisation of mhclg_regional_land.py. Each local +authority's share of national household land value is proportional to +its total property wealth (households x avg_house_price), then scaled +to the ONS National Balance Sheet household-land series. + +Data sources: +- Average house price by LA: HM Land Registry UK HPI (Dec 2025). + For LAs whose ONS code changed between releases, the CSV matches on + LA name. For Northern Ireland LGDs missing from a specific month, + the NI country-level HPI price is used as a fallback. +- Households by LA: derived from the policyengine-uk-data LA weight + matrix (storage/local_authority_weights.h5), keeping the household + count definition consistent with the rest of the LA calibration. +- National household land total: HOUSEHOLD_LAND_VALUES (ONS National + Balance Sheet 2025, series AN.211 household sector). 
+""" + +import pandas as pd + +from policyengine_uk_data.targets.schema import ( + GeographicLevel, + Target, + Unit, +) +from policyengine_uk_data.targets.sources._land import ( + HOUSEHOLD_LAND_VALUES, + _REF_URL, +) +from policyengine_uk_data.targets.sources._common import STORAGE + + +def _load_inputs() -> pd.DataFrame: + csv_path = STORAGE / "la_land_values.csv" + return pd.read_csv(csv_path) + + +def _compute_la_shares() -> pd.DataFrame: + """Return a DataFrame with columns code, name, share. + + Each LA's share is proportional to households x avg_house_price, + scaled to sum to 1 across all UK local authorities. + """ + df = _load_inputs() + df["property_wealth"] = df["households"] * df["avg_house_price"] + total = df["property_wealth"].sum() + df["share"] = df["property_wealth"] / total + return df[["code", "name", "share"]] + + +def _compute_la_targets() -> dict[str, dict[int, float]]: + """Scale per-LA shares by the national household-land series.""" + shares = _compute_la_shares().set_index("code")["share"] + return { + code: { + year: float(share) * HOUSEHOLD_LAND_VALUES[year] + for year in HOUSEHOLD_LAND_VALUES + } + for code, share in shares.items() + } + + +def get_targets() -> list[Target]: + csv_path = STORAGE / "la_land_values.csv" + if not csv_path.exists(): + return [] + + df = _load_inputs() + la_targets = _compute_la_targets() + + targets: list[Target] = [] + for _, row in df.iterrows(): + code = row["code"] + name = row["name"] + targets.append( + Target( + name=f"ons/household_land_value/{code}", + variable="household_land_value", + source="ons", + unit=Unit.GBP, + geographic_level=GeographicLevel.LOCAL_AUTHORITY, + geo_code=code, + geo_name=name, + values=la_targets[code], + reference_url=_REF_URL, + ) + ) + + return targets diff --git a/policyengine_uk_data/tests/test_la_land_value_targets.py b/policyengine_uk_data/tests/test_la_land_value_targets.py new file mode 100644 index 000000000..81325c4f6 --- /dev/null +++ 
b/policyengine_uk_data/tests/test_la_land_value_targets.py @@ -0,0 +1,185 @@ +"""Tests for LA-level household land value calibration targets.""" + +import pandas as pd + +from policyengine_uk_data.targets.schema import GeographicLevel +from policyengine_uk_data.targets.sources._common import STORAGE +from policyengine_uk_data.targets.sources._land import HOUSEHOLD_LAND_VALUES +from policyengine_uk_data.targets.sources.la_land import ( + _compute_la_shares, + _compute_la_targets, + _load_inputs, + get_targets, +) + +LA_TARGETS = _compute_la_targets() +LA_SHARES = _compute_la_shares() +LA_INPUTS = _load_inputs() + + +# ── CSV data quality ───────────────────────────────────────────────── + + +def test_csv_row_count_matches_la_code_list(): + """la_land_values.csv should have the same 360 LAs as local_authorities_2021.csv.""" + la_codes = pd.read_csv(STORAGE / "local_authorities_2021.csv") + assert len(LA_INPUTS) == len(la_codes) + assert set(LA_INPUTS["code"]) == set(la_codes["code"]) + + +def test_csv_columns_match_schema(): + """CSV should have exactly the columns code, name, households, avg_house_price.""" + assert list(LA_INPUTS.columns) == ["code", "name", "households", "avg_house_price"] + + +def test_csv_no_missing_values(): + """No LA should have NaN in any column.""" + assert not LA_INPUTS.isna().any().any() + + +def test_csv_covers_all_four_countries(): + """All four UK countries (E/W/S/NI) should appear.""" + prefixes = LA_INPUTS["code"].str[0].unique() + assert set(prefixes) == {"E", "W", "S", "N"} + + +def test_house_prices_within_plausible_range(): + """Avg house prices should be between £50k and £2m per LA.""" + for _, row in LA_INPUTS.iterrows(): + assert 50_000 <= row["avg_house_price"] <= 2_000_000, ( + f"{row['name']}: avg_house_price £{row['avg_house_price']:,} " + "outside plausible range" + ) + + +def test_households_positive(): + """Every LA should have a positive implied household count.""" + assert (LA_INPUTS["households"] > 0).all() + + +# ── 
Share constraints ──────────────────────────────────────────────── + + +def test_shares_sum_to_one(): + """LA shares should sum to exactly 1.""" + assert abs(LA_SHARES["share"].sum() - 1.0) < 1e-9 + + +def test_all_shares_positive(): + """Every LA share should be positive.""" + assert (LA_SHARES["share"] > 0).all() + + +# ── Target value constraints ───────────────────────────────────────── + + +def test_all_targets_positive(): + """Every LA target should be a positive value for every year.""" + for code, values in LA_TARGETS.items(): + for year, value in values.items(): + assert value > 0, f"{code} {year}: non-positive target {value}" + + +def test_targets_sum_to_national(): + """LA targets should sum to the ONS national household land total.""" + for year in (2021, 2023, 2024): + la_sum = sum(values[year] for values in LA_TARGETS.values()) + national = HOUSEHOLD_LAND_VALUES[year] + rel_error = abs(la_sum / national - 1) + assert rel_error < 1e-6, ( + f"{year}: LA sum £{la_sum / 1e12:.3f}tn != " + f"national £{national / 1e12:.3f}tn" + ) + + +def test_kensington_and_chelsea_above_blackpool(): + """K&C avg household land value should exceed Blackpool's.""" + kc_code = LA_INPUTS.loc[ + LA_INPUTS["name"] == "Kensington and Chelsea", "code" + ].iloc[0] + blackpool_code = LA_INPUTS.loc[LA_INPUTS["name"] == "Blackpool", "code"].iloc[0] + kc_hh = LA_INPUTS.set_index("code").loc[kc_code, "households"] + bp_hh = LA_INPUTS.set_index("code").loc[blackpool_code, "households"] + kc_per_hh = LA_TARGETS[kc_code][2024] / kc_hh + bp_per_hh = LA_TARGETS[blackpool_code][2024] / bp_hh + assert kc_per_hh > bp_per_hh * 3, ( + f"K&C avg household land (£{kc_per_hh:,.0f}) should be at least " + f"3x Blackpool (£{bp_per_hh:,.0f})" + ) + + +def test_london_prime_dominates_top_quintile(): + """Top quintile of LAs by avg household land value should be London-heavy.""" + totals = pd.Series( + {code: values[2024] for code, values in LA_TARGETS.items()}, name="total" + ) + inputs = 
LA_INPUTS.set_index("code") + avg_per_hh = (totals / inputs["households"]).sort_values(ascending=False) + top_quintile = avg_per_hh.head(len(avg_per_hh) // 5).index + london_codes = set(inputs.loc[inputs.index.str.startswith("E09"), :].index) + london_in_top = len(set(top_quintile) & london_codes) + assert london_in_top >= 15, ( + f"Expected London LAs to dominate top quintile, found only {london_in_top}" + ) + + +def test_london_total_land_dwarfs_north_east(): + """Sum of London LA targets should exceed sum of North-East LA targets.""" + inputs = LA_INPUTS.set_index("code") + london_codes = inputs.loc[inputs.index.str.startswith("E09"), :].index + ne_prefixes = ("E06000001", "E06000002", "E06000003", "E06000004", "E06000005", + "E06000047", "E08000021", "E08000022", "E08000023", "E08000024", + "E08000037", "E06000057") + ne_codes = [c for c in inputs.index if c in ne_prefixes] + london_total = sum(LA_TARGETS[c][2024] for c in london_codes) + ne_total = sum(LA_TARGETS[c][2024] for c in ne_codes) + assert london_total > ne_total * 3, ( + f"London total (£{london_total / 1e9:.0f}bn) should exceed " + f"NE total (£{ne_total / 1e9:.0f}bn) by at least 3x" + ) + + +# ── Target registry integration ────────────────────────────────────── + + +def test_get_targets_returns_360(): + """get_targets() should return exactly 360 LA targets.""" + targets = get_targets() + assert len(targets) == 360 + + +def test_target_names_follow_code_pattern(): + """Target names should follow the ons/household_land_value/{code} pattern.""" + targets = get_targets() + for t in targets: + assert t.name.startswith("ons/household_land_value/") + assert t.name.removeprefix("ons/household_land_value/") == t.geo_code + + +def test_targets_declare_local_authority_geographic_level(): + """All LA targets should be tagged with GeographicLevel.LOCAL_AUTHORITY.""" + for t in get_targets(): + assert t.geographic_level == GeographicLevel.LOCAL_AUTHORITY + + +def test_targets_have_values_for_all_known_years(): 
+ """LA targets should carry every year in the backfilled series.""" + expected_years = set(HOUSEHOLD_LAND_VALUES) + for t in get_targets(): + assert set(t.values) == expected_years, ( + f"{t.name} missing years: " + f"{expected_years - set(t.values)}" + ) + + +def test_target_registry_includes_la_targets(): + """LA land targets should appear in the global registry.""" + from policyengine_uk_data.targets import get_all_targets + + targets = get_all_targets( + year=2024, geographic_level=GeographicLevel.LOCAL_AUTHORITY + ) + la_land = [t for t in targets if t.name.startswith("ons/household_land_value/")] + assert len(la_land) == 360, ( + f"Expected 360 LA household-land targets, got {len(la_land)}" + ) diff --git a/policyengine_uk_data/tests/test_regional_land_value_targets.py b/policyengine_uk_data/tests/test_regional_land_value_targets.py index 5b1b5f931..92b77df69 100644 --- a/policyengine_uk_data/tests/test_regional_land_value_targets.py +++ b/policyengine_uk_data/tests/test_regional_land_value_targets.py @@ -130,9 +130,9 @@ def test_targets_have_values_for_2021_to_2026(): def test_target_registry_includes_regional(): """Regional land targets should appear in the global registry.""" - from policyengine_uk_data.targets import get_all_targets + from policyengine_uk_data.targets import get_all_targets, GeographicLevel - targets = get_all_targets(year=2025) + targets = get_all_targets(year=2025, geographic_level=GeographicLevel.REGION) regional = [t for t in targets if t.name.startswith("ons/household_land_value/")] assert len(regional) == 11, ( f"Expected 11 regional land targets, got {len(regional)}" From 6bd9d68a90b26a3b41e325fd819c3fa52ae3d8c6 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Mon, 20 Apr 2026 12:19:23 +0100 Subject: [PATCH 02/11] Apply ruff format to test_la_land_value_targets.py Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/test_la_land_value_targets.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff 
--git a/policyengine_uk_data/tests/test_la_land_value_targets.py b/policyengine_uk_data/tests/test_la_land_value_targets.py index 81325c4f6..37d1e8960 100644 --- a/policyengine_uk_data/tests/test_la_land_value_targets.py +++ b/policyengine_uk_data/tests/test_la_land_value_targets.py @@ -94,9 +94,9 @@ def test_targets_sum_to_national(): def test_kensington_and_chelsea_above_blackpool(): """K&C avg household land value should exceed Blackpool's.""" - kc_code = LA_INPUTS.loc[ - LA_INPUTS["name"] == "Kensington and Chelsea", "code" - ].iloc[0] + kc_code = LA_INPUTS.loc[LA_INPUTS["name"] == "Kensington and Chelsea", "code"].iloc[ + 0 + ] blackpool_code = LA_INPUTS.loc[LA_INPUTS["name"] == "Blackpool", "code"].iloc[0] kc_hh = LA_INPUTS.set_index("code").loc[kc_code, "households"] bp_hh = LA_INPUTS.set_index("code").loc[blackpool_code, "households"] @@ -127,9 +127,20 @@ def test_london_total_land_dwarfs_north_east(): """Sum of London LA targets should exceed sum of North-East LA targets.""" inputs = LA_INPUTS.set_index("code") london_codes = inputs.loc[inputs.index.str.startswith("E09"), :].index - ne_prefixes = ("E06000001", "E06000002", "E06000003", "E06000004", "E06000005", - "E06000047", "E08000021", "E08000022", "E08000023", "E08000024", - "E08000037", "E06000057") + ne_prefixes = ( + "E06000001", + "E06000002", + "E06000003", + "E06000004", + "E06000005", + "E06000047", + "E08000021", + "E08000022", + "E08000023", + "E08000024", + "E08000037", + "E06000057", + ) ne_codes = [c for c in inputs.index if c in ne_prefixes] london_total = sum(LA_TARGETS[c][2024] for c in london_codes) ne_total = sum(LA_TARGETS[c][2024] for c in ne_codes) @@ -167,8 +178,7 @@ def test_targets_have_values_for_all_known_years(): expected_years = set(HOUSEHOLD_LAND_VALUES) for t in get_targets(): assert set(t.values) == expected_years, ( - f"{t.name} missing years: " - f"{expected_years - set(t.values)}" + f"{t.name} missing years: {expected_years - set(t.values)}" ) From 
3ed729c1dae1af37577ac5faf7e11fc39ec764d2 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Thu, 23 Apr 2026 14:29:31 +0100 Subject: [PATCH 03/11] Fix Isles of Scilly households leak and add bound tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The E06000053 row carried households=2,492,115 — roughly the South West region total — from an upstream fallback that fired during CSV generation. Real IoS has ~1,115 households per ONS mid-2023. With the bug, IoS absorbed 7.85% of the national property-wealth share, understating every other LA's 2024 target by the same ~7.85%; correcting it raises those targets by ~8.5% (e.g. K&C moved from £42.6bn to £46.2bn after the fix). Two new tests prevent the regression: - test_households_within_plausible_range: bounds every LA to [500, 500_000] so a region-scale total leaking into an LA row fails immediately (smaller errors that stay inside the range are not caught). - test_isles_of_scilly_households_are_thousands_not_millions: tight [500, 5_000] bound on the specific row that leaked. Methodology unchanged; LA targets still sum to the ONS national household-land series within 1e-6. 
--- .../storage/la_land_values.csv | 2 +- .../tests/test_la_land_value_targets.py | 28 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/policyengine_uk_data/storage/la_land_values.csv b/policyengine_uk_data/storage/la_land_values.csv index e0e4842c6..ef0023a32 100644 --- a/policyengine_uk_data/storage/la_land_values.csv +++ b/policyengine_uk_data/storage/la_land_values.csv @@ -48,7 +48,7 @@ E06000049,Cheshire East,185025,306039 E06000050,Cheshire West and Chester,162209,265955 E06000051,Shropshire,146609,281161 E06000052,Cornwall,265515,277318 -E06000053,Isles of Scilly,2492115,308582 +E06000053,Isles of Scilly,1115,308582 E06000054,Wiltshire,230273,332483 E06000055,Bedford,82624,331140 E06000056,Central Bedfordshire,129846,358488 diff --git a/policyengine_uk_data/tests/test_la_land_value_targets.py b/policyengine_uk_data/tests/test_la_land_value_targets.py index 37d1e8960..21a593fac 100644 --- a/policyengine_uk_data/tests/test_la_land_value_targets.py +++ b/policyengine_uk_data/tests/test_la_land_value_targets.py @@ -57,6 +57,34 @@ def test_households_positive(): assert (LA_INPUTS["households"] > 0).all() +def test_households_within_plausible_range(): + """Smallest UK billing authority (Isles of Scilly) has ~1,100 + households; largest (Birmingham) has ~450,000. A 1000x outlier — like + the regional-total fallback that leaked into the IoS row pre-review — + must be caught by bounds, not spotted by eye. + """ + out_of_range = LA_INPUTS[~LA_INPUTS["households"].between(500, 500_000)] + assert out_of_range.empty, ( + "Households out of plausible [500, 500_000] range: " + f"{out_of_range[['code', 'name', 'households']].to_dict('records')}" + ) + + +def test_isles_of_scilly_households_are_thousands_not_millions(): + """Explicit regression for the IoS fallback leak (was 2,492,115). + + Real IoS has ~1,115 households per ONS mid-2023 estimate (pop ~2,000). + Anything outside [500, 5,000] indicates the fallback path has + regressed again. 
+ """ + ios = LA_INPUTS[LA_INPUTS["code"] == "E06000053"] + assert len(ios) == 1 + hh = int(ios["households"].iloc[0]) + assert 500 <= hh <= 5_000, ( + f"Isles of Scilly households = {hh:,}; ONS mid-2023 estimate is ~1,115" + ) + + # ── Share constraints ──────────────────────────────────────────────── From 2e42c740ee2562b6028453e8c6efc36fc705749d Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Mon, 27 Apr 2026 09:55:36 +0100 Subject: [PATCH 04/11] Wire LA land value targets into the calibration loss matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The targets added in the previous commits were registered but inert — datasets/local_areas/local_authorities/loss.py never built a column for them, so the LA reweighter could not see them. This adds the ons/household_land_value column to the LA target matrix: - matrix entry: per-household household_land_value (from policyengine-uk). - y entry: 360-vector of per-LA targets at the calibration year, taken from la_land._compute_la_targets and reordered to match local_authorities_2021.csv so the country mask and target indices agree at every position. The year is selected from time_period; if it is outside HOUSEHOLD_LAND_VALUES (defined for 2021–2026) the latest known year is used as a fallback. New tests in test_la_loss_land_value.py cover both layers: - target dict ↔ la_codes ordering, finite-positive vector, sum-to- national for 2024/2025/2026 (no Microsimulation needed). - full create_local_authority_target_matrix build (gated on the enhanced FRS fixture): column presence, length 360, sum-to-national for the calibration year, ordering matches la_codes, all positive, and matrix column equals sim.calculate("household_land_value"). Closes the "out of scope" follow-up flagged in the original PR body. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../local_areas/local_authorities/loss.py | 17 ++ .../tests/test_la_loss_land_value.py | 167 ++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 policyengine_uk_data/tests/test_la_loss_land_value.py diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index fd5ed9440..d573ff817 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -11,6 +11,7 @@ - ONS income: ONS small area income estimates - Tenure: English Housing Survey - Private rent: VOA/ONS private rental market statistics +- Household land value: ONS National Balance Sheet × LA property-wealth share """ from policyengine_uk import Microsimulation @@ -38,6 +39,8 @@ load_tenure_data, load_private_rents, ) +from policyengine_uk_data.targets.sources._land import HOUSEHOLD_LAND_VALUES +from policyengine_uk_data.targets.sources.la_land import _compute_la_targets def create_local_authority_target_matrix( @@ -252,6 +255,20 @@ def create_local_authority_target_matrix( national_rent * la_household_share, ) + # ── Household land value (LA targets) ────────────────────────── + # Per-LA target = LA's share of national property wealth × ONS + # household-land series for the calibration year. Source: + # policyengine_uk_data/targets/sources/la_land.py. 
+ year = int(time_period) + land_year = year if year in HOUSEHOLD_LAND_VALUES else max(HOUSEHOLD_LAND_VALUES) + la_land_by_code = { + code: values[land_year] for code, values in _compute_la_targets().items() + } + matrix["ons/household_land_value"] = sim.calculate("household_land_value").values + y["ons/household_land_value"] = ( + la_codes["code"].map(la_land_by_code).values + ) + # ── Country mask ─────────────────────────────────────────────── country_mask = create_country_mask( household_countries=sim.calculate("country").values, diff --git a/policyengine_uk_data/tests/test_la_loss_land_value.py b/policyengine_uk_data/tests/test_la_loss_land_value.py new file mode 100644 index 000000000..74afa1dd0 --- /dev/null +++ b/policyengine_uk_data/tests/test_la_loss_land_value.py @@ -0,0 +1,167 @@ +"""Tests for the LA-level household-land-value column wired into the +local-authority calibration loss matrix. + +Two layers: + +1. Light-weight checks against the per-LA target dict from la_land.py — + these run without a Microsimulation and exercise the ordering / + summation properties the loss-matrix code relies on. +2. Full ``create_local_authority_target_matrix`` build, gated on the + enhanced FRS fixture so CI environments without the dataset skip + gracefully. 
+""" + +import numpy as np +import pandas as pd +import pytest + +from policyengine_uk_data.storage import STORAGE_FOLDER +from policyengine_uk_data.targets.sources._land import HOUSEHOLD_LAND_VALUES +from policyengine_uk_data.targets.sources.la_land import _compute_la_targets + + +LA_CODES = pd.read_csv(STORAGE_FOLDER / "local_authorities_2021.csv") +LA_TARGETS = _compute_la_targets() + + +# ── Layer 1: per-LA targets line up with the LA code ordering ──────── + + +def test_targets_cover_every_la_code(): + """Every code in local_authorities_2021.csv has an LA land target.""" + missing = set(LA_CODES["code"]) - set(LA_TARGETS) + assert not missing, f"LA codes missing land targets: {sorted(missing)[:5]}" + + +def test_target_vector_in_la_codes_order_is_finite_positive(): + """Reindexing by la_codes order yields a clean float vector.""" + year = 2025 + vec = LA_CODES["code"].map( + {code: values[year] for code, values in LA_TARGETS.items()} + ).values + assert len(vec) == 360 + assert np.isfinite(vec).all() + assert (vec > 0).all() + + +def test_target_vector_sums_to_national_household_land(): + """Sum of the 360 LA targets equals the ONS national figure for that year.""" + for year in (2024, 2025, 2026): + vec = LA_CODES["code"].map( + {code: values[year] for code, values in LA_TARGETS.items()} + ).values + rel_error = abs(vec.sum() / HOUSEHOLD_LAND_VALUES[year] - 1) + assert rel_error < 1e-6, ( + f"{year}: sum £{vec.sum() / 1e12:.3f}tn != " + f"national £{HOUSEHOLD_LAND_VALUES[year] / 1e12:.3f}tn" + ) + + +# ── Layer 2: full LA loss matrix build ─────────────────────────────── + + +def test_la_loss_matrix_includes_household_land_value(enhanced_frs): + """The LA target matrix must expose ons/household_land_value in both + matrix (per-household) and y (per-LA) so the calibrator can train on it. 
+ """ + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + matrix, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + assert "ons/household_land_value" in matrix.columns + assert "ons/household_land_value" in y.columns + + +def test_la_loss_y_vector_length_360(enhanced_frs): + """y has one entry per LA and matches local_authorities_2021.csv ordering + by length.""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + assert len(y) == 360 + assert len(y["ons/household_land_value"]) == 360 + + +def test_la_loss_y_sums_to_national_for_calibration_year(enhanced_frs): + """Sum of LA-level y values equals the ONS national household-land total + for the calibration year (within float tolerance).""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + year = int(enhanced_frs.time_period) + fallback = max(HOUSEHOLD_LAND_VALUES) + expected = HOUSEHOLD_LAND_VALUES.get(year, HOUSEHOLD_LAND_VALUES[fallback]) + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + rel_error = abs(y["ons/household_land_value"].sum() / expected - 1) + assert rel_error < 1e-6 + + +def test_la_loss_y_ordering_matches_la_codes(enhanced_frs): + """y["ons/household_land_value"] must be ordered by local_authorities_2021.csv, + so the country mask and the targets refer to the same LAs at each index.""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + year = int(enhanced_frs.time_period) + fallback = max(HOUSEHOLD_LAND_VALUES) + land_year = year if year in HOUSEHOLD_LAND_VALUES else 
fallback + expected = LA_CODES["code"].map( + {code: values[land_year] for code, values in LA_TARGETS.items()} + ).values + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + np.testing.assert_array_equal( + y["ons/household_land_value"].values, expected + ) + + +def test_la_loss_y_all_positive(enhanced_frs): + """No LA should have a non-positive household-land target.""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + assert (y["ons/household_land_value"] > 0).all() + + +def test_la_loss_matrix_column_matches_household_land_value(enhanced_frs): + """matrix['ons/household_land_value'] should equal the per-household + household_land_value pulled from policyengine-uk for the calibration year.""" + from policyengine_uk import Microsimulation + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + matrix, _, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + sim = Microsimulation(dataset=enhanced_frs) + sim.default_calculation_period = enhanced_frs.time_period + expected = sim.calculate("household_land_value").values + + np.testing.assert_array_equal( + matrix["ons/household_land_value"].values, expected + ) From 768754b096d91fe0dc8444aac83f6d787232f259 Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Mon, 27 Apr 2026 09:59:29 +0100 Subject: [PATCH 05/11] Apply ruff format to loss.py and test_la_loss_land_value.py Co-Authored-By: Claude Opus 4.7 (1M context) --- .../local_areas/local_authorities/loss.py | 4 +-- .../tests/test_la_loss_land_value.py | 32 ++++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git 
a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index d573ff817..8aa8692bb 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -265,9 +265,7 @@ def create_local_authority_target_matrix( code: values[land_year] for code, values in _compute_la_targets().items() } matrix["ons/household_land_value"] = sim.calculate("household_land_value").values - y["ons/household_land_value"] = ( - la_codes["code"].map(la_land_by_code).values - ) + y["ons/household_land_value"] = la_codes["code"].map(la_land_by_code).values # ── Country mask ─────────────────────────────────────────────── country_mask = create_country_mask( diff --git a/policyengine_uk_data/tests/test_la_loss_land_value.py b/policyengine_uk_data/tests/test_la_loss_land_value.py index 74afa1dd0..64c9b4b55 100644 --- a/policyengine_uk_data/tests/test_la_loss_land_value.py +++ b/policyengine_uk_data/tests/test_la_loss_land_value.py @@ -36,9 +36,11 @@ def test_targets_cover_every_la_code(): def test_target_vector_in_la_codes_order_is_finite_positive(): """Reindexing by la_codes order yields a clean float vector.""" year = 2025 - vec = LA_CODES["code"].map( - {code: values[year] for code, values in LA_TARGETS.items()} - ).values + vec = ( + LA_CODES["code"] + .map({code: values[year] for code, values in LA_TARGETS.items()}) + .values + ) assert len(vec) == 360 assert np.isfinite(vec).all() assert (vec > 0).all() @@ -47,9 +49,11 @@ def test_target_vector_in_la_codes_order_is_finite_positive(): def test_target_vector_sums_to_national_household_land(): """Sum of the 360 LA targets equals the ONS national figure for that year.""" for year in (2024, 2025, 2026): - vec = LA_CODES["code"].map( - {code: values[year] for code, values in LA_TARGETS.items()} - ).values + vec = ( + LA_CODES["code"] + .map({code: values[year] for code, 
values in LA_TARGETS.items()}) + .values + ) rel_error = abs(vec.sum() / HOUSEHOLD_LAND_VALUES[year] - 1) assert rel_error < 1e-6, ( f"{year}: sum £{vec.sum() / 1e12:.3f}tn != " @@ -120,17 +124,17 @@ def test_la_loss_y_ordering_matches_la_codes(enhanced_frs): year = int(enhanced_frs.time_period) fallback = max(HOUSEHOLD_LAND_VALUES) land_year = year if year in HOUSEHOLD_LAND_VALUES else fallback - expected = LA_CODES["code"].map( - {code: values[land_year] for code, values in LA_TARGETS.items()} - ).values + expected = ( + LA_CODES["code"] + .map({code: values[land_year] for code, values in LA_TARGETS.items()}) + .values + ) _, y, _ = create_local_authority_target_matrix( enhanced_frs, time_period=enhanced_frs.time_period ) - np.testing.assert_array_equal( - y["ons/household_land_value"].values, expected - ) + np.testing.assert_array_equal(y["ons/household_land_value"].values, expected) def test_la_loss_y_all_positive(enhanced_frs): @@ -162,6 +166,4 @@ def test_la_loss_matrix_column_matches_household_land_value(enhanced_frs): sim.default_calculation_period = enhanced_frs.time_period expected = sim.calculate("household_land_value").values - np.testing.assert_array_equal( - matrix["ons/household_land_value"].values, expected - ) + np.testing.assert_array_equal(matrix["ons/household_land_value"].values, expected) From 4d4650915897b0e29f3691961bd98cf224e3848d Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 29 Apr 2026 11:36:37 +0100 Subject: [PATCH 06/11] Switch LA target to directly observed indicators (no national apportionment) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the imputed land-value target with a main-residence-value target built from observed LA-level inputs, mirroring the existing private-rent block: target_la = avg_house_price_la × ownership_share_la × n_households_la (HMLR HPI) × (English Housing Survey) × (Census) Per @MaxGhenis's standup note (28 Apr): minimise target manipulation by 
calibrating on observable LA-level housing indicators rather than apportioning a national ONS land-value total across LAs. The new target uses the same shape as the rent target (median × share × count), including the national-share fallback for LAs missing any input. Changes: - la_land.py: drop HOUSEHOLD_LAND_VALUES dependency; new load_la_avg_prices() helper; _compute_la_targets() returns observed-product £ per LA; targets renamed housing/main_residence_value/{code}, source=hmlr. - loss.py: replace the apportionment block with the rent-style inline pattern (merge avg_price into tenure_merged, target = price × ownership × households, na-fallback to national_property × la_household_share). - Tests: drop "sums to ONS national" assertions; assert per-LA target equals observed product exactly. Layer-2 FRS-gated tests updated to use main_residence_value column. Co-Authored-By: Claude Opus 4.7 (1M context) --- changelog.d/370.md | 2 +- .../local_areas/local_authorities/loss.py | 53 ++-- .../targets/sources/la_land.py | 115 +++++---- .../tests/test_la_land_value_targets.py | 228 ++++++++---------- .../tests/test_la_loss_land_value.py | 146 ++++++----- .../tests/test_regional_land_value_targets.py | 4 +- 6 files changed, 287 insertions(+), 261 deletions(-) diff --git a/changelog.d/370.md b/changelog.d/370.md index 3778bcf57..dd421dc13 100644 --- a/changelog.d/370.md +++ b/changelog.d/370.md @@ -1 +1 @@ -Add LA-level household land value calibration targets for all 360 UK local authorities, generalising the regional method to LA granularity using HM Land Registry UK HPI and PolicyEngine's LA household counts. +Add LA-level main residence value calibration targets for all 360 UK local authorities, built from directly observed indicators (HMLR UK HPI × English Housing Survey ownership share × Census household count) and wired into the LA reweighter alongside the existing tenure and rent targets. 
diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index 8aa8692bb..a58b66bd2 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -11,7 +11,7 @@ - ONS income: ONS small area income estimates - Tenure: English Housing Survey - Private rent: VOA/ONS private rental market statistics -- Household land value: ONS National Balance Sheet × LA property-wealth share +- Main residence value: HMLR UK HPI × ownership share × household count """ from policyengine_uk import Microsimulation @@ -39,8 +39,7 @@ load_tenure_data, load_private_rents, ) -from policyengine_uk_data.targets.sources._land import HOUSEHOLD_LAND_VALUES -from policyengine_uk_data.targets.sources.la_land import _compute_la_targets +from policyengine_uk_data.targets.sources.la_land import load_la_avg_prices def create_local_authority_target_matrix( @@ -255,17 +254,43 @@ def create_local_authority_target_matrix( national_rent * la_household_share, ) - # ── Household land value (LA targets) ────────────────────────── - # Per-LA target = LA's share of national property wealth × ONS - # household-land series for the calibration year. Source: - # policyengine_uk_data/targets/sources/la_land.py. - year = int(time_period) - land_year = year if year in HOUSEHOLD_LAND_VALUES else max(HOUSEHOLD_LAND_VALUES) - la_land_by_code = { - code: values[land_year] for code, values in _compute_la_targets().items() - } - matrix["ons/household_land_value"] = sim.calculate("household_land_value").values - y["ons/household_land_value"] = la_codes["code"].map(la_land_by_code).values + # ── Main residence value (HMLR × ownership share × households) ─ + # Mirrors the private-rent target pattern: directly observed + # LA-level housing indicators multiplied together, with a + # national-share fallback for LAs missing any input. 
+ la_prices = load_la_avg_prices() + tenure_merged = tenure_merged.merge( + la_prices[["code", "avg_house_price"]], on="code", how="left" + ) + + matrix["housing/main_residence_value"] = sim.calculate( + "main_residence_value" + ).values + + ownership_share_la = ( + tenure_merged["owned_outright_pct"].fillna(0) + + tenure_merged["owned_mortgage_pct"].fillna(0) + ) / 100 + tenure_merged["main_residence_value_target"] = ( + tenure_merged["avg_house_price"] + * ownership_share_la + * tenure_merged["households"] + ) + + has_property = ( + tenure_merged["avg_house_price"].notna() + & tenure_merged["owned_outright_pct"].notna() + & tenure_merged["households"].notna() + ).values + national_property = ( + original_weights * matrix["housing/main_residence_value"].values + ).sum() + + y["housing/main_residence_value"] = np.where( + has_property, + tenure_merged["main_residence_value_target"].values, + national_property * la_household_share, + ) # ── Country mask ─────────────────────────────────────────────── country_mask = create_country_mask( diff --git a/policyengine_uk_data/targets/sources/la_land.py b/policyengine_uk_data/targets/sources/la_land.py index 20d7a2211..0d4472875 100644 --- a/policyengine_uk_data/targets/sources/la_land.py +++ b/policyengine_uk_data/targets/sources/la_land.py @@ -1,20 +1,21 @@ -"""LA-level household land value targets. +"""LA-level main residence value targets. -Local authority generalisation of mhclg_regional_land.py. Each local -authority's share of national household land value is proportional to -its total property wealth (households x avg_house_price), then scaled -to the ONS National Balance Sheet household-land series. +Each local authority's target is built from directly observed LA-level +housing indicators, mirroring the existing private-rent calibration: + + target_la = avg_house_price_la × ownership_share_la × n_households_la + +This is the symmetric counterpart of the rent target for the +owner-occupier side. 
No national-total apportionment. Data sources: - Average house price by LA: HM Land Registry UK HPI (Dec 2025). For LAs whose ONS code changed between releases, the CSV matches on LA name. For Northern Ireland LGDs missing from a specific month, the NI country-level HPI price is used as a fallback. -- Households by LA: derived from the policyengine-uk-data LA weight - matrix (storage/local_authority_weights.h5), keeping the household - count definition consistent with the rest of the LA calibration. -- National household land total: HOUSEHOLD_LAND_VALUES (ONS National - Balance Sheet 2025, series AN.211 household sector). +- Ownership share by LA: English Housing Survey, via load_tenure_data + (owned_outright_pct + owned_mortgage_pct). +- Households by LA: Census 2021, via load_household_counts. """ import pandas as pd @@ -24,66 +25,92 @@ Target, Unit, ) -from policyengine_uk_data.targets.sources._land import ( - HOUSEHOLD_LAND_VALUES, - _REF_URL, -) from policyengine_uk_data.targets.sources._common import STORAGE -def _load_inputs() -> pd.DataFrame: +_REF_URL_HMLR = ( + "https://www.gov.uk/government/statistical-data-sets/" + "uk-house-price-index-data-downloads-december-2025" +) + + +def load_la_avg_prices() -> pd.DataFrame: + """Load HMLR average house price by LA. + + Returns DataFrame with columns: code, name, avg_house_price. + """ csv_path = STORAGE / "la_land_values.csv" - return pd.read_csv(csv_path) + if not csv_path.exists(): + return pd.DataFrame(columns=["code", "name", "avg_house_price"]) + df = pd.read_csv(csv_path) + return df[["code", "name", "avg_house_price"]] + +def _compute_la_targets() -> dict[str, float]: + """Per-LA main residence value target. -def _compute_la_shares() -> pd.DataFrame: - """Return a DataFrame with columns code, name, share. + target_la = avg_house_price_la × ownership_share_la × n_households_la - Each LA's share is proportional to households x avg_house_price, - scaled to sum to 1 across all UK local authorities. 
+ Returns a dict ``code -> £``. LAs missing any input drop out and + are handled in loss.py by the national-share fallback (same + pattern as the tenure and rent targets). """ - df = _load_inputs() - df["property_wealth"] = df["households"] * df["avg_house_price"] - total = df["property_wealth"].sum() - df["share"] = df["property_wealth"] / total - return df[["code", "name", "share"]] + from policyengine_uk_data.targets.sources.local_la_extras import ( + load_household_counts, + load_tenure_data, + ) + + prices = load_la_avg_prices() + tenure = load_tenure_data() + households = load_household_counts() + + if prices.empty or tenure.empty or households.empty: + return {} + merged = prices.merge( + tenure, left_on="code", right_on="la_code", how="left" + ).merge(households, on="la_code", how="left") + + ownership_share = ( + merged["owned_outright_pct"].fillna(0) + + merged["owned_mortgage_pct"].fillna(0) + ) / 100 + targets = merged["avg_house_price"] * ownership_share * merged["households"] -def _compute_la_targets() -> dict[str, dict[int, float]]: - """Scale per-LA shares by the national household-land series.""" - shares = _compute_la_shares().set_index("code")["share"] return { - code: { - year: float(share) * HOUSEHOLD_LAND_VALUES[year] - for year in HOUSEHOLD_LAND_VALUES - } - for code, share in shares.items() + code: float(value) + for code, value in zip(merged["code"], targets) + if pd.notna(value) and value > 0 } def get_targets() -> list[Target]: - csv_path = STORAGE / "la_land_values.csv" - if not csv_path.exists(): + prices = load_la_avg_prices() + if prices.empty: return [] - df = _load_inputs() la_targets = _compute_la_targets() targets: list[Target] = [] - for _, row in df.iterrows(): + for _, row in prices.iterrows(): code = row["code"] - name = row["name"] + target_value = la_targets.get(code) + if target_value is None: + continue + # HMLR Dec 2025 snapshot; same value across calibration years + # until a year-varying HMLR series is wired in. 
+ values = {year: target_value for year in (2024, 2025, 2026)} targets.append( Target( - name=f"ons/household_land_value/{code}", - variable="household_land_value", - source="ons", + name=f"housing/main_residence_value/{code}", + variable="main_residence_value", + source="hmlr", unit=Unit.GBP, geographic_level=GeographicLevel.LOCAL_AUTHORITY, geo_code=code, - geo_name=name, - values=la_targets[code], - reference_url=_REF_URL, + geo_name=row["name"], + values=values, + reference_url=_REF_URL_HMLR, ) ) diff --git a/policyengine_uk_data/tests/test_la_land_value_targets.py b/policyengine_uk_data/tests/test_la_land_value_targets.py index 21a593fac..d8ebadc62 100644 --- a/policyengine_uk_data/tests/test_la_land_value_targets.py +++ b/policyengine_uk_data/tests/test_la_land_value_targets.py @@ -1,20 +1,28 @@ -"""Tests for LA-level household land value calibration targets.""" +"""Tests for LA-level main residence value calibration targets. + +Targets are built from directly observed LA-level housing indicators +(HMLR avg house price × English Housing Survey ownership share × Census +household count), mirroring the existing private-rent target. No +national-total apportionment. 
+""" import pandas as pd from policyengine_uk_data.targets.schema import GeographicLevel from policyengine_uk_data.targets.sources._common import STORAGE -from policyengine_uk_data.targets.sources._land import HOUSEHOLD_LAND_VALUES from policyengine_uk_data.targets.sources.la_land import ( - _compute_la_shares, _compute_la_targets, - _load_inputs, get_targets, + load_la_avg_prices, +) +from policyengine_uk_data.targets.sources.local_la_extras import ( + load_household_counts, + load_tenure_data, ) + +LA_PRICES = load_la_avg_prices() LA_TARGETS = _compute_la_targets() -LA_SHARES = _compute_la_shares() -LA_INPUTS = _load_inputs() # ── CSV data quality ───────────────────────────────────────────────── @@ -23,61 +31,56 @@ def test_csv_row_count_matches_la_code_list(): """la_land_values.csv should have the same 360 LAs as local_authorities_2021.csv.""" la_codes = pd.read_csv(STORAGE / "local_authorities_2021.csv") - assert len(LA_INPUTS) == len(la_codes) - assert set(LA_INPUTS["code"]) == set(la_codes["code"]) + raw = pd.read_csv(STORAGE / "la_land_values.csv") + assert len(raw) == len(la_codes) + assert set(raw["code"]) == set(la_codes["code"]) def test_csv_columns_match_schema(): """CSV should have exactly the columns code, name, households, avg_house_price.""" - assert list(LA_INPUTS.columns) == ["code", "name", "households", "avg_house_price"] + raw = pd.read_csv(STORAGE / "la_land_values.csv") + assert list(raw.columns) == ["code", "name", "households", "avg_house_price"] def test_csv_no_missing_values(): """No LA should have NaN in any column.""" - assert not LA_INPUTS.isna().any().any() + raw = pd.read_csv(STORAGE / "la_land_values.csv") + assert not raw.isna().any().any() def test_csv_covers_all_four_countries(): """All four UK countries (E/W/S/NI) should appear.""" - prefixes = LA_INPUTS["code"].str[0].unique() + prefixes = LA_PRICES["code"].str[0].unique() assert set(prefixes) == {"E", "W", "S", "N"} def test_house_prices_within_plausible_range(): """Avg house 
prices should be between £50k and £2m per LA.""" - for _, row in LA_INPUTS.iterrows(): + for _, row in LA_PRICES.iterrows(): assert 50_000 <= row["avg_house_price"] <= 2_000_000, ( f"{row['name']}: avg_house_price £{row['avg_house_price']:,} " "outside plausible range" ) -def test_households_positive(): - """Every LA should have a positive implied household count.""" - assert (LA_INPUTS["households"] > 0).all() - - -def test_households_within_plausible_range(): +def test_csv_households_within_plausible_range(): """Smallest UK billing authority (Isles of Scilly) has ~1,100 - households; largest (Birmingham) has ~450,000. A 1000x outlier — like - the regional-total fallback that leaked into the IoS row pre-review — - must be caught by bounds, not spotted by eye. + households; largest (Birmingham) has ~450,000. The CSV `households` + column is retained as a regression fixture for the IoS fallback leak + even though the calibration target uses Census counts. """ - out_of_range = LA_INPUTS[~LA_INPUTS["households"].between(500, 500_000)] + raw = pd.read_csv(STORAGE / "la_land_values.csv") + out_of_range = raw[~raw["households"].between(500, 500_000)] assert out_of_range.empty, ( - "Households out of plausible [500, 500_000] range: " + "CSV households out of plausible [500, 500_000] range: " f"{out_of_range[['code', 'name', 'households']].to_dict('records')}" ) def test_isles_of_scilly_households_are_thousands_not_millions(): - """Explicit regression for the IoS fallback leak (was 2,492,115). - - Real IoS has ~1,115 households per ONS mid-2023 estimate (pop ~2,000). - Anything outside [500, 5,000] indicates the fallback path has - regressed again. 
- """ - ios = LA_INPUTS[LA_INPUTS["code"] == "E06000053"] + """Explicit regression for the IoS fallback leak (was 2,492,115).""" + raw = pd.read_csv(STORAGE / "la_land_values.csv") + ios = raw[raw["code"] == "E06000053"] assert len(ios) == 1 hh = int(ios["households"].iloc[0]) assert 500 <= hh <= 5_000, ( @@ -85,93 +88,67 @@ def test_isles_of_scilly_households_are_thousands_not_millions(): ) -# ── Share constraints ──────────────────────────────────────────────── - - -def test_shares_sum_to_one(): - """LA shares should sum to exactly 1.""" - assert abs(LA_SHARES["share"].sum() - 1.0) < 1e-9 - - -def test_all_shares_positive(): - """Every LA share should be positive.""" - assert (LA_SHARES["share"] > 0).all() - - # ── Target value constraints ───────────────────────────────────────── -def test_all_targets_positive(): - """Every LA target should be a positive value for every year.""" - for code, values in LA_TARGETS.items(): - for year, value in values.items(): - assert value > 0, f"{code} {year}: non-positive target {value}" - - -def test_targets_sum_to_national(): - """LA targets should sum to the ONS national household land total.""" - for year in (2021, 2023, 2024): - la_sum = sum(values[year] for values in LA_TARGETS.values()) - national = HOUSEHOLD_LAND_VALUES[year] - rel_error = abs(la_sum / national - 1) - assert rel_error < 1e-6, ( - f"{year}: LA sum £{la_sum / 1e12:.3f}tn != " - f"national £{national / 1e12:.3f}tn" +def test_targets_match_observed_product(): + """Every target equals avg_price × ownership_share × n_households exactly. + + No national-total apportionment, no rescaling: just the directly + observed product, identical in shape to the rent target. 
+ """ + prices = LA_PRICES.set_index("code")["avg_house_price"] + tenure = load_tenure_data().set_index("la_code") + households = load_household_counts().set_index("la_code")["households"] + + for code, target in LA_TARGETS.items(): + if code not in tenure.index or code not in households.index: + continue + ownership = ( + tenure.loc[code, "owned_outright_pct"] + + tenure.loc[code, "owned_mortgage_pct"] + ) / 100 + expected = prices.loc[code] * ownership * households.loc[code] + assert abs(target - expected) < 1e-3, ( + f"{code}: target {target:,.2f} != expected {expected:,.2f}" ) -def test_kensington_and_chelsea_above_blackpool(): - """K&C avg household land value should exceed Blackpool's.""" - kc_code = LA_INPUTS.loc[LA_INPUTS["name"] == "Kensington and Chelsea", "code"].iloc[ - 0 - ] - blackpool_code = LA_INPUTS.loc[LA_INPUTS["name"] == "Blackpool", "code"].iloc[0] - kc_hh = LA_INPUTS.set_index("code").loc[kc_code, "households"] - bp_hh = LA_INPUTS.set_index("code").loc[blackpool_code, "households"] - kc_per_hh = LA_TARGETS[kc_code][2024] / kc_hh - bp_per_hh = LA_TARGETS[blackpool_code][2024] / bp_hh - assert kc_per_hh > bp_per_hh * 3, ( - f"K&C avg household land (£{kc_per_hh:,.0f}) should be at least " - f"3x Blackpool (£{bp_per_hh:,.0f})" +def test_all_targets_positive(): + """Every per-LA target should be positive.""" + assert all(value > 0 for value in LA_TARGETS.values()) + + +def test_explicit_targets_cover_english_las(): + """Direct-formula targets are produced for LAs with EHS tenure data + (England). 
Wales, Scotland and Northern Ireland LAs are handled by + the national-share fallback in loss.py — same as the existing + tenure target, by construction.""" + prefixes = {code[0] for code in LA_TARGETS} + assert prefixes == {"E"}, ( + f"Expected English-only targets from EHS data, got {sorted(prefixes)}" ) -def test_london_prime_dominates_top_quintile(): - """Top quintile of LAs by avg household land value should be London-heavy.""" - totals = pd.Series( - {code: values[2024] for code, values in LA_TARGETS.items()}, name="total" - ) - inputs = LA_INPUTS.set_index("code") - avg_per_hh = (totals / inputs["households"]).sort_values(ascending=False) - top_quintile = avg_per_hh.head(len(avg_per_hh) // 5).index - london_codes = set(inputs.loc[inputs.index.str.startswith("E09"), :].index) - london_in_top = len(set(top_quintile) & london_codes) - assert london_in_top >= 15, ( - f"Expected London LAs to dominate top quintile, found only {london_in_top}" - ) +def test_kensington_and_chelsea_above_blackpool(): + """K&C aggregate main-residence-value target should exceed Blackpool's.""" + name_to_code = dict(zip(LA_PRICES["name"], LA_PRICES["code"])) + kc = LA_TARGETS[name_to_code["Kensington and Chelsea"]] + bp = LA_TARGETS[name_to_code["Blackpool"]] + assert kc > bp, f"K&C target (£{kc / 1e9:.1f}bn) should exceed Blackpool (£{bp / 1e9:.1f}bn)" -def test_london_total_land_dwarfs_north_east(): +def test_london_total_exceeds_north_east(): """Sum of London LA targets should exceed sum of North-East LA targets.""" - inputs = LA_INPUTS.set_index("code") - london_codes = inputs.loc[inputs.index.str.startswith("E09"), :].index - ne_prefixes = ( - "E06000001", - "E06000002", - "E06000003", - "E06000004", - "E06000005", - "E06000047", - "E08000021", - "E08000022", - "E08000023", - "E08000024", - "E08000037", - "E06000057", - ) - ne_codes = [c for c in inputs.index if c in ne_prefixes] - london_total = sum(LA_TARGETS[c][2024] for c in london_codes) - ne_total = sum(LA_TARGETS[c][2024] 
for c in ne_codes) + london_codes = [c for c in LA_TARGETS if c.startswith("E09")] + ne_prefixes = { + "E06000001", "E06000002", "E06000003", "E06000004", "E06000005", + "E06000047", "E08000021", "E08000022", "E08000023", "E08000024", + "E08000037", "E06000057", + } + ne_codes = [c for c in LA_TARGETS if c in ne_prefixes] + london_total = sum(LA_TARGETS[c] for c in london_codes) + ne_total = sum(LA_TARGETS[c] for c in ne_codes) assert london_total > ne_total * 3, ( f"London total (£{london_total / 1e9:.0f}bn) should exceed " f"NE total (£{ne_total / 1e9:.0f}bn) by at least 3x" @@ -181,18 +158,18 @@ def test_london_total_land_dwarfs_north_east(): # ── Target registry integration ────────────────────────────────────── -def test_get_targets_returns_360(): - """get_targets() should return exactly 360 LA targets.""" +def test_get_targets_returns_targets_for_covered_las(): + """get_targets() returns one Target per LA with all inputs available.""" targets = get_targets() - assert len(targets) == 360 + assert len(targets) == len(LA_TARGETS) + assert {t.geo_code for t in targets} == set(LA_TARGETS) def test_target_names_follow_code_pattern(): - """Target names should follow the ons/household_land_value/{code} pattern.""" - targets = get_targets() - for t in targets: - assert t.name.startswith("ons/household_land_value/") - assert t.name.removeprefix("ons/household_land_value/") == t.geo_code + """Target names should follow the housing/main_residence_value/{code} pattern.""" + for t in get_targets(): + assert t.name.startswith("housing/main_residence_value/") + assert t.name.removeprefix("housing/main_residence_value/") == t.geo_code def test_targets_declare_local_authority_geographic_level(): @@ -201,23 +178,28 @@ def test_targets_declare_local_authority_geographic_level(): assert t.geographic_level == GeographicLevel.LOCAL_AUTHORITY -def test_targets_have_values_for_all_known_years(): - """LA targets should carry every year in the backfilled series.""" - expected_years = 
set(HOUSEHOLD_LAND_VALUES) +def test_targets_declare_hmlr_source(): + """LA property-value targets are sourced from HMLR UK HPI.""" for t in get_targets(): - assert set(t.values) == expected_years, ( - f"{t.name} missing years: {expected_years - set(t.values)}" - ) + assert t.source == "hmlr" + + +def test_targets_have_calibration_year_values(): + """LA targets should carry values for the supported calibration years.""" + for t in get_targets(): + assert {2024, 2025, 2026} <= set(t.values) def test_target_registry_includes_la_targets(): - """LA land targets should appear in the global registry.""" + """LA property-value targets should appear in the global registry.""" from policyengine_uk_data.targets import get_all_targets targets = get_all_targets( year=2024, geographic_level=GeographicLevel.LOCAL_AUTHORITY ) - la_land = [t for t in targets if t.name.startswith("ons/household_land_value/")] - assert len(la_land) == 360, ( - f"Expected 360 LA household-land targets, got {len(la_land)}" + la_property = [ + t for t in targets if t.name.startswith("housing/main_residence_value/") + ] + assert len(la_property) == len(LA_TARGETS), ( + f"Expected {len(LA_TARGETS)} LA property-value targets, got {len(la_property)}" ) diff --git a/policyengine_uk_data/tests/test_la_loss_land_value.py b/policyengine_uk_data/tests/test_la_loss_land_value.py index 64c9b4b55..bbc9d6b5f 100644 --- a/policyengine_uk_data/tests/test_la_loss_land_value.py +++ b/policyengine_uk_data/tests/test_la_loss_land_value.py @@ -1,11 +1,11 @@ -"""Tests for the LA-level household-land-value column wired into the +"""Tests for the LA-level main-residence-value column wired into the local-authority calibration loss matrix. Two layers: 1. Light-weight checks against the per-LA target dict from la_land.py — these run without a Microsimulation and exercise the ordering / - summation properties the loss-matrix code relies on. + shape properties the loss-matrix code relies on. 2. 
Full ``create_local_authority_target_matrix`` build, gated on the enhanced FRS fixture so CI environments without the dataset skip gracefully. @@ -13,11 +13,16 @@ import numpy as np import pandas as pd -import pytest from policyengine_uk_data.storage import STORAGE_FOLDER -from policyengine_uk_data.targets.sources._land import HOUSEHOLD_LAND_VALUES -from policyengine_uk_data.targets.sources.la_land import _compute_la_targets +from policyengine_uk_data.targets.sources.la_land import ( + _compute_la_targets, + load_la_avg_prices, +) +from policyengine_uk_data.targets.sources.local_la_extras import ( + load_household_counts, + load_tenure_data, +) LA_CODES = pd.read_csv(STORAGE_FOLDER / "local_authorities_2021.csv") @@ -27,47 +32,50 @@ # ── Layer 1: per-LA targets line up with the LA code ordering ──────── -def test_targets_cover_every_la_code(): - """Every code in local_authorities_2021.csv has an LA land target.""" - missing = set(LA_CODES["code"]) - set(LA_TARGETS) - assert not missing, f"LA codes missing land targets: {sorted(missing)[:5]}" +def test_explicit_targets_cover_english_las(): + """Direct-formula targets are produced for LAs with EHS tenure data + (England). 
Other UK countries fall through to the national-share + fallback in loss.py — same as the existing tenure target.""" + prefixes = {code[0] for code in LA_TARGETS} + assert prefixes == {"E"} -def test_target_vector_in_la_codes_order_is_finite_positive(): - """Reindexing by la_codes order yields a clean float vector.""" - year = 2025 - vec = ( - LA_CODES["code"] - .map({code: values[year] for code, values in LA_TARGETS.items()}) - .values - ) +def test_target_vector_in_la_codes_order_is_finite_positive_where_present(): + """Reindexing by la_codes order yields a clean float vector for + LAs with a target; LAs missing inputs become NaN (later filled by + the national-share fallback inside loss.py).""" + vec = LA_CODES["code"].map(LA_TARGETS).values + finite = vec[~np.isnan(vec.astype(float))] assert len(vec) == 360 - assert np.isfinite(vec).all() - assert (vec > 0).all() + assert (finite > 0).all() -def test_target_vector_sums_to_national_household_land(): - """Sum of the 360 LA targets equals the ONS national figure for that year.""" - for year in (2024, 2025, 2026): - vec = ( - LA_CODES["code"] - .map({code: values[year] for code, values in LA_TARGETS.items()}) - .values - ) - rel_error = abs(vec.sum() / HOUSEHOLD_LAND_VALUES[year] - 1) - assert rel_error < 1e-6, ( - f"{year}: sum £{vec.sum() / 1e12:.3f}tn != " - f"national £{HOUSEHOLD_LAND_VALUES[year] / 1e12:.3f}tn" - ) +def test_targets_match_observed_product_inline(): + """Per-LA target equals avg_price × ownership_share × n_households — + the same shape as private rent's ``median_rent × renter_pct × n_hh``. 
+ """ + prices = load_la_avg_prices().set_index("code")["avg_house_price"] + tenure = load_tenure_data().set_index("la_code") + households = load_household_counts().set_index("la_code")["households"] + + for code, target in LA_TARGETS.items(): + if code not in tenure.index or code not in households.index: + continue + ownership = ( + tenure.loc[code, "owned_outright_pct"] + + tenure.loc[code, "owned_mortgage_pct"] + ) / 100 + expected = prices.loc[code] * ownership * households.loc[code] + assert abs(target - expected) < 1e-3 # ── Layer 2: full LA loss matrix build ─────────────────────────────── -def test_la_loss_matrix_includes_household_land_value(enhanced_frs): - """The LA target matrix must expose ons/household_land_value in both - matrix (per-household) and y (per-LA) so the calibrator can train on it. - """ +def test_la_loss_matrix_includes_main_residence_value(enhanced_frs): + """The LA target matrix must expose housing/main_residence_value in + both matrix (per-household) and y (per-LA) so the calibrator can + train on it.""" from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, ) @@ -76,8 +84,8 @@ def test_la_loss_matrix_includes_household_land_value(enhanced_frs): enhanced_frs, time_period=enhanced_frs.time_period ) - assert "ons/household_land_value" in matrix.columns - assert "ons/household_land_value" in y.columns + assert "housing/main_residence_value" in matrix.columns + assert "housing/main_residence_value" in y.columns def test_la_loss_y_vector_length_360(enhanced_frs): @@ -92,53 +100,35 @@ def test_la_loss_y_vector_length_360(enhanced_frs): ) assert len(y) == 360 - assert len(y["ons/household_land_value"]) == 360 - - -def test_la_loss_y_sums_to_national_for_calibration_year(enhanced_frs): - """Sum of LA-level y values equals the ONS national household-land total - for the calibration year (within float tolerance).""" - from 
policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - year = int(enhanced_frs.time_period) - fallback = max(HOUSEHOLD_LAND_VALUES) - expected = HOUSEHOLD_LAND_VALUES.get(year, HOUSEHOLD_LAND_VALUES[fallback]) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) + assert len(y["housing/main_residence_value"]) == 360 - rel_error = abs(y["ons/household_land_value"].sum() / expected - 1) - assert rel_error < 1e-6 +def test_la_loss_y_matches_observed_product_for_covered_las(enhanced_frs): + """For LAs with all inputs present, y equals avg_price × ownership × n_households. -def test_la_loss_y_ordering_matches_la_codes(enhanced_frs): - """y["ons/household_land_value"] must be ordered by local_authorities_2021.csv, - so the country mask and the targets refer to the same LAs at each index.""" + LAs missing inputs use the national-share fallback (covered in + test_la_loss_y_all_positive).""" from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, ) - year = int(enhanced_frs.time_period) - fallback = max(HOUSEHOLD_LAND_VALUES) - land_year = year if year in HOUSEHOLD_LAND_VALUES else fallback - expected = ( - LA_CODES["code"] - .map({code: values[land_year] for code, values in LA_TARGETS.items()}) - .values - ) - _, y, _ = create_local_authority_target_matrix( enhanced_frs, time_period=enhanced_frs.time_period ) - np.testing.assert_array_equal(y["ons/household_land_value"].values, expected) + expected_by_code = LA_TARGETS + for i, code in enumerate(LA_CODES["code"].values): + if code not in expected_by_code: + continue # fallback path + actual = y["housing/main_residence_value"].iloc[i] + expected = expected_by_code[code] + assert abs(actual - expected) < 1e-3, ( + f"{code}: y {actual:,.2f} != expected {expected:,.2f}" + ) def test_la_loss_y_all_positive(enhanced_frs): - """No LA should have a 
non-positive household-land target.""" + """No LA should have a non-positive main-residence-value target.""" from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, ) @@ -147,12 +137,12 @@ def test_la_loss_y_all_positive(enhanced_frs): enhanced_frs, time_period=enhanced_frs.time_period ) - assert (y["ons/household_land_value"] > 0).all() + assert (y["housing/main_residence_value"] > 0).all() -def test_la_loss_matrix_column_matches_household_land_value(enhanced_frs): - """matrix['ons/household_land_value'] should equal the per-household - household_land_value pulled from policyengine-uk for the calibration year.""" +def test_la_loss_matrix_column_matches_main_residence_value(enhanced_frs): + """matrix['housing/main_residence_value'] should equal the per-household + main_residence_value pulled from policyengine-uk for the calibration year.""" from policyengine_uk import Microsimulation from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, @@ -164,6 +154,8 @@ def test_la_loss_matrix_column_matches_household_land_value(enhanced_frs): sim = Microsimulation(dataset=enhanced_frs) sim.default_calculation_period = enhanced_frs.time_period - expected = sim.calculate("household_land_value").values + expected = sim.calculate("main_residence_value").values - np.testing.assert_array_equal(matrix["ons/household_land_value"].values, expected) + np.testing.assert_array_equal( + matrix["housing/main_residence_value"].values, expected + ) diff --git a/policyengine_uk_data/tests/test_regional_land_value_targets.py b/policyengine_uk_data/tests/test_regional_land_value_targets.py index 92b77df69..5b1b5f931 100644 --- a/policyengine_uk_data/tests/test_regional_land_value_targets.py +++ b/policyengine_uk_data/tests/test_regional_land_value_targets.py @@ -130,9 +130,9 @@ def test_targets_have_values_for_2021_to_2026(): def test_target_registry_includes_regional(): 
"""Regional land targets should appear in the global registry.""" - from policyengine_uk_data.targets import get_all_targets, GeographicLevel + from policyengine_uk_data.targets import get_all_targets - targets = get_all_targets(year=2025, geographic_level=GeographicLevel.REGION) + targets = get_all_targets(year=2025) regional = [t for t in targets if t.name.startswith("ons/household_land_value/")] assert len(regional) == 11, ( f"Expected 11 regional land targets, got {len(regional)}" From e69d6586fc7ef69f6aec425db84b149b3353accf Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 29 Apr 2026 11:40:48 +0100 Subject: [PATCH 07/11] Apply ruff format to la_land.py and test_la_land_value_targets.py Co-Authored-By: Claude Opus 4.7 (1M context) --- .../targets/sources/la_land.py | 9 ++++----- .../tests/test_la_land_value_targets.py | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/policyengine_uk_data/targets/sources/la_land.py b/policyengine_uk_data/targets/sources/la_land.py index 0d4472875..193f8dedb 100644 --- a/policyengine_uk_data/targets/sources/la_land.py +++ b/policyengine_uk_data/targets/sources/la_land.py @@ -67,13 +67,12 @@ def _compute_la_targets() -> dict[str, float]: if prices.empty or tenure.empty or households.empty: return {} - merged = prices.merge( - tenure, left_on="code", right_on="la_code", how="left" - ).merge(households, on="la_code", how="left") + merged = prices.merge(tenure, left_on="code", right_on="la_code", how="left").merge( + households, on="la_code", how="left" + ) ownership_share = ( - merged["owned_outright_pct"].fillna(0) - + merged["owned_mortgage_pct"].fillna(0) + merged["owned_outright_pct"].fillna(0) + merged["owned_mortgage_pct"].fillna(0) ) / 100 targets = merged["avg_house_price"] * ownership_share * merged["households"] diff --git a/policyengine_uk_data/tests/test_la_land_value_targets.py b/policyengine_uk_data/tests/test_la_land_value_targets.py index d8ebadc62..3942cef4d 100644 --- 
a/policyengine_uk_data/tests/test_la_land_value_targets.py +++ b/policyengine_uk_data/tests/test_la_land_value_targets.py @@ -135,16 +135,27 @@ def test_kensington_and_chelsea_above_blackpool(): name_to_code = dict(zip(LA_PRICES["name"], LA_PRICES["code"])) kc = LA_TARGETS[name_to_code["Kensington and Chelsea"]] bp = LA_TARGETS[name_to_code["Blackpool"]] - assert kc > bp, f"K&C target (£{kc / 1e9:.1f}bn) should exceed Blackpool (£{bp / 1e9:.1f}bn)" + assert kc > bp, ( + f"K&C target (£{kc / 1e9:.1f}bn) should exceed Blackpool (£{bp / 1e9:.1f}bn)" + ) def test_london_total_exceeds_north_east(): """Sum of London LA targets should exceed sum of North-East LA targets.""" london_codes = [c for c in LA_TARGETS if c.startswith("E09")] ne_prefixes = { - "E06000001", "E06000002", "E06000003", "E06000004", "E06000005", - "E06000047", "E08000021", "E08000022", "E08000023", "E08000024", - "E08000037", "E06000057", + "E06000001", + "E06000002", + "E06000003", + "E06000004", + "E06000005", + "E06000047", + "E08000021", + "E08000022", + "E08000023", + "E08000024", + "E08000037", + "E06000057", } ne_codes = [c for c in LA_TARGETS if c in ne_prefixes] london_total = sum(LA_TARGETS[c] for c in london_codes) From c40f7edb00dd6379f6ff993bb310ba35568bb52a Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 29 Apr 2026 12:08:09 +0100 Subject: [PATCH 08/11] Add LA loss-matrix calibration well-formedness tests Four FRS-fixture-gated tests exercising properties the optimiser relies on: - y has no NaN entries (NaN would propagate silently through the optimiser). - Non-English LAs use the national-share fallback (positive, non-NaN values), since EHS coverage is England-only. - matrix column has non-zero variance, so the new target carries calibration signal rather than being inert. 
- Sum of English LA targets is in the same order of magnitude (0.5x-3x) as the implied initial English main-residence-value, so the calibrator can plausibly reach the target via reweighting rather than 100x weight inflation. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/test_la_loss_land_value.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/policyengine_uk_data/tests/test_la_loss_land_value.py b/policyengine_uk_data/tests/test_la_loss_land_value.py index bbc9d6b5f..48bcfcb68 100644 --- a/policyengine_uk_data/tests/test_la_loss_land_value.py +++ b/policyengine_uk_data/tests/test_la_loss_land_value.py @@ -159,3 +159,96 @@ def test_la_loss_matrix_column_matches_main_residence_value(enhanced_frs): np.testing.assert_array_equal( matrix["housing/main_residence_value"].values, expected ) + + +# ── Layer 2b: calibration well-formedness ───────────────────────────── + + +def test_la_loss_y_has_no_nan(enhanced_frs): + """Every LA must have a numeric target. NaN entries would propagate + through the optimiser and fail calibration silently.""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + assert not y["housing/main_residence_value"].isna().any() + + +def test_la_loss_fallback_applied_to_non_english_las(enhanced_frs): + """Wales / Scotland / NI LAs use the national-share fallback because + EHS only covers England. 
Their y entry must still be positive + (so the optimiser has a target to fit) and must NOT equal the + direct-formula value (which is undefined when ownership share + is missing).""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + fallback_codes = [c for c in LA_CODES["code"] if not c.startswith("E")] + fallback_indices = [ + i for i, c in enumerate(LA_CODES["code"].values) if c in fallback_codes + ] + fallback_values = y["housing/main_residence_value"].iloc[fallback_indices] + + assert (fallback_values > 0).all() + assert fallback_values.notna().all() + + +def test_la_loss_matrix_column_carries_calibration_signal(enhanced_frs): + """matrix['housing/main_residence_value'] must vary across households — + a constant column gives the optimiser no signal to differentiate + LAs and the new target would be inert.""" + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + matrix, _, _ = create_local_authority_target_matrix( + enhanced_frs, time_period=enhanced_frs.time_period + ) + + column = matrix["housing/main_residence_value"].values + assert column.var() > 0 + assert (column > 0).any(), "no households with positive main_residence_value" + + +def test_la_loss_english_target_total_within_reach_of_initial_weights(enhanced_frs): + """Sum of English LA targets should be in the same order of magnitude + as the implied initial English main-residence-value — so the + optimiser has a chance of hitting them via reweighting rather than + requiring weights to inflate by 100x.""" + from policyengine_uk import Microsimulation + from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( + create_local_authority_target_matrix, + ) + + _, y, _ = create_local_authority_target_matrix( + enhanced_frs, 
time_period=enhanced_frs.time_period + ) + + sim = Microsimulation(dataset=enhanced_frs) + original_weights = sim.calculate("household_weight", 2025).values + main_res = sim.calculate("main_residence_value", enhanced_frs.time_period).values + country = sim.calculate("country", enhanced_frs.time_period).values + + england_mask = country == "ENGLAND" + england_initial = (original_weights[england_mask] * main_res[england_mask]).sum() + + english_indices = [ + i for i, c in enumerate(LA_CODES["code"].values) if c.startswith("E") + ] + english_targets = y["housing/main_residence_value"].iloc[english_indices].sum() + + ratio = english_targets / england_initial + assert 0.5 < ratio < 3.0, ( + f"English LA target sum (£{english_targets / 1e9:.0f}bn) / " + f"initial English main-residence-value (£{england_initial / 1e9:.0f}bn) " + f"= {ratio:.2f}; calibration target may be hard to reach" + ) From 717c9cddc2dcbdd5bb95201bf04ebca376abdfac Mon Sep 17 00:00:00 2001 From: Vahid Ahmadi Date: Wed, 29 Apr 2026 15:48:59 +0100 Subject: [PATCH 09/11] Relabel housing/main_residence_value as derived proxy + document lineage caveat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per @MaxGhenis PR review: the target value is a constructed proxy (avg HMLR price × EHS ownership share × Census households), not a directly observed LA total of main residence value. The earlier PR description and code comments overstated this. Substantive lineage gap that the docs now flag explicitly: - Matrix col main_residence_value (policyengine-uk) is WAS-imputed household stock wealth, regionally uprated. - Target uses HMLR UK HPI 'Average Price' — a transaction-weighted geography-period price index, not an observed stock total of owner-occupied residences. - Two different price concepts on the two sides of the constraint. The product is a defensible identity, but it is a derived proxy, not a direct benchmark. Behaviour unchanged. 
This commit only updates the docstring in la_land.py and the comment in loss.py to call the target "derived proxy" rather than "directly observed". A separate policy question (whether derived proxy targets should sit at full training weight alongside direct VOA/HMRC/ONS/DWP targets, or be soft-weighted) is being tracked separately. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../local_areas/local_authorities/loss.py | 9 ++++-- .../targets/sources/la_land.py | 28 +++++++++++++++---- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index a58b66bd2..53283f59c 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -255,9 +255,12 @@ def create_local_authority_target_matrix( ) # ── Main residence value (HMLR × ownership share × households) ─ - # Mirrors the private-rent target pattern: directly observed - # LA-level housing indicators multiplied together, with a - # national-share fallback for LAs missing any input. + # Derived proxy target: a product of three observed inputs, not a + # directly observed LA total. Mirrors the private-rent target's + # multiplicative shape. Lineage caveat: matrix col is WAS-imputed + # stock wealth; the target uses HMLR HPI transaction prices — + # different price concepts on the two sides of the constraint. + # See targets/sources/la_land.py for full provenance. 
la_prices = load_la_avg_prices() tenure_merged = tenure_merged.merge( la_prices[["code", "avg_house_price"]], on="code", how="left" diff --git a/policyengine_uk_data/targets/sources/la_land.py b/policyengine_uk_data/targets/sources/la_land.py index 193f8dedb..b9a60bcc7 100644 --- a/policyengine_uk_data/targets/sources/la_land.py +++ b/policyengine_uk_data/targets/sources/la_land.py @@ -1,12 +1,28 @@ -"""LA-level main residence value targets. +"""LA-level main residence value targets (derived proxy). -Each local authority's target is built from directly observed LA-level -housing indicators, mirroring the existing private-rent calibration: +This target is a **derived proxy**, not a directly observed LA total. +Per-LA target is constructed by multiplying three observed inputs: target_la = avg_house_price_la × ownership_share_la × n_households_la -This is the symmetric counterpart of the rent target for the -owner-occupier side. No national-total apportionment. +Same multiplicative shape as the existing private-rent target +(median × pct × n_households). + +Lineage caveat (flagged in PR review by @MaxGhenis): +- Matrix column ``main_residence_value`` in policyengine-uk is a + **stock-wealth** quantity, imputed from the Wealth and Assets Survey + (WAS) and uprated regionally via property-wealth intensity ratios. +- The target value uses HM Land Registry UK HPI "Average Price" — a + **transaction-weighted geography-period price index**, not an + observed stock total of owner-occupied main residences. +- The product avg_price × ownership × n_households is therefore a + defensible identity ("if every owner-occupied dwelling were valued + at the LA HPI average, the total would be £X") but the two sides + of the calibration constraint reference different price concepts. + +The target is treated as a soft-weighted/proxy training signal +relative to direct observed targets (HMRC SPI counts, ONS mid-year +population, DWP UC caseload, VOA dwelling band counts). 
Data sources: - Average house price by LA: HM Land Registry UK HPI (Dec 2025). @@ -14,7 +30,7 @@ LA name. For Northern Ireland LGDs missing from a specific month, the NI country-level HPI price is used as a fallback. - Ownership share by LA: English Housing Survey, via load_tenure_data - (owned_outright_pct + owned_mortgage_pct). + (owned_outright_pct + owned_mortgage_pct). England-only. - Households by LA: Census 2021, via load_household_counts. """ From c330b4440e72177abb597a1430804cc486c479d9 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 29 Apr 2026 20:17:40 -0400 Subject: [PATCH 10/11] Make LA property value target validation-only --- policyengine_uk_data/datasets/create_datasets.py | 3 ++- .../datasets/local_areas/local_authorities/calibrate.py | 8 +++++++- policyengine_uk_data/targets/sources/la_land.py | 7 ++++--- policyengine_uk_data/tests/test_la_loss_land_value.py | 9 +++++++++ 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index d49566781..16e7773a2 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -183,6 +183,7 @@ def main(): from policyengine_uk_data.datasets.local_areas.local_authorities.calibrate import ( get_performance as get_la_performance, + VALIDATION_TARGETS as LA_VALIDATION_TARGETS, ) from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, @@ -197,7 +198,7 @@ def main(): national_matrix_fn=create_national_target_matrix, area_count=360, weight_file="local_authority_weights.h5", - excluded_training_targets=[], + excluded_training_targets=LA_VALIDATION_TARGETS, log_csv="la_calibration_log.csv", verbose=True, # Enable nested progress display area_name="Local Authority", diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py 
b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py index 746d94e73..b8f7f8774 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py @@ -10,12 +10,18 @@ from policyengine_uk.data import UKSingleYearDataset +VALIDATION_TARGETS = ["housing/main_residence_value"] + + def calibrate( dataset: UKSingleYearDataset, - excluded_training_targets=[], + excluded_training_targets=None, log_csv="la_calibration_log.csv", verbose: bool = False, ): + if excluded_training_targets is None: + excluded_training_targets = VALIDATION_TARGETS + return calibrate_local_areas( dataset=dataset, matrix_fn=lambda ds: create_local_authority_target_matrix(ds, ds.time_period), diff --git a/policyengine_uk_data/targets/sources/la_land.py b/policyengine_uk_data/targets/sources/la_land.py index b9a60bcc7..9c0c1ebce 100644 --- a/policyengine_uk_data/targets/sources/la_land.py +++ b/policyengine_uk_data/targets/sources/la_land.py @@ -20,9 +20,10 @@ at the LA HPI average, the total would be £X") but the two sides of the calibration constraint reference different price concepts. -The target is treated as a soft-weighted/proxy training signal -relative to direct observed targets (HMRC SPI counts, ONS mid-year -population, DWP UC caseload, VOA dwelling band counts). +The target is included as a validation/proxy diagnostic and is excluded +from the default local-authority training targets. It should not be +treated as a direct target unless a cleaner LA stock-value source is +added. Data sources: - Average house price by LA: HM Land Registry UK HPI (Dec 2025). 
diff --git a/policyengine_uk_data/tests/test_la_loss_land_value.py b/policyengine_uk_data/tests/test_la_loss_land_value.py index 48bcfcb68..2f94401de 100644 --- a/policyengine_uk_data/tests/test_la_loss_land_value.py +++ b/policyengine_uk_data/tests/test_la_loss_land_value.py @@ -88,6 +88,15 @@ def test_la_loss_matrix_includes_main_residence_value(enhanced_frs): assert "housing/main_residence_value" in y.columns +def test_main_residence_value_is_validation_target_by_default(): + """The derived proxy target is reported for validation, not trained by default.""" + from policyengine_uk_data.datasets.local_areas.local_authorities.calibrate import ( + VALIDATION_TARGETS, + ) + + assert "housing/main_residence_value" in VALIDATION_TARGETS + + def test_la_loss_y_vector_length_360(enhanced_frs): """y has one entry per LA and matches local_authorities_2021.csv ordering by length.""" From ecfd6c3214202f640b2845f9b9b790ca675399bb Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 30 Apr 2026 08:12:40 -0400 Subject: [PATCH 11/11] Remove derived LA property value target --- changelog.d/370.md | 1 - .../datasets/create_datasets.py | 3 +- .../local_authorities/calibrate.py | 8 +- .../local_areas/local_authorities/loss.py | 43 --- .../storage/la_land_values.csv | 361 ------------------ .../targets/sources/la_land.py | 133 ------- .../tests/test_la_land_value_targets.py | 216 ----------- .../tests/test_la_loss_land_value.py | 263 ------------- 8 files changed, 2 insertions(+), 1026 deletions(-) delete mode 100644 changelog.d/370.md delete mode 100644 policyengine_uk_data/storage/la_land_values.csv delete mode 100644 policyengine_uk_data/targets/sources/la_land.py delete mode 100644 policyengine_uk_data/tests/test_la_land_value_targets.py delete mode 100644 policyengine_uk_data/tests/test_la_loss_land_value.py diff --git a/changelog.d/370.md b/changelog.d/370.md deleted file mode 100644 index dd421dc13..000000000 --- a/changelog.d/370.md +++ /dev/null @@ -1 +0,0 @@ -Add LA-level 
main residence value calibration targets for all 360 UK local authorities, built from directly observed indicators (HMLR UK HPI × English Housing Survey ownership share × Census household count) and wired into the LA reweighter alongside the existing tenure and rent targets. diff --git a/policyengine_uk_data/datasets/create_datasets.py b/policyengine_uk_data/datasets/create_datasets.py index 16e7773a2..d49566781 100644 --- a/policyengine_uk_data/datasets/create_datasets.py +++ b/policyengine_uk_data/datasets/create_datasets.py @@ -183,7 +183,6 @@ def main(): from policyengine_uk_data.datasets.local_areas.local_authorities.calibrate import ( get_performance as get_la_performance, - VALIDATION_TARGETS as LA_VALIDATION_TARGETS, ) from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( create_local_authority_target_matrix, @@ -198,7 +197,7 @@ def main(): national_matrix_fn=create_national_target_matrix, area_count=360, weight_file="local_authority_weights.h5", - excluded_training_targets=LA_VALIDATION_TARGETS, + excluded_training_targets=[], log_csv="la_calibration_log.csv", verbose=True, # Enable nested progress display area_name="Local Authority", diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py index b8f7f8774..746d94e73 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/calibrate.py @@ -10,18 +10,12 @@ from policyengine_uk.data import UKSingleYearDataset -VALIDATION_TARGETS = ["housing/main_residence_value"] - - def calibrate( dataset: UKSingleYearDataset, - excluded_training_targets=None, + excluded_training_targets=[], log_csv="la_calibration_log.csv", verbose: bool = False, ): - if excluded_training_targets is None: - excluded_training_targets = VALIDATION_TARGETS - return calibrate_local_areas( dataset=dataset, matrix_fn=lambda 
ds: create_local_authority_target_matrix(ds, ds.time_period), diff --git a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py index 53283f59c..fd5ed9440 100644 --- a/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py +++ b/policyengine_uk_data/datasets/local_areas/local_authorities/loss.py @@ -11,7 +11,6 @@ - ONS income: ONS small area income estimates - Tenure: English Housing Survey - Private rent: VOA/ONS private rental market statistics -- Main residence value: HMLR UK HPI × ownership share × household count """ from policyengine_uk import Microsimulation @@ -39,7 +38,6 @@ load_tenure_data, load_private_rents, ) -from policyengine_uk_data.targets.sources.la_land import load_la_avg_prices def create_local_authority_target_matrix( @@ -254,47 +252,6 @@ def create_local_authority_target_matrix( national_rent * la_household_share, ) - # ── Main residence value (HMLR × ownership share × households) ─ - # Derived proxy target: a product of three observed inputs, not a - # directly observed LA total. Mirrors the private-rent target's - # multiplicative shape. Lineage caveat: matrix col is WAS-imputed - # stock wealth; the target uses HMLR HPI transaction prices — - # different price concepts on the two sides of the constraint. - # See targets/sources/la_land.py for full provenance. 
- la_prices = load_la_avg_prices() - tenure_merged = tenure_merged.merge( - la_prices[["code", "avg_house_price"]], on="code", how="left" - ) - - matrix["housing/main_residence_value"] = sim.calculate( - "main_residence_value" - ).values - - ownership_share_la = ( - tenure_merged["owned_outright_pct"].fillna(0) - + tenure_merged["owned_mortgage_pct"].fillna(0) - ) / 100 - tenure_merged["main_residence_value_target"] = ( - tenure_merged["avg_house_price"] - * ownership_share_la - * tenure_merged["households"] - ) - - has_property = ( - tenure_merged["avg_house_price"].notna() - & tenure_merged["owned_outright_pct"].notna() - & tenure_merged["households"].notna() - ).values - national_property = ( - original_weights * matrix["housing/main_residence_value"].values - ).sum() - - y["housing/main_residence_value"] = np.where( - has_property, - tenure_merged["main_residence_value_target"].values, - national_property * la_household_share, - ) - # ── Country mask ─────────────────────────────────────────────── country_mask = create_country_mask( household_countries=sim.calculate("country").values, diff --git a/policyengine_uk_data/storage/la_land_values.csv b/policyengine_uk_data/storage/la_land_values.csv deleted file mode 100644 index ef0023a32..000000000 --- a/policyengine_uk_data/storage/la_land_values.csv +++ /dev/null @@ -1,361 +0,0 @@ -code,name,households,avg_house_price -E06000001,Hartlepool,42687,132463 -E06000002,Middlesbrough,62873,141270 -E06000003,Redcar and Cleveland,64597,146735 -E06000004,Stockton-on-Tees,88187,170575 -E06000005,Darlington,50525,159542 -E06000006,Halton,58738,189414 -E06000007,Warrington,96224,251903 -E06000008,Blackburn with Darwen,61298,162893 -E06000009,Blackpool,69328,136485 -E06000010,"Kingston upon Hull, City of",119535,131323 -E06000011,East Riding of Yorkshire,160848,221028 -E06000012,North East Lincolnshire,72731,148043 -E06000013,North Lincolnshire,77080,180202 -E06000014,York,90598,306571 -E06000015,Derby,110196,205530 
-E06000016,Leicester,134204,226491 -E06000017,Rutland,18295,318903 -E06000018,Nottingham,131657,193888 -E06000019,"Herefordshire, County of",86625,287302 -E06000020,Telford and Wrekin,79145,215752 -E06000021,Stoke-on-Trent,115950,147831 -E06000022,Bath and North East Somerset,84855,407049 -E06000023,"Bristol, City of",205211,353265 -E06000024,North Somerset,102261,312318 -E06000025,South Gloucestershire,129657,332736 -E06000026,Plymouth,126166,218085 -E06000027,Torbay,66499,232041 -E06000030,Swindon,100125,260905 -E06000031,Peterborough,88023,231757 -E06000032,Luton,84690,281149 -E06000033,Southend-on-Sea,80611,333356 -E06000034,Thurrock,70478,325936 -E06000035,Medway,118085,298520 -E06000036,Bracknell Forest,52392,394751 -E06000037,West Berkshire,70169,400144 -E06000038,Reading,70322,354805 -E06000039,Slough,54550,337206 -E06000040,Windsor and Maidenhead,64836,572852 -E06000041,Wokingham,72272,503052 -E06000042,Milton Keynes,119789,328697 -E06000043,Brighton and Hove,126374,410203 -E06000044,Portsmouth,93797,249460 -E06000045,Southampton,107827,233920 -E06000046,Isle of Wight,71945,247936 -E06000047,County Durham,253106,143291 -E06000049,Cheshire East,185025,306039 -E06000050,Cheshire West and Chester,162209,265955 -E06000051,Shropshire,146609,281161 -E06000052,Cornwall,265515,277318 -E06000053,Isles of Scilly,1115,308582 -E06000054,Wiltshire,230273,332483 -E06000055,Bedford,82624,331140 -E06000056,Central Bedfordshire,129846,358488 -E06000057,Northumberland,154547,214989 -E06000058,"Bournemouth, Christchurch and Poole",180181,309673 -E06000059,Dorset,180376,332378 -E06000060,Buckinghamshire,237680,487653 -E06000061,North Northamptonshire,164829,258515 -E06000062,West Northamptonshire,180295,294010 -E06000063,Cumberland,91136,174281 -E06000064,Westmorland and Furness,94053,227777 -E06000065,North Yorkshire,127165,272111 -E06000066,Somerset,107037,278440 -E07000008,Cambridge,55946,485985 -E07000009,East Cambridgeshire,39423,357866 -E07000010,Fenland,46560,234696 
-E07000011,Huntingdonshire,81045,310990 -E07000012,South Cambridgeshire,71683,433729 -E07000032,Amber Valley,59550,234786 -E07000033,Bolsover,37016,185199 -E07000034,Chesterfield,49895,200389 -E07000035,Derbyshire Dales,34180,344946 -E07000036,Erewash,53328,219887 -E07000037,High Peak,42696,262663 -E07000038,North East Derbyshire,48876,249829 -E07000039,South Derbyshire,47323,257691 -E07000040,East Devon,69365,343715 -E07000041,Exeter,54901,283194 -E07000042,Mid Devon,37599,299716 -E07000043,North Devon,44797,286805 -E07000044,South Hams,41310,368805 -E07000045,Teignbridge,62939,297853 -E07000046,Torridge,31384,269713 -E07000047,West Devon,25342,308836 -E07000061,Eastbourne,48596,251894 -E07000062,Hastings,42726,240579 -E07000063,Lewes,45592,354787 -E07000064,Rother,44092,340936 -E07000065,Wealden,69502,399752 -E07000066,Basildon,80242,362898 -E07000067,Braintree,69538,332140 -E07000068,Brentwood,33376,520013 -E07000069,Castle Point,39330,365893 -E07000070,Chelmsford,77893,383278 -E07000071,Colchester,84604,300310 -E07000072,Epping Forest,57631,549606 -E07000073,Harlow,40148,314356 -E07000074,Maldon,29084,395716 -E07000075,Rochford,37856,407373 -E07000076,Tendring,73646,268088 -E07000077,Uttlesford,38874,490567 -E07000078,Cheltenham,55163,336877 -E07000079,Cotswold,42623,415906 -E07000080,Forest of Dean,39023,299003 -E07000081,Gloucester,57916,236963 -E07000082,Stroud,54475,333064 -E07000083,Tewkesbury,43553,331405 -E07000084,Basingstoke and Deane,82589,373985 -E07000085,East Hampshire,55752,440091 -E07000086,Eastleigh,60138,311946 -E07000087,Fareham,52074,332945 -E07000088,Gosport,37662,230613 -E07000089,Hart,43057,495632 -E07000090,Havant,55546,316941 -E07000091,New Forest,88338,392680 -E07000092,Rushmoor,42222,332762 -E07000093,Test Valley,58786,370524 -E07000094,Winchester,56541,465183 -E07000095,Broxbourne,41490,407544 -E07000096,Dacorum,67995,467070 -E07000098,Hertsmere,46391,552787 -E07000099,North Hertfordshire,59481,419688 -E07000102,Three 
Rivers,38141,585189 -E07000103,Watford,42014,392605 -E07000105,Ashford,55344,347180 -E07000106,Canterbury,67957,335342 -E07000107,Dartford,47304,356253 -E07000108,Dover,52754,293719 -E07000109,Gravesham,44362,347503 -E07000110,Maidstone,76476,358161 -E07000111,Sevenoaks,52383,535114 -E07000112,Folkestone and Hythe,50991,310400 -E07000113,Swale,63290,289925 -E07000114,Thanet,65339,263646 -E07000115,Tonbridge and Malling,56505,406667 -E07000116,Tunbridge Wells,51447,465399 -E07000117,Burnley,41847,131476 -E07000118,Chorley,52608,208587 -E07000119,Fylde,39584,227834 -E07000120,Hyndburn,37173,135525 -E07000121,Lancaster,63938,200827 -E07000122,Pendle,39879,146756 -E07000123,Preston,63040,189448 -E07000124,Ribble Valley,28002,280677 -E07000125,Rossendale,31819,197028 -E07000126,South Ribble,51099,207382 -E07000127,West Lancashire,51070,229049 -E07000128,Wyre,53177,193044 -E07000129,Blaby,45683,295289 -E07000130,Charnwood,75849,275442 -E07000131,Harborough,42531,344277 -E07000132,Hinckley and Bosworth,52155,261427 -E07000133,Melton,23700,290884 -E07000134,North West Leicestershire,46330,287928 -E07000135,Oadby and Wigston,23560,271073 -E07000136,Boston,30365,192397 -E07000137,East Lindsey,67863,218674 -E07000138,Lincoln,44186,186704 -E07000139,North Kesteven,53366,244502 -E07000140,South Holland,42889,227339 -E07000141,South Kesteven,66371,255822 -E07000142,West Lindsey,44831,212770 -E07000143,Breckland,63896,277510 -E07000144,Broadland,59286,315312 -E07000145,Great Yarmouth,46015,208770 -E07000146,King's Lynn and West Norfolk,72550,268435 -E07000147,North Norfolk,51309,288661 -E07000148,Norwich,66263,222723 -E07000149,South Norfolk,64294,312743 -E07000170,Ashfield,56709,190280 -E07000171,Bassetlaw,54577,207390 -E07000172,Broxtowe,51864,256619 -E07000173,Gedling,55718,249519 -E07000174,Mansfield,50295,190540 -E07000175,Newark and Sherwood,58096,240202 -E07000176,Rushcliffe,53065,333758 -E07000177,Cherwell,71480,355186 -E07000178,Oxford,58982,480531 -E07000179,South 
Oxfordshire,64446,462111 -E07000180,Vale of White Horse,62792,411832 -E07000181,West Oxfordshire,51488,422389 -E07000192,Cannock Chase,45724,234415 -E07000193,East Staffordshire,54623,223975 -E07000194,Lichfield,50572,326691 -E07000195,Newcastle-under-Lyme,57040,202139 -E07000196,South Staffordshire,50237,295861 -E07000197,Stafford,66115,265685 -E07000198,Staffordshire Moorlands,45583,216108 -E07000199,Tamworth,34257,233451 -E07000200,Babergh,42926,331953 -E07000202,Ipswich,64321,221494 -E07000203,Mid Suffolk,49895,319139 -E07000207,Elmbridge,61225,743009 -E07000208,Epsom and Ewell,33649,545227 -E07000209,Guildford,60529,523409 -E07000210,Mole Valley,40275,557926 -E07000211,Reigate and Banstead,64563,466628 -E07000212,Runnymede,36674,480402 -E07000213,Spelthorne,46050,440782 -E07000214,Surrey Heath,38463,462511 -E07000215,Tandridge,37661,495400 -E07000216,Waverley,55267,555732 -E07000217,Woking,43207,433269 -E07000218,North Warwickshire,29186,269540 -E07000219,Nuneaton and Bedworth,59777,234032 -E07000220,Rugby,49396,275985 -E07000221,Stratford-on-Avon,62587,390081 -E07000222,Warwick,66686,366063 -E07000223,Adur,29062,370708 -E07000224,Arun,76206,325042 -E07000225,Chichester,57159,441151 -E07000226,Crawley,47401,336354 -E07000227,Horsham,64939,441285 -E07000228,Mid Sussex,66723,436743 -E07000229,Worthing,53504,308125 -E07000234,Bromsgrove,43061,333847 -E07000235,Malvern Hills,36187,338287 -E07000236,Redditch,38123,248001 -E07000237,Worcester,47138,249218 -E07000238,Wychavon,61914,333519 -E07000239,Wyre Forest,47963,234081 -E07000240,St Albans,61308,609646 -E07000241,Welwyn Hatfield,48609,443795 -E07000242,East Hertfordshire,64958,460270 -E07000243,Stevenage,38761,323392 -E07000244,East Suffolk,118154,283660 -E07000245,West Suffolk,81715,295650 -E08000001,Bolton,123006,200491 -E08000002,Bury,83538,237721 -E08000003,Manchester,230104,257630 -E08000004,Oldham,96814,213245 -E08000005,Rochdale,93421,209799 -E08000006,Salford,120347,226427 
-E08000007,Stockport,134807,306235 -E08000008,Tameside,106293,211680 -E08000009,Trafford,99920,378514 -E08000010,Wigan,150180,191180 -E08000011,Knowsley,68335,190405 -E08000012,Liverpool,215816,184804 -E08000013,St. Helens,86307,179209 -E08000014,Sefton,130413,220702 -E08000015,Wirral,154379,218516 -E08000016,Barnsley,111728,172458 -E08000017,Doncaster,140435,173424 -E08000018,Rotherham,120506,191161 -E08000019,Sheffield,245451,220445 -E08000021,Newcastle upon Tyne,127262,207936 -E08000022,North Tyneside,100515,202840 -E08000023,South Tyneside,71355,165647 -E08000024,Sunderland,129173,146527 -E08000025,Birmingham,443632,232844 -E08000026,Coventry,141539,226361 -E08000027,Dudley,146542,227378 -E08000028,Sandwell,135966,200069 -E08000029,Solihull,93737,328744 -E08000030,Walsall,121137,215676 -E08000031,Wolverhampton,106933,213273 -E08000032,Bradford,218386,189396 -E08000033,Calderdale,95154,186573 -E08000034,Kirklees,185181,205944 -E08000035,Leeds,378060,246293 -E08000036,Wakefield,159785,199323 -E08000037,Gateshead,93642,151480 -E09000001,City of London,5133,740433 -E09000002,Barking and Dagenham,76891,353512 -E09000003,Barnet,156752,594093 -E09000004,Bexley,102241,410346 -E09000005,Brent,130863,568171 -E09000006,Bromley,142895,535306 -E09000007,Camden,94816,783812 -E09000008,Croydon,163059,402126 -E09000009,Ealing,145231,575503 -E09000010,Enfield,125214,471381 -E09000011,Greenwich,119526,474935 -E09000012,Hackney,114015,614552 -E09000013,Hammersmith and Fulham,83515,713773 -E09000014,Haringey,106436,626807 -E09000015,Harrow,92681,530409 -E09000016,Havering,108202,452231 -E09000017,Hillingdon,116295,477979 -E09000018,Hounslow,113871,519639 -E09000019,Islington,98568,699726 -E09000020,Kensington and Chelsea,70165,1178497 -E09000021,Kingston upon Thames,70208,573027 -E09000022,Lambeth,138311,538500 -E09000023,Lewisham,127327,493356 -E09000024,Merton,83414,601814 -E09000025,Newham,122280,405619 -E09000026,Redbridge,112886,495269 -E09000027,Richmond upon 
Thames,84981,777164 -E09000028,Southwark,134900,589636 -E09000029,Sutton,86112,453058 -E09000030,Tower Hamlets,123601,463527 -E09000031,Waltham Forest,109286,525738 -E09000032,Wandsworth,141843,689285 -E09000033,Westminster,100112,880389 -N09000001,Antrim and Newtownabbey,83744,197918 -N09000002,"Armagh City, Banbridge and Craigavon",87066,179907 -N09000003,Belfast,87441,178459 -N09000004,Causeway Coast and Glens,85707,213957 -N09000005,Derry City and Strabane,86420,177589 -N09000006,Fermanagh and Omagh,87392,194970 -N09000007,Lisburn and Castlereagh,89622,231628 -N09000008,Mid and East Antrim,84384,173261 -N09000009,Mid Ulster,85900,189185 -N09000010,"Newry, Mourne and Down",88089,218595 -S12000005,Clackmannanshire,85677,171785 -S12000006,Dumfries and Galloway,85940,163620 -S12000008,East Ayrshire,82669,131065 -S12000010,East Lothian,82119,280390 -S12000011,East Renfrewshire,84440,297395 -S12000013,Na h-Eileanan Siar,86183,139148 -S12000014,Falkirk,86048,171236 -S12000017,Highland,90706,216711 -S12000018,Inverclyde,83811,113267 -S12000019,Midlothian,83766,286803 -S12000020,Moray,84248,197451 -S12000021,North Ayrshire,84155,134830 -S12000023,Orkney Islands,83272,229610 -S12000026,Scottish Borders,82091,182102 -S12000027,Shetland Islands,83113,201503 -S12000028,South Ayrshire,84785,173377 -S12000029,South Lanarkshire,95010,186880 -S12000030,Stirling,84552,228054 -S12000033,Aberdeen City,86478,133119 -S12000034,Aberdeenshire,90347,202362 -S12000035,Argyll and Bute,85250,186309 -S12000036,City of Edinburgh,112532,293243 -S12000038,Renfrewshire,83979,160277 -S12000039,West Dunbartonshire,84070,131097 -S12000040,West Lothian,85547,219317 -S12000041,Angus,84966,174680 -S12000042,Dundee City,95901,141246 -S12000045,East Dunbartonshire,84536,262223 -S12000047,Fife,100135,177750 -S12000048,Perth and Kinross,84587,228534 -S12000049,Glasgow City,131728,189093 -S12000050,North Lanarkshire,92029,158859 -W06000001,Isle of Anglesey,39750,242141 -W06000002,Gwynedd,64757,196260 
-W06000003,Conwy,64876,211833 -W06000004,Denbighshire,51535,195430 -W06000005,Flintshire,81654,213990 -W06000006,Wrexham,70846,206799 -W06000008,Ceredigion,39495,233722 -W06000009,Pembrokeshire,68590,213226 -W06000010,Carmarthenshire,98615,196607 -W06000011,Swansea,125018,208872 -W06000012,Neath Port Talbot,76913,160856 -W06000013,Bridgend,75306,208808 -W06000014,Vale of Glamorgan,72758,299757 -W06000015,Cardiff,162839,271273 -W06000016,Rhondda Cynon Taf,117872,162675 -W06000018,Caerphilly,93171,196048 -W06000019,Blaenau Gwent,38230,142090 -W06000020,Torfaen,50955,189702 -W06000021,Monmouthshire,52772,335746 -W06000022,Newport,80610,226573 -W06000023,Powys,76021,229762 -W06000024,Merthyr Tydfil,32578,143596 diff --git a/policyengine_uk_data/targets/sources/la_land.py b/policyengine_uk_data/targets/sources/la_land.py deleted file mode 100644 index 9c0c1ebce..000000000 --- a/policyengine_uk_data/targets/sources/la_land.py +++ /dev/null @@ -1,133 +0,0 @@ -"""LA-level main residence value targets (derived proxy). - -This target is a **derived proxy**, not a directly observed LA total. -Per-LA target is constructed by multiplying three observed inputs: - - target_la = avg_house_price_la × ownership_share_la × n_households_la - -Same multiplicative shape as the existing private-rent target -(median × pct × n_households). - -Lineage caveat (flagged in PR review by @MaxGhenis): -- Matrix column ``main_residence_value`` in policyengine-uk is a - **stock-wealth** quantity, imputed from the Wealth and Assets Survey - (WAS) and uprated regionally via property-wealth intensity ratios. -- The target value uses HM Land Registry UK HPI "Average Price" — a - **transaction-weighted geography-period price index**, not an - observed stock total of owner-occupied main residences. 
-- The product avg_price × ownership × n_households is therefore a - defensible identity ("if every owner-occupied dwelling were valued - at the LA HPI average, the total would be £X") but the two sides - of the calibration constraint reference different price concepts. - -The target is included as a validation/proxy diagnostic and is excluded -from the default local-authority training targets. It should not be -treated as a direct target unless a cleaner LA stock-value source is -added. - -Data sources: -- Average house price by LA: HM Land Registry UK HPI (Dec 2025). - For LAs whose ONS code changed between releases, the CSV matches on - LA name. For Northern Ireland LGDs missing from a specific month, - the NI country-level HPI price is used as a fallback. -- Ownership share by LA: English Housing Survey, via load_tenure_data - (owned_outright_pct + owned_mortgage_pct). England-only. -- Households by LA: Census 2021, via load_household_counts. -""" - -import pandas as pd - -from policyengine_uk_data.targets.schema import ( - GeographicLevel, - Target, - Unit, -) -from policyengine_uk_data.targets.sources._common import STORAGE - - -_REF_URL_HMLR = ( - "https://www.gov.uk/government/statistical-data-sets/" - "uk-house-price-index-data-downloads-december-2025" -) - - -def load_la_avg_prices() -> pd.DataFrame: - """Load HMLR average house price by LA. - - Returns DataFrame with columns: code, name, avg_house_price. - """ - csv_path = STORAGE / "la_land_values.csv" - if not csv_path.exists(): - return pd.DataFrame(columns=["code", "name", "avg_house_price"]) - df = pd.read_csv(csv_path) - return df[["code", "name", "avg_house_price"]] - - -def _compute_la_targets() -> dict[str, float]: - """Per-LA main residence value target. - - target_la = avg_house_price_la × ownership_share_la × n_households_la - - Returns a dict ``code -> £``. 
LAs missing any input drop out and - are handled in loss.py by the national-share fallback (same - pattern as the tenure and rent targets). - """ - from policyengine_uk_data.targets.sources.local_la_extras import ( - load_household_counts, - load_tenure_data, - ) - - prices = load_la_avg_prices() - tenure = load_tenure_data() - households = load_household_counts() - - if prices.empty or tenure.empty or households.empty: - return {} - - merged = prices.merge(tenure, left_on="code", right_on="la_code", how="left").merge( - households, on="la_code", how="left" - ) - - ownership_share = ( - merged["owned_outright_pct"].fillna(0) + merged["owned_mortgage_pct"].fillna(0) - ) / 100 - targets = merged["avg_house_price"] * ownership_share * merged["households"] - - return { - code: float(value) - for code, value in zip(merged["code"], targets) - if pd.notna(value) and value > 0 - } - - -def get_targets() -> list[Target]: - prices = load_la_avg_prices() - if prices.empty: - return [] - - la_targets = _compute_la_targets() - - targets: list[Target] = [] - for _, row in prices.iterrows(): - code = row["code"] - target_value = la_targets.get(code) - if target_value is None: - continue - # HMLR Dec 2025 snapshot; same value across calibration years - # until a year-varying HMLR series is wired in. 
- values = {year: target_value for year in (2024, 2025, 2026)} - targets.append( - Target( - name=f"housing/main_residence_value/{code}", - variable="main_residence_value", - source="hmlr", - unit=Unit.GBP, - geographic_level=GeographicLevel.LOCAL_AUTHORITY, - geo_code=code, - geo_name=row["name"], - values=values, - reference_url=_REF_URL_HMLR, - ) - ) - - return targets diff --git a/policyengine_uk_data/tests/test_la_land_value_targets.py b/policyengine_uk_data/tests/test_la_land_value_targets.py deleted file mode 100644 index 3942cef4d..000000000 --- a/policyengine_uk_data/tests/test_la_land_value_targets.py +++ /dev/null @@ -1,216 +0,0 @@ -"""Tests for LA-level main residence value calibration targets. - -Targets are built from directly observed LA-level housing indicators -(HMLR avg house price × English Housing Survey ownership share × Census -household count), mirroring the existing private-rent target. No -national-total apportionment. -""" - -import pandas as pd - -from policyengine_uk_data.targets.schema import GeographicLevel -from policyengine_uk_data.targets.sources._common import STORAGE -from policyengine_uk_data.targets.sources.la_land import ( - _compute_la_targets, - get_targets, - load_la_avg_prices, -) -from policyengine_uk_data.targets.sources.local_la_extras import ( - load_household_counts, - load_tenure_data, -) - - -LA_PRICES = load_la_avg_prices() -LA_TARGETS = _compute_la_targets() - - -# ── CSV data quality ───────────────────────────────────────────────── - - -def test_csv_row_count_matches_la_code_list(): - """la_land_values.csv should have the same 360 LAs as local_authorities_2021.csv.""" - la_codes = pd.read_csv(STORAGE / "local_authorities_2021.csv") - raw = pd.read_csv(STORAGE / "la_land_values.csv") - assert len(raw) == len(la_codes) - assert set(raw["code"]) == set(la_codes["code"]) - - -def test_csv_columns_match_schema(): - """CSV should have exactly the columns code, name, households, avg_house_price.""" - raw = 
pd.read_csv(STORAGE / "la_land_values.csv") - assert list(raw.columns) == ["code", "name", "households", "avg_house_price"] - - -def test_csv_no_missing_values(): - """No LA should have NaN in any column.""" - raw = pd.read_csv(STORAGE / "la_land_values.csv") - assert not raw.isna().any().any() - - -def test_csv_covers_all_four_countries(): - """All four UK countries (E/W/S/NI) should appear.""" - prefixes = LA_PRICES["code"].str[0].unique() - assert set(prefixes) == {"E", "W", "S", "N"} - - -def test_house_prices_within_plausible_range(): - """Avg house prices should be between £50k and £2m per LA.""" - for _, row in LA_PRICES.iterrows(): - assert 50_000 <= row["avg_house_price"] <= 2_000_000, ( - f"{row['name']}: avg_house_price £{row['avg_house_price']:,} " - "outside plausible range" - ) - - -def test_csv_households_within_plausible_range(): - """Smallest UK billing authority (Isles of Scilly) has ~1,100 - households; largest (Birmingham) has ~450,000. The CSV `households` - column is retained as a regression fixture for the IoS fallback leak - even though the calibration target uses Census counts. 
- """ - raw = pd.read_csv(STORAGE / "la_land_values.csv") - out_of_range = raw[~raw["households"].between(500, 500_000)] - assert out_of_range.empty, ( - "CSV households out of plausible [500, 500_000] range: " - f"{out_of_range[['code', 'name', 'households']].to_dict('records')}" - ) - - -def test_isles_of_scilly_households_are_thousands_not_millions(): - """Explicit regression for the IoS fallback leak (was 2,492,115).""" - raw = pd.read_csv(STORAGE / "la_land_values.csv") - ios = raw[raw["code"] == "E06000053"] - assert len(ios) == 1 - hh = int(ios["households"].iloc[0]) - assert 500 <= hh <= 5_000, ( - f"Isles of Scilly households = {hh:,}; ONS mid-2023 estimate is ~1,115" - ) - - -# ── Target value constraints ───────────────────────────────────────── - - -def test_targets_match_observed_product(): - """Every target equals avg_price × ownership_share × n_households exactly. - - No national-total apportionment, no rescaling: just the directly - observed product, identical in shape to the rent target. - """ - prices = LA_PRICES.set_index("code")["avg_house_price"] - tenure = load_tenure_data().set_index("la_code") - households = load_household_counts().set_index("la_code")["households"] - - for code, target in LA_TARGETS.items(): - if code not in tenure.index or code not in households.index: - continue - ownership = ( - tenure.loc[code, "owned_outright_pct"] - + tenure.loc[code, "owned_mortgage_pct"] - ) / 100 - expected = prices.loc[code] * ownership * households.loc[code] - assert abs(target - expected) < 1e-3, ( - f"{code}: target {target:,.2f} != expected {expected:,.2f}" - ) - - -def test_all_targets_positive(): - """Every per-LA target should be positive.""" - assert all(value > 0 for value in LA_TARGETS.values()) - - -def test_explicit_targets_cover_english_las(): - """Direct-formula targets are produced for LAs with EHS tenure data - (England). 
Wales, Scotland and Northern Ireland LAs are handled by - the national-share fallback in loss.py — same as the existing - tenure target, by construction.""" - prefixes = {code[0] for code in LA_TARGETS} - assert prefixes == {"E"}, ( - f"Expected English-only targets from EHS data, got {sorted(prefixes)}" - ) - - -def test_kensington_and_chelsea_above_blackpool(): - """K&C aggregate main-residence-value target should exceed Blackpool's.""" - name_to_code = dict(zip(LA_PRICES["name"], LA_PRICES["code"])) - kc = LA_TARGETS[name_to_code["Kensington and Chelsea"]] - bp = LA_TARGETS[name_to_code["Blackpool"]] - assert kc > bp, ( - f"K&C target (£{kc / 1e9:.1f}bn) should exceed Blackpool (£{bp / 1e9:.1f}bn)" - ) - - -def test_london_total_exceeds_north_east(): - """Sum of London LA targets should exceed sum of North-East LA targets.""" - london_codes = [c for c in LA_TARGETS if c.startswith("E09")] - ne_prefixes = { - "E06000001", - "E06000002", - "E06000003", - "E06000004", - "E06000005", - "E06000047", - "E08000021", - "E08000022", - "E08000023", - "E08000024", - "E08000037", - "E06000057", - } - ne_codes = [c for c in LA_TARGETS if c in ne_prefixes] - london_total = sum(LA_TARGETS[c] for c in london_codes) - ne_total = sum(LA_TARGETS[c] for c in ne_codes) - assert london_total > ne_total * 3, ( - f"London total (£{london_total / 1e9:.0f}bn) should exceed " - f"NE total (£{ne_total / 1e9:.0f}bn) by at least 3x" - ) - - -# ── Target registry integration ────────────────────────────────────── - - -def test_get_targets_returns_targets_for_covered_las(): - """get_targets() returns one Target per LA with all inputs available.""" - targets = get_targets() - assert len(targets) == len(LA_TARGETS) - assert {t.geo_code for t in targets} == set(LA_TARGETS) - - -def test_target_names_follow_code_pattern(): - """Target names should follow the housing/main_residence_value/{code} pattern.""" - for t in get_targets(): - assert t.name.startswith("housing/main_residence_value/") - 
assert t.name.removeprefix("housing/main_residence_value/") == t.geo_code - - -def test_targets_declare_local_authority_geographic_level(): - """All LA targets should be tagged with GeographicLevel.LOCAL_AUTHORITY.""" - for t in get_targets(): - assert t.geographic_level == GeographicLevel.LOCAL_AUTHORITY - - -def test_targets_declare_hmlr_source(): - """LA property-value targets are sourced from HMLR UK HPI.""" - for t in get_targets(): - assert t.source == "hmlr" - - -def test_targets_have_calibration_year_values(): - """LA targets should carry values for the supported calibration years.""" - for t in get_targets(): - assert {2024, 2025, 2026} <= set(t.values) - - -def test_target_registry_includes_la_targets(): - """LA property-value targets should appear in the global registry.""" - from policyengine_uk_data.targets import get_all_targets - - targets = get_all_targets( - year=2024, geographic_level=GeographicLevel.LOCAL_AUTHORITY - ) - la_property = [ - t for t in targets if t.name.startswith("housing/main_residence_value/") - ] - assert len(la_property) == len(LA_TARGETS), ( - f"Expected {len(LA_TARGETS)} LA property-value targets, got {len(la_property)}" - ) diff --git a/policyengine_uk_data/tests/test_la_loss_land_value.py b/policyengine_uk_data/tests/test_la_loss_land_value.py deleted file mode 100644 index 2f94401de..000000000 --- a/policyengine_uk_data/tests/test_la_loss_land_value.py +++ /dev/null @@ -1,263 +0,0 @@ -"""Tests for the LA-level main-residence-value column wired into the -local-authority calibration loss matrix. - -Two layers: - -1. Light-weight checks against the per-LA target dict from la_land.py — - these run without a Microsimulation and exercise the ordering / - shape properties the loss-matrix code relies on. -2. Full ``create_local_authority_target_matrix`` build, gated on the - enhanced FRS fixture so CI environments without the dataset skip - gracefully. 
-""" - -import numpy as np -import pandas as pd - -from policyengine_uk_data.storage import STORAGE_FOLDER -from policyengine_uk_data.targets.sources.la_land import ( - _compute_la_targets, - load_la_avg_prices, -) -from policyengine_uk_data.targets.sources.local_la_extras import ( - load_household_counts, - load_tenure_data, -) - - -LA_CODES = pd.read_csv(STORAGE_FOLDER / "local_authorities_2021.csv") -LA_TARGETS = _compute_la_targets() - - -# ── Layer 1: per-LA targets line up with the LA code ordering ──────── - - -def test_explicit_targets_cover_english_las(): - """Direct-formula targets are produced for LAs with EHS tenure data - (England). Other UK countries fall through to the national-share - fallback in loss.py — same as the existing tenure target.""" - prefixes = {code[0] for code in LA_TARGETS} - assert prefixes == {"E"} - - -def test_target_vector_in_la_codes_order_is_finite_positive_where_present(): - """Reindexing by la_codes order yields a clean float vector for - LAs with a target; LAs missing inputs become NaN (later filled by - the national-share fallback inside loss.py).""" - vec = LA_CODES["code"].map(LA_TARGETS).values - finite = vec[~np.isnan(vec.astype(float))] - assert len(vec) == 360 - assert (finite > 0).all() - - -def test_targets_match_observed_product_inline(): - """Per-LA target equals avg_price × ownership_share × n_households — - the same shape as private rent's ``median_rent × renter_pct × n_hh``. 
- """ - prices = load_la_avg_prices().set_index("code")["avg_house_price"] - tenure = load_tenure_data().set_index("la_code") - households = load_household_counts().set_index("la_code")["households"] - - for code, target in LA_TARGETS.items(): - if code not in tenure.index or code not in households.index: - continue - ownership = ( - tenure.loc[code, "owned_outright_pct"] - + tenure.loc[code, "owned_mortgage_pct"] - ) / 100 - expected = prices.loc[code] * ownership * households.loc[code] - assert abs(target - expected) < 1e-3 - - -# ── Layer 2: full LA loss matrix build ─────────────────────────────── - - -def test_la_loss_matrix_includes_main_residence_value(enhanced_frs): - """The LA target matrix must expose housing/main_residence_value in - both matrix (per-household) and y (per-LA) so the calibrator can - train on it.""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - matrix, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - assert "housing/main_residence_value" in matrix.columns - assert "housing/main_residence_value" in y.columns - - -def test_main_residence_value_is_validation_target_by_default(): - """The derived proxy target is reported for validation, not trained by default.""" - from policyengine_uk_data.datasets.local_areas.local_authorities.calibrate import ( - VALIDATION_TARGETS, - ) - - assert "housing/main_residence_value" in VALIDATION_TARGETS - - -def test_la_loss_y_vector_length_360(enhanced_frs): - """y has one entry per LA and matches local_authorities_2021.csv ordering - by length.""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - assert len(y) == 360 - assert len(y["housing/main_residence_value"]) == 360 - - -def 
test_la_loss_y_matches_observed_product_for_covered_las(enhanced_frs): - """For LAs with all inputs present, y equals avg_price × ownership × n_households. - - LAs missing inputs use the national-share fallback (covered in - test_la_loss_y_all_positive).""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - expected_by_code = LA_TARGETS - for i, code in enumerate(LA_CODES["code"].values): - if code not in expected_by_code: - continue # fallback path - actual = y["housing/main_residence_value"].iloc[i] - expected = expected_by_code[code] - assert abs(actual - expected) < 1e-3, ( - f"{code}: y {actual:,.2f} != expected {expected:,.2f}" - ) - - -def test_la_loss_y_all_positive(enhanced_frs): - """No LA should have a non-positive main-residence-value target.""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - assert (y["housing/main_residence_value"] > 0).all() - - -def test_la_loss_matrix_column_matches_main_residence_value(enhanced_frs): - """matrix['housing/main_residence_value'] should equal the per-household - main_residence_value pulled from policyengine-uk for the calibration year.""" - from policyengine_uk import Microsimulation - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - matrix, _, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - sim = Microsimulation(dataset=enhanced_frs) - sim.default_calculation_period = enhanced_frs.time_period - expected = sim.calculate("main_residence_value").values - - np.testing.assert_array_equal( - 
matrix["housing/main_residence_value"].values, expected - ) - - -# ── Layer 2b: calibration well-formedness ───────────────────────────── - - -def test_la_loss_y_has_no_nan(enhanced_frs): - """Every LA must have a numeric target. NaN entries would propagate - through the optimiser and fail calibration silently.""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - assert not y["housing/main_residence_value"].isna().any() - - -def test_la_loss_fallback_applied_to_non_english_las(enhanced_frs): - """Wales / Scotland / NI LAs use the national-share fallback because - EHS only covers England. Their y entry must still be positive - (so the optimiser has a target to fit) and must NOT equal the - direct-formula value (which is undefined when ownership share - is missing).""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - fallback_codes = [c for c in LA_CODES["code"] if not c.startswith("E")] - fallback_indices = [ - i for i, c in enumerate(LA_CODES["code"].values) if c in fallback_codes - ] - fallback_values = y["housing/main_residence_value"].iloc[fallback_indices] - - assert (fallback_values > 0).all() - assert fallback_values.notna().all() - - -def test_la_loss_matrix_column_carries_calibration_signal(enhanced_frs): - """matrix['housing/main_residence_value'] must vary across households — - a constant column gives the optimiser no signal to differentiate - LAs and the new target would be inert.""" - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - matrix, _, _ = create_local_authority_target_matrix( - enhanced_frs, 
time_period=enhanced_frs.time_period - ) - - column = matrix["housing/main_residence_value"].values - assert column.var() > 0 - assert (column > 0).any(), "no households with positive main_residence_value" - - -def test_la_loss_english_target_total_within_reach_of_initial_weights(enhanced_frs): - """Sum of English LA targets should be in the same order of magnitude - as the implied initial English main-residence-value — so the - optimiser has a chance of hitting them via reweighting rather than - requiring weights to inflate by 100x.""" - from policyengine_uk import Microsimulation - from policyengine_uk_data.datasets.local_areas.local_authorities.loss import ( - create_local_authority_target_matrix, - ) - - _, y, _ = create_local_authority_target_matrix( - enhanced_frs, time_period=enhanced_frs.time_period - ) - - sim = Microsimulation(dataset=enhanced_frs) - original_weights = sim.calculate("household_weight", 2025).values - main_res = sim.calculate("main_residence_value", enhanced_frs.time_period).values - country = sim.calculate("country", enhanced_frs.time_period).values - - england_mask = country == "ENGLAND" - england_initial = (original_weights[england_mask] * main_res[england_mask]).sum() - - english_indices = [ - i for i, c in enumerate(LA_CODES["code"].values) if c.startswith("E") - ] - english_targets = y["housing/main_residence_value"].iloc[english_indices].sum() - - ratio = english_targets / england_initial - assert 0.5 < ratio < 3.0, ( - f"English LA target sum (£{english_targets / 1e9:.0f}bn) / " - f"initial English main-residence-value (£{england_initial / 1e9:.0f}bn) " - f"= {ratio:.2f}; calibration target may be hard to reach" - )