From 8640b02943b611cbd9bd24fd2178e408ea0fdb08 Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Thu, 7 May 2026 16:34:48 +0200 Subject: [PATCH 1/2] Update UK release bundle to data 1.53.0 --- ...update-uk-release-bundle-1-53-0.changed.md | 1 + pyproject.toml | 4 +- .../data/release_manifests/uk.json | 22 +-- .../release_manifests/uk.trace.tro.jsonld | 135 ++++++++++++++++++ .../release_manifests/us.trace.tro.jsonld | 117 +++++++++++++++ tests/test_models.py | 6 +- tests/test_release_manifests.py | 32 +++-- tests/test_uk_regions.py | 2 +- uv.lock | 10 +- 9 files changed, 296 insertions(+), 33 deletions(-) create mode 100644 changelog.d/update-uk-release-bundle-1-53-0.changed.md create mode 100644 src/policyengine/data/release_manifests/uk.trace.tro.jsonld create mode 100644 src/policyengine/data/release_manifests/us.trace.tro.jsonld diff --git a/changelog.d/update-uk-release-bundle-1-53-0.changed.md b/changelog.d/update-uk-release-bundle-1-53-0.changed.md new file mode 100644 index 00000000..f757682b --- /dev/null +++ b/changelog.d/update-uk-release-bundle-1-53-0.changed.md @@ -0,0 +1 @@ +Updated the bundled UK release contract to policyengine-uk 2.88.6 and policyengine-uk-data 1.53.0. diff --git a/pyproject.toml b/pyproject.toml index 57ce102e..b1168e4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ graph = [ ] uk = [ "policyengine_core>=3.25.0", - "policyengine-uk==2.88.0", + "policyengine-uk==2.88.6", ] us = [ "policyengine_core>=3.25.0", @@ -60,7 +60,7 @@ dev = [ "pytest-asyncio>=0.26.0", "ruff>=0.9.0", "policyengine_core>=3.25.0", - "policyengine-uk==2.88.0", + "policyengine-uk==2.88.6", "policyengine-us==1.687.0", "towncrier>=24.8.0", "mypy>=1.11.0", diff --git a/src/policyengine/data/release_manifests/uk.json b/src/policyengine/data/release_manifests/uk.json index 10c0eec3..3fa04445 100644 --- a/src/policyengine/data/release_manifests/uk.json +++ b/src/policyengine/data/release_manifests/uk.json @@ -5,29 +5,31 @@ "policyengine_version": "4.3.1", "model_package": { "name": "policyengine-uk", - "version": "2.88.0", - "sha256": "46a3ba443b43ec810c5efaccd4645edb63c8dc90ef5acf9b0cdf5ace86b9334d", - "wheel_url": "https://files.pythonhosted.org/packages/23/7e/8a2a42eac1da63730a865964aa17e7fd4420ce4db4c80001c1b5ca6011e8/policyengine_uk-2.88.0-py3-none-any.whl" + "version": "2.88.6", + "sha256": "d89c86c4835629115d9c20005dc818462798ac1940616f788db37e2b87472e33", + "wheel_url": "https://files.pythonhosted.org/packages/b8/38/3cf1f123e0ea25f16e3e207d46ba7ca86e6c251e3171dd7d22b18723c9a5/policyengine_uk-2.88.6-py3-none-any.whl" }, "data_package": { "name": "policyengine-uk-data", - "version": "1.40.4", + "version": "1.53.0", "repo_id": "policyengine/policyengine-uk-data-private" }, "certified_data_artifact": { "data_package": { "name": "policyengine-uk-data", - "version": "1.40.4" + "version": "1.53.0" }, - "build_id": "policyengine-uk-data-1.40.4", + "build_id": "policyengine-uk-data-1.53.0", "dataset": "enhanced_frs_2023_24", - "uri": "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + "uri": "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.53.0", + "sha256": "032c343bf583235c0dd27143d9a0bb4d9ab1d39849424d9a9e3d68283611cc26" }, "certification": { "compatibility_basis": "exact_build_model_version", - "data_build_id": "policyengine-uk-data-1.40.4", - "built_with_model_version": "2.88.0", - "certified_for_model_version": "2.88.0", + "data_build_id": "policyengine-uk-data-1.53.0", + "built_with_model_version": "2.88.6", + "certified_for_model_version": "2.88.6", + "data_build_fingerprint": "sha256:535d2fd64f9e2b2aa9991e5a5d65be25bdd126f18858682533789f5c7e467782", "certified_by": "policyengine.py bundled manifest" }, "default_dataset": "enhanced_frs_2023_24", diff --git a/src/policyengine/data/release_manifests/uk.trace.tro.jsonld b/src/policyengine/data/release_manifests/uk.trace.tro.jsonld new file mode 100644 index 00000000..fc059df9 --- /dev/null +++ b/src/policyengine/data/release_manifests/uk.trace.tro.jsonld @@ -0,0 +1,135 @@ +{ + "@context": [ + { + "pe": "https://policyengine.org/trace/0.1#", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "schema": "https://schema.org/", + "trov": "https://w3id.org/trace/trov/0.1#" + } + ], + "@graph": [ + { + "@id": "tro", + "@type": "trov:TransparentResearchObject", + "schema:creator": { + "@type": "schema:Organization", + "schema:name": "PolicyEngine", + "schema:url": "https://policyengine.org" + }, + "schema:dateCreated": "2026-04-19T22:34:16.746038Z", + "schema:description": "TRACE TRO for certified runtime bundle uk-4.3.1 covering the bundle manifest, the certified dataset artifact, the country model wheel, and the country data release manifest when it is available.", + "schema:name": "policyengine uk certified bundle TRO", + "trov:createdWith": { + "@type": "schema:SoftwareApplication", + "schema:name": "policyengine", + "schema:softwareVersion": "4.3.1" + }, + "trov:hasArrangement": [ + { + "@id": "arrangement/1", + "@type": "trov:ArtifactArrangement", + "rdfs:comment": "Certified arrangement for bundle uk-4.3.1.", + "trov:hasArtifactLocation": [ + { + "@id": "arrangement/1/location/bundle_manifest", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/bundle_manifest" + }, + "trov:hasLocation": "data/release_manifests/uk.json" + }, + { + "@id": "arrangement/1/location/data_release_manifest", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/data_release_manifest" + }, + "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-uk-data-private/resolve/1.53.0/release_manifest.json" + }, + { + "@id": "arrangement/1/location/dataset", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/dataset" + }, + "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-uk-data-private/resolve/1.53.0/enhanced_frs_2023_24.h5" + }, + { + "@id": "arrangement/1/location/model_wheel", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/model_wheel" + }, + "trov:hasLocation": "https://files.pythonhosted.org/packages/b8/38/3cf1f123e0ea25f16e3e207d46ba7ca86e6c251e3171dd7d22b18723c9a5/policyengine_uk-2.88.6-py3-none-any.whl" + } + ] + } + ], + "trov:hasComposition": { + "@id": "composition/1", + "@type": "trov:ArtifactComposition", + "trov:hasArtifact": [ + { + "@id": "composition/1/artifact/bundle_manifest", + "@type": "trov:ResearchArtifact", + "schema:name": "policyengine.py bundle manifest for uk", + "trov:mimeType": "application/json", + "trov:sha256": "32bca42530afc0776406891b644294c1fe818452c132646eeba4fe1568460f28" + }, + { + "@id": "composition/1/artifact/data_release_manifest", + "@type": "trov:ResearchArtifact", + "schema:name": "policyengine-uk-data release manifest 1.53.0", + "trov:mimeType": "application/json", + "trov:sha256": "e3af62e134684d26bfca49fd2d7c9819d666b2a959ea78dab8789997b803d36d" + }, + { + "@id": "composition/1/artifact/dataset", + "@type": "trov:ResearchArtifact", + "schema:name": "enhanced_frs_2023_24", + "trov:mimeType": "application/x-hdf5", + "trov:sha256": "032c343bf583235c0dd27143d9a0bb4d9ab1d39849424d9a9e3d68283611cc26" + }, + { + "@id": "composition/1/artifact/model_wheel", + "@type": "trov:ResearchArtifact", + "schema:name": "policyengine-uk==2.88.6 wheel", + "trov:mimeType": "application/zip", + "trov:sha256": "d89c86c4835629115d9c20005dc818462798ac1940616f788db37e2b87472e33" + } + ], + "trov:hasFingerprint": { + "@id": "composition/1/fingerprint", + "@type": "trov:CompositionFingerprint", + "trov:sha256": "e38af547b509c2844c0263140e798ba871db77b85bf30e47a2fe2ee2a6931660" + } + }, + "trov:hasPerformance": { + "@id": "trp/1", + "@type": "trov:TransparentResearchPerformance", + "pe:builtWithModelVersion": "2.88.6", + "pe:certifiedBy": "policyengine.py bundled manifest", + "pe:certifiedForModelVersion": "2.88.6", + "pe:compatibilityBasis": "exact_build_model_version", + "pe:dataBuildFingerprint": "sha256:535d2fd64f9e2b2aa9991e5a5d65be25bdd126f18858682533789f5c7e467782", + "pe:dataBuildId": "policyengine-uk-data-1.53.0", + "pe:emittedIn": "local", + "rdfs:comment": "Certification of build policyengine-uk-data-1.53.0 for policyengine-uk 2.88.6.", + "trov:accessedArrangement": { + "@id": "arrangement/1" + }, + "trov:startedAtTime": "2026-04-19T22:34:16.746038Z", + "trov:wasConductedBy": { + "@id": "trs" + } + }, + "trov:wasAssembledBy": { + "@id": "trs", + "@type": "trov:TransparentResearchSystem", + "rdfs:comment": "PolicyEngine certification workflow that pins a country model version, a country data release, and a specific dataset artifact.", + "schema:name": "PolicyEngine release pipeline" + } + } + ] +} diff --git a/src/policyengine/data/release_manifests/us.trace.tro.jsonld b/src/policyengine/data/release_manifests/us.trace.tro.jsonld new file mode 100644 index 00000000..8f5a1f20 --- /dev/null +++ b/src/policyengine/data/release_manifests/us.trace.tro.jsonld @@ -0,0 +1,117 @@ +{ + "@context": [ + { + "pe": "https://policyengine.org/trace/0.1#", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "schema": "https://schema.org/", + "trov": "https://w3id.org/trace/trov/0.1#" + } + ], + "@graph": [ + { + "@id": "tro", + "@type": "trov:TransparentResearchObject", + "schema:creator": { + "@type": "schema:Organization", + "schema:name": "PolicyEngine", + "schema:url": "https://policyengine.org" + }, + "schema:description": "TRACE TRO for certified runtime bundle us-4.3.1 covering the bundle manifest, the certified dataset artifact, the country model wheel, and the country data release manifest when it is available.", + "schema:name": "policyengine us certified bundle TRO", + "trov:createdWith": { + "@type": "schema:SoftwareApplication", + "schema:name": "policyengine", + "schema:softwareVersion": "4.3.1" + }, + "trov:hasArrangement": [ + { + "@id": "arrangement/1", + "@type": "trov:ArtifactArrangement", + "rdfs:comment": "Certified arrangement for bundle us-4.3.1.", + "trov:hasArtifactLocation": [ + { + "@id": "arrangement/1/location/bundle_manifest", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/bundle_manifest" + }, + "trov:hasLocation": "data/release_manifests/us.json" + }, + { + "@id": "arrangement/1/location/dataset", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/dataset" + }, + "trov:hasLocation": "https://huggingface.co/policyengine/policyengine-us-data/resolve/1.78.2/enhanced_cps_2024.h5" + }, + { + "@id": "arrangement/1/location/model_wheel", + "@type": "trov:ArtifactLocation", + "trov:hasArtifact": { + "@id": "composition/1/artifact/model_wheel" + }, + "trov:hasLocation": "https://files.pythonhosted.org/packages/c3/36/5633f5a3996c915494154ec3852011b1a239ea06d9f08cb6287ab709618c/policyengine_us-1.687.0-py3-none-any.whl" + } + ] + } + ], + "trov:hasComposition": { + "@id": "composition/1", + "@type": "trov:ArtifactComposition", + "trov:hasArtifact": [ + { + "@id": "composition/1/artifact/bundle_manifest", + "@type": "trov:ResearchArtifact", + "schema:name": "policyengine.py bundle manifest for us", + "trov:mimeType": "application/json", + "trov:sha256": "a763519f03ef1acceea3121461c533f8de5b39d8da9c3cc085f370c5daac5cc9" + }, + { + "@id": "composition/1/artifact/dataset", + "@type": "trov:ResearchArtifact", + "schema:name": "enhanced_cps_2024", + "trov:sha256": "4e92b340c3ea3e200ed5d55edf752ee1a13baf787442956fb67d25242fed13b5" + }, + { + "@id": "composition/1/artifact/model_wheel", + "@type": "trov:ResearchArtifact", + "schema:name": "policyengine-us==1.687.0 wheel", + "trov:mimeType": "application/zip", + "trov:sha256": "cac7da3aa9ba4bf57009eee75d798217bbef7e1c5ca17646d472fad715ab634f" + } + ], + "trov:hasFingerprint": { + "@id": "composition/1/fingerprint", + "@type": "trov:CompositionFingerprint", + "trov:sha256": "21e32eb3586a13c7e7d2070b0b95452f2d4a630e7e32d5a7f5bcad49dfa86dd0" + } + }, + "trov:hasPerformance": { + "@id": "trp/1", + "@type": "trov:TransparentResearchPerformance", + "pe:builtWithModelVersion": "1.647.0", + "pe:certifiedBy": "policyengine.py bundled manifest", + "pe:certifiedForModelVersion": "1.687.0", + "pe:compatibilityBasis": "matching_data_build_fingerprint", + "pe:dataBuildId": "policyengine-us-data-1.78.2", + "pe:dataReleaseManifestStatus": "unavailable", + "pe:emittedIn": "local", + "rdfs:comment": "Certification of build policyengine-us-data-1.78.2 for policyengine-us 1.687.0.", + "trov:accessedArrangement": { + "@id": "arrangement/1" + }, + "trov:wasConductedBy": { + "@id": "trs" + } + }, + "trov:wasAssembledBy": { + "@id": "trs", + "@type": "trov:TransparentResearchSystem", + "rdfs:comment": "PolicyEngine certification workflow that pins a country model version, a country data release, and a specific dataset artifact.", + "schema:name": "PolicyEngine release pipeline" + } + } + ] +} diff --git a/tests/test_models.py b/tests/test_models.py index 3dd644c3..4e25555a 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -29,12 +29,12 @@ def test_has_release_manifest_metadata(self): assert uk_latest.release_manifest is not None assert uk_latest.release_manifest.country_id == "uk" assert uk_latest.model_package.name == "policyengine-uk" - assert uk_latest.model_package.version == "2.88.0" + assert uk_latest.model_package.version == "2.88.6" assert uk_latest.data_package.name == "policyengine-uk-data" - assert uk_latest.data_package.version == "1.40.4" + assert uk_latest.data_package.version == "1.53.0" assert ( uk_latest.default_dataset_uri - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.53.0" ) def test_has_hundreds_of_parameters(self): diff --git a/tests/test_release_manifests.py b/tests/test_release_manifests.py index 769efabe..652c32b1 100644 --- a/tests/test_release_manifests.py +++ b/tests/test_release_manifests.py @@ -80,21 +80,29 @@ def test__given_uk_manifest__then_has_pinned_model_and_data_packages(self): assert manifest.country_id == "uk" assert manifest.policyengine_version == POLICYENGINE_VERSION assert manifest.model_package.name == "policyengine-uk" - assert manifest.model_package.version == "2.88.0" + assert manifest.model_package.version == "2.88.6" assert manifest.data_package.name == "policyengine-uk-data" - assert manifest.data_package.version == "1.40.4" + assert manifest.data_package.version == "1.53.0" assert ( manifest.data_package.repo_id == "policyengine/policyengine-uk-data-private" ) assert manifest.certified_data_artifact is not None assert ( - manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.40.4" + manifest.certified_data_artifact.build_id == "policyengine-uk-data-1.53.0" ) assert manifest.certified_data_artifact.dataset == "enhanced_frs_2023_24" + assert ( + manifest.certified_data_artifact.sha256 + == "032c343bf583235c0dd27143d9a0bb4d9ab1d39849424d9a9e3d68283611cc26" + ) assert manifest.certification is not None - assert manifest.certification.data_build_id == "policyengine-uk-data-1.40.4" - assert manifest.certification.built_with_model_version == "2.88.0" - assert manifest.certification.certified_for_model_version == "2.88.0" + assert manifest.certification.data_build_id == "policyengine-uk-data-1.53.0" + assert manifest.certification.built_with_model_version == "2.88.6" + assert manifest.certification.certified_for_model_version == "2.88.6" + assert ( + manifest.certification.data_build_fingerprint + == "sha256:535d2fd64f9e2b2aa9991e5a5d65be25bdd126f18858682533789f5c7e467782" + ) def test__given_us_dataset_name__then_resolves_to_versioned_hf_url(self): resolved = resolve_dataset_reference("us", "enhanced_cps_2024") @@ -109,7 +117,7 @@ def test__given_uk_dataset_name__then_resolves_to_versioned_hf_url(self): assert ( resolved - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.53.0" ) def test__given_explicit_url__then_resolution_is_noop(self): @@ -420,8 +428,8 @@ def test__given_manifest_certification__then_release_bundle_exposes_it(self): assert bundle["bundle_id"] == f"uk-{POLICYENGINE_VERSION}" assert bundle["default_dataset"] == "enhanced_frs_2023_24" assert bundle["default_dataset_uri"] == manifest.default_dataset_uri - assert bundle["certified_data_build_id"] == "policyengine-uk-data-1.40.4" - assert bundle["data_build_model_version"] == "2.88.0" + assert bundle["certified_data_build_id"] == "policyengine-uk-data-1.53.0" + assert bundle["data_build_model_version"] == "2.88.6" assert bundle["compatibility_basis"] == "exact_build_model_version" assert bundle["certified_by"] == "policyengine.py bundled manifest" @@ -502,19 +510,19 @@ def test__given_uk_managed_dataset_name__then_resolves_within_bundle(self): else: assert dataset == ( "hf://policyengine/policyengine-uk-data-private/" - "enhanced_frs_2023_24.h5@1.40.4" + "enhanced_frs_2023_24.h5@1.53.0" ) assert ( microsim.policyengine_bundle["policyengine_version"] == POLICYENGINE_VERSION ) assert microsim.policyengine_bundle["runtime_dataset"] == "enhanced_frs_2023_24" assert microsim.policyengine_bundle["runtime_dataset_uri"] == ( - "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.53.0" ) dataset_source = microsim.policyengine_bundle["runtime_dataset_source"] assert ( dataset_source - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.53.0" or str(dataset_source).endswith( "policyengine_uk_data/storage/enhanced_frs_2023_24.h5" ) diff --git a/tests/test_uk_regions.py b/tests/test_uk_regions.py index 56f5a5fd..8cdd4c54 100644 --- a/tests/test_uk_regions.py +++ b/tests/test_uk_regions.py @@ -68,7 +68,7 @@ def test__given_uk_registry__then_has_national_region(self): assert national.region_type == "national" assert ( national.dataset_path - == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.40.4" + == "hf://policyengine/policyengine-uk-data-private/enhanced_frs_2023_24.h5@1.53.0" ) assert not national.requires_filter diff --git a/uv.lock b/uv.lock index 43a2498b..33392669 100644 --- a/uv.lock +++ b/uv.lock @@ -2484,8 +2484,8 @@ requires-dist = [ { name = "policyengine-core", marker = "extra == 'dev'", specifier = ">=3.25.0" }, { name = "policyengine-core", marker = "extra == 'uk'", specifier = ">=3.25.0" }, { name = "policyengine-core", marker = "extra == 'us'", specifier = ">=3.25.0" }, - { name = "policyengine-uk", marker = "extra == 'dev'", specifier = "==2.88.0" }, - { name = "policyengine-uk", marker = "extra == 'uk'", specifier = "==2.88.0" }, + { name = "policyengine-uk", marker = "extra == 'dev'", specifier = "==2.88.6" }, + { name = "policyengine-uk", marker = "extra == 'uk'", specifier = "==2.88.6" }, { name = "policyengine-us", marker = "extra == 'dev'", specifier = "==1.687.0" }, { name = "policyengine-us", marker = "extra == 'us'", specifier = "==1.687.0" }, { name = "psutil", specifier = ">=5.9.0" }, @@ -2533,7 +2533,7 @@ wheels = [ [[package]] name = "policyengine-uk" -version = "2.88.0" +version = "2.88.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -2543,9 +2543,9 @@ dependencies = [ { name = "tables", version = "3.10.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, { name = "tables", version = "3.11.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/72/cf/749dea25c17210b5dc40098363e0b6a60b7fc5feb69ff77c74b88deb5cde/policyengine_uk-2.88.0.tar.gz", hash = "sha256:d157c7336b7aa3a321f317af1a4f111d7b857451ff43f4998abdc5a8c893e989", size = 1166666, upload-time = "2026-04-17T19:22:55.418Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c3/d7/e6cb63bfba276602587ba4ca73ee0a95415fdca6dbf702602934a83bcf20/policyengine_uk-2.88.6.tar.gz", hash = "sha256:c853edf140ca427117d454ee244311e7132f7abc7ea60bdd45ad69e5e0389f44", size = 1170503, upload-time = "2026-04-19T20:52:34.871Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/23/7e/8a2a42eac1da63730a865964aa17e7fd4420ce4db4c80001c1b5ca6011e8/policyengine_uk-2.88.0-py3-none-any.whl", hash = "sha256:46a3ba443b43ec810c5efaccd4645edb63c8dc90ef5acf9b0cdf5ace86b9334d", size = 1867764, upload-time = "2026-04-17T19:22:53.244Z" }, + { url = "https://files.pythonhosted.org/packages/b8/38/3cf1f123e0ea25f16e3e207d46ba7ca86e6c251e3171dd7d22b18723c9a5/policyengine_uk-2.88.6-py3-none-any.whl", hash = "sha256:d89c86c4835629115d9c20005dc818462798ac1940616f788db37e2b87472e33", size = 1873598, upload-time = "2026-04-19T20:52:31.95Z" }, ] [[package]] From 5724a7137b7caba359520a98cd67739a481a352e Mon Sep 17 00:00:00 2001 From: Anthony Volk Date: Fri, 8 May 2026 00:00:50 +0200 Subject: [PATCH 2/2] Use data release metadata when refreshing bundles --- src/policyengine/provenance/bundle.py | 120 +++++++++++++++++++++++--- tests/test_bundle_refresh.py | 69 +++++++++++++++ 2 files changed, 175 insertions(+), 14 deletions(-) diff --git a/src/policyengine/provenance/bundle.py b/src/policyengine/provenance/bundle.py index 50ebf173..511e2761 100644 --- a/src/policyengine/provenance/bundle.py +++ b/src/policyengine/provenance/bundle.py @@ -40,12 +40,17 @@ from dataclasses import dataclass from pathlib import Path from typing import Optional +from urllib.error import HTTPError from urllib.request import Request, urlopen from policyengine.provenance.manifest import ( CountryReleaseManifest, + DataPackageVersion, + DataReleaseArtifact, + DataReleaseManifest, get_release_manifest, https_dataset_uri, + https_release_manifest_uri, ) # --------------------------------------------------------------------------- @@ -159,13 +164,8 @@ def _hf_dataset_sha256(repo_id: str, path: str, revision: str) -> str: path_in_repo=path, revision=revision, ) - headers = {"User-Agent": "policyengine.py"} - token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") - if token: - headers["Authorization"] = f"Bearer {token}" - hasher = hashlib.sha256() - with urlopen(Request(url, headers=headers)) as f: + with urlopen(Request(url, headers=_hf_request_headers())) as f: while True: chunk = f.read(8 * 1024 * 1024) if not chunk: @@ -174,6 +174,48 @@ def _hf_dataset_sha256(repo_id: str, path: str, revision: str) -> str: return hasher.hexdigest() +def _hf_request_headers() -> dict[str, str]: + headers = {"User-Agent": "policyengine.py"} + token = os.environ.get("HUGGING_FACE_TOKEN") or os.environ.get("HF_TOKEN") + if token: + headers["Authorization"] = f"Bearer {token}" + return headers + + +def _hf_data_release_manifest( + data_package: DataPackageVersion, + revision: str, +) -> Optional[DataReleaseManifest]: + """Fetch the HF data release manifest when a release publishes one. + + Older data releases predate ``release_manifest.json``. Those remain + supported by returning ``None`` and falling back to streaming the dataset. + """ + package_at_revision = data_package.model_copy(update={"version": revision}) + url = https_release_manifest_uri(package_at_revision) + try: + with urlopen(Request(url, headers=_hf_request_headers())) as f: + return DataReleaseManifest.model_validate_json(f.read().decode()) + except HTTPError as exc: + if exc.code == 404: + return None + if exc.code in (401, 403): + raise ValueError( + "Could not fetch the data release manifest from Hugging Face. " + "If this country uses a private data repo, set HUGGING_FACE_TOKEN." + ) from exc + raise + + +def _release_artifact( + data_release: Optional[DataReleaseManifest], + dataset: str, +) -> Optional[DataReleaseArtifact]: + if data_release is None: + return None + return data_release.artifacts.get(dataset) + + # --------------------------------------------------------------------------- # Refresh result # --------------------------------------------------------------------------- @@ -275,14 +317,52 @@ def refresh_release_bundle( f"'hf://{{owner}}/{{repo}}/{{path}}@{{revision}}'" ) repo_id, dataset_path, _old_revision = repo_id_match.groups() + data_release = ( + _hf_data_release_manifest(current.data_package, new_data) + if new_data != old_data + else None + ) + release_artifact = _release_artifact( + data_release, + current.certified_data_artifact.dataset, + ) + if release_artifact is not None: + repo_id = release_artifact.repo_id + dataset_path = release_artifact.path + artifact_revision = release_artifact.revision + else: + artifact_revision = new_data # Only hit HF if the data version actually changed. if new_data != old_data: - new_dataset_sha256 = _hf_dataset_sha256(repo_id, dataset_path, new_data) + new_dataset_sha256 = ( + release_artifact.sha256 + if release_artifact is not None and release_artifact.sha256 is not None + else _hf_dataset_sha256(repo_id, dataset_path, artifact_revision) + ) else: new_dataset_sha256 = old_dataset_sha256 - new_uri = f"hf://{repo_id}/{dataset_path}@{new_data}" + new_uri = f"hf://{repo_id}/{dataset_path}@{artifact_revision}" policyengine_version = _pyproject_version(pyproject_path) + data_build_id = ( + data_release.build.build_id + if ( + data_release is not None + and data_release.build is not None + and data_release.build.build_id is not None + ) + else f"{current.data_package.name}-{new_data}" + ) + built_with_model = ( + data_release.build.built_with_model_package + if data_release is not None and data_release.build is not None + else None + ) + if built_with_model is not None and built_with_model.name != package_name: + raise ValueError( + "Data release manifest was built with a different model package: " + f"expected {package_name}, got {built_with_model.name}." + ) # Mutate the manifest JSON in place (keep unknown fields untouched). manifest_json["model_package"]["version"] = new_model @@ -290,15 +370,27 @@ def refresh_release_bundle( manifest_json["model_package"]["wheel_url"] = new_wheel_url manifest_json["data_package"]["version"] = new_data manifest_json["certified_data_artifact"]["data_package"]["version"] = new_data - manifest_json["certified_data_artifact"]["build_id"] = ( - f"{current.data_package.name}-{new_data}" - ) + manifest_json["certified_data_artifact"]["build_id"] = data_build_id manifest_json["certified_data_artifact"]["uri"] = new_uri manifest_json["certified_data_artifact"]["sha256"] = new_dataset_sha256 - manifest_json["certification"]["data_build_id"] = ( - f"{current.data_package.name}-{new_data}" - ) + manifest_json["certification"]["data_build_id"] = data_build_id manifest_json["certification"]["certified_for_model_version"] = new_model + if data_release is not None: + manifest_json["certification"].pop("built_with_model_version", None) + manifest_json["certification"].pop("built_with_model_git_sha", None) + manifest_json["certification"].pop("data_build_fingerprint", None) + if built_with_model is not None: + manifest_json["certification"]["built_with_model_version"] = ( + built_with_model.version + ) + if built_with_model.git_sha is not None: + manifest_json["certification"]["built_with_model_git_sha"] = ( + built_with_model.git_sha + ) + if built_with_model.data_build_fingerprint is not None: + manifest_json["certification"]["data_build_fingerprint"] = ( + built_with_model.data_build_fingerprint + ) manifest_path.write_text( json.dumps(manifest_json, indent=2, sort_keys=False) + "\n" diff --git a/tests/test_bundle_refresh.py b/tests/test_bundle_refresh.py index 85a12b1c..5ed6b482 100644 --- a/tests/test_bundle_refresh.py +++ b/tests/test_bundle_refresh.py @@ -24,6 +24,7 @@ import json from pathlib import Path from unittest.mock import patch +from urllib.error import HTTPError import pytest @@ -78,6 +79,10 @@ def __exit__(self, *args): self._buffer.close() +def _missing_release_manifest(url: str) -> HTTPError: + return HTTPError(url, 404, "Not Found", hdrs=None, fp=None) + + @pytest.fixture def sandbox(tmp_path: Path) -> dict: """A writable scratch copy of the US release manifest + a stub @@ -184,6 +189,8 @@ def test__bump_data_only_streams_hf_and_updates_uri(sandbox) -> None: def fake_urlopen(request, *args, **kwargs): url = request.full_url + if url.endswith("/release_manifest.json"): + raise _missing_release_manifest(url) if "huggingface.co" in url: assert "@" not in url # URI revision is in the URL path assert "/datasets/" not in url @@ -224,6 +231,8 @@ def fake_urlopen(request, *args, **kwargs): url = request.full_url if "pypi.org" in url: return _pypi_response("policyengine-us", "1.653.3") + if url.endswith("/release_manifest.json"): + raise _missing_release_manifest(url) if "huggingface.co" in url: return _FakeHFResponse(hf_bytes) raise AssertionError(url) @@ -242,6 +251,66 @@ def fake_urlopen(request, *args, **kwargs): assert result.new_data == "1.83.4" +def test__bump_data_with_release_manifest_updates_build_metadata(sandbox) -> None: + release_manifest = { + "schema_version": 1, + "data_package": { + "name": "policyengine-us-data", + "version": "1.83.4", + }, + "build": { + "build_id": "policyengine-us-data-1.83.4", + "built_with_model_package": { + "name": "policyengine-us", + "version": "1.653.3", + "git_sha": "abc123", + "data_build_fingerprint": "sha256:" + "f" * 64, + }, + }, + "artifacts": { + "enhanced_cps_2024": { + "kind": "dataset", + "path": "enhanced_cps_2024.h5", + "repo_id": "policyengine/policyengine-us-data", + "revision": "1.83.4", + "sha256": "e" * 64, + } + }, + } + + def fake_urlopen(request, *args, **kwargs): + url = request.full_url + if "pypi.org" in url: + return _pypi_response("policyengine-us", "1.653.3") + if url.endswith("/release_manifest.json"): + return io.BytesIO(json.dumps(release_manifest).encode()) + if "huggingface.co" in url: + raise AssertionError(f"Dataset should not be streamed: {url}") + raise AssertionError(url) + + with patch("policyengine.provenance.bundle.urlopen", side_effect=fake_urlopen): + result = refresh_release_bundle( + country="us", + model_version="1.653.3", + data_version="1.83.4", + manifest_dir=sandbox["manifest_dir"], + pyproject_path=sandbox["pyproject_path"], + ) + + assert result.new_dataset_sha256 == "e" * 64 + + written = json.loads((sandbox["manifest_dir"] / "us.json").read_text()) + assert written["certified_data_artifact"]["sha256"] == "e" * 64 + assert ( + written["certified_data_artifact"]["build_id"] == "policyengine-us-data-1.83.4" + ) + assert written["certification"]["data_build_id"] == "policyengine-us-data-1.83.4" + assert written["certification"]["built_with_model_version"] == "1.653.3" + assert written["certification"]["built_with_model_git_sha"] == "abc123" + assert written["certification"]["data_build_fingerprint"] == "sha256:" + "f" * 64 + assert written["certification"]["certified_for_model_version"] == "1.653.3" + + def test__update_pyproject_false_leaves_pins_alone(sandbox) -> None: def fake_urlopen(*args, **kwargs): return _pypi_response("policyengine-us", "1.653.3")