diff --git a/.github/workflows/export-columns.yml b/.github/workflows/export-columns.yml new file mode 100644 index 0000000..50eccbc --- /dev/null +++ b/.github/workflows/export-columns.yml @@ -0,0 +1,40 @@ +name: Export columns (eCPS parity) + +# Fast, first-line contract gate. Compares an export's column set against +# the frozen eCPS contract in milliseconds, with no H5 build, no GPU, and +# none of the heavy ML deps (microplex / torch / policyengine-us). This is +# a standalone workflow on purpose so column drift is caught before the +# slow artifact-gate and site-snapshot jobs run. + +on: + push: + branches: [main] + pull_request: + +permissions: + contents: read + +jobs: + column-parity: + runs-on: ubuntu-latest + steps: + - name: Check out repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + - name: Set up uv + uses: astral-sh/setup-uv@v6 + with: + version: "0.11.14" + - name: Install minimal deps + # Only what the gate and its tests need; no microplex / torch. + run: uv pip install --system pytest h5py numpy + - name: Run column-parity tests + run: PYTHONPATH=src python -m pytest tests/pipelines/test_check_export_columns.py -q + - name: Self-check against committed clean fixture + # Run the module as a file (not `-m`) so the package __init__ + # (which imports microplex/torch) never loads. Proves the gate + # exits 0 on a known-good column set, with no data file at all. 
+ run: python src/microplex_us/pipelines/check_export_columns.py --columns-json tests/pipelines/fixtures/ecps_clean_columns.json diff --git a/pyproject.toml b/pyproject.toml index 3019ee1..a94857c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ microplex-us-arch-target-smoke = "microplex_us.targets.arch:main_smoke" microplex-us-build-aca-ptc-multipliers = "microplex_us.targets.aca_ptc:main" microplex-us-backfill-pe-native-audit = "microplex_us.pipelines.backfill_pe_native_audit:main" microplex-us-backfill-pe-native-scores = "microplex_us.pipelines.backfill_pe_native_scores:main" +microplex-us-check-export-columns = "microplex_us.pipelines.check_export_columns:main" microplex-us-check-site-snapshot = "microplex_us.pipelines.check_site_snapshot:main" microplex-us-compact-policyengine-dataset = "microplex_us.pipelines.compact_policyengine_dataset:main" microplex-us-mp300k-artifact-gates = "microplex_us.pipelines.mp300k_artifact_gates:main" @@ -72,6 +73,7 @@ allow-direct-references = true [tool.hatch.build.targets.wheel.force-include] "src/microplex_us/pipelines/pe_native_scores.py" = "microplex_us/pipelines/pe_native_scores.py" +"src/microplex_us/pipelines/ecps_export_contract.json" = "microplex_us/pipelines/ecps_export_contract.json" [tool.pytest.ini_options] testpaths = ["tests"] diff --git a/src/microplex_us/pipelines/check_export_columns.py b/src/microplex_us/pipelines/check_export_columns.py new file mode 100644 index 0000000..aa8cf64 --- /dev/null +++ b/src/microplex_us/pipelines/check_export_columns.py @@ -0,0 +1,253 @@ +"""Fast eCPS column-parity check for exported datasets. + +This is the cheap, millisecond gate that should pass *before* the +expensive MP-300k build. It compares the column set of a candidate export +against a frozen contract describing what the enhanced CPS (eCPS) baseline +exports, so column drift is catchable locally and in CI without producing +any data. 
+ +The required/forbidden column diff here mirrors the one inside +``_column_contract_gate`` in ``mp300k_artifact_gates`` (``required +- present`` and ``forbidden & present``) -- but that gate only runs deep +in the slow artifact path. This module surfaces the same check as a +one-line local command and the first, cheap CI job. It also reuses that +module's ``_h5_top_level_columns`` helper for H5 parsing so the two cannot +read columns differently. + +The contract (``ecps_export_contract.json``) defines four categories: + +- ``required`` -- columns MP must export to be a drop-in eCPS replacement. +- ``ecps_internal_optional`` -- eCPS clone-bookkeeping columns MP need not + export (neither required nor forbidden). +- ``forbidden`` -- transient takeup-input columns eCPS drops and MP must + not export. +- ``formula_owned_excluded`` -- formula-owned columns the baseline does not + persist as inputs; MP need not export them (neither required nor + forbidden). + +Heavy imports (``h5py``, via the gate helper) are deferred so importing +this module and running the ``--columns-json`` path stay cheap. + +Usage:: + + python -m microplex_us.pipelines.check_export_columns export.h5 + python -m microplex_us.pipelines.check_export_columns \\ + --columns-json columns.json + python -m microplex_us.pipelines.check_export_columns export.h5 \\ + --contract custom_contract.json + +Exits 1 if any required column is missing or any forbidden column is +present; exits 0 otherwise. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from dataclasses import dataclass +from pathlib import Path + +# Path to the committed contract shipped alongside this module. 
+DEFAULT_CONTRACT_PATH = Path(__file__).with_name("ecps_export_contract.json") + + +@dataclass +class ColumnDiff: + """Result of comparing a present column set against a contract.""" + + missing_required: list[str] + forbidden_present: list[str] + extra_unknown: list[str] + + @property + def ok(self) -> bool: + """True when no required column is missing and none forbidden.""" + return not self.missing_required and not self.forbidden_present + + +def compute_column_diff( + present: set[str], + *, + required: set[str], + forbidden: set[str], + optional: frozenset[str] | set[str] = frozenset(), + excluded: frozenset[str] | set[str] = frozenset(), +) -> ColumnDiff: + """Compare a present column set against contract categories. + + Mirrors the required/forbidden diff in ``_column_contract_gate`` in + ``mp300k_artifact_gates`` (``required - present`` and ``forbidden & + present``). ``optional`` (clone-bookkeeping flags) and ``excluded`` + (formula-owned columns MP need not export) are recognized categories, so + they never appear in ``extra_unknown``. ``extra_unknown`` is informational + only: columns present that are in no known category. 
+ """ + missing_required = required - present + forbidden_present = forbidden & present + known = required | forbidden | set(optional) | set(excluded) + extra_unknown = present - known + return ColumnDiff( + missing_required=sorted(missing_required), + forbidden_present=sorted(forbidden_present), + extra_unknown=sorted(extra_unknown), + ) + + +def load_contract(path: Path) -> dict: + """Load and validate the column-parity contract JSON.""" + with open(path) as f: + contract = json.load(f) + for key in ("required", "forbidden"): + if key not in contract: + raise ValueError(f"Contract {path} is missing required key '{key}'.") + contract.setdefault("ecps_internal_optional", []) + contract.setdefault("formula_owned_excluded", []) + return contract + + +def _gate_h5_top_level_columns(): + """Return the artifact gate's ``_h5_top_level_columns`` helper. + + Loaded from the sibling module *by file path* (not by package import) + so neither importing this module nor running it as a script pulls the + heavy ``microplex_us`` package ``__init__`` (and ``microplex`` / + torch). ``h5py`` is imported only as a side effect of executing the + gate module here, on the H5 path. The module is registered in + ``sys.modules`` before execution so its dataclasses resolve. + """ + import importlib.util + + module_name = "_mp300k_artifact_gates_for_columns" + cached = sys.modules.get(module_name) + if cached is not None: + return cached._h5_top_level_columns + + gate_path = Path(__file__).with_name("mp300k_artifact_gates.py") + spec = importlib.util.spec_from_file_location(module_name, gate_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module._h5_top_level_columns + + +def _columns_from_h5(h5_path: Path) -> set[str]: + """Return top-level base column names from an exported H5. + + Reuses the artifact-gate helper so H5 parsing stays identical. 
Columns + may be datasets named ```` or groups ``/``; + both collapse to the base name. + """ + return _gate_h5_top_level_columns()(h5_path) + + +def _columns_from_json(json_path: Path) -> set[str]: + """Return base column names from a JSON list (no data file needed).""" + with open(json_path) as f: + names = json.load(f) + if not isinstance(names, list): + raise ValueError( + f"--columns-json {json_path} must contain a JSON list of column names." + ) + return {str(name).split("/")[0] for name in names} + + +def _bullet_lines(items: list[str]) -> list[str]: + """Render a list as indented bullets, or a placeholder if empty.""" + if not items: + return [" (none)"] + return [f" - {item}" for item in items] + + +def _format_report( + diff: ColumnDiff, + *, + source: str, + n_present: int, + n_required: int, + n_forbidden: int, +) -> str: + """Build a human-readable report for the diff.""" + lines = [ + "eCPS column-parity check", + f" source: {source}", + f" columns present: {n_present}", + f" required (contract): {n_required}", + f" forbidden (contract): {n_forbidden}", + "", + f" missing_required ({len(diff.missing_required)}):", + *_bullet_lines(diff.missing_required), + f" forbidden_present ({len(diff.forbidden_present)}):", + *_bullet_lines(diff.forbidden_present), + f" extra_unknown (informational, {len(diff.extra_unknown)}):", + *_bullet_lines(diff.extra_unknown), + "", + " RESULT: " + ("PASS" if diff.ok else "FAIL"), + ] + return "\n".join(lines) + + +def main(argv: list[str] | None = None) -> int: + """Run the column-parity check; return the process exit code.""" + parser = argparse.ArgumentParser( + prog="check_export_columns", + description=( + "Fast eCPS column-parity check: compare a candidate export's " + "columns to the frozen eCPS contract. Produces no data." 
+ ), + ) + parser.add_argument( + "h5path", + nargs="?", + help="Path to an exported H5 whose columns are checked.", + ) + parser.add_argument( + "--columns-json", + metavar="FILE", + help=( + "Path to a JSON list of column names to check instead of an " + "H5 (the no-data CI path). Mutually exclusive with h5path." + ), + ) + parser.add_argument( + "--contract", + metavar="FILE", + default=str(DEFAULT_CONTRACT_PATH), + help="Override the contract JSON (default: committed contract).", + ) + args = parser.parse_args(argv) + + if bool(args.h5path) == bool(args.columns_json): + parser.error("provide exactly one of an H5 path or --columns-json.") + + contract = load_contract(Path(args.contract)) + required = set(contract["required"]) + forbidden = set(contract["forbidden"]) + optional = set(contract["ecps_internal_optional"]) + excluded = set(contract.get("formula_owned_excluded", [])) + + if args.columns_json: + source = args.columns_json + present = _columns_from_json(Path(args.columns_json)) + else: + source = args.h5path + present = _columns_from_h5(Path(args.h5path)) + + diff = compute_column_diff( + present, + required=required, + forbidden=forbidden, + optional=optional, + excluded=excluded, + ) + print( + _format_report( + diff, + source=source, + n_present=len(present), + n_required=len(required), + n_forbidden=len(forbidden), + ) + ) + return 0 if diff.ok else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/microplex_us/pipelines/ecps_export_contract.json b/src/microplex_us/pipelines/ecps_export_contract.json new file mode 100644 index 0000000..2c31626 --- /dev/null +++ b/src/microplex_us/pipelines/ecps_export_contract.json @@ -0,0 +1,292 @@ +{ + "_description": "Frozen column-parity contract for the eCPS export. Defines what a Microplex (MP) dataset must export to be a drop-in replacement for the enhanced CPS (eCPS) baseline. Derived from the 252-column eCPS baseline H5 export. 
Consumed by check_export_columns.py; mirrors the required/forbidden diff in _column_contract_gate (mp300k_artifact_gates.py).", + "_categories": { + "required": "Columns MP MUST export to match eCPS. The gate FAILS if any are missing from the candidate export. Equals the 252 clone-correct baseline columns minus the 5 clone-bookkeeping flags (ecps_internal_optional) and weeks_worked (formula_owned_excluded).", + "ecps_internal_optional": "eCPS clone-bookkeeping columns (PUF-clone flags). eCPS emits these internally; MP is NOT required to export them, so they never count as missing. The gate neither requires nor forbids them.", + "forbidden": "Columns eCPS deliberately drops from its export: the transient *_reported takeup-input family and the PUF reported/calculated tax-credit outputs. The gate FAILS if MP exports any.", + "formula_owned_excluded": "pe-us FORMULA variables the baseline does NOT persist as inputs (currently just weeks_worked). Structural/overridable computed fields (has_tin/has_itin/in_nyc/fsla_overtime_premium/meets_ssi_disability_criteria) are REQUIRED, not here, per the in-tree _column_contract_gate." 
+ }, + "_source": "eCPS clone-correct baseline H5 export column set (252 columns).", + "required": [ + "age", + "alimony_expense", + "alimony_income", + "american_opportunity_credit_claimed_prior_years", + "attends_eligible_educational_institution_for_american_opportunity_credit", + "auto_loan_balance", + "auto_loan_interest", + "bank_account_assets", + "block_geoid", + "bond_assets", + "business_is_sstb", + "casualty_loss", + "charitable_cash_donations", + "charitable_non_cash_donations", + "child_support_expense", + "child_support_received", + "congressional_district_geoid", + "count_under_18", + "count_under_6", + "county_fips", + "cps_race", + "detailed_occupation_recode", + "difficulty_doing_errands", + "difficulty_dressing_or_bathing", + "difficulty_hearing", + "difficulty_remembering_or_making_decisions", + "difficulty_seeing", + "difficulty_walking_or_climbing_stairs", + "disability_benefits", + "domestic_production_ald", + "educational_assistance", + "educator_expense", + "employer_sponsored_insurance_premiums", + "employment_income_before_lsr", + "estate_income", + "estate_income_would_be_qualified", + "family_id", + "farm_income", + "farm_operations_income", + "farm_operations_income_would_be_qualified", + "farm_rent_income", + "farm_rent_income_would_be_qualified", + "financial_assistance", + "first_home_mortgage_balance", + "first_home_mortgage_interest", + "first_home_mortgage_origination_year", + "fsla_overtime_premium", + "has_american_opportunity_credit_1098_t_or_exception", + "has_american_opportunity_credit_institution_ein", + "has_champva_health_coverage_at_interview", + "has_completed_first_four_years_of_postsecondary_education", + "has_esi", + "has_felony_drug_conviction", + "has_indian_health_service_coverage_at_interview", + "has_itin", + "has_marketplace_health_coverage", + "has_marketplace_health_coverage_at_interview", + "has_medicaid_health_coverage_at_interview", + "has_never_worked", + 
"has_non_marketplace_direct_purchase_health_coverage_at_interview", + "has_other_means_tested_health_coverage_at_interview", + "has_tin", + "has_tricare_health_coverage_at_interview", + "has_va_health_coverage_at_interview", + "has_valid_ssn", + "health_insurance_premiums_without_medicare_part_b", + "health_savings_account_ald", + "home_mortgage_interest", + "hourly_wage", + "hours_worked_last_week", + "household_id", + "household_vehicles_owned", + "household_vehicles_value", + "household_weight", + "immigration_status_str", + "in_nyc", + "investment_income_elected_form_4952", + "investment_interest_expense", + "is_blind", + "is_computer_scientist", + "is_disabled", + "is_enrolled_at_least_half_time_for_american_opportunity_credit", + "is_executive_administrative_professional", + "is_farmer_fisher", + "is_female", + "is_full_time_college_student", + "is_hispanic", + "is_household_head", + "is_military", + "is_paid_hourly", + "is_pregnant", + "is_pursuing_credential_for_american_opportunity_credit", + "is_separated", + "is_surviving_spouse", + "is_tipped_occupation", + "is_union_member_or_covered", + "is_unmarried_partner_of_household_head", + "is_wic_at_nutritional_risk", + "keogh_distributions", + "long_term_capital_gains_before_response", + "long_term_capital_gains_on_collectibles", + "marital_unit_id", + "meets_ssi_disability_criteria", + "miscellaneous_income", + "net_worth", + "non_qualified_dividend_income", + "non_sch_d_capital_gains", + "other_health_insurance_premiums", + "other_medical_expenses", + "other_type_retirement_account_distributions", + "over_the_counter_health_expenses", + "own_children_in_household", + "partnership_s_corp_income", + "partnership_s_corp_income_would_be_qualified", + "partnership_se_income", + "person_family_id", + "person_household_id", + "person_id", + "person_marital_unit_id", + "person_spm_unit_id", + "person_tax_unit_id", + "pre_subsidy_rent", + "previous_year_income_available", + "qualified_bdc_income", + 
"qualified_dividend_income", + "qualified_reit_and_ptp_income", + "qualified_tuition_expenses", + "real_estate_taxes", + "receives_housing_assistance", + "receives_wic", + "regular_ira_distributions", + "rental_income", + "rental_income_would_be_qualified", + "reported_has_champva_health_coverage_at_interview", + "reported_has_chip_health_coverage_at_interview", + "reported_has_direct_purchase_health_coverage_at_interview", + "reported_has_employer_sponsored_health_coverage_at_interview", + "reported_has_indian_health_service_coverage_at_interview", + "reported_has_marketplace_health_coverage_at_interview", + "reported_has_means_tested_health_coverage_at_interview", + "reported_has_medicaid_health_coverage_at_interview", + "reported_has_medicare_health_coverage_at_interview", + "reported_has_multiple_health_coverage_at_interview", + "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview", + "reported_has_other_means_tested_health_coverage_at_interview", + "reported_has_private_health_coverage_at_interview", + "reported_has_public_health_coverage_at_interview", + "reported_has_subsidized_marketplace_health_coverage_at_interview", + "reported_has_tricare_health_coverage_at_interview", + "reported_has_unsubsidized_marketplace_health_coverage_at_interview", + "reported_has_va_health_coverage_at_interview", + "reported_is_insured_at_interview", + "reported_is_uninsured_at_interview", + "reported_owns_employer_sponsored_health_insurance_at_interview", + "roth_401k_contributions_desired", + "roth_ira_contributions_desired", + "roth_ira_distributions", + "salt_refund_income", + "scf_business_equity", + "scf_cash_value_life_insurance", + "scf_certificates_of_deposit", + "scf_credit_card_debt", + "scf_mortgage_debt", + "scf_nonresidential_real_estate_equity", + "scf_other_debt", + "scf_other_financial_assets", + "scf_other_installment_debt", + "scf_other_lines_of_credit", + "scf_other_managed_assets", + "scf_other_nonfinancial_assets", + 
"scf_other_residential_debt", + "scf_other_residential_real_estate", + "scf_primary_residence_value", + "scf_retirement_assets", + "scf_savings_bonds", + "scf_student_loan_debt", + "scf_vehicle_installment_debt", + "second_home_mortgage_balance", + "second_home_mortgage_interest", + "second_home_mortgage_origination_year", + "selected_marketplace_plan_benchmark_ratio", + "self_employed_pension_contributions_desired", + "self_employment_income_before_lsr", + "self_employment_income_last_year", + "self_employment_income_would_be_qualified", + "short_term_capital_gains", + "social_security_dependents", + "social_security_disability", + "social_security_retirement", + "social_security_survivors", + "spm_unit_energy_subsidy", + "spm_unit_id", + "spm_unit_pre_subsidy_childcare_expenses", + "spm_unit_tenure_type", + "ssn_card_type", + "sstb_self_employment_income_before_lsr", + "sstb_self_employment_income_would_be_qualified", + "sstb_unadjusted_basis_qualified_property", + "sstb_w2_wages_from_qualified_business", + "state_fips", + "stock_assets", + "strike_benefits", + "student_loan_interest", + "survivor_benefits", + "takes_up_aca_if_eligible", + "takes_up_dc_ptc", + "takes_up_early_head_start_if_eligible", + "takes_up_eitc", + "takes_up_head_start_if_eligible", + "takes_up_housing_assistance_if_eligible", + "takes_up_medicaid_if_eligible", + "takes_up_medicare_if_eligible", + "takes_up_snap_if_eligible", + "takes_up_ssi_if_eligible", + "takes_up_tanf_if_eligible", + "tax_exempt_401k_distributions", + "tax_exempt_403b_distributions", + "tax_exempt_interest_income", + "tax_exempt_ira_distributions", + "tax_exempt_private_pension_income", + "tax_exempt_sep_distributions", + "tax_unit_id", + "taxable_401k_distributions", + "taxable_403b_distributions", + "taxable_interest_income", + "taxable_ira_distributions", + "taxable_private_pension_income", + "taxable_sep_distributions", + "taxpayer_id_type", + "tenure_type", + "tip_income", + "tract_geoid", + 
"traditional_401k_contributions_desired", + "traditional_ira_contributions_desired", + "treasury_tipped_occupation_code", + "unadjusted_basis_qualified_property", + "unemployment_compensation", + "unrecaptured_section_1250_gain", + "unreimbursed_business_employee_expenses", + "veterans_benefits", + "w2_wages_from_qualified_business", + "weekly_hours_worked_before_lsr", + "weeks_unemployed", + "workers_compensation", + "would_claim_wic", + "would_file_taxes_voluntarily" + ], + "ecps_internal_optional": [ + "family_is_puf_clone", + "household_is_puf_clone", + "person_is_puf_clone", + "spm_unit_is_puf_clone", + "tax_unit_is_puf_clone" + ], + "forbidden": [ + "amt_foreign_tax_credit", + "early_withdrawal_penalty", + "excess_withheld_payroll_tax", + "free_school_meals_reported", + "general_business_credit", + "other_credits", + "prior_year_minimum_tax_credit", + "recapture_of_investment_credit", + "reduced_price_school_meals_reported", + "snap_reported", + "spm_unit_broadband_subsidy_reported", + "spm_unit_capped_housing_subsidy_reported", + "spm_unit_energy_subsidy_reported", + "spm_unit_federal_tax_reported", + "spm_unit_net_income_reported", + "spm_unit_payroll_tax_reported", + "spm_unit_state_tax_reported", + "spm_unit_total_income_reported", + "spm_unit_wic_reported", + "ssi_reported", + "tanf_reported", + "unreported_payroll_tax" + ], + "formula_owned_excluded": [ + "weeks_worked" + ] +} \ No newline at end of file diff --git a/src/microplex_us/pipelines/mp300k_artifact_gates.py b/src/microplex_us/pipelines/mp300k_artifact_gates.py index 5305364..6e9c7c3 100644 --- a/src/microplex_us/pipelines/mp300k_artifact_gates.py +++ b/src/microplex_us/pipelines/mp300k_artifact_gates.py @@ -537,6 +537,20 @@ def _computed_policyengine_us_export_columns(columns: list[str]) -> set[str]: ) +def _h5_top_level_columns(candidate_dataset: Path) -> set[str]: + """Return base column names at the top level of an exported H5. 
+ + Accepts both shapes a column can take: a group ``<column>/<period>`` + (the eCPS export layout) or a flat dataset ``<column>``. Names are + collapsed to the base column via ``split("/")[0]`` so the two are + comparable. Shared by the fast column-parity CLI + (``check_export_columns``) so it reads columns the same way the + artifact path does. + """ + with h5py.File(candidate_dataset, "r") as handle: + return {name.split("/")[0] for name in handle.keys()} + + def _h5_period_columns(path: Path, *, period_key: str) -> list[str]: with h5py.File(path, "r") as handle: return sorted( @@ -1441,13 +1455,17 @@ def _source_weight_diagnostics_gate( "largest_source_household_weight_share", ) if isinstance(summary, dict): - support_share = support_share if support_share is not None else _first_present( - summary, - "support_household_weight_share", - "support_weight_share", - "puf_support_household_weight_share", - "puf_clone_household_weight_share", - "clone_household_weight_share", + support_share = ( + support_share + if support_share is not None + else _first_present( + summary, + "support_household_weight_share", + "support_weight_share", + "puf_support_household_weight_share", + "puf_clone_household_weight_share", + "clone_household_weight_share", + ) ) puf_support_share = ( puf_support_share @@ -1458,10 +1476,14 @@ def _source_weight_diagnostics_gate( "puf_clone_household_weight_share", ) ) - max_source_share = max_source_share if max_source_share is not None else _first_present( - summary, - "max_source_household_weight_share", - "largest_source_household_weight_share", + max_source_share = ( + max_source_share + if max_source_share is not None + else _first_present( + summary, + "max_source_household_weight_share", + "largest_source_household_weight_share", + ) ) if entries: @@ -1706,9 +1728,7 @@ def _benchmark_manifest_gate( "pass", "frozen microsimulation benchmark manifest pins baseline, target, and package evidence", metrics={ - "required_evidence_count": len( - 
_REQUIRED_BENCHMARK_MANIFEST_EVIDENCE - ), + "required_evidence_count": len(_REQUIRED_BENCHMARK_MANIFEST_EVIDENCE), "present_evidence_count": len(evidence["present"]), }, details={**descriptor, "present_evidence": evidence["present"]}, diff --git a/tests/pipelines/fixtures/ecps_clean_columns.json b/tests/pipelines/fixtures/ecps_clean_columns.json new file mode 100644 index 0000000..0059341 --- /dev/null +++ b/tests/pipelines/fixtures/ecps_clean_columns.json @@ -0,0 +1,254 @@ +[ + "age", + "alimony_expense", + "alimony_income", + "american_opportunity_credit_claimed_prior_years", + "attends_eligible_educational_institution_for_american_opportunity_credit", + "auto_loan_balance", + "auto_loan_interest", + "bank_account_assets", + "block_geoid", + "bond_assets", + "business_is_sstb", + "casualty_loss", + "charitable_cash_donations", + "charitable_non_cash_donations", + "child_support_expense", + "child_support_received", + "congressional_district_geoid", + "count_under_18", + "count_under_6", + "county_fips", + "cps_race", + "detailed_occupation_recode", + "difficulty_doing_errands", + "difficulty_dressing_or_bathing", + "difficulty_hearing", + "difficulty_remembering_or_making_decisions", + "difficulty_seeing", + "difficulty_walking_or_climbing_stairs", + "disability_benefits", + "domestic_production_ald", + "educational_assistance", + "educator_expense", + "employer_sponsored_insurance_premiums", + "employment_income_before_lsr", + "estate_income", + "estate_income_would_be_qualified", + "family_id", + "family_is_puf_clone", + "farm_income", + "farm_operations_income", + "farm_operations_income_would_be_qualified", + "farm_rent_income", + "farm_rent_income_would_be_qualified", + "financial_assistance", + "first_home_mortgage_balance", + "first_home_mortgage_interest", + "first_home_mortgage_origination_year", + "fsla_overtime_premium", + "has_american_opportunity_credit_1098_t_or_exception", + "has_american_opportunity_credit_institution_ein", + 
"has_champva_health_coverage_at_interview", + "has_completed_first_four_years_of_postsecondary_education", + "has_esi", + "has_felony_drug_conviction", + "has_indian_health_service_coverage_at_interview", + "has_itin", + "has_marketplace_health_coverage", + "has_marketplace_health_coverage_at_interview", + "has_medicaid_health_coverage_at_interview", + "has_never_worked", + "has_non_marketplace_direct_purchase_health_coverage_at_interview", + "has_other_means_tested_health_coverage_at_interview", + "has_tin", + "has_tricare_health_coverage_at_interview", + "has_va_health_coverage_at_interview", + "has_valid_ssn", + "health_insurance_premiums_without_medicare_part_b", + "health_savings_account_ald", + "home_mortgage_interest", + "hourly_wage", + "hours_worked_last_week", + "household_id", + "household_is_puf_clone", + "household_vehicles_owned", + "household_vehicles_value", + "household_weight", + "immigration_status_str", + "in_nyc", + "investment_income_elected_form_4952", + "investment_interest_expense", + "is_blind", + "is_computer_scientist", + "is_disabled", + "is_enrolled_at_least_half_time_for_american_opportunity_credit", + "is_executive_administrative_professional", + "is_farmer_fisher", + "is_female", + "is_full_time_college_student", + "is_hispanic", + "is_household_head", + "is_military", + "is_paid_hourly", + "is_pregnant", + "is_pursuing_credential_for_american_opportunity_credit", + "is_separated", + "is_surviving_spouse", + "is_tipped_occupation", + "is_union_member_or_covered", + "is_unmarried_partner_of_household_head", + "is_wic_at_nutritional_risk", + "keogh_distributions", + "long_term_capital_gains_before_response", + "long_term_capital_gains_on_collectibles", + "marital_unit_id", + "meets_ssi_disability_criteria", + "miscellaneous_income", + "net_worth", + "non_qualified_dividend_income", + "non_sch_d_capital_gains", + "other_health_insurance_premiums", + "other_medical_expenses", + "other_type_retirement_account_distributions", + 
"over_the_counter_health_expenses", + "own_children_in_household", + "partnership_s_corp_income", + "partnership_s_corp_income_would_be_qualified", + "partnership_se_income", + "person_family_id", + "person_household_id", + "person_id", + "person_is_puf_clone", + "person_marital_unit_id", + "person_spm_unit_id", + "person_tax_unit_id", + "pre_subsidy_rent", + "previous_year_income_available", + "qualified_bdc_income", + "qualified_dividend_income", + "qualified_reit_and_ptp_income", + "qualified_tuition_expenses", + "real_estate_taxes", + "receives_housing_assistance", + "receives_wic", + "regular_ira_distributions", + "rental_income", + "rental_income_would_be_qualified", + "reported_has_champva_health_coverage_at_interview", + "reported_has_chip_health_coverage_at_interview", + "reported_has_direct_purchase_health_coverage_at_interview", + "reported_has_employer_sponsored_health_coverage_at_interview", + "reported_has_indian_health_service_coverage_at_interview", + "reported_has_marketplace_health_coverage_at_interview", + "reported_has_means_tested_health_coverage_at_interview", + "reported_has_medicaid_health_coverage_at_interview", + "reported_has_medicare_health_coverage_at_interview", + "reported_has_multiple_health_coverage_at_interview", + "reported_has_non_marketplace_direct_purchase_health_coverage_at_interview", + "reported_has_other_means_tested_health_coverage_at_interview", + "reported_has_private_health_coverage_at_interview", + "reported_has_public_health_coverage_at_interview", + "reported_has_subsidized_marketplace_health_coverage_at_interview", + "reported_has_tricare_health_coverage_at_interview", + "reported_has_unsubsidized_marketplace_health_coverage_at_interview", + "reported_has_va_health_coverage_at_interview", + "reported_is_insured_at_interview", + "reported_is_uninsured_at_interview", + "reported_owns_employer_sponsored_health_insurance_at_interview", + "roth_401k_contributions_desired", + "roth_ira_contributions_desired", + 
"roth_ira_distributions", + "salt_refund_income", + "scf_business_equity", + "scf_cash_value_life_insurance", + "scf_certificates_of_deposit", + "scf_credit_card_debt", + "scf_mortgage_debt", + "scf_nonresidential_real_estate_equity", + "scf_other_debt", + "scf_other_financial_assets", + "scf_other_installment_debt", + "scf_other_lines_of_credit", + "scf_other_managed_assets", + "scf_other_nonfinancial_assets", + "scf_other_residential_debt", + "scf_other_residential_real_estate", + "scf_primary_residence_value", + "scf_retirement_assets", + "scf_savings_bonds", + "scf_student_loan_debt", + "scf_vehicle_installment_debt", + "second_home_mortgage_balance", + "second_home_mortgage_interest", + "second_home_mortgage_origination_year", + "selected_marketplace_plan_benchmark_ratio", + "self_employed_pension_contributions_desired", + "self_employment_income_before_lsr", + "self_employment_income_last_year", + "self_employment_income_would_be_qualified", + "short_term_capital_gains", + "social_security_dependents", + "social_security_disability", + "social_security_retirement", + "social_security_survivors", + "spm_unit_energy_subsidy", + "spm_unit_id", + "spm_unit_is_puf_clone", + "spm_unit_pre_subsidy_childcare_expenses", + "spm_unit_tenure_type", + "ssn_card_type", + "sstb_self_employment_income_before_lsr", + "sstb_self_employment_income_would_be_qualified", + "sstb_unadjusted_basis_qualified_property", + "sstb_w2_wages_from_qualified_business", + "state_fips", + "stock_assets", + "strike_benefits", + "student_loan_interest", + "survivor_benefits", + "takes_up_aca_if_eligible", + "takes_up_dc_ptc", + "takes_up_early_head_start_if_eligible", + "takes_up_eitc", + "takes_up_head_start_if_eligible", + "takes_up_housing_assistance_if_eligible", + "takes_up_medicaid_if_eligible", + "takes_up_medicare_if_eligible", + "takes_up_snap_if_eligible", + "takes_up_ssi_if_eligible", + "takes_up_tanf_if_eligible", + "tax_exempt_401k_distributions", + "tax_exempt_403b_distributions", 
+ "tax_exempt_interest_income", + "tax_exempt_ira_distributions", + "tax_exempt_private_pension_income", + "tax_exempt_sep_distributions", + "tax_unit_id", + "tax_unit_is_puf_clone", + "taxable_401k_distributions", + "taxable_403b_distributions", + "taxable_interest_income", + "taxable_ira_distributions", + "taxable_private_pension_income", + "taxable_sep_distributions", + "taxpayer_id_type", + "tenure_type", + "tip_income", + "tract_geoid", + "traditional_401k_contributions_desired", + "traditional_ira_contributions_desired", + "treasury_tipped_occupation_code", + "unadjusted_basis_qualified_property", + "unemployment_compensation", + "unrecaptured_section_1250_gain", + "unreimbursed_business_employee_expenses", + "veterans_benefits", + "w2_wages_from_qualified_business", + "weekly_hours_worked_before_lsr", + "weeks_unemployed", + "weeks_worked", + "workers_compensation", + "would_claim_wic", + "would_file_taxes_voluntarily" +] \ No newline at end of file diff --git a/tests/pipelines/test_check_export_columns.py b/tests/pipelines/test_check_export_columns.py new file mode 100644 index 0000000..58b580d --- /dev/null +++ b/tests/pipelines/test_check_export_columns.py @@ -0,0 +1,275 @@ +"""Tests for the fast eCPS column-parity check CLI. + +The module under test is loaded directly from its file path (not via +``import microplex_us...``) so these tests run with only ``pytest`` / +``h5py`` / ``numpy`` installed -- importing the ``microplex_us`` package +would pull ``microplex`` and torch. This mirrors the loader pattern in +``test_mp300k_artifact_gates.py``. 
+""" + +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +import pytest + +_MODULE_PATH = ( + Path(__file__).resolve().parents[2] + / "src" + / "microplex_us" + / "pipelines" + / "check_export_columns.py" +) +_spec = importlib.util.spec_from_file_location("check_export_columns", _MODULE_PATH) +cec = importlib.util.module_from_spec(_spec) +# Register before exec so the module's @dataclass can resolve its module. +sys.modules["check_export_columns"] = cec +_spec.loader.exec_module(cec) + +DEFAULT_CONTRACT_PATH = cec.DEFAULT_CONTRACT_PATH +compute_column_diff = cec.compute_column_diff +load_contract = cec.load_contract +main = cec.main + +# A tiny self-contained contract so most tests do not depend on the +# (large) committed contract. +TINY_CONTRACT = { + "required": ["age", "snap", "employment_income"], + "ecps_internal_optional": ["person_is_puf_clone"], + "forbidden": ["snap_reported", "ssi_reported"], +} + + +def _write_json(path: Path, obj) -> Path: + path.write_text(json.dumps(obj)) + return path + + +@pytest.fixture +def contract_path(tmp_path: Path) -> Path: + return _write_json(tmp_path / "contract.json", TINY_CONTRACT) + + +def _run_columns( + tmp_path: Path, + contract_path: Path, + columns: list[str], +) -> int: + cols_path = _write_json(tmp_path / "cols.json", columns) + return main( + [ + "--columns-json", + str(cols_path), + "--contract", + str(contract_path), + ] + ) + + +def test_main_clean_list_returns_zero(tmp_path, contract_path): + # required + optional, no forbidden -> pass. + cols = ["age", "snap", "employment_income", "person_is_puf_clone"] + assert _run_columns(tmp_path, contract_path, cols) == 0 + + +def test_main_missing_required_returns_one(tmp_path, contract_path): + # Drop a required column. 
+ cols = ["age", "snap"] # missing employment_income + assert _run_columns(tmp_path, contract_path, cols) == 1 + + +def test_main_forbidden_present_returns_one(tmp_path, contract_path): + # All required present, but a forbidden column is exported. + cols = ["age", "snap", "employment_income", "snap_reported"] + assert _run_columns(tmp_path, contract_path, cols) == 1 + + +def test_columns_json_path_collapses_period_suffix(tmp_path, contract_path): + # "name/period" entries collapse to the base name and still pass. + cols = ["age/2024", "snap/2024", "employment_income/2024"] + assert _run_columns(tmp_path, contract_path, cols) == 0 + + +def test_optional_column_is_neither_required_nor_forbidden(tmp_path, contract_path): + # Omitting an optional column does not fail; it is not "missing". + cols = ["age", "snap", "employment_income"] + assert _run_columns(tmp_path, contract_path, cols) == 0 + + +def test_main_h5_path_returns_zero_when_clean(tmp_path, contract_path): + h5py = pytest.importorskip("h5py") + import numpy as np + + # Mirror the eCPS export layout: each column is a group (name/period).
+ h5_path = tmp_path / "export.h5" + with h5py.File(h5_path, "w") as f: + for col in ["age", "snap", "employment_income"]: + f.create_dataset(f"{col}/2024", data=np.array([1, 2, 3])) + rc = main([str(h5_path), "--contract", str(contract_path)]) + assert rc == 0 + + +def test_main_h5_path_flags_missing_required(tmp_path, contract_path): + h5py = pytest.importorskip("h5py") + import numpy as np + + h5_path = tmp_path / "export.h5" + with h5py.File(h5_path, "w") as f: + # missing employment_income + for col in ["age", "snap"]: + f.create_dataset(f"{col}/2024", data=np.array([1, 2, 3])) + rc = main([str(h5_path), "--contract", str(contract_path)]) + assert rc == 1 + + +def test_main_h5_path_flags_forbidden_present(tmp_path, contract_path): + h5py = pytest.importorskip("h5py") + import numpy as np + + h5_path = tmp_path / "export.h5" + with h5py.File(h5_path, "w") as f: + for col in ["age", "snap", "employment_income", "snap_reported"]: + f.create_dataset(f"{col}/2024", data=np.array([1, 2, 3])) + rc = main([str(h5_path), "--contract", str(contract_path)]) + assert rc == 1 + + +def test_main_h5_path_accepts_flat_datasets(tmp_path, contract_path): + # A flat dataset layout (no period sub-group) is also accepted. + h5py = pytest.importorskip("h5py") + import numpy as np + + h5_path = tmp_path / "export.h5" + with h5py.File(h5_path, "w") as f: + for col in ["age", "snap", "employment_income"]: + f.create_dataset(col, data=np.array([1, 2, 3])) + rc = main([str(h5_path), "--contract", str(contract_path)]) + assert rc == 0 + + +def test_main_requires_exactly_one_input(tmp_path, contract_path): + # Neither input -> argparse error (SystemExit code 2). + with pytest.raises(SystemExit) as exc: + main(["--contract", str(contract_path)]) + assert exc.value.code == 2 + + # Both inputs -> argparse error. 
+ cols_path = _write_json(tmp_path / "c.json", ["age"]) + with pytest.raises(SystemExit) as exc: + main( + [ + str(tmp_path / "x.h5"), + "--columns-json", + str(cols_path), + "--contract", + str(contract_path), + ] + ) + assert exc.value.code == 2 + + +def test_compute_column_diff_categories(): + diff = compute_column_diff( + {"age", "snap", "snap_reported", "mystery"}, + required={"age", "snap", "wages"}, + forbidden={"snap_reported"}, + optional={"person_is_puf_clone"}, + ) + assert diff.missing_required == ["wages"] + assert diff.forbidden_present == ["snap_reported"] + assert diff.extra_unknown == ["mystery"] + assert diff.ok is False + + +def test_load_contract_rejects_missing_keys(tmp_path): + bad = _write_json(tmp_path / "bad.json", {"required": ["age"]}) + with pytest.raises(ValueError, match="forbidden"): + load_contract(bad) + + +def test_committed_contract_parses_with_expected_categories(): + contract = load_contract(DEFAULT_CONTRACT_PATH) + for key in ( + "required", + "ecps_internal_optional", + "forbidden", + "formula_owned_excluded", + ): + assert key in contract, f"contract missing '{key}'" + assert isinstance(contract[key], list) + # Category sizes of the eCPS contract, aligned to the clone-correct baseline + # H5 (postfix_clonecorrect): required exports the 5 *_desired retirement + # INPUTS (not the bare formula-computed columns), forbids the + # PUF_REPORTED_CALCULATED_TAX_OUTPUT_VARIABLES tax-credit outputs, and + # excludes only weeks_worked (the lone pe-us formula var the baseline does + # not persist). Structural/overridable computed fields + # (has_tin/has_itin/in_nyc/fsla_overtime_premium/meets_ssi_disability_criteria) + # are REQUIRED, matching the in-tree _column_contract_gate. + # Sizes sum to the 252-column clone-correct baseline: 246 + 5 + 1. 
+ assert len(contract["required"]) == 246 + assert len(contract["ecps_internal_optional"]) == 5 + assert len(contract["forbidden"]) == 22 + assert len(contract["formula_owned_excluded"]) == 1 + # Categories must be disjoint. + req = set(contract["required"]) + opt = set(contract["ecps_internal_optional"]) + forb = set(contract["forbidden"]) + excl = set(contract["formula_owned_excluded"]) + assert req.isdisjoint(opt) + assert req.isdisjoint(forb) + assert opt.isdisjoint(forb) + assert excl.isdisjoint(req) + assert excl.isdisjoint(forb) + # The clone-bookkeeping flags are optional, not required. + assert "person_is_puf_clone" in opt + assert "person_is_puf_clone" not in req + # Structural/overridable computed fields are REQUIRED (in-tree gate parity), + # NOT excluded; only weeks_worked is excluded. + for structural in ( + "has_tin", + "has_itin", + "in_nyc", + "fsla_overtime_premium", + "meets_ssi_disability_criteria", + "difficulty_hearing", + ): + assert structural in req + assert excl == {"weeks_worked"} + + +def test_committed_clean_fixture_passes_committed_contract(): + # The CI fixture must be a clean, passing set against the real + # contract so the green CI path proves the gate passes on good data. + fixture = Path(__file__).parent / "fixtures" / "ecps_clean_columns.json" + rc = main(["--columns-json", str(fixture)]) + assert rc == 0 + + +def test_committed_contract_covers_every_baseline_column(): + # Completeness invariant: every column the clean baseline fixture exports + # must be accounted for by some contract category, so a baseline-shaped + # export produces no extra_unknown columns. This pins the contract to the + # real baseline and catches silent under-specification of `required`. 
+ contract = load_contract(DEFAULT_CONTRACT_PATH) + fixture = Path(__file__).parent / "fixtures" / "ecps_clean_columns.json" + present = set(json.loads(fixture.read_text())) + diff = compute_column_diff( + present, + required=set(contract["required"]), + forbidden=set(contract["forbidden"]), + optional=set(contract["ecps_internal_optional"]), + excluded=set(contract["formula_owned_excluded"]), + ) + assert diff.extra_unknown == [] + assert diff.missing_required == [] + assert diff.forbidden_present == [] + + +def test_default_contract_path_is_packaged(): + # The contract ships next to the module so the default path resolves. + assert DEFAULT_CONTRACT_PATH.name == "ecps_export_contract.json" + assert DEFAULT_CONTRACT_PATH.exists() + assert callable(cec.main)