Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ node_modules
!population_by_state.csv
!aca_spending_and_enrollment_2024.csv
!aca_spending_and_enrollment_2025.csv
!policyengine_us_data/storage/calibration_targets/acs_housing_costs_2024.csv
!real_estate_taxes_by_state_acs.csv
!snap_state.csv
!age_state.csv
Expand Down
1 change: 1 addition & 0 deletions changelog.d/831.changed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Refined national ECPS calibration targets to remove circular survey/SPM constructs while keeping the defensible rent, property tax, and childcare targets; the private-transfer balance constraints; the structured EITC-by-AGI-and-child-count SOI targets; and the taxable-filer AGI/count targets by AGI band and filing status. Added a national target parity manifest utility to classify legacy `build_loss_matrix()` labels against structured `policy_data.db` target rows.
1 change: 1 addition & 0 deletions changelog.d/845.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add national WIC calibration targets for FY 2024 food costs and average monthly participation.
5 changes: 5 additions & 0 deletions policyengine_us_data/calibration/calibration_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,11 @@ def apply_op(values: np.ndarray, op: str, val: str) -> np.ndarray:
if values.dtype.kind == "S" and isinstance(parsed, str):
parsed = parsed.encode()

if op == "in":
allowed = [part.strip() for part in val.split("|")]
if values.dtype.kind == "S":
allowed = [part.encode() for part in allowed]
return np.isin(values, allowed)
if op in ("==", "="):
return values == parsed
if op == ">":
Expand Down
43 changes: 24 additions & 19 deletions policyengine_us_data/calibration/target_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,18 @@ include:
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income,income_tax_before_credits
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits
- variable: adjusted_gross_income
geo_level: national
domain_variable: adjusted_gross_income,filing_status,income_tax_before_credits

# === NATIONAL — wealth target (Federal Reserve SCF, no filer filter) ===
- variable: net_worth
Expand All @@ -108,17 +120,7 @@ include:
# === NATIONAL — aggregate dollar targets ===
- variable: adjusted_gross_income
geo_level: national
- variable: alimony_expense
geo_level: national
- variable: alimony_income
geo_level: national
- variable: child_support_expense
geo_level: national
- variable: child_support_received
geo_level: national
- variable: employer_sponsored_insurance_premiums
geo_level: national
- variable: health_insurance_premiums_without_medicare_part_b
- variable: childcare_expenses
geo_level: national
- variable: long_term_capital_gains
geo_level: national
Expand All @@ -129,10 +131,6 @@ include:
domain_variable: medicare_enrolled
- variable: medicare_part_b_premium
geo_level: national
- variable: other_medical_expenses
geo_level: national
- variable: over_the_counter_health_expenses
geo_level: national
- variable: real_estate_taxes
geo_level: national
- variable: rent
Expand All @@ -149,10 +147,6 @@ include:
geo_level: national
- variable: social_security_survivors
geo_level: national
- variable: spm_unit_capped_housing_subsidy
geo_level: national
- variable: spm_unit_capped_work_childcare_expenses
geo_level: national
- variable: ssi
geo_level: national
- variable: tanf
Expand All @@ -164,6 +158,8 @@ include:
geo_level: national
- variable: unemployment_compensation
geo_level: national
- variable: wic
geo_level: national

# === NATIONAL — retirement contribution targets ===
- variable: traditional_ira_contributions
Expand All @@ -184,6 +180,9 @@ include:
- variable: eitc
geo_level: national
domain_variable: eitc_child_count
- variable: eitc
geo_level: national
domain_variable: adjusted_gross_income,eitc,eitc_child_count
- variable: net_capital_gains
geo_level: national
domain_variable: net_capital_gains
Expand Down Expand Up @@ -294,6 +293,9 @@ include:
- variable: tax_unit_count
geo_level: national
domain_variable: eitc_child_count
- variable: tax_unit_count
geo_level: national
domain_variable: adjusted_gross_income,eitc,eitc_child_count
# Restore old loss.py's ACA enrollment count target.
- variable: person_count
geo_level: national
Expand All @@ -319,6 +321,9 @@ include:
- variable: person_count
geo_level: national
domain_variable: ssn_card_type
- variable: person_count
geo_level: national
domain_variable: wic

# === NATIONAL — SOI deduction totals (non-reform) ===
- variable: charitable_deduction
Expand Down
25 changes: 22 additions & 3 deletions policyengine_us_data/datasets/cps/enhanced_cps.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from policyengine_core.data import Dataset
import pandas as pd
from policyengine_us_data.utils import (
ABSOLUTE_ERROR_SCALE_TARGETS,
build_loss_matrix,
get_target_error_normalisation,
HardConcrete,
print_reweighting_diagnostics,
set_seeds,
Expand Down Expand Up @@ -241,6 +243,10 @@ def reweight(
):
target_names = np.array(loss_matrix.columns)
is_national = loss_matrix.columns.str.startswith("nation/")
numerator_shift_np, error_denominator_np = get_target_error_normalisation(
target_names,
targets_array,
)
loss_matrix = torch.tensor(loss_matrix.values, dtype=torch.float32)
nation_normalisation_factor = is_national * (1 / is_national.sum())
state_normalisation_factor = ~is_national * (1 / (~is_national).sum())
Expand All @@ -249,6 +255,8 @@ def reweight(
)
normalisation_factor = torch.tensor(normalisation_factor, dtype=torch.float32)
targets_array = torch.tensor(targets_array, dtype=torch.float32)
numerator_shift = torch.tensor(numerator_shift_np, dtype=torch.float32)
error_denominator = torch.tensor(error_denominator_np, dtype=torch.float32)

inv_mean_normalisation = 1 / np.mean(normalisation_factor.numpy())

Expand All @@ -260,7 +268,9 @@ def loss(weights):
estimate = weights @ loss_matrix
if torch.isnan(estimate).any():
raise ValueError("Estimate contains NaNs")
rel_error = (((estimate - targets_array) + 1) / (targets_array + 1)) ** 2
rel_error = (
(estimate - targets_array + numerator_shift) / error_denominator
) ** 2
rel_error_normalized = inv_mean_normalisation * rel_error * normalisation_factor
if torch.isnan(rel_error_normalized).any():
raise ValueError("Relative error contains NaNs")
Expand Down Expand Up @@ -304,7 +314,10 @@ def loss(weights):
)
df["epoch"] = i
df["error"] = df.estimate - df.target
df["rel_error"] = df.error / df.target
df["error_denominator"] = error_denominator.detach().numpy()
df["rel_error"] = (
df.error + numerator_shift.detach().numpy()
) / df.error_denominator
df["abs_error"] = df.error.abs()
df["rel_abs_error"] = df.rel_error.abs()
df["loss"] = df.rel_abs_error**2
Expand All @@ -331,6 +344,7 @@ def loss(weights):
loss_matrix,
targets_array,
"L0 Sparse Solution",
target_names=target_names,
)

return final_weights_sparse
Expand Down Expand Up @@ -376,7 +390,12 @@ def generate(self):
# Run the optimization procedure to get (close to) minimum loss weights
for year in range(self.start_year, self.end_year + 1):
loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1)
scaled_zero_target_mask = loss_matrix.columns.isin(
ABSOLUTE_ERROR_SCALE_TARGETS.keys()
)
zero_mask = np.isclose(targets_array, 0.0, atol=0.1) & (
~scaled_zero_target_mask
)
bad_mask = loss_matrix.columns.isin(bad_targets)
keep_mask_bool = ~(zero_mask | bad_mask)
keep_idx = np.where(keep_mask_bool)[0]
Expand Down
18 changes: 18 additions & 0 deletions policyengine_us_data/db/DATABASE_GUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,24 @@ rm -f policyengine_us_data/storage/calibration/policy_data.db
make database
```

### Legacy National Target Parity

The legacy national Enhanced CPS pipeline still builds labels through
`policyengine_us_data.utils.loss.build_loss_matrix()`. To audit whether those
labels correspond to structured rows in `policy_data.db`, build a parity
manifest:

```bash
python -m policyengine_us_data.utils.national_target_parity \
--dataset-path policyengine_us_data/storage/enhanced_cps_2024.h5 \
--target-db policyengine_us_data/storage/calibration/policy_data.db \
--period 2024 \
--output national_target_parity.json
```

Each national loss label is classified as `matched` with a target ID or as
`legacy_only` with an explicit reason.

## Database Schema

### Core Tables
Expand Down
1 change: 1 addition & 0 deletions policyengine_us_data/db/create_field_valid_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def populate_field_valid_values(session: Session) -> None:
operation_values = [
("operation", "==", "Equals"),
("operation", "!=", "Not equals"),
("operation", "in", "In pipe-delimited set"),
("operation", ">", "Greater than"),
("operation", ">=", "Greater than or equal"),
("operation", "<", "Less than"),
Expand Down
Loading