Merged
6 changes: 3 additions & 3 deletions changelog_entry.yaml
@@ -1,4 +1,4 @@
- bump: patch
- bump: minor
changes:
fixed:
- Versioning workflow checkout for push events
added:
- tests to verify SparseMatrixBuilder correctly calculates variables and constraints into the calibration matrix.
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/cps/cps.py
@@ -15,7 +15,6 @@
from microimpute.models.qrf import QRF
import logging


test_lite = os.environ.get("TEST_LITE") == "true"
print(f"TEST_LITE == {test_lite}")

1 change: 0 additions & 1 deletion policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -22,7 +22,6 @@
from pathlib import Path
import logging


try:
import torch
except ImportError:
@@ -17,7 +17,6 @@
StateCode,
)


# State/Geographic Mappings
STATE_CODES = {
1: "AL",
@@ -193,15 +192,21 @@ def get_calculated_variables(sim) -> List[str]:
"""
Return variables that should be cleared for state-swap recalculation.

Includes variables with formulas, adds, or subtracts.

Excludes ID variables (person_id, household_id, etc.) because:
1. They have formulas that generate sequential IDs (0, 1, 2, ...)
2. We need the original H5 values, not regenerated sequences
3. PolicyEngine's random() function uses entity IDs as seeds:
seed = abs(entity_id * 100 + count_random_calls)
If IDs change, random-dependent variables (SSI resource test,
WIC nutritional risk, WIC takeup) produce different results.
Includes variables with formulas, or adds/subtracts that are lists.

Excludes:
1. ID variables (person_id, household_id, etc.) - needed for random seeds
2. Variables with string adds/subtracts (parameter paths) - these are
pseudo-inputs stored in H5 that would recalculate differently using
parameter lookups. Example: pre_tax_contributions.
3. Variables in input_variables (have stored H5 values) even if they
have formulas - the stored values represent original survey data
that should be preserved. Examples: cdcc_relevant_expenses, rent.

The exclusions are critical because:
- The H5 file stores pre-computed values from original CPS processing
- If deleted, recalculation produces different values, corrupting
downstream calculations like income_tax
"""
exclude_ids = {
"person_id",
@@ -211,16 +216,36 @@ def get_calculated_variables(sim) -> List[str]:
"family_id",
"marital_unit_id",
}
return [
name
for name, var in sim.tax_benefit_system.variables.items()
if (
var.formulas
or getattr(var, "adds", None)
or getattr(var, "subtracts", None)
)
and name not in exclude_ids
]

# Get stored input variables to exclude
input_vars = set(sim.input_variables)

result = []
for name, var in sim.tax_benefit_system.variables.items():
if name in exclude_ids:
continue

# Exclude variables that have stored values (input_variables)
# These represent original survey data that should be preserved
if name in input_vars:
continue

# Include if has formulas
if var.formulas:
result.append(name)
continue

# Include if adds/subtracts is a list (explicit component aggregation)
# Exclude if adds/subtracts is a string (parameter path - pseudo-input)
adds = getattr(var, "adds", None)
subtracts = getattr(var, "subtracts", None)

if adds and isinstance(adds, list):
result.append(name)
elif subtracts and isinstance(subtracts, list):
result.append(name)

return result


def get_pseudo_input_variables(sim) -> set:
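A quick way to sanity-check the new selection rule is to exercise it against stand-in variable objects. This is a minimal sketch under assumptions: `FakeVar`, the variable wiring, and the stored-input set are invented for illustration, not taken from policyengine-us.

```python
from dataclasses import dataclass, field


@dataclass
class FakeVar:
    formulas: dict = field(default_factory=dict)  # non-empty -> has formulas
    adds: object = None  # list -> component aggregation; str -> parameter path
    subtracts: object = None


# Invented wiring for illustration only.
variables = {
    "employment_income": FakeVar(),  # pure input: never cleared
    "income_tax": FakeVar(formulas={"2024-01-01": object()}),  # formula: cleared
    "total_benefits": FakeVar(adds=["snap", "ssi"]),  # list adds: cleared
    "pre_tax_contributions": FakeVar(adds="gov.some.parameter.path"),  # pseudo-input: kept
}
stored_inputs = {"employment_income"}  # stands in for sim.input_variables

cleared = [
    name
    for name, var in variables.items()
    if name not in stored_inputs
    and (
        var.formulas
        or (var.adds and isinstance(var.adds, list))
        or (var.subtracts and isinstance(var.subtracts, list))
    )
]
print(cleared)  # ['income_tax', 'total_benefits']
```

The string-valued `adds` falls through every branch, which is exactly how the PR keeps parameter-path pseudo-inputs like `pre_tax_contributions` out of the clear list.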
@@ -46,7 +46,6 @@
create_target_groups,
)


logger = logging.getLogger(__name__)


@@ -38,6 +38,105 @@ def __init__(
self.time_period = time_period
self.cds_to_calibrate = cds_to_calibrate
self.dataset_path = dataset_path
self._entity_rel_cache = None

def _build_entity_relationship(self, sim) -> pd.DataFrame:
"""
Build entity relationship DataFrame mapping persons to all entity IDs.

This is used to evaluate constraints at the person level and then
aggregate to household level, handling variables defined at different
entity levels (person, tax_unit, household, spm_unit).

Returns:
DataFrame with person_id, household_id, tax_unit_id, spm_unit_id
"""
if self._entity_rel_cache is not None:
return self._entity_rel_cache

self._entity_rel_cache = pd.DataFrame(
{
"person_id": sim.calculate(
"person_id", map_to="person"
).values,
"household_id": sim.calculate(
"household_id", map_to="person"
).values,
"tax_unit_id": sim.calculate(
"tax_unit_id", map_to="person"
).values,
"spm_unit_id": sim.calculate(
"spm_unit_id", map_to="person"
).values,
}
)
return self._entity_rel_cache

def _evaluate_constraints_entity_aware(
self, state_sim, constraints: List[dict], n_households: int
) -> np.ndarray:
"""
Evaluate non-geographic constraints at person level, then aggregate
to household level using .any().

This properly handles constraints on variables defined at different
entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of
summing values at household level (which would give 2, 3, etc. for
households with multiple tax units), we evaluate at person level and
use .any() aggregation ("does this household have at least one person
satisfying all constraints?").

Args:
state_sim: Microsimulation with state_fips set
constraints: List of constraint dicts with variable, operation,
value keys (geographic constraints should be pre-filtered)
n_households: Number of households

Returns:
Boolean mask array of length n_households
"""
if not constraints:
return np.ones(n_households, dtype=bool)

entity_rel = self._build_entity_relationship(state_sim)
n_persons = len(entity_rel)

person_mask = np.ones(n_persons, dtype=bool)

for c in constraints:
var = c["variable"]
op = c["operation"]
val = c["value"]

# Calculate constraint variable at person level
constraint_values = state_sim.calculate(
var, map_to="person"
).values

# Apply operation at person level
person_mask &= apply_op(constraint_values, op, val)

# Aggregate to household level using .any()
# "At least one person in this household satisfies ALL constraints"
entity_rel_with_mask = entity_rel.copy()
entity_rel_with_mask["satisfies"] = person_mask

household_mask_series = entity_rel_with_mask.groupby("household_id")[
"satisfies"
].any()

# Ensure we return a mask aligned with household order
household_ids = state_sim.calculate(
"household_id", map_to="household"
).values
household_mask = np.array(
[
household_mask_series.get(hh_id, False)
for hh_id in household_ids
]
)

return household_mask

def _query_targets(self, target_filter: dict) -> pd.DataFrame:
"""Query targets based on filter criteria using OR logic."""
@@ -166,6 +265,9 @@ def build_matrix(
cds_by_state[state].append((cd_idx, cd))

for state, cd_list in cds_by_state.items():
# Clear entity relationship cache when creating new simulation
self._entity_rel_cache = None

if self.dataset_path:
state_sim = self._create_state_sim(state, n_households)
else:
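Because `_build_entity_relationship` memoizes its DataFrame on the builder, rows from one state's simulation would otherwise leak into the next. A condensed view of the intended lifecycle, with the loop body trimmed to the relevant calls (comments are editorial):

```python
for state, cd_list in cds_by_state.items():
    self._entity_rel_cache = None  # invalidate: the new sim has different entities
    state_sim = self._create_state_sim(state, n_households)
    # The first entity-aware constraint evaluation for this state rebuilds
    # the cache; subsequent CDs in cd_list reuse it.
```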
@@ -184,35 +286,43 @@
for row_idx, (_, target) in enumerate(targets_df.iterrows()):
constraints = self._get_constraints(target["stratum_id"])

mask = np.ones(n_households, dtype=bool)
geo_constraints = []
non_geo_constraints = []
for c in constraints:
if c["variable"] in (
"state_fips",
"congressional_district_geoid",
):
geo_constraints.append(c)
else:
non_geo_constraints.append(c)

# Check geographic constraints first (quick fail)
geo_mask = np.ones(n_households, dtype=bool)
for c in geo_constraints:
if c["variable"] == "congressional_district_geoid":
if (
c["operation"] in ("==", "=")
and c["value"] != cd
):
mask[:] = False
geo_mask[:] = False
elif c["variable"] == "state_fips":
if (
c["operation"] in ("==", "=")
and int(c["value"]) != state
):
mask[:] = False
else:
try:
values = state_sim.calculate(
c["variable"], map_to="household"
).values
mask &= apply_op(
values, c["operation"], c["value"]
)
except Exception as e:
# Variable may not exist or may not be
# calculable at household level - skip
logger.debug(
f"Could not evaluate constraint "
f"{c['variable']}: {e}"
)
geo_mask[:] = False

if not geo_mask.any():
continue

# Evaluate non-geographic constraints at entity level
entity_mask = self._evaluate_constraints_entity_aware(
state_sim, non_geo_constraints, n_households
)

# Combine geographic and entity-aware masks
mask = geo_mask & entity_mask

if not mask.any():
continue
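Putting the pieces together, the reworked row loop is a cheap geographic pre-filter followed by the entity-aware pass. The helper below is a schematic restatement under assumptions (a standalone function with the entity-aware evaluator passed in as a callable), not code from the PR:

```python
import numpy as np

GEO_VARS = ("state_fips", "congressional_district_geoid")


def build_row_mask(constraints, cd, state, n_households, evaluate_entity_aware):
    """Schematic of the two-stage masking in build_matrix."""
    geo = [c for c in constraints if c["variable"] in GEO_VARS]
    non_geo = [c for c in constraints if c["variable"] not in GEO_VARS]

    # Stage 1: geographic quick fail - no simulation calls required.
    geo_mask = np.ones(n_households, dtype=bool)
    for c in geo:
        if c["variable"] == "congressional_district_geoid":
            if c["operation"] in ("==", "=") and c["value"] != cd:
                geo_mask[:] = False
        elif c["operation"] in ("==", "=") and int(c["value"]) != state:
            geo_mask[:] = False
    if not geo_mask.any():
        return None  # caller skips this target row without touching the sim

    # Stage 2: person-level evaluation aggregated to households.
    return geo_mask & evaluate_entity_aware(non_geo, n_households)
```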
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/puf/puf.py
@@ -15,7 +15,6 @@
create_policyengine_uprating_factors_table,
)


rng = np.random.default_rng(seed=64)

# Get Qualified Business Income simulation parameters ---
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/puf/uprate_puf.py
@@ -2,7 +2,6 @@
import numpy as np
from policyengine_us_data.storage import STORAGE_FOLDER


ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts

USE_VARIABLE_SPECIFIC_POPULATION_GROWTH_DIVISORS = False
1 change: 0 additions & 1 deletion policyengine_us_data/db/create_database_tables.py
@@ -15,7 +15,6 @@

from policyengine_us_data.storage import STORAGE_FOLDER


logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
1 change: 0 additions & 1 deletion policyengine_us_data/db/etl_age.py
@@ -11,7 +11,6 @@
)
from policyengine_us_data.utils.census import get_census_docs, pull_acs_table


LABEL_TO_SHORT = {
"Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4",
"Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9",
1 change: 0 additions & 1 deletion policyengine_us_data/db/etl_irs_soi.py
@@ -24,7 +24,6 @@
get_district_mapping,
)


"""See the 22incddocguide.docx manual from the IRS SOI"""
# Let's make this work with strict inequalities
# Language in the doc: '$10,000 under $25,000'
1 change: 0 additions & 1 deletion policyengine_us_data/db/validate_database.py
@@ -9,7 +9,6 @@
import pandas as pd
from policyengine_us.system import system


conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db")

stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn)
@@ -9,7 +9,6 @@
STATE_NAME_TO_ABBREV,
)


STATE_NAME_TO_FIPS = {
"Alabama": "01",
"Alaska": "02",
@@ -10,7 +10,6 @@
LOCAL_FOLDER,
)


# Sample data that mimics the format from census.gov
SAMPLE_CENSUS_DATA = """STATE|STATEFP|COUNTYFP|COUNTYNAME
AL|01|001|Autauga County