Merged
6 changes: 3 additions & 3 deletions changelog_entry.yaml
@@ -1,4 +1,4 @@
- bump: patch
- bump: minor
changes:
fixed:
- Versioning workflow checkout for push events
added:
- tests to verify SparseMatrixBuilder correctly calculates variables and constraints into the calibration matrix.
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/cps/cps.py
@@ -15,7 +15,6 @@
from microimpute.models.qrf import QRF
import logging


test_lite = os.environ.get("TEST_LITE") == "true"
print(f"TEST_LITE == {test_lite}")

1 change: 0 additions & 1 deletion policyengine_us_data/datasets/cps/enhanced_cps.py
@@ -22,7 +22,6 @@
from pathlib import Path
import logging


try:
import torch
except ImportError:
@@ -17,7 +17,6 @@
StateCode,
)


# State/Geographic Mappings
STATE_CODES = {
1: "AL",
@@ -193,15 +192,21 @@ def get_calculated_variables(sim) -> List[str]:
"""
Return variables that should be cleared for state-swap recalculation.

Includes variables with formulas, adds, or subtracts.

Excludes ID variables (person_id, household_id, etc.) because:
1. They have formulas that generate sequential IDs (0, 1, 2, ...)
2. We need the original H5 values, not regenerated sequences
3. PolicyEngine's random() function uses entity IDs as seeds:
seed = abs(entity_id * 100 + count_random_calls)
If IDs change, random-dependent variables (SSI resource test,
WIC nutritional risk, WIC takeup) produce different results.
Includes variables with formulas, or adds/subtracts that are lists.

Excludes:
1. ID variables (person_id, household_id, etc.) - needed for random seeds
2. Variables with string adds/subtracts (parameter paths) - these are
pseudo-inputs stored in H5 that would recalculate differently using
parameter lookups. Example: pre_tax_contributions.
3. Variables in input_variables (have stored H5 values) even if they
have formulas - the stored values represent original survey data
that should be preserved. Examples: cdcc_relevant_expenses, rent.

The exclusions are critical because:
- The H5 file stores pre-computed values from original CPS processing
- If deleted, recalculation produces different values, corrupting
downstream calculations like income_tax
"""
exclude_ids = {
"person_id",
@@ -211,16 +216,36 @@ def get_calculated_variables(sim) -> List[str]:
"family_id",
"marital_unit_id",
}
return [
name
for name, var in sim.tax_benefit_system.variables.items()
if (
var.formulas
or getattr(var, "adds", None)
or getattr(var, "subtracts", None)
)
and name not in exclude_ids
]

# Get stored input variables to exclude
input_vars = set(sim.input_variables)

result = []
for name, var in sim.tax_benefit_system.variables.items():
if name in exclude_ids:
continue

# Exclude variables that have stored values (input_variables)
# These represent original survey data that should be preserved
if name in input_vars:
continue

# Include if has formulas
if var.formulas:
result.append(name)
continue

# Include if adds/subtracts is a list (explicit component aggregation)
# Exclude if adds/subtracts is a string (parameter path - pseudo-input)
adds = getattr(var, "adds", None)
subtracts = getattr(var, "subtracts", None)

if adds and isinstance(adds, list):
result.append(name)
elif subtracts and isinstance(subtracts, list):
result.append(name)

return result


def get_pseudo_input_variables(sim) -> set:
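A quick way to sanity-check the new selection rule is to exercise it against stand-in variable objects. This is a minimal sketch under assumptions: `FakeVar`, the variable wiring, and the stored-input set are invented for illustration, not taken from policyengine-us.

```python
from dataclasses import dataclass, field


@dataclass
class FakeVar:
    formulas: dict = field(default_factory=dict)  # non-empty -> has formulas
    adds: object = None  # list -> component aggregation; str -> parameter path
    subtracts: object = None


# Invented wiring for illustration only.
variables = {
    "employment_income": FakeVar(),  # pure input: never cleared
    "income_tax": FakeVar(formulas={"2024-01-01": object()}),  # formula: cleared
    "total_benefits": FakeVar(adds=["snap", "ssi"]),  # list adds: cleared
    "pre_tax_contributions": FakeVar(adds="gov.some.parameter.path"),  # pseudo-input: kept
}
stored_inputs = {"employment_income"}  # stands in for sim.input_variables

cleared = [
    name
    for name, var in variables.items()
    if name not in stored_inputs
    and (
        var.formulas
        or (var.adds and isinstance(var.adds, list))
        or (var.subtracts and isinstance(var.subtracts, list))
    )
]
print(cleared)  # ['income_tax', 'total_benefits']
```

The string-valued `adds` falls through every branch, which is exactly how the PR keeps parameter-path pseudo-inputs like `pre_tax_contributions` out of the clear list.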
@@ -46,7 +46,6 @@
create_target_groups,
)


logger = logging.getLogger(__name__)


@@ -38,6 +38,105 @@ def __init__(
self.time_period = time_period
self.cds_to_calibrate = cds_to_calibrate
self.dataset_path = dataset_path
self._entity_rel_cache = None

def _build_entity_relationship(self, sim) -> pd.DataFrame:
"""
Build entity relationship DataFrame mapping persons to all entity IDs.

This is used to evaluate constraints at the person level and then
aggregate to household level, handling variables defined at different
entity levels (person, tax_unit, household, spm_unit).

Returns:
DataFrame with person_id, household_id, tax_unit_id, spm_unit_id
"""
if self._entity_rel_cache is not None:
return self._entity_rel_cache

self._entity_rel_cache = pd.DataFrame(
{
"person_id": sim.calculate(
"person_id", map_to="person"
).values,
"household_id": sim.calculate(
"household_id", map_to="person"
).values,
"tax_unit_id": sim.calculate(
"tax_unit_id", map_to="person"
).values,
"spm_unit_id": sim.calculate(
"spm_unit_id", map_to="person"
).values,
}
)
return self._entity_rel_cache

def _evaluate_constraints_entity_aware(
self, state_sim, constraints: List[dict], n_households: int
) -> np.ndarray:
"""
Evaluate non-geographic constraints at person level, then aggregate
to household level using .any().

This properly handles constraints on variables defined at different
entity levels (e.g., tax_unit_is_filer at tax_unit level). Instead of
summing values at household level (which would give 2, 3, etc. for
households with multiple tax units), we evaluate at person level and
use .any() aggregation ("does this household have at least one person
satisfying all constraints?").

Args:
state_sim: Microsimulation with state_fips set
constraints: List of constraint dicts with variable, operation,
value keys (geographic constraints should be pre-filtered)
n_households: Number of households

Returns:
Boolean mask array of length n_households
"""
if not constraints:
return np.ones(n_households, dtype=bool)

entity_rel = self._build_entity_relationship(state_sim)
n_persons = len(entity_rel)

person_mask = np.ones(n_persons, dtype=bool)

for c in constraints:
var = c["variable"]
op = c["operation"]
val = c["value"]

# Calculate constraint variable at person level
constraint_values = state_sim.calculate(
var, map_to="person"
).values

# Apply operation at person level
person_mask &= apply_op(constraint_values, op, val)

# Aggregate to household level using .any()
# "At least one person in this household satisfies ALL constraints"
entity_rel_with_mask = entity_rel.copy()
entity_rel_with_mask["satisfies"] = person_mask

household_mask_series = entity_rel_with_mask.groupby("household_id")[
"satisfies"
].any()

# Ensure we return a mask aligned with household order
household_ids = state_sim.calculate(
"household_id", map_to="household"
).values
household_mask = np.array(
[
household_mask_series.get(hh_id, False)
for hh_id in household_ids
]
)

return household_mask

def _query_targets(self, target_filter: dict) -> pd.DataFrame:
"""Query targets based on filter criteria using OR logic."""
@@ -166,6 +265,9 @@ def build_matrix(
cds_by_state[state].append((cd_idx, cd))

for state, cd_list in cds_by_state.items():
# Clear entity relationship cache when creating new simulation
self._entity_rel_cache = None

if self.dataset_path:
state_sim = self._create_state_sim(state, n_households)
else:
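Because `_build_entity_relationship` memoizes its DataFrame on the builder, rows from one state's simulation would otherwise leak into the next. A condensed view of the intended lifecycle, with the loop body trimmed to the relevant calls (comments are editorial):

```python
for state, cd_list in cds_by_state.items():
    self._entity_rel_cache = None  # invalidate: the new sim has different entities
    state_sim = self._create_state_sim(state, n_households)
    # The first entity-aware constraint evaluation for this state rebuilds
    # the cache; subsequent CDs in cd_list reuse it.
```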
@@ -184,35 +286,43 @@
for row_idx, (_, target) in enumerate(targets_df.iterrows()):
constraints = self._get_constraints(target["stratum_id"])

mask = np.ones(n_households, dtype=bool)
geo_constraints = []
non_geo_constraints = []
for c in constraints:
if c["variable"] in (
"state_fips",
"congressional_district_geoid",
):
geo_constraints.append(c)
else:
non_geo_constraints.append(c)

# Check geographic constraints first (quick fail)
geo_mask = np.ones(n_households, dtype=bool)
for c in geo_constraints:
if c["variable"] == "congressional_district_geoid":
if (
c["operation"] in ("==", "=")
and c["value"] != cd
):
mask[:] = False
geo_mask[:] = False
elif c["variable"] == "state_fips":
if (
c["operation"] in ("==", "=")
and int(c["value"]) != state
):
mask[:] = False
else:
try:
values = state_sim.calculate(
c["variable"], map_to="household"
).values
mask &= apply_op(
values, c["operation"], c["value"]
)
except Exception as e:
# Variable may not exist or may not be
# calculable at household level - skip
logger.debug(
f"Could not evaluate constraint "
f"{c['variable']}: {e}"
)
geo_mask[:] = False

if not geo_mask.any():
continue

# Evaluate non-geographic constraints at entity level
entity_mask = self._evaluate_constraints_entity_aware(
state_sim, non_geo_constraints, n_households
)

# Combine geographic and entity-aware masks
mask = geo_mask & entity_mask

if not mask.any():
continue
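Putting the pieces together, the reworked row loop is a cheap geographic pre-filter followed by the entity-aware pass. The helper below is a schematic restatement under assumptions (a standalone function with the entity-aware evaluator passed in as a callable), not code from the PR:

```python
import numpy as np

GEO_VARS = ("state_fips", "congressional_district_geoid")


def build_row_mask(constraints, cd, state, n_households, evaluate_entity_aware):
    """Schematic of the two-stage masking in build_matrix."""
    geo = [c for c in constraints if c["variable"] in GEO_VARS]
    non_geo = [c for c in constraints if c["variable"] not in GEO_VARS]

    # Stage 1: geographic quick fail - no simulation calls required.
    geo_mask = np.ones(n_households, dtype=bool)
    for c in geo:
        if c["variable"] == "congressional_district_geoid":
            if c["operation"] in ("==", "=") and c["value"] != cd:
                geo_mask[:] = False
        elif c["operation"] in ("==", "=") and int(c["value"]) != state:
            geo_mask[:] = False
    if not geo_mask.any():
        return None  # caller skips this target row without touching the sim

    # Stage 2: person-level evaluation aggregated to households.
    return geo_mask & evaluate_entity_aware(non_geo, n_households)
```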
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/puf/puf.py
@@ -15,7 +15,6 @@
create_policyengine_uprating_factors_table,
)


rng = np.random.default_rng(seed=64)

# Get Qualified Business Income simulation parameters ---
1 change: 0 additions & 1 deletion policyengine_us_data/datasets/puf/uprate_puf.py
@@ -2,7 +2,6 @@
import numpy as np
from policyengine_us_data.storage import STORAGE_FOLDER


ITMDED_GROW_RATE = 0.02 # annual growth rate in itemized deduction amounts

USE_VARIABLE_SPECIFIC_POPULATION_GROWTH_DIVISORS = False
1 change: 0 additions & 1 deletion policyengine_us_data/db/create_database_tables.py
@@ -15,7 +15,6 @@

from policyengine_us_data.storage import STORAGE_FOLDER


logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
1 change: 0 additions & 1 deletion policyengine_us_data/db/etl_age.py
@@ -11,7 +11,6 @@
)
from policyengine_us_data.utils.census import get_census_docs, pull_acs_table


LABEL_TO_SHORT = {
"Estimate!!Total!!Total population!!AGE!!Under 5 years": "0-4",
"Estimate!!Total!!Total population!!AGE!!5 to 9 years": "5-9",
1 change: 0 additions & 1 deletion policyengine_us_data/db/etl_irs_soi.py
@@ -24,7 +24,6 @@
get_district_mapping,
)


"""See the 22incddocguide.docx manual from the IRS SOI"""
# Let's make this work with strict inequalities
# Language in the doc: '$10,000 under $25,000'
1 change: 0 additions & 1 deletion policyengine_us_data/db/validate_database.py
@@ -9,7 +9,6 @@
import pandas as pd
from policyengine_us.system import system


conn = sqlite3.connect("policyengine_us_data/storage/policy_data.db")

stratum_constraints_df = pd.read_sql("SELECT * FROM stratum_constraints", conn)
@@ -9,7 +9,6 @@
STATE_NAME_TO_ABBREV,
)


STATE_NAME_TO_FIPS = {
"Alabama": "01",
"Alaska": "02",
@@ -10,7 +10,6 @@
LOCAL_FOLDER,
)


# Sample data that mimics the format from census.gov
SAMPLE_CENSUS_DATA = """STATE|STATEFP|COUNTYFP|COUNTYNAME
AL|01|001|Autauga County