PolicyEngine · MaxGhenis · May 12, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
diff --git a/.github/workflows/long_run_projection.yaml b/.github/workflows/long_run_projection.yaml
@@ -78,6 +78,16 @@ on:
         required: false
         default: ""
         type: string
+      support_augmentation_sanitize_worker_non_target_income:
+        description: "Zero out investment and retirement income on worker-sourced donor clone rows"
+        required: false
+        default: false
+        type: boolean
+      support_augmentation_sanitize_clone_non_target_income:
+        description: "Zero out investment and retirement income on all donor clone rows"
+        required: false
+        default: false
+        type: boolean
       allow_validation_failures:
         description: "Allow invalid artifacts to be written for diagnostics"
         required: false
@@ -146,6 +156,8 @@ jobs:
           SUPPORT_AUGMENTATION_DONORS_PER_TARGET: ${{ inputs.support_augmentation_donors_per_target }}
           SUPPORT_AUGMENTATION_MAX_DISTANCE: ${{ inputs.support_augmentation_max_distance }}
           SUPPORT_AUGMENTATION_PROFILE: ${{ inputs.support_augmentation_profile }}
+          SUPPORT_AUGMENTATION_SANITIZE_CLONE_NON_TARGET_INCOME: ${{ inputs.support_augmentation_sanitize_clone_non_target_income }}
+          SUPPORT_AUGMENTATION_SANITIZE_WORKER_NON_TARGET_INCOME: ${{ inputs.support_augmentation_sanitize_worker_non_target_income }}
           SUPPORT_AUGMENTATION_START_YEAR: ${{ inputs.support_augmentation_start_year }}
           SUPPORT_AUGMENTATION_TARGET_YEAR: ${{ inputs.support_augmentation_target_year }}
           SUPPORT_AUGMENTATION_TOP_N_TARGETS: ${{ inputs.support_augmentation_top_n_targets }}
@@ -199,6 +211,12 @@ jobs:
           if [ -n "${SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE}" ]; then
             cmd+=(--support-augmentation-blueprint-base-weight-scale "${SUPPORT_AUGMENTATION_BLUEPRINT_BASE_WEIGHT_SCALE}")
           fi
+          if [ "${SUPPORT_AUGMENTATION_SANITIZE_WORKER_NON_TARGET_INCOME}" = "true" ]; then
+            cmd+=(--support-augmentation-sanitize-worker-non-target-income)
+          fi
+          if [ "${SUPPORT_AUGMENTATION_SANITIZE_CLONE_NON_TARGET_INCOME}" = "true" ]; then
+            cmd+=(--support-augmentation-sanitize-clone-non-target-income)
+          fi
           if [ "${ALLOW_VALIDATION_FAILURES}" = "true" ]; then
             cmd+=(--allow-validation-failures)
           fi

diff --git a/changelog.d/950.fixed.md b/changelog.d/950.fixed.md
@@ -0,0 +1 @@
+Harden long-term donor-composite support so the 2100 Social Security, taxable payroll, and TOB calibration path validates with nonnegative weights.
diff --git a/modal_app/long_term_publishable_probe.py b/modal_app/long_term_publishable_probe.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import modal
+
+_baked = "/root/policyengine-us-data"  # presumably the repo checkout baked into the image; verify against modal_app.images
+_local = str(Path(__file__).resolve().parent.parent)  # parent of the directory that contains this file
+for _p in (_baked, _local):  # make both locations importable before the modal_app import that follows
+    if _p not in sys.path:
+        sys.path.insert(0, _p)  # prepend so these entries take precedence over existing ones
+
+from modal_app.images import cpu_image as base_image  # noqa: E402
+
+app = modal.App("policyengine-us-data-long-term-probe")  # Modal app that the functions in this module are registered on
+
+hf_secret = modal.Secret.from_name("huggingface-token")  # named Modal secret attached to the remote functions
+
+image = base_image  # reuse the shared CPU image as-is
+
+_LONG_TERM_DIR = (  # directory of the long-term scripts; also used as the subprocess PYTHONPATH
+    "/root/policyengine-us-data/policyengine_us_data/datasets/cps/long_term"
+)
+_VENV_PYTHON = "/root/policyengine-us-data/.venv/bin/python"  # interpreter used to launch those scripts
+
+
+def _run_long_term_json_command(script_name: str, *args: str) -> str:
+    """Run a long-term script with the venv interpreter and return its stdout.
+
+    Args:
+        script_name: File name of the script inside ``_LONG_TERM_DIR``.
+        *args: Command-line arguments forwarded verbatim to the script.
+
+    Returns:
+        Everything the script wrote to stdout, as text. The callers treat
+        this as a JSON document (presumably; this function does not parse it).
+
+    Raises:
+        subprocess.CalledProcessError: If the script exits with a non-zero
+            status. The captured stderr is echoed to this process's stderr
+            before the exception propagates.
+    """
+    command = [_VENV_PYTHON, f"{_LONG_TERM_DIR}/{script_name}", *args]
+    # PYTHONPATH is set to the script directory so the scripts can import
+    # their sibling modules by bare name.
+    child_env = {**os.environ, "PYTHONPATH": _LONG_TERM_DIR}
+    try:
+        result = subprocess.run(
+            command,
+            check=True,
+            capture_output=True,
+            text=True,
+            env=child_env,
+        )
+    except subprocess.CalledProcessError as error:
+        # capture_output=True swallows the child's traceback; surface it so a
+        # failed probe can be diagnosed from the logs, then re-raise unchanged.
+        if error.stderr:
+            sys.stderr.write(error.stderr)
+        raise
+    return result.stdout
+
+
+@app.function(
+    image=image,
+    timeout=60 * 60,
+    cpu=8,
+    memory=65536,
+    secrets=[hf_secret],
+)
+def assess_publishable_probe_json(
+    *,
+    years_csv: str = "2075",
+    profile: str = "ss-payroll-tob",
+    target_source: str = "trustees_2025_current_law",
+    base_dataset_path: str = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+) -> str:
+    """Run ``assess_publishable_horizon.py`` and return its stdout.
+
+    Args:
+        years_csv: Value forwarded to the script's ``--years`` option.
+        profile: Value forwarded to ``--profile``.
+        target_source: Value forwarded to ``--target-source``.
+        base_dataset_path: Value forwarded to ``--base-dataset``.
+
+    Returns:
+        The text the script printed to stdout.
+    """
+    cli_args = [
+        "--profile",
+        profile,
+        "--target-source",
+        target_source,
+        "--years",
+        years_csv,
+        "--base-dataset",
+        base_dataset_path,
+    ]
+    script_output = _run_long_term_json_command(
+        "assess_publishable_horizon.py", *cli_args
+    )
+    return script_output
+
+
+@app.function(
+    image=image,
+    timeout=60 * 60,
+    cpu=8,
+    memory=65536,
+    secrets=[hf_secret],
+)
+def assess_augmented_publishable_probe_json(
+    *,
+    years_csv: str = "2075",
+    profile: str = "ss-payroll-tob",
+    target_source: str = "trustees_2025_current_law",
+    support_augmentation_profile: str = "late-clone-v1",
+    base_dataset_path: str = "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+) -> str:
+    """Run ``assess_augmented_publishability.py`` and return its stdout.
+
+    Args:
+        years_csv: Value forwarded to the script's ``--years`` option.
+        profile: Value forwarded to ``--profile``.
+        target_source: Value forwarded to ``--target-source``.
+        support_augmentation_profile: Value forwarded to
+            ``--support-augmentation``.
+        base_dataset_path: Value forwarded to ``--base-dataset``.
+
+    Returns:
+        The text the script printed to stdout.
+    """
+    cli_args = [
+        "--profile",
+        profile,
+        "--target-source",
+        target_source,
+        "--years",
+        years_csv,
+        "--base-dataset",
+        base_dataset_path,
+        "--support-augmentation",
+        support_augmentation_profile,
+    ]
+    script_output = _run_long_term_json_command(
+        "assess_augmented_publishability.py", *cli_args
+    )
+    return script_output
+
+
+@app.local_entrypoint()
+def main(
+    years: str = "2075",
+    profile: str = "ss-payroll-tob",
+    target_source: str = "trustees_2025_current_law",
+    support_augmentation_profile: str = "",
+) -> None:
+    """Dispatch one probe to the matching remote function and print its output.
+
+    An empty ``support_augmentation_profile`` selects the plain probe; any
+    other value selects the augmented probe and is forwarded to it.
+    """
+    shared_kwargs = {
+        "years_csv": years,
+        "profile": profile,
+        "target_source": target_source,
+    }
+    if not support_augmentation_profile:
+        payload = assess_publishable_probe_json.remote(**shared_kwargs)
+    else:
+        payload = assess_augmented_publishable_probe_json.remote(
+            **shared_kwargs,
+            support_augmentation_profile=support_augmentation_profile,
+        )
+    print(payload)
diff --git a/policyengine_us_data/datasets/cps/long_term/README.md b/policyengine_us_data/datasets/cps/long_term/README.md
@@ -10,10 +10,10 @@ Run projections using `run_household_projection.py`:
 python run_household_projection.py 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --save-h5
 
 # Experimental: role-based donor composites assembled into late-year support
-python run_household_projection.py 2075 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --support-augmentation-blueprint-base-weight-scale 5.0
+python run_household_projection.py 2075 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --support-augmentation-blueprint-base-weight-scale 5.0 --support-augmentation-sanitize-clone-non-target-income
 
 # Experimental: target-year blueprint calibration over donor-composite support
-python run_household_projection.py 2100 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --support-augmentation-blueprint-base-weight-scale 5.0 --save-h5
+python run_household_projection.py 2100 2100 --profile ss-payroll-tob --target-source trustees_2025_current_law --support-augmentation-profile donor-backed-composite-v1 --support-augmentation-target-year 2100 --support-augmentation-blueprint-base-weight-scale 5.0 --support-augmentation-sanitize-clone-non-target-income --save-h5
 
 # IPF with only age distribution constraints (faster, less accurate)
 python run_household_projection.py 2050 --profile age-only
@@ -35,7 +35,11 @@ python run_long_term_production.py \
   --jobs 4 \
   --output-dir ./projected_datasets_production \
   --profile ss-payroll-tob \
-  --target-source trustees_2025_current_law
+  --target-source trustees_2025_current_law \
+  --support-augmentation-profile donor-backed-composite-v1 \
+  --support-augmentation-target-year 2100 \
+  --support-augmentation-blueprint-base-weight-scale 5.0 \
+  --support-augmentation-sanitize-clone-non-target-income
 ```
 
 **Arguments:**
@@ -53,6 +57,8 @@ python run_long_term_production.py \
 - `--support-augmentation-max-distance`: Maximum donor-match distance retained for cloning (default `3.0`).
 - `--support-augmentation-clone-weight-scale`: Baseline weight multiplier applied to each donor-backed clone (default `0.1`).
 - `--support-augmentation-blueprint-base-weight-scale`: When donor-composite augmentation is active at its target year, scales the original household priors before replacing clone priors with synthetic blueprint shares (default `5.0`).
+- `--support-augmentation-sanitize-worker-non-target-income`: For `donor-backed-composite-v1`, zero investment and retirement income on worker-sourced donor clone rows.
+- `--support-augmentation-sanitize-clone-non-target-income`: For `donor-backed-composite-v1`, zero investment and retirement income on all donor clone rows.
 - `--greg`: Use GREG calibration instead of IPF
 - `--use-ss`: Include Social Security benefit totals as calibration target (requires `--greg`)
 - `--use-payroll`: Include taxable payroll totals as calibration target (requires `--greg`)
@@ -141,7 +147,7 @@ python run_long_term_production.py \
 - Current status:
   - Fixing the long-run payroll-cap bug in `policyengine-us` changed the picture materially. With the correct SSA wage base extended through `2100`, the donor-composite synthetic support is exact-feasible and dense at the archetype level.
   - The runner now supports a target-year calibration blueprint for donor-composite augmentation. At the augmentation target year, it applies target age composition to clone rows, uses realized PolicyEngine Social Security and payroll values for the same rows, and keeps synthetic prior shares for clone households.
-  - In the current `2075` OACT probe, that blueprint path hits actual Social Security, taxable payroll, and TOB targets exactly with nonnegative weights when `--support-augmentation-blueprint-base-weight-scale 5.0`.
+  - In the current `2100` Trustees current-law probe, that blueprint path hits actual Social Security, taxable payroll, and TOB targets exactly with nonnegative weights when `--support-augmentation-blueprint-base-weight-scale 5.0 --support-augmentation-sanitize-clone-non-target-income`.
   - The runner now also has a dynamic mode, `--support-augmentation-align-to-run-year`, that rebuilds donor-composite support for each run year and writes per-year augmentation reports.
   - This is still experimental. The blueprint path is now structurally capable of handling year-specific support, but the full `2075-2100` production sweep still needs runtime tuning and caching work.
 

diff --git a/policyengine_us_data/datasets/cps/long_term/assess_augmented_publishability.py b/policyengine_us_data/datasets/cps/long_term/assess_augmented_publishability.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+import argparse
+import json
+
+from assess_publishable_horizon import assess_years
+from support_augmentation import build_augmented_dataset
+
+
+DEFAULT_BASE_DATASET_PATH = (  # default value of the --base-dataset option
+    "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"
+)
+
+
+def parse_years(raw: str) -> list[int]:
+    """Turn a comma-separated string of years into a sorted, de-duplicated list.
+
+    Entries are stripped of surrounding whitespace and blank entries are
+    skipped.
+
+    Raises:
+        ValueError: If no years remain after skipping blanks, or if an entry
+            is not a valid integer.
+    """
+    tokens = (piece.strip() for piece in raw.split(","))
+    distinct_years = {int(token) for token in tokens if token}
+    if not distinct_years:
+        raise ValueError("At least one year must be provided.")
+    return sorted(distinct_years)
+
+
+def main() -> None:
+    """Assess publishability of a support-augmented dataset and print JSON.
+
+    Parses the command line, builds the augmented dataset from the base
+    dataset, runs ``assess_years`` against it, and prints the combined
+    payload to stdout as indented, key-sorted JSON.
+
+    Raises:
+        ValueError: If ``--years`` holds no years or a non-integer entry.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Assess publishability for an augmented late-tail dataset under the "
+            "standard long-run contract."
+        )
+    )
+    parser.add_argument(
+        "--years",
+        default="2075",
+        help="Comma-separated years to assess.",
+    )
+    parser.add_argument(
+        "--profile",
+        default="ss-payroll-tob",
+        help="Named calibration profile to assess.",
+    )
+    parser.add_argument(
+        "--target-source",
+        default="trustees_2025_current_law",
+        help="Named long-run target source package.",
+    )
+    parser.add_argument(
+        "--support-augmentation",
+        required=True,
+        help="Support augmentation profile name.",
+    )
+    parser.add_argument(
+        "--base-dataset",
+        default=DEFAULT_BASE_DATASET_PATH,
+        help="Base microsimulation dataset path.",
+    )
+    args = parser.parse_args()
+
+    # Parse --years once, before building the augmented dataset, so a
+    # malformed value is rejected before any dataset work starts.
+    years = parse_years(args.years)
+
+    # NOTE(review): base_year is hard-coded to 2024 even when --base-dataset
+    # is overridden; confirm that is intended.
+    augmented_dataset, augmentation_report = build_augmented_dataset(
+        base_dataset=args.base_dataset,
+        base_year=2024,
+        profile=args.support_augmentation,
+    )
+
+    rows = assess_years(
+        # Pass a copy so the list reported in the payload cannot be affected
+        # by anything assess_years does with its argument.
+        years=list(years),
+        profile_name=args.profile,
+        target_source=args.target_source,
+        base_dataset_path=augmented_dataset,
+    )
+
+    payload = {
+        "years": years,
+        "profile": args.profile,
+        "target_source": args.target_source,
+        "support_augmentation_profile": args.support_augmentation,
+        "augmentation_report": augmentation_report,
+        "rows": rows,
+    }
+    print(json.dumps(payload, indent=2, sort_keys=True))
+
+
+if __name__ == "__main__":
+    main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Harden long-term donor-composite support so the 2100 Social Security, taxable payroll, and TOB calibration path validates with nonnegative weights.