Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ Before creating or sharing any PR, all developers and agents must:
`gh repo view PolicyEngine/policyengine-us-data --json nameWithOwner`.
2. Push the branch to that repository, for example:
`git push upstream HEAD:<branch-name>`.
3. Create the PR from the same repository, for example:
`gh pr create --repo PolicyEngine/policyengine-us-data --head <branch-name> --base main`.
4. Verify the PR head repository before reporting it:
`gh pr view <PR> --repo PolicyEngine/policyengine-us-data --json headRepositoryOwner,headRepository`.

The PR is valid only if the head repository is `PolicyEngine/policyengine-us-data`.
3. Create the PR as a draft from the same repository, for example:
`gh pr create --draft --repo PolicyEngine/policyengine-us-data --head <branch-name> --base main`.
4. Verify the PR is a draft and the head repository is canonical before reporting
it:
`gh pr view <PR> --repo PolicyEngine/policyengine-us-data --json isDraft,headRepositoryOwner,headRepository`.

The PR is valid only if `isDraft` is `true` and the head repository is
`PolicyEngine/policyengine-us-data`.
If you cannot push to the canonical repository, stop and ask for access. Do not
create a fork PR as a fallback. If you accidentally create one, immediately
close it and replace it with a same-repository PR.
close it and replace it with a same-repository draft PR.
2 changes: 2 additions & 0 deletions changelog.d/926.added
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Stage 2 now emits and validates a semantic `calibration_package_contract.json`
sidecar next to `calibration_package.pkl`.
16 changes: 9 additions & 7 deletions docs/engineering/skills/github-prs.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,18 @@ Before creating or sharing a PR:
`make lint`.
6. Push the current branch to the canonical repository:
`make push-pr-branch`.
7. Create the PR from that same repository:
`gh pr create --repo PolicyEngine/policyengine-us-data --head "$(git branch --show-current)" --base main`.
8. Verify the PR head repository:
`gh pr view <PR> --repo PolicyEngine/policyengine-us-data --json headRepositoryOwner,headRepository`.

The PR is valid only if the head repository is
7. Create the PR as a draft from that same repository:
`gh pr create --draft --repo PolicyEngine/policyengine-us-data --head "$(git branch --show-current)" --base main`.
8. Verify the PR is a draft and the head repository is canonical:
`gh pr view <PR> --repo PolicyEngine/policyengine-us-data --json isDraft,headRepositoryOwner,headRepository`.
9. Leave the PR as a draft unless a maintainer explicitly asks for it to be
marked ready for review.

The PR is valid only if `isDraft` is `true` and the head repository is
`PolicyEngine/policyengine-us-data`. If you cannot push to the canonical
repository, stop and ask for access. Do not create a fork PR as a fallback. If
you accidentally create one, close it immediately and replace it with a
same-repository PR.
same-repository draft PR.

## PR title

Expand Down
13 changes: 13 additions & 0 deletions modal_app/remote_calibration_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,6 +420,19 @@ def _build_package_impl(
if build_rc != 0:
raise RuntimeError(f"Package build failed with code {build_rc}")

from policyengine_us_data.stage_contracts.calibration_package import (
CALIBRATION_PACKAGE_CONTRACT_FILENAME,
validate_persisted_calibration_package_contract,
)

contract_path = f"{artifacts}/{CALIBRATION_PACKAGE_CONTRACT_FILENAME}"
validate_persisted_calibration_package_contract(
package_path=Path(pkg_path),
contract_path=Path(contract_path),
dataset_path=Path(dataset_path),
db_path=Path(db_path),
)

sidecar_ok = _write_package_sidecar(pkg_path)
if not sidecar_ok:
print(
Expand Down
99 changes: 92 additions & 7 deletions policyengine_us_data/calibration/unified_calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import logging
import os
import sys
from datetime import UTC, datetime
from pathlib import Path
from typing import Optional

Expand All @@ -44,6 +45,9 @@
create_target_groups,
)
from policyengine_us_data.pipeline_metadata import pipeline_node
from policyengine_us_data.stage_contracts.calibration_package import (
CalibrationPackageParameters,
)
from policyengine_us_data.pipeline_schema import PipelineNode

logging.basicConfig(
Expand Down Expand Up @@ -71,6 +75,41 @@
DEFAULT_TARGET_CONFIG_PATH = Path(__file__).resolve().parent / "target_config.yaml"


def _utc_now_isoformat() -> str:
"""Return a compact UTC timestamp for contract metadata."""

return datetime.now(UTC).isoformat().replace("+00:00", "Z")


def _calibration_package_contract_parameters(
    *,
    workers: int,
    n_clones: int,
    target_config_path: str | None,
    skip_county: bool,
    skip_source_impute: bool,
    skip_takeup_rerandomize: bool,
    chunked_matrix: bool,
    chunk_size: int,
    parallel: bool,
    num_matrix_workers: int,
) -> CalibrationPackageParameters:
    """Bundle the Stage 2 runtime options that affect package construction.

    Every keyword argument is forwarded unchanged, under the same name, to
    ``CalibrationPackageParameters.from_runtime_args``, whose result is
    returned as-is.
    """

    # Collect the options once so the forwarding call stays a single line.
    runtime_args = {
        "workers": workers,
        "n_clones": n_clones,
        "target_config_path": target_config_path,
        "skip_county": skip_county,
        "skip_source_impute": skip_source_impute,
        "skip_takeup_rerandomize": skip_takeup_rerandomize,
        "chunked_matrix": chunked_matrix,
        "chunk_size": chunk_size,
        "parallel": parallel,
        "num_matrix_workers": num_matrix_workers,
    }
    return CalibrationPackageParameters.from_runtime_args(**runtime_args)


def get_git_provenance() -> dict:
"""Capture git state and package version for provenance tracking."""
import subprocess as _sp
Expand Down Expand Up @@ -152,7 +191,11 @@ def check_package_staleness(metadata: dict) -> None:
if created:
try:
built_dt = datetime.datetime.fromisoformat(created)
age = datetime.datetime.now() - built_dt
if built_dt.tzinfo is None:
built_dt = built_dt.replace(tzinfo=datetime.UTC)
age = datetime.datetime.now(datetime.UTC) - built_dt.astimezone(
datetime.UTC
)
if age.days > 7:
print(f"WARNING: Package is {age.days} days old (built {created})")
except Exception:
Expand Down Expand Up @@ -1303,6 +1346,7 @@ def run_calibration(
"""
import time

started_at = _utc_now_isoformat()
t0 = time.time()

# Early exit: load pre-built package
Expand Down Expand Up @@ -1547,16 +1591,14 @@ def run_calibration(
# Step 6b: Save the calibration package. By default this is the
# minimal package selected by target_config.yaml; use
# --all-active-targets to build a broad diagnostic package.
import datetime

metadata = {
"dataset_path": dataset_path,
"db_path": db_path,
"n_clones": n_clones,
"n_records": X_sparse.shape[1],
"base_n_records": n_records,
"seed": seed,
"created_at": datetime.datetime.now().isoformat(),
"created_at": _utc_now_isoformat(),
"target_config_path": target_config_path,
"package_scope": "minimal" if target_config else "all_active_targets",
"matrix_builder": "chunked" if chunked_matrix else "precompute",
Expand All @@ -1573,20 +1615,63 @@ def run_calibration(
Path(target_config_path)
)

initial_weights = compute_initial_weights(X_sparse, targets_df)
if package_output_path:
full_initial_weights = compute_initial_weights(X_sparse, targets_df)
package_payload = {
"X_sparse": X_sparse,
"targets_df": targets_df,
"target_names": target_names,
"metadata": metadata,
"initial_weights": initial_weights,
"cd_geoid": geography.cd_geoid,
"block_geoid": geography.block_geoid,
}
save_calibration_package(
package_output_path,
X_sparse,
targets_df,
target_names,
metadata,
initial_weights=full_initial_weights,
initial_weights=initial_weights,
cd_geoid=geography.cd_geoid,
block_geoid=geography.block_geoid,
)
from policyengine_us_data.stage_contracts.calibration_package import (
validate_calibration_package_contract,
write_calibration_package_contract,
)

initial_weights = compute_initial_weights(X_sparse, targets_df)
completed_at = _utc_now_isoformat()
write_calibration_package_contract(
package_path=Path(package_output_path),
dataset_path=Path(dataset_path),
db_path=Path(db_path),
package=package_payload,
parameters=_calibration_package_contract_parameters(
workers=workers,
n_clones=n_clones,
target_config_path=target_config_path,
skip_county=skip_county,
skip_source_impute=skip_source_impute,
skip_takeup_rerandomize=skip_takeup_rerandomize,
chunked_matrix=chunked_matrix,
chunk_size=chunk_size,
parallel=parallel,
num_matrix_workers=num_matrix_workers,
),
run_id=run_id,
started_at=started_at,
completed_at=completed_at,
duration_s=round(time.time() - t0, 1),
code_sha=metadata.get("git_commit"),
package_version=metadata.get("package_version"),
)
validate_calibration_package_contract(
package_path=Path(package_output_path),
package=package_payload,
dataset_path=Path(dataset_path),
db_path=Path(db_path),
)

if build_only:
from policyengine_us_data.calibration.validate_package import (
Expand Down
22 changes: 22 additions & 0 deletions policyengine_us_data/stage_contracts/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,18 @@
"""

from .artifacts import ArtifactRef
from .calibration_package import (
CALIBRATION_PACKAGE_CONTRACT_FILENAME,
CALIBRATION_PACKAGE_CONTRACT_TYPE,
CalibrationPackageParameters,
CalibrationPackageSummary,
build_calibration_package_contract,
load_calibration_package_payload,
summarize_calibration_package,
validate_calibration_package_contract,
validate_persisted_calibration_package_contract,
write_calibration_package_contract,
)
from .constants import (
CONTRACT_FINGERPRINT_ALGORITHM,
CONTRACT_SCHEMA_VERSION,
Expand Down Expand Up @@ -73,6 +85,10 @@
"VALIDATION_REPORT_STATUSES",
"ArtifactRef",
"CANONICAL_STAGE_IDS",
"CALIBRATION_PACKAGE_CONTRACT_FILENAME",
"CALIBRATION_PACKAGE_CONTRACT_TYPE",
"CalibrationPackageParameters",
"CalibrationPackageSummary",
"CONTRACT_TYPE_BY_STAGE_ID",
"DATASET_BUILD_OUTPUT_CONTRACT_FILENAME",
"DATASET_BUILD_OUTPUT_CONTRACT_TYPE",
Expand All @@ -97,6 +113,7 @@
"ValidationFindingStatus",
"ValidationReport",
"ValidationReportStatus",
"build_calibration_package_contract",
"build_dataset_build_output_contract",
"canonicalize_for_fingerprint",
"contract_from_json",
Expand All @@ -105,7 +122,12 @@
"fingerprint_material",
"is_canonical_stage_id",
"is_canonical_substage_id",
"load_calibration_package_payload",
"read_contract",
"summarize_calibration_package",
"substage_ids_for_stage",
"validate_calibration_package_contract",
"validate_persisted_calibration_package_contract",
"write_calibration_package_contract",
"write_contract",
]
Loading