Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/push.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ jobs:
python-version: "3.14"
- uses: astral-sh/setup-uv@v8.1.0
- run: uv sync --dev
- run: uv run python -m build
- run: uv run python -m build --wheel
- name: Publish to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
Expand Down
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ changelog:
python .github/bump_version.py
towncrier build --yes --version $$(python -c "import re; print(re.search(r'version = \"(.+?)\"', open('pyproject.toml').read()).group(1))")
download:
python -m policyengine_us_data.storage.download_private_prerequisites
python -m policyengine_us_data.storage.download_prerequisites

upload:
python -m policyengine_us_data.storage.upload_completed_datasets
Expand Down Expand Up @@ -284,10 +284,10 @@ clean:
rm -rf policyengine_us_data/docs/_build

build:
python -m build
python -m build --wheel

publish:
twine upload dist/*
twine upload dist/*.whl

paper-content:
@echo "Building paper sections and docs from unified content..."
Expand Down
4 changes: 4 additions & 0 deletions changelog.d/948.changed
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Build PyPI wheels only and download generated block geography prerequisites from
Hugging Face instead of bundling them in the package. The historical
`download_private_prerequisites` entry point has been removed in favor of the
canonical `download_prerequisites` downloader.
2 changes: 1 addition & 1 deletion modal_app/data_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,7 +626,7 @@ def build_datasets(

# Download prerequisites
run_script(
"policyengine_us_data/storage/download_private_prerequisites.py",
"policyengine_us_data/storage/download_prerequisites.py",
env=env,
log_file=log_file,
)
Expand Down
Binary file not shown.
Binary file removed policyengine_us_data/storage/block_crosswalk.csv.gz
Binary file not shown.
125 changes: 125 additions & 0 deletions policyengine_us_data/storage/download_prerequisites.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
"""Download build prerequisites that are not vendored in the package."""

from __future__ import annotations

from dataclasses import dataclass
import hashlib
from pathlib import Path
import shutil

from huggingface_hub import hf_hub_download

from policyengine_us_data.storage import STORAGE_FOLDER
from policyengine_us_data.utils.huggingface import get_token

# All prerequisites live in Hugging Face repositories of type "model".
HF_REPO_TYPE = "model"
# Repository holding the IRS SOI PUF input files; access requires the
# token returned by get_token().
PRIVATE_PUF_REPO = "policyengine/irs-soi-puf"
# Repository holding the generated block geography prerequisites.
GEOGRAPHY_REPO = "policyengine/policyengine-us-data"
# Pinned revision (commit hash) so geography downloads are reproducible
# and can be verified against the SHA256 digests below.
GEOGRAPHY_REVISION = "afe8d64cd1b66d35a5d6be11abe12bbc72b2e44b"


@dataclass(frozen=True)
class PrerequisiteArtifact:
    """One Hugging Face file that must exist locally before a data build.

    Attributes:
        repo: Hugging Face repository id (e.g. ``policyengine/irs-soi-puf``).
        path_in_repo: Path of the file inside that repository.
        local_filename: Name the file is stored under in ``STORAGE_FOLDER``.
        revision: Optional revision to pin the download to; unpinned when
            ``None``.
        sha256: Optional hex digest the downloaded file is verified against;
            no verification when ``None``.
    """

    repo: str
    path_in_repo: str
    local_filename: str
    revision: str | None = None
    sha256: str | None = None


# PUF input files pulled from the private repository; these carry no
# revision pin and no checksum.
_PUF_FILENAMES = (
    "puf_2015.csv",
    "demographics_2015.csv",
    "np2023_d5_mid.csv",
)

# Generated block geography files, pinned to GEOGRAPHY_REVISION and
# verified against these SHA256 digests after download.
_GEOGRAPHY_SHA256 = {
    "block_cd_distributions.csv.gz": (
        "0932ddbf95f454ddcf299d4aa8e3d6919ded9c401e2e7d2cc769466f7fade9bd"
    ),
    "block_crosswalk.csv.gz": (
        "cb729f21ef59ea44c0f49aa3c2369f884419765a2c6bd32dc18857952cb8ed4f"
    ),
}

PREREQUISITE_ARTIFACTS = tuple(
    [
        PrerequisiteArtifact(
            repo=PRIVATE_PUF_REPO,
            path_in_repo=filename,
            local_filename=filename,
        )
        for filename in _PUF_FILENAMES
    ]
    + [
        PrerequisiteArtifact(
            repo=GEOGRAPHY_REPO,
            path_in_repo=f"prerequisites/geography/{filename}",
            local_filename=filename,
            revision=GEOGRAPHY_REVISION,
            sha256=digest,
        )
        for filename, digest in _GEOGRAPHY_SHA256.items()
    ]
)


def _sha256(path: Path) -> str:
digest = hashlib.sha256()
with path.open("rb") as file:
for chunk in iter(lambda: file.read(1024 * 1024), b""):
digest.update(chunk)
return digest.hexdigest()


def download_prerequisite(
    artifact: PrerequisiteArtifact,
    *,
    storage_folder: Path = STORAGE_FOLDER,
) -> Path:
    """Fetch one artifact into ``storage_folder`` and return its local path.

    The file is obtained through ``hf_hub_download`` (using the token from
    ``get_token()``), copied to ``storage_folder / artifact.local_filename``,
    and — when ``artifact.sha256`` is set — verified against that digest.

    Raises:
        ValueError: if the downloaded file's SHA256 does not match
            ``artifact.sha256``.
    """
    cached = Path(
        hf_hub_download(
            repo_id=artifact.repo,
            repo_type=HF_REPO_TYPE,
            filename=artifact.path_in_repo,
            revision=artifact.revision,
            token=get_token(),
        )
    )
    target = storage_folder / artifact.local_filename
    target.parent.mkdir(parents=True, exist_ok=True)

    # Skip the copy when the hub cache path already is the destination.
    if cached.resolve() != target.resolve():
        shutil.copyfile(cached, target)

    if artifact.sha256 is not None:
        observed = _sha256(target)
        if observed != artifact.sha256:
            raise ValueError(
                f"Downloaded {artifact.path_in_repo} from {artifact.repo} "
                f"with SHA256 {observed}, expected {artifact.sha256}."
            )

    return target


def download_prerequisites(
    artifacts: tuple[PrerequisiteArtifact, ...] = PREREQUISITE_ARTIFACTS,
    *,
    storage_folder: Path = STORAGE_FOLDER,
) -> dict[str, Path]:
    """Download every build prerequisite into ``storage_folder``.

    Returns a mapping from each artifact's local filename to the path it
    was written to.
    """
    downloaded: dict[str, Path] = {}
    for artifact in artifacts:
        downloaded[artifact.local_filename] = download_prerequisite(
            artifact,
            storage_folder=storage_folder,
        )
    return downloaded


def main() -> None:
    """CLI entry point: download all prerequisites into the default storage folder."""
    download_prerequisites()


if __name__ == "__main__":
    main()
24 changes: 0 additions & 24 deletions policyengine_us_data/storage/download_private_prerequisites.py

This file was deleted.

7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ namespaces = false
]

[tool.setuptools.exclude-package-data]
"policyengine_us_data" = [
# Generated block geography prerequisites are downloaded from Hugging Face
# by policyengine_us_data.storage.download_prerequisites instead of being
# bundled in PyPI wheels.
"storage/block_cd_distributions.csv.gz",
"storage/block_crosswalk.csv.gz",
]
"*" = ["**/__pycache__/*", "**/*.py[cod]"]

[tool.pytest.ini_options]
Expand Down
Loading
Loading