Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 8 additions & 36 deletions .github/workflows/local_area_publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
paths:
- 'policyengine_us_data/datasets/cps/local_area_calibration/**'
- '.github/workflows/local_area_publish.yaml'
- 'modal_app/**'
repository_dispatch:
types: [calibration-updated]
workflow_dispatch:
Expand All @@ -17,54 +18,25 @@ on:

jobs:
publish-local-area:
runs-on: self-hosted
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}

steps:
- name: Checkout repo
uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.13'

- name: Authenticate to Google Cloud
uses: google-github-actions/auth@v2
with:
workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider"
service_account: "policyengine-research@policyengine-research.iam.gserviceaccount.com"

- name: Install package
run: uv sync --dev

- name: Download checkpoint (if exists)
continue-on-error: true
run: |
gsutil cp gs://policyengine-us-data/checkpoints/completed_states.txt . || true
gsutil cp gs://policyengine-us-data/checkpoints/completed_districts.txt . || true
gsutil cp gs://policyengine-us-data/checkpoints/completed_cities.txt . || true

- name: Build and publish local area H5 files
run: uv run make publish-local-area

- name: Upload checkpoint
if: always()
run: |
gsutil cp completed_states.txt gs://policyengine-us-data/checkpoints/ || true
gsutil cp completed_districts.txt gs://policyengine-us-data/checkpoints/ || true
gsutil cp completed_cities.txt gs://policyengine-us-data/checkpoints/ || true
- name: Install Modal CLI
run: pip install modal

- name: Clean up checkpoints on success
if: success()
run: |
gsutil rm gs://policyengine-us-data/checkpoints/completed_states.txt || true
gsutil rm gs://policyengine-us-data/checkpoints/completed_districts.txt || true
gsutil rm gs://policyengine-us-data/checkpoints/completed_cities.txt || true
- name: Run local area publishing on Modal
run: modal run modal_app/local_area.py --branch=${{ github.head_ref || github.ref_name }}
5 changes: 2 additions & 3 deletions .github/workflows/pr_code_changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,8 @@ jobs:
uses: astral-sh/setup-uv@v5
- name: Check lock file is up-to-date
run: |
uv lock --upgrade
git diff --exit-code uv.lock || {
echo "::error::uv.lock is outdated. Run 'uv lock --upgrade' and commit the changes."
uv lock --locked || {
echo "::error::uv.lock is outdated. Run 'uv lock' and commit the changes."
exit 1
}

Expand Down
66 changes: 20 additions & 46 deletions .github/workflows/reusable_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,22 @@ on:
required: false
POLICYENGINE_US_DATA_GITHUB_TOKEN:
required: false
MODAL_TOKEN_ID:
required: false
MODAL_TOKEN_SECRET:
required: false

jobs:
test:
runs-on: self-hosted
runs-on: ubuntu-latest
permissions:
contents: write # Required for GitHub Pages deploy
id-token: write # Required for GCP auth
contents: write
id-token: write
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
Expand All @@ -50,57 +56,25 @@ jobs:
with:
node-version: '24'

- uses: "google-github-actions/auth@v2"
if: inputs.upload_data
with:
workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider"
service_account: "policyengine-research@policyengine-research.iam.gserviceaccount.com"

- name: Install package
run: uv sync --dev

- name: Download data inputs
if: inputs.full_suite
run: uv run make download

# Temporarily disabled - database target causing issues
# - name: Create and load calibration targets database
# if: inputs.full_suite
# run: make database

- name: Build datasets
- name: Install Modal CLI
if: inputs.full_suite
run: uv run make data
env:
TEST_LITE: ${{ !inputs.upload_data }}
PYTHON_LOG_LEVEL: INFO
run: pip install modal

- name: Build datasets for local area calibration
- name: Run data build and tests on Modal
if: inputs.full_suite
run: |
LOCAL_AREA_CALIBRATION=true uv run python policyengine_us_data/datasets/cps/cps.py
LOCAL_AREA_CALIBRATION=true uv run python policyengine_us_data/datasets/puf/puf.py
LOCAL_AREA_CALIBRATION=true uv run python policyengine_us_data/datasets/cps/extended_cps.py
uv run python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 10500
modal run modal_app/data_build.py \
${{ inputs.upload_data && '--upload' || '--no-upload' }} \
--branch=${{ github.head_ref || github.ref_name }} \
${{ inputs.upload_data && '--no-test-lite' || '--test-lite' }}

- name: Run local area calibration tests
if: inputs.full_suite
run: uv run pytest policyengine_us_data/tests/test_local_area_calibration/ -v

- name: Save calibration log
if: inputs.full_suite
uses: actions/upload-artifact@v4
with:
name: calibration_log.csv
path: calibration_log.csv
- name: Install package
run: uv sync --dev

- name: Run tests
- name: Run basic tests
if: ${{ !inputs.full_suite }}
run: uv run pytest

- name: Upload data
if: inputs.upload_data
run: uv run make upload

- name: Test documentation builds
run: uv run make documentation
env:
Expand Down
7 changes: 7 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- bump: minor
changes:
added:
- Modal integration for CI/CD workflows, replacing self-hosted GCP runners
changed:
- Updated reusable_test.yaml to trigger data builds on Modal
- Updated local_area_publish.yaml to run on Modal
1 change: 1 addition & 0 deletions modal_app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Modal application for policyengine-us-data CI/CD
184 changes: 184 additions & 0 deletions modal_app/data_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import os
import subprocess
import modal

# Modal app under which the remote build function below is registered.
app = modal.App("policyengine-us-data")

# Named secrets configured in the Modal workspace. Listing them on a
# function exposes their values to that function's environment
# (build_datasets reads GOOGLE_APPLICATION_CREDENTIALS_JSON from env;
# presumably the HF token arrives the same way — confirm in Modal dashboard).
hf_secret = modal.Secret.from_name("huggingface-token")
gcp_secret = modal.Secret.from_name("gcp-credentials")

# Container image for the build: Debian slim with Python 3.13, git, and the
# package's runtime/test dependencies preinstalled so the editable install
# of the cloned repo has little left to resolve.
image = (
    modal.Image.debian_slim(python_version="3.13")
    .apt_install("git")
    .pip_install(
        "policyengine-us>=1.353.0",
        "policyengine-core>=3.19.0",
        "pandas>=2.3.1",
        "requests>=2.25.0",
        "tqdm>=4.60.0",
        "microdf_python>=1.0.0",
        "microimpute>=1.1.4",
        "google-cloud-storage>=2.0.0",
        "google-auth>=2.0.0",
        "scipy>=1.15.3",
        "statsmodels>=0.14.5",
        "openpyxl>=3.1.5",
        "tables>=3.10.2",
        "torch>=2.7.1",
        "us>=2.0.0",
        "sqlalchemy>=2.0.41",
        "sqlmodel>=0.0.24",
        "xlrd>=2.0.2",
        "huggingface_hub",
        "pytest",
    )
)

# Repository cloned (at a caller-chosen branch) inside the Modal container.
REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"


def setup_gcp_credentials():
    """Write GCP credentials JSON to a temp file for google.auth.default().

    Reads the service-account JSON from the
    GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable, writes it to
    /tmp/gcp-credentials.json with owner-only permissions, and points
    GOOGLE_APPLICATION_CREDENTIALS at that file so Google client libraries
    pick it up.

    Returns:
        The credentials file path, or None when the env var is unset/empty.
    """
    creds_json = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS_JSON")
    if not creds_json:
        return None
    creds_path = "/tmp/gcp-credentials.json"
    # The file holds a service-account key: create it 0o600 so other users
    # on the host cannot read it (a plain open() would use the umask).
    fd = os.open(creds_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        f.write(creds_json)
    # O_CREAT's mode is ignored for a pre-existing file; enforce it anyway.
    os.chmod(creds_path, 0o600)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return creds_path


@app.function(
    image=image,
    secrets=[hf_secret, gcp_secret],
    memory=32768,
    cpu=8.0,
    timeout=14400,
)
def build_datasets(
    upload: bool = False,
    branch: str = "main",
    test_lite: bool = False,
):
    """Clone the repo, build all datasets, run the test suites, optionally upload.

    Runs remotely on Modal (32 GB / 8 CPUs; timeout 14400 — presumably
    seconds, i.e. 4 h, per Modal's timeout semantics — confirm).

    Args:
        upload: When True, run the upload script after all tests pass.
        branch: Git branch of policyengine-us-data to clone and build.
        test_lite: When True, export TEST_LITE=true for the main build/tests
            (the local-area calibration build deliberately omits it).

    Returns:
        A success message string; any failing step raises
        subprocess.CalledProcessError via check=True.
    """
    # Materialize the GCP key from the Modal secret before anything needs GCS.
    setup_gcp_credentials()

    # Fresh clone of the requested branch, installed editable with dev extras.
    os.chdir("/root")
    subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True)
    os.chdir("policyengine-us-data")
    subprocess.run(["pip", "install", "-e", ".[dev]"], check=True)

    # Environment for the main build; TEST_LITE only when requested.
    env = os.environ.copy()
    if test_lite:
        env["TEST_LITE"] = "true"

    # Download private input data needed by the dataset builds.
    subprocess.run(
        [
            "python",
            "policyengine_us_data/storage/download_private_prerequisites.py",
        ],
        check=True,
        env=env,
    )

    # Build main datasets, in dependency order (each script feeds the next).
    scripts = [
        "policyengine_us_data/utils/uprating.py",
        "policyengine_us_data/datasets/acs/acs.py",
        "policyengine_us_data/datasets/cps/cps.py",
        "policyengine_us_data/datasets/puf/irs_puf.py",
        "policyengine_us_data/datasets/puf/puf.py",
        "policyengine_us_data/datasets/cps/extended_cps.py",
        "policyengine_us_data/datasets/cps/enhanced_cps.py",
        "policyengine_us_data/datasets/cps/small_enhanced_cps.py",
    ]
    for script in scripts:
        print(f"Running {script}...")
        subprocess.run(["python", script], check=True, env=env)

    # Keep the dense build under a new name, then publish the sparse variant
    # under the canonical enhanced_cps_2024.h5 filename.
    os.rename(
        "policyengine_us_data/storage/enhanced_cps_2024.h5",
        "policyengine_us_data/storage/dense_enhanced_cps_2024.h5",
    )
    subprocess.run(
        [
            "cp",
            "policyengine_us_data/storage/sparse_enhanced_cps_2024.h5",
            "policyengine_us_data/storage/enhanced_cps_2024.h5",
        ],
        check=True,
    )

    # Build local area calibration datasets (without TEST_LITE - must match full dataset)
    print("Building local area calibration datasets...")
    local_area_env = os.environ.copy()
    local_area_env["LOCAL_AREA_CALIBRATION"] = "true"

    subprocess.run(
        ["python", "policyengine_us_data/datasets/cps/cps.py"],
        check=True,
        env=local_area_env,
    )
    subprocess.run(
        ["python", "policyengine_us_data/datasets/puf/puf.py"],
        check=True,
        env=local_area_env,
    )
    subprocess.run(
        ["python", "policyengine_us_data/datasets/cps/extended_cps.py"],
        check=True,
        env=local_area_env,
    )
    # "10500" is passed as a CLI argument to the stratification script;
    # its meaning is defined by that script (not visible here).
    subprocess.run(
        [
            "python",
            "policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py",
            "10500",
        ],
        check=True,
        env=local_area_env,
    )

    # Run local area calibration tests
    print("Running local area calibration tests...")
    subprocess.run(
        [
            "pytest",
            "policyengine_us_data/tests/test_local_area_calibration/",
            "-v",
        ],
        check=True,
        env=local_area_env,
    )

    # Run main test suite (honors TEST_LITE when set above).
    print("Running main test suite...")
    subprocess.run(["pytest"], check=True, env=env)

    # Upload built datasets only when explicitly requested and tests passed.
    if upload:
        subprocess.run(
            [
                "python",
                "policyengine_us_data/storage/upload_completed_datasets.py",
            ],
            check=True,
            env=env,
        )

    return "Data build and tests completed successfully"


@app.local_entrypoint()
def main(
    upload: bool = False,
    branch: str = "main",
    test_lite: bool = False,
):
    """CLI entrypoint: launch the remote build on Modal and print its result."""
    print(
        build_datasets.remote(
            upload=upload,
            branch=branch,
            test_lite=test_lite,
        )
    )
Loading