Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 8 additions & 36 deletions .github/workflows/local_area_publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ on:
paths:
- 'policyengine_us_data/datasets/cps/local_area_calibration/**'
- '.github/workflows/local_area_publish.yaml'
- 'modal_app/**'
repository_dispatch:
types: [calibration-updated]
workflow_dispatch:
Expand All @@ -17,54 +18,25 @@ on:

jobs:
publish-local-area:
runs-on: self-hosted
runs-on: ubuntu-latest
permissions:
contents: read
id-token: write
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}

steps:
- name: Checkout repo
uses: actions/checkout@v4

- name: Install uv
uses: astral-sh/setup-uv@v5

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.13'

- name: Authenticate to Google Cloud
uses: google-github-actions/auth@v2
with:
workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider"
service_account: "policyengine-research@policyengine-research.iam.gserviceaccount.com"

- name: Install package
run: uv sync --dev

- name: Download checkpoint (if exists)
continue-on-error: true
run: |
gsutil cp gs://policyengine-us-data/checkpoints/completed_states.txt . || true
gsutil cp gs://policyengine-us-data/checkpoints/completed_districts.txt . || true
gsutil cp gs://policyengine-us-data/checkpoints/completed_cities.txt . || true

- name: Build and publish local area H5 files
run: uv run make publish-local-area

- name: Upload checkpoint
if: always()
run: |
gsutil cp completed_states.txt gs://policyengine-us-data/checkpoints/ || true
gsutil cp completed_districts.txt gs://policyengine-us-data/checkpoints/ || true
gsutil cp completed_cities.txt gs://policyengine-us-data/checkpoints/ || true
- name: Install Modal CLI
run: pip install modal

- name: Clean up checkpoints on success
if: success()
run: |
gsutil rm gs://policyengine-us-data/checkpoints/completed_states.txt || true
gsutil rm gs://policyengine-us-data/checkpoints/completed_districts.txt || true
gsutil rm gs://policyengine-us-data/checkpoints/completed_cities.txt || true
- name: Run local area publishing on Modal
run: modal run modal_app/local_area.py --branch=${{ github.head_ref || github.ref_name }}
5 changes: 2 additions & 3 deletions .github/workflows/pr_code_changes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,8 @@ jobs:
uses: astral-sh/setup-uv@v5
- name: Check lock file is up-to-date
run: |
uv lock --upgrade
git diff --exit-code uv.lock || {
echo "::error::uv.lock is outdated. Run 'uv lock --upgrade' and commit the changes."
uv lock --locked || {
echo "::error::uv.lock is outdated. Run 'uv lock' and commit the changes."
exit 1
}

Expand Down
66 changes: 20 additions & 46 deletions .github/workflows/reusable_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,22 @@ on:
required: false
POLICYENGINE_US_DATA_GITHUB_TOKEN:
required: false
MODAL_TOKEN_ID:
required: false
MODAL_TOKEN_SECRET:
required: false

jobs:
test:
runs-on: self-hosted
runs-on: ubuntu-latest
permissions:
contents: write # Required for GitHub Pages deploy
id-token: write # Required for GCP auth
contents: write
id-token: write
env:
HUGGING_FACE_TOKEN: ${{ secrets.HUGGING_FACE_TOKEN }}
POLICYENGINE_US_DATA_GITHUB_TOKEN: ${{ secrets.POLICYENGINE_US_DATA_GITHUB_TOKEN }}
MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
steps:
- name: Checkout repo
uses: actions/checkout@v4
Expand All @@ -50,57 +56,25 @@ jobs:
with:
node-version: '24'

- uses: "google-github-actions/auth@v2"
if: inputs.upload_data
with:
workload_identity_provider: "projects/322898545428/locations/global/workloadIdentityPools/policyengine-research-id-pool/providers/prod-github-provider"
service_account: "policyengine-research@policyengine-research.iam.gserviceaccount.com"

- name: Install package
run: uv sync --dev

- name: Download data inputs
if: inputs.full_suite
run: uv run make download

# Temporarily disabled - database target causing issues
# - name: Create and load calibration targets database
# if: inputs.full_suite
# run: make database

- name: Build datasets
- name: Install Modal CLI
if: inputs.full_suite
run: uv run make data
env:
TEST_LITE: ${{ !inputs.upload_data }}
PYTHON_LOG_LEVEL: INFO
run: pip install modal

- name: Build datasets for local area calibration
- name: Run data build and tests on Modal
if: inputs.full_suite
run: |
LOCAL_AREA_CALIBRATION=true uv run python policyengine_us_data/datasets/cps/cps.py
LOCAL_AREA_CALIBRATION=true uv run python policyengine_us_data/datasets/puf/puf.py
LOCAL_AREA_CALIBRATION=true uv run python policyengine_us_data/datasets/cps/extended_cps.py
uv run python policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py 10500
modal run modal_app/data_build.py \
${{ inputs.upload_data && '--upload' || '--no-upload' }} \
--branch=${{ github.head_ref || github.ref_name }} \
${{ inputs.upload_data && '--no-test-lite' || '--test-lite' }}

- name: Run local area calibration tests
if: inputs.full_suite
run: uv run pytest policyengine_us_data/tests/test_local_area_calibration/ -v

- name: Save calibration log
if: inputs.full_suite
uses: actions/upload-artifact@v4
with:
name: calibration_log.csv
path: calibration_log.csv
- name: Install package
run: uv sync --dev

- name: Run tests
- name: Run basic tests
if: ${{ !inputs.full_suite }}
run: uv run pytest

- name: Upload data
if: inputs.upload_data
run: uv run make upload

- name: Test documentation builds
run: uv run make documentation
env:
Expand Down
7 changes: 7 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- bump: minor
changes:
added:
- Modal integration for CI/CD workflows, replacing self-hosted GCP runners
changed:
- Updated reusable_test.yaml to trigger data builds on Modal
- Updated local_area_publish.yaml to run on Modal
1 change: 1 addition & 0 deletions modal_app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Modal application for policyengine-us-data CI/CD
184 changes: 184 additions & 0 deletions modal_app/data_build.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import os
import subprocess
import modal

# Modal app under which the remote build function below is registered.
app = modal.App("policyengine-us-data")

# Named secrets configured in the Modal workspace. Listing them on a
# function exposes their values to that function's environment
# (build_datasets reads GOOGLE_APPLICATION_CREDENTIALS_JSON from env;
# presumably the HF token arrives the same way — confirm in Modal dashboard).
hf_secret = modal.Secret.from_name("huggingface-token")
gcp_secret = modal.Secret.from_name("gcp-credentials")

# Container image for the build: Debian slim with Python 3.13, git, and the
# package's runtime/test dependencies preinstalled so the editable install
# of the cloned repo has little left to resolve.
image = (
    modal.Image.debian_slim(python_version="3.13")
    .apt_install("git")
    .pip_install(
        "policyengine-us>=1.353.0",
        "policyengine-core>=3.19.0",
        "pandas>=2.3.1",
        "requests>=2.25.0",
        "tqdm>=4.60.0",
        "microdf_python>=1.0.0",
        "microimpute>=1.1.4",
        "google-cloud-storage>=2.0.0",
        "google-auth>=2.0.0",
        "scipy>=1.15.3",
        "statsmodels>=0.14.5",
        "openpyxl>=3.1.5",
        "tables>=3.10.2",
        "torch>=2.7.1",
        "us>=2.0.0",
        "sqlalchemy>=2.0.41",
        "sqlmodel>=0.0.24",
        "xlrd>=2.0.2",
        "huggingface_hub",
        "pytest",
    )
)

# Repository cloned (at a caller-chosen branch) inside the Modal container.
REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"


def setup_gcp_credentials():
    """Write GCP credentials JSON to a temp file for google.auth.default().

    Reads the service-account JSON from the
    GOOGLE_APPLICATION_CREDENTIALS_JSON environment variable, writes it to
    /tmp/gcp-credentials.json with owner-only permissions, and points
    GOOGLE_APPLICATION_CREDENTIALS at that file so Google client libraries
    pick it up.

    Returns:
        The credentials file path, or None when the env var is unset/empty.
    """
    creds_json = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS_JSON")
    if not creds_json:
        return None
    creds_path = "/tmp/gcp-credentials.json"
    # The file holds a service-account key: create it 0o600 so other users
    # on the host cannot read it (a plain open() would use the umask).
    fd = os.open(creds_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        f.write(creds_json)
    # O_CREAT's mode is ignored for a pre-existing file; enforce it anyway.
    os.chmod(creds_path, 0o600)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
    return creds_path


@app.function(
    image=image,
    secrets=[hf_secret, gcp_secret],
    memory=32768,
    cpu=8.0,
    timeout=14400,
)
def build_datasets(
    upload: bool = False,
    branch: str = "main",
    test_lite: bool = False,
):
    """Clone the repo, build all datasets, run the test suites, optionally upload.

    Runs remotely on Modal (32 GB / 8 CPUs; timeout 14400 — presumably
    seconds, i.e. 4 h, per Modal's timeout semantics — confirm).

    Args:
        upload: When True, run the upload script after all tests pass.
        branch: Git branch of policyengine-us-data to clone and build.
        test_lite: When True, export TEST_LITE=true for the main build/tests
            (the local-area calibration build deliberately omits it).

    Returns:
        A success message string; any failing step raises
        subprocess.CalledProcessError via check=True.
    """
    # Materialize the GCP key from the Modal secret before anything needs GCS.
    setup_gcp_credentials()

    # Fresh clone of the requested branch, installed editable with dev extras.
    os.chdir("/root")
    subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True)
    os.chdir("policyengine-us-data")
    subprocess.run(["pip", "install", "-e", ".[dev]"], check=True)

    # Environment for the main build; TEST_LITE only when requested.
    env = os.environ.copy()
    if test_lite:
        env["TEST_LITE"] = "true"

    # Download private input data needed by the dataset builds.
    subprocess.run(
        [
            "python",
            "policyengine_us_data/storage/download_private_prerequisites.py",
        ],
        check=True,
        env=env,
    )

    # Build main datasets, in dependency order (each script feeds the next).
    scripts = [
        "policyengine_us_data/utils/uprating.py",
        "policyengine_us_data/datasets/acs/acs.py",
        "policyengine_us_data/datasets/cps/cps.py",
        "policyengine_us_data/datasets/puf/irs_puf.py",
        "policyengine_us_data/datasets/puf/puf.py",
        "policyengine_us_data/datasets/cps/extended_cps.py",
        "policyengine_us_data/datasets/cps/enhanced_cps.py",
        "policyengine_us_data/datasets/cps/small_enhanced_cps.py",
    ]
    for script in scripts:
        print(f"Running {script}...")
        subprocess.run(["python", script], check=True, env=env)

    # Keep the dense build under a new name, then publish the sparse variant
    # under the canonical enhanced_cps_2024.h5 filename.
    os.rename(
        "policyengine_us_data/storage/enhanced_cps_2024.h5",
        "policyengine_us_data/storage/dense_enhanced_cps_2024.h5",
    )
    subprocess.run(
        [
            "cp",
            "policyengine_us_data/storage/sparse_enhanced_cps_2024.h5",
            "policyengine_us_data/storage/enhanced_cps_2024.h5",
        ],
        check=True,
    )

    # Build local area calibration datasets (without TEST_LITE - must match full dataset)
    print("Building local area calibration datasets...")
    local_area_env = os.environ.copy()
    local_area_env["LOCAL_AREA_CALIBRATION"] = "true"

    subprocess.run(
        ["python", "policyengine_us_data/datasets/cps/cps.py"],
        check=True,
        env=local_area_env,
    )
    subprocess.run(
        ["python", "policyengine_us_data/datasets/puf/puf.py"],
        check=True,
        env=local_area_env,
    )
    subprocess.run(
        ["python", "policyengine_us_data/datasets/cps/extended_cps.py"],
        check=True,
        env=local_area_env,
    )
    # "10500" is passed as a CLI argument to the stratification script;
    # its meaning is defined by that script (not visible here).
    subprocess.run(
        [
            "python",
            "policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py",
            "10500",
        ],
        check=True,
        env=local_area_env,
    )

    # Run local area calibration tests
    print("Running local area calibration tests...")
    subprocess.run(
        [
            "pytest",
            "policyengine_us_data/tests/test_local_area_calibration/",
            "-v",
        ],
        check=True,
        env=local_area_env,
    )

    # Run main test suite (honors TEST_LITE when set above).
    print("Running main test suite...")
    subprocess.run(["pytest"], check=True, env=env)

    # Upload built datasets only when explicitly requested and tests passed.
    if upload:
        subprocess.run(
            [
                "python",
                "policyengine_us_data/storage/upload_completed_datasets.py",
            ],
            check=True,
            env=env,
        )

    return "Data build and tests completed successfully"


@app.local_entrypoint()
def main(
    upload: bool = False,
    branch: str = "main",
    test_lite: bool = False,
):
    """CLI entrypoint: launch the remote build on Modal and print its result."""
    print(
        build_datasets.remote(
            upload=upload,
            branch=branch,
            test_lite=test_lite,
        )
    )
Loading