diff --git a/modal_app/data_build.py b/modal_app/data_build.py
index 5492e586..52803568 100644
--- a/modal_app/data_build.py
+++ b/modal_app/data_build.py
@@ -10,28 +10,7 @@
 image = (
     modal.Image.debian_slim(python_version="3.13")
     .apt_install("git")
-    .pip_install(
-        "policyengine-us>=1.353.0",
-        "policyengine-core>=3.19.0",
-        "pandas>=2.3.1",
-        "requests>=2.25.0",
-        "tqdm>=4.60.0",
-        "microdf_python>=1.0.0",
-        "microimpute>=1.1.4",
-        "google-cloud-storage>=2.0.0",
-        "google-auth>=2.0.0",
-        "scipy>=1.15.3",
-        "statsmodels>=0.14.5",
-        "openpyxl>=3.1.5",
-        "tables>=3.10.2",
-        "torch>=2.7.1",
-        "us>=2.0.0",
-        "sqlalchemy>=2.0.41",
-        "sqlmodel>=0.0.24",
-        "xlrd>=2.0.2",
-        "huggingface_hub",
-        "pytest",
-    )
+    .pip_install("uv")
 )
 
 REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
@@ -66,7 +45,8 @@ def build_datasets(
     os.chdir("/root")
     subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True)
     os.chdir("policyengine-us-data")
-    subprocess.run(["pip", "install", "-e", ".[dev]"], check=True)
+    # Use uv sync to install exact versions from uv.lock
+    subprocess.run(["uv", "sync", "--locked"], check=True)
 
     env = os.environ.copy()
     if test_lite:
@@ -75,6 +55,8 @@ def build_datasets(
     # Download prerequisites
     subprocess.run(
         [
+            "uv",
+            "run",
             "python",
             "policyengine_us_data/storage/download_private_prerequisites.py",
         ],
@@ -95,7 +77,7 @@ def build_datasets(
     ]
     for script in scripts:
         print(f"Running {script}...")
-        subprocess.run(["python", script], check=True, env=env)
+        subprocess.run(["uv", "run", "python", script], check=True, env=env)
 
     os.rename(
         "policyengine_us_data/storage/enhanced_cps_2024.h5",
@@ -116,22 +98,29 @@ def build_datasets(
         local_area_env["LOCAL_AREA_CALIBRATION"] = "true"
 
         subprocess.run(
-            ["python", "policyengine_us_data/datasets/cps/cps.py"],
+            ["uv", "run", "python", "policyengine_us_data/datasets/cps/cps.py"],
             check=True,
             env=local_area_env,
         )
         subprocess.run(
-            ["python", "policyengine_us_data/datasets/puf/puf.py"],
+            ["uv", "run", "python", "policyengine_us_data/datasets/puf/puf.py"],
             check=True,
             env=local_area_env,
         )
         subprocess.run(
-            ["python", "policyengine_us_data/datasets/cps/extended_cps.py"],
+            [
+                "uv",
+                "run",
+                "python",
+                "policyengine_us_data/datasets/cps/extended_cps.py",
+            ],
             check=True,
             env=local_area_env,
        )
         subprocess.run(
             [
+                "uv",
+                "run",
                 "python",
                 "policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py",
                 "10500",
@@ -144,6 +133,8 @@ def build_datasets(
         print("Running local area calibration tests...")
         subprocess.run(
             [
+                "uv",
+                "run",
                 "pytest",
                 "policyengine_us_data/tests/test_local_area_calibration/",
                 "-v",
@@ -154,12 +145,14 @@ def build_datasets(
 
     # Run main test suite
     print("Running main test suite...")
-    subprocess.run(["pytest"], check=True, env=env)
+    subprocess.run(["uv", "run", "pytest"], check=True, env=env)
 
     # Upload if requested
     if upload:
         subprocess.run(
             [
+                "uv",
+                "run",
                 "python",
                 "policyengine_us_data/storage/upload_completed_datasets.py",
             ],
- "tables>=3.10.2", - "torch>=2.7.1", - "us>=2.0.0", - "sqlalchemy>=2.0.41", - "sqlmodel>=0.0.24", - "xlrd>=2.0.2", - "huggingface_hub", - ) + .pip_install("uv") ) REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" @@ -61,10 +41,13 @@ def publish_all_local_areas(branch: str = "main"): os.chdir("/root") subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) os.chdir("policyengine-us-data") - subprocess.run(["pip", "install", "-e", "."], check=True) + # Use uv sync to install exact versions from uv.lock + subprocess.run(["uv", "sync", "--locked"], check=True) subprocess.run( [ + "uv", + "run", "python", "policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py", ], diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py b/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py index a3b8c19e..780bc4c7 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py @@ -123,3 +123,66 @@ def assign_counties_for_cd( weights = list(dist.values()) selected = random.choices(counties, weights=weights, k=n_households) return np.array([get_county_index(c) for c in selected], dtype=np.int32) + + +def get_county_filter_probability( + cd_geoid: str, + county_filter: set, +) -> float: + """ + Calculate P(county in filter | CD). + + Returns the probability that a household in this CD would be in the + target area (e.g., NYC). Used for weight scaling when building + city-level datasets. + + Args: + cd_geoid: Congressional district geoid (e.g., "3610") + county_filter: Set of county names that define the target area + + Returns: + Probability between 0 and 1 + """ + cd_key = str(int(cd_geoid)) + + if cd_key in _CD_COUNTY_DISTRIBUTIONS: + dist = _CD_COUNTY_DISTRIBUTIONS[cd_key] + else: + dist = _generate_uniform_distribution(cd_key) + + return sum( + prob for county, prob in dist.items() if county in county_filter + ) + + +def get_filtered_county_distribution( + cd_geoid: str, + county_filter: set, +) -> Dict[str, float]: + """ + Get normalized distribution over target counties only. + + Used when building city-level datasets to assign only valid counties + while maintaining relative proportions within the target area. + + Args: + cd_geoid: Congressional district geoid (e.g., "3610") + county_filter: Set of county names that define the target area + + Returns: + Dictionary mapping county names to normalized probabilities. + Empty dict if CD has no overlap with target area. 
+ """ + cd_key = str(int(cd_geoid)) + + if cd_key in _CD_COUNTY_DISTRIBUTIONS: + dist = _CD_COUNTY_DISTRIBUTIONS[cd_key] + else: + dist = _generate_uniform_distribution(cd_key) + + filtered = {c: p for c, p in dist.items() if c in county_filter} + total = sum(filtered.values()) + + if total > 0: + return {c: p / total for c, p in filtered.items()} + return {} diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py index 9989928c..c3559071 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py @@ -26,6 +26,8 @@ ) from policyengine_us_data.datasets.cps.local_area_calibration.county_assignment import ( assign_counties_for_cd, + get_county_filter_probability, + get_filtered_county_distribution, ) NYC_COUNTIES = { @@ -65,6 +67,7 @@ def create_sparse_cd_stacked_dataset( output_path=None, dataset_path=None, county_filter=None, + seed: int = 42, ): """ Create a SPARSE congressional district-stacked dataset using DataFrame approach. @@ -80,6 +83,8 @@ def create_sparse_cd_stacked_dataset( dataset_path: Path to the base .h5 dataset used during calibration. county_filter: Optional set of county names to filter to. Only households assigned to these counties will be included. Used for city-level datasets. + seed: Base random seed for county assignment. Each CD gets seed + int(cd_geoid) + for deterministic, order-independent results. Default 42. Returns: output_path: Path to the saved .h5 file. @@ -208,7 +213,16 @@ def create_sparse_cd_stacked_dataset( # Get this CD's calibrated weights from the weight matrix calibrated_weights_for_cd = W[ cd_idx, : - ] # Get this CD's row from weight matrix + ].copy() # Get this CD's row from weight matrix + + # For city datasets: scale weights by P(target|CD) + # This preserves the representative sample while adjusting for target population + if county_filter is not None: + p_target = get_county_filter_probability(cd_geoid, county_filter) + if p_target == 0: + # CD has no overlap with target area, skip entirely + continue + calibrated_weights_for_cd = calibrated_weights_for_cd * p_target # Map the calibrated weights to household IDs hh_weight_values = [] @@ -325,23 +339,31 @@ def create_sparse_cd_stacked_dataset( ) # Set county for this CD - county_indices = assign_counties_for_cd( - cd_geoid=cd_geoid, n_households=n_households_orig, seed=42 + idx - ) - cd_sim.set_input("county", time_period, county_indices) - - # Filter to only households assigned to specified counties (e.g., NYC) + # For city datasets: use only target counties (normalized distribution) if county_filter is not None: - filtered_household_ids = set() - for hh_idx in active_household_indices: - county_name = get_county_name(county_indices[hh_idx]) - if county_name in county_filter: - filtered_household_ids.add(household_ids[hh_idx]) - - active_household_ids = filtered_household_ids - - if len(active_household_ids) == 0: + filtered_dist = get_filtered_county_distribution( + cd_geoid, county_filter + ) + if not filtered_dist: + # Should not happen if we already checked p_target > 0 continue + county_indices = assign_counties_for_cd( + cd_geoid=cd_geoid, + n_households=n_households_orig, + seed=seed + int(cd_geoid), + distributions={cd_geoid: filtered_dist}, + ) + else: + county_indices = assign_counties_for_cd( + cd_geoid=cd_geoid, + 
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py
index 9989928c..c3559071 100644
--- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py
+++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py
@@ -26,6 +26,8 @@
 )
 from policyengine_us_data.datasets.cps.local_area_calibration.county_assignment import (
     assign_counties_for_cd,
+    get_county_filter_probability,
+    get_filtered_county_distribution,
 )
 
 NYC_COUNTIES = {
@@ -65,6 +67,7 @@ def create_sparse_cd_stacked_dataset(
     output_path=None,
     dataset_path=None,
     county_filter=None,
+    seed: int = 42,
 ):
     """
     Create a SPARSE congressional district-stacked dataset using DataFrame approach.
@@ -80,6 +83,8 @@ def create_sparse_cd_stacked_dataset(
         dataset_path: Path to the base .h5 dataset used during calibration.
         county_filter: Optional set of county names to filter to. Only households
             assigned to these counties will be included. Used for city-level datasets.
+        seed: Base random seed for county assignment. Each CD gets seed + int(cd_geoid)
+            for deterministic, order-independent results. Default 42.
 
     Returns:
         output_path: Path to the saved .h5 file.
@@ -208,7 +213,16 @@ def create_sparse_cd_stacked_dataset(
         # Get this CD's calibrated weights from the weight matrix
         calibrated_weights_for_cd = W[
             cd_idx, :
-        ]  # Get this CD's row from weight matrix
+        ].copy()  # Get this CD's row from weight matrix
+
+        # For city datasets: scale weights by P(target|CD)
+        # This preserves the representative sample while adjusting for target population
+        if county_filter is not None:
+            p_target = get_county_filter_probability(cd_geoid, county_filter)
+            if p_target == 0:
+                # CD has no overlap with target area, skip entirely
+                continue
+            calibrated_weights_for_cd = calibrated_weights_for_cd * p_target
 
         # Map the calibrated weights to household IDs
         hh_weight_values = []
@@ -325,23 +339,31 @@ def create_sparse_cd_stacked_dataset(
         )
 
         # Set county for this CD
-        county_indices = assign_counties_for_cd(
-            cd_geoid=cd_geoid, n_households=n_households_orig, seed=42 + idx
-        )
-        cd_sim.set_input("county", time_period, county_indices)
-
-        # Filter to only households assigned to specified counties (e.g., NYC)
+        # For city datasets: use only target counties (normalized distribution)
         if county_filter is not None:
-            filtered_household_ids = set()
-            for hh_idx in active_household_indices:
-                county_name = get_county_name(county_indices[hh_idx])
-                if county_name in county_filter:
-                    filtered_household_ids.add(household_ids[hh_idx])
-
-            active_household_ids = filtered_household_ids
-
-            if len(active_household_ids) == 0:
+            filtered_dist = get_filtered_county_distribution(
+                cd_geoid, county_filter
+            )
+            if not filtered_dist:
+                # Should not happen if we already checked p_target > 0
                 continue
+            county_indices = assign_counties_for_cd(
+                cd_geoid=cd_geoid,
+                n_households=n_households_orig,
+                seed=seed + int(cd_geoid),
+                distributions={cd_geoid: filtered_dist},
+            )
+        else:
+            county_indices = assign_counties_for_cd(
+                cd_geoid=cd_geoid,
+                n_households=n_households_orig,
+                seed=seed + int(cd_geoid),
+            )
+        cd_sim.set_input("county", time_period, county_indices)
+
+        # Note: We no longer use binary filtering for county_filter.
+        # Instead, weights are scaled by P(target|CD) and all households
+        # are included to avoid sample selection bias.
 
         geoadj = cd_geoadj_values[cd_geoid]
         new_spm_thresholds = calculate_spm_thresholds_for_cd(
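
To see why the weight-scaling path avoids the selection bias of the old binary filter, consider a minimal sketch (toy weights and a toy P(target|CD); nothing here comes from a real CD):

import numpy as np

# Three households in a CD whose NYC share is P(target|CD) = 0.24.
weights = np.array([10.0, 20.0, 30.0])
p_target = 0.24

# Old approach: randomly drop ~76% of households, keeping full weights.
# A small CD sample can easily lose its high- or low-income tail entirely.

# New approach: keep every household, scale its weight instead.
scaled = weights * p_target
print(scaled.sum())  # ~14.4 = 0.24 * 60.0, the CD's expected NYC population

Both schemes give the same expected city population, but the scaled version keeps the full representative sample in every CD, which is what the Note comment in the hunk above describes.
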
diff --git a/policyengine_us_data/storage/calibration_targets/audit_county_enum.py b/policyengine_us_data/storage/calibration_targets/audit_county_enum.py
new file mode 100644
index 00000000..4849a10e
--- /dev/null
+++ b/policyengine_us_data/storage/calibration_targets/audit_county_enum.py
@@ -0,0 +1,160 @@
+"""
+Audit County enum against Census 2020 data.
+
+Identifies bogus entries (counties assigned to wrong states, non-existent
+combinations, encoding issues) and generates the INVALID_COUNTY_NAMES set
+for use in county_assignment.py.
+"""
+
+import re
+import requests
+import pandas as pd
+from io import StringIO
+from collections import defaultdict
+
+from policyengine_us.variables.household.demographic.geographic.county.county_enum import (
+    County,
+)
+
+
+def audit_county_enum():
+    """
+    Compare County enum entries against Census 2020 county reference.
+
+    Returns categorized invalid entries plus a county-to-states mapping:
+    - wrong_state: county exists but in different state
+    - non_existent: county name doesn't exist anywhere
+    - encoding_issue: likely character encoding mismatch
+    """
+    print("Downloading Census 2020 county reference...")
+    url = "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt"
+    response = requests.get(url, timeout=60)
+    census_df = pd.read_csv(
+        StringIO(response.text),
+        delimiter="|",
+        dtype=str,
+        usecols=["STATE", "STATEFP", "COUNTYFP", "COUNTYNAME"],
+    )
+
+    # Build Census valid (state, normalized_county_name) pairs
+    census_valid = set()
+    county_to_states = defaultdict(set)
+
+    for _, row in census_df.iterrows():
+        state = row["STATE"]
+        county_name = row["COUNTYNAME"].upper()
+        # Apply same normalization as make_county_cd_distributions.py
+        normalized = re.sub(r"[.'\"]", "", county_name)
+        normalized = normalized.replace("-", "_")
+        normalized = normalized.replace(" ", "_")
+
+        census_valid.add((state, normalized))
+        county_to_states[normalized].add(state)
+
+    print(f"Census has {len(census_valid)} valid (state, county) pairs")
+
+    # Audit each County enum entry
+    invalid_entries = {
+        "wrong_state": [],
+        "non_existent": [],
+        "encoding_issue": [],
+    }
+    valid_count = 0
+
+    for name in County._member_names_:
+        if name == "UNKNOWN":
+            continue
+
+        # Parse state code (last 2 chars)
+        state = name[-2:]
+        county_part = name[:-3]  # Remove _XX suffix
+
+        if (state, county_part) in census_valid:
+            valid_count += 1
+        else:
+            # Check if county exists in any state
+            if county_part in county_to_states:
+                correct_states = county_to_states[county_part]
+                invalid_entries["wrong_state"].append(
+                    (name, state, list(correct_states))
+                )
+            elif "Ñ" in name or "Í" in name or "Ó" in name or "Á" in name:
+                invalid_entries["encoding_issue"].append((name, state))
+            else:
+                invalid_entries["non_existent"].append((name, state))
+
+    print("\nAudit Results:")
+    print(f"  Valid entries: {valid_count}")
+    print(
+        f"  Wrong state: {len(invalid_entries['wrong_state'])} "
+        "(county exists in different state)"
+    )
+    print(
+        f"  Non-existent: {len(invalid_entries['non_existent'])} "
+        "(county name doesn't exist)"
+    )
+    print(
+        f"  Encoding issues: {len(invalid_entries['encoding_issue'])} "
+        "(special character mismatch)"
+    )
+
+    total_invalid = sum(len(v) for v in invalid_entries.values())
+    print(f"  TOTAL INVALID: {total_invalid}")
+
+    return invalid_entries, county_to_states
+
+
+def print_categorized_report(invalid_entries, county_to_states):
+    """Print detailed report of invalid entries."""
+    print("\n" + "=" * 60)
+    print("WRONG STATE ASSIGNMENTS")
+    print("=" * 60)
+    for name, wrong_state, correct_states in sorted(
+        invalid_entries["wrong_state"]
+    ):
+        print(f"  {name}")
+        print(f"    Listed as: {wrong_state}")
+        print(f"    Actually exists in: {', '.join(sorted(correct_states))}")
+
+    print("\n" + "=" * 60)
+    print("NON-EXISTENT COMBINATIONS")
+    print("=" * 60)
+    for name, state in sorted(invalid_entries["non_existent"]):
+        print(f"  {name}")
+
+    print("\n" + "=" * 60)
+    print("ENCODING ISSUES")
+    print("=" * 60)
+    for name, state in sorted(invalid_entries["encoding_issue"]):
+        print(f"  {name}")
+
+
+def generate_invalid_county_names_set(invalid_entries):
+    """Generate Python set literal for INVALID_COUNTY_NAMES."""
+    all_invalid = []
+
+    for name, _, _ in invalid_entries["wrong_state"]:
+        all_invalid.append(name)
+    for name, _ in invalid_entries["non_existent"]:
+        all_invalid.append(name)
+    for name, _ in invalid_entries["encoding_issue"]:
+        all_invalid.append(name)
+
+    all_invalid.sort()
+
+    print("\n" + "=" * 60)
+    print("INVALID_COUNTY_NAMES SET (copy to county_assignment.py)")
+    print("=" * 60)
+    print("INVALID_COUNTY_NAMES = {")
+    for name in all_invalid:
+        print(f'    "{name}",')
+    print("}")
+
+    return set(all_invalid)
+
+
+if __name__ == "__main__":
+    invalid_entries, county_to_states = audit_county_enum()
+    print_categorized_report(invalid_entries, county_to_states)
+    invalid_set = generate_invalid_county_names_set(invalid_entries)
+    print(f"\nTotal entries to exclude: {len(invalid_set)}")
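
For reference, the normalization the audit applies to Census names (mirroring make_county_cd_distributions.py) behaves like this; the sample name is illustrative:

import re

name = "St. Mary's County".upper()        # -> "ST. MARY'S COUNTY"
normalized = re.sub(r"[.'\"]", "", name)  # drop periods/apostrophes/quotes
normalized = normalized.replace("-", "_").replace(" ", "_")
assert normalized == "ST_MARYS_COUNTY"

Running the file directly (it has a __main__ guard) prints the categorized report and a ready-to-paste INVALID_COUNTY_NAMES set.
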
diff --git a/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py b/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py
index 4ada2e39..ba68a556 100644
--- a/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py
+++ b/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py
@@ -141,6 +141,13 @@ def build_county_cd_distributions():
     cd_totals = cd_county_pop.groupby("cd_geoid")["POP20"].transform("sum")
     cd_county_pop["probability"] = cd_county_pop["POP20"] / cd_totals
 
+    # Step 5b: Filter out zero-probability entries (unpopulated county-CD pairs)
+    pre_filter_count = len(cd_county_pop)
+    cd_county_pop = cd_county_pop[cd_county_pop["probability"] > 0]
+    filtered_count = pre_filter_count - len(cd_county_pop)
+    if filtered_count > 0:
+        print(f"  Filtered out {filtered_count} zero-probability entries")
+
     # Step 6: Map county FIPS to enum names
     print("\nMapping county FIPS to enum names...")
     fips_to_enum = build_county_fips_to_enum_mapping()
diff --git a/policyengine_us_data/storage/county_cd_distributions.csv b/policyengine_us_data/storage/county_cd_distributions.csv
index 6c3f06d8..eb2900ca 100644
--- a/policyengine_us_data/storage/county_cd_distributions.csv
+++ b/policyengine_us_data/storage/county_cd_distributions.csv
@@ -224,7 +224,6 @@ cd_geoid,county_name,probability
 602,MENDOCINO_COUNTY_CA,0.11962446685419817
 602,DEL_NORTE_COUNTY_CA,0.0362304077896095
 602,TRINITY_COUNTY_CA,0.02104113939754851
-602,SAN_FRANCISCO_COUNTY_CA,0.0
 603,PLACER_COUNTY_CA,0.5293266024090083
 603,SACRAMENTO_COUNTY_CA,0.16285523717353492
 603,NEVADA_COUNTY_CA,0.13371303767835424
@@ -261,7 +260,6 @@ cd_geoid,county_name,probability
 610,ALAMEDA_COUNTY_CA,0.0736834213288507
 611,SAN_FRANCISCO_COUNTY_CA,1.0
 612,ALAMEDA_COUNTY_CA,1.0
-612,SAN_FRANCISCO_COUNTY_CA,0.0
 613,MERCED_COUNTY_CA,0.3661141143017842
 613,STANISLAUS_COUNTY_CA,0.29177472945244715
 613,MADERA_COUNTY_CA,0.16385443031382474
@@ -340,7 +338,6 @@ cd_geoid,county_name,probability
 652,SAN_DIEGO_COUNTY_CA,1.0
 801,DENVER_COUNTY_CO,0.9898907323399574
 801,ARAPAHOE_COUNTY_CO,0.010109267660042621
-801,JEFFERSON_COUNTY_CO,0.0
 802,BOULDER_COUNTY_CO,0.45829130410685587
 802,LARIMER_COUNTY_CO,0.3250554231557945
 802,EAGLE_COUNTY_CO,0.06277642298952503
@@ -352,7 +349,6 @@ cd_geoid,county_name,probability
 802,GILPIN_COUNTY_CO,0.008047442221360085
 802,JEFFERSON_COUNTY_CO,0.0025674776921797925
 802,JACKSON_COUNTY_CO,0.0019107132960150752
-802,BROOMFIELD_COUNTY_CO,0.0
 803,PUEBLO_COUNTY_CO,0.23299848973993045
 803,MESA_COUNTY_CO,0.21573580147700663
 803,GARFIELD_COUNTY_CO,0.08546824989954692
@@ -417,8 +413,6 @@ cd_geoid,county_name,probability
 807,CUSTER_COUNTY_CO,0.006517970838177241
 807,ADAMS_COUNTY_CO,0.004007221867348762
 807,EL_PASO_COUNTY_CO,0.0008618575385514974
-807,BOULDER_COUNTY_CO,0.0
-807,WELD_COUNTY_CO,0.0
 808,ADAMS_COUNTY_CO,0.6325353627172756
 808,WELD_COUNTY_CO,0.34371674352607307
 808,LARIMER_COUNTY_CO,0.023747893756651296
@@ -489,7 +483,6 @@ cd_geoid,county_name,probability
 1206,ST_JOHNS_COUNTY_FL,0.05089174632517833
 1207,SEMINOLE_COUNTY_FL,0.6121205739312889
 1207,VOLUSIA_COUNTY_FL,0.38787942606871106
-1207,ORANGE_COUNTY_FL,0.0
 1208,BREVARD_COUNTY_FL,0.7886056152913142
 1208,INDIAN_RIVER_COUNTY_FL,0.2077270381333843
 1208,ORANGE_COUNTY_FL,0.0036673465753015062
@@ -504,7 +497,6 @@ cd_geoid,county_name,probability
 1212,PASCO_COUNTY_FL,0.5471288485363764
 1212,HERNANDO_COUNTY_FL,0.2528727114834358
 1212,CITRUS_COUNTY_FL,0.19999843998018774
-1212,MARION_COUNTY_FL,0.0
 1213,PINELLAS_COUNTY_FL,1.0
 1214,HILLSBOROUGH_COUNTY_FL,0.7531450649423248
 1214,PINELLAS_COUNTY_FL,0.24685493505767522
@@ -2668,9 +2660,6 @@ cd_geoid,county_name,probability
 4102,GILLIAM_COUNTY_OR,0.0028249427577388565
 4102,SHERMAN_COUNTY_OR,0.002647941331815369
 4102,WHEELER_COUNTY_OR,0.0020546325521198397
-4102,CLACKAMAS_COUNTY_OR,0.0
-4102,CURRY_COUNTY_OR,0.0
-4102,MARION_COUNTY_OR,0.0
 4103,MULTNOMAH_COUNTY_OR,0.850297857999544
 4103,CLACKAMAS_COUNTY_OR,0.11575043648551633
 4103,HOOD_RIVER_COUNTY_OR,0.03395170551493963
@@ -2681,20 +2670,17 @@ cd_geoid,county_name,probability
 4104,LINCOLN_COUNTY_OR,0.07135999592188137
 4104,CURRY_COUNTY_OR,0.03319985046898364
 4104,LINN_COUNTY_OR,0.0014967261769903485
-4104,POLK_COUNTY_OR,0.0
 4105,CLACKAMAS_COUNTY_OR,0.42576630997339315
 4105,DESCHUTES_COUNTY_OR,0.2463321764520135
 4105,LINN_COUNTY_OR,0.18061650304654855
 4105,MULTNOMAH_COUNTY_OR,0.07367932156061449
 4105,MARION_COUNTY_OR,0.07357736873928257
 4105,JEFFERSON_COUNTY_OR,2.832022814775796e-05
-4105,BENTON_COUNTY_OR,0.0
 4106,MARION_COUNTY_OR,0.4162475290705907
 4106,WASHINGTON_COUNTY_OR,0.2522202964548889
 4106,YAMHILL_COUNTY_OR,0.1525349328530243
 4106,POLK_COUNTY_OR,0.12380559945172272
 4106,CLACKAMAS_COUNTY_OR,0.05519164216977338
-4106,LINN_COUNTY_OR,0.0
 4201,BUCKS_COUNTY_PA,0.8452957772995531
 4201,MONTGOMERY_COUNTY_PA,0.15470422270044687
 4202,PHILADELPHIA_COUNTY_PA,1.0
@@ -3144,7 +3130,6 @@ cd_geoid,county_name,probability
 4814,JEFFERSON_COUNTY_TX,0.2411331613182492
 4814,BRAZORIA_COUNTY_TX,0.19107364270841617
 4814,ORANGE_COUNTY_TX,0.11057293018004216
-4814,CHAMBERS_COUNTY_TX,0.0
 4815,HIDALGO_COUNTY_TX,0.7514664524952835
 4815,GUADALUPE_COUNTY_TX,0.08977596751965809
 4815,WILSON_COUNTY_TX,0.06486811380114657
@@ -3263,7 +3248,6 @@ cd_geoid,county_name,probability
 4826,DENTON_COUNTY_TX,0.8856851550287033
 4826,WISE_COUNTY_TX,0.05998797893575771
 4826,COOKE_COUNTY_TX,0.054326866035539066
-4826,TARRANT_COUNTY_TX,0.0
 4827,NUECES_COUNTY_TX,0.46047455823892713
 4827,VICTORIA_COUNTY_TX,0.11906199192424383
 4827,SAN_PATRICIO_COUNTY_TX,0.08964297960721629
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py b/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py
index a5459cc1..158e0ca6 100644
--- a/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py
+++ b/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py
@@ -10,6 +10,8 @@
     assign_counties_for_cd,
     get_county_index,
     _build_state_counties,
+    get_county_filter_probability,
+    get_filtered_county_distribution,
 )
 
 
@@ -112,3 +114,75 @@ def test_ny_has_nyc_counties(self):
         ]
         for county in nyc_counties:
             assert county in ny_counties, f"Missing NYC county: {county}"
+
+
+class TestInvalidCountyExclusion:
+    """Test that invalid counties are properly excluded."""
+
+    def test_delaware_has_exactly_3_counties(self):
+        """Delaware should have exactly 3 counties (no DORCHESTER)."""
+        state_counties = _build_state_counties()
+        de_counties = state_counties.get("DE", [])
+
+        assert len(de_counties) == 3
+        assert "DORCHESTER_COUNTY_DE" not in de_counties
+
+        expected = {
+            "KENT_COUNTY_DE",
+            "NEW_CASTLE_COUNTY_DE",
+            "SUSSEX_COUNTY_DE",
+        }
+        assert set(de_counties) == expected
+
+    def test_suffolk_county_ct_excluded(self):
+        """Suffolk County, CT should be excluded (doesn't exist)."""
+        state_counties = _build_state_counties()
+        ct_counties = state_counties.get("CT", [])
+        assert "SUFFOLK_COUNTY_CT" not in ct_counties
+
+
+class TestCountyFilterProbability:
+    """Test probability calculations for city datasets."""
+
+    NYC_COUNTIES = {
+        "QUEENS_COUNTY_NY",
+        "BRONX_COUNTY_NY",
+        "RICHMOND_COUNTY_NY",
+        "NEW_YORK_COUNTY_NY",
+        "KINGS_COUNTY_NY",
+    }
+
+    def test_fully_nyc_cd_has_probability_one(self):
+        """NY-05 (fully in NYC) should have P(NYC|CD) = 1.0."""
+        prob = get_county_filter_probability("3605", self.NYC_COUNTIES)
+        assert prob == pytest.approx(1.0, abs=0.001)
+
+    def test_mixed_cd_has_partial_probability(self):
+        """NY-03 (mixed NYC/suburbs) should have 0 < P(NYC|CD) < 1."""
+        prob = get_county_filter_probability("3603", self.NYC_COUNTIES)
+        assert 0 < prob < 1
+        # Should be approximately 24% based on Census data
+        assert prob == pytest.approx(0.24, abs=0.05)
+
+    def test_non_nyc_cd_has_zero_probability(self):
+        """Non-NY CD should have P(NYC|CD) = 0."""
+        # CA-12 (Alameda County, per county_cd_distributions.csv)
+        prob = get_county_filter_probability("612", self.NYC_COUNTIES)
+        assert prob == 0.0
+
+    def test_filtered_distribution_sums_to_one(self):
+        """Filtered distribution should sum to 1.0."""
+        dist = get_filtered_county_distribution("3603", self.NYC_COUNTIES)
+        if dist:  # Only if CD has overlap
+            assert sum(dist.values()) == pytest.approx(1.0)
+
+    def test_filtered_distribution_only_target_counties(self):
+        """Filtered distribution should only contain target counties."""
+        dist = get_filtered_county_distribution("3603", self.NYC_COUNTIES)
+        for county in dist:
+            assert county in self.NYC_COUNTIES
+
+    def test_filtered_distribution_empty_for_no_overlap(self):
+        """Non-overlapping CD should return empty distribution."""
+        dist = get_filtered_county_distribution("612", self.NYC_COUNTIES)
+        assert dist == {}
diff --git a/uv.lock b/uv.lock
index 7f2e4e5f..24fc4182 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1830,16 +1830,16 @@ wheels = [
 
 [[package]]
 name = "policyengine-us"
-version = "1.497.1"
+version = "1.499.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "microdf-python" },
     { name = "policyengine-core" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fd/51/9df605ac6939ccbd8a93f5fe8a23d08b4b97e3806ea509c022c603e44266/policyengine_us-1.497.1.tar.gz", hash = "sha256:2f5eb011c8c8c205b3d313f42aa52b8356266921f46611ac9346bc04361eff61", size = 8449641, upload-time = "2026-01-06T15:19:16.995Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/44/05/dbaf4b5aec28ce4f72bdff321b4dadc8ff6839d791d087d5c88723d2c083/policyengine_us-1.499.0.tar.gz", hash = "sha256:a16d056f37ad4fd500dc59a9030fab8ac3730df1b553c9e53222a4f932bc5ec9", size = 8460855, upload-time = "2026-01-12T20:04:10.184Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/6d/c877b3e438ae3a8d509161e7439c399629e85647d9238a9d168b06dce21d/policyengine_us-1.497.1-py3-none-any.whl", hash = "sha256:b589e060545f6e38099b0e6233a2ba94195e5c15d53b5aaa8c1efa97b025cd9f", size = 7139280, upload-time = "2026-01-06T15:19:14.666Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/75/348a505f35f60dcbc8af75db868abdf1d6b5d30482338ee5801e8c13689b/policyengine_us-1.499.0-py3-none-any.whl", hash = "sha256:32cd2b6d2c8ac1c7074ebad0343821bed0189bc84d157e54f63eec362bc849bb", size = 7175450, upload-time = "2026-01-12T20:04:07.527Z" },
 ]
 
 [[package]]