diff --git a/modal_app/data_build.py b/modal_app/data_build.py
index 5492e586..52803568 100644
--- a/modal_app/data_build.py
+++ b/modal_app/data_build.py
@@ -10,28 +10,7 @@
 image = (
     modal.Image.debian_slim(python_version="3.13")
     .apt_install("git")
-    .pip_install(
-        "policyengine-us>=1.353.0",
-        "policyengine-core>=3.19.0",
-        "pandas>=2.3.1",
-        "requests>=2.25.0",
-        "tqdm>=4.60.0",
-        "microdf_python>=1.0.0",
-        "microimpute>=1.1.4",
-        "google-cloud-storage>=2.0.0",
-        "google-auth>=2.0.0",
-        "scipy>=1.15.3",
-        "statsmodels>=0.14.5",
-        "openpyxl>=3.1.5",
-        "tables>=3.10.2",
-        "torch>=2.7.1",
-        "us>=2.0.0",
-        "sqlalchemy>=2.0.41",
-        "sqlmodel>=0.0.24",
-        "xlrd>=2.0.2",
-        "huggingface_hub",
-        "pytest",
-    )
+    .pip_install("uv")
 )
 
 REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
@@ -66,7 +45,8 @@ def build_datasets(
     os.chdir("/root")
     subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True)
     os.chdir("policyengine-us-data")
-    subprocess.run(["pip", "install", "-e", ".[dev]"], check=True)
+    # Use uv sync to install exact versions from uv.lock
+    subprocess.run(["uv", "sync", "--locked"], check=True)
 
     env = os.environ.copy()
     if test_lite:
@@ -75,6 +55,8 @@ def build_datasets(
     # Download prerequisites
     subprocess.run(
         [
+            "uv",
+            "run",
             "python",
             "policyengine_us_data/storage/download_private_prerequisites.py",
         ],
@@ -95,7 +77,7 @@ def build_datasets(
     ]
     for script in scripts:
         print(f"Running {script}...")
-        subprocess.run(["python", script], check=True, env=env)
+        subprocess.run(["uv", "run", "python", script], check=True, env=env)
 
     os.rename(
         "policyengine_us_data/storage/enhanced_cps_2024.h5",
@@ -116,22 +98,29 @@ def build_datasets(
         local_area_env["LOCAL_AREA_CALIBRATION"] = "true"
 
         subprocess.run(
-            ["python", "policyengine_us_data/datasets/cps/cps.py"],
+            ["uv", "run", "python", "policyengine_us_data/datasets/cps/cps.py"],
             check=True,
             env=local_area_env,
         )
         subprocess.run(
-            ["python", "policyengine_us_data/datasets/puf/puf.py"],
+            ["uv", "run", "python", "policyengine_us_data/datasets/puf/puf.py"],
             check=True,
             env=local_area_env,
         )
         subprocess.run(
-            ["python", "policyengine_us_data/datasets/cps/extended_cps.py"],
+            [
+                "uv",
+                "run",
+                "python",
+                "policyengine_us_data/datasets/cps/extended_cps.py",
+            ],
             check=True,
             env=local_area_env,
        )
         subprocess.run(
             [
+                "uv",
+                "run",
                 "python",
                 "policyengine_us_data/datasets/cps/local_area_calibration/create_stratified_cps.py",
                 "10500",
@@ -144,6 +133,8 @@ def build_datasets(
         print("Running local area calibration tests...")
         subprocess.run(
             [
+                "uv",
+                "run",
                 "pytest",
                 "policyengine_us_data/tests/test_local_area_calibration/",
                 "-v",
@@ -154,12 +145,14 @@ def build_datasets(
 
     # Run main test suite
     print("Running main test suite...")
-    subprocess.run(["pytest"], check=True, env=env)
+    subprocess.run(["uv", "run", "pytest"], check=True, env=env)
 
     # Upload if requested
     if upload:
         subprocess.run(
             [
+                "uv",
+                "run",
                 "python",
                 "policyengine_us_data/storage/upload_completed_datasets.py",
             ],
- "tables>=3.10.2", - "torch>=2.7.1", - "us>=2.0.0", - "sqlalchemy>=2.0.41", - "sqlmodel>=0.0.24", - "xlrd>=2.0.2", - "huggingface_hub", - ) + .pip_install("uv") ) REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git" @@ -61,10 +41,13 @@ def publish_all_local_areas(branch: str = "main"): os.chdir("/root") subprocess.run(["git", "clone", "-b", branch, REPO_URL], check=True) os.chdir("policyengine-us-data") - subprocess.run(["pip", "install", "-e", "."], check=True) + # Use uv sync to install exact versions from uv.lock + subprocess.run(["uv", "sync", "--locked"], check=True) subprocess.run( [ + "uv", + "run", "python", "policyengine_us_data/datasets/cps/local_area_calibration/publish_local_area.py", ], diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py b/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py index a3b8c19e..780bc4c7 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/county_assignment.py @@ -123,3 +123,66 @@ def assign_counties_for_cd( weights = list(dist.values()) selected = random.choices(counties, weights=weights, k=n_households) return np.array([get_county_index(c) for c in selected], dtype=np.int32) + + +def get_county_filter_probability( + cd_geoid: str, + county_filter: set, +) -> float: + """ + Calculate P(county in filter | CD). + + Returns the probability that a household in this CD would be in the + target area (e.g., NYC). Used for weight scaling when building + city-level datasets. + + Args: + cd_geoid: Congressional district geoid (e.g., "3610") + county_filter: Set of county names that define the target area + + Returns: + Probability between 0 and 1 + """ + cd_key = str(int(cd_geoid)) + + if cd_key in _CD_COUNTY_DISTRIBUTIONS: + dist = _CD_COUNTY_DISTRIBUTIONS[cd_key] + else: + dist = _generate_uniform_distribution(cd_key) + + return sum( + prob for county, prob in dist.items() if county in county_filter + ) + + +def get_filtered_county_distribution( + cd_geoid: str, + county_filter: set, +) -> Dict[str, float]: + """ + Get normalized distribution over target counties only. + + Used when building city-level datasets to assign only valid counties + while maintaining relative proportions within the target area. + + Args: + cd_geoid: Congressional district geoid (e.g., "3610") + county_filter: Set of county names that define the target area + + Returns: + Dictionary mapping county names to normalized probabilities. + Empty dict if CD has no overlap with target area. 
+ """ + cd_key = str(int(cd_geoid)) + + if cd_key in _CD_COUNTY_DISTRIBUTIONS: + dist = _CD_COUNTY_DISTRIBUTIONS[cd_key] + else: + dist = _generate_uniform_distribution(cd_key) + + filtered = {c: p for c, p in dist.items() if c in county_filter} + total = sum(filtered.values()) + + if total > 0: + return {c: p / total for c, p in filtered.items()} + return {} diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py index 9989928c..c3559071 100644 --- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py +++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py @@ -26,6 +26,8 @@ ) from policyengine_us_data.datasets.cps.local_area_calibration.county_assignment import ( assign_counties_for_cd, + get_county_filter_probability, + get_filtered_county_distribution, ) NYC_COUNTIES = { @@ -65,6 +67,7 @@ def create_sparse_cd_stacked_dataset( output_path=None, dataset_path=None, county_filter=None, + seed: int = 42, ): """ Create a SPARSE congressional district-stacked dataset using DataFrame approach. @@ -80,6 +83,8 @@ def create_sparse_cd_stacked_dataset( dataset_path: Path to the base .h5 dataset used during calibration. county_filter: Optional set of county names to filter to. Only households assigned to these counties will be included. Used for city-level datasets. + seed: Base random seed for county assignment. Each CD gets seed + int(cd_geoid) + for deterministic, order-independent results. Default 42. Returns: output_path: Path to the saved .h5 file. @@ -208,7 +213,16 @@ def create_sparse_cd_stacked_dataset( # Get this CD's calibrated weights from the weight matrix calibrated_weights_for_cd = W[ cd_idx, : - ] # Get this CD's row from weight matrix + ].copy() # Get this CD's row from weight matrix + + # For city datasets: scale weights by P(target|CD) + # This preserves the representative sample while adjusting for target population + if county_filter is not None: + p_target = get_county_filter_probability(cd_geoid, county_filter) + if p_target == 0: + # CD has no overlap with target area, skip entirely + continue + calibrated_weights_for_cd = calibrated_weights_for_cd * p_target # Map the calibrated weights to household IDs hh_weight_values = [] @@ -325,23 +339,31 @@ def create_sparse_cd_stacked_dataset( ) # Set county for this CD - county_indices = assign_counties_for_cd( - cd_geoid=cd_geoid, n_households=n_households_orig, seed=42 + idx - ) - cd_sim.set_input("county", time_period, county_indices) - - # Filter to only households assigned to specified counties (e.g., NYC) + # For city datasets: use only target counties (normalized distribution) if county_filter is not None: - filtered_household_ids = set() - for hh_idx in active_household_indices: - county_name = get_county_name(county_indices[hh_idx]) - if county_name in county_filter: - filtered_household_ids.add(household_ids[hh_idx]) - - active_household_ids = filtered_household_ids - - if len(active_household_ids) == 0: + filtered_dist = get_filtered_county_distribution( + cd_geoid, county_filter + ) + if not filtered_dist: + # Should not happen if we already checked p_target > 0 continue + county_indices = assign_counties_for_cd( + cd_geoid=cd_geoid, + n_households=n_households_orig, + seed=seed + int(cd_geoid), + distributions={cd_geoid: filtered_dist}, + ) + else: + county_indices = assign_counties_for_cd( + cd_geoid=cd_geoid, + 
diff --git a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py
index 9989928c..c3559071 100644
--- a/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py
+++ b/policyengine_us_data/datasets/cps/local_area_calibration/stacked_dataset_builder.py
@@ -26,6 +26,8 @@
 )
 from policyengine_us_data.datasets.cps.local_area_calibration.county_assignment import (
     assign_counties_for_cd,
+    get_county_filter_probability,
+    get_filtered_county_distribution,
 )
 
 NYC_COUNTIES = {
@@ -65,6 +67,7 @@ def create_sparse_cd_stacked_dataset(
     output_path=None,
     dataset_path=None,
     county_filter=None,
+    seed: int = 42,
 ):
     """
     Create a SPARSE congressional district-stacked dataset using DataFrame approach.
@@ -80,6 +83,8 @@ def create_sparse_cd_stacked_dataset(
         dataset_path: Path to the base .h5 dataset used during calibration.
         county_filter: Optional set of county names to filter to. Only households
             assigned to these counties will be included. Used for city-level datasets.
+        seed: Base random seed for county assignment. Each CD gets seed + int(cd_geoid)
+            for deterministic, order-independent results. Default 42.
 
     Returns:
         output_path: Path to the saved .h5 file.
@@ -208,7 +213,16 @@ def create_sparse_cd_stacked_dataset(
         # Get this CD's calibrated weights from the weight matrix
         calibrated_weights_for_cd = W[
             cd_idx, :
-        ]  # Get this CD's row from weight matrix
+        ].copy()  # Get this CD's row from weight matrix
+
+        # For city datasets: scale weights by P(target|CD)
+        # This preserves the representative sample while adjusting for target population
+        if county_filter is not None:
+            p_target = get_county_filter_probability(cd_geoid, county_filter)
+            if p_target == 0:
+                # CD has no overlap with target area, skip entirely
+                continue
+            calibrated_weights_for_cd = calibrated_weights_for_cd * p_target
 
         # Map the calibrated weights to household IDs
         hh_weight_values = []
@@ -325,23 +339,31 @@ def create_sparse_cd_stacked_dataset(
         )
 
         # Set county for this CD
-        county_indices = assign_counties_for_cd(
-            cd_geoid=cd_geoid, n_households=n_households_orig, seed=42 + idx
-        )
-        cd_sim.set_input("county", time_period, county_indices)
-
-        # Filter to only households assigned to specified counties (e.g., NYC)
+        # For city datasets: use only target counties (normalized distribution)
         if county_filter is not None:
-            filtered_household_ids = set()
-            for hh_idx in active_household_indices:
-                county_name = get_county_name(county_indices[hh_idx])
-                if county_name in county_filter:
-                    filtered_household_ids.add(household_ids[hh_idx])
-
-            active_household_ids = filtered_household_ids
-
-            if len(active_household_ids) == 0:
+            filtered_dist = get_filtered_county_distribution(
+                cd_geoid, county_filter
+            )
+            if not filtered_dist:
+                # Should not happen if we already checked p_target > 0
                 continue
+            county_indices = assign_counties_for_cd(
+                cd_geoid=cd_geoid,
+                n_households=n_households_orig,
+                seed=seed + int(cd_geoid),
+                distributions={cd_geoid: filtered_dist},
+            )
+        else:
+            county_indices = assign_counties_for_cd(
+                cd_geoid=cd_geoid,
+                n_households=n_households_orig,
+                seed=seed + int(cd_geoid),
+            )
+        cd_sim.set_input("county", time_period, county_indices)
+
+        # Note: We no longer use binary filtering for county_filter.
+        # Instead, weights are scaled by P(target|CD) and all households
+        # are included to avoid sample selection bias.
 
         geoadj = cd_geoadj_values[cd_geoid]
         new_spm_thresholds = calculate_spm_thresholds_for_cd(
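
To see why the weight-scaling path avoids the selection bias of the old binary filter, consider a minimal sketch (toy weights and a toy P(target|CD); nothing here comes from a real CD):

import numpy as np

# Three households in a CD whose NYC share is P(target|CD) = 0.24.
weights = np.array([10.0, 20.0, 30.0])
p_target = 0.24

# Old approach: randomly drop ~76% of households, keeping full weights.
# A small CD sample can easily lose its high- or low-income tail entirely.

# New approach: keep every household, scale its weight instead.
scaled = weights * p_target
print(scaled.sum())  # ~14.4 = 0.24 * 60.0, the CD's expected NYC population

Both schemes give the same expected city population, but the scaled version keeps the full representative sample in every CD, which is what the Note comment in the hunk above describes.
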
diff --git a/policyengine_us_data/storage/calibration_targets/audit_county_enum.py b/policyengine_us_data/storage/calibration_targets/audit_county_enum.py
new file mode 100644
index 00000000..4849a10e
--- /dev/null
+++ b/policyengine_us_data/storage/calibration_targets/audit_county_enum.py
@@ -0,0 +1,160 @@
+"""
+Audit County enum against Census 2020 data.
+
+Identifies bogus entries (counties assigned to wrong states, non-existent
+combinations, encoding issues) and generates the INVALID_COUNTY_NAMES set
+for use in county_assignment.py.
+"""
+
+import re
+import requests
+import pandas as pd
+from io import StringIO
+from collections import defaultdict
+
+from policyengine_us.variables.household.demographic.geographic.county.county_enum import (
+    County,
+)
+
+
+def audit_county_enum():
+    """
+    Compare County enum entries against Census 2020 county reference.
+
+    Returns categorized invalid entries plus a county-to-states mapping:
+    - wrong_state: county exists but in different state
+    - non_existent: county name doesn't exist anywhere
+    - encoding_issue: likely character encoding mismatch
+    """
+    print("Downloading Census 2020 county reference...")
+    url = "https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt"
+    response = requests.get(url, timeout=60)
+    census_df = pd.read_csv(
+        StringIO(response.text),
+        delimiter="|",
+        dtype=str,
+        usecols=["STATE", "STATEFP", "COUNTYFP", "COUNTYNAME"],
+    )
+
+    # Build Census valid (state, normalized_county_name) pairs
+    census_valid = set()
+    county_to_states = defaultdict(set)
+
+    for _, row in census_df.iterrows():
+        state = row["STATE"]
+        county_name = row["COUNTYNAME"].upper()
+        # Apply same normalization as make_county_cd_distributions.py
+        normalized = re.sub(r"[.'\"]", "", county_name)
+        normalized = normalized.replace("-", "_")
+        normalized = normalized.replace(" ", "_")
+
+        census_valid.add((state, normalized))
+        county_to_states[normalized].add(state)
+
+    print(f"Census has {len(census_valid)} valid (state, county) pairs")
+
+    # Audit each County enum entry
+    invalid_entries = {
+        "wrong_state": [],
+        "non_existent": [],
+        "encoding_issue": [],
+    }
+    valid_count = 0
+
+    for name in County._member_names_:
+        if name == "UNKNOWN":
+            continue
+
+        # Parse state code (last 2 chars)
+        state = name[-2:]
+        county_part = name[:-3]  # Remove _XX suffix
+
+        if (state, county_part) in census_valid:
+            valid_count += 1
+        else:
+            # Check if county exists in any state
+            if county_part in county_to_states:
+                correct_states = county_to_states[county_part]
+                invalid_entries["wrong_state"].append(
+                    (name, state, list(correct_states))
+                )
+            elif "Ñ" in name or "Í" in name or "Ó" in name or "Á" in name:
+                invalid_entries["encoding_issue"].append((name, state))
+            else:
+                invalid_entries["non_existent"].append((name, state))
+
+    print("\nAudit Results:")
+    print(f"  Valid entries: {valid_count}")
+    print(
+        f"  Wrong state: {len(invalid_entries['wrong_state'])} "
+        "(county exists in different state)"
+    )
+    print(
+        f"  Non-existent: {len(invalid_entries['non_existent'])} "
+        "(county name doesn't exist)"
+    )
+    print(
+        f"  Encoding issues: {len(invalid_entries['encoding_issue'])} "
+        "(special character mismatch)"
+    )
+
+    total_invalid = sum(len(v) for v in invalid_entries.values())
+    print(f"  TOTAL INVALID: {total_invalid}")
+
+    return invalid_entries, county_to_states
+
+
+def print_categorized_report(invalid_entries, county_to_states):
+    """Print detailed report of invalid entries."""
+    print("\n" + "=" * 60)
+    print("WRONG STATE ASSIGNMENTS")
+    print("=" * 60)
+    for name, wrong_state, correct_states in sorted(
+        invalid_entries["wrong_state"]
+    ):
+        print(f"  {name}")
+        print(f"    Listed as: {wrong_state}")
+        print(f"    Actually exists in: {', '.join(sorted(correct_states))}")
+
+    print("\n" + "=" * 60)
+    print("NON-EXISTENT COMBINATIONS")
+    print("=" * 60)
+    for name, state in sorted(invalid_entries["non_existent"]):
+        print(f"  {name}")
+
+    print("\n" + "=" * 60)
+    print("ENCODING ISSUES")
+    print("=" * 60)
+    for name, state in sorted(invalid_entries["encoding_issue"]):
+        print(f"  {name}")
+
+
+def generate_invalid_county_names_set(invalid_entries):
+    """Generate Python set literal for INVALID_COUNTY_NAMES."""
+    all_invalid = []
+
+    for name, _, _ in invalid_entries["wrong_state"]:
+        all_invalid.append(name)
+    for name, _ in invalid_entries["non_existent"]:
+        all_invalid.append(name)
+    for name, _ in invalid_entries["encoding_issue"]:
+        all_invalid.append(name)
+
+    all_invalid.sort()
+
+    print("\n" + "=" * 60)
+    print("INVALID_COUNTY_NAMES SET (copy to county_assignment.py)")
+    print("=" * 60)
+    print("INVALID_COUNTY_NAMES = {")
+    for name in all_invalid:
+        print(f'    "{name}",')
+    print("}")
+
+    return set(all_invalid)
+
+
+if __name__ == "__main__":
+    invalid_entries, county_to_states = audit_county_enum()
+    print_categorized_report(invalid_entries, county_to_states)
+    invalid_set = generate_invalid_county_names_set(invalid_entries)
+    print(f"\nTotal entries to exclude: {len(invalid_set)}")
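
For reference, the normalization the audit applies to Census names (mirroring make_county_cd_distributions.py) behaves like this; the sample name is illustrative:

import re

name = "St. Mary's County".upper()        # -> "ST. MARY'S COUNTY"
normalized = re.sub(r"[.'\"]", "", name)  # drop periods/apostrophes/quotes
normalized = normalized.replace("-", "_").replace(" ", "_")
assert normalized == "ST_MARYS_COUNTY"

Running the file directly (it has a __main__ guard) prints the categorized report and a ready-to-paste INVALID_COUNTY_NAMES set.
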
diff --git a/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py b/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py
index 4ada2e39..ba68a556 100644
--- a/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py
+++ b/policyengine_us_data/storage/calibration_targets/make_county_cd_distributions.py
@@ -141,6 +141,13 @@ def build_county_cd_distributions():
     cd_totals = cd_county_pop.groupby("cd_geoid")["POP20"].transform("sum")
     cd_county_pop["probability"] = cd_county_pop["POP20"] / cd_totals
 
+    # Step 5b: Filter out zero-probability entries (unpopulated county-CD pairs)
+    pre_filter_count = len(cd_county_pop)
+    cd_county_pop = cd_county_pop[cd_county_pop["probability"] > 0]
+    filtered_count = pre_filter_count - len(cd_county_pop)
+    if filtered_count > 0:
+        print(f"  Filtered out {filtered_count} zero-probability entries")
+
     # Step 6: Map county FIPS to enum names
     print("\nMapping county FIPS to enum names...")
     fips_to_enum = build_county_fips_to_enum_mapping()
diff --git a/policyengine_us_data/storage/county_cd_distributions.csv b/policyengine_us_data/storage/county_cd_distributions.csv
index 6c3f06d8..eb2900ca 100644
--- a/policyengine_us_data/storage/county_cd_distributions.csv
+++ b/policyengine_us_data/storage/county_cd_distributions.csv
@@ -224,7 +224,6 @@ cd_geoid,county_name,probability
 602,MENDOCINO_COUNTY_CA,0.11962446685419817
 602,DEL_NORTE_COUNTY_CA,0.0362304077896095
 602,TRINITY_COUNTY_CA,0.02104113939754851
-602,SAN_FRANCISCO_COUNTY_CA,0.0
 603,PLACER_COUNTY_CA,0.5293266024090083
 603,SACRAMENTO_COUNTY_CA,0.16285523717353492
 603,NEVADA_COUNTY_CA,0.13371303767835424
@@ -261,7 +260,6 @@ cd_geoid,county_name,probability
 610,ALAMEDA_COUNTY_CA,0.0736834213288507
 611,SAN_FRANCISCO_COUNTY_CA,1.0
 612,ALAMEDA_COUNTY_CA,1.0
-612,SAN_FRANCISCO_COUNTY_CA,0.0
 613,MERCED_COUNTY_CA,0.3661141143017842
 613,STANISLAUS_COUNTY_CA,0.29177472945244715
 613,MADERA_COUNTY_CA,0.16385443031382474
@@ -340,7 +338,6 @@ cd_geoid,county_name,probability
 652,SAN_DIEGO_COUNTY_CA,1.0
 801,DENVER_COUNTY_CO,0.9898907323399574
 801,ARAPAHOE_COUNTY_CO,0.010109267660042621
-801,JEFFERSON_COUNTY_CO,0.0
 802,BOULDER_COUNTY_CO,0.45829130410685587
 802,LARIMER_COUNTY_CO,0.3250554231557945
 802,EAGLE_COUNTY_CO,0.06277642298952503
@@ -352,7 +349,6 @@ cd_geoid,county_name,probability
 802,GILPIN_COUNTY_CO,0.008047442221360085
 802,JEFFERSON_COUNTY_CO,0.0025674776921797925
 802,JACKSON_COUNTY_CO,0.0019107132960150752
-802,BROOMFIELD_COUNTY_CO,0.0
 803,PUEBLO_COUNTY_CO,0.23299848973993045
 803,MESA_COUNTY_CO,0.21573580147700663
 803,GARFIELD_COUNTY_CO,0.08546824989954692
@@ -417,8 +413,6 @@ cd_geoid,county_name,probability
 807,CUSTER_COUNTY_CO,0.006517970838177241
 807,ADAMS_COUNTY_CO,0.004007221867348762
 807,EL_PASO_COUNTY_CO,0.0008618575385514974
-807,BOULDER_COUNTY_CO,0.0
-807,WELD_COUNTY_CO,0.0
 808,ADAMS_COUNTY_CO,0.6325353627172756
 808,WELD_COUNTY_CO,0.34371674352607307
 808,LARIMER_COUNTY_CO,0.023747893756651296
@@ -489,7 +483,6 @@ cd_geoid,county_name,probability
 1206,ST_JOHNS_COUNTY_FL,0.05089174632517833
 1207,SEMINOLE_COUNTY_FL,0.6121205739312889
 1207,VOLUSIA_COUNTY_FL,0.38787942606871106
-1207,ORANGE_COUNTY_FL,0.0
 1208,BREVARD_COUNTY_FL,0.7886056152913142
 1208,INDIAN_RIVER_COUNTY_FL,0.2077270381333843
 1208,ORANGE_COUNTY_FL,0.0036673465753015062
@@ -504,7 +497,6 @@ cd_geoid,county_name,probability
 1212,PASCO_COUNTY_FL,0.5471288485363764
 1212,HERNANDO_COUNTY_FL,0.2528727114834358
 1212,CITRUS_COUNTY_FL,0.19999843998018774
-1212,MARION_COUNTY_FL,0.0
 1213,PINELLAS_COUNTY_FL,1.0
 1214,HILLSBOROUGH_COUNTY_FL,0.7531450649423248
 1214,PINELLAS_COUNTY_FL,0.24685493505767522
@@ -2668,9 +2660,6 @@ cd_geoid,county_name,probability
 4102,GILLIAM_COUNTY_OR,0.0028249427577388565
 4102,SHERMAN_COUNTY_OR,0.002647941331815369
 4102,WHEELER_COUNTY_OR,0.0020546325521198397
-4102,CLACKAMAS_COUNTY_OR,0.0
-4102,CURRY_COUNTY_OR,0.0
-4102,MARION_COUNTY_OR,0.0
 4103,MULTNOMAH_COUNTY_OR,0.850297857999544
 4103,CLACKAMAS_COUNTY_OR,0.11575043648551633
 4103,HOOD_RIVER_COUNTY_OR,0.03395170551493963
@@ -2681,20 +2670,17 @@ cd_geoid,county_name,probability
 4104,LINCOLN_COUNTY_OR,0.07135999592188137
 4104,CURRY_COUNTY_OR,0.03319985046898364
 4104,LINN_COUNTY_OR,0.0014967261769903485
-4104,POLK_COUNTY_OR,0.0
 4105,CLACKAMAS_COUNTY_OR,0.42576630997339315
 4105,DESCHUTES_COUNTY_OR,0.2463321764520135
 4105,LINN_COUNTY_OR,0.18061650304654855
 4105,MULTNOMAH_COUNTY_OR,0.07367932156061449
 4105,MARION_COUNTY_OR,0.07357736873928257
 4105,JEFFERSON_COUNTY_OR,2.832022814775796e-05
-4105,BENTON_COUNTY_OR,0.0
 4106,MARION_COUNTY_OR,0.4162475290705907
 4106,WASHINGTON_COUNTY_OR,0.2522202964548889
 4106,YAMHILL_COUNTY_OR,0.1525349328530243
 4106,POLK_COUNTY_OR,0.12380559945172272
 4106,CLACKAMAS_COUNTY_OR,0.05519164216977338
-4106,LINN_COUNTY_OR,0.0
 4201,BUCKS_COUNTY_PA,0.8452957772995531
 4201,MONTGOMERY_COUNTY_PA,0.15470422270044687
 4202,PHILADELPHIA_COUNTY_PA,1.0
@@ -3144,7 +3130,6 @@ cd_geoid,county_name,probability
 4814,JEFFERSON_COUNTY_TX,0.2411331613182492
 4814,BRAZORIA_COUNTY_TX,0.19107364270841617
 4814,ORANGE_COUNTY_TX,0.11057293018004216
-4814,CHAMBERS_COUNTY_TX,0.0
 4815,HIDALGO_COUNTY_TX,0.7514664524952835
 4815,GUADALUPE_COUNTY_TX,0.08977596751965809
 4815,WILSON_COUNTY_TX,0.06486811380114657
@@ -3263,7 +3248,6 @@ cd_geoid,county_name,probability
 4826,DENTON_COUNTY_TX,0.8856851550287033
 4826,WISE_COUNTY_TX,0.05998797893575771
 4826,COOKE_COUNTY_TX,0.054326866035539066
-4826,TARRANT_COUNTY_TX,0.0
 4827,NUECES_COUNTY_TX,0.46047455823892713
 4827,VICTORIA_COUNTY_TX,0.11906199192424383
 4827,SAN_PATRICIO_COUNTY_TX,0.08964297960721629
diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py b/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py
index a5459cc1..158e0ca6 100644
--- a/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py
+++ b/policyengine_us_data/tests/test_local_area_calibration/test_county_assignment.py
@@ -10,6 +10,8 @@
     assign_counties_for_cd,
     get_county_index,
     _build_state_counties,
+    get_county_filter_probability,
+    get_filtered_county_distribution,
 )
 
 
@@ -112,3 +114,75 @@ def test_ny_has_nyc_counties(self):
         ]
         for county in nyc_counties:
             assert county in ny_counties, f"Missing NYC county: {county}"
+
+
+class TestInvalidCountyExclusion:
+    """Test that invalid counties are properly excluded."""
+
+    def test_delaware_has_exactly_3_counties(self):
+        """Delaware should have exactly 3 counties (no DORCHESTER)."""
+        state_counties = _build_state_counties()
+        de_counties = state_counties.get("DE", [])
+
+        assert len(de_counties) == 3
+        assert "DORCHESTER_COUNTY_DE" not in de_counties
+
+        expected = {
+            "KENT_COUNTY_DE",
+            "NEW_CASTLE_COUNTY_DE",
+            "SUSSEX_COUNTY_DE",
+        }
+        assert set(de_counties) == expected
+
+    def test_suffolk_county_ct_excluded(self):
+        """Suffolk County, CT should be excluded (doesn't exist)."""
+        state_counties = _build_state_counties()
+        ct_counties = state_counties.get("CT", [])
+        assert "SUFFOLK_COUNTY_CT" not in ct_counties
+
+
+class TestCountyFilterProbability:
+    """Test probability calculations for city datasets."""
+
+    NYC_COUNTIES = {
+        "QUEENS_COUNTY_NY",
+        "BRONX_COUNTY_NY",
+        "RICHMOND_COUNTY_NY",
+        "NEW_YORK_COUNTY_NY",
+        "KINGS_COUNTY_NY",
+    }
+
+    def test_fully_nyc_cd_has_probability_one(self):
+        """NY-05 (fully in NYC) should have P(NYC|CD) = 1.0."""
+        prob = get_county_filter_probability("3605", self.NYC_COUNTIES)
+        assert prob == pytest.approx(1.0, abs=0.001)
+
+    def test_mixed_cd_has_partial_probability(self):
+        """NY-03 (mixed NYC/suburbs) should have 0 < P(NYC|CD) < 1."""
+        prob = get_county_filter_probability("3603", self.NYC_COUNTIES)
+        assert 0 < prob < 1
+        # Should be approximately 24% based on Census data
+        assert prob == pytest.approx(0.24, abs=0.05)
+
+    def test_non_nyc_cd_has_zero_probability(self):
+        """Non-NY CD should have P(NYC|CD) = 0."""
+        # CA-12 (Alameda County, per county_cd_distributions.csv)
+        prob = get_county_filter_probability("612", self.NYC_COUNTIES)
+        assert prob == 0.0
+
+    def test_filtered_distribution_sums_to_one(self):
+        """Filtered distribution should sum to 1.0."""
+        dist = get_filtered_county_distribution("3603", self.NYC_COUNTIES)
+        if dist:  # Only if CD has overlap
+            assert sum(dist.values()) == pytest.approx(1.0)
+
+    def test_filtered_distribution_only_target_counties(self):
+        """Filtered distribution should only contain target counties."""
+        dist = get_filtered_county_distribution("3603", self.NYC_COUNTIES)
+        for county in dist:
+            assert county in self.NYC_COUNTIES
+
+    def test_filtered_distribution_empty_for_no_overlap(self):
+        """Non-overlapping CD should return empty distribution."""
+        dist = get_filtered_county_distribution("612", self.NYC_COUNTIES)
+        assert dist == {}
diff --git a/uv.lock b/uv.lock
index 7f2e4e5f..24fc4182 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1830,16 +1830,16 @@ wheels = [
 
 [[package]]
 name = "policyengine-us"
-version = "1.497.1"
+version = "1.499.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "microdf-python" },
     { name = "policyengine-core" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/fd/51/9df605ac6939ccbd8a93f5fe8a23d08b4b97e3806ea509c022c603e44266/policyengine_us-1.497.1.tar.gz", hash = "sha256:2f5eb011c8c8c205b3d313f42aa52b8356266921f46611ac9346bc04361eff61", size = 8449641, upload-time = "2026-01-06T15:19:16.995Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/44/05/dbaf4b5aec28ce4f72bdff321b4dadc8ff6839d791d087d5c88723d2c083/policyengine_us-1.499.0.tar.gz", hash = "sha256:a16d056f37ad4fd500dc59a9030fab8ac3730df1b553c9e53222a4f932bc5ec9", size = 8460855, upload-time = "2026-01-12T20:04:10.184Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cc/6d/c877b3e438ae3a8d509161e7439c399629e85647d9238a9d168b06dce21d/policyengine_us-1.497.1-py3-none-any.whl", hash = "sha256:b589e060545f6e38099b0e6233a2ba94195e5c15d53b5aaa8c1efa97b025cd9f", size = 7139280, upload-time = "2026-01-06T15:19:14.666Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/75/348a505f35f60dcbc8af75db868abdf1d6b5d30482338ee5801e8c13689b/policyengine_us-1.499.0-py3-none-any.whl", hash = "sha256:32cd2b6d2c8ac1c7074ebad0343821bed0189bc84d157e54f63eec362bc849bb", size = 7175450, upload-time = "2026-01-12T20:04:07.527Z" },
 ]
 
 [[package]]