diff --git a/obbba_district_impacts/Congressional-Hackathon-2025 b/obbba_district_impacts/Congressional-Hackathon-2025 new file mode 160000 index 0000000..3f6d05e --- /dev/null +++ b/obbba_district_impacts/Congressional-Hackathon-2025 @@ -0,0 +1 @@ +Subproject commit 3f6d05e76400c6e396a3a4eddd34a7b3f6919fc3 diff --git a/ri_dataset_exploration.ipynb b/ri_dataset_exploration.ipynb new file mode 100644 index 0000000..3e656ea --- /dev/null +++ b/ri_dataset_exploration.ipynb @@ -0,0 +1,536 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "be1cea7a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\dtsax\\envs\\pe\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from policyengine_us import Microsimulation\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0d21b774", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`\n" + ] + } + ], + "source": [ + "# Load RI dataset\n", + "sim = Microsimulation(dataset=\"hf://policyengine/test/RI.h5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1870e7ac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of households in dataset: 8,617\n", + "Household count (mapped): 401,236\n", + "Person count (mapped): 1,117,161\n" + ] + } + ], + "source": [ + "# Check dataset size\n", + "household_weight = sim.calculate(\"household_weight\", period=2025)\n", + "household_count = sim.calculate(\"household_count\", period=2025, map_to=\"household\")\n", + "person_count = sim.calculate(\"person_count\", period=2025, map_to=\"household\")\n", + "\n", + "print(f\"Number of households in dataset: {len(household_weight):,}\")\n", + "print(f\"Household count (mapped): {household_count.sum():,.0f}\")\n", + "print(f\"Person count (mapped): {person_count.sum():,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f0c79a50", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Income distribution:\n", + " Median AGI: $79,994\n", + " 75th percentile: $168,598\n", + " 90th percentile: $405,000\n", + " 95th percentile: $518,164\n", + " Max AGI: $2,600,478\n", + "\n", + "Households by income threshold:\n", + " Households over $80k: 200,330.7008952641\n", + " Households over $120k: 146,947.59684899804\n", + " Households over $160k: 110,723.5024763195\n", + " Households over $240k: 72,041.0344688301\n" + ] + } + ], + "source": [ + "# Check household income distribution (aggregate to household level using map_to)\n", + "agi = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "print(f\"Income distribution:\")\n", + "print(f\" Median AGI: ${agi.median():,.0f}\")\n", + "print(f\" 75th percentile: ${agi.quantile(0.75):,.0f}\")\n", + "print(f\" 90th percentile: ${agi.quantile(0.90):,.0f}\")\n", + "print(f\" 95th percentile: ${agi.quantile(0.95):,.0f}\")\n", + "print(f\" Max AGI: ${agi.max():,.0f}\")\n", + "print(f\"\\nHouseholds by income threshold:\")\n", + "print(f\" Households over $80k: {(agi > 80_000).sum():,}\")\n", + "print(f\" Households over $120k: {(agi > 120_000).sum():,}\")\n", + "print(f\" Households over $160k: {(agi > 160_000).sum():,}\")\n", + "print(f\" Households over $240k: {(agi > 240_000).sum():,}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "71b548db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Households with children (weighted):\n", + " Total households with children: 122,610\n", + " Households with 1 child: 65,074\n", + " Households with 2 children: 38,411\n", + " Households with 3+ children: 19,126\n" + ] + } + ], + "source": [ + "# Check households with children (count at person level, aggregate to household)\n", + "is_child = sim.calculate(\"is_child\", period=2025, map_to=\"person\")\n", + "household_id = sim.calculate(\"household_id\", period=2025, map_to=\"person\")\n", + "household_weight = sim.calculate(\"household_weight\", period=2025, map_to=\"person\")\n", + "\n", + "# Create DataFrame for easier manipulation\n", + "df_households = pd.DataFrame({\n", + " 'household_id': household_id,\n", + " 'is_child': is_child,\n", + " 'household_weight': household_weight\n", + "})\n", + "\n", + "# Count children per household\n", + "children_per_household = df_households.groupby('household_id').agg({\n", + " 'is_child': 'sum',\n", + " 'household_weight': 'first' # household_weight is same for all members\n", + "}).reset_index()\n", + "\n", + "# Calculate weighted household counts\n", + "total_households_with_children = children_per_household[children_per_household['is_child'] > 0]['household_weight'].sum()\n", + "households_with_1_child = children_per_household[children_per_household['is_child'] == 1]['household_weight'].sum()\n", + "households_with_2_children = children_per_household[children_per_household['is_child'] == 2]['household_weight'].sum()\n", + "households_with_3plus_children = children_per_household[children_per_household['is_child'] >= 3]['household_weight'].sum()\n", + "\n", + "print(f\"\\nHouseholds with children (weighted):\")\n", + "print(f\" Total households with children: {total_households_with_children:,.0f}\")\n", + "print(f\" Households with 1 child: {households_with_1_child:,.0f}\")\n", + "print(f\" Households with 2 children: {households_with_2_children:,.0f}\")\n", + "print(f\" Households with 3+ children: {households_with_3plus_children:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a215302f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Children by age:\n", + " Total children under 18: 206,993\n", + " Children under 4: 43,318\n", + " Children under 6: 64,240\n", + " Children ages 6-17: 138,628\n", + "\n", + "Sample of children under 4:\n", + " household_id tax_unit_id person_id age\n", + "27 6825009 12 11825027 3.0\n", + "112 6825079 54 11825112 1.0\n", + "140 6825054 69 11825140 2.0\n", + "143 6825055 70 11825143 2.0\n", + "146 6825056 71 11825146 1.0\n", + "173 6825065 80 11825173 2.0\n", + "174 6825065 80 11825174 0.0\n", + "200 6825076 96 11825200 0.0\n", + "224 6825085 109 11825224 2.0\n", + "292 6825109 145 11825292 3.0\n" + ] + } + ], + "source": [ + "# Check children by age groups using Ben's workaround\n", + "import pandas as pd\n", + "df = pd.DataFrame({\n", + " \"household_id\": sim.calculate(\"household_id\", map_to=\"person\"),\n", + " \"tax_unit_id\": sim.calculate(\"tax_unit_id\", map_to=\"person\"),\n", + " \"person_id\": sim.calculate(\"person_id\", map_to=\"person\"),\n", + " \"age\": sim.calculate(\"age\", map_to=\"person\"),\n", + " \"person_weight\": sim.calculate(\"person_weight\", map_to=\"person\")\n", + "})\n", + "\n", + "# Filter for children and apply weights\n", + "children_under_4_df = df[df['age'] < 4]\n", + "children_under_6_df = df[df['age'] < 6]\n", + "children_under_18_df = df[df['age'] < 18]\n", + "children_6_17_df = df[(df['age'] >= 6) & (df['age'] < 18)]\n", + "\n", + "# Calculate weighted totals\n", + "is_child = sim.calculate(\"is_child\", period=2025)\n", + "total_children = is_child.sum()\n", + "children_under_4 = children_under_4_df['person_weight'].sum()\n", + "children_under_6 = children_under_6_df['person_weight'].sum()\n", + "children_6_17 = children_6_17_df['person_weight'].sum()\n", + "\n", + "print(f\"\\nChildren by age:\")\n", + "print(f\" Total children under 18: {total_children:,.0f}\")\n", + "print(f\" Children under 4: {children_under_4:,.0f}\")\n", + "print(f\" Children under 6: {children_under_6:,.0f}\")\n", + "print(f\" Children ages 6-17: {children_6_17:,.0f}\")\n", + "\n", + "print(f\"\\nSample of children under 4:\")\n", + "print(children_under_4_df[['household_id', 'tax_unit_id', 'person_id', 'age']].head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9468033e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "RI DATASET SUMMARY - WEIGHTED (Population Estimates)\n", + "============================================================\n", + " Metric Value\n", + " Household count (weighted) 401,236\n", + " Person count (weighted) 1,117,161\n", + " Median AGI $79,994\n", + " 75th percentile AGI $168,598\n", + " 90th percentile AGI $405,000\n", + " 95th percentile AGI $518,164\n", + " Max AGI $2,600,478\n", + " Households over $80k 200,331\n", + " Households over $120k 146,948\n", + " Households over $160k 110,724\n", + " Households over $240k 72,041\n", + "Total households with children 122,610\n", + " Households with 1 child 65,074\n", + " Households with 2 children 38,411\n", + " Households with 3+ children 19,126\n", + " Total children under 18 206,993\n", + " Children under 4 43,318\n", + " Children under 6 64,240\n", + " Children ages 6-17 138,628\n", + "============================================================\n", + "\n", + "============================================================\n", + "RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\n", + "============================================================\n", + " Metric Value\n", + " Number of households in dataset 8,617\n", + " Number of persons in dataset 26,217\n", + " Households with children (unweighted) 3,717\n", + " Households with 1 child (unweighted) 1,427\n", + " Households with 2 children (unweighted) 1,438\n", + "Households with 3+ children (unweighted) 852\n", + " Children under 18 (unweighted) 7,257\n", + " Children under 4 (unweighted) 1,221\n", + " Children under 6 (unweighted) 1,975\n", + " Children ages 6-17 (unweighted) 5,282\n", + "============================================================\n", + "\n", + "Summaries saved to:\n", + " - ri_dataset_summary_weighted.csv\n", + " - ri_dataset_summary_unweighted.csv\n" + ] + } + ], + "source": [ + "# Create weighted summary table\n", + "weighted_summary_data = {\n", + " 'Metric': [\n", + " 'Household count (weighted)',\n", + " 'Person count (weighted)',\n", + " 'Median AGI',\n", + " '75th percentile AGI',\n", + " '90th percentile AGI',\n", + " '95th percentile AGI',\n", + " 'Max AGI',\n", + " 'Households over $80k',\n", + " 'Households over $120k',\n", + " 'Households over $160k',\n", + " 'Households over $240k',\n", + " 'Total households with children',\n", + " 'Households with 1 child',\n", + " 'Households with 2 children',\n", + " 'Households with 3+ children',\n", + " 'Total children under 18',\n", + " 'Children under 4',\n", + " 'Children under 6',\n", + " 'Children ages 6-17'\n", + " ],\n", + " 'Value': [\n", + " f\"{household_count.sum():,.0f}\",\n", + " f\"{person_count.sum():,.0f}\",\n", + " f\"${agi.median():,.0f}\",\n", + " f\"${agi.quantile(0.75):,.0f}\",\n", + " f\"${agi.quantile(0.90):,.0f}\",\n", + " f\"${agi.quantile(0.95):,.0f}\",\n", + " f\"${agi.max():,.0f}\",\n", + " f\"{(agi > 80_000).sum():,.0f}\",\n", + " f\"{(agi > 120_000).sum():,.0f}\",\n", + " f\"{(agi > 160_000).sum():,.0f}\",\n", + " f\"{(agi > 240_000).sum():,.0f}\",\n", + " f\"{total_households_with_children:,.0f}\",\n", + " f\"{households_with_1_child:,.0f}\",\n", + " f\"{households_with_2_children:,.0f}\",\n", + " f\"{households_with_3plus_children:,.0f}\",\n", + " f\"{total_children:,.0f}\",\n", + " f\"{children_under_4:,.0f}\",\n", + " f\"{children_under_6:,.0f}\",\n", + " f\"{children_6_17:,.0f}\"\n", + " ]\n", + "}\n", + "\n", + "# Get unique counts for unweighted table\n", + "unique_households = df['household_id'].nunique()\n", + "unique_persons = len(df)\n", + "\n", + "# Create unweighted summary table\n", + "unweighted_summary_data = {\n", + " 'Metric': [\n", + " 'Number of households in dataset',\n", + " 'Number of persons in dataset',\n", + " 'Households with children (unweighted)',\n", + " 'Households with 1 child (unweighted)',\n", + " 'Households with 2 children (unweighted)',\n", + " 'Households with 3+ children (unweighted)',\n", + " 'Children under 18 (unweighted)',\n", + " 'Children under 4 (unweighted)',\n", + " 'Children under 6 (unweighted)',\n", + " 'Children ages 6-17 (unweighted)'\n", + " ],\n", + " 'Value': [\n", + " f\"{unique_households:,}\",\n", + " f\"{unique_persons:,}\",\n", + " f\"{(children_per_household['is_child'] > 0).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 1).sum():,}\",\n", + " f\"{(children_per_household['is_child'] == 2).sum():,}\",\n", + " f\"{(children_per_household['is_child'] >= 3).sum():,}\",\n", + " f\"{len(children_under_18_df):,}\",\n", + " f\"{len(children_under_4_df):,}\",\n", + " f\"{len(children_under_6_df):,}\",\n", + " f\"{len(children_6_17_df):,}\"\n", + " ]\n", + "}\n", + "\n", + "weighted_df = pd.DataFrame(weighted_summary_data)\n", + "unweighted_df = pd.DataFrame(unweighted_summary_data)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - WEIGHTED (Population Estimates)\")\n", + "print(\"=\"*60)\n", + "print(weighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "print(\"\\n\" + \"=\"*60)\n", + "print(\"RI DATASET SUMMARY - UNWEIGHTED (Sample Counts)\")\n", + "print(\"=\"*60)\n", + "print(unweighted_df.to_string(index=False))\n", + "print(\"=\"*60)\n", + "\n", + "# Save both tables\n", + "weighted_df.to_csv('ri_dataset_summary_weighted.csv', index=False)\n", + "unweighted_df.to_csv('ri_dataset_summary_unweighted.csv', index=False)\n", + "print(\"\\nSummaries saved to:\")\n", + "print(\" - ri_dataset_summary_weighted.csv\")\n", + "print(\" - ri_dataset_summary_unweighted.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "dzvou2zqia4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Median AGI by aggregation level:\n", + " Household level: $79,994\n", + " Tax unit level: $38,552\n", + " Person level: $49,057\n", + "\n", + "Total AGI for Rhode Island (by aggregation level):\n", + " Using tax unit level: $57,748,447,798\n", + " Using household level: $57,748,447,798\n", + " Using person level: $122,937,416,952\n" + ] + } + ], + "source": [ + "# Compare median AGI at different aggregation levels\n", + "agi_household = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"household\")\n", + "agi_tax_unit = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "agi_person = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"person\")\n", + "\n", + "print(\"Median AGI by aggregation level:\")\n", + "print(f\" Household level: ${agi_household.median():,.0f}\")\n", + "print(f\" Tax unit level: ${agi_tax_unit.median():,.0f}\")\n", + "print(f\" Person level: ${agi_person.median():,.0f}\")\n", + "\n", + "# Calculate total AGI - just sum the values (weights are already built into the arrays)\n", + "total_agi_tax_unit = agi_tax_unit.sum()\n", + "total_agi_household = agi_household.sum()\n", + "total_agi_person = agi_person.sum()\n", + "\n", + "print(f\"\\nTotal AGI for Rhode Island (by aggregation level):\")\n", + "print(f\" Using tax unit level: ${total_agi_tax_unit:,.0f}\")\n", + "print(f\" Using household level: ${total_agi_household:,.0f}\")\n", + "print(f\" Using person level: ${total_agi_person:,.0f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "gispfkxpnph", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AGI Component Breakdown (Tax Unit Level)\n", + "============================================================\n", + "\n", + "Total Income (Statewide):\n", + " Employment Income: $ 41,313,375,713\n", + " Self-Employment Income: $ 1,895,414,487\n", + " Capital Gains: $ 4,284,511,812\n", + " Qualified Dividends: $ 998,165,965\n", + " Interest Income: $ 674,707,426\n", + " Taxable Social Security: $ 1,115,904,003\n", + " Pension Income: $ 1,507,391,122\n", + " Adjusted Gross Income (AGI): $ 57,748,447,798\n", + "\n", + "Median Values:\n", + " Employment Income: $ 32,484\n", + " Self-Employment Income: $ 0\n", + " Capital Gains: $ 0\n", + " Qualified Dividends: $ 0\n", + " Interest Income: $ 0\n", + " Taxable Social Security: $ 0\n", + " Pension Income: $ 0\n", + " Adjusted Gross Income (AGI): $ 38,552\n", + "\n", + "Sum of income components: $ 51,789,470,528\n", + "AGI (for comparison): $ 57,748,447,798\n", + "Difference (potential missing income or deductions): $ -5,958,977,270\n" + ] + } + ], + "source": [ + "# Break down AGI components at tax unit level\n", + "print(\"AGI Component Breakdown (Tax Unit Level)\")\n", + "print(\"=\"*60)\n", + "\n", + "# Calculate key income components\n", + "employment_income = sim.calculate(\"employment_income\", period=2025, map_to=\"tax_unit\")\n", + "self_employment_income = sim.calculate(\"self_employment_income\", period=2025, map_to=\"tax_unit\")\n", + "capital_gains = sim.calculate(\"capital_gains\", period=2025, map_to=\"tax_unit\")\n", + "qualified_dividend_income = sim.calculate(\"qualified_dividend_income\", period=2025, map_to=\"tax_unit\")\n", + "interest_income = sim.calculate(\"interest_income\", period=2025, map_to=\"tax_unit\")\n", + "taxable_social_security = sim.calculate(\"taxable_social_security\", period=2025, map_to=\"tax_unit\")\n", + "pension_income = sim.calculate(\"pension_income\", period=2025, map_to=\"tax_unit\")\n", + "adjusted_gross_income = sim.calculate(\"adjusted_gross_income\", period=2025, map_to=\"tax_unit\")\n", + "\n", + "print(\"\\nTotal Income (Statewide):\")\n", + "print(f\" Employment Income: ${employment_income.sum():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.sum():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.sum():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.sum():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.sum():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.sum():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.sum():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "\n", + "print(\"\\nMedian Values:\")\n", + "print(f\" Employment Income: ${employment_income.median():>15,.0f}\")\n", + "print(f\" Self-Employment Income: ${self_employment_income.median():>15,.0f}\")\n", + "print(f\" Capital Gains: ${capital_gains.median():>15,.0f}\")\n", + "print(f\" Qualified Dividends: ${qualified_dividend_income.median():>15,.0f}\")\n", + "print(f\" Interest Income: ${interest_income.median():>15,.0f}\")\n", + "print(f\" Taxable Social Security: ${taxable_social_security.median():>15,.0f}\")\n", + "print(f\" Pension Income: ${pension_income.median():>15,.0f}\")\n", + "print(f\" Adjusted Gross Income (AGI): ${adjusted_gross_income.median():>15,.0f}\")\n", + "\n", + "# Calculate sum of components to compare with AGI\n", + "total_components = (employment_income + self_employment_income + capital_gains + \n", + " qualified_dividend_income + interest_income + taxable_social_security + pension_income)\n", + "print(f\"\\nSum of income components: ${total_components.sum():>15,.0f}\")\n", + "print(f\"AGI (for comparison): ${adjusted_gross_income.sum():>15,.0f}\")\n", + "print(f\"Difference (potential missing income or deductions): ${(total_components.sum() - adjusted_gross_income.sum()):>15,.0f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pe", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ri_dataset_summary_unweighted.csv b/ri_dataset_summary_unweighted.csv new file mode 100644 index 0000000..ab9b7b0 --- /dev/null +++ b/ri_dataset_summary_unweighted.csv @@ -0,0 +1,11 @@ +Metric,Value +Number of households in dataset,"8,617" +Number of persons in dataset,"26,217" +Households with children (unweighted),"3,717" +Households with 1 child (unweighted),"1,427" +Households with 2 children (unweighted),"1,438" +Households with 3+ children (unweighted),852 +Children under 18 (unweighted),"7,257" +Children under 4 (unweighted),"1,221" +Children under 6 (unweighted),"1,975" +Children ages 6-17 (unweighted),"5,282" diff --git a/ri_dataset_summary_weighted.csv b/ri_dataset_summary_weighted.csv new file mode 100644 index 0000000..7822171 --- /dev/null +++ b/ri_dataset_summary_weighted.csv @@ -0,0 +1,20 @@ +Metric,Value +Household count (weighted),"401,236" +Person count (weighted),"1,117,161" +Median AGI,"$79,994" +75th percentile AGI,"$168,598" +90th percentile AGI,"$405,000" +95th percentile AGI,"$518,164" +Max AGI,"$2,600,478" +Households over $80k,"200,331" +Households over $120k,"146,948" +Households over $160k,"110,724" +Households over $240k,"72,041" +Total households with children,"122,610" +Households with 1 child,"65,074" +Households with 2 children,"38,411" +Households with 3+ children,"19,126" +Total children under 18,"206,993" +Children under 4,"43,318" +Children under 6,"64,240" +Children ages 6-17,"138,628"