diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index be48ac80..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,17 +0,0 @@ -# Claude notes - -Claude, please follow these always. These principles are aimed at preventing you from producing AI slop. - -1. British English, sentence case -2. No excessive duplication, keep code files as concise as possible to produce the same meaningful value. No excessive printing -3. Don't create multiple files for successive versions. Keep checking: have I added lots of intermediate files which are deprecated? Delete them if so, but ideally don't create them in the first place - -## MicroDataFrame - -A pandas DataFrame that automatically handles weights for survey microdata. Key features: - -- Create with `MicroDataFrame(df, weights='weight_column')` -- All aggregations (sum, mean, etc.) automatically weight results -- Each column is a MicroSeries with weighted operations -- Use `.groupby()` for weighted group statistics -- Built-in poverty analysis: `.poverty_rate()`, `.poverty_gap()` diff --git a/src/policyengine/outputs/__init__.py b/src/policyengine/outputs/__init__.py index 8997578d..cac44e5d 100644 --- a/src/policyengine/outputs/__init__.py +++ b/src/policyengine/outputs/__init__.py @@ -8,6 +8,22 @@ DecileImpact, calculate_decile_impacts, ) +from policyengine.outputs.inequality import ( + UK_INEQUALITY_INCOME_VARIABLE, + US_INEQUALITY_INCOME_VARIABLE, + Inequality, + calculate_uk_inequality, + calculate_us_inequality, +) +from policyengine.outputs.poverty import ( + UK_POVERTY_VARIABLES, + US_POVERTY_VARIABLES, + Poverty, + UKPovertyType, + USPovertyType, + calculate_uk_poverty_rates, + calculate_us_poverty_rates, +) __all__ = [ "Output", @@ -18,4 +34,16 @@ "ChangeAggregateType", "DecileImpact", "calculate_decile_impacts", + "Poverty", + "UKPovertyType", + "USPovertyType", + "UK_POVERTY_VARIABLES", + "US_POVERTY_VARIABLES", + "calculate_uk_poverty_rates", + "calculate_us_poverty_rates", + "Inequality", + 
"UK_INEQUALITY_INCOME_VARIABLE", + "US_INEQUALITY_INCOME_VARIABLE", + "calculate_uk_inequality", + "calculate_us_inequality", ] diff --git a/src/policyengine/outputs/inequality.py b/src/policyengine/outputs/inequality.py new file mode 100644 index 00000000..e17e704a --- /dev/null +++ b/src/policyengine/outputs/inequality.py @@ -0,0 +1,276 @@ +"""Inequality analysis output types.""" + +from typing import Any + +import numpy as np +import pandas as pd +from pydantic import ConfigDict + +from policyengine.core import Output, Simulation + + +def _gini(values: np.ndarray, weights: np.ndarray) -> float: + """Calculate weighted Gini coefficient. + + Args: + values: Array of income values + weights: Array of weights + + Returns: + Gini coefficient between 0 (perfect equality) and 1 (perfect inequality) + """ + # Handle edge cases + if len(values) == 0 or weights.sum() == 0: + return 0.0 + + # Sort by values + sorted_indices = np.argsort(values) + sorted_values = values[sorted_indices] + sorted_weights = weights[sorted_indices] + + # Cumulative weights and weighted values + cumulative_weights = np.cumsum(sorted_weights) + total_weight = cumulative_weights[-1] + cumulative_weighted_values = np.cumsum(sorted_values * sorted_weights) + total_weighted_value = cumulative_weighted_values[-1] + + if total_weighted_value == 0: + return 0.0 + + # Calculate Gini using the area formula + # Gini = 1 - 2 * (area under Lorenz curve) + lorenz_curve = cumulative_weighted_values / total_weighted_value + weight_fractions = sorted_weights / total_weight + + # Area under Lorenz curve using trapezoidal rule + area = np.sum(weight_fractions * (lorenz_curve - weight_fractions / 2)) + + return float(1 - 2 * area) + + +class Inequality(Output): + """Single inequality measure result - represents one database row. + + This is a single-simulation output type that calculates inequality + metrics for a given income variable, optionally filtered by + demographic variables. 
+ """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + simulation: Simulation + income_variable: str + entity: str = "household" + + # Optional demographic filters + filter_variable: str | None = None + filter_variable_eq: Any | None = None + filter_variable_leq: Any | None = None + filter_variable_geq: Any | None = None + + # Results populated by run() + gini: float | None = None + top_10_share: float | None = None + top_1_share: float | None = None + bottom_50_share: float | None = None + + def run(self): + """Calculate inequality metrics.""" + # Get income variable info + income_var_obj = ( + self.simulation.tax_benefit_model_version.get_variable( + self.income_variable + ) + ) + + # Get target entity data + target_entity = self.entity + data = getattr(self.simulation.output_dataset.data, target_entity) + + # Map income variable to target entity if needed + if income_var_obj.entity != target_entity: + mapped = self.simulation.output_dataset.data.map_to_entity( + income_var_obj.entity, + target_entity, + columns=[self.income_variable], + ) + income_series = mapped[self.income_variable] + else: + income_series = data[self.income_variable] + + # Get weights + weight_col = f"{target_entity}_weight" + if weight_col in data.columns: + weights = data[weight_col] + else: + weights = pd.Series(np.ones(len(income_series))) + + # Apply demographic filter if specified + if self.filter_variable is not None: + filter_var_obj = ( + self.simulation.tax_benefit_model_version.get_variable( + self.filter_variable + ) + ) + + if filter_var_obj.entity != target_entity: + filter_mapped = ( + self.simulation.output_dataset.data.map_to_entity( + filter_var_obj.entity, + target_entity, + columns=[self.filter_variable], + ) + ) + filter_series = filter_mapped[self.filter_variable] + else: + filter_series = data[self.filter_variable] + + # Build filter mask + mask = filter_series.notna() + if self.filter_variable_eq is not None: + mask &= filter_series == 
self.filter_variable_eq + if self.filter_variable_leq is not None: + mask &= filter_series <= self.filter_variable_leq + if self.filter_variable_geq is not None: + mask &= filter_series >= self.filter_variable_geq + + # Apply mask + income_series = income_series[mask] + weights = weights[mask] + + # Convert to numpy arrays + values = np.array(income_series) + weights_arr = np.array(weights) + + # Remove NaN values + valid_mask = ~np.isnan(values) & ~np.isnan(weights_arr) + values = values[valid_mask] + weights_arr = weights_arr[valid_mask] + + # Calculate Gini coefficient + self.gini = _gini(values, weights_arr) + + # Calculate income shares + if len(values) > 0 and weights_arr.sum() > 0: + total_income = np.sum(values * weights_arr) + + if total_income > 0: + # Sort by income + sorted_indices = np.argsort(values) + sorted_values = values[sorted_indices] + sorted_weights = weights_arr[sorted_indices] + + # Cumulative weight fractions + cumulative_weights = np.cumsum(sorted_weights) + total_weight = cumulative_weights[-1] + weight_fractions = cumulative_weights / total_weight + + # Top 10% share + top_10_mask = weight_fractions > 0.9 + self.top_10_share = float( + np.sum( + sorted_values[top_10_mask] + * sorted_weights[top_10_mask] + ) + / total_income + ) + + # Top 1% share + top_1_mask = weight_fractions > 0.99 + self.top_1_share = float( + np.sum( + sorted_values[top_1_mask] * sorted_weights[top_1_mask] + ) + / total_income + ) + + # Bottom 50% share + bottom_50_mask = weight_fractions <= 0.5 + self.bottom_50_share = float( + np.sum( + sorted_values[bottom_50_mask] + * sorted_weights[bottom_50_mask] + ) + / total_income + ) + else: + self.top_10_share = 0.0 + self.top_1_share = 0.0 + self.bottom_50_share = 0.0 + else: + self.top_10_share = 0.0 + self.top_1_share = 0.0 + self.bottom_50_share = 0.0 + + +# Default income variables for each country +UK_INEQUALITY_INCOME_VARIABLE = "equiv_hbai_household_net_income" +US_INEQUALITY_INCOME_VARIABLE = 
"household_net_income" + + +def calculate_uk_inequality( + simulation: Simulation, + income_variable: str = UK_INEQUALITY_INCOME_VARIABLE, + filter_variable: str | None = None, + filter_variable_eq: Any | None = None, + filter_variable_leq: Any | None = None, + filter_variable_geq: Any | None = None, +) -> Inequality: + """Calculate inequality metrics for a UK simulation. + + Args: + simulation: The simulation to analyse + income_variable: Income variable to use (default: equiv_hbai_household_net_income) + filter_variable: Optional variable to filter by + filter_variable_eq: Filter for exact match + filter_variable_leq: Filter for less than or equal + filter_variable_geq: Filter for greater than or equal + + Returns: + Inequality object with Gini and income share metrics + """ + inequality = Inequality( + simulation=simulation, + income_variable=income_variable, + entity="household", + filter_variable=filter_variable, + filter_variable_eq=filter_variable_eq, + filter_variable_leq=filter_variable_leq, + filter_variable_geq=filter_variable_geq, + ) + inequality.run() + return inequality + + +def calculate_us_inequality( + simulation: Simulation, + income_variable: str = US_INEQUALITY_INCOME_VARIABLE, + filter_variable: str | None = None, + filter_variable_eq: Any | None = None, + filter_variable_leq: Any | None = None, + filter_variable_geq: Any | None = None, +) -> Inequality: + """Calculate inequality metrics for a US simulation. 
class Poverty(Output):
    """Single poverty measure result - represents one database row.

    Single-simulation output: for one poverty indicator variable it records
    the headcount in poverty, the population considered and the resulting
    rate, optionally restricted by a demographic filter variable.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    simulation: Simulation
    poverty_variable: str
    entity: str = "person"

    # Optional demographic filters
    filter_variable: str | None = None
    filter_variable_eq: Any | None = None
    filter_variable_leq: Any | None = None
    filter_variable_geq: Any | None = None

    # Results populated by run()
    headcount: float | None = None
    total_population: float | None = None
    rate: float | None = None

    def run(self):
        """Populate headcount, total_population and rate.

        The poverty variable (and the filter variable, when given) is read
        at the level of ``self.entity``, being mapped from its own entity
        when the two differ. The rate is 0.0 when the population is not
        positive.
        """
        model_version = self.simulation.tax_benefit_model_version
        entity = self.entity

        poverty_entity = model_version.get_variable(
            self.poverty_variable
        ).entity
        entity_table = getattr(self.simulation.output_dataset.data, entity)

        # Take the indicator directly when it already lives on the target
        # entity, otherwise map it across
        if poverty_entity == entity:
            in_poverty = entity_table[self.poverty_variable]
        else:
            in_poverty = self.simulation.output_dataset.data.map_to_entity(
                poverty_entity,
                entity,
                columns=[self.poverty_variable],
            )[self.poverty_variable]

        if self.filter_variable is not None:
            filter_entity = model_version.get_variable(
                self.filter_variable
            ).entity

            if filter_entity == entity:
                filter_values = entity_table[self.filter_variable]
            else:
                filter_values = (
                    self.simulation.output_dataset.data.map_to_entity(
                        filter_entity,
                        entity,
                        columns=[self.filter_variable],
                    )[self.filter_variable]
                )

            # Rows with a missing filter value never pass; each bound that
            # was supplied narrows the selection further
            keep = filter_values.notna()
            if self.filter_variable_eq is not None:
                keep &= filter_values == self.filter_variable_eq
            if self.filter_variable_leq is not None:
                keep &= filter_values <= self.filter_variable_leq
            if self.filter_variable_geq is not None:
                keep &= filter_values >= self.filter_variable_geq

            in_poverty = in_poverty[keep]

        # sum() and count() come straight from the series; presumably they
        # are weighted when the tables are MicroDataFrames - TODO confirm
        poor = float((in_poverty == True).sum())  # noqa: E712
        population = float(in_poverty.count())

        self.headcount = poor
        self.total_population = population
        if population > 0:
            self.rate = poor / population
        else:
            self.rate = 0.0
def calculate_us_poverty_rates(
    simulation: Simulation,
    filter_variable: str | None = None,
    filter_variable_eq: Any | None = None,
    filter_variable_leq: Any | None = None,
    filter_variable_geq: Any | None = None,
) -> OutputCollection[Poverty]:
    """Run one person-level Poverty output per US poverty variable.

    Args:
        simulation: The simulation to analyse
        filter_variable: Optional variable to filter by (e.g., "is_child")
        filter_variable_eq: Keep rows where the filter variable equals this
        filter_variable_leq: Keep rows where the filter variable is at most this
        filter_variable_geq: Keep rows where the filter variable is at least this

    Returns:
        OutputCollection holding the Poverty objects, in the order of
        US_POVERTY_VARIABLES, plus a dataframe with one row per object
    """
    shared = {
        "simulation": simulation,
        "entity": "person",
        "filter_variable": filter_variable,
        "filter_variable_eq": filter_variable_eq,
        "filter_variable_leq": filter_variable_leq,
        "filter_variable_geq": filter_variable_geq,
    }

    results = []
    for variable_name in US_POVERTY_VARIABLES.values():
        measure = Poverty(poverty_variable=variable_name, **shared)
        measure.run()
        results.append(measure)

    # Columns copied verbatim from each result, after the simulation id
    copied_fields = (
        "poverty_variable",
        "filter_variable",
        "filter_variable_eq",
        "filter_variable_leq",
        "filter_variable_geq",
        "headcount",
        "total_population",
        "rate",
    )
    rows = []
    for measure in results:
        row = {"simulation_id": measure.simulation.id}
        for field_name in copied_fields:
            row[field_name] = getattr(measure, field_name)
        rows.append(row)

    return OutputCollection(outputs=results, dataframe=pd.DataFrame(rows))
policyengine.outputs.poverty import ( + Poverty, + calculate_uk_poverty_rates, +) from .datasets import PolicyEngineUKDataset, UKYearData from .model import uk_latest @@ -175,6 +183,10 @@ class PolicyReformAnalysis(BaseModel): decile_impacts: OutputCollection[DecileImpact] programme_statistics: OutputCollection[ProgrammeStatistics] + baseline_poverty: OutputCollection[Poverty] + reform_poverty: OutputCollection[Poverty] + baseline_inequality: Inequality + reform_inequality: Inequality def economic_impact_analysis( @@ -262,7 +274,19 @@ def economic_impact_analysis( outputs=programme_statistics, dataframe=programme_df ) + # Calculate poverty rates for both simulations + baseline_poverty = calculate_uk_poverty_rates(baseline_simulation) + reform_poverty = calculate_uk_poverty_rates(reform_simulation) + + # Calculate inequality for both simulations + baseline_inequality = calculate_uk_inequality(baseline_simulation) + reform_inequality = calculate_uk_inequality(reform_simulation) + return PolicyReformAnalysis( decile_impacts=decile_impacts, programme_statistics=programme_collection, + baseline_poverty=baseline_poverty, + reform_poverty=reform_poverty, + baseline_inequality=baseline_inequality, + reform_inequality=reform_inequality, ) diff --git a/src/policyengine/tax_benefit_models/us/analysis.py b/src/policyengine/tax_benefit_models/us/analysis.py index f626a8c6..fad0a5b1 100644 --- a/src/policyengine/tax_benefit_models/us/analysis.py +++ b/src/policyengine/tax_benefit_models/us/analysis.py @@ -14,6 +14,14 @@ DecileImpact, calculate_decile_impacts, ) +from policyengine.outputs.inequality import ( + Inequality, + calculate_us_inequality, +) +from policyengine.outputs.poverty import ( + Poverty, + calculate_us_poverty_rates, +) from .datasets import PolicyEngineUSDataset, USYearData from .model import us_latest @@ -193,6 +201,10 @@ class PolicyReformAnalysis(BaseModel): decile_impacts: OutputCollection[DecileImpact] program_statistics: 
OutputCollection[ProgramStatistics] + baseline_poverty: OutputCollection[Poverty] + reform_poverty: OutputCollection[Poverty] + baseline_inequality: Inequality + reform_inequality: Inequality def economic_impact_analysis( @@ -283,6 +295,19 @@ def economic_impact_analysis( outputs=program_statistics, dataframe=program_df ) + # Calculate poverty rates for both simulations + baseline_poverty = calculate_us_poverty_rates(baseline_simulation) + reform_poverty = calculate_us_poverty_rates(reform_simulation) + + # Calculate inequality for both simulations + baseline_inequality = calculate_us_inequality(baseline_simulation) + reform_inequality = calculate_us_inequality(reform_simulation) + return PolicyReformAnalysis( - decile_impacts=decile_impacts, program_statistics=program_collection + decile_impacts=decile_impacts, + program_statistics=program_collection, + baseline_poverty=baseline_poverty, + reform_poverty=reform_poverty, + baseline_inequality=baseline_inequality, + reform_inequality=reform_inequality, ) diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py index 27760e01..939f3f17 100644 --- a/src/policyengine/tax_benefit_models/us/model.py +++ b/src/policyengine/tax_benefit_models/us/model.py @@ -59,6 +59,8 @@ class PolicyEngineUSLatest(TaxBenefitModelVersion): "person_weight", # Demographics "age", + "is_child", + "is_adult", # Income "employment_income", # Benefits @@ -81,6 +83,9 @@ class PolicyEngineUSLatest(TaxBenefitModelVersion): "snap", "tanf", "spm_unit_net_income", + # Poverty measures + "spm_unit_is_in_spm_poverty", + "spm_unit_is_in_deep_spm_poverty", ], "tax_unit": [ "tax_unit_id", diff --git a/tests/test_inequality.py b/tests/test_inequality.py new file mode 100644 index 00000000..bbdb0962 --- /dev/null +++ b/tests/test_inequality.py @@ -0,0 +1,288 @@ +"""Tests for inequality analysis output type.""" + +import os +import tempfile + +import numpy as np +import pandas as pd +from microdf import 
def test_gini_perfect_equality():
    """Identical incomes should give a Gini of roughly zero."""
    incomes = np.full(4, 100.0)
    unit_weights = np.ones(4)
    assert abs(_gini(incomes, unit_weights)) < 0.01  # Should be ~0


def test_gini_perfect_inequality():
    """A single unit holding all the income should give a high Gini."""
    # Three units with nothing, one with everything
    incomes = np.array([0.0] * 3 + [1000.0])
    unit_weights = np.ones(4)
    assert _gini(incomes, unit_weights) > 0.7  # Should be high


def test_gini_moderate_inequality():
    """Evenly spaced incomes should give a moderate Gini."""
    incomes = np.array([10.0, 20.0, 30.0, 40.0])
    unit_weights = np.ones(4)
    result = _gini(incomes, unit_weights)
    # Moderate inequality should be between 0.1 and 0.4
    assert result > 0.1
    assert result < 0.4


def test_gini_weighted():
    """Unequal weights should be reflected in the Gini."""
    # 3 people with 100, 1 with 200
    incomes = np.array([100.0, 200.0])
    unit_weights = np.array([3.0, 1.0])
    result = _gini(incomes, unit_weights)
    # Should reflect that 75% have lower income
    assert result > 0.05
    assert result < 0.3
pd.DataFrame( + { + "household_id": [0, 1], + "household_weight": [1.0, 1.0], + "equiv_hbai_household_net_income": [20000.0, 80000.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + inequality = Inequality( + simulation=simulation, + income_variable="equiv_hbai_household_net_income", + entity="household", + ) + inequality.run() + + # Check Gini is calculated + assert inequality.gini is not None + assert 0 <= inequality.gini <= 1 + + # Check income shares are calculated + assert inequality.top_10_share is not None + assert inequality.top_1_share is not None + assert inequality.bottom_50_share is not None + + # With 2 households of equal weight, one with 20k and one with 80k: + # Top 50% (1 hh) has 80k, bottom 50% (1 hh) has 20k + # Total = 100k, so bottom 50% share = 20% + assert abs(inequality.bottom_50_share - 0.2) < 0.01 + + +def test_inequality_income_shares(): + """Test income share calculations with more households.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": list(range(10)), + "benunit_id": list(range(10)), + "household_id": list(range(10)), + "person_weight": [1.0] * 10, + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": list(range(10)), + "benunit_weight": [1.0] * 10, + } + ), + weights="benunit_weight", + ) + + # 10 households with incomes 10k, 20k, ..., 100k + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": list(range(10)), + "household_weight": [1.0] * 10, + "equiv_hbai_household_net_income": [ + 10000.0, + 20000.0, + 30000.0, + 40000.0, + 
def test_inequality_variable_defaults():
    """Check the default income variable names for the UK and the US."""
    uk_default = UK_INEQUALITY_INCOME_VARIABLE
    us_default = US_INEQUALITY_INCOME_VARIABLE
    assert (uk_default, us_default) == (
        "equiv_hbai_household_net_income",
        "household_net_income",
    )
def test_poverty_basic():
    """Test basic poverty calculation.

    in_poverty_bhc is a household-level variable, so we set it on households
    and then map to persons for the rate calculation.
    """
    # Create test data - 2 people in household 0 (in poverty), 3 in household 1 (not)
    person_df = MicroDataFrame(
        pd.DataFrame(
            {
                "person_id": [0, 1, 2, 3, 4],
                "benunit_id": [0, 0, 1, 1, 1],
                "household_id": [0, 0, 1, 1, 1],
                "person_weight": [1.0, 1.0, 1.0, 1.0, 1.0],
                "is_child": [True, False, True, False, False],
            }
        ),
        weights="person_weight",
    )

    benunit_df = MicroDataFrame(
        pd.DataFrame(
            {
                "benunit_id": [0, 1],
                "benunit_weight": [1.0, 1.0],
            }
        ),
        weights="benunit_weight",
    )

    # Household 0 is in poverty, household 1 is not
    household_df = MicroDataFrame(
        pd.DataFrame(
            {
                "household_id": [0, 1],
                "household_weight": [1.0, 1.0],
                "in_poverty_bhc": [True, False],
            }
        ),
        weights="household_weight",
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        filepath = os.path.join(tmpdir, "test.h5")

        dataset = PolicyEngineUKDataset(
            name="Test",
            description="Test dataset",
            filepath=filepath,
            year=2024,
            data=UKYearData(
                person=person_df, benunit=benunit_df, household=household_df
            ),
        )

        simulation = Simulation(
            dataset=dataset,
            tax_benefit_model_version=uk_latest,
            output_dataset=dataset,
        )

        # Calculate poverty at person level (mapping from household)
        poverty = Poverty(
            simulation=simulation,
            poverty_variable="in_poverty_bhc",
            entity="person",
        )
        poverty.run()

        # 2 people (in hh 0) out of 5 total in poverty = 40%
        assert poverty.headcount == 2.0
        assert poverty.total_population == 5.0
        # The rate is a computed float ratio, so compare with a tolerance
        # rather than exact equality
        assert abs(poverty.rate - 0.4) < 0.001
+ """ + # Household 0: 2 people (1 child, 1 adult), in poverty + # Household 1: 3 people (2 children, 1 adult), not in poverty + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [0, 1, 2, 3, 4], + "benunit_id": [0, 0, 1, 1, 1], + "household_id": [0, 0, 1, 1, 1], + "person_weight": [1.0, 1.0, 1.0, 1.0, 1.0], + "is_child": [True, False, True, True, False], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [0, 1], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [0, 1], + "household_weight": [1.0, 1.0], + "in_poverty_bhc": [True, False], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + # Calculate child poverty (filter for is_child == True) + child_poverty = Poverty( + simulation=simulation, + poverty_variable="in_poverty_bhc", + entity="person", + filter_variable="is_child", + filter_variable_eq=True, + ) + child_poverty.run() + + # 3 children total: 1 in hh 0 (in poverty) + 2 in hh 1 (not in poverty) + # Child poverty headcount = 1, total children = 3, rate = 33.3% + assert child_poverty.headcount == 1.0 + assert child_poverty.total_population == 3.0 + assert abs(child_poverty.rate - 1 / 3) < 0.001 + + +def test_poverty_type_enums(): + """Test poverty type enums have correct values.""" + # UK poverty types + assert UKPovertyType.ABSOLUTE_BHC == "absolute_bhc" + assert UKPovertyType.ABSOLUTE_AHC == "absolute_ahc" + assert UKPovertyType.RELATIVE_BHC == "relative_bhc" + assert 
def test_poverty_variable_mappings():
    """Check each poverty type maps to the expected variable name."""
    expected_uk = {
        UKPovertyType.ABSOLUTE_BHC: "in_poverty_bhc",
        UKPovertyType.ABSOLUTE_AHC: "in_poverty_ahc",
        UKPovertyType.RELATIVE_BHC: "in_relative_poverty_bhc",
        UKPovertyType.RELATIVE_AHC: "in_relative_poverty_ahc",
    }
    for poverty_type, variable_name in expected_uk.items():
        assert UK_POVERTY_VARIABLES[poverty_type] == variable_name

    expected_us = {
        USPovertyType.SPM: "spm_unit_is_in_spm_poverty",
        USPovertyType.SPM_DEEP: "spm_unit_is_in_deep_spm_poverty",
    }
    for poverty_type, variable_name in expected_us.items():
        assert US_POVERTY_VARIABLES[poverty_type] == variable_name
+ """ + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [0, 1, 2, 3], + "benunit_id": [0, 0, 1, 1], + "household_id": [0, 0, 1, 1], + "person_weight": [1.0, 2.0, 3.0, 4.0], # Total weight = 10 + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [0, 1], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [0, 1], + "household_weight": [1.0, 1.0], + "in_poverty_bhc": [True, False], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + # Calculate poverty + poverty = Poverty( + simulation=simulation, + poverty_variable="in_poverty_bhc", + entity="person", + ) + poverty.run() + + # Weighted: 1 + 2 = 3 in poverty (hh 0), total = 10, rate = 30% + assert poverty.headcount == 3.0 + assert poverty.total_population == 10.0 + assert poverty.rate == 0.3