From 543400257e132df9c3c567bb1a105a2e8271e0c1 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 30 Jul 2025 19:44:14 +0200 Subject: [PATCH 01/12] enable L0 hyperparameter tuning --- changelog_entry.yaml | 4 + pyproject.toml | 1 + src/microcalibrate/calibration.py | 225 ++++++++++++++++++++++++++-- src/microcalibrate/utils/metrics.py | 6 +- tests/test_regularization.py | 141 +++++++++++++++++ 5 files changed, 364 insertions(+), 13 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29..bced5d6 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + added: + - Add hyperparameter tuning for L0 implementation. diff --git a/pyproject.toml b/pyproject.toml index 8cf58eb..954d1ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ "numpy", "pandas", "tqdm", + "optuna", ] [project.optional-dependencies] diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index 9a9f240..cd57df5 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -1,10 +1,10 @@ import logging -from typing import Callable, List, Optional +from typing import Any, Callable, Dict, List, Optional import numpy as np +import optuna import pandas as pd import torch -from torch import Tensor logger = logging.getLogger(__name__) @@ -16,7 +16,9 @@ def __init__( targets: np.ndarray, target_names: Optional[np.ndarray] = None, estimate_matrix: Optional[pd.DataFrame] = None, - estimate_function: Optional[Callable[[Tensor], Tensor]] = None, + estimate_function: Optional[ + Callable[[torch.Tensor], torch.Tensor] + ] = None, epochs: Optional[int] = 32, noise_level: Optional[float] = 10.0, learning_rate: Optional[float] = 1e-3, @@ -37,7 +39,7 @@ def __init__( targets (np.ndarray): Array of target values. target_names (Optional[np.ndarray]): Optional names of the targets for logging. Defaults to None. You MUST pass these names if you are not passing in an estimate matrix, and just passing in an estimate function. estimate_matrix (pd.DataFrame): DataFrame containing the estimate matrix. - estimate_function (Optional[Callable[[Tensor], Tensor]]): Function to estimate targets from weights. Defaults to None, in which case it will use the estimate_matrix. + estimate_function (Optional[Callable[[torch.Tensor], torch.Tensor]]): Function to estimate targets from weights. Defaults to None, in which case it will use the estimate_matrix. epochs (int): Optional number of epochs for calibration. Defaults to 32. noise_level (float): Optional level of noise to add to weights. Defaults to 10.0. learning_rate (float): Optional learning rate for the optimizer. Defaults to 1e-3. 
@@ -78,6 +80,7 @@ def __init__( self.init_mean = init_mean self.temperature = temperature self.regularize_with_l0 = regularize_with_l0 + self.seed = 42 self.estimate_matrix = None self.targets = None @@ -263,13 +266,13 @@ def exclude_targets( self.targets = targets_array self.target_names = target_names - def estimate(self) -> pd.Series: + def estimate(self, weights: Optional[np.ndarray] = None) -> pd.Series: + if weights is None: + weights = self.weights return pd.Series( index=self.target_names, data=self.estimate_function( - torch.tensor( - self.weights, dtype=torch.float32, device=self.device - ) + torch.tensor(weights, dtype=torch.float32, device=self.device) ) .cpu() .detach() @@ -278,7 +281,7 @@ def estimate(self) -> pd.Series: def _assess_targets( self, - estimate_function: Callable[[Tensor], Tensor], + estimate_function: Callable[[torch.Tensor], torch.Tensor], estimate_matrix: Optional[pd.DataFrame], weights: np.ndarray, targets: np.ndarray, @@ -287,7 +290,7 @@ def _assess_targets( """Assess the targets to ensure they do not violate basic requirements like compatibility, correct order of magnitude, etc. Args: - estimate_function (Callable[[Tensor], Tensor]): Function to estimate the targets from weights. + estimate_function (Callable[[torch.Tensor], torch.Tensor]): Function to estimate the targets from weights. estimate_matrix (Optional[pd.DataFrame]): DataFrame containing the estimate matrix. Defaults to None. weights (np.ndarray): Array of original weights. targets (np.ndarray): Array of target values. @@ -463,3 +466,205 @@ def summary( ) / df["Official target"] df = df.reset_index(drop=True) return df + + def tune_hyperparameters( + self, + n_trials: int = 30, + objectives_balance: Optional[Dict[str, float]] = { + "loss": 1.0, + "accuracy": 100.0, + "sparsity": 10.0, + }, + epochs_per_trial: Optional[int] = None, + timeout: Optional[float] = None, + n_jobs: int = 1, + study_name: Optional[str] = None, + storage: Optional[str] = None, + load_if_exists: bool = False, + direction: str = "minimize", + sampler: Optional["optuna.samplers.BaseSampler"] = None, + pruner: Optional["optuna.pruners.BasePruner"] = None, + ) -> Dict[str, Any]: + """ + Tune hyperparameters for L0 regularization using Optuna. + + This method optimizes l0_lambda, init_mean, and temperature to achieve: + 1. Low calibration loss + 2. High percentage of targets within 10% of their true values + 3. Sparse weights (fewer non-zero weights) + + Args: + n_trials: Number of optimization trials to run. + epochs_per_trial: Number of epochs per trial. If None, uses self.epochs // 4. + objectives_balance: Dictionary to balance the importance of loss, accuracy, and sparsity in the objective function. Default prioritizes being within 10% of targets. + timeout: Stop study after this many seconds. None means no timeout. + n_jobs: Number of parallel jobs. -1 means using all processors. + study_name: Name of the study for storage. + storage: Database URL for distributed optimization. + load_if_exists: Whether to load existing study. + direction: Optimization direction ('minimize' or 'maximize'). + sampler: Optuna sampler for hyperparameter suggestions. + pruner: Optuna pruner for early stopping of trials. + + Returns: + Dictionary containing the best hyperparameters found. 
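+
+        Example (illustrative sketch; `cal` stands for an already-constructed
+        Calibration instance, and the trial counts shown are arbitrary):
+
+            best = cal.tune_hyperparameters(n_trials=20, epochs_per_trial=64)
+            # Apply the tuned L0 hyperparameters before the final calibration run
+            cal.l0_lambda = best["l0_lambda"]
+            cal.init_mean = best["init_mean"]
+            cal.temperature = best["temperature"]
+            cal.regularize_with_l0 = True
+            cal.calibrate()
+            sparse_weights = cal.sparse_weights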
+ """ + # Suppress Optuna's logs during optimization + optuna.logging.set_verbosity(optuna.logging.WARNING) + + if epochs_per_trial is None: + epochs_per_trial = max(self.epochs // 4, 100) + + logger.info( + f"Starting hyperparameter tuning with {n_trials} trials, " + f"{epochs_per_trial} epochs per trial" + ) + + def objective( + trial: optuna.Trial, + objectives_balance: Dict[str, float] = objectives_balance, + ) -> float: + """Objective function for Optuna optimization.""" + try: + # Suggest hyperparameters + l0_lambda = trial.suggest_float( + "l0_lambda", 1e-6, 1e-4, log=True + ) + init_mean = trial.suggest_float("init_mean", 0.5, 0.999) + temperature = trial.suggest_float("temperature", 0.5, 2.0) + + # Store original parameters + original_l0_lambda = self.l0_lambda + original_init_mean = self.init_mean + original_temperature = self.temperature + original_regularize = self.regularize_with_l0 + original_epochs = self.epochs + + # Update parameters for this trial + self.l0_lambda = l0_lambda + self.init_mean = init_mean + self.temperature = temperature + self.regularize_with_l0 = True + self.epochs = epochs_per_trial + + # Run calibration with current hyperparameters + performance_df = self.calibrate() + sparse_weights = self.sparse_weights + + # Calculate metrics for objective + if sparse_weights is not None: + final_estimates = self.estimate(sparse_weights) + + targets_tensor = torch.tensor( + self.targets, dtype=torch.float32, device=self.device + ) + estimates_tensor = torch.tensor( + final_estimates, + dtype=torch.float32, + device=self.device, + ) + + from .utils.metrics import loss, pct_close + + within_10_pct = pct_close(estimates_tensor, targets_tensor) + final_loss = loss( + estimates_tensor, + targets_tensor, + self.normalization_factor, + ) + + sparsity = np.mean(sparse_weights == 0) + + # Combined objective: minimize loss while maximizing sparsity and percentage within 10% + # We weight these components to balance their importance + objective_value = ( + final_loss * objectives_balance["loss"] + + (1 - within_10_pct) * objectives_balance["accuracy"] + + (1 - sparsity) * objectives_balance["sparsity"] + ) + + # Report intermediate values for multi-objective optimization + trial.set_user_attr("final_loss", final_loss) + trial.set_user_attr("within_10_pct", within_10_pct) + trial.set_user_attr("sparsity", sparsity) + trial.set_user_attr( + "n_nonzero_weights", int(np.sum(sparse_weights != 0)) + ) + + # Log progress + if trial.number % 5 == 0: + logger.info( + f"Trial {trial.number}: loss={final_loss:.6f}, " + f"within_10%={within_10_pct:.2%}, " + f"sparsity={sparsity:.2%}, " + f"objective={objective_value:.6f}" + ) + + except Exception as e: + logger.warning(f"Trial {trial.number} failed: {str(e)}") + objective_value = 1e10 + + finally: + # Restore original parameters + self.l0_lambda = original_l0_lambda + self.init_mean = original_init_mean + self.temperature = original_temperature + self.regularize_with_l0 = original_regularize + self.epochs = original_epochs + + return objective_value + + # Create or load study + if sampler is None: + sampler = optuna.samplers.TPESampler(seed=self.seed) + + study = optuna.create_study( + study_name=study_name, + storage=storage, + load_if_exists=load_if_exists, + direction=direction, + sampler=sampler, + pruner=pruner, + ) + + # Run optimization + study.optimize( + objective, + n_trials=n_trials, + timeout=timeout, + n_jobs=n_jobs, + show_progress_bar=True, + ) + + # Get best parameters + best_params = study.best_params + best_value = 
study.best_value + + # Add additional metrics from the best trial + best_trial = study.best_trial + best_params["final_loss"] = best_trial.user_attrs.get( + "final_loss", None + ) + best_params["within_10_pct"] = best_trial.user_attrs.get( + "within_10_pct", None + ) + best_params["sparsity"] = best_trial.user_attrs.get("sparsity", None) + best_params["n_nonzero_weights"] = best_trial.user_attrs.get( + "n_nonzero_weights", None + ) + + logger.info( + f"\nHyperparameter tuning completed!" + f"\nBest objective value: {best_value:.6f}" + f"\nBest parameters:" + f"\n - l0_lambda: {best_params['l0_lambda']:.2e}" + f"\n - init_mean: {best_params['init_mean']:.4f}" + f"\n - temperature: {best_params['temperature']:.4f}" + f"\nBest trial metrics:" + f"\n - Final loss: {best_params['final_loss']:.6f}" + f"\n - Within 10% of targets: {best_params['within_10_pct']:.2%}" + f"\n - Sparsity: {best_params['sparsity']:.2%}" + f"\n - Non-zero weights: {best_params['n_nonzero_weights']:,} / {len(self.weights):,}" + ) + + return best_params diff --git a/src/microcalibrate/utils/metrics.py b/src/microcalibrate/utils/metrics.py index da55edf..37237ad 100644 --- a/src/microcalibrate/utils/metrics.py +++ b/src/microcalibrate/utils/metrics.py @@ -30,18 +30,18 @@ def loss( def pct_close( estimate: torch.Tensor, - targets_array: torch.Tensor, + targets: torch.Tensor, t: Optional[float] = 0.1, ) -> float: """Calculate the percentage of estimates close to targets. Args: estimate (torch.Tensor): Current estimates in log space. - targets_array (torch.Tensor): Array of target values. + targets (torch.Tensor): Array of target values. t (float): Optional threshold for closeness. Returns: float: Percentage of estimates within the threshold. """ - abs_error = torch.abs((estimate - targets_array) / (1 + targets_array)) + abs_error = torch.abs((estimate - targets) / (1 + targets)) return ((abs_error < t).sum() / abs_error.numel()).item() diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 34bf707..7ca5119 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -106,3 +106,144 @@ def test_calibration_with_l0_regularization() -> None: assert ( percentage_below_threshold > 10 ), f"Only {percentage_below_threshold:.1f}% of sparse weights are below 0.5 (expected > 10%)" + + +def test_l0_hyperparameter_tuning() -> None: + # Create a sample dataset for testing + random_generator = np.random.default_rng(0) + data = pd.DataFrame( + { + "age": np.append(random_generator.integers(18, 70, size=500), 71), + "income": random_generator.normal(40000, 10000, size=501), + } + ) + + weights = np.ones(len(data)) + + # Calculate target values: + targets_matrix = pd.DataFrame( + { + "income_aged_20_30": ( + (data["age"] >= 20) & (data["age"] < 30) + ).astype(float) + * data["income"], + "income_aged_30_40": ( + (data["age"] >= 30) & (data["age"] < 40) + ).astype(float) + * data["income"], + "income_aged_40_50": ( + (data["age"] >= 40) & (data["age"] < 50) + ).astype(float) + * data["income"], + "income_aged_50_60": ( + (data["age"] >= 50) & (data["age"] < 60) + ).astype(float) + * data["income"], + "income_aged_60_70": ( + (data["age"] >= 60) & (data["age"] <= 70) + ).astype(float) + * data["income"], + } + ) + targets = np.array( + [ + (targets_matrix["income_aged_20_30"] * weights).sum() * 1.2, + (targets_matrix["income_aged_30_40"] * weights).sum() * 1.3, + (targets_matrix["income_aged_40_50"] * weights).sum() * 0.9, + (targets_matrix["income_aged_50_60"] * weights).sum() * 1.5, + 
(targets_matrix["income_aged_60_70"] * weights).sum() * 1.2, + ] + ) + + # Create calibrator instance + calibrator = Calibration( + estimate_matrix=targets_matrix, + weights=weights, + targets=targets, + noise_level=0.05, + epochs=200, + learning_rate=0.01, + dropout_rate=0, + regularize_with_l0=False, + ) + + # Test hyperparameter tuning + best_params = calibrator.tune_hyperparameters( + n_trials=10, # Fewer trials for testing + epochs_per_trial=50, # Shorter epochs for quick testing + objectives_balance={ + "loss": 1.0, + "accuracy": 100.0, # Prioritize hitting targets + "sparsity": 10.0, + }, + n_jobs=1, + ) + + # Verify that best_params contains expected keys + assert "l0_lambda" in best_params, "Missing l0_lambda in best parameters" + assert "init_mean" in best_params, "Missing init_mean in best parameters" + assert ( + "temperature" in best_params + ), "Missing temperature in best parameters" + assert "final_loss" in best_params, "Missing final_loss in best parameters" + assert ( + "within_10_pct" in best_params + ), "Missing within_10_pct in best parameters" + assert "sparsity" in best_params, "Missing sparsity in best parameters" + assert ( + "n_nonzero_weights" in best_params + ), "Missing n_nonzero_weights in best parameters" + + # Verify parameter ranges + assert ( + 1e-6 <= best_params["l0_lambda"] <= 1e-4 + ), f"l0_lambda {best_params['l0_lambda']} out of range" + assert ( + 0.5 <= best_params["init_mean"] <= 0.999 + ), f"init_mean {best_params['init_mean']} out of range" + assert ( + 0.5 <= best_params["temperature"] <= 2.0 + ), f"temperature {best_params['temperature']} out of range" + + # Verify metrics are reasonable + assert ( + 0 <= best_params["within_10_pct"] <= 1 + ), "within_10_pct should be between 0 and 1" + assert ( + 0 <= best_params["sparsity"] <= 1 + ), "sparsity should be between 0 and 1" + assert best_params["n_nonzero_weights"] <= len( + weights + ), "n_nonzero_weights exceeds total weights" + + # Now run calibration with the best parameters + calibrator.l0_lambda = best_params["l0_lambda"] + calibrator.init_mean = best_params["init_mean"] + calibrator.temperature = best_params["temperature"] + calibrator.regularize_with_l0 = True + + # Run the full calibration + performance_df = calibrator.calibrate() + sparse_weights = calibrator.sparse_weights + + assert ( + sparse_weights is not None + ), "Sparse weights should be generated with L0 regularization" + + # Evaluate the final calibration + percentage_within_10 = evaluate_sparse_weights( + optimised_weights=sparse_weights, + estimate_matrix=targets_matrix, + targets_array=targets, + ) + + # The tuned parameters should give reasonable results + assert ( + percentage_within_10 > 50 + ), f"Only {percentage_within_10:.1f}% of targets within 10% (expected > 50%)" + + # Check that we achieved some sparsity + actual_sparsity = np.mean(sparse_weights == 0) + assert ( + actual_sparsity > 0.1 + ), f"Sparsity {actual_sparsity:.1%} is too low (expected > 10%)" From 1398743df990be7aa97046daa6f2698e2f03644f Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 30 Jul 2025 19:47:37 +0200 Subject: [PATCH 02/12] increase the importance of achieving sparsity with L0 --- tests/test_regularization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 7ca5119..87d5ddb 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -169,12 +169,12 @@ def test_l0_hyperparameter_tuning() -> None: # Test hyperparameter tuning 
best_params = calibrator.tune_hyperparameters( - n_trials=10, # Fewer trials for testing + n_trials=20, # Fewer trials for testing epochs_per_trial=50, # Shorter epochs for quick testing objectives_balance={ "loss": 1.0, "accuracy": 100.0, # Prioritize hitting targets - "sparsity": 10.0, + "sparsity": 30.0, }, n_jobs=1, ) From 0c5d975060dcdedf368f4e928cd8dc7d6dcd4afa Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Wed, 30 Jul 2025 22:47:50 +0200 Subject: [PATCH 03/12] pass seed to torch too --- src/microcalibrate/calibration.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index cd57df5..66e109c 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -31,6 +31,7 @@ def __init__( init_mean: float = 0.999, # initial proportion with non-zero weights, set near 0 temperature: float = 0.5, # usual values .5 to 3 regularize_with_l0: Optional[bool] = False, + seed: Optional[int] = 42, ): """Initialize the Calibration class. @@ -52,16 +53,8 @@ def __init__( init_mean (float): Initial mean for L0 regularization, representing the initial proportion of non-zero weights. Defaults to 0.999. temperature (float): Temperature parameter for L0 regularization, controlling the sparsity of the model. Defaults to 0.5. regularize_with_l0 (Optional[bool]): Whether to apply L0 regularization. Defaults to False. + seed (Optional[int]): Random seed for reproducibility. Defaults to 42. """ - if device is not None: - self.device = torch.device(device) - else: - self.device = torch.device( - "cuda" - if torch.cuda.is_available() - else "mps" if torch.mps.is_available() else "cpu" - ) - self.original_estimate_matrix = estimate_matrix self.original_targets = targets self.original_target_names = target_names @@ -80,7 +73,18 @@ def __init__( self.init_mean = init_mean self.temperature = temperature self.regularize_with_l0 = regularize_with_l0 - self.seed = 42 + self.seed = seed + + if device is not None: + self.device = torch.device(device) + torch.manual_seed(self.seed) + else: + self.device = torch.device( + "cuda" + if torch.cuda.is_available() + else "mps" if torch.mps.is_available() else "cpu" + ) + torch.cuda.manual_seed(self.seed) self.estimate_matrix = None self.targets = None From 4fef2f30e00bfb7f060c4786cdf27f1334fc55e3 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 31 Jul 2025 13:40:36 +0200 Subject: [PATCH 04/12] add holdout to hyperparam tuning --- src/microcalibrate/calibration.py | 454 +++++++++++++++++++++++------- 1 file changed, 349 insertions(+), 105 deletions(-) diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index 66e109c..a5da9b7 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -58,9 +58,9 @@ def __init__( self.original_estimate_matrix = estimate_matrix self.original_targets = targets self.original_target_names = target_names + self.original_estimate_function = estimate_function self.weights = weights self.excluded_targets = excluded_targets - self.estimate_function = estimate_function self.epochs = epochs self.noise_level = noise_level self.learning_rate = learning_rate @@ -84,11 +84,13 @@ def __init__( if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu" ) - torch.cuda.manual_seed(self.seed) + if self.device == "cuda": + torch.cuda.manual_seed(self.seed) self.estimate_matrix = None self.targets = None self.target_names = None + 
self.estimate_function = None self.excluded_target_data = {} # Set target names from estimate_matrix if not provided @@ -111,7 +113,7 @@ def __init__( else: self.estimate_matrix = None - if self.estimate_function is None: + if self.original_estimate_function is None: if self.estimate_matrix is not None: self.estimate_function = ( lambda weights: weights @ self.estimate_matrix @@ -131,9 +133,7 @@ def calibrate(self) -> None: self._assess_targets( estimate_function=self.estimate_function, - estimate_matrix=getattr( - self, "original_estimate_matrix", self.estimate_matrix - ), + estimate_matrix=self.estimate_matrix, weights=self.weights, targets=self.targets, target_names=self.target_names, @@ -212,9 +212,9 @@ def exclude_targets( initial_weights_tensor = torch.tensor( self.weights, dtype=torch.float32, device=self.device ) - if self.estimate_function is not None: + if self.original_estimate_function is not None: initial_estimates_all = ( - self.estimate_function(initial_weights_tensor) + self.original_estimate_function(initial_weights_tensor) .detach() .cpu() .numpy() @@ -242,6 +242,10 @@ def exclude_targets( dtype=torch.float32, device=self.device, ) + + self.estimate_function = ( + lambda weights: weights @ self.estimate_matrix + ) else: raise ValueError( "Either estimate_function or estimate_matrix must be provided" @@ -263,6 +267,10 @@ def exclude_targets( dtype=torch.float32, device=self.device, ) + if self.original_estimate_function is None: + self.estimate_function = ( + lambda weights: weights @ self.estimate_matrix + ) else: self.estimate_matrix = None @@ -317,6 +325,11 @@ def _assess_targets( "Some targets are negative. This may not make sense for totals." ) + if estimate_matrix is None and self.excluded_targets is not None: + logger.warning( + "You are excluding targets but not passing an estimate matrix. Make sure the estimate function handles excluded targets correctly, otherwise you may face operand errors." + ) + # Estimate order of magnitude from column sums and warn if they are off by an order of magnitude from targets one_weights = weights * 0 + 1 estimates = ( @@ -330,6 +343,7 @@ def _assess_targets( .numpy() .flatten() ) + # Use a small epsilon to avoid division by zero eps = 1e-4 adjusted_estimates = np.where(estimates == 0, eps, estimates) @@ -446,7 +460,7 @@ def _get_linear_loss(metrics_matrix, target_vector, sparse=False): def summary( self, - ) -> str: + ) -> pd.DataFrame: """Generate a summary of the calibration process.""" if self.performance_df is None: return "No calibration has been performed yet, make sure to run .calibrate() before requesting a summary." @@ -473,19 +487,22 @@ def summary( def tune_hyperparameters( self, - n_trials: int = 30, + n_trials: Optional[int] = 30, objectives_balance: Optional[Dict[str, float]] = { "loss": 1.0, "accuracy": 100.0, "sparsity": 10.0, }, epochs_per_trial: Optional[int] = None, + n_holdout_sets: Optional[int] = 3, + holdout_fraction: Optional[float] = 0.2, + aggregation: Optional[str] = "mean", timeout: Optional[float] = None, - n_jobs: int = 1, + n_jobs: Optional[int] = 1, study_name: Optional[str] = None, storage: Optional[str] = None, - load_if_exists: bool = False, - direction: str = "minimize", + load_if_exists: Optional[bool] = False, + direction: Optional[str] = "minimize", sampler: Optional["optuna.samplers.BaseSampler"] = None, pruner: Optional["optuna.pruners.BasePruner"] = None, ) -> Dict[str, Any]: @@ -499,8 +516,11 @@ def tune_hyperparameters( Args: n_trials: Number of optimization trials to run. 
- epochs_per_trial: Number of epochs per trial. If None, uses self.epochs // 4. objectives_balance: Dictionary to balance the importance of loss, accuracy, and sparsity in the objective function. Default prioritizes being within 10% of targets. + epochs_per_trial: Number of epochs per trial. If None, uses self.epochs // 4. + n_holdout_sets: Number of different holdout sets to create and evaluate on + holdout_fraction: Fraction of targets in each holdout set + aggregation: How to combine scores across holdouts ("mean", "median", "worst") timeout: Stop study after this many seconds. None means no timeout. n_jobs: Number of parallel jobs. -1 means using all processors. study_name: Name of the study for storage. @@ -519,11 +539,28 @@ def tune_hyperparameters( if epochs_per_trial is None: epochs_per_trial = max(self.epochs // 4, 100) + holdout_sets = self._create_holdout_sets( + n_holdout_sets, holdout_fraction, self.seed + ) + logger.info( - f"Starting hyperparameter tuning with {n_trials} trials, " - f"{epochs_per_trial} epochs per trial" + f"Multi-holdout hyperparameter tuning:\n" + f" - {n_holdout_sets} holdout sets\n" + f" - {len(holdout_sets[0]['indices'])} targets per holdout ({holdout_fraction:.1%})\n" + f" - Aggregation: {aggregation}\n" ) + # Store original state + original_state = { + "excluded_targets": self.excluded_targets, + "targets": self.targets.copy(), + "target_names": ( + self.target_names.copy() + if self.target_names is not None + else None + ), + } + def objective( trial: optuna.Trial, objectives_balance: Dict[str, float] = objectives_balance, @@ -531,92 +568,98 @@ def objective( """Objective function for Optuna optimization.""" try: # Suggest hyperparameters - l0_lambda = trial.suggest_float( - "l0_lambda", 1e-6, 1e-4, log=True - ) - init_mean = trial.suggest_float("init_mean", 0.5, 0.999) - temperature = trial.suggest_float("temperature", 0.5, 2.0) - - # Store original parameters - original_l0_lambda = self.l0_lambda - original_init_mean = self.init_mean - original_temperature = self.temperature - original_regularize = self.regularize_with_l0 - original_epochs = self.epochs - - # Update parameters for this trial - self.l0_lambda = l0_lambda - self.init_mean = init_mean - self.temperature = temperature - self.regularize_with_l0 = True - self.epochs = epochs_per_trial - - # Run calibration with current hyperparameters - performance_df = self.calibrate() - sparse_weights = self.sparse_weights - - # Calculate metrics for objective - if sparse_weights is not None: - final_estimates = self.estimate(sparse_weights) - - targets_tensor = torch.tensor( - self.targets, dtype=torch.float32, device=self.device - ) - estimates_tensor = torch.tensor( - final_estimates, - dtype=torch.float32, - device=self.device, - ) - - from .utils.metrics import loss, pct_close + hyperparameters = { + "l0_lambda": trial.suggest_float( + "l0_lambda", 1e-6, 1e-4, log=True + ), + "init_mean": trial.suggest_float("init_mean", 0.5, 0.999), + "temperature": trial.suggest_float( + "temperature", 0.5, 2.0 + ), + } - within_10_pct = pct_close(estimates_tensor, targets_tensor) - final_loss = loss( - estimates_tensor, - targets_tensor, - self.normalization_factor, + # Evaluate on all holdout sets + holdout_results = [] + for holdout_idx, holdout_set in enumerate(holdout_sets): + result = self._evaluate_single_holdout( + holdout_set=holdout_set, + hyperparameters=hyperparameters, + epochs_per_trial=epochs_per_trial, + objectives_balance=objectives_balance, ) + holdout_results.append(result) - sparsity = 
np.mean(sparse_weights == 0) + # Aggregate objectives + final_objective = self._aggregate_holdout_results( + holdout_results, aggregation + ) - # Combined objective: minimize loss while maximizing sparsity and percentage within 10% - # We weight these components to balance their importance - objective_value = ( - final_loss * objectives_balance["loss"] - + (1 - within_10_pct) * objectives_balance["accuracy"] - + (1 - sparsity) * objectives_balance["sparsity"] - ) + # Store detailed metrics + trial.set_user_attr( + "holdout_objectives", + [r["objective"] for r in holdout_results], + ) + trial.set_user_attr( + "mean_val_loss", + np.mean([r["val_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "std_val_loss", + np.std([r["val_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_val_accuracy", + np.mean([r["val_accuracy"] for r in holdout_results]), + ) + trial.set_user_attr( + "std_val_accuracy", + np.std([r["val_accuracy"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_train_loss", + np.mean([r["train_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_train_accuracy", + np.mean([r["train_accuracy"] for r in holdout_results]), + ) + + # Use the last holdout's sparsity metrics + last_result = holdout_results[-1] + trial.set_user_attr("sparsity", last_result["sparsity"]) + trial.set_user_attr( + "n_nonzero_weights", + last_result.get("n_nonzero_weights", 0), + ) - # Report intermediate values for multi-objective optimization - trial.set_user_attr("final_loss", final_loss) - trial.set_user_attr("within_10_pct", within_10_pct) - trial.set_user_attr("sparsity", sparsity) - trial.set_user_attr( - "n_nonzero_weights", int(np.sum(sparse_weights != 0)) + # Log progress + if trial.number % 5 == 0: + objectives = [r["objective"] for r in holdout_results] + val_accuracies = [ + r["val_accuracy"] for r in holdout_results + ] + logger.info( + f"Trial {trial.number}:\n" + f" Objectives by holdout: {[f'{obj:.4f}' for obj in objectives]}\n" + f" {aggregation.capitalize()} objective: {final_objective:.4f}\n" + f" Mean val accuracy: {np.mean(val_accuracies):.2%} (±{np.std(val_accuracies):.2%})\n" + f" Sparsity: {last_result['sparsity']:.2%}" ) - # Log progress - if trial.number % 5 == 0: - logger.info( - f"Trial {trial.number}: loss={final_loss:.6f}, " - f"within_10%={within_10_pct:.2%}, " - f"sparsity={sparsity:.2%}, " - f"objective={objective_value:.6f}" - ) + return final_objective except Exception as e: logger.warning(f"Trial {trial.number} failed: {str(e)}") - objective_value = 1e10 + return 1e10 finally: - # Restore original parameters - self.l0_lambda = original_l0_lambda - self.init_mean = original_init_mean - self.temperature = original_temperature - self.regularize_with_l0 = original_regularize - self.epochs = original_epochs + # Restore original state + self.excluded_targets = original_state["excluded_targets"] + self.targets = original_state["targets"] + self.target_names = original_state["target_names"] - return objective_value + if self.excluded_targets is not None: + self.exclude_targets() # Create or load study if sampler is None: @@ -642,33 +685,234 @@ def objective( # Get best parameters best_params = study.best_params - best_value = study.best_value - - # Add additional metrics from the best trial best_trial = study.best_trial - best_params["final_loss"] = best_trial.user_attrs.get( - "final_loss", None + best_params["mean_val_loss"] = best_trial.user_attrs.get( + "mean_val_loss" + ) + best_params["std_val_loss"] = 
best_trial.user_attrs.get("std_val_loss") + best_params["mean_val_accuracy"] = best_trial.user_attrs.get( + "mean_val_accuracy" ) - best_params["within_10_pct"] = best_trial.user_attrs.get( - "within_10_pct", None + best_params["std_val_accuracy"] = best_trial.user_attrs.get( + "std_val_accuracy" ) - best_params["sparsity"] = best_trial.user_attrs.get("sparsity", None) - best_params["n_nonzero_weights"] = best_trial.user_attrs.get( - "n_nonzero_weights", None + best_params["holdout_objectives"] = best_trial.user_attrs.get( + "holdout_objectives" ) + best_params["sparsity"] = best_trial.user_attrs.get("sparsity") + best_params["n_holdout_sets"] = n_holdout_sets + best_params["aggregation"] = aggregation logger.info( - f"\nHyperparameter tuning completed!" - f"\nBest objective value: {best_value:.6f}" + f"\nMulti-holdout tuning completed!" f"\nBest parameters:" f"\n - l0_lambda: {best_params['l0_lambda']:.2e}" f"\n - init_mean: {best_params['init_mean']:.4f}" f"\n - temperature: {best_params['temperature']:.4f}" - f"\nBest trial metrics:" - f"\n - Final loss: {best_params['final_loss']:.6f}" - f"\n - Within 10% of targets: {best_params['within_10_pct']:.2%}" + f"\nPerformance across {n_holdout_sets} holdouts:" + f"\n - Mean val loss: {best_params['mean_val_loss']:.6f} (±{best_params['std_val_loss']:.6f})" + f"\n - Mean val accuracy: {best_params['mean_val_accuracy']:.2%} (±{best_params['std_val_accuracy']:.2%})" + f"\n - Individual objectives: {[f'{obj:.4f}' for obj in best_params['holdout_objectives']]}" f"\n - Sparsity: {best_params['sparsity']:.2%}" - f"\n - Non-zero weights: {best_params['n_nonzero_weights']:,} / {len(self.weights):,}" ) return best_params + + def _create_holdout_sets( + self, + n_holdout_sets: int, + holdout_fraction: float, + random_state: Optional[int] = None, + ) -> List[Dict[str, Any]]: + """Create multiple holdout sets for cross-validation. + + Args: + n_holdout_sets: Number of holdout sets to create + holdout_fraction: Fraction of targets in each holdout set + random_state: Base random seed for reproducibility + + Returns: + List of dictionaries containing holdout names and indices + """ + n_targets = len(self.original_target_names) + n_holdout_targets = max(1, int(n_targets * holdout_fraction)) + + holdout_sets = [] + for i in range(n_holdout_sets): + # Each holdout set gets a different random selection + set_rng = np.random.default_rng((random_state or self.seed) + i) + holdout_indices = set_rng.choice( + n_targets, size=n_holdout_targets, replace=False + ) + holdout_names = [ + self.original_target_names[idx] for idx in holdout_indices + ] + holdout_sets.append( + {"names": holdout_names, "indices": holdout_indices} + ) + + return holdout_sets + + def _evaluate_single_holdout( + self, + holdout_set: Dict[str, Any], + hyperparameters: Dict[str, float], + epochs_per_trial: int, + objectives_balance: Dict[str, float], + ) -> Dict[str, float]: + """Evaluate hyperparameters on a single holdout set. 
+ + Args: + holdout_set: Dictionary with 'names' and 'indices' of holdout targets + hyperparameters: Dictionary with l0_lambda, init_mean, temperature + epochs_per_trial: Number of epochs to run + objectives_balance: Weights for different objectives + + Returns: + Dictionary with evaluation metrics + """ + # Store original parameters + original_params = { + "l0_lambda": self.l0_lambda, + "init_mean": self.init_mean, + "temperature": self.temperature, + "regularize_with_l0": self.regularize_with_l0, + "epochs": self.epochs, + } + + try: + # Update parameters for this evaluation + self.l0_lambda = hyperparameters["l0_lambda"] + self.init_mean = hyperparameters["init_mean"] + self.temperature = hyperparameters["temperature"] + self.regularize_with_l0 = True + self.epochs = epochs_per_trial + + # Set up calibration with this holdout set + self.excluded_targets = holdout_set["names"] + self.exclude_targets() + + # Run calibration + performance_df = self.calibrate() + sparse_weights = self.sparse_weights + + # Get estimates for all targets + weights_tensor = torch.tensor( + sparse_weights, dtype=torch.float32, device=self.device + ) + + if self.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + self.original_estimate_matrix.values, + dtype=torch.float32, + device=self.device, + ) + all_estimates = ( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + self.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Split into train/validation + n_targets = len(self.original_target_names) + val_indices = holdout_set["indices"] + train_indices = [ + i for i in range(n_targets) if i not in val_indices + ] + + val_estimates = all_estimates[val_indices] + val_targets = self.original_targets[val_indices] + train_estimates = all_estimates[train_indices] + train_targets = self.original_targets[train_indices] + + # Calculate metrics + from .utils.metrics import loss, pct_close + + val_loss = loss( + torch.tensor( + val_estimates, dtype=torch.float32, device=self.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=self.device + ), + None, + ).item() + + val_accuracy = pct_close( + torch.tensor( + val_estimates, dtype=torch.float32, device=self.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=self.device + ), + ) + + train_loss = loss( + torch.tensor( + train_estimates, dtype=torch.float32, device=self.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=self.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, dtype=torch.float32, device=self.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=self.device + ), + ) + + sparsity = np.mean(sparse_weights == 0) + + # Calculate objective + objective = ( + val_loss * objectives_balance["loss"] + + (1 - val_accuracy) * objectives_balance["accuracy"] + + (1 - sparsity) * objectives_balance["sparsity"] + ) + + return { + "objective": objective, + "val_loss": val_loss, + "val_accuracy": val_accuracy, + "train_loss": train_loss, + "train_accuracy": train_accuracy, + "sparsity": sparsity, + "n_nonzero_weights": int(np.sum(sparse_weights != 0)), + } + + finally: + # Restore original parameters + for key, value in original_params.items(): + setattr(self, key, value) + + def _aggregate_holdout_results( + self, results: List[Dict[str, float]], aggregation: str + ) -> float: + """Aggregate results across multiple holdout sets. 
+ + Args: + results: List of evaluation results from each holdout + aggregation: Method to aggregate ('mean', 'median', 'worst') + + Returns: + Aggregated objective value + """ + objectives = [r["objective"] for r in results] + + if aggregation == "mean": + return np.mean(objectives) + elif aggregation == "median": + return np.median(objectives) + elif aggregation == "worst": + return np.max(objectives) + else: + raise ValueError(f"Unknown aggregation method: {aggregation}") From 24bf593a23932b3bc591a43be9228606e1ee520e Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 31 Jul 2025 13:53:15 +0200 Subject: [PATCH 05/12] update test to handle new logic --- changelog_entry.yaml | 2 +- src/microcalibrate/calibration.py | 4 +--- tests/test_regularization.py | 22 ++++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index bced5d6..0db6d6c 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -1,4 +1,4 @@ - bump: minor changes: added: - - Add hyperparameter tuning for L0 implementation. + - Add hyperparameter tuning for L0 implementation with option to holdout targets. diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index a5da9b7..1031fc6 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -657,9 +657,7 @@ def objective( self.excluded_targets = original_state["excluded_targets"] self.targets = original_state["targets"] self.target_names = original_state["target_names"] - - if self.excluded_targets is not None: - self.exclude_targets() + self.exclude_targets() # Create or load study if sampler is None: diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 87d5ddb..9e6a8d5 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -185,14 +185,16 @@ def test_l0_hyperparameter_tuning() -> None: assert ( "temperature" in best_params ), "Missing temperature in best parameters" - assert "final_loss" in best_params, "Missing final_loss in best parameters" assert ( - "within_10_pct" in best_params - ), "Missing within_10_pct in best parameters" + "mean_val_loss" in best_params + ), "Missing mean_val_loss in best parameters" + assert ( + "mean_val_accuracy" in best_params + ), "Missing mean_val_accuracy in best parameters" assert "sparsity" in best_params, "Missing sparsity in best parameters" assert ( - "n_nonzero_weights" in best_params - ), "Missing n_nonzero_weights in best parameters" + "holdout_objectives" in best_params + ), "Missing holdout_objectives in best parameters" # Verify parameter ranges assert ( @@ -207,14 +209,14 @@ def test_l0_hyperparameter_tuning() -> None: # Verify metrics are reasonable assert ( - 0 <= best_params["within_10_pct"] <= 1 - ), "within_10_pct should be between 0 and 1" + 0 <= best_params["mean_val_accuracy"] <= 1 + ), "mean_val_accuracy should be between 0 and 1" assert ( 0 <= best_params["sparsity"] <= 1 ), "sparsity should be between 0 and 1" - assert best_params["n_nonzero_weights"] <= len( - weights - ), "n_nonzero_weights exceeds total weights" + assert ( + best_params["mean_val_loss"] >= 0 + ), "mean_val_loss should be non-negative" # Now run calibration with the best parameters calibrator.l0_lambda = best_params["l0_lambda"] From 9b9c39d397440c7f26e5f3aab4ef3b0bbba55823 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Thu, 31 Jul 2025 14:03:58 +0200 Subject: [PATCH 06/12] add test to show behavior without holdouts --- tests/test_regularization.py | 155 
++++++++++++++++++++++++++++++++++- 1 file changed, 154 insertions(+), 1 deletion(-) diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 9e6a8d5..4a35f52 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -108,7 +108,8 @@ def test_calibration_with_l0_regularization() -> None: ), f"Only {percentage_below_threshold:.1f}% of sparse weights are below 0.5 (expected > 10%)" -def test_l0_hyperparameter_tuning() -> None: +def test_l0_hyperparameter_tuning_with_holdouts() -> None: + """Test L0 hyperparameter tuning with holdout validation.""" # Create a sample dataset for testing random_generator = np.random.default_rng(0) data = pd.DataFrame( @@ -249,3 +250,155 @@ def test_l0_hyperparameter_tuning() -> None: assert ( actual_sparsity > 0.1 ), f"Sparsity {actual_sparsity:.1%} is too low (expected > 10%)" + + +def test_l0_hyperparameter_tuning_without_holdouts() -> None: + """Test L0 hyperparameter tuning without holdout validation (simpler case).""" + # Create a sample dataset for testing + random_generator = np.random.default_rng(0) + data = pd.DataFrame( + { + "age": np.append(random_generator.integers(18, 70, size=500), 71), + "income": random_generator.normal(40000, 10000, size=501), + } + ) + + weights = np.ones(len(data)) + + # Calculate target values: + targets_matrix = pd.DataFrame( + { + "income_aged_20_30": ( + (data["age"] >= 20) & (data["age"] < 30) + ).astype(float) + * data["income"], + "income_aged_30_40": ( + (data["age"] >= 30) & (data["age"] < 40) + ).astype(float) + * data["income"], + "income_aged_40_50": ( + (data["age"] >= 40) & (data["age"] < 50) + ).astype(float) + * data["income"], + "income_aged_50_60": ( + (data["age"] >= 50) & (data["age"] < 60) + ).astype(float) + * data["income"], + "income_aged_60_70": ( + (data["age"] >= 60) & (data["age"] <= 70) + ).astype(float) + * data["income"], + } + ) + targets = np.array( + [ + (targets_matrix["income_aged_20_30"] * weights).sum() * 1.2, + (targets_matrix["income_aged_30_40"] * weights).sum() * 1.3, + (targets_matrix["income_aged_40_50"] * weights).sum() * 0.9, + (targets_matrix["income_aged_50_60"] * weights).sum() * 1.5, + (targets_matrix["income_aged_60_70"] * weights).sum() * 1.2, + ] + ) + + # Create calibrator instance + calibrator = Calibration( + estimate_matrix=targets_matrix, + weights=weights, + targets=targets, + noise_level=0.05, + epochs=200, + learning_rate=0.01, + dropout_rate=0, + regularize_with_l0=False, + ) + + # Test hyperparameter tuning WITHOUT holdouts + best_params = calibrator.tune_hyperparameters( + n_trials=10, + epochs_per_trial=30, + n_holdout_sets=1, # Single holdout set + holdout_fraction=0, # No holdouts - use all data for both training and validation + objectives_balance={ + "loss": 1.0, + "accuracy": 100.0, + "sparsity": 30.0, + }, + n_jobs=1, + ) + + # Verify that best_params contains expected keys + assert "l0_lambda" in best_params, "Missing l0_lambda in best parameters" + assert "init_mean" in best_params, "Missing init_mean in best parameters" + assert ( + "temperature" in best_params + ), "Missing temperature in best parameters" + assert ( + "mean_val_loss" in best_params + ), "Missing mean_val_loss in best parameters" + assert ( + "mean_val_accuracy" in best_params + ), "Missing mean_val_accuracy in best parameters" + assert "sparsity" in best_params, "Missing sparsity in best parameters" + + # Verify parameter ranges + assert ( + 1e-6 <= best_params["l0_lambda"] <= 1e-4 + ), f"l0_lambda {best_params['l0_lambda']} out of range" + 
assert ( + 0.5 <= best_params["init_mean"] <= 0.999 + ), f"init_mean {best_params['init_mean']} out of range" + assert ( + 0.5 <= best_params["temperature"] <= 2.0 + ), f"temperature {best_params['temperature']} out of range" + + # Verify metrics are reasonable + assert ( + 0 <= best_params["mean_val_accuracy"] <= 1 + ), "mean_val_accuracy should be between 0 and 1" + assert ( + 0 <= best_params["sparsity"] <= 1 + ), "sparsity should be between 0 and 1" + assert ( + best_params["mean_val_loss"] >= 0 + ), "mean_val_loss should be non-negative" + + # When there are no holdouts, n_holdout_sets should be 1 and aggregation should work + assert best_params["n_holdout_sets"] == 1, "Should have 1 holdout set" + assert ( + "holdout_objectives" in best_params + ), "Should still have holdout_objectives" + assert ( + len(best_params["holdout_objectives"]) == 1 + ), "Should have exactly 1 objective" + + # Run calibration with the best parameters + calibrator.l0_lambda = best_params["l0_lambda"] + calibrator.init_mean = best_params["init_mean"] + calibrator.temperature = best_params["temperature"] + calibrator.regularize_with_l0 = True + + # Run the full calibration + performance_df = calibrator.calibrate() + sparse_weights = calibrator.sparse_weights + + assert ( + sparse_weights is not None + ), "Sparse weights should be generated with L0 regularization" + + # Evaluate the final calibration + percentage_within_10 = evaluate_sparse_weights( + optimised_weights=sparse_weights, + estimate_matrix=targets_matrix, + targets_array=targets, + ) + + # The tuned parameters should give reasonable results + assert ( + percentage_within_10 > 50 + ), f"Only {percentage_within_10:.1f}% of targets within 10% (expected > 50%)" + + # Check that we achieved some sparsity + actual_sparsity = np.mean(sparse_weights == 0) + assert ( + actual_sparsity > 0.05 + ), f"Sparsity {actual_sparsity:.1%} is too low (expected > 5%)" From 066015c3a6d3d9f53d90d78fd29253d03f8ef87e Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Sat, 2 Aug 2025 14:57:13 +0200 Subject: [PATCH 07/12] return information about each holdout when tuning --- src/microcalibrate/calibration.py | 28 ++++++- tests/test_regularization.py | 129 ++++++++---------------------- 2 files changed, 59 insertions(+), 98 deletions(-) diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index 1031fc6..ed9f01b 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -561,6 +561,9 @@ def tune_hyperparameters( ), } + # Initialize list to collect all holdout evaluations + all_evaluations = [] + def objective( trial: optuna.Trial, objectives_balance: Dict[str, float] = objectives_balance, @@ -587,6 +590,11 @@ def objective( epochs_per_trial=epochs_per_trial, objectives_balance=objectives_balance, ) + # Add trial and holdout identifiers for tracking + evaluation_record = result.copy() + evaluation_record["trial_number"] = trial.number + evaluation_record["holdout_set_idx"] = holdout_idx + all_evaluations.append(evaluation_record) holdout_results.append(result) # Aggregate objectives @@ -701,6 +709,17 @@ def objective( best_params["n_holdout_sets"] = n_holdout_sets best_params["aggregation"] = aggregation + # Create evaluation tracking dataframe + evaluation_df = pd.DataFrame(all_evaluations) + + # Convert holdout_targets list to string for easier viewing + if "holdout_targets" in evaluation_df.columns: + evaluation_df["holdout_targets"] = evaluation_df[ + "holdout_targets" + ].apply(lambda x: ", ".join(x) if 
isinstance(x, list) else str(x)) + + best_params["evaluation_history"] = evaluation_df + logger.info( f"\nMulti-holdout tuning completed!" f"\nBest parameters:" @@ -712,6 +731,7 @@ def objective( f"\n - Mean val accuracy: {best_params['mean_val_accuracy']:.2%} (±{best_params['std_val_accuracy']:.2%})" f"\n - Individual objectives: {[f'{obj:.4f}' for obj in best_params['holdout_objectives']]}" f"\n - Sparsity: {best_params['sparsity']:.2%}" + f"\n\nEvaluation history saved with {len(evaluation_df)} records across {n_trials} trials." ) return best_params @@ -757,7 +777,7 @@ def _evaluate_single_holdout( hyperparameters: Dict[str, float], epochs_per_trial: int, objectives_balance: Dict[str, float], - ) -> Dict[str, float]: + ) -> Dict[str, Any]: """Evaluate hyperparameters on a single holdout set. Args: @@ -767,7 +787,7 @@ def _evaluate_single_holdout( objectives_balance: Weights for different objectives Returns: - Dictionary with evaluation metrics + Dictionary with evaluation metrics and holdout target names """ # Store original parameters original_params = { @@ -791,7 +811,7 @@ def _evaluate_single_holdout( self.exclude_targets() # Run calibration - performance_df = self.calibrate() + self.calibrate() sparse_weights = self.sparse_weights # Get estimates for all targets @@ -885,6 +905,8 @@ def _evaluate_single_holdout( "train_accuracy": train_accuracy, "sparsity": sparsity, "n_nonzero_weights": int(np.sum(sparse_weights != 0)), + "holdout_targets": holdout_set["names"], + "hyperparameters": hyperparameters.copy(), } finally: diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 4a35f52..5ab5c58 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -7,10 +7,12 @@ import logging import numpy as np import pandas as pd +import pytest -def test_calibration_with_l0_regularization() -> None: - # Create a sample dataset for testing +@pytest.fixture(scope="session") +def test_data(): + """Create sample dataset and targets for L0 hyperparameter tuning tests.""" random_generator = np.random.default_rng(0) data = pd.DataFrame( { @@ -21,7 +23,6 @@ def test_calibration_with_l0_regularization() -> None: weights = np.ones(len(data)) - # Calculate target values: targets_matrix = pd.DataFrame( { "income_aged_20_30": ( @@ -56,6 +57,19 @@ def test_calibration_with_l0_regularization() -> None: ] ) + return { + "targets_matrix": targets_matrix, + "weights": weights, + "targets": targets, + } + + +def test_calibration_with_l0_regularization(test_data) -> None: + "Test calibration with L0 regularization." 
+ targets_matrix = test_data["targets_matrix"] + weights = test_data["weights"] + targets = test_data["targets"] + calibrator = Calibration( estimate_matrix=targets_matrix, weights=weights, @@ -108,53 +122,11 @@ def test_calibration_with_l0_regularization() -> None: ), f"Only {percentage_below_threshold:.1f}% of sparse weights are below 0.5 (expected > 10%)" -def test_l0_hyperparameter_tuning_with_holdouts() -> None: +def test_l0_hyperparameter_tuning_with_holdouts(test_data) -> None: """Test L0 hyperparameter tuning with holdout validation.""" - # Create a sample dataset for testing - random_generator = np.random.default_rng(0) - data = pd.DataFrame( - { - "age": np.append(random_generator.integers(18, 70, size=500), 71), - "income": random_generator.normal(40000, 10000, size=501), - } - ) - - weights = np.ones(len(data)) - - # Calculate target values: - targets_matrix = pd.DataFrame( - { - "income_aged_20_30": ( - (data["age"] >= 20) & (data["age"] < 30) - ).astype(float) - * data["income"], - "income_aged_30_40": ( - (data["age"] >= 30) & (data["age"] < 40) - ).astype(float) - * data["income"], - "income_aged_40_50": ( - (data["age"] >= 40) & (data["age"] < 50) - ).astype(float) - * data["income"], - "income_aged_50_60": ( - (data["age"] >= 50) & (data["age"] < 60) - ).astype(float) - * data["income"], - "income_aged_60_70": ( - (data["age"] >= 60) & (data["age"] <= 70) - ).astype(float) - * data["income"], - } - ) - targets = np.array( - [ - (targets_matrix["income_aged_20_30"] * weights).sum() * 1.2, - (targets_matrix["income_aged_30_40"] * weights).sum() * 1.3, - (targets_matrix["income_aged_40_50"] * weights).sum() * 0.9, - (targets_matrix["income_aged_50_60"] * weights).sum() * 1.5, - (targets_matrix["income_aged_60_70"] * weights).sum() * 1.2, - ] - ) + targets_matrix = test_data["targets_matrix"] + weights = test_data["weights"] + targets = test_data["targets"] # Create calibrator instance calibrator = Calibration( @@ -219,6 +191,10 @@ def test_l0_hyperparameter_tuning_with_holdouts() -> None: best_params["mean_val_loss"] >= 0 ), "mean_val_loss should be non-negative" + best_params["evaluation_history"].to_csv( + "tests/l0_hyperparameter_tuning_history_with_holdouts.csv", index=False + ) + # Now run calibration with the best parameters calibrator.l0_lambda = best_params["l0_lambda"] calibrator.init_mean = best_params["init_mean"] @@ -252,53 +228,11 @@ def test_l0_hyperparameter_tuning_with_holdouts() -> None: ), f"Sparsity {actual_sparsity:.1%} is too low (expected > 10%)" -def test_l0_hyperparameter_tuning_without_holdouts() -> None: +def test_l0_hyperparameter_tuning_without_holdouts(test_data) -> None: """Test L0 hyperparameter tuning without holdout validation (simpler case).""" - # Create a sample dataset for testing - random_generator = np.random.default_rng(0) - data = pd.DataFrame( - { - "age": np.append(random_generator.integers(18, 70, size=500), 71), - "income": random_generator.normal(40000, 10000, size=501), - } - ) - - weights = np.ones(len(data)) - - # Calculate target values: - targets_matrix = pd.DataFrame( - { - "income_aged_20_30": ( - (data["age"] >= 20) & (data["age"] < 30) - ).astype(float) - * data["income"], - "income_aged_30_40": ( - (data["age"] >= 30) & (data["age"] < 40) - ).astype(float) - * data["income"], - "income_aged_40_50": ( - (data["age"] >= 40) & (data["age"] < 50) - ).astype(float) - * data["income"], - "income_aged_50_60": ( - (data["age"] >= 50) & (data["age"] < 60) - ).astype(float) - * data["income"], - "income_aged_60_70": ( - 
(data["age"] >= 60) & (data["age"] <= 70) - ).astype(float) - * data["income"], - } - ) - targets = np.array( - [ - (targets_matrix["income_aged_20_30"] * weights).sum() * 1.2, - (targets_matrix["income_aged_30_40"] * weights).sum() * 1.3, - (targets_matrix["income_aged_40_50"] * weights).sum() * 0.9, - (targets_matrix["income_aged_50_60"] * weights).sum() * 1.5, - (targets_matrix["income_aged_60_70"] * weights).sum() * 1.2, - ] - ) + targets_matrix = test_data["targets_matrix"] + weights = test_data["weights"] + targets = test_data["targets"] # Create calibrator instance calibrator = Calibration( @@ -371,6 +305,11 @@ def test_l0_hyperparameter_tuning_without_holdouts() -> None: len(best_params["holdout_objectives"]) == 1 ), "Should have exactly 1 objective" + best_params["evaluation_history"].to_csv( + "tests/l0_hyperparameter_tuning_history_without_holdouts.csv", + index=False, + ) + # Run calibration with the best parameters calibrator.l0_lambda = best_params["l0_lambda"] calibrator.init_mean = best_params["init_mean"] From bca9f26b4320068296e5f53589a896728ea3819c Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 4 Aug 2025 14:38:57 +0200 Subject: [PATCH 08/12] add robustness evaluation logic independent of l0 tuning --- changelog_entry.yaml | 1 + src/microcalibrate/calibration.py | 741 ++++++++++++++++++++++++------ tests/test_evaluation.py | 240 ++++++++++ 3 files changed, 842 insertions(+), 140 deletions(-) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index 0db6d6c..8ef067d 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -2,3 +2,4 @@ changes: added: - Add hyperparameter tuning for L0 implementation with option to holdout targets. + - Add method to evaluate robustness of calibration to target holdouts. diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index ed9f01b..f5370fa 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -564,6 +564,152 @@ def tune_hyperparameters( # Initialize list to collect all holdout evaluations all_evaluations = [] + def evaluate_single_holdout( + holdout_set: Dict[str, Any], + hyperparameters: Dict[str, float], + epochs_per_trial: int, + objectives_balance: Dict[str, float], + ) -> Dict[str, Any]: + """Evaluate hyperparameters on a single holdout set. 
+ + Args: + holdout_set: Dictionary with 'names' and 'indices' of holdout targets + hyperparameters: Dictionary with l0_lambda, init_mean, temperature + epochs_per_trial: Number of epochs to run + objectives_balance: Weights for different objectives + + Returns: + Dictionary with evaluation metrics and holdout target names + """ + # Store original parameters + original_params = { + "l0_lambda": self.l0_lambda, + "init_mean": self.init_mean, + "temperature": self.temperature, + "regularize_with_l0": self.regularize_with_l0, + "epochs": self.epochs, + } + + try: + # Update parameters for this evaluation + self.l0_lambda = hyperparameters["l0_lambda"] + self.init_mean = hyperparameters["init_mean"] + self.temperature = hyperparameters["temperature"] + self.regularize_with_l0 = True + self.epochs = epochs_per_trial + + # Set up calibration with this holdout set + self.excluded_targets = holdout_set["names"] + self.exclude_targets() + + # Run calibration + self.calibrate() + sparse_weights = self.sparse_weights + + # Get estimates for all targets + weights_tensor = torch.tensor( + sparse_weights, dtype=torch.float32, device=self.device + ) + + if self.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + self.original_estimate_matrix.values, + dtype=torch.float32, + device=self.device, + ) + all_estimates = ( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + self.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Split into train/validation + n_targets = len(self.original_target_names) + val_indices = holdout_set["indices"] + train_indices = [ + i for i in range(n_targets) if i not in val_indices + ] + + val_estimates = all_estimates[val_indices] + val_targets = self.original_targets[val_indices] + train_estimates = all_estimates[train_indices] + train_targets = self.original_targets[train_indices] + + # Calculate metrics + from .utils.metrics import loss, pct_close + + val_loss = loss( + torch.tensor( + val_estimates, dtype=torch.float32, device=self.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=self.device + ), + None, + ).item() + + val_accuracy = pct_close( + torch.tensor( + val_estimates, dtype=torch.float32, device=self.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=self.device + ), + ) + + train_loss = loss( + torch.tensor( + train_estimates, + dtype=torch.float32, + device=self.device, + ), + torch.tensor( + train_targets, dtype=torch.float32, device=self.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, + dtype=torch.float32, + device=self.device, + ), + torch.tensor( + train_targets, dtype=torch.float32, device=self.device + ), + ) + + sparsity = np.mean(sparse_weights == 0) + + # Calculate objective + objective = ( + val_loss * objectives_balance["loss"] + + (1 - val_accuracy) * objectives_balance["accuracy"] + + (1 - sparsity) * objectives_balance["sparsity"] + ) + + return { + "objective": objective, + "val_loss": val_loss, + "val_accuracy": val_accuracy, + "train_loss": train_loss, + "train_accuracy": train_accuracy, + "sparsity": sparsity, + "n_nonzero_weights": int(np.sum(sparse_weights != 0)), + "holdout_targets": holdout_set["names"], + "hyperparameters": hyperparameters.copy(), + } + + finally: + # Restore original parameters + for key, value in original_params.items(): + setattr(self, key, value) + def objective( trial: optuna.Trial, objectives_balance: Dict[str, float] = 
objectives_balance, @@ -584,7 +730,7 @@ def objective( # Evaluate on all holdout sets holdout_results = [] for holdout_idx, holdout_set in enumerate(holdout_sets): - result = self._evaluate_single_holdout( + result = evaluate_single_holdout( holdout_set=holdout_set, hyperparameters=hyperparameters, epochs_per_trial=epochs_per_trial, @@ -598,9 +744,18 @@ def objective( holdout_results.append(result) # Aggregate objectives - final_objective = self._aggregate_holdout_results( - holdout_results, aggregation - ) + objectives = [r["objective"] for r in holdout_results] + + if aggregation == "mean": + final_objective = np.mean(objectives) + elif aggregation == "median": + final_objective = np.median(objectives) + elif aggregation == "worst": + final_objective = np.max(objectives) + else: + raise ValueError( + f"Unknown aggregation method: {aggregation}" + ) # Store detailed metrics trial.set_user_attr( @@ -748,11 +903,12 @@ def _create_holdout_sets( n_holdout_sets: Number of holdout sets to create holdout_fraction: Fraction of targets in each holdout set random_state: Base random seed for reproducibility + exclude_excluded: Whether to exclude already excluded targets from the holdout sets Returns: List of dictionaries containing holdout names and indices """ - n_targets = len(self.original_target_names) + n_targets = len(self.target_names) n_holdout_targets = max(1, int(n_targets * holdout_fraction)) holdout_sets = [] @@ -762,177 +918,482 @@ def _create_holdout_sets( holdout_indices = set_rng.choice( n_targets, size=n_holdout_targets, replace=False ) - holdout_names = [ - self.original_target_names[idx] for idx in holdout_indices - ] + holdout_names = [self.target_names[idx] for idx in holdout_indices] holdout_sets.append( {"names": holdout_names, "indices": holdout_indices} ) return holdout_sets - def _evaluate_single_holdout( + def evaluate_holdout_robustness( self, - holdout_set: Dict[str, Any], - hyperparameters: Dict[str, float], - epochs_per_trial: int, - objectives_balance: Dict[str, float], + n_holdout_sets: Optional[int] = 5, + holdout_fraction: Optional[float] = 0.2, + save_results_to: Optional[str] = None, ) -> Dict[str, Any]: - """Evaluate hyperparameters on a single holdout set. + """ + Evaluate calibration robustness using holdout validation. + + This function assesses how well the calibration generalizes by: + 1. Repeatedly holding out random subsets of targets + 2. Calibrating on the remaining targets + 3. Evaluating performance on held-out targets Args: - holdout_set: Dictionary with 'names' and 'indices' of holdout targets - hyperparameters: Dictionary with l0_lambda, init_mean, temperature - epochs_per_trial: Number of epochs to run - objectives_balance: Weights for different objectives + n_holdout_sets (int): Number of different holdout sets to evaluate. + More sets provide better estimates but increase computation time. + holdout_fraction (float): Fraction of targets to hold out in each set. + save_results_to (str): Path to save detailed results as CSV. If None, no saving. 
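+                The path's stem is reused for three CSV files, suffixed
+                _overall, _target_robustness and _detailed.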
Returns: - Dictionary with evaluation metrics and holdout target names + Dict[str, Any]: Dictionary containing: + - overall_metrics: Summary statistics across all holdouts + - target_robustness: DataFrame showing each target's performance when held out + - recommendation: String with interpretation and recommendations + - detailed_results: (if requested) List of detailed results per holdout """ - # Store original parameters - original_params = { - "l0_lambda": self.l0_lambda, - "init_mean": self.init_mean, - "temperature": self.temperature, - "regularize_with_l0": self.regularize_with_l0, - "epochs": self.epochs, + + logger.info( + f"Starting holdout robustness evaluation with {n_holdout_sets} sets, " + f"holding out {holdout_fraction:.1%} of targets each time." + ) + + # Store original state + original_state = { + "weights": self.weights.copy(), + "excluded_targets": ( + self.excluded_targets.copy() if self.excluded_targets else None + ), + "targets": self.targets.copy(), + "target_names": ( + self.target_names.copy() + if self.target_names is not None + else None + ), + "sparse_weights": ( + self.sparse_weights.copy() + if self.sparse_weights is not None + else None + ), } - try: - # Update parameters for this evaluation - self.l0_lambda = hyperparameters["l0_lambda"] - self.init_mean = hyperparameters["init_mean"] - self.temperature = hyperparameters["temperature"] - self.regularize_with_l0 = True - self.epochs = epochs_per_trial + # Create holdout sets + holdout_sets = self._create_holdout_sets( + n_holdout_sets, holdout_fraction, self.seed + 1 + ) - # Set up calibration with this holdout set - self.excluded_targets = holdout_set["names"] - self.exclude_targets() + # Collect results + all_results = [] + target_performance = { + name: {"held_out_losses": [], "held_out_accuracies": []} + for name in self.original_target_names + } - # Run calibration - self.calibrate() - sparse_weights = self.sparse_weights + def evaluate_single_holdout_robustness( + holdout_idx: int, + ) -> Dict[str, Any]: + """Evaluate a single holdout set.""" + try: + holdout_set = holdout_sets[holdout_idx] + logger.info( + f"Evaluating holdout set {holdout_idx + 1}/{n_holdout_sets}" + ) - # Get estimates for all targets - weights_tensor = torch.tensor( - sparse_weights, dtype=torch.float32, device=self.device - ) + # Reset to original state + self.weights = original_state["weights"].copy() + self.excluded_targets = holdout_set["names"] + self.exclude_targets() - if self.original_estimate_matrix is not None: - original_matrix_tensor = torch.tensor( - self.original_estimate_matrix.values, - dtype=torch.float32, - device=self.device, + # Run calibration on training targets + start_time = pd.Timestamp.now() + self.calibrate() + calibration_time = ( + pd.Timestamp.now() - start_time + ).total_seconds() + + # Get final weights (sparse if using L0, otherwise regular) + final_weights = ( + self.sparse_weights + if self.sparse_weights is not None + else self.weights ) - all_estimates = ( - (weights_tensor @ original_matrix_tensor).cpu().numpy() + + # Evaluate on all targets + weights_tensor = torch.tensor( + final_weights, dtype=torch.float32, device=self.device ) - else: - all_estimates = ( - self.original_estimate_function(weights_tensor) - .cpu() - .numpy() + + # Get estimates for all targets using original estimate function/matrix + if self.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + self.original_estimate_matrix.values, + dtype=torch.float32, + device=self.device, + ) + all_estimates = 
( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + self.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Calculate metrics for holdout vs training sets + holdout_indices = holdout_set["indices"] + train_indices = [ + i + for i in range(len(self.original_target_names)) + if i not in holdout_indices + ] + + holdout_estimates = all_estimates[holdout_indices] + holdout_targets = self.original_targets[holdout_indices] + holdout_names = holdout_set["names"] + + train_estimates = all_estimates[train_indices] + train_targets = self.original_targets[train_indices] + + # Calculate losses and accuracies + from .utils.metrics import loss, pct_close + + holdout_loss = loss( + torch.tensor( + holdout_estimates, + dtype=torch.float32, + device=self.device, + ), + torch.tensor( + holdout_targets, + dtype=torch.float32, + device=self.device, + ), + None, + ).item() + + holdout_accuracy = pct_close( + torch.tensor( + holdout_estimates, + dtype=torch.float32, + device=self.device, + ), + torch.tensor( + holdout_targets, + dtype=torch.float32, + device=self.device, + ), ) - # Split into train/validation - n_targets = len(self.original_target_names) - val_indices = holdout_set["indices"] - train_indices = [ - i for i in range(n_targets) if i not in val_indices - ] + train_loss = loss( + torch.tensor( + train_estimates, + dtype=torch.float32, + device=self.device, + ), + torch.tensor( + train_targets, dtype=torch.float32, device=self.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, + dtype=torch.float32, + device=self.device, + ), + torch.tensor( + train_targets, dtype=torch.float32, device=self.device + ), + ) - val_estimates = all_estimates[val_indices] - val_targets = self.original_targets[val_indices] - train_estimates = all_estimates[train_indices] - train_targets = self.original_targets[train_indices] + # Calculate per-target metrics for holdout targets + target_details = [] + for idx, name in enumerate(holdout_names): + rel_error = ( + holdout_estimates[idx] - holdout_targets[idx] + ) / holdout_targets[idx] + target_details.append( + { + "target_name": name, + "target_value": holdout_targets[idx], + "estimate": holdout_estimates[idx], + "relative_error": rel_error, + "within_10pct": abs(rel_error) <= 0.1, + } + ) - # Calculate metrics - from .utils.metrics import loss, pct_close + target_performance[name]["held_out_losses"].append( + (holdout_estimates[idx] - holdout_targets[idx]) ** 2 + ) + target_performance[name]["held_out_accuracies"].append( + abs(rel_error) <= 0.1 + ) - val_loss = loss( - torch.tensor( - val_estimates, dtype=torch.float32, device=self.device - ), - torch.tensor( - val_targets, dtype=torch.float32, device=self.device - ), - None, - ).item() + generalization_gap = holdout_loss - train_loss + accuracy_gap = train_accuracy - holdout_accuracy + + result = { + "holdout_set_idx": holdout_idx, + "n_holdout_targets": len(holdout_indices), + "n_train_targets": len(train_indices), + "holdout_loss": holdout_loss, + "train_loss": train_loss, + "generalization_gap": generalization_gap, + "holdout_accuracy": holdout_accuracy, + "train_accuracy": train_accuracy, + "accuracy_gap": accuracy_gap, + "calibration_time_seconds": calibration_time, + "holdout_target_names": holdout_names, + "target_details": target_details, + "weights_sparsity": ( + np.mean(final_weights == 0) + if self.sparse_weights is not None + else 0 + ), + } - val_accuracy = pct_close( - torch.tensor( - val_estimates, 
dtype=torch.float32, device=self.device - ), - torch.tensor( - val_targets, dtype=torch.float32, device=self.device - ), + return result + + except Exception as e: + logger.error(f"Error in holdout set {holdout_idx}: {str(e)}") + return None + finally: + # Restore original state + for key, value in original_state.items(): + if value is not None: + setattr( + self, + key, + value.copy() if hasattr(value, "copy") else value, + ) + if self.excluded_targets: + self.exclude_targets() + + for i in range(n_holdout_sets): + result = evaluate_single_holdout_robustness(i) + if result is not None: + all_results.append(result) + + if not all_results: + raise ValueError("No successful holdout evaluations completed") + + # Calculate overall metrics + holdout_losses = [r["holdout_loss"] for r in all_results] + holdout_accuracies = [r["holdout_accuracy"] for r in all_results] + train_losses = [r["train_loss"] for r in all_results] + train_accuracies = [r["train_accuracy"] for r in all_results] + generalization_gaps = [r["generalization_gap"] for r in all_results] + + overall_metrics = { + "mean_holdout_loss": np.mean(holdout_losses), + "std_holdout_loss": np.std(holdout_losses), + "mean_holdout_accuracy": np.mean(holdout_accuracies), + "std_holdout_accuracy": np.std(holdout_accuracies), + "worst_holdout_accuracy": np.min(holdout_accuracies), + "best_holdout_accuracy": np.max(holdout_accuracies), + "mean_train_loss": np.mean(train_losses), + "mean_train_accuracy": np.mean(train_accuracies), + "mean_generalization_gap": np.mean(generalization_gaps), + "std_generalization_gap": np.std(generalization_gaps), + "n_successful_evaluations": len(all_results), + "n_failed_evaluations": n_holdout_sets - len(all_results), + } + + target_robustness_data = [] + for target_name in self.original_target_names: + perf = target_performance[target_name] + if perf[ + "held_out_losses" + ]: # Only include if target was held out at least once + target_robustness_data.append( + { + "target_name": target_name, + "times_held_out": len(perf["held_out_losses"]), + "mean_holdout_loss": np.mean(perf["held_out_losses"]), + "std_holdout_loss": np.std(perf["held_out_losses"]), + "holdout_accuracy_rate": np.mean( + perf["held_out_accuracies"] + ), + } + ) + + target_robustness_df = pd.DataFrame(target_robustness_data) + target_robustness_df = target_robustness_df.sort_values( + "holdout_accuracy_rate", ascending=True + ) + + # Generate recommendations + recommendation = self._generate_robustness_recommendation( + overall_metrics, target_robustness_df + ) + + # Save results if requested + def save_holdout_results( + save_path: str, + overall_metrics: Dict[str, float], + target_robustness_df: pd.DataFrame, + detailed_results: List[Dict[str, Any]], + ) -> None: + """Save detailed holdout results to CSV files.""" + from pathlib import Path + + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + overall_df = pd.DataFrame([overall_metrics]) + overall_path = save_path.with_name(f"{save_path.stem}_overall.csv") + overall_df.to_csv(overall_path, index=False) + + robustness_path = save_path.with_name( + f"{save_path.stem}_target_robustness.csv" ) + target_robustness_df.to_csv(robustness_path, index=False) + + detailed_data = [] + for result in detailed_results: + for target_detail in result["target_details"]: + detailed_data.append( + { + "holdout_set_idx": result["holdout_set_idx"], + "target_name": target_detail["target_name"], + "target_value": target_detail["target_value"], + "estimate": 
target_detail["estimate"], + "relative_error": target_detail["relative_error"], + "within_10pct": target_detail["within_10pct"], + "holdout_loss": result["holdout_loss"], + "train_loss": result["train_loss"], + "generalization_gap": result["generalization_gap"], + } + ) - train_loss = loss( - torch.tensor( - train_estimates, dtype=torch.float32, device=self.device - ), - torch.tensor( - train_targets, dtype=torch.float32, device=self.device - ), - None, - ).item() + detailed_df = pd.DataFrame(detailed_data) + detailed_path = save_path.with_name( + f"{save_path.stem}_detailed.csv" + ) + detailed_df.to_csv(detailed_path, index=False) + + if save_results_to: + save_holdout_results( + save_results_to, + overall_metrics, + target_robustness_df, + all_results, + ) - train_accuracy = pct_close( - torch.tensor( - train_estimates, dtype=torch.float32, device=self.device - ), - torch.tensor( - train_targets, dtype=torch.float32, device=self.device - ), + results = { + "overall_metrics": overall_metrics, + "target_robustness": target_robustness_df, + "recommendation": recommendation, + "detailed_results": all_results, + } + + logger.info( + f"\nHoldout evaluation completed:" + f"\n Mean holdout accuracy: {overall_metrics['mean_holdout_accuracy']:.2%} " + f"(±{overall_metrics['std_holdout_accuracy']:.2%})" + f"\n Worst-case accuracy: {overall_metrics['worst_holdout_accuracy']:.2%}" + f"\n Generalization gap: {overall_metrics['mean_generalization_gap']:.6f}" + f"\n Least robust targets: {', '.join(target_robustness_df.head(5)['target_name'].tolist())}" + ) + + return results + + def _generate_robustness_recommendation( + self, + overall_metrics: Dict[str, float], + target_robustness_df: pd.DataFrame, + ) -> str: + """Generate interpretation and recommendations based on robustness evaluation.""" + + mean_acc = overall_metrics["mean_holdout_accuracy"] + std_acc = overall_metrics["std_holdout_accuracy"] + worst_acc = overall_metrics["worst_holdout_accuracy"] + gen_gap = overall_metrics["mean_generalization_gap"] + problematic_targets = target_robustness_df[ + target_robustness_df["holdout_accuracy_rate"] < 0.5 + ]["target_name"].tolist() + + rec_parts = [] + + # Overall assessment + if mean_acc >= 0.9 and std_acc <= 0.05: + rec_parts.append( + "✅ EXCELLENT ROBUSTNESS: The calibration generalizes very well." + ) + elif mean_acc >= 0.8 and std_acc <= 0.1: + rec_parts.append( + "👍 GOOD ROBUSTNESS: The calibration shows good generalization." + ) + elif mean_acc >= 0.7: + rec_parts.append( + "⚠️ MODERATE ROBUSTNESS: The calibration has decent but improvable generalization." + ) + else: + rec_parts.append( + "❌ POOR ROBUSTNESS: The calibration shows weak generalization." ) - sparsity = np.mean(sparse_weights == 0) + rec_parts.append( + f"\nOn average, {mean_acc:.1%} of held-out targets are within 10% of their true values." + ) - # Calculate objective - objective = ( - val_loss * objectives_balance["loss"] - + (1 - val_accuracy) * objectives_balance["accuracy"] - + (1 - sparsity) * objectives_balance["sparsity"] + # Stability assessment + if std_acc > 0.15: + rec_parts.append( + f"\n ⚠️ High variability (std={std_acc:.1%}) suggests instability across different target combinations." 
) - return { - "objective": objective, - "val_loss": val_loss, - "val_accuracy": val_accuracy, - "train_loss": train_loss, - "train_accuracy": train_accuracy, - "sparsity": sparsity, - "n_nonzero_weights": int(np.sum(sparse_weights != 0)), - "holdout_targets": holdout_set["names"], - "hyperparameters": hyperparameters.copy(), - } + # Worst-case analysis + if worst_acc < 0.5: + rec_parts.append( + f"\n ⚠️ Worst-case scenario: Only {worst_acc:.1%} accuracy in some holdout sets." + ) - finally: - # Restore original parameters - for key, value in original_params.items(): - setattr(self, key, value) + # Problematic targets + if problematic_targets: + rec_parts.append( + f"\n\n📊 Targets with poor holdout performance (<50% accuracy):" + ) + for target in problematic_targets[:5]: + target_data = target_robustness_df[ + target_robustness_df["target_name"] == target + ].iloc[0] + rec_parts.append( + f"\n - {target}: {target_data['holdout_accuracy_rate']:.1%} accuracy" + ) - def _aggregate_holdout_results( - self, results: List[Dict[str, float]], aggregation: str - ) -> float: - """Aggregate results across multiple holdout sets. + rec_parts.append("\n\n💡 RECOMMENDATIONS:") - Args: - results: List of evaluation results from each holdout - aggregation: Method to aggregate ('mean', 'median', 'worst') + if mean_acc < 0.8 or std_acc > 0.1: + if self.regularize_with_l0: + rec_parts.append( + "\n 1. Consider tuning L0 regularization parameters with tune_hyperparameters()" + ) + else: + rec_parts.append( + "\n 1. Consider enabling L0 regularization for better generalization" + ) - Returns: - Aggregated objective value - """ - objectives = [r["objective"] for r in results] - - if aggregation == "mean": - return np.mean(objectives) - elif aggregation == "median": - return np.median(objectives) - elif aggregation == "worst": - return np.max(objectives) - else: - raise ValueError(f"Unknown aggregation method: {aggregation}") + rec_parts.append( + "\n 2. Increase the noise_level parameter to improve robustness" + ) + rec_parts.append( + "\n 3. Try increasing dropout_rate to reduce overfitting" + ) + + if problematic_targets: + rec_parts.append( + f"\n 4. Investigate why these targets are hard to predict: {', '.join(problematic_targets[:3])}" + ) + rec_parts.append( + "\n 5. Consider if these targets have sufficient support in the microdata" + ) + + if gen_gap > 0.01: + rec_parts.append( + f"\n 6. 
Generalization gap of {gen_gap:.4f} suggests some overfitting - consider regularization" + ) + + return "".join(rec_parts) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index 1a7b371..d6de1d3 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -104,3 +104,243 @@ def test_all_within_tolerance(): np.testing.assert_array_almost_equal( result_df["distances"], [0.1, 0.2, 0.0] ) + + +def test_evaluate_holdout_robustness(): + """Test the holdout robustness evaluation functionality.""" + + # Create a more complex mock dataset with multiple features + random_generator = np.random.default_rng(42) + n_samples = 500 + + data = pd.DataFrame( + { + "age": random_generator.integers(18, 80, size=n_samples), + "income": random_generator.lognormal(10.5, 0.7, size=n_samples), + "region": random_generator.choice( + ["North", "South", "East", "West"], size=n_samples + ), + "employed": random_generator.binomial(1, 0.7, size=n_samples), + } + ) + + weights = random_generator.uniform(0.5, 1.5, size=n_samples) + weights = weights / weights.sum() * n_samples + + estimate_matrix = pd.DataFrame( + { + "total_population": np.ones(n_samples), + "employed_count": data["employed"].astype(float), + "income_north": ( + (data["region"] == "North") * data["income"] + ).astype(float), + "income_south": ( + (data["region"] == "South") * data["income"] + ).astype(float), + "income_east": ( + (data["region"] == "East") * data["income"] + ).astype(float), + "income_west": ( + (data["region"] == "West") * data["income"] + ).astype(float), + "young_employed": ( + (data["age"] < 30) & (data["employed"] == 1) + ).astype(float), + "senior_count": (data["age"] >= 65).astype(float), + } + ) + + targets = np.array( + [ + n_samples * 1.05, + (estimate_matrix["employed_count"] * weights).sum() * 0.95, + (estimate_matrix["income_north"] * weights).sum() * 1.1, + (estimate_matrix["income_south"] * weights).sum() * 0.9, + (estimate_matrix["income_east"] * weights).sum() * 1.05, + (estimate_matrix["income_west"] * weights).sum() * 0.98, + (estimate_matrix["young_employed"] * weights).sum() * 1.15, + (estimate_matrix["senior_count"] * weights).sum() * 0.92, + ] + ) + + calibrator = Calibration( + estimate_matrix=estimate_matrix, + weights=weights, + targets=targets, + noise_level=0.1, + epochs=100, + learning_rate=0.01, + dropout_rate=0.05, + seed=42, + ) + calibrator.calibrate() + + # Test basic robustness evaluation + results = calibrator.evaluate_holdout_robustness( + n_holdout_sets=3, + holdout_fraction=0.25, + save_results_to=None, # pass a str path if you want to save and explore results' dataframes + ) + + # Check structure of results + assert "overall_metrics" in results + assert "target_robustness" in results + assert "recommendation" in results + assert "detailed_results" in results + + # Check overall metrics + metrics = results["overall_metrics"] + assert "mean_holdout_loss" in metrics + assert "std_holdout_loss" in metrics + assert "mean_holdout_accuracy" in metrics + assert "std_holdout_accuracy" in metrics + assert "worst_holdout_accuracy" in metrics + assert "best_holdout_accuracy" in metrics + assert "mean_generalization_gap" in metrics + assert metrics["n_successful_evaluations"] == 3 + assert metrics["n_failed_evaluations"] == 0 + + # Check that accuracy is between 0 and 1 + assert 0 <= metrics["mean_holdout_accuracy"] <= 1 + assert 0 <= metrics["worst_holdout_accuracy"] <= 1 + assert 0 <= metrics["best_holdout_accuracy"] <= 1 + + # Check target robustness DataFrame + robustness_df = 
results["target_robustness"] + assert isinstance(robustness_df, pd.DataFrame) + assert len(robustness_df) > 0 # At least some targets should be evaluated + assert "target_name" in robustness_df.columns + assert "times_held_out" in robustness_df.columns + assert "holdout_accuracy_rate" in robustness_df.columns + assert "mean_holdout_loss" in robustness_df.columns + assert robustness_df["holdout_accuracy_rate"].is_monotonic_increasing + assert isinstance(results["recommendation"], str) + assert len(results["recommendation"]) > 0 + assert any( + word in results["recommendation"] + for word in ["ROBUSTNESS", "RECOMMENDATIONS"] + ) + + # Check detailed results + assert len(results["detailed_results"]) == 3 + for detail in results["detailed_results"]: + assert "holdout_loss" in detail + assert "train_loss" in detail + assert "holdout_accuracy" in detail + assert "train_accuracy" in detail + assert "generalization_gap" in detail + assert "target_details" in detail + assert len(detail["target_details"]) == 2 # 25% of 8 targets + + # Test error handling with invalid parameters + with pytest.raises(ValueError): + calibrator.evaluate_holdout_robustness( + n_holdout_sets=0, # Invalid + ) + with pytest.raises(ValueError): + calibrator.evaluate_holdout_robustness( + holdout_fraction=1.5, # Invalid + ) + + +def test_evaluate_holdout_robustness_with_l0_regularization(): + """Test robustness evaluation with L0 regularization enabled.""" + + # Create simple dataset + random_generator = np.random.default_rng(123) + n_samples = 200 + + estimate_matrix = pd.DataFrame( + { + "feature_1": random_generator.uniform(0.5, 1.5, n_samples), + "feature_2": random_generator.uniform(0.5, 1.5, n_samples), + "feature_3": random_generator.uniform(0.5, 1.5, n_samples), + "feature_redundant": random_generator.uniform( + 0, 0.1, n_samples + ), # Low signal + } + ) + + weights = np.ones(n_samples) + col_sums = estimate_matrix.sum() + targets = np.array( + [ + col_sums["feature_1"] * 0.95, + col_sums["feature_2"] * 1.05, + col_sums["feature_3"] * 1.0, + col_sums["feature_redundant"] * 1.1, + ] + ) + + # Initialize with L0 regularization - aggressive parameters for sparsity + calibrator = Calibration( + estimate_matrix=estimate_matrix, + weights=weights, + targets=targets, + regularize_with_l0=True, + l0_lambda=1e-4, + init_mean=0.5, + temperature=0.3, + epochs=100, + seed=123, + ) + + calibrator.calibrate() + + results = calibrator.evaluate_holdout_robustness( + n_holdout_sets=3, + holdout_fraction=0.25, + ) + assert all( + "weights_sparsity" in detail for detail in results["detailed_results"] + ) + sparsity_values = [ + detail["weights_sparsity"] for detail in results["detailed_results"] + ] + assert max(sparsity_values) >= 0 or calibrator.sparse_weights is not None + assert results["overall_metrics"]["mean_holdout_accuracy"] >= 0 + + +def test_evaluate_holdout_robustness_recommendation_logic(): + """Test the recommendation generation logic.""" + + # Create a calibrator with known poor performance + random_generator = np.random.default_rng(789) + n_samples = 50 + base_feature = random_generator.normal(0, 1, n_samples) + estimate_matrix = pd.DataFrame( + { + "feature_1": base_feature + + random_generator.normal(0, 0.1, n_samples), + "feature_2": base_feature + + random_generator.normal(0, 0.1, n_samples), + "feature_3": base_feature + + random_generator.normal(0, 0.1, n_samples), + } + ) + + weights = np.ones(n_samples) + targets = np.array([100, 200, 300]) + + calibrator = Calibration( + estimate_matrix=estimate_matrix, + 
weights=weights, + targets=targets, + epochs=20, + noise_level=0.01, + dropout_rate=0, + ) + + calibrator.calibrate() + results = calibrator.evaluate_holdout_robustness(n_holdout_sets=3) + + recommendation = results["recommendation"] + if results["overall_metrics"]["mean_holdout_accuracy"] < 0.7: + assert any(marker in recommendation for marker in ["⚠️", "❌"]) + if not calibrator.regularize_with_l0: + assert "L0 regularization" in recommendation + problematic = results["target_robustness"][ + results["target_robustness"]["holdout_accuracy_rate"] < 0.5 + ] + if len(problematic) > 0: + assert "Targets with poor holdout performance" in recommendation From c8757dc431dd68cc5fd697ceb9673513a094e6e9 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Mon, 11 Aug 2025 15:49:51 +0200 Subject: [PATCH 09/12] add sparsity learning rate --- src/microcalibrate/calibration.py | 3 +++ src/microcalibrate/reweight.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index f5370fa..c357cdf 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -30,6 +30,7 @@ def __init__( l0_lambda: float = 5e-6, # best between 1e-6 and 1e-5 init_mean: float = 0.999, # initial proportion with non-zero weights, set near 0 temperature: float = 0.5, # usual values .5 to 3 + sparse_learning_rate: Optional[float] = 0.2, regularize_with_l0: Optional[bool] = False, seed: Optional[int] = 42, ): @@ -72,6 +73,7 @@ def __init__( self.l0_lambda = l0_lambda self.init_mean = init_mean self.temperature = temperature + self.sparse_learning_rate = sparse_learning_rate self.regularize_with_l0 = regularize_with_l0 self.seed = seed @@ -158,6 +160,7 @@ def calibrate(self) -> None: l0_lambda=self.l0_lambda, init_mean=self.init_mean, temperature=self.temperature, + sparse_learning_rate=self.sparse_learning_rate, regularize_with_l0=self.regularize_with_l0, ) diff --git a/src/microcalibrate/reweight.py b/src/microcalibrate/reweight.py index 738795a..1c87c6a 100644 --- a/src/microcalibrate/reweight.py +++ b/src/microcalibrate/reweight.py @@ -25,6 +25,7 @@ def reweight( init_mean: float, temperature: float, regularize_with_l0: bool, + sparse_learning_rate: Optional[float] = 0.2, dropout_rate: Optional[float] = 0.05, epochs: Optional[int] = 2_000, noise_level: Optional[float] = 10.0, @@ -200,7 +201,7 @@ def dropout_weights(weights: torch.Tensor, p: float) -> torch.Tensor: # NOTE: Results are pretty sensitve to learning rates # optimizer breaks down somewhere near .005, does better at above .1 optimizer = torch.optim.Adam( - [weights] + list(gates.parameters()), lr=0.2 + [weights] + list(gates.parameters()), lr=sparse_learning_rate ) start_loss = None From 7bd54f77365f49dab820fe2c47d369269bdad7aa Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 22 Aug 2025 12:55:52 +0200 Subject: [PATCH 10/12] refactor hyperparameter tuning into its own module --- src/microcalibrate/__init__.py | 7 +- src/microcalibrate/calibration.py | 838 +------------------- src/microcalibrate/evaluation.py | 509 +++++++++++- src/microcalibrate/hyperparameter_tuning.py | 450 +++++++++++ tests/test_regularization.py | 4 +- 5 files changed, 997 insertions(+), 811 deletions(-) create mode 100644 src/microcalibrate/hyperparameter_tuning.py diff --git a/src/microcalibrate/__init__.py b/src/microcalibrate/__init__.py index 0b8f3fa..ab06bd3 100644 --- a/src/microcalibrate/__init__.py +++ b/src/microcalibrate/__init__.py @@ -1,2 +1,7 @@ from .calibration import 
Calibration -from .evaluation import evaluate_estimate_distance_to_targets +from .evaluation import ( + evaluate_estimate_distance_to_targets, + evaluate_holdout_robustness, + evaluate_sparse_weights, +) +from .hyperparameter_tuning import tune_l0_hyperparameters diff --git a/src/microcalibrate/calibration.py b/src/microcalibrate/calibration.py index 8cd5e6b..3a2a9ef 100644 --- a/src/microcalibrate/calibration.py +++ b/src/microcalibrate/calibration.py @@ -7,6 +7,14 @@ import torch from torch import Tensor +from microcalibrate.evaluation import ( + evaluate_holdout_robustness as _evaluate_holdout_robustness, +) +from microcalibrate.hyperparameter_tuning import ( + tune_l0_hyperparameters as _tune_l0_hyperparameters, +) +from microcalibrate.reweight import reweight + class Calibration: def __init__( @@ -26,10 +34,11 @@ def __init__( excluded_targets: Optional[List[str]] = None, csv_path: Optional[str] = None, device: str = "cpu", # fix to cpu for now to avoid user device-specific issues - l0_lambda: float = 5e-6, # best between 1e-6 and 1e-5 - init_mean: float = 0.999, # initial proportion with non-zero weights, set near 0 - sparse_learning_rate: float = 0.2, - temperature: float = 0.5, # usual values .5 to 3 + l0_lambda: Optional[float] = 5e-6, # best between 1e-6 and 1e-5 + init_mean: Optional[ + float + ] = 0.999, # initial proportion with non-zero weights, set near 0 + temperature: Optional[float] = 0.5, # usual values .5 to 3 sparse_learning_rate: Optional[float] = 0.2, regularize_with_l0: Optional[bool] = False, seed: Optional[int] = 42, @@ -152,8 +161,6 @@ def calibrate(self) -> None: target_names=self.target_names, ) - from .reweight import reweight - new_weights, sparse_weights, self.performance_df = reweight( original_weights=self.weights, estimate_function=self.estimate_function, @@ -341,7 +348,7 @@ def _assess_targets( ) if estimate_matrix is None and self.excluded_targets is not None: - logger.warning( + self.logger.warning( "You are excluding targets but not passing an estimate matrix. Make sure the estimate function handles excluded targets correctly, otherwise you may face operand errors." ) @@ -500,14 +507,10 @@ def summary( df = df.reset_index(drop=True) return df - def tune_hyperparameters( + def tune_l0_hyperparameters( self, n_trials: Optional[int] = 30, - objectives_balance: Optional[Dict[str, float]] = { - "loss": 1.0, - "accuracy": 100.0, - "sparsity": 10.0, - }, + objectives_balance: Optional[Dict[str, float]] = None, epochs_per_trial: Optional[int] = None, n_holdout_sets: Optional[int] = 3, holdout_fraction: Optional[float] = 0.2, @@ -548,300 +551,16 @@ def tune_hyperparameters( Returns: Dictionary containing the best hyperparameters found. 
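        Example (illustrative sketch; assumes estimate_matrix, weights and targets
        have already been prepared by the caller):

            cal = Calibration(
                estimate_matrix=estimate_matrix,  # DataFrame with one column per target
                weights=weights,
                targets=targets,
                regularize_with_l0=True,
            )
            best = cal.tune_l0_hyperparameters(n_trials=10, n_holdout_sets=3)
            cal.l0_lambda = best["l0_lambda"]
            cal.init_mean = best["init_mean"]
            cal.temperature = best["temperature"]
            cal.calibrate()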
""" - # Suppress Optuna's logs during optimization - optuna.logging.set_verbosity(optuna.logging.WARNING) - - if epochs_per_trial is None: - epochs_per_trial = max(self.epochs // 4, 100) - - holdout_sets = self._create_holdout_sets( - n_holdout_sets, holdout_fraction, self.seed - ) - - logger.info( - f"Multi-holdout hyperparameter tuning:\n" - f" - {n_holdout_sets} holdout sets\n" - f" - {len(holdout_sets[0]['indices'])} targets per holdout ({holdout_fraction:.1%})\n" - f" - Aggregation: {aggregation}\n" - ) - - # Store original state - original_state = { - "excluded_targets": self.excluded_targets, - "targets": self.targets.copy(), - "target_names": ( - self.target_names.copy() - if self.target_names is not None - else None - ), - } - - # Initialize list to collect all holdout evaluations - all_evaluations = [] - - def evaluate_single_holdout( - holdout_set: Dict[str, Any], - hyperparameters: Dict[str, float], - epochs_per_trial: int, - objectives_balance: Dict[str, float], - ) -> Dict[str, Any]: - """Evaluate hyperparameters on a single holdout set. - - Args: - holdout_set: Dictionary with 'names' and 'indices' of holdout targets - hyperparameters: Dictionary with l0_lambda, init_mean, temperature - epochs_per_trial: Number of epochs to run - objectives_balance: Weights for different objectives - - Returns: - Dictionary with evaluation metrics and holdout target names - """ - # Store original parameters - original_params = { - "l0_lambda": self.l0_lambda, - "init_mean": self.init_mean, - "temperature": self.temperature, - "regularize_with_l0": self.regularize_with_l0, - "epochs": self.epochs, - } - - try: - # Update parameters for this evaluation - self.l0_lambda = hyperparameters["l0_lambda"] - self.init_mean = hyperparameters["init_mean"] - self.temperature = hyperparameters["temperature"] - self.regularize_with_l0 = True - self.epochs = epochs_per_trial - - # Set up calibration with this holdout set - self.excluded_targets = holdout_set["names"] - self.exclude_targets() - - # Run calibration - self.calibrate() - sparse_weights = self.sparse_weights - - # Get estimates for all targets - weights_tensor = torch.tensor( - sparse_weights, dtype=torch.float32, device=self.device - ) - - if self.original_estimate_matrix is not None: - original_matrix_tensor = torch.tensor( - self.original_estimate_matrix.values, - dtype=torch.float32, - device=self.device, - ) - all_estimates = ( - (weights_tensor @ original_matrix_tensor).cpu().numpy() - ) - else: - all_estimates = ( - self.original_estimate_function(weights_tensor) - .cpu() - .numpy() - ) - - # Split into train/validation - n_targets = len(self.original_target_names) - val_indices = holdout_set["indices"] - train_indices = [ - i for i in range(n_targets) if i not in val_indices - ] - - val_estimates = all_estimates[val_indices] - val_targets = self.original_targets[val_indices] - train_estimates = all_estimates[train_indices] - train_targets = self.original_targets[train_indices] - - # Calculate metrics - from .utils.metrics import loss, pct_close - - val_loss = loss( - torch.tensor( - val_estimates, dtype=torch.float32, device=self.device - ), - torch.tensor( - val_targets, dtype=torch.float32, device=self.device - ), - None, - ).item() - - val_accuracy = pct_close( - torch.tensor( - val_estimates, dtype=torch.float32, device=self.device - ), - torch.tensor( - val_targets, dtype=torch.float32, device=self.device - ), - ) - - train_loss = loss( - torch.tensor( - train_estimates, - dtype=torch.float32, - device=self.device, - ), - 
torch.tensor( - train_targets, dtype=torch.float32, device=self.device - ), - None, - ).item() - - train_accuracy = pct_close( - torch.tensor( - train_estimates, - dtype=torch.float32, - device=self.device, - ), - torch.tensor( - train_targets, dtype=torch.float32, device=self.device - ), - ) - - sparsity = np.mean(sparse_weights == 0) - - # Calculate objective - objective = ( - val_loss * objectives_balance["loss"] - + (1 - val_accuracy) * objectives_balance["accuracy"] - + (1 - sparsity) * objectives_balance["sparsity"] - ) - - return { - "objective": objective, - "val_loss": val_loss, - "val_accuracy": val_accuracy, - "train_loss": train_loss, - "train_accuracy": train_accuracy, - "sparsity": sparsity, - "n_nonzero_weights": int(np.sum(sparse_weights != 0)), - "holdout_targets": holdout_set["names"], - "hyperparameters": hyperparameters.copy(), - } - - finally: - # Restore original parameters - for key, value in original_params.items(): - setattr(self, key, value) - - def objective( - trial: optuna.Trial, - objectives_balance: Dict[str, float] = objectives_balance, - ) -> float: - """Objective function for Optuna optimization.""" - try: - # Suggest hyperparameters - hyperparameters = { - "l0_lambda": trial.suggest_float( - "l0_lambda", 1e-6, 1e-4, log=True - ), - "init_mean": trial.suggest_float("init_mean", 0.5, 0.999), - "temperature": trial.suggest_float( - "temperature", 0.5, 2.0 - ), - } - - # Evaluate on all holdout sets - holdout_results = [] - for holdout_idx, holdout_set in enumerate(holdout_sets): - result = evaluate_single_holdout( - holdout_set=holdout_set, - hyperparameters=hyperparameters, - epochs_per_trial=epochs_per_trial, - objectives_balance=objectives_balance, - ) - # Add trial and holdout identifiers for tracking - evaluation_record = result.copy() - evaluation_record["trial_number"] = trial.number - evaluation_record["holdout_set_idx"] = holdout_idx - all_evaluations.append(evaluation_record) - holdout_results.append(result) - - # Aggregate objectives - objectives = [r["objective"] for r in holdout_results] - - if aggregation == "mean": - final_objective = np.mean(objectives) - elif aggregation == "median": - final_objective = np.median(objectives) - elif aggregation == "worst": - final_objective = np.max(objectives) - else: - raise ValueError( - f"Unknown aggregation method: {aggregation}" - ) - - # Store detailed metrics - trial.set_user_attr( - "holdout_objectives", - [r["objective"] for r in holdout_results], - ) - trial.set_user_attr( - "mean_val_loss", - np.mean([r["val_loss"] for r in holdout_results]), - ) - trial.set_user_attr( - "std_val_loss", - np.std([r["val_loss"] for r in holdout_results]), - ) - trial.set_user_attr( - "mean_val_accuracy", - np.mean([r["val_accuracy"] for r in holdout_results]), - ) - trial.set_user_attr( - "std_val_accuracy", - np.std([r["val_accuracy"] for r in holdout_results]), - ) - trial.set_user_attr( - "mean_train_loss", - np.mean([r["train_loss"] for r in holdout_results]), - ) - trial.set_user_attr( - "mean_train_accuracy", - np.mean([r["train_accuracy"] for r in holdout_results]), - ) - - # Use the last holdout's sparsity metrics - last_result = holdout_results[-1] - trial.set_user_attr("sparsity", last_result["sparsity"]) - trial.set_user_attr( - "n_nonzero_weights", - last_result.get("n_nonzero_weights", 0), - ) - - # Log progress - if trial.number % 5 == 0: - objectives = [r["objective"] for r in holdout_results] - val_accuracies = [ - r["val_accuracy"] for r in holdout_results - ] - logger.info( - f"Trial 
{trial.number}:\n" - f" Objectives by holdout: {[f'{obj:.4f}' for obj in objectives]}\n" - f" {aggregation.capitalize()} objective: {final_objective:.4f}\n" - f" Mean val accuracy: {np.mean(val_accuracies):.2%} (±{np.std(val_accuracies):.2%})\n" - f" Sparsity: {last_result['sparsity']:.2%}" - ) - - return final_objective - - except Exception as e: - logger.warning(f"Trial {trial.number} failed: {str(e)}") - return 1e10 - - finally: - # Restore original state - self.excluded_targets = original_state["excluded_targets"] - self.targets = original_state["targets"] - self.target_names = original_state["target_names"] - self.exclude_targets() - - # Create or load study - if sampler is None: - sampler = optuna.samplers.TPESampler(seed=self.seed) - - study = optuna.create_study( + return _tune_l0_hyperparameters( + calibration=self, + n_trials=n_trials, + objectives_balance=objectives_balance, + epochs_per_trial=epochs_per_trial, + n_holdout_sets=n_holdout_sets, + holdout_fraction=holdout_fraction, + aggregation=aggregation, + timeout=timeout, + n_jobs=n_jobs, study_name=study_name, storage=storage, load_if_exists=load_if_exists, @@ -850,62 +569,6 @@ def objective( pruner=pruner, ) - # Run optimization - study.optimize( - objective, - n_trials=n_trials, - timeout=timeout, - n_jobs=n_jobs, - show_progress_bar=True, - ) - - # Get best parameters - best_params = study.best_params - best_trial = study.best_trial - best_params["mean_val_loss"] = best_trial.user_attrs.get( - "mean_val_loss" - ) - best_params["std_val_loss"] = best_trial.user_attrs.get("std_val_loss") - best_params["mean_val_accuracy"] = best_trial.user_attrs.get( - "mean_val_accuracy" - ) - best_params["std_val_accuracy"] = best_trial.user_attrs.get( - "std_val_accuracy" - ) - best_params["holdout_objectives"] = best_trial.user_attrs.get( - "holdout_objectives" - ) - best_params["sparsity"] = best_trial.user_attrs.get("sparsity") - best_params["n_holdout_sets"] = n_holdout_sets - best_params["aggregation"] = aggregation - - # Create evaluation tracking dataframe - evaluation_df = pd.DataFrame(all_evaluations) - - # Convert holdout_targets list to string for easier viewing - if "holdout_targets" in evaluation_df.columns: - evaluation_df["holdout_targets"] = evaluation_df[ - "holdout_targets" - ].apply(lambda x: ", ".join(x) if isinstance(x, list) else str(x)) - - best_params["evaluation_history"] = evaluation_df - - logger.info( - f"\nMulti-holdout tuning completed!" - f"\nBest parameters:" - f"\n - l0_lambda: {best_params['l0_lambda']:.2e}" - f"\n - init_mean: {best_params['init_mean']:.4f}" - f"\n - temperature: {best_params['temperature']:.4f}" - f"\nPerformance across {n_holdout_sets} holdouts:" - f"\n - Mean val loss: {best_params['mean_val_loss']:.6f} (±{best_params['std_val_loss']:.6f})" - f"\n - Mean val accuracy: {best_params['mean_val_accuracy']:.2%} (±{best_params['std_val_accuracy']:.2%})" - f"\n - Individual objectives: {[f'{obj:.4f}' for obj in best_params['holdout_objectives']]}" - f"\n - Sparsity: {best_params['sparsity']:.2%}" - f"\n\nEvaluation history saved with {len(evaluation_df)} records across {n_trials} trials." 
- ) - - return best_params - def _create_holdout_sets( self, n_holdout_sets: int, @@ -967,448 +630,9 @@ def evaluate_holdout_robustness( - recommendation: String with interpretation and recommendations - detailed_results: (if requested) List of detailed results per holdout """ - - logger.info( - f"Starting holdout robustness evaluation with {n_holdout_sets} sets, " - f"holding out {holdout_fraction:.1%} of targets each time." - ) - - # Store original state - original_state = { - "weights": self.weights.copy(), - "excluded_targets": ( - self.excluded_targets.copy() if self.excluded_targets else None - ), - "targets": self.targets.copy(), - "target_names": ( - self.target_names.copy() - if self.target_names is not None - else None - ), - "sparse_weights": ( - self.sparse_weights.copy() - if self.sparse_weights is not None - else None - ), - } - - # Create holdout sets - holdout_sets = self._create_holdout_sets( - n_holdout_sets, holdout_fraction, self.seed + 1 - ) - - # Collect results - all_results = [] - target_performance = { - name: {"held_out_losses": [], "held_out_accuracies": []} - for name in self.original_target_names - } - - def evaluate_single_holdout_robustness( - holdout_idx: int, - ) -> Dict[str, Any]: - """Evaluate a single holdout set.""" - try: - holdout_set = holdout_sets[holdout_idx] - logger.info( - f"Evaluating holdout set {holdout_idx + 1}/{n_holdout_sets}" - ) - - # Reset to original state - self.weights = original_state["weights"].copy() - self.excluded_targets = holdout_set["names"] - self.exclude_targets() - - # Run calibration on training targets - start_time = pd.Timestamp.now() - self.calibrate() - calibration_time = ( - pd.Timestamp.now() - start_time - ).total_seconds() - - # Get final weights (sparse if using L0, otherwise regular) - final_weights = ( - self.sparse_weights - if self.sparse_weights is not None - else self.weights - ) - - # Evaluate on all targets - weights_tensor = torch.tensor( - final_weights, dtype=torch.float32, device=self.device - ) - - # Get estimates for all targets using original estimate function/matrix - if self.original_estimate_matrix is not None: - original_matrix_tensor = torch.tensor( - self.original_estimate_matrix.values, - dtype=torch.float32, - device=self.device, - ) - all_estimates = ( - (weights_tensor @ original_matrix_tensor).cpu().numpy() - ) - else: - all_estimates = ( - self.original_estimate_function(weights_tensor) - .cpu() - .numpy() - ) - - # Calculate metrics for holdout vs training sets - holdout_indices = holdout_set["indices"] - train_indices = [ - i - for i in range(len(self.original_target_names)) - if i not in holdout_indices - ] - - holdout_estimates = all_estimates[holdout_indices] - holdout_targets = self.original_targets[holdout_indices] - holdout_names = holdout_set["names"] - - train_estimates = all_estimates[train_indices] - train_targets = self.original_targets[train_indices] - - # Calculate losses and accuracies - from .utils.metrics import loss, pct_close - - holdout_loss = loss( - torch.tensor( - holdout_estimates, - dtype=torch.float32, - device=self.device, - ), - torch.tensor( - holdout_targets, - dtype=torch.float32, - device=self.device, - ), - None, - ).item() - - holdout_accuracy = pct_close( - torch.tensor( - holdout_estimates, - dtype=torch.float32, - device=self.device, - ), - torch.tensor( - holdout_targets, - dtype=torch.float32, - device=self.device, - ), - ) - - train_loss = loss( - torch.tensor( - train_estimates, - dtype=torch.float32, - device=self.device, - ), - torch.tensor( - 
train_targets, dtype=torch.float32, device=self.device - ), - None, - ).item() - - train_accuracy = pct_close( - torch.tensor( - train_estimates, - dtype=torch.float32, - device=self.device, - ), - torch.tensor( - train_targets, dtype=torch.float32, device=self.device - ), - ) - - # Calculate per-target metrics for holdout targets - target_details = [] - for idx, name in enumerate(holdout_names): - rel_error = ( - holdout_estimates[idx] - holdout_targets[idx] - ) / holdout_targets[idx] - target_details.append( - { - "target_name": name, - "target_value": holdout_targets[idx], - "estimate": holdout_estimates[idx], - "relative_error": rel_error, - "within_10pct": abs(rel_error) <= 0.1, - } - ) - - target_performance[name]["held_out_losses"].append( - (holdout_estimates[idx] - holdout_targets[idx]) ** 2 - ) - target_performance[name]["held_out_accuracies"].append( - abs(rel_error) <= 0.1 - ) - - generalization_gap = holdout_loss - train_loss - accuracy_gap = train_accuracy - holdout_accuracy - - result = { - "holdout_set_idx": holdout_idx, - "n_holdout_targets": len(holdout_indices), - "n_train_targets": len(train_indices), - "holdout_loss": holdout_loss, - "train_loss": train_loss, - "generalization_gap": generalization_gap, - "holdout_accuracy": holdout_accuracy, - "train_accuracy": train_accuracy, - "accuracy_gap": accuracy_gap, - "calibration_time_seconds": calibration_time, - "holdout_target_names": holdout_names, - "target_details": target_details, - "weights_sparsity": ( - np.mean(final_weights == 0) - if self.sparse_weights is not None - else 0 - ), - } - - return result - - except Exception as e: - logger.error(f"Error in holdout set {holdout_idx}: {str(e)}") - return None - finally: - # Restore original state - for key, value in original_state.items(): - if value is not None: - setattr( - self, - key, - value.copy() if hasattr(value, "copy") else value, - ) - if self.excluded_targets: - self.exclude_targets() - - for i in range(n_holdout_sets): - result = evaluate_single_holdout_robustness(i) - if result is not None: - all_results.append(result) - - if not all_results: - raise ValueError("No successful holdout evaluations completed") - - # Calculate overall metrics - holdout_losses = [r["holdout_loss"] for r in all_results] - holdout_accuracies = [r["holdout_accuracy"] for r in all_results] - train_losses = [r["train_loss"] for r in all_results] - train_accuracies = [r["train_accuracy"] for r in all_results] - generalization_gaps = [r["generalization_gap"] for r in all_results] - - overall_metrics = { - "mean_holdout_loss": np.mean(holdout_losses), - "std_holdout_loss": np.std(holdout_losses), - "mean_holdout_accuracy": np.mean(holdout_accuracies), - "std_holdout_accuracy": np.std(holdout_accuracies), - "worst_holdout_accuracy": np.min(holdout_accuracies), - "best_holdout_accuracy": np.max(holdout_accuracies), - "mean_train_loss": np.mean(train_losses), - "mean_train_accuracy": np.mean(train_accuracies), - "mean_generalization_gap": np.mean(generalization_gaps), - "std_generalization_gap": np.std(generalization_gaps), - "n_successful_evaluations": len(all_results), - "n_failed_evaluations": n_holdout_sets - len(all_results), - } - - target_robustness_data = [] - for target_name in self.original_target_names: - perf = target_performance[target_name] - if perf[ - "held_out_losses" - ]: # Only include if target was held out at least once - target_robustness_data.append( - { - "target_name": target_name, - "times_held_out": len(perf["held_out_losses"]), - "mean_holdout_loss": 
np.mean(perf["held_out_losses"]), - "std_holdout_loss": np.std(perf["held_out_losses"]), - "holdout_accuracy_rate": np.mean( - perf["held_out_accuracies"] - ), - } - ) - - target_robustness_df = pd.DataFrame(target_robustness_data) - target_robustness_df = target_robustness_df.sort_values( - "holdout_accuracy_rate", ascending=True - ) - - # Generate recommendations - recommendation = self._generate_robustness_recommendation( - overall_metrics, target_robustness_df - ) - - # Save results if requested - def save_holdout_results( - save_path: str, - overall_metrics: Dict[str, float], - target_robustness_df: pd.DataFrame, - detailed_results: List[Dict[str, Any]], - ) -> None: - """Save detailed holdout results to CSV files.""" - from pathlib import Path - - save_path = Path(save_path) - save_path.parent.mkdir(parents=True, exist_ok=True) - - overall_df = pd.DataFrame([overall_metrics]) - overall_path = save_path.with_name(f"{save_path.stem}_overall.csv") - overall_df.to_csv(overall_path, index=False) - - robustness_path = save_path.with_name( - f"{save_path.stem}_target_robustness.csv" - ) - target_robustness_df.to_csv(robustness_path, index=False) - - detailed_data = [] - for result in detailed_results: - for target_detail in result["target_details"]: - detailed_data.append( - { - "holdout_set_idx": result["holdout_set_idx"], - "target_name": target_detail["target_name"], - "target_value": target_detail["target_value"], - "estimate": target_detail["estimate"], - "relative_error": target_detail["relative_error"], - "within_10pct": target_detail["within_10pct"], - "holdout_loss": result["holdout_loss"], - "train_loss": result["train_loss"], - "generalization_gap": result["generalization_gap"], - } - ) - - detailed_df = pd.DataFrame(detailed_data) - detailed_path = save_path.with_name( - f"{save_path.stem}_detailed.csv" - ) - detailed_df.to_csv(detailed_path, index=False) - - if save_results_to: - save_holdout_results( - save_results_to, - overall_metrics, - target_robustness_df, - all_results, - ) - - results = { - "overall_metrics": overall_metrics, - "target_robustness": target_robustness_df, - "recommendation": recommendation, - "detailed_results": all_results, - } - - logger.info( - f"\nHoldout evaluation completed:" - f"\n Mean holdout accuracy: {overall_metrics['mean_holdout_accuracy']:.2%} " - f"(±{overall_metrics['std_holdout_accuracy']:.2%})" - f"\n Worst-case accuracy: {overall_metrics['worst_holdout_accuracy']:.2%}" - f"\n Generalization gap: {overall_metrics['mean_generalization_gap']:.6f}" - f"\n Least robust targets: {', '.join(target_robustness_df.head(5)['target_name'].tolist())}" + return _evaluate_holdout_robustness( + calibration=self, + n_holdout_sets=n_holdout_sets, + holdout_fraction=holdout_fraction, + save_results_to=save_results_to, ) - - return results - - def _generate_robustness_recommendation( - self, - overall_metrics: Dict[str, float], - target_robustness_df: pd.DataFrame, - ) -> str: - """Generate interpretation and recommendations based on robustness evaluation.""" - - mean_acc = overall_metrics["mean_holdout_accuracy"] - std_acc = overall_metrics["std_holdout_accuracy"] - worst_acc = overall_metrics["worst_holdout_accuracy"] - gen_gap = overall_metrics["mean_generalization_gap"] - problematic_targets = target_robustness_df[ - target_robustness_df["holdout_accuracy_rate"] < 0.5 - ]["target_name"].tolist() - - rec_parts = [] - - # Overall assessment - if mean_acc >= 0.9 and std_acc <= 0.05: - rec_parts.append( - "✅ EXCELLENT ROBUSTNESS: The calibration 
generalizes very well." - ) - elif mean_acc >= 0.8 and std_acc <= 0.1: - rec_parts.append( - "👍 GOOD ROBUSTNESS: The calibration shows good generalization." - ) - elif mean_acc >= 0.7: - rec_parts.append( - "⚠️ MODERATE ROBUSTNESS: The calibration has decent but improvable generalization." - ) - else: - rec_parts.append( - "❌ POOR ROBUSTNESS: The calibration shows weak generalization." - ) - - rec_parts.append( - f"\nOn average, {mean_acc:.1%} of held-out targets are within 10% of their true values." - ) - - # Stability assessment - if std_acc > 0.15: - rec_parts.append( - f"\n ⚠️ High variability (std={std_acc:.1%}) suggests instability across different target combinations." - ) - - # Worst-case analysis - if worst_acc < 0.5: - rec_parts.append( - f"\n ⚠️ Worst-case scenario: Only {worst_acc:.1%} accuracy in some holdout sets." - ) - - # Problematic targets - if problematic_targets: - rec_parts.append( - f"\n\n📊 Targets with poor holdout performance (<50% accuracy):" - ) - for target in problematic_targets[:5]: - target_data = target_robustness_df[ - target_robustness_df["target_name"] == target - ].iloc[0] - rec_parts.append( - f"\n - {target}: {target_data['holdout_accuracy_rate']:.1%} accuracy" - ) - - rec_parts.append("\n\n💡 RECOMMENDATIONS:") - - if mean_acc < 0.8 or std_acc > 0.1: - if self.regularize_with_l0: - rec_parts.append( - "\n 1. Consider tuning L0 regularization parameters with tune_hyperparameters()" - ) - else: - rec_parts.append( - "\n 1. Consider enabling L0 regularization for better generalization" - ) - - rec_parts.append( - "\n 2. Increase the noise_level parameter to improve robustness" - ) - rec_parts.append( - "\n 3. Try increasing dropout_rate to reduce overfitting" - ) - - if problematic_targets: - rec_parts.append( - f"\n 4. Investigate why these targets are hard to predict: {', '.join(problematic_targets[:3])}" - ) - rec_parts.append( - "\n 5. Consider if these targets have sufficient support in the microdata" - ) - - if gen_gap > 0.01: - rec_parts.append( - f"\n 6. Generalization gap of {gen_gap:.4f} suggests some overfitting - consider regularization" - ) - - return "".join(rec_parts) diff --git a/src/microcalibrate/evaluation.py b/src/microcalibrate/evaluation.py index e485194..138a45a 100644 --- a/src/microcalibrate/evaluation.py +++ b/src/microcalibrate/evaluation.py @@ -1,10 +1,13 @@ import logging -from typing import List, Optional, Union +from pathlib import Path +from typing import Any, Dict, List, Optional, Union import numpy as np import pandas as pd import torch +from microcalibrate.utils.metrics import loss, pct_close + logger = logging.getLogger(__name__) @@ -131,3 +134,507 @@ def evaluate_sparse_weights( logging.info(f"has rel_error: {rel_error[i]:.2f}\n") logging.info("---End of reweighting quick diagnostics------") return percent_within_10 + + +def _evaluate_single_holdout_robustness( + calibration, + holdout_set: Dict[str, Any], + holdout_idx: int, + n_holdout_sets: int, +) -> Optional[Dict[str, Any]]: + """Evaluate a single holdout set for robustness analysis. 
+ + Args: + calibration: Calibration instance + holdout_set: Dictionary with holdout information + holdout_idx: Index of current holdout set + n_holdout_sets: Total number of holdout sets + + Returns: + Dictionary with evaluation results or None if failed + """ + try: + logger.info( + f"Evaluating holdout set {holdout_idx + 1}/{n_holdout_sets}" + ) + + # Run calibration on training targets + start_time = pd.Timestamp.now() + calibration.calibrate() + calibration_time = (pd.Timestamp.now() - start_time).total_seconds() + + # Get final weights (sparse if using L0, otherwise regular) + final_weights = ( + calibration.sparse_weights + if calibration.sparse_weights is not None + else calibration.weights + ) + + # Evaluate on all targets + weights_tensor = torch.tensor( + final_weights, dtype=torch.float32, device=calibration.device + ) + + # Get estimates for all targets using original estimate function/matrix + if calibration.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + calibration.original_estimate_matrix.values, + dtype=torch.float32, + device=calibration.device, + ) + all_estimates = ( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + calibration.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Calculate metrics for holdout vs training sets + holdout_indices = holdout_set["indices"] + train_indices = [ + i + for i in range(len(calibration.original_target_names)) + if i not in holdout_indices + ] + + holdout_estimates = all_estimates[holdout_indices] + holdout_targets = calibration.original_targets[holdout_indices] + holdout_names = holdout_set["names"] + + train_estimates = all_estimates[train_indices] + train_targets = calibration.original_targets[train_indices] + + # Calculate losses and accuracies + holdout_loss = loss( + torch.tensor( + holdout_estimates, + dtype=torch.float32, + device=calibration.device, + ), + torch.tensor( + holdout_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + holdout_accuracy = pct_close( + torch.tensor( + holdout_estimates, + dtype=torch.float32, + device=calibration.device, + ), + torch.tensor( + holdout_targets, dtype=torch.float32, device=calibration.device + ), + ) + + train_loss = loss( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + ) + + # Calculate per-target metrics for holdout targets + target_details = [] + for idx, name in enumerate(holdout_names): + rel_error = ( + holdout_estimates[idx] - holdout_targets[idx] + ) / holdout_targets[idx] + target_details.append( + { + "target_name": name, + "target_value": holdout_targets[idx], + "estimate": holdout_estimates[idx], + "relative_error": rel_error, + "within_10pct": abs(rel_error) <= 0.1, + } + ) + + generalization_gap = holdout_loss - train_loss + accuracy_gap = train_accuracy - holdout_accuracy + + result = { + "holdout_set_idx": holdout_idx, + "n_holdout_targets": len(holdout_indices), + "n_train_targets": len(train_indices), + "holdout_loss": holdout_loss, + "train_loss": train_loss, + "generalization_gap": generalization_gap, + "holdout_accuracy": holdout_accuracy, + "train_accuracy": train_accuracy, + "accuracy_gap": 
accuracy_gap, + "calibration_time_seconds": calibration_time, + "holdout_target_names": holdout_names, + "target_details": target_details, + "weights_sparsity": ( + np.mean(final_weights == 0) + if calibration.sparse_weights is not None + else 0 + ), + } + + return result + + except Exception as e: + logger.error(f"Error in holdout set {holdout_idx}: {str(e)}") + return None + + +def _save_holdout_results( + save_path: str, + overall_metrics: Dict[str, float], + target_robustness_df: pd.DataFrame, + detailed_results: List[Dict[str, Any]], +) -> None: + """Save detailed holdout results to CSV files. + + Args: + save_path: Path to save results + overall_metrics: Overall metrics dictionary + target_robustness_df: Target robustness dataframe + detailed_results: List of detailed results + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + overall_df = pd.DataFrame([overall_metrics]) + overall_path = save_path.with_name(f"{save_path.stem}_overall.csv") + overall_df.to_csv(overall_path, index=False) + + robustness_path = save_path.with_name( + f"{save_path.stem}_target_robustness.csv" + ) + target_robustness_df.to_csv(robustness_path, index=False) + + detailed_data = [] + for result in detailed_results: + for target_detail in result["target_details"]: + detailed_data.append( + { + "holdout_set_idx": result["holdout_set_idx"], + "target_name": target_detail["target_name"], + "target_value": target_detail["target_value"], + "estimate": target_detail["estimate"], + "relative_error": target_detail["relative_error"], + "within_10pct": target_detail["within_10pct"], + "holdout_loss": result["holdout_loss"], + "train_loss": result["train_loss"], + "generalization_gap": result["generalization_gap"], + } + ) + + detailed_df = pd.DataFrame(detailed_data) + detailed_path = save_path.with_name(f"{save_path.stem}_detailed.csv") + detailed_df.to_csv(detailed_path, index=False) + + +def _generate_robustness_recommendation( + overall_metrics: Dict[str, float], + target_robustness_df: pd.DataFrame, + regularize_with_l0: bool, +) -> str: + """Generate interpretation and recommendations based on robustness evaluation. + + Args: + overall_metrics: Overall metrics dictionary + target_robustness_df: Target robustness dataframe + regularize_with_l0: Whether L0 regularization is enabled + + Returns: + Recommendation string + """ + mean_acc = overall_metrics["mean_holdout_accuracy"] + std_acc = overall_metrics["std_holdout_accuracy"] + worst_acc = overall_metrics["worst_holdout_accuracy"] + gen_gap = overall_metrics["mean_generalization_gap"] + problematic_targets = target_robustness_df[ + target_robustness_df["holdout_accuracy_rate"] < 0.5 + ]["target_name"].tolist() + + rec_parts = [] + + # Overall assessment + if mean_acc >= 0.9 and std_acc <= 0.05: + rec_parts.append( + "✅ EXCELLENT ROBUSTNESS: The calibration generalizes very well." + ) + elif mean_acc >= 0.8 and std_acc <= 0.1: + rec_parts.append( + "👍 GOOD ROBUSTNESS: The calibration shows good generalization." + ) + elif mean_acc >= 0.7: + rec_parts.append( + "⚠️ MODERATE ROBUSTNESS: The calibration has decent but improvable generalization." + ) + else: + rec_parts.append( + "❌ POOR ROBUSTNESS: The calibration shows weak generalization." + ) + + rec_parts.append( + f"\nOn average, {mean_acc:.1%} of held-out targets are within 10% of their true values." 
+ ) + + # Stability assessment + if std_acc > 0.15: + rec_parts.append( + f"\n ⚠️ High variability (std={std_acc:.1%}) suggests instability across different target combinations." + ) + + # Worst-case analysis + if worst_acc < 0.5: + rec_parts.append( + f"\n ⚠️ Worst-case scenario: Only {worst_acc:.1%} accuracy in some holdout sets." + ) + + # Problematic targets + if problematic_targets: + rec_parts.append( + f"\n\n📊 Targets with poor holdout performance (<50% accuracy):" + ) + for target in problematic_targets[:5]: + target_data = target_robustness_df[ + target_robustness_df["target_name"] == target + ].iloc[0] + rec_parts.append( + f"\n - {target}: {target_data['holdout_accuracy_rate']:.1%} accuracy" + ) + + rec_parts.append("\n\n💡 RECOMMENDATIONS:") + + if mean_acc < 0.8 or std_acc > 0.1: + if regularize_with_l0: + rec_parts.append( + "\n 1. Consider tuning L0 regularization parameters with tune_hyperparameters()" + ) + else: + rec_parts.append( + "\n 1. Consider enabling L0 regularization for better generalization" + ) + + rec_parts.append( + "\n 2. Increase the noise_level parameter to improve robustness" + ) + rec_parts.append( + "\n 3. Try increasing dropout_rate to reduce overfitting" + ) + + if problematic_targets: + rec_parts.append( + f"\n 4. Investigate why these targets are hard to predict: {', '.join(problematic_targets[:3])}" + ) + rec_parts.append( + "\n 5. Consider if these targets have sufficient support in the microdata" + ) + + if gen_gap > 0.01: + rec_parts.append( + f"\n 6. Generalization gap of {gen_gap:.4f} suggests some overfitting - consider regularization" + ) + + return "".join(rec_parts) + + +def evaluate_holdout_robustness( + calibration, + n_holdout_sets: Optional[int] = 5, + holdout_fraction: Optional[float] = 0.2, + save_results_to: Optional[str] = None, +) -> Dict[str, Any]: + """ + Evaluate calibration robustness using holdout validation. + + This function assesses how well the calibration generalizes by: + 1. Repeatedly holding out random subsets of targets + 2. Calibrating on the remaining targets + 3. Evaluating performance on held-out targets + + Args: + calibration: Calibration instance to evaluate + n_holdout_sets: Number of different holdout sets to evaluate. + More sets provide better estimates but increase computation time. + holdout_fraction: Fraction of targets to hold out in each set. + save_results_to: Path to save detailed results as CSV. If None, no saving. + + Returns: + Dict[str, Any]: Dictionary containing: + - overall_metrics: Summary statistics across all holdouts + - target_robustness: DataFrame showing each target's performance when held out + - recommendation: String with interpretation and recommendations + - detailed_results: (if requested) List of detailed results per holdout + """ + logger.info( + f"Starting holdout robustness evaluation with {n_holdout_sets} sets, " + f"holding out {holdout_fraction:.1%} of targets each time." + ) + + logger.warning( + "Data leakage warning: Targets often share overlapping information " + "(e.g., geographic breakdowns like 'snap in CA' and 'snap in US'). " + "Holdout validation may not provide complete isolation between training and validation sets. " + "The robustness metrics should be interpreted with this limitation in mind - " + "they may overestimate the model's true generalization performance." 
+ ) + + # Store original state + original_state = { + "weights": calibration.weights.copy(), + "excluded_targets": ( + calibration.excluded_targets.copy() + if calibration.excluded_targets + else None + ), + "targets": calibration.targets.copy(), + "target_names": ( + calibration.target_names.copy() + if calibration.target_names is not None + else None + ), + "sparse_weights": ( + calibration.sparse_weights.copy() + if calibration.sparse_weights is not None + else None + ), + } + + # Create holdout sets + holdout_sets = calibration._create_holdout_sets( + n_holdout_sets, holdout_fraction, calibration.seed + 1 + ) + + # Collect results + all_results = [] + target_performance = { + name: {"held_out_losses": [], "held_out_accuracies": []} + for name in calibration.original_target_names + } + + try: + for i in range(n_holdout_sets): + holdout_set = holdout_sets[i] + + # Reset to original state + calibration.weights = original_state["weights"].copy() + calibration.excluded_targets = holdout_set["names"] + calibration.exclude_targets() + + result = _evaluate_single_holdout_robustness( + calibration, holdout_set, i, n_holdout_sets + ) + + if result is not None: + all_results.append(result) + + # Update target performance tracking + for detail in result["target_details"]: + name = detail["target_name"] + target_performance[name]["held_out_losses"].append( + (detail["estimate"] - detail["target_value"]) ** 2 + ) + target_performance[name]["held_out_accuracies"].append( + detail["within_10pct"] + ) + finally: + # Restore original state + for key, value in original_state.items(): + if value is not None: + setattr( + calibration, + key, + value.copy() if hasattr(value, "copy") else value, + ) + if calibration.excluded_targets: + calibration.exclude_targets() + + if not all_results: + raise ValueError("No successful holdout evaluations completed") + + # Calculate overall metrics + holdout_losses = [r["holdout_loss"] for r in all_results] + holdout_accuracies = [r["holdout_accuracy"] for r in all_results] + train_losses = [r["train_loss"] for r in all_results] + train_accuracies = [r["train_accuracy"] for r in all_results] + generalization_gaps = [r["generalization_gap"] for r in all_results] + + overall_metrics = { + "mean_holdout_loss": np.mean(holdout_losses), + "std_holdout_loss": np.std(holdout_losses), + "mean_holdout_accuracy": np.mean(holdout_accuracies), + "std_holdout_accuracy": np.std(holdout_accuracies), + "worst_holdout_accuracy": np.min(holdout_accuracies), + "best_holdout_accuracy": np.max(holdout_accuracies), + "mean_train_loss": np.mean(train_losses), + "mean_train_accuracy": np.mean(train_accuracies), + "mean_generalization_gap": np.mean(generalization_gaps), + "std_generalization_gap": np.std(generalization_gaps), + "n_successful_evaluations": len(all_results), + "n_failed_evaluations": n_holdout_sets - len(all_results), + } + + target_robustness_data = [] + for target_name in calibration.original_target_names: + perf = target_performance[target_name] + if perf[ + "held_out_losses" + ]: # Only include if target was held out at least once + target_robustness_data.append( + { + "target_name": target_name, + "times_held_out": len(perf["held_out_losses"]), + "mean_holdout_loss": np.mean(perf["held_out_losses"]), + "std_holdout_loss": np.std(perf["held_out_losses"]), + "holdout_accuracy_rate": np.mean( + perf["held_out_accuracies"] + ), + } + ) + + target_robustness_df = pd.DataFrame(target_robustness_data) + target_robustness_df = target_robustness_df.sort_values( + 
"holdout_accuracy_rate", ascending=True + ) + + # Generate recommendations + recommendation = _generate_robustness_recommendation( + overall_metrics, target_robustness_df, calibration.regularize_with_l0 + ) + + # Save results if requested + if save_results_to: + _save_holdout_results( + save_results_to, overall_metrics, target_robustness_df, all_results + ) + + results = { + "overall_metrics": overall_metrics, + "target_robustness": target_robustness_df, + "recommendation": recommendation, + "detailed_results": all_results, + } + + logger.info( + f"\nHoldout evaluation completed:" + f"\n Mean holdout accuracy: {overall_metrics['mean_holdout_accuracy']:.2%} " + f"(±{overall_metrics['std_holdout_accuracy']:.2%})" + f"\n Worst-case accuracy: {overall_metrics['worst_holdout_accuracy']:.2%}" + f"\n Generalization gap: {overall_metrics['mean_generalization_gap']:.6f}" + f"\n Least robust targets: {', '.join(target_robustness_df.head(5)['target_name'].tolist())}" + ) + + return results diff --git a/src/microcalibrate/hyperparameter_tuning.py b/src/microcalibrate/hyperparameter_tuning.py new file mode 100644 index 0000000..e434b12 --- /dev/null +++ b/src/microcalibrate/hyperparameter_tuning.py @@ -0,0 +1,450 @@ +"""Hyperparameter tuning functionality for calibration.""" + +import logging +from typing import Any, Dict, List, Optional + +import numpy as np +import optuna +import pandas as pd +import torch + +from microcalibrate.utils.metrics import loss, pct_close + +logger = logging.getLogger(__name__) + + +def _evaluate_single_holdout( + calibration, + holdout_set: Dict[str, Any], + hyperparameters: Dict[str, float], + epochs_per_trial: int, + objectives_balance: Dict[str, float], +) -> Dict[str, Any]: + """Evaluate hyperparameters on a single holdout set. 
+ + Args: + calibration: Calibration instance + holdout_set: Dictionary with 'names' and 'indices' of holdout targets + hyperparameters: Dictionary with l0_lambda, init_mean, temperature + epochs_per_trial: Number of epochs to run + objectives_balance: Weights for different objectives + + Returns: + Dictionary with evaluation metrics and holdout target names + """ + # Store original parameters + original_params = { + "l0_lambda": calibration.l0_lambda, + "init_mean": calibration.init_mean, + "temperature": calibration.temperature, + "regularize_with_l0": calibration.regularize_with_l0, + "epochs": calibration.epochs, + } + + try: + # Update parameters for this evaluation + calibration.l0_lambda = hyperparameters["l0_lambda"] + calibration.init_mean = hyperparameters["init_mean"] + calibration.temperature = hyperparameters["temperature"] + calibration.regularize_with_l0 = True + calibration.epochs = epochs_per_trial + + # Set up calibration with this holdout set + calibration.excluded_targets = holdout_set["names"] + calibration.exclude_targets() + + # Run calibration + calibration.calibrate() + sparse_weights = calibration.sparse_weights + + # Get estimates for all targets + weights_tensor = torch.tensor( + sparse_weights, dtype=torch.float32, device=calibration.device + ) + + if calibration.original_estimate_matrix is not None: + original_matrix_tensor = torch.tensor( + calibration.original_estimate_matrix.values, + dtype=torch.float32, + device=calibration.device, + ) + all_estimates = ( + (weights_tensor @ original_matrix_tensor).cpu().numpy() + ) + else: + all_estimates = ( + calibration.original_estimate_function(weights_tensor) + .cpu() + .numpy() + ) + + # Split into train/validation + n_targets = len(calibration.original_target_names) + val_indices = holdout_set["indices"] + train_indices = [i for i in range(n_targets) if i not in val_indices] + + val_estimates = all_estimates[val_indices] + val_targets = calibration.original_targets[val_indices] + train_estimates = all_estimates[train_indices] + train_targets = calibration.original_targets[train_indices] + + # Calculate metrics + val_loss = loss( + torch.tensor( + val_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + val_accuracy = pct_close( + torch.tensor( + val_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + val_targets, dtype=torch.float32, device=calibration.device + ), + ) + + train_loss = loss( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + None, + ).item() + + train_accuracy = pct_close( + torch.tensor( + train_estimates, dtype=torch.float32, device=calibration.device + ), + torch.tensor( + train_targets, dtype=torch.float32, device=calibration.device + ), + ) + + sparsity = np.mean(sparse_weights == 0) + + # Calculate objective + objective = ( + val_loss * objectives_balance["loss"] + + (1 - val_accuracy) * objectives_balance["accuracy"] + + (1 - sparsity) * objectives_balance["sparsity"] + ) + + return { + "objective": objective, + "val_loss": val_loss, + "val_accuracy": val_accuracy, + "train_loss": train_loss, + "train_accuracy": train_accuracy, + "sparsity": sparsity, + "n_nonzero_weights": int(np.sum(sparse_weights != 0)), + "holdout_targets": holdout_set["names"], + "hyperparameters": hyperparameters.copy(), + } + + finally: + # Restore 
original parameters + for key, value in original_params.items(): + setattr(calibration, key, value) + + +def _create_objective_function( + calibration, + holdout_sets: List[Dict[str, Any]], + epochs_per_trial: int, + objectives_balance: Dict[str, float], + aggregation: str, + all_evaluations: List, + original_state: Dict, +): + """Create the objective function for Optuna optimization. + + Args: + calibration: Calibration instance + holdout_sets: List of holdout sets + epochs_per_trial: Number of epochs per trial + objectives_balance: Weights for different objectives + aggregation: How to aggregate results across holdouts + all_evaluations: List to collect evaluation records + original_state: Original calibration state to restore + + Returns: + Objective function for Optuna + """ + + def objective(trial: optuna.Trial) -> float: + """Objective function for Optuna optimization.""" + try: + # Suggest hyperparameters + hyperparameters = { + "l0_lambda": trial.suggest_float( + "l0_lambda", 1e-6, 1e-4, log=True + ), + "init_mean": trial.suggest_float("init_mean", 0.5, 0.999), + "temperature": trial.suggest_float("temperature", 0.5, 2.0), + } + + # Evaluate on all holdout sets + holdout_results = [] + for holdout_idx, holdout_set in enumerate(holdout_sets): + result = _evaluate_single_holdout( + calibration=calibration, + holdout_set=holdout_set, + hyperparameters=hyperparameters, + epochs_per_trial=epochs_per_trial, + objectives_balance=objectives_balance, + ) + # Add trial and holdout identifiers for tracking + evaluation_record = result.copy() + evaluation_record["trial_number"] = trial.number + evaluation_record["holdout_set_idx"] = holdout_idx + all_evaluations.append(evaluation_record) + holdout_results.append(result) + + # Aggregate objectives + objectives = [r["objective"] for r in holdout_results] + + if aggregation == "mean": + final_objective = np.mean(objectives) + elif aggregation == "median": + final_objective = np.median(objectives) + elif aggregation == "worst": + final_objective = np.max(objectives) + else: + raise ValueError(f"Unknown aggregation method: {aggregation}") + + # Store detailed metrics + trial.set_user_attr( + "holdout_objectives", [r["objective"] for r in holdout_results] + ) + trial.set_user_attr( + "mean_val_loss", + np.mean([r["val_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "std_val_loss", + np.std([r["val_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_val_accuracy", + np.mean([r["val_accuracy"] for r in holdout_results]), + ) + trial.set_user_attr( + "std_val_accuracy", + np.std([r["val_accuracy"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_train_loss", + np.mean([r["train_loss"] for r in holdout_results]), + ) + trial.set_user_attr( + "mean_train_accuracy", + np.mean([r["train_accuracy"] for r in holdout_results]), + ) + + # Use the last holdout's sparsity metrics + last_result = holdout_results[-1] + trial.set_user_attr("sparsity", last_result["sparsity"]) + trial.set_user_attr( + "n_nonzero_weights", last_result.get("n_nonzero_weights", 0) + ) + + # Log progress + if trial.number % 5 == 0: + objectives = [r["objective"] for r in holdout_results] + val_accuracies = [r["val_accuracy"] for r in holdout_results] + logger.info( + f"Trial {trial.number}:\n" + f" Objectives by holdout: {[f'{obj:.4f}' for obj in objectives]}\n" + f" {aggregation.capitalize()} objective: {final_objective:.4f}\n" + f" Mean val accuracy: {np.mean(val_accuracies):.2%} (±{np.std(val_accuracies):.2%})\n" + f" Sparsity: 
{last_result['sparsity']:.2%}" + ) + + return final_objective + + except Exception as e: + logger.warning(f"Trial {trial.number} failed: {str(e)}") + return 1e10 + + finally: + # Restore original state + calibration.excluded_targets = original_state["excluded_targets"] + calibration.targets = original_state["targets"] + calibration.target_names = original_state["target_names"] + calibration.exclude_targets() + + return objective + + +def tune_l0_hyperparameters( + calibration, + n_trials: Optional[int] = 30, + objectives_balance: Optional[Dict[str, float]] = None, + epochs_per_trial: Optional[int] = None, + n_holdout_sets: Optional[int] = 3, + holdout_fraction: Optional[float] = 0.2, + aggregation: Optional[str] = "mean", + timeout: Optional[float] = None, + n_jobs: Optional[int] = 1, + study_name: Optional[str] = None, + storage: Optional[str] = None, + load_if_exists: Optional[bool] = False, + direction: Optional[str] = "minimize", + sampler: Optional["optuna.samplers.BaseSampler"] = None, + pruner: Optional["optuna.pruners.BasePruner"] = None, +) -> Dict[str, Any]: + """ + Tune hyperparameters for L0 regularization using Optuna. + + This method optimizes l0_lambda, init_mean, and temperature to achieve: + 1. Low calibration loss + 2. High percentage of targets within 10% of their true values + 3. Sparse weights (fewer non-zero weights) + + Args: + calibration: Calibration instance to tune + n_trials: Number of optimization trials to run. + objectives_balance: Dictionary to balance the importance of loss, accuracy, and sparsity + in the objective function. Default prioritizes being within 10% of targets. + epochs_per_trial: Number of epochs per trial. If None, uses calibration.epochs // 4. + n_holdout_sets: Number of different holdout sets to create and evaluate on + holdout_fraction: Fraction of targets in each holdout set + aggregation: How to combine scores across holdouts ("mean", "median", "worst") + timeout: Stop study after this many seconds. None means no timeout. + n_jobs: Number of parallel jobs. -1 means using all processors. + study_name: Name of the study for storage. + storage: Database URL for distributed optimization. + load_if_exists: Whether to load existing study. + direction: Optimization direction ('minimize' or 'maximize'). + sampler: Optuna sampler for hyperparameter suggestions. + pruner: Optuna pruner for early stopping of trials. + + Returns: + Dictionary containing the best hyperparameters found. + """ + # Suppress Optuna's logs during optimization + optuna.logging.set_verbosity(optuna.logging.WARNING) + + if objectives_balance is None: + objectives_balance = {"loss": 1.0, "accuracy": 100.0, "sparsity": 10.0} + + if epochs_per_trial is None: + epochs_per_trial = max(calibration.epochs // 4, 100) + + holdout_sets = calibration._create_holdout_sets( + n_holdout_sets, holdout_fraction, calibration.seed + ) + + logger.info( + f"Multi-holdout hyperparameter tuning:\n" + f" - {n_holdout_sets} holdout sets\n" + f" - {len(holdout_sets[0]['indices'])} targets per holdout ({holdout_fraction:.1%})\n" + f" - Aggregation: {aggregation}\n" + ) + + logger.warning( + "Data leakage warning: Targets often share overlapping information " + "(e.g., geographic breakdowns like 'snap in CA' and 'snap in US'). " + "Holdout validation may not provide complete isolation between training and validation sets. " + "The robustness metrics should be interpreted with this limitation in mind - " + "they may overestimate the model's true generalization performance." 
+ ) + + # Store original state + original_state = { + "excluded_targets": calibration.excluded_targets, + "targets": calibration.targets.copy(), + "target_names": ( + calibration.target_names.copy() + if calibration.target_names is not None + else None + ), + } + + # Initialize list to collect all holdout evaluations + all_evaluations = [] + + # Create objective function + objective = _create_objective_function( + calibration=calibration, + holdout_sets=holdout_sets, + epochs_per_trial=epochs_per_trial, + objectives_balance=objectives_balance, + aggregation=aggregation, + all_evaluations=all_evaluations, + original_state=original_state, + ) + + # Create or load study + if sampler is None: + sampler = optuna.samplers.TPESampler(seed=calibration.seed) + + study = optuna.create_study( + study_name=study_name, + storage=storage, + load_if_exists=load_if_exists, + direction=direction, + sampler=sampler, + pruner=pruner, + ) + + # Run optimization + study.optimize( + objective, + n_trials=n_trials, + timeout=timeout, + n_jobs=n_jobs, + show_progress_bar=True, + ) + + # Get best parameters + best_params = study.best_params + best_trial = study.best_trial + best_params["mean_val_loss"] = best_trial.user_attrs.get("mean_val_loss") + best_params["std_val_loss"] = best_trial.user_attrs.get("std_val_loss") + best_params["mean_val_accuracy"] = best_trial.user_attrs.get( + "mean_val_accuracy" + ) + best_params["std_val_accuracy"] = best_trial.user_attrs.get( + "std_val_accuracy" + ) + best_params["holdout_objectives"] = best_trial.user_attrs.get( + "holdout_objectives" + ) + best_params["sparsity"] = best_trial.user_attrs.get("sparsity") + best_params["n_holdout_sets"] = n_holdout_sets + best_params["aggregation"] = aggregation + + # Create evaluation tracking dataframe + evaluation_df = pd.DataFrame(all_evaluations) + + # Convert holdout_targets list to string for easier viewing + if "holdout_targets" in evaluation_df.columns: + evaluation_df["holdout_targets"] = evaluation_df[ + "holdout_targets" + ].apply(lambda x: ", ".join(x) if isinstance(x, list) else str(x)) + + best_params["evaluation_history"] = evaluation_df + + logger.info( + f"\nMulti-holdout tuning completed!" + f"\nBest parameters:" + f"\n - l0_lambda: {best_params['l0_lambda']:.2e}" + f"\n - init_mean: {best_params['init_mean']:.4f}" + f"\n - temperature: {best_params['temperature']:.4f}" + f"\nPerformance across {n_holdout_sets} holdouts:" + f"\n - Mean val loss: {best_params['mean_val_loss']:.6f} (±{best_params['std_val_loss']:.6f})" + f"\n - Mean val accuracy: {best_params['mean_val_accuracy']:.2%} (±{best_params['std_val_accuracy']:.2%})" + f"\n - Individual objectives: {[f'{obj:.4f}' for obj in best_params['holdout_objectives']]}" + f"\n - Sparsity: {best_params['sparsity']:.2%}" + f"\n\nEvaluation history saved with {len(evaluation_df)} records across {n_trials} trials." 
+ ) + + return best_params diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 06f193b..584192a 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -141,7 +141,7 @@ def test_l0_hyperparameter_tuning_with_holdouts(test_data) -> None: ) # Test hyperparameter tuning - best_params = calibrator.tune_hyperparameters( + best_params = calibrator.tune_l0_hyperparameters( n_trials=20, # Fewer trials for testing epochs_per_trial=50, # Shorter epochs for quick testing objectives_balance={ @@ -247,7 +247,7 @@ def test_l0_hyperparameter_tuning_without_holdouts(test_data) -> None: ) # Test hyperparameter tuning WITHOUT holdouts - best_params = calibrator.tune_hyperparameters( + best_params = calibrator.tune_l0_hyperparameters( n_trials=10, epochs_per_trial=30, n_holdout_sets=1, # Single holdout set From 344c27de2cb0d112b6a41a797132c529d0ecc4d6 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 22 Aug 2025 12:58:00 +0200 Subject: [PATCH 11/12] fix versioning for deployment --- .github/workflows/versioning.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/versioning.yaml b/.github/workflows/versioning.yaml index d2709e6..e23dcb8 100644 --- a/.github/workflows/versioning.yaml +++ b/.github/workflows/versioning.yaml @@ -47,7 +47,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.13 - name: Install package run: make install - name: Build package From 6fb3cfc603aecbc2ad2d852c2c166164abb70d65 Mon Sep 17 00:00:00 2001 From: juaristi22 Date: Fri, 22 Aug 2025 12:59:52 +0200 Subject: [PATCH 12/12] add optuna to .toml --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index eb11a84..7dd328f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "pandas", "tqdm", "l0-python", + "optuna", ] [project.optional-dependencies]
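
For reference, a minimal usage sketch of the tuning and robustness APIs added in this patch series. It assumes a Calibration instance `calibrator` has already been constructed (constructor arguments elided), and applies the tuned values by setting the corresponding attributes directly, mirroring what the tuning helper does internally before each trial; all parameter values below are illustrative, not recommendations.

from microcalibrate.evaluation import evaluate_holdout_robustness

# Search for L0 hyperparameters (l0_lambda, init_mean, temperature) with Optuna,
# scoring each trial on several holdout splits of the calibration targets.
best_params = calibrator.tune_l0_hyperparameters(
    n_trials=20,          # illustrative; more trials usually help
    epochs_per_trial=50,  # shorter runs per trial keep tuning affordable
    n_holdout_sets=3,
    holdout_fraction=0.2,
    aggregation="mean",
)

# Apply the best values found and re-run the full calibration with L0 enabled.
calibrator.regularize_with_l0 = True
calibrator.l0_lambda = best_params["l0_lambda"]
calibrator.init_mean = best_params["init_mean"]
calibrator.temperature = best_params["temperature"]
calibrator.calibrate()

# Check how well the calibrated weights generalize to held-out targets.
robustness = evaluate_holdout_robustness(
    calibrator,
    n_holdout_sets=5,
    holdout_fraction=0.2,
)
print(robustness["recommendation"])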