Skip to content

Commit 440e55a

Browse files
jeffkbkim authored and meta-codesync[bot] committed
Pop _trained_batches key from state_dict on load_state_dict (#3573)
Summary: Pull Request resolved: #3573 Remove _trained_batches key from metric module state_dict upon loading. Confirmed that the new unit test, `test_load_state_dict_with_trained_batches_key`, fails without the new load_state_dict hook. Reviewed By: iamzainhuda Differential Revision: D87669499 fbshipit-source-id: 1b3f3f0fca4bec9a8d2b339a4e2ca67fcb0983f0
1 parent 217889e commit 440e55a

File tree

2 files changed

+67
-1
lines changed

2 files changed

+67
-1
lines changed

torchrec/metrics/metric_module.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import concurrent
1414
import logging
1515
import time
16-
from collections import defaultdict
16+
from collections import defaultdict, OrderedDict
1717
from typing import Any, Dict, List, Optional, Type, TypeVar, Union
1818

1919
import torch
@@ -228,6 +228,26 @@ def __init__(
228228
)
229229
self.last_compute_time = -1.0
230230

231+
self._register_load_state_dict_pre_hook(self.load_state_dict_hook)
232+
233+
def load_state_dict_hook(
234+
self,
235+
state_dict: OrderedDict[str, torch.Tensor],
236+
prefix: str,
237+
local_metadata: Dict[str, Any],
238+
strict: bool,
239+
missing_keys: List[str],
240+
unexpected_keys: List[str],
241+
error_msgs: List[str],
242+
) -> None:
243+
"""Remove _trained_batches key for backward compatibility."""
244+
key = f"{prefix}_trained_batches"
245+
if key in state_dict:
246+
state_dict.pop(key)
247+
logger.warning(
248+
f"Removed key '{key}' from state_dict for backward compatibility"
249+
)
250+
231251
def _update_rec_metrics(
232252
self, model_out: Dict[str, torch.Tensor], **kwargs: Any
233253
) -> None:

torchrec/metrics/tests/test_metric_module.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,52 @@ def test_async_compute_raises_exception(self) -> None:
664664
):
665665
metric_module.async_compute(concurrent.futures.Future())
666666

667+
def test_load_state_dict_with_trained_batches_key(self) -> None:
668+
metric_module = generate_metric_module(
669+
TestMetricModule,
670+
metrics_config=DefaultMetricsConfig,
671+
batch_size=128,
672+
world_size=1,
673+
my_rank=0,
674+
state_metrics_mapping={},
675+
device=torch.device("cpu"),
676+
)
677+
state_dict = metric_module.state_dict()
678+
679+
# Add the _trained_batches key to simulate old checkpoint
680+
state_dict["_trained_batches"] = torch.tensor(42, dtype=torch.long)
681+
682+
# Load the state_dict with _trained_batches
683+
# This should not raise an error
684+
metric_module.load_state_dict(state_dict)
685+
metric_module.update(gen_test_batch(128))
686+
result = metric_module.compute()
687+
self.assertIsInstance(result, dict)
688+
self.assertTrue(len(result) > 0)
689+
690+
def test_load_state_dict_without_trained_batches_key(self) -> None:
691+
metric_module = generate_metric_module(
692+
TestMetricModule,
693+
metrics_config=DefaultMetricsConfig,
694+
batch_size=128,
695+
world_size=1,
696+
my_rank=0,
697+
state_metrics_mapping={},
698+
device=torch.device("cpu"),
699+
)
700+
state_dict = metric_module.state_dict()
701+
702+
# Verify the key is not in the state_dict
703+
self.assertNotIn("_trained_batches", state_dict)
704+
705+
# Load the clean state_dict
706+
# This should not raise an error
707+
metric_module.load_state_dict(state_dict)
708+
metric_module.update(gen_test_batch(128))
709+
result = metric_module.compute()
710+
self.assertIsInstance(result, dict)
711+
self.assertTrue(len(result) > 0)
712+
667713

668714
def metric_module_gather_state(
669715
rank: int,

0 commit comments

Comments (0)