
Commit aa1eeda

iamzainhuda authored and meta-codesync[bot] committed

Add uneven shard size support to Fully Sharded 2D collectives and unit tests (#3584)
Summary:
Pull Request resolved: #3584

Adds support for uneven sharding splits across the data parallel dimension. In sharding types such as row-wise and table-row-wise, uneven shard sizes can occur, and these cases cause the current fully sharded 2D collectives to fail. We add padding so that the collectives always see equal shapes. The shape handling works as follows:

```python
total_size = self._emb_module.weights_dev.numel()
shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
padded_total_size = shard_size * num_groups
padding_size = padded_total_size - total_size
if padding_size > 0:
    input_tensor = torch.nn.functional.pad(
        self._emb_module.weights_dev.contiguous(),
        (0, padding_size),
        value=0.0,
    )
else:
    input_tensor = self._emb_module.weights_dev.contiguous()
```

Padding is applied to the right-most shard (the same convention TorchRec uses for uneven sharding, where the last shard is the uneven one). The all_gather accounts for this as well:

```python
num_groups = self._env.num_sharding_groups()
shard_size = self._shard_buf.numel()
padded_total_size = shard_size * num_groups
self._unsharded_param.untyped_storage().resize_(
    padded_total_size * self._element_size
)
self._emb_module.weights_dev = self._unsharded_param[
    : self._original_shape.numel()
]
```

This diff also adds the required unit tests for all sharding types for fully sharded 2D (sequence and pooled embeddings).

Reviewed By: liangbeixu, kausv

Differential Revision: D87406987

fbshipit-source-id: d1311bd665a6ce2443035f2da92ca73cdb892db3
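For reference, a minimal standalone sketch of the padding arithmetic above, using plain tensors and toy sizes (no process group; `num_groups` and the 10-element buffer are illustrative values, not the TorchRec API):

```python
import torch

# Toy values: 4 sharding groups, a flat weight buffer of 10 elements that
# does not divide evenly across the groups.
num_groups = 4
weights_dev = torch.arange(10, dtype=torch.float32)

total_size = weights_dev.numel()
shard_size = (total_size + num_groups - 1) // num_groups  # ceil division -> 3
padded_total_size = shard_size * num_groups               # 12
padding_size = padded_total_size - total_size             # 2

# Zero padding goes on the right, so only the last shard is affected --
# matching TorchRec's convention that the last shard is the uneven one.
input_tensor = torch.nn.functional.pad(
    weights_dev.contiguous(), (0, padding_size), value=0.0
)
shards = input_tensor.chunk(num_groups)
print([s.tolist() for s in shards])
# [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 0, 0]]  <- padding lands in the last shard only
```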
1 parent 7c30d39 commit aa1eeda

4 files changed: +754 −37 lines

torchrec/distributed/batched_embedding_kernel.py

Lines changed: 69 additions & 35 deletions
```diff
@@ -2548,15 +2548,13 @@ def __init__(
         self._env: ShardingEnv2D = env
 
         self.weights_sharded = False
+        self._element_size = self._emb_module.weights_dev.element_size()
         # pyre-ignore[8]
         self._original_shape: torch.Size = self._emb_module.weights_dev.shape
         # pyre-ignore[8]
         self._unsharded_param: torch.Tensor = self._emb_module.weights_dev
-        self._stash_nbytes: int = (
-            self._emb_module.weights_dev.untyped_storage().nbytes()  # pyre-ignore[29]
-        )
         self._shard_buf_nbytes: int = 0
-        self.shard_buf: Optional[torch.Tensor] = None
+        self._shard_buf: Optional[torch.Tensor] = None
 
         self._async_stream: torch.cuda.Stream = torch.cuda.Stream(
             device=self._emb_module.weights_dev.device
@@ -2573,18 +2571,26 @@ def _all_gather_table_weights(self) -> None:
         if not self.weights_sharded:
             return
         self._wait_on_reduce_scatter()
-        self._unsharded_param.untyped_storage().resize_(self._stash_nbytes)
+        num_groups = self._env.num_sharding_groups()
+        shard_size = self._shard_buf.numel()
+        padded_total_size = shard_size * num_groups
+
+        self._unsharded_param.untyped_storage().resize_(
+            padded_total_size * self._element_size
+        )
 
         dist.all_gather_into_tensor(
             output_tensor=self._unsharded_param,
-            input_tensor=self.shard_buf,
+            input_tensor=self._shard_buf,
             group=self._env.replica_pg,
             async_op=False,
         )
         # pyre-ignore[16]
-        self._emb_module.weights_dev = self._unsharded_param
+        self._emb_module.weights_dev = self._unsharded_param[
+            : self._original_shape.numel()
+        ]
         # pyre-ignore[16]
-        self.shard_buf.untyped_storage().resize_(0)
+        self._shard_buf.untyped_storage().resize_(0)
         self.weights_sharded = False
 
     def _hybird_sharded_backward_hook(
@@ -2633,26 +2639,38 @@ def _reduce_scatter_weights_async(self) -> ReduceScatterResizeAwaitable:
 
         # pyre-ignore[29]
         total_size = self._emb_module.weights_dev.numel()
-        shard_size = total_size // num_groups
 
-        if self.shard_buf is None:
-            self.shard_buf = torch.empty(
+        shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
+        padded_total_size = shard_size * num_groups
+        padding_size = padded_total_size - total_size
+
+        if padding_size > 0:
+            input_tensor = torch.nn.functional.pad(
+                self._emb_module.weights_dev.contiguous(),
+                (0, padding_size),
+                value=0.0,
+            )
+        else:
+            input_tensor = self._emb_module.weights_dev.contiguous()
+
+        if self._shard_buf is None:
+            self._shard_buf = torch.empty(
                 shard_size,
                 # pyre-ignore[6]
                 dtype=self._emb_module.weights_dev.dtype,
                 # pyre-ignore[6]
                 device=self._emb_module.weights_dev.device,
             )
             # pyre-ignore[16]
-            self._shard_buf_nbytes = self.shard_buf.untyped_storage().nbytes()
+            self._shard_buf_nbytes = self._shard_buf.untyped_storage().nbytes()
         else:
-            self.shard_buf.untyped_storage().resize_(self._shard_buf_nbytes)
+            self._shard_buf.untyped_storage().resize_(self._shard_buf_nbytes)
 
         # pyre-ignore[29]
         input_tensor = self._emb_module.weights_dev.contiguous()
 
         self._async_work = dist.reduce_scatter_tensor(
-            output=self.shard_buf,
+            output=self._shard_buf,
             input=input_tensor,
             op=dist.ReduceOp.AVG,
             group=self._env.replica_pg,
@@ -2665,14 +2683,14 @@ def _reduce_scatter_weights_async(self) -> ReduceScatterResizeAwaitable:
 
         def resize_callback() -> None:
             self._emb_module.weights_dev.untyped_storage().resize_(0)  # pyre-ignore[29]
-            self._emb_module.weights_dev = self.shard_buf  # pyre-ignore[16]
+            self._emb_module.weights_dev = self._shard_buf  # pyre-ignore[16]
 
         return ReduceScatterResizeAwaitable(
             async_work=self._async_work,
             async_event=self._async_event,
             async_stream=self._async_stream,
             unsharded_param=self._unsharded_param,
-            shard_buf=self.shard_buf,
+            shard_buf=self._shard_buf,
             resize_callback=resize_callback,
         )
 
@@ -3590,15 +3608,13 @@ def __init__(
         self._env: ShardingEnv2D = env
 
         self.weights_sharded = False
+        self._element_size = self._emb_module.weights_dev.element_size()
         # pyre-ignore[8]
         self._original_shape: torch.Size = self._emb_module.weights_dev.shape
         # pyre-ignore[8]
         self._unsharded_param: torch.Tensor = self._emb_module.weights_dev
-        self._stash_nbytes: int = (
-            self._emb_module.weights_dev.untyped_storage().nbytes()  # pyre-ignore[29]
-        )
         self._shard_buf_nbytes: int = 0
-        self.shard_buf: Optional[torch.Tensor] = None
+        self._shard_buf: Optional[torch.Tensor] = None
 
         self._async_stream: torch.cuda.Stream = torch.cuda.Stream(
             device=self._emb_module.weights_dev.device
@@ -3615,18 +3631,27 @@ def _all_gather_table_weights(self) -> None:
         if not self.weights_sharded:
             return
         self._wait_on_reduce_scatter()
-        self._unsharded_param.untyped_storage().resize_(self._stash_nbytes)
+
+        num_groups = self._env.num_sharding_groups()
+        shard_size = self._shard_buf.numel()
+        padded_total_size = shard_size * num_groups
+
+        self._unsharded_param.untyped_storage().resize_(
+            padded_total_size * self._element_size
+        )
 
         dist.all_gather_into_tensor(
             output_tensor=self._unsharded_param,
-            input_tensor=self.shard_buf,
+            input_tensor=self._shard_buf,
             group=self._env.replica_pg,
             async_op=False,
         )
         # pyre-ignore[16]
-        self._emb_module.weights_dev = self._unsharded_param
+        self._emb_module.weights_dev = self._unsharded_param[
+            : self._original_shape.numel()
+        ]
         # pyre-ignore[16]
-        self.shard_buf.untyped_storage().resize_(0)
+        self._shard_buf.untyped_storage().resize_(0)
         self.weights_sharded = False
 
     def _hybird_sharded_backward_hook(
@@ -3675,26 +3700,35 @@ def _reduce_scatter_weights_async(self) -> ReduceScatterResizeAwaitable:
 
         # pyre-ignore[29]
         total_size = self._emb_module.weights_dev.numel()
-        shard_size = total_size // num_groups
 
-        if self.shard_buf is None:
-            self.shard_buf = torch.empty(
+        shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
+        padded_total_size = shard_size * num_groups
+        padding_size = padded_total_size - total_size
+
+        if padding_size > 0:
+            input_tensor = torch.nn.functional.pad(
+                self._emb_module.weights_dev.contiguous(),
+                (0, padding_size),
+                value=0.0,
+            )
+        else:
+            input_tensor = self._emb_module.weights_dev.contiguous()
+
+        if self._shard_buf is None:
+            self._shard_buf = torch.empty(
                 shard_size,
                 # pyre-ignore[6]
                 dtype=self._emb_module.weights_dev.dtype,
                 # pyre-ignore[6]
                 device=self._emb_module.weights_dev.device,
             )
             # pyre-ignore[16]
-            self._shard_buf_nbytes = self.shard_buf.untyped_storage().nbytes()
+            self._shard_buf_nbytes = self._shard_buf.untyped_storage().nbytes()
         else:
-            self.shard_buf.untyped_storage().resize_(self._shard_buf_nbytes)
-
-        # pyre-ignore[29]
-        input_tensor = self._emb_module.weights_dev.contiguous()
+            self._shard_buf.untyped_storage().resize_(self._shard_buf_nbytes)
 
         self._async_work = dist.reduce_scatter_tensor(
-            output=self.shard_buf,
+            output=self._shard_buf,
             input=input_tensor,
             op=dist.ReduceOp.AVG,
             group=self._env.replica_pg,
@@ -3707,14 +3741,14 @@ def _reduce_scatter_weights_async(self) -> ReduceScatterResizeAwaitable:
 
         def resize_callback() -> None:
             self._emb_module.weights_dev.untyped_storage().resize_(0)  # pyre-ignore[29]
-            self._emb_module.weights_dev = self.shard_buf  # pyre-ignore[16]
+            self._emb_module.weights_dev = self._shard_buf  # pyre-ignore[16]
 
         return ReduceScatterResizeAwaitable(
            async_work=self._async_work,
            async_event=self._async_event,
            async_stream=self._async_stream,
            unsharded_param=self._unsharded_param,
-           shard_buf=self.shard_buf,
+           shard_buf=self._shard_buf,
            resize_callback=resize_callback,
        )
```

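The kernel changes above rely on the usual stash/restore storage pattern: release the full weight storage once the reduce-scatter owns the shard, then grow it back to the padded size before the all-gather and expose only the first `original_shape.numel()` elements to the kernel. A minimal CPU-only sketch of that pattern, assuming only public `untyped_storage()` behavior and toy sizes:

```python
import torch

# Stand-in for the unsharded embedding weights; original numel == 10.
unsharded_param = torch.zeros(10, dtype=torch.float32)
original_numel = unsharded_param.numel()
element_size = unsharded_param.element_size()

# After the reduce-scatter, each rank only keeps its shard, so the backing
# storage of the full parameter can be released in place.
unsharded_param.untyped_storage().resize_(0)
assert unsharded_param.untyped_storage().nbytes() == 0

# Before the all-gather, grow the storage to the *padded* total size so the
# collective can write num_groups equal-sized shards into it.
num_groups, shard_size = 4, 3                     # ceil(10 / 4) == 3
padded_total_size = shard_size * num_groups       # 12 elements
unsharded_param.untyped_storage().resize_(padded_total_size * element_size)

# The embedding module is then pointed at only the first original_numel
# elements, so the trailing padding never leaks into the kernel.
weights_view = unsharded_param.flatten()[:original_numel]
print(weights_view.shape)  # torch.Size([10])
```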
torchrec/distributed/test_utils/test_model_parallel.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -26,7 +26,7 @@
     SharderType,
     sharding_single_rank_test,
 )
-from torchrec.distributed.types import ModuleSharder, ShardingType
+from torchrec.distributed.types import ModuleSharder, ShardingStrategy, ShardingType
 from torchrec.modules.embedding_configs import EmbeddingBagConfig, PoolingType
 from torchrec.test_utils import seed_and_log, skip_if_asan_class
 from torchrec.types import DataType
@@ -161,6 +161,7 @@ def _test_sharding(
         indices_dtype: torch.dtype = torch.int64,
         offsets_dtype: torch.dtype = torch.int64,
         lengths_dtype: torch.dtype = torch.int64,
+        sharding_strategy: Optional[ShardingStrategy] = None,
     ) -> None:
         self._build_tables_and_groups(data_type=data_type)
         # directly run the test with single process
@@ -191,6 +192,7 @@ def _test_sharding(
                 indices_dtype=indices_dtype,
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
+                sharding_strategy=sharding_strategy,
             )
         else:
             self._run_multi_process_test(
@@ -219,6 +221,7 @@ def _test_sharding(
                 indices_dtype=indices_dtype,
                 offsets_dtype=offsets_dtype,
                 lengths_dtype=lengths_dtype,
+                sharding_strategy=sharding_strategy,
             )
 
     def _test_dynamic_sharding(
```

torchrec/distributed/test_utils/test_sharding.py

Lines changed: 5 additions & 0 deletions
```diff
@@ -62,6 +62,7 @@
     ShardedTensor,
     ShardingEnv,
     ShardingPlan,
+    ShardingStrategy,
     ShardingType,
 )
 from torchrec.modules.embedding_configs import (
@@ -790,6 +791,7 @@ def sharding_single_rank_test_single_process(
     offsets_dtype: torch.dtype = torch.int64,
     lengths_dtype: torch.dtype = torch.int64,
     random_seed: Optional[int] = None,
+    sharding_strategy: Optional[ShardingStrategy] = None,
 ) -> None:
     batch_size = random.randint(0, batch_size) if allow_zero_batch_size else batch_size
     # Generate model & inputs.
@@ -956,6 +958,7 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
             use_inter_host_allreduce=use_inter_host_allreduce,
             custom_all_reduce=all_reduce_func,
             submodule_configs=submodule_configs,
+            sharding_strategy=sharding_strategy,
         )
     else:
         local_model = DistributedModelParallel(
@@ -1069,6 +1072,7 @@ def sharding_single_rank_test(
     offsets_dtype: torch.dtype = torch.int64,
     lengths_dtype: torch.dtype = torch.int64,
     random_seed: Optional[int] = None,
+    sharding_strategy: Optional[ShardingStrategy] = None,
 ) -> None:
     with MultiProcessContext(rank, world_size, backend, local_size) as ctx:
         assert ctx.pg is not None
@@ -1104,6 +1108,7 @@ def sharding_single_rank_test(
             offsets_dtype=offsets_dtype,
             lengths_dtype=lengths_dtype,
             random_seed=random_seed,
+            sharding_strategy=sharding_strategy,
         )
 
 
```

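The added unit tests themselves are not reproduced in this view. As a standalone stand-in for the collective-level invariant they rely on, the sketch below sweeps a few uneven `(total_size, num_groups)` pairs and checks the pad / shard / regather / truncate round trip described in the summary (pure-tensor simulation; the real tests run the sharded models end to end):

```python
import torch


def round_trip(total_size: int, num_groups: int) -> bool:
    """Pad, shard, regather, and truncate a toy weight buffer; return True
    if the original contents are recovered exactly."""
    weights = torch.randn(total_size)
    shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
    padding = shard_size * num_groups - total_size
    padded = torch.nn.functional.pad(weights, (0, padding), value=0.0)
    shards = padded.chunk(num_groups)              # stand-in for reduce_scatter shards
    gathered = torch.cat(shards)                   # stand-in for all_gather_into_tensor
    return torch.equal(gathered[:total_size], weights)


# Uneven cases that would break an exact total_size // num_groups split,
# plus one even case as a control.
for total_size, num_groups in [(10, 4), (7, 2), (13, 5), (16, 4)]:
    assert round_trip(total_size, num_groups), (total_size, num_groups)
print("all uneven splits round-trip correctly")
```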