
Commit 28c82d5

iamzainhuda authored and meta-codesync[bot] committed
dynamic 2D + fully sharded 2D (#3600)
Summary:
Pull Request resolved: #3600

Add support for dynamic (D76774334) + fully sharded 2D together. Users can specify which modules to apply fully sharded 2D to by adding `ShardingStrategy` in their submodule configs.

Reviewed By: aliafzal

Differential Revision: D88675533

fbshipit-source-id: ed7c7a4b767aa9317848e5ed65e7dcc2795c5f29
1 parent 71a0539 commit 28c82d5
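
For orientation, the user-facing change is a new `sharding_strategy` field on the per-submodule config. A minimal sketch, assuming the config is handed to DMPCollection via its submodule configs (the wiring is in torchrec/distributed/model_parallel.py below); `plan=None` mirrors the new tests, which supply the sub-tree plan later:

from torchrec.distributed.types import DMPCollectionConfig, ShardingStrategy
from torchrec.modules.embedding_modules import EmbeddingCollection

# Only the EmbeddingCollection sub-tree opts into fully sharded 2D; submodules
# without an explicit strategy keep ShardingStrategy.DEFAULT.
ec_submodule_config = DMPCollectionConfig(
    module=EmbeddingCollection,
    plan=None,  # pyre-ignore[6] -- sub-tree sharding plan supplied later
    sharding_group_size=2,
    sharding_strategy=ShardingStrategy.FULLY_SHARDED,
)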

File tree

4 files changed: +219 -46 lines

torchrec/distributed/batched_embedding_kernel.py

Lines changed: 4 additions & 7 deletions
@@ -2565,9 +2565,8 @@ def _all_gather_table_weights(self) -> None:
         if not self.weights_sharded:
             return
         self._wait_on_reduce_scatter()
-        num_groups = self._env.num_sharding_groups()
         shard_size = self._shard_buf.numel()
-        padded_total_size = shard_size * num_groups
+        padded_total_size = shard_size * self._env.num_sharding_groups()
 
         self._unsharded_param.untyped_storage().resize_(
             padded_total_size * self._element_size
@@ -2629,11 +2628,11 @@ def _reduce_scatter_weights_async(self) -> ReduceScatterResizeAwaitable:
         """
         with torch.no_grad():
             self.weights_sharded = True
-            num_groups = self._env.num_sharding_groups()
 
             # pyre-ignore[29]
             total_size = self._emb_module.weights_dev.numel()
 
+            num_groups = self._env.num_sharding_groups()
             shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
             padded_total_size = shard_size * num_groups
             padding_size = padded_total_size - total_size
@@ -3594,7 +3593,6 @@ def __init__(
         env: Optional[ShardingEnv] = None,
     ) -> None:
         super().__init__(config, pg, device, sharding_type)
-
         assert isinstance(
             env, ShardingEnv2D
         ), "env is required for ShardedBatchedFusedEmbeddingBag"
@@ -3625,9 +3623,8 @@ def _all_gather_table_weights(self) -> None:
             return
         self._wait_on_reduce_scatter()
 
-        num_groups = self._env.num_sharding_groups()
         shard_size = self._shard_buf.numel()
-        padded_total_size = shard_size * num_groups
+        padded_total_size = shard_size * self._env.num_sharding_groups()
 
         self._unsharded_param.untyped_storage().resize_(
             padded_total_size * self._element_size
@@ -3689,11 +3686,11 @@ def _reduce_scatter_weights_async(self) -> ReduceScatterResizeAwaitable:
         """
         with torch.no_grad():
             self.weights_sharded = True
-            num_groups = self._env.num_sharding_groups()
 
             # pyre-ignore[29]
             total_size = self._emb_module.weights_dev.numel()
 
+            num_groups = self._env.num_sharding_groups()
             shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
             padded_total_size = shard_size * num_groups
             padding_size = padded_total_size - total_size
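
The ceil division above keeps every sharding group's shard the same size, so _all_gather_table_weights can recover the padded total simply as shard_size * num_sharding_groups(). A standalone sketch of that arithmetic (the helper function is illustrative, not part of TorchRec):

from typing import Tuple

def padded_shard_sizes(total_size: int, num_groups: int) -> Tuple[int, int, int]:
    # Mirrors the arithmetic in _reduce_scatter_weights_async.
    shard_size = (total_size + num_groups - 1) // num_groups  # ceil division
    padded_total_size = shard_size * num_groups  # what _all_gather_table_weights resizes to
    padding_size = padded_total_size - total_size  # extra elements so the flat weights split evenly
    return shard_size, padded_total_size, padding_size

# e.g. 10 weight elements across 4 sharding groups -> shards of 3, padded total of 12, 2 elements of padding
assert padded_shard_sizes(10, 4) == (3, 12, 2)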

torchrec/distributed/model_parallel.py

Lines changed: 2 additions & 0 deletions
@@ -931,6 +931,7 @@ def __init__(
                     plan=submodule_config.plan,
                     sharding_group_size=submodule_config.sharding_group_size,
                     use_inter_host_allreduce=submodule_config.use_inter_host_allreduce,
+                    sharding_strategy=submodule_config.sharding_strategy,
                 )
             )
 
@@ -1022,6 +1023,7 @@ def _shard_modules_impl(
                     device_mesh=ctx.device_mesh,
                     node_group_size=ctx.sharding_group_size,
                     use_inter_host_allreduce=ctx.use_inter_host_allreduce,
+                    sharding_strategy=ctx.sharding_strategy,
                 )
                 break
 
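A rough sketch of what these two one-line additions accomplish, assuming DMPCollection builds a DMPCollectionContext per submodule config (only the keyword arguments are taken from the diff; the surrounding class code is elided):

from torchrec.distributed.types import (
    DMPCollectionConfig,
    DMPCollectionContext,
    ShardingStrategy,
)
from torchrec.modules.embedding_modules import EmbeddingCollection

submodule_config = DMPCollectionConfig(
    module=EmbeddingCollection,
    plan=None,  # pyre-ignore[6]
    sharding_group_size=2,
    sharding_strategy=ShardingStrategy.FULLY_SHARDED,
)

# First hunk: the per-submodule strategy is copied into the internal context.
ctx = DMPCollectionContext(
    module=submodule_config.module,
    plan=submodule_config.plan,
    sharding_group_size=submodule_config.sharding_group_size,
    use_inter_host_allreduce=submodule_config.use_inter_host_allreduce,
    sharding_strategy=submodule_config.sharding_strategy,
)
# Second hunk: _shard_modules_impl later forwards ctx.sharding_strategy into the
# per-submodule sharding call alongside ctx.device_mesh and the group sizes.
assert ctx.sharding_strategy is ShardingStrategy.FULLY_SHARDED
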
torchrec/distributed/tests/test_2d_sharding.py

Lines changed: 168 additions & 0 deletions
@@ -952,6 +952,172 @@ def test_sharding_dynamic_2D(
             submodule_configs=[ec_submodule_config],
         )
 
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 7,
+        "Not enough GPUs, this test requires at least eight GPUs",
+    )
+    # pyre-fixme[56]
+    @given(
+        sharding_type=st.just(ShardingType.ROW_WISE.value),
+        kernel_type=st.sampled_from(
+            [
+                # EmbeddingComputeKernel.DENSE.value,
+                EmbeddingComputeKernel.FUSED.value,
+            ]
+        ),
+        qcomms_config=st.sampled_from(
+            [
+                None,
+                QCommsConfig(
+                    forward_precision=CommType.FP16, backward_precision=CommType.BF16
+                ),
+            ]
+        ),
+        apply_optimizer_in_backward_config=st.sampled_from(
+            [
+                None,
+                {
+                    "embedding_bags": (torch.optim.SGD, {"lr": 0.01}),
+                    "embeddings": (torch.optim.SGD, {"lr": 0.2}),
+                },
+            ]
+        ),
+        variable_batch_size=st.booleans(),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=1, deadline=None)
+    def test_fully_sharded_dynamic_2D(
+        self,
+        sharding_type: str,
+        kernel_type: str,
+        qcomms_config: Optional[QCommsConfig],
+        apply_optimizer_in_backward_config: Optional[
+            Dict[str, Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]]
+        ],
+        variable_batch_size: bool,
+    ) -> None:
+        assume(
+            apply_optimizer_in_backward_config is None
+            or kernel_type != EmbeddingComputeKernel.DENSE.value
+        )
+
+        # add sharding plan for embedding collection later
+        ec_submodule_config = DMPCollectionConfig(
+            module=EmbeddingCollection,
+            sharding_group_size=2,
+            plan=None,  # pyre-ignore[6]
+            sharding_strategy=ShardingStrategy.FULLY_SHARDED,
+        )
+
+        self._test_sharding(
+            world_size=self.WORLD_SIZE,
+            world_size_2D=self.WORLD_SIZE_2D,
+            sharders=[  # pyre-ignore[6]
+                cast(
+                    ModuleSharder[nn.Module],
+                    create_test_sharder(
+                        SharderType.EMBEDDING_BAG_COLLECTION.value,
+                        sharding_type,
+                        kernel_type,
+                        qcomms_config=qcomms_config,
+                        device=torch.device("cuda"),
+                    ),
+                ),
+            ],
+            backend="nccl",
+            qcomms_config=qcomms_config,
+            constraints={
+                table.name: ParameterConstraints(min_partition=2)
+                for table in self.tables
+            },
+            apply_optimizer_in_backward_config=apply_optimizer_in_backward_config,
+            variable_batch_size=variable_batch_size,
+            submodule_configs=[ec_submodule_config],
+            sharding_strategy=ShardingStrategy.FULLY_SHARDED,
+        )
+
+    @unittest.skipIf(
+        torch.cuda.device_count() <= 7,
+        "Not enough GPUs, this test requires at least eight GPUs",
+    )
+    # pyre-fixme[56]
+    @given(
+        sharding_type=st.just(ShardingType.ROW_WISE.value),
+        kernel_type=st.sampled_from(
+            [
+                # EmbeddingComputeKernel.DENSE.value,
+                EmbeddingComputeKernel.FUSED.value,
+            ]
+        ),
+        qcomms_config=st.sampled_from(
+            [
+                None,
+                QCommsConfig(
+                    forward_precision=CommType.FP16, backward_precision=CommType.BF16
+                ),
+            ]
+        ),
+        apply_optimizer_in_backward_config=st.sampled_from(
+            [
+                None,
+                {
+                    "embedding_bags": (torch.optim.SGD, {"lr": 0.01}),
+                    "embeddings": (torch.optim.SGD, {"lr": 0.2}),
+                },
+            ]
+        ),
+        variable_batch_size=st.booleans(),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=1, deadline=None)
+    def test_partially_fully_sharded_dynamic_2D(
+        self,
+        sharding_type: str,
+        kernel_type: str,
+        qcomms_config: Optional[QCommsConfig],
+        apply_optimizer_in_backward_config: Optional[
+            Dict[str, Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]]
+        ],
+        variable_batch_size: bool,
+    ) -> None:
+        assume(
+            apply_optimizer_in_backward_config is None
+            or kernel_type != EmbeddingComputeKernel.DENSE.value
+        )
+
+        # add sharding plan for embedding collection later
+        ec_submodule_config = DMPCollectionConfig(
+            module=EmbeddingCollection,
+            sharding_group_size=2,
+            plan=None,  # pyre-ignore[6]
+            sharding_strategy=ShardingStrategy.FULLY_SHARDED,  # only apply fully sharded to EC tables
+        )
+
+        self._test_sharding(
+            world_size=self.WORLD_SIZE,
+            world_size_2D=self.WORLD_SIZE_2D,
+            sharders=[  # pyre-ignore[6]
+                cast(
+                    ModuleSharder[nn.Module],
+                    create_test_sharder(
+                        SharderType.EMBEDDING_BAG_COLLECTION.value,
+                        sharding_type,
+                        kernel_type,
+                        qcomms_config=qcomms_config,
+                        device=torch.device("cuda"),
+                    ),
+                ),
+            ],
+            backend="nccl",
+            qcomms_config=qcomms_config,
+            constraints={
+                table.name: ParameterConstraints(min_partition=2)
+                for table in self.tables
+            },
+            apply_optimizer_in_backward_config=apply_optimizer_in_backward_config,
+            variable_batch_size=variable_batch_size,
+            submodule_configs=[ec_submodule_config],
+            sharding_strategy=ShardingStrategy.DEFAULT,
+        )
+
     def _test_sharding(
         self,
         sharders: List[TestEmbeddingCollectionSharder],
@@ -969,6 +1135,7 @@ def _test_sharding(
         variable_batch_size: bool = False,
         variable_batch_per_feature: bool = False,
         submodule_configs: Optional[List[DMPCollectionConfig]] = None,
+        sharding_strategy: ShardingStrategy = ShardingStrategy.DEFAULT,
     ) -> None:
         self._run_multi_process_test(
             callable=sharding_single_rank_test,
@@ -988,6 +1155,7 @@ def _test_sharding(
             variable_batch_per_feature=variable_batch_per_feature,
             global_constant_batch=True,
             submodule_configs=submodule_configs,
+            sharding_strategy=sharding_strategy,
         )
 
 
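The two new tests differ only in how the strategies are combined: the EC submodule config requests FULLY_SHARDED in both, while the top-level sharding_strategy passed to _test_sharding is FULLY_SHARDED in one and DEFAULT in the other. A compact, illustrative summary (the dict keys are labels for the two cases, not TorchRec API):

from torchrec.distributed.types import ShardingStrategy

# test_fully_sharded_dynamic_2D: everything runs fully sharded 2D.
case_fully_sharded = {
    "top_level_strategy": ShardingStrategy.FULLY_SHARDED,
    "ec_submodule_strategy": ShardingStrategy.FULLY_SHARDED,
}

# test_partially_fully_sharded_dynamic_2D: only the EC sub-tree opts in; the rest
# of the model keeps the default 2D behavior.
case_partially_fully_sharded = {
    "top_level_strategy": ShardingStrategy.DEFAULT,
    "ec_submodule_strategy": ShardingStrategy.FULLY_SHARDED,
}
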
torchrec/distributed/types.py

Lines changed: 45 additions & 39 deletions
@@ -931,6 +931,51 @@ class ShardingStrategy(Enum):
     FULLY_SHARDED = "fully_sharded"
 
 
+class DMPCollectionConfig:
+    module: Type[nn.Module]
+    plan: "ShardingPlan" = field(repr=False)  # sub-tree-specific sharding plan
+    sharding_group_size: int
+    node_group_size: Optional[int] = None
+    use_inter_host_allreduce: bool = False
+    sharding_strategy: ShardingStrategy = ShardingStrategy.DEFAULT
+
+    def __init__(
+        self,
+        module: Type[nn.Module],
+        plan: "ShardingPlan",
+        sharding_group_size: int,
+        node_group_size: Optional[int] = None,
+        use_inter_host_allreduce: bool = False,
+        sharding_strategy: ShardingStrategy = ShardingStrategy.DEFAULT,
+    ) -> None:
+        self.module = module
+        self.plan = plan
+        self.sharding_group_size = sharding_group_size
+        self.node_group_size = node_group_size
+        self.use_inter_host_allreduce = use_inter_host_allreduce
+        self.sharding_strategy = sharding_strategy
+
+    def __post_init__(self) -> None:
+        if isinstance(self.module, ShardedModule):
+            raise ValueError(
+                f"ShardedModule should not be passed into DMPCollectionConfig: got {type(self.module)}"
+            )
+
+
+# for internal use in DMPCollection
+class DMPCollectionContext(DMPCollectionConfig):
+    device_mesh: "DeviceMesh" = field(init=False)
+    sharding_pg: "dist.ProcessGroup" = field(init=False)
+    replica_pg: "dist.ProcessGroup" = field(init=False)
+    modules_to_sync: List[Tuple[nn.Module, nn.Module]] = field(
+        init=False, default_factory=list
+    )
+    sharded_module: Optional[nn.Module] = field(init=False, default=None)
+    sharding_strategy: ShardingStrategy = field(
+        init=False, default=ShardingStrategy.DEFAULT
+    )
+
+
 class ShardingEnv2D(ShardingEnv):
     """
     Creates a sharding environment for 2D parallelism, enables usage of 2D parallelism in sharding
@@ -1375,42 +1420,3 @@ class ShardingBucketMetadata:
     num_buckets_per_shard: List[int]
     bucket_offsets_per_shard: List[int]
     bucket_size: int
-
-
-class DMPCollectionConfig:
-    module: Type[nn.Module]
-    plan: "ShardingPlan" = field(repr=False)  # sub-tree-specific sharding plan
-    sharding_group_size: int
-    node_group_size: Optional[int] = None
-    use_inter_host_allreduce: bool = False
-
-    def __init__(
-        self,
-        module: Type[nn.Module],
-        plan: "ShardingPlan",
-        sharding_group_size: int,
-        node_group_size: Optional[int] = None,
-        use_inter_host_allreduce: bool = False,
-    ) -> None:
-        self.module = module
-        self.plan = plan
-        self.sharding_group_size = sharding_group_size
-        self.node_group_size = node_group_size
-        self.use_inter_host_allreduce = use_inter_host_allreduce
-
-    def __post_init__(self) -> None:
-        if isinstance(self.module, ShardedModule):
-            raise ValueError(
-                f"ShardedModule should not be passed into DMPCollectionConfig: got {type(self.module)}"
-            )
-
-
-# for internal use in DMPCollection
-class DMPCollectionContext(DMPCollectionConfig):
-    device_mesh: "DeviceMesh" = field(init=False)
-    sharding_pg: "dist.ProcessGroup" = field(init=False)
-    replica_pg: "dist.ProcessGroup" = field(init=False)
-    modules_to_sync: List[Tuple[nn.Module, nn.Module]] = field(
-        init=False, default_factory=list
-    )
-    sharded_module: Optional[nn.Module] = field(init=False, default=None)