+import glob
 import os
 import unittest

 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES


 def is_distributed_nccl_available():
@@ -31,11 +33,41 @@ def is_distributed_nccl_available():
         return False


+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL shared memory segments from /dev/shm.
+
+    In CI environments, previous test runs may leave behind NCCL shared memory
+    segments that fill up /dev/shm. This function removes them to prevent
+    "No space left on device" errors.
+    """
+    try:
+        nccl_files = glob.glob("/dev/shm/nccl-*")
+        for f in nccl_files:
+            try:
+                os.remove(f)
+            except (OSError, PermissionError):
+                # Ignore errors if file is in use or we lack permissions
+                pass
+    except Exception:
+        # If cleanup fails, continue anyway - test may still work
+        pass
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()

+# Clean up stale NCCL shared memory BEFORE initializing the process group
+cleanup_nccl_shared_memory()
+
+# Configure NCCL to use less shared memory in constrained CI environments.
+# NCCL_SHM_DISABLE=1 disables the shared memory transport and uses sockets instead;
+# this is slower but avoids /dev/shm space issues.
+if not os.environ.get("NCCL_SHM_DISABLE"):
+    os.environ["NCCL_SHM_DISABLE"] = "1"
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
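
The new cleanup helper can be sanity-checked in isolation. Below is a minimal sketch, not part of this diff, that exercises cleanup_nccl_shared_memory() against a temporary directory by patching glob.glob; the fake segment name and the patching approach are illustrative assumptions, and the helper is assumed to be in scope (e.g., run from the same module).

import pathlib
import tempfile
from unittest import mock

with tempfile.TemporaryDirectory() as tmp:
    # Create a fake stale NCCL segment and point the helper's glob at it.
    stale = pathlib.Path(tmp) / "nccl-deadbeef"
    stale.touch()
    with mock.patch("glob.glob", return_value=[str(stale)]):
        cleanup_nccl_shared_memory()
    assert not stale.exists()  # the stale segment should have been removed
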
@@ -75,13 +107,21 @@ def forward(self, x):

 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        # (defensive; may be redundant with the module-level cleanup above)
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
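
The new class-level skip mirrors the existing NCCL skip. An equivalent option, sketched below and not part of this diff, would be to skip the whole module at collection time with pytest's allow_module_level, since none of the converter tests can run without the TensorRT-LLM plugin; where the check should live is a design choice.

import pytest
from torch_tensorrt._features import ENABLED_FEATURES

if not ENABLED_FEATURES.trtllm_for_nccl:
    # Skip every test in this module when the TRT-LLM NCCL plugin is missing.
    pytest.skip(
        "TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
        allow_module_level=True,
    )
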
@@ -92,6 +132,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()

+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
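
tearDownClass will not run if the worker process exits before the class finishes. A further hardening option, not part of this diff and purely a sketch, is to also register the cleanup helper with atexit so segments are removed on early interpreter exit (this does not help for hard crashes such as a segfault).

import atexit

# Run the shared-memory cleanup on interpreter shutdown as a last resort.
atexit.register(cleanup_nccl_shared_memory)
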