+import glob
 import os
 import unittest

 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES


 def is_distributed_nccl_available():
@@ -31,11 +33,41 @@ def is_distributed_nccl_available():
         return False


+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL shared memory segments from /dev/shm.
+
+    In CI environments, previous test runs may leave behind NCCL shared memory
+    segments that fill up /dev/shm. This function removes them to prevent
+    "No space left on device" errors.
+    """
+    try:
+        nccl_files = glob.glob("/dev/shm/nccl-*")
+        for f in nccl_files:
+            try:
+                os.remove(f)
+            except (OSError, PermissionError):
+                # Ignore errors if file is in use or we lack permissions
+                pass
+    except Exception:
+        # If cleanup fails, continue anyway - test may still work
+        pass
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()

+# Clean up stale NCCL shared memory BEFORE initializing the process group
+cleanup_nccl_shared_memory()
+
+# Configure NCCL to use less shared memory in constrained CI environments.
+# NCCL_SHM_DISABLE=1 disables the shared memory transport and uses sockets instead;
+# this is slower but avoids /dev/shm space issues.
+if not os.environ.get("NCCL_SHM_DISABLE"):
+    os.environ["NCCL_SHM_DISABLE"] = "1"
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
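
The new cleanup helper can be sanity-checked in isolation. Below is a minimal sketch, not part of this diff, that exercises cleanup_nccl_shared_memory() against a temporary directory by patching glob.glob; the fake segment name and the patching approach are illustrative assumptions, and the helper is assumed to be in scope (e.g., run from the same module).

import pathlib
import tempfile
from unittest import mock

with tempfile.TemporaryDirectory() as tmp:
    # Create a fake stale NCCL segment and point the helper's glob at it.
    stale = pathlib.Path(tmp) / "nccl-deadbeef"
    stale.touch()
    with mock.patch("glob.glob", return_value=[str(stale)]):
        cleanup_nccl_shared_memory()
    assert not stale.exists()  # the stale segment should have been removed
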
@@ -75,13 +107,21 @@ def forward(self, x):

 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        # (defensive; may be redundant with the module-level cleanup above)
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
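
The new class-level skip mirrors the existing NCCL skip. An equivalent option, sketched below and not part of this diff, would be to skip the whole module at collection time with pytest's allow_module_level, since none of the converter tests can run without the TensorRT-LLM plugin; where the check should live is a design choice.

import pytest
from torch_tensorrt._features import ENABLED_FEATURES

if not ENABLED_FEATURES.trtllm_for_nccl:
    # Skip every test in this module when the TRT-LLM NCCL plugin is missing.
    pytest.skip(
        "TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
        allow_module_level=True,
    )
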
@@ -92,6 +132,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()

+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
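
tearDownClass will not run if the worker process exits before the class finishes. A further hardening option, not part of this diff and purely a sketch, is to also register the cleanup helper with atexit so segments are removed on early interpreter exit (this does not help for hard crashes such as a segfault).

import atexit

# Run the shared-memory cleanup on interpreter shutdown as a last resort.
atexit.register(cleanup_nccl_shared_memory)
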