+import glob
 import os
 import unittest
 
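+# NCCL_SHM_DISABLE=1 steers NCCL away from its shared-memory transport; CI
+# containers often mount a small /dev/shm that fills up and breaks NCCL setup.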
+os.environ.setdefault("NCCL_SHM_DISABLE", "1")
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
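+# Feature flags such as trtllm_for_nccl gate tests on what this build of
+# torch_tensorrt supports (see the skipIf decorators below).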
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,11 +35,144 @@ def is_distributed_nccl_available():
     return False
 
 
+def get_shm_usage():
+    """Get /dev/shm usage statistics."""
+    import shutil
+
+    try:
+        total, used, free = shutil.disk_usage("/dev/shm")
+        return {
+            "total_mb": total / (1024 * 1024),
+            "used_mb": used / (1024 * 1024),
+            "free_mb": free / (1024 * 1024),
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL and torch shared memory segments from /dev/shm.
+
+    Previous CI test runs may leave behind SHM files that cause
+    "No space left on device" errors during NCCL initialization.
+    """
+    print("\n" + "=" * 60)
+    print("NCCL Shared Memory Cleanup")
+    print("=" * 60)
+
+    # Show /dev/shm usage before cleanup
+    usage_before = get_shm_usage()
+    print(f"Before cleanup - /dev/shm usage: {usage_before}")
+
+    # List ALL files in /dev/shm to see what's consuming space
+    print("\nAll files in /dev/shm (including hidden):")
+    try:
+        import subprocess
+
+        # Use ls -la to see all files, including hidden ones
+        result = subprocess.run(
+            ["ls", "-la", "/dev/shm"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        print(result.stdout)
+
+        # Also run du for the actual disk usage; the glob must be expanded by a
+        # shell, so this call intentionally uses shell=True.
+        print("\nDisk usage breakdown (du -sh /dev/shm/*):")
+        result = subprocess.run(
+            "du -sh /dev/shm/* 2>/dev/null | head -20",
+            capture_output=True,
+            text=True,
+            shell=True,
+            timeout=5,
+        )
+        print(result.stdout if result.stdout else "  (no output)")
+
+    except Exception as e:
+        print(f"  Error listing /dev/shm: {e}")
+
+    # Also list using Python for comparison
+    print("\nPython os.listdir():")
+    try:
+        shm_files = []
+        for f in os.listdir("/dev/shm"):
+            path = os.path.join("/dev/shm", f)
+            try:
+                size = os.path.getsize(path)
+                shm_files.append((path, size))
+            except OSError:
+                shm_files.append((path, -1))
+
+        # Sort by size descending
+        shm_files.sort(key=lambda x: x[1], reverse=True)
+        total_listed = 0
+        for path, size in shm_files:
+            if size >= 0:
+                print(f"  {path}: {size / (1024 * 1024):.2f} MB")
+                total_listed += size
+            else:
+                print(f"  {path}: <unable to get size>")
+
+        print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
+        print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
+        # A nonzero discrepancy usually means segments that were unlinked while
+        # still mapped by a live process (see the sketch below this diff).
+        print(
+            "DISCREPANCY: "
+            f"{usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f}"
+            " MB unaccounted for!"
+        )
+
+        if not shm_files:
+            print("  (no files found)")
+    except Exception as e:
+        print(f"  Error: {e}")
+
+    patterns = [
+        "/dev/shm/nccl-*",
+        "/dev/shm/torch_*",
+        "/dev/shm/py_shared_memory_*",
+        "/dev/shm/*multiprocessing*",
+        "/dev/shm/vader_segment*",  # Open MPI shared memory
+        "/dev/shm/sem.*",  # POSIX semaphores
+    ]
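+    # Note: os.remove() on a segment that a live process still maps only unlinks
+    # the name; the tmpfs space is reclaimed once the last mapping is gone.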
+
+    total_files = 0
+    total_bytes_freed = 0
+
+    for pattern in patterns:
+        files = glob.glob(pattern)
+        if files:
+            print(f"\nPattern: {pattern}")
+            for path in files:
+                try:
+                    file_size = os.path.getsize(path)
+                    os.remove(path)
+                    total_files += 1
+                    total_bytes_freed += file_size
+                    print(f"  Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
+                except OSError as e:
+                    print(f"  Failed to remove {path}: {e}")
+
+    # Show /dev/shm usage after cleanup
+    usage_after = get_shm_usage()
+    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
+    print(f"Total files removed: {total_files}")
+    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
+    print("=" * 60 + "\n")
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()
 
+# Clean up stale NCCL shared memory BEFORE initializing process group
+cleanup_nccl_shared_memory()
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
@@ -75,13 +212,21 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory left over from previous runs
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +237,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
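
The DISCREPANCY print in cleanup_nccl_shared_memory() flags space that shutil.disk_usage("/dev/shm") reports as used but that no os.listdir() entry accounts for. On a Linux tmpfs this is expected whenever a segment was unlinked while a live process still maps it: the name disappears, the space does not. A minimal standalone sketch of the effect (assumptions: Linux with a tmpfs /dev/shm, roughly 64 MB free there, and a made-up segment name demo_segment):

import mmap
import os
import shutil

SIZE = 64 * 1024 * 1024  # 64 MB

def used_mb():
    return shutil.disk_usage("/dev/shm").used / (1024 * 1024)

before = used_mb()

# Create a named segment and touch every page so tmpfs actually allocates them.
fd = os.open("/dev/shm/demo_segment", os.O_CREAT | os.O_RDWR)
os.ftruncate(fd, SIZE)
mapping = mmap.mmap(fd, SIZE)
mapping.write(b"\0" * SIZE)
os.close(fd)

# Unlink the name: os.listdir() no longer shows it, but the live mapping keeps
# the tmpfs space allocated, so disk_usage() still counts it as used.
os.remove("/dev/shm/demo_segment")
print(f"delta after unlink: ~{used_mb() - before:.0f} MB, with no visible entry")

# The space is returned only when the last mapping goes away.
mapping.close()
print(f"delta after unmap: ~{used_mb() - before:.0f} MB")

Relatedly, the OMPI_COMM_WORLD_SIZE / OMPI_COMM_WORLD_RANK variables read at module level and in setUpClass are the ones Open MPI's mpirun exports to each rank; run the file without mpirun and neither is set, so the single-process setup path is taken.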