+import glob
import os
import unittest

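+# NCCL_SHM_DISABLE=1 disables NCCL's shared-memory transport (it falls back to
+# P2P/network transports), so NCCL stops allocating transport buffers in /dev/shm.
+# It must be set before the first NCCL communicator is created, hence this early export.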
+os.environ.setdefault("NCCL_SHM_DISABLE", "1")
+
import torch
import torch.distributed as dist
import torch.nn as nn
)
from parameterized import parameterized
from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES


def is_distributed_nccl_available():
@@ -31,11 +35,136 @@ def is_distributed_nccl_available():
    return False


+def get_shm_usage():
+    """Get /dev/shm usage statistics."""
+    try:
+        import shutil
+
+        total, used, free = shutil.disk_usage("/dev/shm")
+        return {
+            "total_mb": total / (1024 * 1024),
+            "used_mb": used / (1024 * 1024),
+            "free_mb": free / (1024 * 1024),
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL and torch shared memory segments from /dev/shm.
+
+    Previous CI test runs may leave behind SHM files that cause
+    "No space left on device" errors during NCCL initialization.
+    """
+    print("\n" + "=" * 60)
+    print("NCCL Shared Memory Cleanup")
+    print("=" * 60)
+
+    # Show /dev/shm usage before cleanup
+    usage_before = get_shm_usage()
+    print(f"Before cleanup - /dev/shm usage: {usage_before}")
+
+    # List ALL files in /dev/shm to see what's consuming space
+    print("\nAll files in /dev/shm (including hidden):")
+    try:
+        import subprocess
+        # Use ls -la to see all files including hidden ones
+        result = subprocess.run(
+            ["ls", "-la", "/dev/shm"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        print(result.stdout)
+
+        # Also run du to see actual disk usage
+        print("\nDisk usage breakdown (du -sh /dev/shm/*):")
+        result = subprocess.run(
+            "du -sh /dev/shm/* 2>/dev/null | head -20",
+            capture_output=True,
+            text=True,
+            shell=True,
+            timeout=5,
+        )
+        print(result.stdout if result.stdout else " (no output)")
+
+    except Exception as e:
+        print(f" Error listing /dev/shm: {e}")
+
+    # Also list using Python for comparison
+    print("\nPython os.listdir():")
+    try:
+        shm_files = []
+        for f in os.listdir("/dev/shm"):
+            path = os.path.join("/dev/shm", f)
+            try:
+                size = os.path.getsize(path)
+                shm_files.append((path, size))
+            except OSError:
+                shm_files.append((path, -1))
+
+        # Sort by size descending
+        shm_files.sort(key=lambda x: x[1], reverse=True)
+        total_listed = 0
+        for path, size in shm_files:
+            if size >= 0:
+                print(f" {path}: {size / (1024 * 1024):.2f} MB")
+                total_listed += size
+            else:
+                print(f" {path}: <unable to get size>")
+
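+        # NOTE: SHM files that were unlinked but are still mapped by running
+        # processes no longer appear in listdir() yet keep consuming /dev/shm
+        # space, so the totals reported below can legitimately disagree.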
+        print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
+        print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
+        print(f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!")
+
+        if not shm_files:
+            print(" (no files found)")
+    except Exception as e:
+        print(f" Error: {e}")
+
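+    # Unlink files matching the SHM naming patterns used by NCCL, PyTorch, Python
+    # multiprocessing, Open MPI, and POSIX semaphores (see the inline notes below).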
+    patterns = [
+        "/dev/shm/nccl-*",
+        "/dev/shm/torch_*",
+        "/dev/shm/py_shared_memory_*",
+        "/dev/shm/*multiprocessing*",
+        "/dev/shm/vader_segment*",  # Open MPI shared memory
+        "/dev/shm/sem.*",  # POSIX semaphores
+    ]
+
+    total_files = 0
+    total_bytes_freed = 0
+
+    for pattern in patterns:
+        files = glob.glob(pattern)
+        if files:
+            print(f"\nPattern: {pattern}")
+            for path in files:
+                try:
+                    file_size = os.path.getsize(path)
+                    os.remove(path)
+                    total_files += 1
+                    total_bytes_freed += file_size
+                    print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
+                except OSError as e:
+                    print(f" Failed to remove {path}: {e}")
+
+    # Show /dev/shm usage after cleanup
+    usage_after = get_shm_usage()
+    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
+    print(f"Total files removed: {total_files}")
+    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
+    print("=" * 60 + "\n")
+
+
34160if "OMPI_COMM_WORLD_SIZE" in os .environ :
35161 set_environment_variables_pytest_multi_process ()
36162else :
37163 set_environment_variables_pytest_single_process ()
38164
165+ # Clean up stale NCCL shared memory BEFORE initializing process group
166+ cleanup_nccl_shared_memory ()
167+
39168if not dist .is_initialized ():
40169 dist .init_process_group (
41170 backend = "nccl" ,
@@ -75,13 +204,21 @@ def forward(self, x):

class TestNcclOpsConverter(DispatchTestCase):
    # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
    @unittest.skipIf(
        not is_distributed_nccl_available(),
        "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
    )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
    @classmethod
    def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs here as well,
+        # to see whether this per-class cleanup is needed in addition to the
+        # module-level call above.
+        cleanup_nccl_shared_memory()
+
        cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
        cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
        cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +229,9 @@ def tearDownClass(cls):
        if dist.is_initialized():
            dist.destroy_process_group()

+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
    @parameterized.expand([8])
    def test_nccl_ops_gather(self, linear_layer_dim):
        inputs = [torch.randn(1, linear_layer_dim).to("cuda")]