Commit f40e84b

addressing L2 CI errors
1 parent 6d20d2b commit f40e84b

File tree

1 file changed: +149 -1 lines changed

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 149 additions & 1 deletion
@@ -1,6 +1,9 @@
+import glob
 import os
 import unittest

+os.environ.setdefault("NCCL_SHM_DISABLE", "1")
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
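
For context, NCCL_SHM_DISABLE only takes effect if it is in the process environment before the NCCL communicator is created, which is why the setdefault call above sits ahead of the torch imports. A minimal standalone sketch of the same ordering, assuming RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT are supplied by a launcher such as mpirun or torchrun (illustrative only, not part of this commit):

import os

# Must be set before NCCL initializes its transports; "1" disables the
# /dev/shm-based transport, so a full or stale /dev/shm cannot break init.
os.environ.setdefault("NCCL_SHM_DISABLE", "1")

import torch.distributed as dist

# Assumed: rank, world size, and master address come from the launcher.
if not dist.is_initialized():
    dist.init_process_group(backend="nccl")
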
@@ -11,6 +14,7 @@
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES


 def is_distributed_nccl_available():
@@ -31,11 +35,144 @@ def is_distributed_nccl_available():
     return False


+def get_shm_usage():
+    """Get /dev/shm usage statistics."""
+    try:
+        import shutil
+
+        total, used, free = shutil.disk_usage("/dev/shm")
+        return {
+            "total_mb": total / (1024 * 1024),
+            "used_mb": used / (1024 * 1024),
+            "free_mb": free / (1024 * 1024),
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL and torch shared memory segments from /dev/shm.
+
+    Previous CI test runs may leave behind SHM files that cause
+    "No space left on device" errors during NCCL initialization.
+    """
+    print("\n" + "=" * 60)
+    print("NCCL Shared Memory Cleanup")
+    print("=" * 60)
+
+    # Show /dev/shm usage before cleanup
+    usage_before = get_shm_usage()
+    print(f"Before cleanup - /dev/shm usage: {usage_before}")
+
+    # List ALL files in /dev/shm to see what's consuming space
+    print("\nAll files in /dev/shm (including hidden):")
+    try:
+        import subprocess
+        # Use ls -la to see all files including hidden ones
+        result = subprocess.run(
+            ["ls", "-la", "/dev/shm"],
+            capture_output=True,
+            text=True,
+            timeout=5
+        )
+        print(result.stdout)
+
+        # Also run du to see actual disk usage; the glob needs shell=True
+        print("\nDisk usage breakdown (du -sh /dev/shm/*):")
+        result = subprocess.run(
+            "du -sh /dev/shm/* 2>/dev/null | head -20",
+            capture_output=True,
+            text=True,
+            shell=True,
+            timeout=5
+        )
+        print(result.stdout if result.stdout else " (no output)")
+
+    except Exception as e:
+        print(f" Error listing /dev/shm: {e}")
+
+    # Also list using Python for comparison
+    print("\nPython os.listdir():")
+    try:
+        shm_files = []
+        for f in os.listdir("/dev/shm"):
+            path = os.path.join("/dev/shm", f)
+            try:
+                size = os.path.getsize(path)
+                shm_files.append((path, size))
+            except OSError:
+                shm_files.append((path, -1))
+
+        # Sort by size descending
+        shm_files.sort(key=lambda x: x[1], reverse=True)
+        total_listed = 0
+        for path, size in shm_files:
+            if size >= 0:
+                print(f" {path}: {size / (1024 * 1024):.2f} MB")
+                total_listed += size
+            else:
+                print(f" {path}: <unable to get size>")
+
+        print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
+        print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
+        print(f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!")
+
+        if not shm_files:
+            print(" (no files found)")
+    except Exception as e:
+        print(f" Error: {e}")
+
+    patterns = [
+        "/dev/shm/nccl-*",
+        "/dev/shm/torch_*",
+        "/dev/shm/py_shared_memory_*",
+        "/dev/shm/*multiprocessing*",
+        "/dev/shm/vader_segment*",  # Open MPI shared memory
+        "/dev/shm/sem.*",  # POSIX semaphores
+    ]
+
+    total_files = 0
+    total_bytes_freed = 0
+
+    for pattern in patterns:
+        files = glob.glob(pattern)
+        if files:
+            print(f"\nPattern: {pattern}")
+            for path in files:
+                try:
+                    file_size = os.path.getsize(path)
+                    os.remove(path)
+                    total_files += 1
+                    total_bytes_freed += file_size
+                    print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
+                except OSError as e:
+                    print(f" Failed to remove {path}: {e}")
+
+    # Show /dev/shm usage after cleanup
+    usage_after = get_shm_usage()
+    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
+    print(f"Total files removed: {total_files}")
+    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
+    print("=" * 60 + "\n")
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()

+# Clean up stale NCCL shared memory BEFORE initializing process group
+cleanup_nccl_shared_memory()
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
@@ -75,13 +212,21 @@ def forward(self, x):

 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory left over from previous runs
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +237,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()

+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
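
The cleanup above runs at module import time and again in setUpClass/tearDownClass. An alternative wiring, not part of this commit, would be a session-scoped autouse pytest fixture in a conftest.py; a minimal sketch, assuming cleanup_nccl_shared_memory remains importable from this test module:

# conftest.py -- hypothetical alternative, not part of this commit
import pytest

from test_nccl_ops import cleanup_nccl_shared_memory  # assumed import path


@pytest.fixture(scope="session", autouse=True)
def nccl_shm_cleanup():
    # Remove stale /dev/shm segments once before any distributed test runs...
    cleanup_nccl_shared_memory()
    yield
    # ...and once more after the session, so the next CI job starts clean.
    cleanup_nccl_shared_memory()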
