Commit 99ded8c

addressing L2 CI errors
1 parent 6d20d2b commit 99ded8c

File tree: 2 files changed (+142, -2 lines)

.github/workflows/linux-test.yml

Lines changed: 1 addition & 1 deletion

@@ -80,7 +80,7 @@ jobs:
     runs-on: ${{ matrix.validation_runner }}
     container:
       image: ${{ matrix.container_image }}
-      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
+      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all --shm-size=1g' || ' ' }}
     # If a build is taking longer than 120 minutes on these runners we need
     # to have a conversation
     timeout-minutes: 120
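The extra `--shm-size=1g` raises the container's /dev/shm above Docker's 64 MB default, the usual culprit behind the "No space left on device" NCCL failures the test file mentions. A minimal sketch (not part of the commit) for sanity-checking the mount from inside the CUDA container:

# Not part of the commit: quick check that the container really got >= 1 GiB of /dev/shm.
import shutil

total, _, free = shutil.disk_usage("/dev/shm")
print(f"/dev/shm: total {total / 2**20:.0f} MB, free {free / 2**20:.0f} MB")
assert total >= 2**30, "expected at least 1 GiB with --shm-size=1g"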

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 141 additions & 1 deletion
@@ -1,6 +1,9 @@
+import glob
 import os
 import unittest
 
+os.environ.setdefault("NCCL_SHM_DISABLE", "1")
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
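The environment variable is set before the torch imports so it is in place well before any NCCL initialization, and using setdefault lets a value exported by the CI job take precedence over the in-file default. A small illustration of that setdefault behaviour (not from the commit):

# Not part of the commit: illustrates why setdefault() is used instead of a plain assignment.
import os

os.environ["NCCL_SHM_DISABLE"] = "0"            # pretend the CI job exported an override
os.environ.setdefault("NCCL_SHM_DISABLE", "1")  # no-op: the override wins
assert os.environ["NCCL_SHM_DISABLE"] == "0"

del os.environ["NCCL_SHM_DISABLE"]
os.environ.setdefault("NCCL_SHM_DISABLE", "1")  # nothing set, so the default applies
assert os.environ["NCCL_SHM_DISABLE"] == "1"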
@@ -11,6 +14,7 @@
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,11 +35,136 @@ def is_distributed_nccl_available():
         return False
 
 
+def get_shm_usage():
+    """Get /dev/shm usage statistics."""
+    try:
+        import shutil
+
+        total, used, free = shutil.disk_usage("/dev/shm")
+        return {
+            "total_mb": total / (1024 * 1024),
+            "used_mb": used / (1024 * 1024),
+            "free_mb": free / (1024 * 1024),
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL and torch shared memory segments from /dev/shm.
+
+    Previous CI test runs may leave behind SHM files that cause
+    "No space left on device" errors during NCCL initialization.
+    """
+    print("\n" + "=" * 60)
+    print("NCCL Shared Memory Cleanup")
+    print("=" * 60)
+
+    # Show /dev/shm usage before cleanup
+    usage_before = get_shm_usage()
+    print(f"Before cleanup - /dev/shm usage: {usage_before}")
+
+    # List ALL files in /dev/shm to see what's consuming space
+    print("\nAll files in /dev/shm (including hidden):")
+    try:
+        import subprocess
+
+        # Use ls -la to see all files including hidden ones
+        result = subprocess.run(
+            ["ls", "-la", "/dev/shm"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        print(result.stdout)
+
+        # Also run du to see actual disk usage
+        print("\nDisk usage breakdown (du -sh /dev/shm/*):")
+        result = subprocess.run(
+            "du -sh /dev/shm/* 2>/dev/null | head -20",
+            capture_output=True,
+            text=True,
+            shell=True,
+            timeout=5,
+        )
+        print(result.stdout if result.stdout else " (no output)")
+
+    except Exception as e:
+        print(f" Error listing /dev/shm: {e}")
+
+    # Also list using Python for comparison
+    print("\nPython os.listdir():")
+    try:
+        shm_files = []
+        for f in os.listdir("/dev/shm"):
+            path = os.path.join("/dev/shm", f)
+            try:
+                size = os.path.getsize(path)
+                shm_files.append((path, size))
+            except OSError:
+                shm_files.append((path, -1))
+
+        # Sort by size descending
+        shm_files.sort(key=lambda x: x[1], reverse=True)
+        total_listed = 0
+        for path, size in shm_files:
+            if size >= 0:
+                print(f" {path}: {size / (1024 * 1024):.2f} MB")
+                total_listed += size
+            else:
+                print(f" {path}: <unable to get size>")
+
+        print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
+        print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
+        print(f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!")
+
+        if not shm_files:
+            print(" (no files found)")
+    except Exception as e:
+        print(f" Error: {e}")
+
+    patterns = [
+        "/dev/shm/nccl-*",
+        "/dev/shm/torch_*",
+        "/dev/shm/py_shared_memory_*",
+        "/dev/shm/*multiprocessing*",
+        "/dev/shm/vader_segment*",  # Open MPI shared memory
+        "/dev/shm/sem.*",  # POSIX semaphores
+    ]
+
+    total_files = 0
+    total_bytes_freed = 0
+
+    for pattern in patterns:
+        files = glob.glob(pattern)
+        if files:
+            print(f"\nPattern: {pattern}")
+            for path in files:
+                try:
+                    file_size = os.path.getsize(path)
+                    os.remove(path)
+                    total_files += 1
+                    total_bytes_freed += file_size
+                    print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
+                except OSError as e:
+                    print(f" Failed to remove {path}: {e}")
+
+    # Show /dev/shm usage after cleanup
+    usage_after = get_shm_usage()
+    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
+    print(f"Total files removed: {total_files}")
+    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
+    print("=" * 60 + "\n")
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()
 
+# Clean up stale NCCL shared memory BEFORE initializing process group
+cleanup_nccl_shared_memory()
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
@@ -75,13 +204,21 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        # to see if this is needed
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +229,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
