Commit 99ded8c

addressing L2 CI errors
1 parent 6d20d2b commit 99ded8c

File tree: 2 files changed (+142, -2 lines)

.github/workflows/linux-test.yml

Lines changed: 1 addition & 1 deletion

@@ -80,7 +80,7 @@ jobs:
     runs-on: ${{ matrix.validation_runner }}
     container:
       image: ${{ matrix.container_image }}
-      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
+      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all --shm-size=1g' || ' ' }}
     # If a build is taking longer than 120 minutes on these runners we need
     # to have a conversation
     timeout-minutes: 120
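The extra `--shm-size=1g` raises the container's /dev/shm above Docker's 64 MB default, the usual culprit behind the "No space left on device" NCCL failures the test file mentions. A minimal sketch (not part of the commit) for sanity-checking the mount from inside the CUDA container:

# Not part of the commit: quick check that the container really got >= 1 GiB of /dev/shm.
import shutil

total, _, free = shutil.disk_usage("/dev/shm")
print(f"/dev/shm: total {total / 2**20:.0f} MB, free {free / 2**20:.0f} MB")
assert total >= 2**30, "expected at least 1 GiB with --shm-size=1g"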

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 141 additions & 1 deletion
@@ -1,6 +1,9 @@
+import glob
 import os
 import unittest
 
+os.environ.setdefault("NCCL_SHM_DISABLE", "1")
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
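The environment variable is set before the torch imports so it is in place well before any NCCL initialization, and using setdefault lets a value exported by the CI job take precedence over the in-file default. A small illustration of that setdefault behaviour (not from the commit):

# Not part of the commit: illustrates why setdefault() is used instead of a plain assignment.
import os

os.environ["NCCL_SHM_DISABLE"] = "0"            # pretend the CI job exported an override
os.environ.setdefault("NCCL_SHM_DISABLE", "1")  # no-op: the override wins
assert os.environ["NCCL_SHM_DISABLE"] == "0"

del os.environ["NCCL_SHM_DISABLE"]
os.environ.setdefault("NCCL_SHM_DISABLE", "1")  # nothing set, so the default applies
assert os.environ["NCCL_SHM_DISABLE"] == "1"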
@@ -11,6 +14,7 @@
 )
 from parameterized import parameterized
 from torch.testing._internal.common_utils import run_tests
+from torch_tensorrt._features import ENABLED_FEATURES
 
 
 def is_distributed_nccl_available():
@@ -31,11 +35,136 @@ def is_distributed_nccl_available():
         return False
 
 
+def get_shm_usage():
+    """Get /dev/shm usage statistics."""
+    try:
+        import shutil
+
+        total, used, free = shutil.disk_usage("/dev/shm")
+        return {
+            "total_mb": total / (1024 * 1024),
+            "used_mb": used / (1024 * 1024),
+            "free_mb": free / (1024 * 1024),
+        }
+    except Exception as e:
+        return {"error": str(e)}
+
+
+def cleanup_nccl_shared_memory():
+    """
+    Clean up stale NCCL and torch shared memory segments from /dev/shm.
+
+    Previous CI test runs may leave behind SHM files that cause
+    "No space left on device" errors during NCCL initialization.
+    """
+    print("\n" + "=" * 60)
+    print("NCCL Shared Memory Cleanup")
+    print("=" * 60)
+
+    # Show /dev/shm usage before cleanup
+    usage_before = get_shm_usage()
+    print(f"Before cleanup - /dev/shm usage: {usage_before}")
+
+    # List ALL files in /dev/shm to see what's consuming space
+    print("\nAll files in /dev/shm (including hidden):")
+    try:
+        import subprocess
+
+        # Use ls -la to see all files including hidden ones
+        result = subprocess.run(
+            ["ls", "-la", "/dev/shm"],
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        print(result.stdout)
+
+        # Also run du to see actual disk usage
+        print("\nDisk usage breakdown (du -sh /dev/shm/*):")
+        result = subprocess.run(
+            "du -sh /dev/shm/* 2>/dev/null | head -20",
+            capture_output=True,
+            text=True,
+            shell=True,
+            timeout=5,
+        )
+        print(result.stdout if result.stdout else " (no output)")
+
+    except Exception as e:
+        print(f" Error listing /dev/shm: {e}")
+
+    # Also list using Python for comparison
+    print("\nPython os.listdir():")
+    try:
+        shm_files = []
+        for f in os.listdir("/dev/shm"):
+            path = os.path.join("/dev/shm", f)
+            try:
+                size = os.path.getsize(path)
+                shm_files.append((path, size))
+            except OSError:
+                shm_files.append((path, -1))
+
+        # Sort by size descending
+        shm_files.sort(key=lambda x: x[1], reverse=True)
+        total_listed = 0
+        for path, size in shm_files:
+            if size >= 0:
+                print(f" {path}: {size / (1024 * 1024):.2f} MB")
+                total_listed += size
+            else:
+                print(f" {path}: <unable to get size>")
+
+        print(f"\nTotal from listed files: {total_listed / (1024 * 1024):.2f} MB")
+        print(f"Reported used: {usage_before.get('used_mb', 'N/A')} MB")
+        print(f"DISCREPANCY: {usage_before.get('used_mb', 0) - total_listed / (1024 * 1024):.2f} MB unaccounted for!")
+
+        if not shm_files:
+            print(" (no files found)")
+    except Exception as e:
+        print(f" Error: {e}")
+
+    patterns = [
+        "/dev/shm/nccl-*",
+        "/dev/shm/torch_*",
+        "/dev/shm/py_shared_memory_*",
+        "/dev/shm/*multiprocessing*",
+        "/dev/shm/vader_segment*",  # Open MPI shared memory
+        "/dev/shm/sem.*",  # POSIX semaphores
+    ]
+
+    total_files = 0
+    total_bytes_freed = 0
+
+    for pattern in patterns:
+        files = glob.glob(pattern)
+        if files:
+            print(f"\nPattern: {pattern}")
+            for path in files:
+                try:
+                    file_size = os.path.getsize(path)
+                    os.remove(path)
+                    total_files += 1
+                    total_bytes_freed += file_size
+                    print(f" Removed: {path} ({file_size / (1024 * 1024):.2f} MB)")
+                except OSError as e:
+                    print(f" Failed to remove {path}: {e}")
+
+    # Show /dev/shm usage after cleanup
+    usage_after = get_shm_usage()
+    print(f"\nAfter cleanup - /dev/shm usage: {usage_after}")
+    print(f"Total files removed: {total_files}")
+    print(f"Total space freed: {total_bytes_freed / (1024 * 1024):.2f} MB")
+    print("=" * 60 + "\n")
+
+
 if "OMPI_COMM_WORLD_SIZE" in os.environ:
     set_environment_variables_pytest_multi_process()
 else:
     set_environment_variables_pytest_single_process()
 
+# Clean up stale NCCL shared memory BEFORE initializing process group
+cleanup_nccl_shared_memory()
+
 if not dist.is_initialized():
     dist.init_process_group(
         backend="nccl",
@@ -75,13 +204,21 @@ def forward(self, x):
 
 class TestNcclOpsConverter(DispatchTestCase):
     # 1. Skip if NCCL backend is not available (e.g., Windows, Jetson) - hard requirement
-    # 2. Don't skip if TRTLLM is unavailable (e.g., CUDA 13) - falls back to PyTorch
+    # 2. Skip if TRTLLM is unavailable (e.g., CUDA 13) - no converters registered
     @unittest.skipIf(
         not is_distributed_nccl_available(),
         "Skipped: NCCL backend is not available (Windows/Jetson Orin not supported).",
     )
+    @unittest.skipIf(
+        not ENABLED_FEATURES.trtllm_for_nccl,
+        "Skipped: TensorRT-LLM plugin for NCCL is not available (e.g., CUDA 13).",
+    )
     @classmethod
     def setUpClass(cls):
+        # Clean up stale NCCL shared memory from previous runs
+        # to see if this is needed
+        cleanup_nccl_shared_memory()
+
         cls.world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", 1))
         cls.rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", 0))
         cls.group = dist.new_group(ranks=list(range(cls.world_size)))
@@ -92,6 +229,9 @@ def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
 
+        # Clean up NCCL shared memory after tests complete
+        cleanup_nccl_shared_memory()
+
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
         inputs = [torch.randn(1, linear_layer_dim).to("cuda")]
