Commit d92f040

committed
Remove debug prints, increase the Docker shared-memory size, and clear NCCL shared memory in tearDownClass
1 parent 30dcc4c commit d92f040

File tree

  .github/scripts/filter-matrix.py
  .github/workflows/build-test-linux-x86_64.yml
  .github/workflows/build_linux.yml
  .github/workflows/linux-test.yml
  tests/py/dynamo/distributed/test_nccl_ops.py

5 files changed (+26, -92 lines)


.github/scripts/filter-matrix.py

Lines changed: 4 additions & 55 deletions
@@ -68,30 +68,16 @@ def create_distributed_config(item: Dict[str, Any]) -> Dict[str, Any]:
     - Adds num_gpus field
     - Adds config marker
     """
-    import sys
-
     # Create a copy to avoid modifying the original
     dist_item = item.copy()
 
-    # Debug: Show original config
-    print(f"[DEBUG] Creating distributed config from:", file=sys.stderr)
-    print(f"[DEBUG] Python: {item.get('python_version')}", file=sys.stderr)
-    print(f"[DEBUG] CUDA: {item.get('desired_cuda')}", file=sys.stderr)
-    print(
-        f"[DEBUG] Original runner: {item.get('validation_runner')}", file=sys.stderr
-    )
-
     # Override runner to use multi-GPU instance
     dist_item["validation_runner"] = "linux.g4dn.12xlarge.nvidia.gpu"
 
     # Add distributed-specific fields
     dist_item["num_gpus"] = 2
     dist_item["config"] = "distributed"
 
-    # Debug: Show modified config
-    print(f"[DEBUG] New runner: {dist_item['validation_runner']}", file=sys.stderr)
-    print(f"[DEBUG] GPUs: {dist_item['num_gpus']}", file=sys.stderr)
-
     return dist_item
 
 
@@ -134,58 +120,21 @@ def main(args: list[str]) -> None:
 
     includes = matrix_dict["include"]
     filtered_includes = []
-    distributed_includes = []  # NEW: separate list for distributed configs
-
-    print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
+    distributed_includes = []  # Separate list for distributed configs
 
     for item in includes:
-        py_ver = item.get("python_version", "unknown")
-        cuda_ver = item.get("desired_cuda", "unknown")
-
-        print(f"[DEBUG] Checking config: py={py_ver}, cuda={cuda_ver}", file=sys.stderr)
-
         if filter_matrix_item(
             item,
             options.jetpack == "true",
             options.limit_pr_builds == "true",
         ):
-            print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
-            filtered_includes.append(item)
+            filtered_includes.append(item)
             distributed_includes.append(create_distributed_config(item))
-        else:
-            print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
-
-    # Debug: Show summary
-    print(f"[DEBUG] Final counts:", file=sys.stderr)
-    print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
-    print(
-        f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr
-    )
-
-    # Debug: Show which configs will be built
-    print(
-        f"[DEBUG] Configs that will be BUILT (in filtered_includes):", file=sys.stderr
-    )
-    for item in filtered_includes:
-        print(
-            f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}",
-            file=sys.stderr,
-        )
-
-    print(
-        f"[DEBUG] Configs for DISTRIBUTED TESTS (in distributed_includes):",
-        file=sys.stderr,
-    )
-    for item in distributed_includes:
-        print(
-            f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}, gpus={item.get('num_gpus')}",
-            file=sys.stderr,
-        )
 
-    # NEW: Output both regular and distributed configs
+    # Output both regular and distributed configs
    filtered_matrix_dict = {
         "include": filtered_includes,
-        "distributed_include": distributed_includes,  # NEW field
+        "distributed_include": distributed_includes,
     }
 
     # Output to stdout (consumed by GitHub Actions)
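
Not part of the diff above: a minimal sketch of the JSON shape the script now emits, assuming an illustrative matrix entry (the Python/CUDA/runner values are made up). The include list drives the regular builds, and distributed_include presumably feeds the multi-GPU test jobs.

    import json

    # Illustrative input entry; real entries come from the upstream build matrix.
    item = {
        "python_version": "3.11",
        "desired_cuda": "cu128",
        "validation_runner": "linux.g5.4xlarge.nvidia.gpu",
    }

    # Mirrors what create_distributed_config() does to each kept entry.
    dist_item = item.copy()
    dist_item["validation_runner"] = "linux.g4dn.12xlarge.nvidia.gpu"
    dist_item["num_gpus"] = 2
    dist_item["config"] = "distributed"

    print(json.dumps({"include": [item], "distributed_include": [dist_item]}, indent=2))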

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 0 additions & 24 deletions
@@ -563,21 +563,6 @@ jobs:
       script: |
         set -euo pipefail
 
-        # Debug: Show what config we're using
-        echo "=========================================="
-        echo "DISTRIBUTED TEST CONFIGURATION"
-        echo "=========================================="
-        echo "Python version: ${PYTHON_VERSION}"
-        echo "CUDA version: ${CU_VERSION}"
-        echo "Num GPUs: ${NUM_GPUS}"
-        echo "=========================================="
-
-        # Verify GPUs are available
-        echo "Checking GPU availability:"
-        nvidia-smi
-        echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
-        echo "=========================================="
-
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         export USE_TRTLLM_PLUGINS=1
@@ -590,18 +575,10 @@ jobs:
         export PATH="/usr/lib64/openmpi/bin:$PATH"
         export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH"
 
-        # Verify mpirun is accessible
-        which mpirun
-        mpirun --version
-
         # Run distributed tests
         pushd .
         cd tests/py/dynamo
 
-        echo "Running distributed tests with mpirun..."
-        echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
-        echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
-
         # Set master port for distributed communication (must be same across all ranks)
         export MASTER_ADDR=127.0.0.1
         export MASTER_PORT=29500
@@ -611,7 +588,6 @@ jobs:
         RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
         mpirun --allow-run-as-root -n ${NUM_GPUS} \
           bash -c '
-            echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
             if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
               python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
             else
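
For context, the rank handling above relies on the OMPI_COMM_WORLD_* variables that mpirun sets per process. A rough sketch of the general pattern, not the repository's distributed_utils helpers, showing how a test process can map those variables onto what torch.distributed expects (requires a CUDA build of PyTorch with NCCL available):

    import os

    import torch.distributed as dist

    # mpirun exports these per process; fall back to single-process defaults.
    rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
    world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", "1"))

    # Must match the values exported in the workflow step above.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    # ... run the collective ops under test ...
    dist.destroy_process_group()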

.github/workflows/build_linux.yml

Lines changed: 0 additions & 11 deletions
@@ -160,17 +160,6 @@ jobs:
       options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
     timeout-minutes: ${{ inputs.timeout }}
     steps:
-      - name: Debug matrix configuration
-        shell: bash
-        run: |
-          echo "=========================================="
-          echo "BUILD MATRIX DEBUG"
-          echo "=========================================="
-          echo "Python version: ${{ matrix.python_version }}"
-          echo "CUDA version: ${{ matrix.desired_cuda }}"
-          echo "GPU arch type: ${{ matrix.gpu_arch_type }}"
-          echo "Runner: ${{ matrix.validation_runner }}"
-          echo "=========================================="
       - name: Clean workspace
         shell: bash -l {0}
         run: |

.github/workflows/linux-test.yml

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ jobs:
     runs-on: ${{ matrix.validation_runner }}
     container:
       image: ${{ matrix.container_image }}
-      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
+      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all --shm-size=1g' || ' ' }}
     # If a build is taking longer than 120 minutes on these runners we need
     # to have a conversation
     timeout-minutes: 120
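
Docker's default /dev/shm is 64 MB, which is what NCCL was exhausting; --shm-size=1g raises it to 1 GB. A standalone check, not part of this commit, to confirm the larger segment is visible inside the container:

    import shutil

    # Report the size of the container's shared-memory mount.
    total, used, free = shutil.disk_usage("/dev/shm")
    print(f"/dev/shm: {total / 2**20:.0f} MB total, {free / 2**20:.0f} MB free")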

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 21 additions & 1 deletion
@@ -1,11 +1,25 @@
+import glob
 import os
+import shutil
 import unittest
 
+# Check /dev/shm space BEFORE importing torch/NCCL
+# If insufficient, disable shared memory transport (use sockets instead)
+try:
+    _, _, free = shutil.disk_usage("/dev/shm")
+    free_mb = free / (1024 * 1024)
+    if free_mb < 40:  # NCCL needs ~33 MB per process from the CI error message
+        os.environ["NCCL_SHM_DISABLE"] = "1"
+        print(
+            f"[NCCL] /dev/shm has only {free_mb:.1f} MB free. Disabling shared memory transport."
+        )
+except Exception:
+    pass  # If check fails, let NCCL use default behavior
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from conversion.harness import DispatchTestCase
-
 from distributed_utils import (
     set_environment_variables_pytest_multi_process,
     set_environment_variables_pytest_single_process,
@@ -72,6 +86,12 @@ def setUpClass(cls):
     def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
+        # Clean up NCCL shared memory after tests complete
+        for f in glob.glob("/dev/shm/nccl-*"):
+            try:
+                os.remove(f)
+            except OSError:
+                pass
 
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
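
The same cleanup could be factored into a small reusable helper for other distributed test classes; a sketch under that assumption (the function name is made up, not part of this commit):

    import glob
    import os


    def clean_nccl_shm(pattern: str = "/dev/shm/nccl-*") -> int:
        """Remove leftover NCCL shared-memory segments; return how many were deleted."""
        removed = 0
        for path in glob.glob(pattern):
            try:
                os.remove(path)
                removed += 1
            except OSError:
                pass  # another rank may still hold it, or it is already gone
        return removed


    if __name__ == "__main__":
        print(f"Removed {clean_nccl_shm()} stale NCCL shm segment(s)")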
