Commit d92f040

committed
Remove debug prints, increase the Docker shared-memory size, and clear NCCL shared memory in tearDownClass
1 parent 30dcc4c commit d92f040

File tree

  .github/scripts/filter-matrix.py
  .github/workflows/build-test-linux-x86_64.yml
  .github/workflows/build_linux.yml
  .github/workflows/linux-test.yml
  tests/py/dynamo/distributed/test_nccl_ops.py

5 files changed (+26, -92 lines)


.github/scripts/filter-matrix.py

Lines changed: 4 additions & 55 deletions
@@ -68,30 +68,16 @@ def create_distributed_config(item: Dict[str, Any]) -> Dict[str, Any]:
     - Adds num_gpus field
     - Adds config marker
     """
-    import sys
-
     # Create a copy to avoid modifying the original
     dist_item = item.copy()
 
-    # Debug: Show original config
-    print(f"[DEBUG] Creating distributed config from:", file=sys.stderr)
-    print(f"[DEBUG] Python: {item.get('python_version')}", file=sys.stderr)
-    print(f"[DEBUG] CUDA: {item.get('desired_cuda')}", file=sys.stderr)
-    print(
-        f"[DEBUG] Original runner: {item.get('validation_runner')}", file=sys.stderr
-    )
-
     # Override runner to use multi-GPU instance
     dist_item["validation_runner"] = "linux.g4dn.12xlarge.nvidia.gpu"
 
     # Add distributed-specific fields
     dist_item["num_gpus"] = 2
     dist_item["config"] = "distributed"
 
-    # Debug: Show modified config
-    print(f"[DEBUG] New runner: {dist_item['validation_runner']}", file=sys.stderr)
-    print(f"[DEBUG] GPUs: {dist_item['num_gpus']}", file=sys.stderr)
-
     return dist_item
 
 
@@ -134,58 +120,21 @@ def main(args: list[str]) -> None:
 
     includes = matrix_dict["include"]
     filtered_includes = []
-    distributed_includes = []  # NEW: separate list for distributed configs
-
-    print(f"[DEBUG] Processing {len(includes)} input configs", file=sys.stderr)
+    distributed_includes = []  # Separate list for distributed configs
 
     for item in includes:
-        py_ver = item.get("python_version", "unknown")
-        cuda_ver = item.get("desired_cuda", "unknown")
-
-        print(f"[DEBUG] Checking config: py={py_ver}, cuda={cuda_ver}", file=sys.stderr)
-
         if filter_matrix_item(
             item,
             options.jetpack == "true",
             options.limit_pr_builds == "true",
         ):
-            print(f"[DEBUG] passed filter - adding to build matrix", file=sys.stderr)
-            filtered_includes.append(item)
+            filtered_includes.append(item)
             distributed_includes.append(create_distributed_config(item))
-        else:
-            print(f"[DEBUG] FILTERED OUT", file=sys.stderr)
-
-    # Debug: Show summary
-    print(f"[DEBUG] Final counts:", file=sys.stderr)
-    print(f"[DEBUG] Regular configs: {len(filtered_includes)}", file=sys.stderr)
-    print(
-        f"[DEBUG] Distributed configs: {len(distributed_includes)}", file=sys.stderr
-    )
-
-    # Debug: Show which configs will be built
-    print(
-        f"[DEBUG] Configs that will be BUILT (in filtered_includes):", file=sys.stderr
-    )
-    for item in filtered_includes:
-        print(
-            f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}",
-            file=sys.stderr,
-        )
-
-    print(
-        f"[DEBUG] Configs for DISTRIBUTED TESTS (in distributed_includes):",
-        file=sys.stderr,
-    )
-    for item in distributed_includes:
-        print(
-            f"[DEBUG] - py={item.get('python_version')}, cuda={item.get('desired_cuda')}, gpus={item.get('num_gpus')}",
-            file=sys.stderr,
-        )
 
-    # NEW: Output both regular and distributed configs
+    # Output both regular and distributed configs
    filtered_matrix_dict = {
         "include": filtered_includes,
-        "distributed_include": distributed_includes,  # NEW field
+        "distributed_include": distributed_includes,
     }
 
     # Output to stdout (consumed by GitHub Actions)
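
Not part of the diff above: a minimal sketch of the JSON shape the script now emits, assuming an illustrative matrix entry (the Python/CUDA/runner values are made up). The include list drives the regular builds, and distributed_include presumably feeds the multi-GPU test jobs.

    import json

    # Illustrative input entry; real entries come from the upstream build matrix.
    item = {
        "python_version": "3.11",
        "desired_cuda": "cu128",
        "validation_runner": "linux.g5.4xlarge.nvidia.gpu",
    }

    # Mirrors what create_distributed_config() does to each kept entry.
    dist_item = item.copy()
    dist_item["validation_runner"] = "linux.g4dn.12xlarge.nvidia.gpu"
    dist_item["num_gpus"] = 2
    dist_item["config"] = "distributed"

    print(json.dumps({"include": [item], "distributed_include": [dist_item]}, indent=2))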

.github/workflows/build-test-linux-x86_64.yml

Lines changed: 0 additions & 24 deletions
@@ -563,21 +563,6 @@ jobs:
       script: |
         set -euo pipefail
 
-        # Debug: Show what config we're using
-        echo "=========================================="
-        echo "DISTRIBUTED TEST CONFIGURATION"
-        echo "=========================================="
-        echo "Python version: ${PYTHON_VERSION}"
-        echo "CUDA version: ${CU_VERSION}"
-        echo "Num GPUs: ${NUM_GPUS}"
-        echo "=========================================="
-
-        # Verify GPUs are available
-        echo "Checking GPU availability:"
-        nvidia-smi
-        echo "GPU count: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
-        echo "=========================================="
-
         export USE_HOST_DEPS=1
         export CI_BUILD=1
         export USE_TRTLLM_PLUGINS=1
@@ -590,18 +575,10 @@ jobs:
         export PATH="/usr/lib64/openmpi/bin:$PATH"
         export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib:$LD_LIBRARY_PATH"
 
-        # Verify mpirun is accessible
-        which mpirun
-        mpirun --version
-
         # Run distributed tests
         pushd .
         cd tests/py/dynamo
 
-        echo "Running distributed tests with mpirun..."
-        echo "[CONFIG] Number of GPUs to use: ${NUM_GPUS}"
-        echo "[AVAILABLE] GPUs detected by nvidia-smi: $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)"
-
         # Set master port for distributed communication (must be same across all ranks)
         export MASTER_ADDR=127.0.0.1
         export MASTER_PORT=29500
@@ -611,7 +588,6 @@ jobs:
         RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
         mpirun --allow-run-as-root -n ${NUM_GPUS} \
           bash -c '
-            echo "[MPI DEBUG] Rank: ${OMPI_COMM_WORLD_RANK:-0}, World Size: ${OMPI_COMM_WORLD_SIZE:-1}"
             if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
               python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
             else
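
For context, the rank handling above relies on the OMPI_COMM_WORLD_* variables that mpirun sets per process. A rough sketch of the general pattern, not the repository's distributed_utils helpers, showing how a test process can map those variables onto what torch.distributed expects (requires a CUDA build of PyTorch with NCCL available):

    import os

    import torch.distributed as dist

    # mpirun exports these per process; fall back to single-process defaults.
    rank = int(os.environ.get("OMPI_COMM_WORLD_RANK", "0"))
    world_size = int(os.environ.get("OMPI_COMM_WORLD_SIZE", "1"))

    # Must match the values exported in the workflow step above.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
    # ... run the collective ops under test ...
    dist.destroy_process_group()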

.github/workflows/build_linux.yml

Lines changed: 0 additions & 11 deletions
@@ -160,17 +160,6 @@ jobs:
       options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
     timeout-minutes: ${{ inputs.timeout }}
     steps:
-      - name: Debug matrix configuration
-        shell: bash
-        run: |
-          echo "=========================================="
-          echo "BUILD MATRIX DEBUG"
-          echo "=========================================="
-          echo "Python version: ${{ matrix.python_version }}"
-          echo "CUDA version: ${{ matrix.desired_cuda }}"
-          echo "GPU arch type: ${{ matrix.gpu_arch_type }}"
-          echo "Runner: ${{ matrix.validation_runner }}"
-          echo "=========================================="
       - name: Clean workspace
         shell: bash -l {0}
         run: |

.github/workflows/linux-test.yml

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ jobs:
     runs-on: ${{ matrix.validation_runner }}
     container:
       image: ${{ matrix.container_image }}
-      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
+      options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all --shm-size=1g' || ' ' }}
     # If a build is taking longer than 120 minutes on these runners we need
     # to have a conversation
     timeout-minutes: 120
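
Docker's default /dev/shm is 64 MB, which is what NCCL was exhausting; --shm-size=1g raises it to 1 GB. A standalone check, not part of this commit, to confirm the larger segment is visible inside the container:

    import shutil

    # Report the size of the container's shared-memory mount.
    total, used, free = shutil.disk_usage("/dev/shm")
    print(f"/dev/shm: {total / 2**20:.0f} MB total, {free / 2**20:.0f} MB free")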

tests/py/dynamo/distributed/test_nccl_ops.py

Lines changed: 21 additions & 1 deletion
@@ -1,11 +1,25 @@
+import glob
 import os
+import shutil
 import unittest
 
+# Check /dev/shm space BEFORE importing torch/NCCL
+# If insufficient, disable shared memory transport (use sockets instead)
+try:
+    _, _, free = shutil.disk_usage("/dev/shm")
+    free_mb = free / (1024 * 1024)
+    if free_mb < 40:  # NCCL needs ~33 MB per process from the CI error message
+        os.environ["NCCL_SHM_DISABLE"] = "1"
+        print(
+            f"[NCCL] /dev/shm has only {free_mb:.1f} MB free. Disabling shared memory transport."
+        )
+except Exception:
+    pass  # If check fails, let NCCL use default behavior
+
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 from conversion.harness import DispatchTestCase
-
 from distributed_utils import (
     set_environment_variables_pytest_multi_process,
     set_environment_variables_pytest_single_process,
@@ -72,6 +86,12 @@ def setUpClass(cls):
     def tearDownClass(cls):
         if dist.is_initialized():
             dist.destroy_process_group()
+        # Clean up NCCL shared memory after tests complete
+        for f in glob.glob("/dev/shm/nccl-*"):
+            try:
+                os.remove(f)
+            except OSError:
+                pass
 
     @parameterized.expand([8])
     def test_nccl_ops_gather(self, linear_layer_dim):
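
The same cleanup could be factored into a small reusable helper for other distributed test classes; a sketch under that assumption (the function name is made up, not part of this commit):

    import glob
    import os


    def clean_nccl_shm(pattern: str = "/dev/shm/nccl-*") -> int:
        """Remove leftover NCCL shared-memory segments; return how many were deleted."""
        removed = 0
        for path in glob.glob(pattern):
            try:
                os.remove(path)
                removed += 1
            except OSError:
                pass  # another rank may still hold it, or it is already gone
        return removed


    if __name__ == "__main__":
        print(f"Removed {clean_nccl_shm()} stale NCCL shm segment(s)")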
