Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion .github/scripts/filter-matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,27 @@ def filter_matrix_item(
return True


def create_distributed_config(
    item: Dict[str, Any],
    *,
    runner: str = "linux.g4dn.12xlarge.nvidia.gpu",
    num_gpus: int = 2,
) -> Dict[str, Any]:
    """Create a distributed test configuration from a regular config.

    The returned entry is an independent copy of ``item`` with three
    top-level keys set for distributed testing:

    - ``validation_runner`` is overridden with ``runner``
    - ``num_gpus`` is set to ``num_gpus``
    - ``config`` is set to the marker string ``"distributed"``

    Args:
        item: A single matrix ``include`` entry. It is never modified.
        runner: Runner label to use for the distributed job. Defaults to
            the multi-GPU instance label used by the original code.
        num_gpus: Value stored under ``num_gpus``. Defaults to 2.

    Returns:
        A new dict containing every key of ``item`` plus the overrides
        listed above.
    """
    # Local import keeps this helper self-contained; the module's import
    # block does not need to change.
    import copy

    # Deep copy: matrix entries can hold nested mappings, and a shallow copy
    # would leave those shared between the regular and distributed entries.
    dist_item = copy.deepcopy(item)

    # Override runner to use the multi-GPU instance
    dist_item["validation_runner"] = runner

    # Add distributed-specific fields
    dist_item["num_gpus"] = num_gpus
    dist_item["config"] = "distributed"

    return dist_item


def main(args: list[str]) -> None:
parser = argparse.ArgumentParser()
parser.add_argument(
Expand Down Expand Up @@ -99,6 +120,7 @@ def main(args: list[str]) -> None:

includes = matrix_dict["include"]
filtered_includes = []
distributed_includes = [] # Separate list for distributed configs

for item in includes:
if filter_matrix_item(
Expand All @@ -107,8 +129,15 @@ def main(args: list[str]) -> None:
options.limit_pr_builds == "true",
):
filtered_includes.append(item)
distributed_includes.append(create_distributed_config(item))

# Output both regular and distributed configs
filtered_matrix_dict = {
"include": filtered_includes,
"distributed_include": distributed_includes,
}

filtered_matrix_dict = {"include": filtered_includes}
# Output to stdout (consumed by GitHub Actions)
print(json.dumps(filtered_matrix_dict))


Expand Down
123 changes: 104 additions & 19 deletions .github/workflows/build-test-linux-x86_64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
env-var-script: ${{ matrix.env-var-script }}
post-script: ${{ matrix.post-script }}
Expand Down Expand Up @@ -99,7 +103,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -128,7 +136,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -160,7 +172,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -189,7 +205,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -222,7 +242,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -251,7 +275,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -280,7 +308,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -311,7 +343,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -344,7 +380,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -374,7 +414,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -404,7 +448,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -433,7 +481,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -465,7 +517,11 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail
Expand Down Expand Up @@ -498,18 +554,47 @@ jobs:
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.filter-matrix.outputs.matrix }}
# Extract the distributed_include array from filter-matrix output
build-matrix: |
{
"include": ${{ toJSON(fromJSON(needs.filter-matrix.outputs.matrix).distributed_include) }}
}
pre-script: ${{ matrix.pre-script }}
script: |
set -euo pipefail

export USE_HOST_DEPS=1
export CI_BUILD=1
export USE_TRTLLM_PLUGINS=1
dnf install -y mpich mpich-devel openmpi openmpi-devel

# Install MPI (required for TensorRT-LLM plugins)
echo "Installing MPI..."
dnf install -y openmpi openmpi-devel

# Add OpenMPI to PATH (RHEL/AlmaLinux specific location)
export PATH="/usr/lib64/openmpi/bin:$PATH"
# This script runs under `set -u`, so a bare $LD_LIBRARY_PATH would abort the
# step when the variable is unset. The ${VAR:+...} form expands to nothing in
# that case, which also avoids a trailing ":" (an empty search-path entry).
export LD_LIBRARY_PATH="/usr/lib64/openmpi/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Run distributed tests
pushd .
cd tests/py
cd dynamo
python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml distributed/test_nccl_ops.py
cd tests/py/dynamo

# Set master port for distributed communication (must be same across all ranks)
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=29500

# Wrap pytest in an inline `bash -c` command so that only rank 0 writes the JUnit XML.
# Every rank runs pytest, but only rank 0 saves results, which avoids file conflicts.
RANK_0_XML="${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_distributed_test_results.xml"
mpirun --allow-run-as-root -n ${NUM_GPUS} \
bash -c '
if [ "${OMPI_COMM_WORLD_RANK:-0}" -eq 0 ]; then
python -m pytest -ra --junitxml='"${RANK_0_XML}"' distributed/test_nccl_ops.py
else
python -m pytest -ra distributed/test_nccl_ops.py
fi
'

popd

concurrency:
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/linux-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,13 @@ jobs:
RUNNER_TEST_RESULTS_DIR: /tmp/test_results
ARCH: ${{ inputs.architecture }}
USE_TRT_RTX: ${{ inputs.use-rtx }}
NUM_GPUS: ${{ matrix.num_gpus || '' }}
DOWNLOAD_ARTIFACT_NAME: pytorch_tensorrt_${{ matrix.tensorrt.version }}_${{ matrix.python_version }}_${{ matrix.desired_cuda }}_${{ inputs.architecture }}
name: ${{ inputs.job-name }}-${{ matrix.tensorrt.version }}-${{ matrix.python_version }}-${{ matrix.desired_cuda }}
runs-on: ${{ matrix.validation_runner }}
container:
image: ${{ matrix.container_image }}
options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all' || ' ' }}
options: ${{ matrix.gpu_arch_type == 'cuda' && '--gpus all --shm-size=1g' || ' ' }}
# If a build is taking longer than 120 minutes on these runners we need
# to have a conversation
timeout-minutes: 120
Expand Down
Loading
Loading