diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index ede72019..47a48bd9 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -17,9 +17,11 @@ CWP CXI Ceph Containerfile +Containerfiles DNS Dockerfiles Dufourspitze +EFA EMPA ETHZ Ehrenfest @@ -76,6 +78,8 @@ MeteoSwiss NAMD NICs NVMe +NVSHMEM +NVLINK Nordend OpenFabrics OAuth @@ -102,6 +106,7 @@ ROCm RPA Roboto Roothaan +SHMEM SSHService STMV Scopi diff --git a/docs/software/communication/cray-mpich.md b/docs/software/communication/cray-mpich.md index 8bac8559..ec753c56 100644 --- a/docs/software/communication/cray-mpich.md +++ b/docs/software/communication/cray-mpich.md @@ -28,7 +28,7 @@ This means that Cray MPICH will automatically be linked to the GTL library, whic $ ldd myexecutable | grep gtl libmpi_gtl_cuda.so => /user-environment/linux-sles15-neoverse_v2/gcc-13.2.0/cray-gtl-8.1.30-fptqzc5u6t4nals5mivl75nws2fb5vcq/lib/libmpi_gtl_cuda.so (0x0000ffff82aa0000) ``` - + The path may be different, but the `libmpi_gtl_cuda.so` library should be printed when using CUDA. In ROCm environments the `libmpi_gtl_hsa.so` library should be linked. If the GTL library is not linked, nothing will be printed. @@ -40,7 +40,7 @@ See [this page][ref-slurm-gh200] for more information on configuring Slurm to us !!! warning "Segmentation faults when trying to communicate GPU buffers without `MPICH_GPU_SUPPORT_ENABLED=1`" If you attempt to communicate GPU buffers through MPI without setting `MPICH_GPU_SUPPORT_ENABLED=1`, it will lead to segmentation faults, usually without any specific indication that it is the communication that fails. Make sure that the option is set if you are communicating GPU buffers through MPI. - + !!! warning "Error: "`GPU_SUPPORT_ENABLED` is requested, but GTL library is not linked"" If `MPICH_GPU_SUPPORT_ENABLED` is set to `1` and your application does not link against one of the GTL libraries you will get an error similar to the following during MPI initialization: ```bash diff --git a/docs/software/communication/dockerfiles/base b/docs/software/communication/dockerfiles/base new file mode 100644 index 00000000..030fb891 --- /dev/null +++ b/docs/software/communication/dockerfiles/base @@ -0,0 +1,36 @@ +ARG ubuntu_version=24.04 +ARG cuda_version=12.8.1 +FROM docker.io/nvidia/cuda:${cuda_version}-cudnn-devel-ubuntu${ubuntu_version} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + build-essential \ + ca-certificates \ + pkg-config \ + automake \ + autoconf \ + libtool \ + cmake \ + gdb \ + strace \ + wget \ + git \ + bzip2 \ + python3 \ + gfortran \ + rdma-core \ + numactl \ + libconfig-dev \ + libuv1-dev \ + libfuse-dev \ + libfuse3-dev \ + libyaml-dev \ + libnl-3-dev \ + libnuma-dev \ + libsensors-dev \ + libcurl4-openssl-dev \ + libjson-c-dev \ + libibverbs-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* diff --git a/docs/software/communication/dockerfiles/libfabric b/docs/software/communication/dockerfiles/libfabric new file mode 100644 index 00000000..0fe4fad4 --- /dev/null +++ b/docs/software/communication/dockerfiles/libfabric @@ -0,0 +1,20 @@ +ARG gdrcopy_version=2.5.1 +RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=/usr/local/cuda \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + +# Install libfabric +ARG libfabric_version=1.22.0 +RUN git clone --branch 
v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen \ + --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf libfabric diff --git a/docs/software/communication/dockerfiles/nccl-tests b/docs/software/communication/dockerfiles/nccl-tests new file mode 100644 index 00000000..43447a5f --- /dev/null +++ b/docs/software/communication/dockerfiles/nccl-tests @@ -0,0 +1,7 @@ +ARG nccl_tests_version=2.17.1 +RUN wget -O nccl-tests-${nccl_tests_version}.tar.gz https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${nccl_tests_version}.tar.gz \ + && tar xf nccl-tests-${nccl_tests_version}.tar.gz \ + && cd nccl-tests-${nccl_tests_version} \ + && MPI=1 make -j$(nproc) \ + && cd .. \ + && rm -rf nccl-tests-${nccl_tests_version}.tar.gz diff --git a/docs/software/communication/dockerfiles/nvshmem b/docs/software/communication/dockerfiles/nvshmem new file mode 100644 index 00000000..d3b03568 --- /dev/null +++ b/docs/software/communication/dockerfiles/nvshmem @@ -0,0 +1,54 @@ +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + python3-venv \ + python3-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* \ + && rm /usr/lib/python3.12/EXTERNALLY-MANAGED + +# Build NVSHMEM from source +ARG nvshmem_version=3.4.5 +RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/source/nvshmem_src_cuda12-all-all-${nvshmem_version}.tar.gz \ + && tar -xvf nvshmem_src_cuda12-all-all-${nvshmem_version}.tar.gz \ + && cd nvshmem_src \ + && NVSHMEM_BUILD_EXAMPLES=0 \ + NVSHMEM_BUILD_TESTS=1 \ + NVSHMEM_DEBUG=0 \ + NVSHMEM_DEVEL=0 \ + NVSHMEM_DEFAULT_PMI2=0 \ + NVSHMEM_DEFAULT_PMIX=1 \ + NVSHMEM_DISABLE_COLL_POLL=1 \ + NVSHMEM_ENABLE_ALL_DEVICE_INLINING=0 \ + NVSHMEM_GPU_COLL_USE_LDST=0 \ + NVSHMEM_LIBFABRIC_SUPPORT=1 \ + NVSHMEM_MPI_SUPPORT=1 \ + NVSHMEM_MPI_IS_OMPI=1 \ + NVSHMEM_NVTX=1 \ + NVSHMEM_PMIX_SUPPORT=1 \ + NVSHMEM_SHMEM_SUPPORT=1 \ + NVSHMEM_TEST_STATIC_LIB=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_TRACE=0 \ + NVSHMEM_USE_DLMALLOC=0 \ + NVSHMEM_USE_NCCL=1 \ + NVSHMEM_USE_GDRCOPY=1 \ + NVSHMEM_VERBOSE=0 \ + NVSHMEM_DEFAULT_UCX=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=0 \ + NVSHMEM_IBDEVX_SUPPORT=0 \ + NVSHMEM_IBRC_SUPPORT=0 \ + LIBFABRIC_HOME=/usr \ + NCCL_HOME=/usr \ + GDRCOPY_HOME=/usr/local \ + MPI_HOME=/usr \ + SHMEM_HOME=/usr \ + NVSHMEM_HOME=/usr \ + cmake . \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -r nvshmem_src nvshmem_src_cuda12-all-all-${nvshmem_version}.tar.gz diff --git a/docs/software/communication/dockerfiles/openmpi b/docs/software/communication/dockerfiles/openmpi new file mode 100644 index 00000000..534ba5df --- /dev/null +++ b/docs/software/communication/dockerfiles/openmpi @@ -0,0 +1,12 @@ +ARG OMPI_VER=5.0.8 +RUN wget -q https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OMPI_VER}.tar.gz \ + && tar xf openmpi-${OMPI_VER}.tar.gz \ + && cd openmpi-${OMPI_VER} \ + && ./configure --prefix=/usr --with-ofi=/usr --with-ucx=/usr \ + --enable-oshmem --with-cuda=/usr/local/cuda \ + --with-cuda-libdir=/usr/local/cuda/lib64/stubs \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. 
\ + && rm -rf openmpi-${OMPI_VER}.tar.gz openmpi-${OMPI_VER} diff --git a/docs/software/communication/dockerfiles/osu b/docs/software/communication/dockerfiles/osu new file mode 100644 index 00000000..dadf20d3 --- /dev/null +++ b/docs/software/communication/dockerfiles/osu @@ -0,0 +1,16 @@ +ARG omb_version=7.5.1 +RUN wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${omb_version}.tar.gz \ + && tar xf osu-micro-benchmarks-${omb_version}.tar.gz \ + && cd osu-micro-benchmarks-${omb_version} \ + && ldconfig /usr/local/cuda/targets/sbsa-linux/lib/stubs \ + && ./configure --prefix=/usr/local CC=$(which mpicc) CFLAGS="-O3 -lcuda -lnvidia-ml" \ + --enable-cuda --with-cuda-include=/usr/local/cuda/include \ + --with-cuda-libpath=/usr/local/cuda/lib64 \ + CXXFLAGS="-lmpi -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf osu-micro-benchmarks-${omb_version} osu-micro-benchmarks-${omb_version}.tar.gz + +WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi diff --git a/docs/software/communication/dockerfiles/ucx b/docs/software/communication/dockerfiles/ucx new file mode 100644 index 00000000..9ef632ab --- /dev/null +++ b/docs/software/communication/dockerfiles/ucx @@ -0,0 +1,13 @@ +# Install UCX +ARG UCX_VERSION=1.19.0 +RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar xzf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && mkdir build \ + && cd build \ + && ../configure --prefix=/usr --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local \ + --enable-mt --enable-devel-headers \ + && make -j$(nproc) \ + && make install \ + && cd ../.. \ + && rm -rf ucx-${UCX_VERSION}.tar.gz ucx-${UCX_VERSION} diff --git a/docs/software/communication/index.md b/docs/software/communication/index.md index 5d961d77..51ec9cd7 100644 --- a/docs/software/communication/index.md +++ b/docs/software/communication/index.md @@ -1,20 +1,67 @@ [](){#ref-software-communication} # Communication Libraries -CSCS provides common communication libraries optimized for the [Slingshot 11 network on Alps][ref-alps-hsn]. +Communication libraries, like MPI and NCCL, are one of the building blocks for high performance scientific and ML workloads. +Broadly speaking, there are two levels of communication: + +* **Intra-node** communication between two processes on the same node. +* **Inter-node** communication between different nodes, over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps. + +To get the best inter-node performance on Alps, they need to be configured to use the [libfabric][ref-communication-libfabric] library that has an optimised back end for the Slingshot 11 network on Alps. + +As such, communication libraries are part of the "base layer" of libraries and tools used by all workloads to fully utilize the hardware on Alps. +They comprise the *network* layer in the following stack: + +* **CPU**: compilers with support for building applications optimized for the CPU architecture on the node. +* **GPU**: CUDA and ROCM provide compilers and runtime libraries for NVIDIA and AMD GPUs respectively. +* **Network**: libfabric, MPI, NCCL, NVSHMEM, need to be configured for the Slingshot network. + +CSCS provides communication libraries optimised for libfabric and Slingshot in uenv, and guidance on how to create container images that use them. +This section of the documentation provides advice on how to build and install software to use these libraries, and how to deploy them. 
For most scientific applications relying on MPI, [Cray MPICH][ref-communication-cray-mpich] is recommended.
[MPICH][ref-communication-mpich] and [OpenMPI][ref-communication-openmpi] may also be used, with limitations.
Cray MPICH, MPICH, and OpenMPI make use of [libfabric][ref-communication-libfabric] to interact with the underlying network.
-Most machine learning applications rely on [NCCL][ref-communication-nccl] or [RCCL][ref-communication-rccl] for high-performance implementations of collectives.
-NCCL and RCCL have to be configured with a plugin using [libfabric][ref-communication-libfabric] to make full use of the Slingshot network.
+Most machine learning applications rely on [NCCL][ref-communication-nccl] for high-performance implementations of collectives.
+NCCL has to be configured with a plugin using [libfabric][ref-communication-libfabric] to make full use of the Slingshot network.
See the individual pages for each library for information on how to use and best configure the libraries.
-* [Cray MPICH][ref-communication-cray-mpich]
-* [MPICH][ref-communication-mpich]
-* [OpenMPI][ref-communication-openmpi]
-* [NCCL][ref-communication-nccl]
-* [RCCL][ref-communication-rccl]
-* [libfabric][ref-communication-libfabric]
+<div class="grid cards" markdown>
+
+- __Low Level__
+
+    Learn about the low-level networking library libfabric, and how to use it in uenv and containers.
+
+    [:octicons-arrow-right-24: libfabric][ref-communication-libfabric]
+
+</div>
+
+
+- __MPI__
+
+    Cray MPICH is the most optimized and best-tested MPI implementation on Alps, and is the implementation provided in uenv.
+
+    [:octicons-arrow-right-24: Cray MPICH][ref-communication-cray-mpich]
+
+    For compatibility in containers:
+
+    [:octicons-arrow-right-24: MPICH][ref-communication-mpich]
+
+    OpenMPI can also be built in containers or in uenv:
+
+    [:octicons-arrow-right-24: OpenMPI][ref-communication-openmpi]
+
+</div>
+
+ +- __Machine Learning__ + + Communication libraries used by ML tools like Torch, and some simulation codes. + + [:octicons-arrow-right-24: NCCL][ref-communication-nccl] + + [:octicons-arrow-right-24: NVSHMEM][ref-communication-nvshmem] + +
diff --git a/docs/software/communication/libfabric.md b/docs/software/communication/libfabric.md index a8dd80d8..c6403a3a 100644 --- a/docs/software/communication/libfabric.md +++ b/docs/software/communication/libfabric.md @@ -1,24 +1,77 @@ [](){#ref-communication-libfabric} # Libfabric -[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low level networking library that abstracts away various networking backends. -It is used by Cray MPICH, and can be used together with OpenMPI, NCCL, and RCCL to make use of the [Slingshot network on Alps][ref-alps-hsn]. +[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low-level networking library that provides an abstract interface for networks. +Libfabric has backends for different network types, and is the interface chosen by HPE for the [Slingshot network on Alps][ref-alps-hsn], and by AWS for their [EFA network interface](https://aws.amazon.com/hpc/efa/). +To fully take advantage of the network on Alps: + +* libfabric and its dependencies must be available in your environment (uenv or container); +* and, communication libraries in your environment like Cray MPICH, OpenMPI, NCCL, and NVSHMEM have to be built or configured to use libfabric. + +!!! question "What about UCX?" + [Unified Communication X (UCX)](https://openucx.org/) is a low level library that targets the same layer as libfabric. + Specifically, it provides an open, standards-based, networking API. + By targeting UCX and libfabric, MPI and NCCL do not need to implement low-level support for each network hardware. + + **There is no UCX back end for the Slingshot network on Alps**, and pre-built software (for example conda packages and containers) often provides versions of MPI built for UCX only. + Running these images and packages on Alps will lead to very poor network performance or errors. + +[](){#ref-communication-libfabric-using} ## Using libfabric +[](){#ref-communication-libfabric-uenv} +### uenv + If you are using a uenv provided by CSCS, such as [prgenv-gnu][ref-uenv-prgenv-gnu], [Cray MPICH][ref-communication-cray-mpich] is linked to libfabric and the high speed network will be used. No changes are required in applications. -If you are using containers, the system libfabric can be loaded into your container using the [CXI hook provided by the container engine][ref-ce-cxi-hook]. -Using the hook is essential to make full use of the Alps network. +[](){#ref-communication-libfabric-ce} +### Containers + +The approach is to install libfabric inside the container, along with MPI and NCCL implementations linked against it. +At runtime, the [container engine][ref-container-engine] [CXI hook][ref-ce-cxi-hook] will replace the libfabric libraries inside the container with the corresponding libraries on the host system. +This will ensure access to the Slingshot interconnect. + + +!!! note "Use NVIDIA containers for the gh200 nodes" + Container images provided by NVIDIA, which come with CUDA, NCCL and other commonly used libraries are recommended as the base layer for building a container environment on the [gh200][ref-alps-gh200-node] and [a100][ref-alps-a100-node] nodes. + + The version of CUDA, NCCL and compilers in the container can be used once libfabric has been installed. + Other communication libraries, like MPI and NVSHMEM, provided in the containers can't be used directly. + Instead, they have to be installed in the container and linked against libfabric. +!!! 
example "Installing libfabric in a container for NVIDIA nodes" + The following lines demonstrate how to configure and install libfabric in a Containerfile. + Communication frameworks are built with explicit support for CUDA and GDRCopy. + + Some additional features are enabled to increase the portability of the container to non-Alps systems: + + - The libfabric [EFA](https://aws.amazon.com/hpc/efa/) provider is configured with the `--enable-efa` flag, for compatibility with AWS infrastructure. + - The UCX communication framework is added to facilitate building a broader set of software (e.g. some OpenSHMEM implementations) and for optimized infiniband network support. + + Note that it is assumed that CUDA has already been installed on the system. + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + ``` + + An example Containerfile that installs libfabric in an NVIDIA container can be expanded below: + + ??? note "The full Containerfile for GH200" + The Containerfile below is based on an NVIDIA CUDA image, which provides a complete CUDA installation and NCCL. + + ``` + --8<-- "docs/software/communication/dockerfiles/base" + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + ``` + +[](){#ref-communication-libfabric-performance} ## Tuning libfabric -Tuning libfabric (particularly together with [Cray MPICH][ref-communication-cray-mpich], [OpenMPI][ref-communication-openmpi], [NCCL][ref-communication-nccl], and [RCCL][ref-communication-rccl]) depends on many factors, including the application, workload, and system. +Tuning libfabric (particularly together with [Cray MPICH][ref-communication-cray-mpich], [OpenMPI][ref-communication-openmpi], and [NCCL][ref-communication-nccl]) depends on many factors, including the application, workload, and system. For a comprehensive overview libfabric options for the CXI provider (the provider for the Slingshot network), see the [`fi_cxi` man pages](https://ofiwg.github.io/libfabric/v2.1.0/man/fi_cxi.7.html). Note that the exact version deployed on Alps may differ, and not all options may be applicable on Alps. See the [Cray MPICH known issues page][ref-communication-cray-mpich-known-issues] for issues when using Cray MPICH together with libfabric. - -!!! todo - More options? diff --git a/docs/software/communication/mpich.md b/docs/software/communication/mpich.md index 8e60cb3c..3e8597d6 100644 --- a/docs/software/communication/mpich.md +++ b/docs/software/communication/mpich.md @@ -4,7 +4,17 @@ MPICH is an open-source MPI implementation actively developed in this [github repository](https://github.com/pmodels/mpich). It can be installed inside containers directly from the source code manually, or using Spack or similar package managers. -## MPICH inside containers +[](){#ref-communication-mpich-using} +## Using MPICH + +[](){#ref-communication-mpich-ce} +### uenv + +MPICH is not provided in any uenv images, which instead use the [Cray MPICH][ref-communication-cray-mpich] distribution which is optimised for the Alps network. 
+ +[](){#ref-communication-mpich-ce} +### Containers + MPICH can be built inside containers, however for native Slingshot performance special care has to be taken to ensure that communication is optimal for all cases: * Intra-node communication (this is via shared memory, especially `xpmem`) @@ -136,13 +146,25 @@ They are explicit and building manually the necessary packages, however for prod RUN rm /etc/ld.so.conf.d/cuda_stubs.conf && ldconfig ``` -!!! important "GPU-to-GPU inter-node communication" +Once the container is built and pushed to a registry, one can create a [container environment][ref-container-engine]. + +!!! note "GPU-to-GPU inter-node communication" To make sure that GPU-to-GPU performance is good for inter-node communication one must set the variable ```console $ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 ``` -Once the container is built and pushed to a registry, one can create a [container environment][ref-container-engine]. +!!! note "Use PMI-2" + By default MPICH uses [PMI-2](https://www.mcs.anl.gov/papers/P1760.pdf) for wire-up and communication between ranks. + Hence, when launching containers that use MPICH through Slurm, PMI-2 must be used for application launching. + This is done with the `--mpi` flag of `srun`: + ```bash + srun --mpi=pmi2 ... + ``` + +[](){#ref-communication-mpich-performance} +## MPICH Performance + To verify performance, one can run the `osu_bw` benchmark, which is doing a bandwidth benchmark for different message sizes between two ranks. For reference this is the expected performance for different memory residency, with inter-node and intra-node communication: === "CPU-to-CPU memory intra-node" diff --git a/docs/software/communication/nccl.md b/docs/software/communication/nccl.md index 7a979566..9353ab22 100644 --- a/docs/software/communication/nccl.md +++ b/docs/software/communication/nccl.md @@ -4,8 +4,15 @@ [NCCL](https://developer.nvidia.com/nccl) is an optimized inter-GPU communication library for NVIDIA GPUs. It is commonly used in machine learning frameworks, but traditional scientific applications can also benefit from NCCL. +[](){#ref-communication-nccl-using} ## Using NCCL +!!! info "Further reading" + [_Demystifying NCCL: An In-depth Analysis of GPU Communication Protocols and Algorithms_](https://arxiv.org/abs/2507.04786v2) contains detailed information about NCCL algorithms and protocols, which can be helpful for deciding if your application could benefit from an alternative configuration. + +[](){#ref-communication-nccl-uenv} +### uenv + To use the Slingshot network on Alps, the [`aws-ofi-nccl`](https://github.com/aws/aws-ofi-nccl) plugin must be used. With the container engine, the [AWS OFI NCCL hook][ref-ce-aws-ofi-hook] can be used to load the plugin into the container and configure NCCL to use it. @@ -20,7 +27,74 @@ While the container engine sets these automatically when using the NCCL hook, th --8<-- "docs/software/communication/nccl_env_vars" ``` -[_Demystifying NCCL: An In-depth Analysis of GPU Communication Protocols and Algorithms_](https://arxiv.org/abs/2507.04786v2) contains detailed information about NCCL algorithms and protocols, which can be helpful for deciding if your application could benefit from an alternative configuration. +[](){#ref-communication-nccl-ce} +### Containers + +To use NCCL in a container, we suggest using a container provided by NVIDIA that already contains CUDA and NCCL as the starting point. 
+Then install libfabric as documented in the [libfabric container documentation][ref-communication-libfabric-ce], and use the [AWS OFI hook][ref-ce-aws-ofi-hook] to configure NCCL to use [libfabric][ref-communication-libfabric] optimised for the Alps network. + +!!! example "Installing the NCCL benchmarks in a container for NVIDIA nodes" + To test whether NCCL inside a container has been set up correctly for optimal performance, add the NCCL test suite to the container. + + Use the following as a template for installing the tests: + + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/nccl-tests" + ``` + + Expand the box below to see the full Containerfile that installs the NCCL tests on top of the example in the [libfabric][ref-communication-libfabric-ce] documentation. + + ??? note "The full Containerfile" + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/base" + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + --8<-- "docs/software/communication/dockerfiles/nccl-tests" + ``` + +To use NCCL in a container, enable the [AWS OFI hook][ref-ce-aws-ofi-hook] in the [EDF file][ref-ce-edf-reference]. + +```toml +[env] +PMIX_MCA_psec="native" # (1)! + +[annotations] +com.hooks.aws_ofi_nccl.enabled = "true" # (2)! +com.hooks.aws_ofi_nccl.variant = "cuda12" # (3)! +``` + +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. +2. Enable the AWS OFI plugin. +3. Take care to match the major CUDA version installed in the container. + +Because NCCL uses OpenMPI in the container to perform initial setup, which in turn uses [PMIx](https://pmix.org/) for wire-up, pass the `--mpi=pmix` option to `srun` when launching jobs. + +```console +$ srun --mpi=pmix -n8 -N2 --environment=nccl-test /nccl-tests-2.17.1/build/all_reduce_perf +``` + +[](){#ref-communication-nccl-issues} +## Known issues + +!!! warning "Do not use `NCCL_NET_PLUGIN="ofi"` with uenvs" + NCCL has an alternative way of specifying what plugin to use: `NCCL_NET_PLUGIN`. + When using uenvs, do not set `NCCL_NET_PLUGIN="ofi"` instead of, or in addition to, `NCCL_NET="AWS Libfabric"`. + If you do, your application will fail to start since NCCL will: + + 1. fail to find the plugin because of the name of the shared library in the uenv, and + 2. prefer `NCCL_NET_PLUGIN` over `NCCL_NET`, so it will fail to find the plugin even if `NCCL_NET="AWS Libfabric"` is correctly set. + + When both environment variables are set the error message, with `NCCL_DEBUG=WARN`, will look similar to when the plugin isn't available: + ```console + nid006365:179857:179897 [1] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. + ``` + + With `NCCL_DEBUG=INFO`, NCCL will print: + ```console + nid006365:180142:180163 [0] NCCL INFO NET/Plugin: Could not find: ofi libnccl-net-ofi.so. Using internal network plugin. + ... + nid006365:180142:180163 [0] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. + ``` In addition to the above variables, setting `NCCL_NCHANNELS_PER_NET_PEER` can improve point-to-point performance (operations based directly on send/recv): @@ -42,9 +116,6 @@ The option is undocumented, but [this issue](https://github.com/NVIDIA/nccl/issu export FI_CXI_RDZV_EAGER_SIZE=0 ``` -!!! warning "Using NCCL with uenvs" - The environment variables listed above are not set automatically when using uenvs. - !!! 
warning "GPU-aware MPI with NCCL" Using GPU-aware MPI together with NCCL [can easily lead to deadlocks](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#inter-gpu-communication-with-cuda-aware-mpi). Unless care is taken to ensure that the two methods of communication are not used concurrently, we recommend not using GPU-aware MPI with NCCL. @@ -68,24 +139,120 @@ The option is undocumented, but [this issue](https://github.com/NVIDIA/nccl/issu nid006352:34610:34631 [0] NCCL INFO Using network AWS Libfabric ``` -!!! warning "Do not use `NCCL_NET_PLUGIN="ofi"` with uenvs" - NCCL has an alternative way of specifying what plugin to use: `NCCL_NET_PLUGIN`. - When using uenvs, do not set `NCCL_NET_PLUGIN="ofi"` instead of, or in addition to, `NCCL_NET="AWS Libfabric"`. - If you do, your application will fail to start since NCCL will: +[](){#ref-communication-nccl-performance} +## NCCL Performance - 1. fail to find the plugin because of the name of the shared library in the uenv, and - 2. prefer `NCCL_NET_PLUGIN` over `NCCL_NET`, so it will fail to find the plugin even if `NCCL_NET="AWS Libfabric"` is correctly set. - - When both environment variables are set the error message, with `NCCL_DEBUG=WARN`, will look similar to when the plugin isn't available: +!!! warning "no version information available" + The following warning message was generated by each rank running the benchmarks below, and can safely be ignored. + ``` + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + ``` + +!!! note "Impact of disabling the CXI hook" + On many Alps vClusters, the Container Engine is configured with the [CXI hook][ref-ce-cxi-hook] enabled by default, enabling transparent access to the Slingshot interconnect. + + The inter node tests marked with `(*)` were run with the CXI container hook disabled, to demonstrate the effect of not using an optimised network configuration. + If you see similar performance degradation in your tests, the first thing to investigate is whether your setup is using the libfabric optimised back end. + +Below are the results of of running the collective all reduce latency test on 2 nodes with 8 GPUs total (the `all_reduce_perf` test). + +=== "All-reduce latency" ```console - nid006365:179857:179897 [1] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. 
+ $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 204199 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 204200 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 204201 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 204202 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 155254 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 155255 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 155256 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 155257 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 17.93 0.00 0.00 0 17.72 0.00 0.00 0 + 16 4 float sum -1 17.65 0.00 0.00 0 17.63 0.00 0.00 0 + 32 8 float sum -1 17.54 0.00 0.00 0 17.43 0.00 0.00 0 + 64 16 float sum -1 19.27 0.00 0.01 0 19.21 0.00 0.01 0 + 128 32 float sum -1 18.86 0.01 0.01 0 18.67 0.01 0.01 0 + 256 64 float sum -1 18.83 0.01 0.02 0 19.02 0.01 0.02 0 + 512 128 float sum -1 19.72 0.03 0.05 0 19.40 0.03 0.05 0 + 1024 256 float sum -1 20.35 0.05 0.09 0 20.32 0.05 0.09 0 + 2048 512 float sum -1 22.07 0.09 0.16 0 21.72 0.09 0.17 0 + 4096 1024 float sum -1 31.97 0.13 0.22 0 31.58 0.13 0.23 0 + 8192 2048 float sum -1 37.21 0.22 0.39 0 35.84 0.23 0.40 0 + 16384 4096 float sum -1 37.29 0.44 0.77 0 36.53 0.45 0.78 0 + 32768 8192 float sum -1 39.61 0.83 1.45 0 37.09 0.88 1.55 0 + 65536 16384 float sum -1 61.03 1.07 1.88 0 68.45 0.96 1.68 0 + 131072 32768 float sum -1 81.41 1.61 2.82 0 72.94 1.80 3.14 0 + 262144 65536 float sum -1 127.0 2.06 3.61 0 108.9 2.41 4.21 0 + 524288 131072 float sum -1 170.3 3.08 5.39 0 349.6 1.50 2.62 0 + 1048576 262144 float sum -1 164.3 6.38 11.17 0 187.7 5.59 9.77 0 + 2097152 524288 float sum -1 182.1 11.51 20.15 0 180.6 11.61 20.32 0 + 4194304 1048576 float sum -1 292.7 14.33 25.08 0 295.4 14.20 24.85 0 + 8388608 2097152 float sum -1 344.5 24.35 42.61 0 345.7 24.27 42.47 0 + 16777216 4194304 float sum -1 461.7 36.34 63.59 0 454.0 36.95 64.67 0 + 33554432 8388608 float sum -1 686.5 48.88 85.54 0 686.6 48.87 85.52 0 + 67108864 16777216 float sum -1 1090.5 61.54 107.69 0 1083.5 61.94 108.39 0 + 134217728 33554432 float sum -1 1916.4 70.04 122.57 0 1907.8 70.35 123.11 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 19.7866 + # + # Collective test concluded: all_reduce_perf ``` - - With `NCCL_DEBUG=INFO`, NCCL will print: + +=== "All-reduce latency (*)" ```console - nid006365:180142:180163 [0] NCCL INFO NET/Plugin: Could not find: ofi libnccl-net-ofi.so. Using internal network plugin. - ... - nid006365:180142:180163 [0] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. 
+ $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 202829 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 202830 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 202831 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 202832 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 154517 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 154518 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 154519 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 154520 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 85.47 0.00 0.00 0 53.44 0.00 0.00 0 + 16 4 float sum -1 52.41 0.00 0.00 0 51.11 0.00 0.00 0 + 32 8 float sum -1 50.45 0.00 0.00 0 50.40 0.00 0.00 0 + 64 16 float sum -1 62.58 0.00 0.00 0 50.70 0.00 0.00 0 + 128 32 float sum -1 50.94 0.00 0.00 0 50.77 0.00 0.00 0 + 256 64 float sum -1 50.76 0.01 0.01 0 51.77 0.00 0.01 0 + 512 128 float sum -1 163.2 0.00 0.01 0 357.5 0.00 0.00 0 + 1024 256 float sum -1 373.0 0.00 0.00 0 59.31 0.02 0.03 0 + 2048 512 float sum -1 53.22 0.04 0.07 0 52.58 0.04 0.07 0 + 4096 1024 float sum -1 55.95 0.07 0.13 0 56.63 0.07 0.13 0 + 8192 2048 float sum -1 58.52 0.14 0.24 0 58.62 0.14 0.24 0 + 16384 4096 float sum -1 108.7 0.15 0.26 0 107.8 0.15 0.27 0 + 32768 8192 float sum -1 184.1 0.18 0.31 0 183.5 0.18 0.31 0 + 65536 16384 float sum -1 325.0 0.20 0.35 0 325.4 0.20 0.35 0 + 131072 32768 float sum -1 592.7 0.22 0.39 0 591.5 0.22 0.39 0 + 262144 65536 float sum -1 942.0 0.28 0.49 0 941.4 0.28 0.49 0 + 524288 131072 float sum -1 1143.1 0.46 0.80 0 1138.0 0.46 0.81 0 + 1048576 262144 float sum -1 1502.2 0.70 1.22 0 1478.9 0.71 1.24 0 + 2097152 524288 float sum -1 921.8 2.28 3.98 0 899.8 2.33 4.08 0 + 4194304 1048576 float sum -1 1443.1 2.91 5.09 0 1432.7 2.93 5.12 0 + 8388608 2097152 float sum -1 2437.7 3.44 6.02 0 2417.0 3.47 6.07 0 + 16777216 4194304 float sum -1 5036.9 3.33 5.83 0 5003.6 3.35 5.87 0 + 33554432 8388608 float sum -1 17388 1.93 3.38 0 17275 1.94 3.40 0 + 67108864 16777216 float sum -1 21253 3.16 5.53 0 21180 3.17 5.54 0 + 134217728 33554432 float sum -1 43293 3.10 5.43 0 43396 3.09 5.41 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 1.58767 + # + # Collective test concluded: all_reduce_perf ``` - - If you only set `NCCL_NET="ofi"`, NCCL may silently fail to load the plugin but fall back to the default implementation. + diff --git a/docs/software/communication/nvshmem.md b/docs/software/communication/nvshmem.md new file mode 100644 index 00000000..e815c82c --- /dev/null +++ b/docs/software/communication/nvshmem.md @@ -0,0 +1,196 @@ +[](){#ref-communication-nvshmem} +# NVSHMEM + +[NVSHMEM](https://developer.nvidia.com/nvshmem) is a parallel programming interface based on OpenSHMEM that provides efficient and scalable communication for NVIDIA GPU clusters. 
+NVSHMEM creates a global address space for data that spans the memory of multiple GPUs and can be accessed with fine-grained GPU-initiated operations, CPU-initiated operations, and operations on CUDA streams.
+
+[](){#ref-communication-nvshmem-using}
+## Using NVSHMEM
+
+[](){#ref-communication-nvshmem-uenv}
+### uenv
+
+Version 2.8 of the [PyTorch uenv][ref-uenv-pytorch] is currently the only uenv that provides NVSHMEM.
+
+CSCS is working on providing a build of NVSHMEM in uenv that runs efficiently on the Alps network, and will update these docs when it is available.
+
+[](){#ref-communication-nvshmem-ce}
+### Containers
+
+To use NVSHMEM, we recommend first installing OpenMPI with libfabric support in the container, or starting with an image that contains OpenMPI with libfabric.
+
+The image recipe described here is based on the [OpenMPI image for NVIDIA][ref-communication-openmpi], and is thus suited for hosts with NVIDIA GPUs, such as the Alps GH200 nodes.
+
+!!! warning "Be careful with NVSHMEM provided by NVIDIA containers"
+    Containers provided by NVIDIA on NGC typically provide NVSHMEM as part of the NVHPC SDK in the image, however this version is built for and linked against the OpenMPI and UCX installations in the container, which are not compatible with the Slingshot network on Alps.
+
+NVSHMEM is built from source in the container, from a source tarball provided by NVIDIA.
+
+- Notice that NVSHMEM is configured with support for libfabric explicitly enabled: `NVSHMEM_LIBFABRIC_SUPPORT=1`
+- NVSHMEM is built without support for UCX and Infiniband components, because they are not needed on Alps.
+- Since this image uses OpenMPI (which provides PMIx) as the MPI implementation, NVSHMEM is also configured to default to PMIx for bootstrapping (`NVSHMEM_PMIX_SUPPORT=1`).
+
+!!! example "Installing NVSHMEM in a container for NVIDIA nodes"
+    The following example demonstrates how to download and install NVSHMEM from source in a Containerfile.
+
+    ```dockerfile
+    --8<-- "docs/software/communication/dockerfiles/nvshmem"
+    ```
+    !!! note
+        The image also installs the NVSHMEM performance tests, `NVSHMEM_BUILD_TESTS=1`, to demonstrate performance below.
+        The performance tests, in turn, require the installation of Python dependencies.
+        When building images intended solely for production purposes, you may exclude both of these elements.
+
+    Expand the box below to see an example of a complete Containerfile that installs NVSHMEM and all of its dependencies in an NVIDIA container.
+
+    ??? note "The full Containerfile"
+        ```dockerfile
+        --8<-- "docs/software/communication/dockerfiles/base"
+        --8<-- "docs/software/communication/dockerfiles/libfabric"
+        --8<-- "docs/software/communication/dockerfiles/ucx"
+        --8<-- "docs/software/communication/dockerfiles/openmpi"
+        --8<-- "docs/software/communication/dockerfiles/nvshmem"
+        ```
+
+!!! example "Running the NVSHMEM container"
+    The following EDF file sets the required environment variables and container hooks for NVSHMEM.
+    It uses a pre-built container hosted on the [Quay.io](https://quay.io/) registry at the following reference: `quay.io/ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8`.
+
+    ```toml
+    image = "quay.io#ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8"
+
+    [env]
+    PMIX_MCA_psec="native" # (1)!
+    NVSHMEM_REMOTE_TRANSPORT="libfabric"
+    NVSHMEM_LIBFABRIC_PROVIDER="cxi"
+    NVSHMEM_DISABLE_CUDA_VMM="1" # (2)!
+
+    [annotations]
+    com.hooks.aws_ofi_nccl.enabled = "true" # (3)!
+    com.hooks.aws_ofi_nccl.variant = "cuda12"
+    ```
+
+    1. Ensures PMIx uses the same security domain as Slurm.
+       Otherwise PMIx will print warnings at startup.
+    2. NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`.
+    3. NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect.
+       Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container.
+
+       Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters.
+
+    ```bash
+    srun -N2 --ntasks-per-node=4 \
+        --mpi=pmix \ # (1)!
+        --environment=nvshmem \
+        /usr/local/nvshmem/bin/perftest/device/coll/alltoall_latency
+    ```
+
+    1. Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, the `srun` option `--mpi=pmix` must be used with this image to run multi-rank jobs successfully.
+
+       Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM through the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options).
+       When bootstrapping through PMI or MPI via Slurm, ensure that the PMI implementation used by Slurm (i.e. the `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library.
+
+[](){#ref-communication-nvshmem-performance}
+## NVSHMEM Performance
+
+Below are the results of running the `alltoall_latency` benchmark from the NVSHMEM performance tests, built in the example container [above][ref-communication-nvshmem-ce].
+
+```console
+$ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=nvshmem /usr/local/nvshmem/bin/perftest/device/coll/alltoall_latency
+Runtime options after parsing command line arguments
+min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0
+Note: Above is full list of options, any given test will use only a subset of these variables.
+mype: 6 mype_node: 2 device name: NVIDIA GH200 120GB bus id: 1 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +32 8 32-bit thread 116.220796 0.000 0.000 +64 16 32-bit thread 112.700796 0.001 0.000 +128 32 32-bit thread 113.571203 0.001 0.001 +256 64 32-bit thread 111.123204 0.002 0.002 +512 128 32-bit thread 111.075199 0.005 0.004 +1024 256 32-bit thread 110.131204 0.009 0.008 +2048 512 32-bit thread 111.030400 0.018 0.016 +4096 1024 32-bit thread 110.985601 0.037 0.032 +8192 2048 32-bit thread 111.039996 0.074 0.065 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +32 8 32-bit warp 89.801598 0.000 0.000 +64 16 32-bit warp 90.563202 0.001 0.001 +128 32 32-bit warp 89.830399 0.001 0.001 +256 64 32-bit warp 88.863999 0.003 0.003 +512 128 32-bit warp 89.686400 0.006 0.005 +1024 256 32-bit warp 88.908798 0.012 0.010 +2048 512 32-bit warp 88.819200 0.023 0.020 +4096 1024 32-bit warp 89.670402 0.046 0.040 +8192 2048 32-bit warp 88.889599 0.092 0.081 +16384 4096 32-bit warp 88.972801 0.184 0.161 +32768 8192 32-bit warp 89.564800 0.366 0.320 +65536 16384 32-bit warp 89.888000 0.729 0.638 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +32 8 32-bit block 89.747202 0.000 0.000 +64 16 32-bit block 88.086402 0.001 0.001 +128 32 32-bit block 87.254399 0.001 0.001 +256 64 32-bit block 87.401599 0.003 0.003 +512 128 32-bit block 88.095999 0.006 0.005 +1024 256 32-bit block 87.273598 0.012 0.010 +2048 512 32-bit block 88.086402 0.023 0.020 +4096 1024 32-bit block 88.940799 0.046 0.040 +8192 2048 32-bit block 88.095999 0.093 0.081 +16384 4096 32-bit block 87.247998 0.188 0.164 +32768 8192 32-bit block 88.976002 0.368 0.322 +65536 16384 32-bit block 88.121599 0.744 0.651 +131072 32768 32-bit block 90.579200 1.447 1.266 +262144 65536 32-bit block 91.360003 2.869 2.511 +524288 131072 32-bit block 101.145601 5.183 4.536 +1048576 262144 32-bit block 111.052799 9.442 8.262 +2097152 524288 32-bit block 137.164795 15.289 13.378 +4194304 1048576 32-bit block 183.171201 22.898 20.036 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +64 8 64-bit thread 111.955202 0.001 0.001 +128 16 64-bit thread 113.420796 0.001 0.001 +256 32 64-bit thread 108.508801 0.002 0.002 +512 64 64-bit thread 110.204804 0.005 0.004 +1024 128 64-bit thread 109.487998 0.009 0.008 +2048 256 64-bit thread 109.462404 0.019 0.016 +4096 512 64-bit thread 110.156798 0.037 0.033 +8192 1024 64-bit thread 109.401596 0.075 0.066 +16384 2048 64-bit thread 108.591998 0.151 0.132 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +64 8 64-bit warp 88.896000 0.001 0.001 +128 16 64-bit warp 89.679998 0.001 0.001 +256 32 64-bit warp 88.950402 0.003 0.003 +512 64 64-bit warp 89.606398 0.006 0.005 +1024 128 64-bit warp 89.775997 0.011 0.010 +2048 256 64-bit warp 88.838398 0.023 0.020 +4096 512 64-bit warp 90.671998 0.045 0.040 +8192 1024 64-bit warp 89.699203 0.091 0.080 +16384 2048 64-bit warp 89.011198 0.184 0.161 +32768 4096 64-bit warp 89.622402 0.366 0.320 +65536 8192 64-bit warp 88.905603 0.737 0.645 +131072 16384 64-bit warp 89.766401 1.460 1.278 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +64 8 64-bit block 89.788800 0.001 0.001 +128 16 64-bit block 88.012803 0.001 0.001 +256 32 64-bit block 87.353599 0.003 0.003 +512 64 64-bit block 88.000000 0.006 0.005 +1024 128 64-bit block 87.225598 0.012 0.010 +2048 256 64-bit block 87.225598 0.023 0.021 +4096 512 64-bit 
block 87.168002 0.047 0.041 +8192 1024 64-bit block 88.067198 0.093 0.081 +16384 2048 64-bit block 88.863999 0.184 0.161 +32768 4096 64-bit block 88.723201 0.369 0.323 +65536 8192 64-bit block 87.993598 0.745 0.652 +131072 16384 64-bit block 88.783997 1.476 1.292 +262144 32768 64-bit block 91.366398 2.869 2.511 +524288 65536 64-bit block 102.060795 5.137 4.495 +1048576 131072 64-bit block 111.846399 9.375 8.203 +2097152 262144 64-bit block 137.107205 15.296 13.384 +4194304 524288 64-bit block 183.100796 22.907 20.044 +``` diff --git a/docs/software/communication/openmpi.md b/docs/software/communication/openmpi.md index 9c45c0da..018fecce 100644 --- a/docs/software/communication/openmpi.md +++ b/docs/software/communication/openmpi.md @@ -1,22 +1,23 @@ [](){#ref-communication-openmpi} # OpenMPI -[Cray MPICH][ref-communication-cray-mpich] is the recommended MPI implementation on Alps. +[Cray MPICH][ref-communication-cray-mpich] is the recommended MPI implementation on Alps, particularly if you are using [uenv][ref-uenv]. + However, [OpenMPI](https://www.open-mpi.org/) can be used as an alternative in some cases, with limited support from CSCS. +OpenMPI is available for use in both uenv and containers. To use OpenMPI on Alps, it must be built against [libfabric][ref-communication-libfabric] with support for the [Slingshot 11 network][ref-alps-hsn]. +[](){#ref-communication-openmpi-using} ## Using OpenMPI -!!! warning - Building and using OpenMPI on Alps is still [work in progress](https://eth-cscs.github.io/cray-network-stack/). - The instructions found on this page may be inaccurate, but are a good starting point to using OpenMPI on Alps. +[](){#ref-communication-openmpi-uenv} +### uenv -!!! todo - Deploy experimental uenv. +!!! under-construction + Building and using OpenMPI in uenv on Alps is work in progress. -!!! todo - Document OpenMPI uenv next to prgenv-gnu, prgenv-nvfortran, and linalg? + The instructions found on this page may be inaccurate, but are a good starting point to using OpenMPI on Alps. OpenMPI is provided through a [uenv][ref-uenv] similar to [`prgenv-gnu`][ref-uenv-prgenv-gnu]. Once the uenv is loaded, compiling and linking with OpenMPI and libfabric is transparent. @@ -31,9 +32,9 @@ srun --mpi=pmix ... Additionally, the following environment variables should be set: ```bash export PMIX_MCA_psec="native" # (1)! -export FI_PROVIDER="cxi" # (2)! -export OMPI_MCA_pml="^ucx" # (3)! -export OMPI_MCA_mtl="ofi" # (4)! +export FI_PROVIDER="cxi" # (2)! +export OMPI_MCA_pml="^ucx" # (3)! +export OMPI_MCA_mtl="ofi" # (4)! ``` 1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. @@ -50,9 +51,389 @@ export OMPI_MCA_mtl="ofi" # (4)! To use the LINKx provider, set the following, instead of `FI_PROVIDER=cxi`: ```bash - export FI_PROVIDER="lnx" # (1)! + export FI_PROVIDER="lnx" # (1)! export FI_LNX_PROV_LINKS="shm+cxi" # (2)! ``` 1. Use the libfabric LINKx provider, to allow using different libfabric providers for inter- and intra-node communication. 2. Use the shared memory provider for intra-node communication and the CXI (Slingshot) provider for inter-node communication. + +[](){#ref-communication-openmpi-ce} +### Containers + +To install OpenMPI in a container, libfabric (and possibly UCX if the container should be portable to other centers), should be installed. +Then OpenMPI is built, and configured to use at least libfabric. +Note that OpenMPI v5 is the first version with full support for libfabric, required for good performance. 
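+
+As a quick sanity check after the build, you can list the components OpenMPI was compiled with; if libfabric support was picked up, `ofi` components should appear in the output (a sketch, the exact component list depends on the build):
+
+```console
+$ ompi_info | grep -i ofi    # expect ofi entries, e.g. for the mtl and btl frameworks
+```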
+ +!!! note + The version of MPI in the containers provided by NVIDIA is OpenMPI v4 provided by NVIDIA's [HPC-X](https://developer.nvidia.com/networking/hpc-x) toolkit. + This version is not suitable for use on Alps for two reasons: + + * OpenMPI version 5 is required for full libfabric support. + * It is linked against UCX only, and can't be modified to use the system libfabric. + + See the [performance section][ref-communication-openmpi-performance] below for examples of the level of performance loss caused by using HPC-X. + + +!!! example "Installing OpenMPI in a container for NVIDIA nodes" + The following Dockerfile instructions install OpenMPI from source in an Ubuntu image that already contains CUDA, libfabric and UCX. + + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/openmpi" + ``` + + * The `--with-ofi` and `--with-ucx` flags configure OpenMPI with the libfabric and UCX back ends respectively. + * The `--enable-oshmem` flag builds OpenSHMEM as part of the OpenMPI installation, which is useful to support SHMEM implementations like [NVSHMEM][ref-communication-nvshmem]. + + Expand the box below to see an example of a full Containerfile that can be used to create an OpenMPI container on the gh200 nodes of Alps: + + ??? note "The full Containerfile" + This is an example of a complete Containerfile that installs OpenMPI based on the a "base image" that provides gdrcopy, libfabric and UCX on top of an NVIDIA container that provides CUDA: + + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/base" + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + --8<-- "docs/software/communication/dockerfiles/openmpi" + --8<-- "docs/software/communication/dockerfiles/osu" + ``` + + * The container also installs the [OSU MPI micro-benchmarks](https://mvapich.cse.ohio-state.edu/benchmarks) so that the implementation can be tested. + +The EDF file for the container should contain the following: + +```toml +[env] +PMIX_MCA_psec="native" # (1)! +``` + +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. + +[](){#ref-communication-openmpi-performance} +## OpenMPI Performance + +We present some performance numbers for OpenMPI, obtained using the OSU benchmarks compiled in the above image. + +!!! warning "no version information available" + The following warning message was generated by each rank running the benchmarks below, and can safely be ignored. + ``` + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + ``` + +The first performance benchmarks are for the OSU point-to-point bandwidth test `osu_bw`. + +* inter-node tests place the two ranks on different nodes, so that all communication is over the Slingshot network +* intra-node tests place two ranks on the same node, so that communication is via NVLINK or memory copies in the CPU-CPU case + +!!! note "impact of disabling the CXI hook" + On many Alps vClusters, the Container Engine is configured with the [CXI hook][ref-ce-cxi-hook] enabled by default, enabling transparent access to the Slingshot interconnect. + + The inter node tests marked with `(*)` were run with the CXI container hook disabled, to demonstrate the effect of not using an optimised network configuration. 
+ If you see similar performance degradation in your tests, the first thing to investigate is whether your setup is using the libfabric optimised back end. + +=== "CPU-to-CPU inter-node" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.95 Pass + 2 1.90 Pass + 4 3.80 Pass + 8 7.61 Pass + 16 15.21 Pass + 32 30.47 Pass + 64 60.72 Pass + 128 121.56 Pass + 256 242.28 Pass + 512 484.54 Pass + 1024 968.30 Pass + 2048 1943.99 Pass + 4096 3870.29 Pass + 8192 6972.95 Pass + 16384 13922.36 Pass + 32768 18835.52 Pass + 65536 22049.82 Pass + 131072 23136.20 Pass + 262144 23555.35 Pass + 524288 23758.39 Pass + 1048576 23883.95 Pass + 2097152 23949.94 Pass + 4194304 23982.18 Pass + ``` + +=== "CPU-to-CPU inter-node (*)" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.16 Pass + 2 0.32 Pass + 4 0.65 Pass + 8 1.31 Pass + 16 2.59 Pass + 32 5.26 Pass + 64 10.37 Pass + 128 20.91 Pass + 256 41.49 Pass + 512 74.26 Pass + 1024 123.99 Pass + 2048 213.82 Pass + 4096 356.13 Pass + 8192 468.55 Pass + 16384 505.89 Pass + 32768 549.59 Pass + 65536 2170.64 Pass + 131072 2137.95 Pass + 262144 2469.63 Pass + 524288 2731.85 Pass + 1048576 2919.18 Pass + 2097152 3047.21 Pass + 4194304 3121.42 Pass + ``` + +=== "GPU-to-GPU inter-node" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.90 Pass + 2 1.82 Pass + 4 3.65 Pass + 8 7.30 Pass + 16 14.56 Pass + 32 29.03 Pass + 64 57.49 Pass + 128 118.30 Pass + 256 227.18 Pass + 512 461.26 Pass + 1024 926.30 Pass + 2048 1820.46 Pass + 4096 3611.70 Pass + 8192 6837.89 Pass + 16384 13361.25 Pass + 32768 18037.71 Pass + 65536 22019.46 Pass + 131072 23104.58 Pass + 262144 23542.71 Pass + 524288 23758.69 Pass + 1048576 23881.02 Pass + 2097152 23955.49 Pass + 4194304 23989.54 Pass + ``` + +=== "GPU-to-GPU inter-node (*)" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation D D + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.06 Pass + 2 0.12 Pass + 4 0.24 Pass + 8 0.48 Pass + 16 0.95 Pass + 32 1.91 Pass + 64 3.85 Pass + 128 7.57 Pass + 256 15.28 Pass + 512 19.87 Pass + 1024 53.06 Pass + 2048 97.29 Pass + 4096 180.73 Pass + 8192 343.75 Pass + 16384 473.72 Pass + 32768 530.81 Pass + 65536 1268.51 Pass + 131072 1080.83 Pass + 262144 1435.36 Pass + 524288 1526.12 Pass + 1048576 1727.31 Pass + 2097152 1755.61 Pass + 4194304 1802.75 Pass + ``` + + + +=== "CPU-to-CPU intra-node" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.96 Pass + 2 1.92 Pass + 4 3.85 Pass + 8 7.68 Pass + 16 15.40 Pass + 32 30.78 Pass + 64 61.26 Pass + 128 122.23 Pass + 256 240.96 Pass + 512 483.12 Pass + 1024 966.52 Pass + 2048 1938.09 Pass + 4096 3873.67 Pass + 8192 7100.56 Pass + 16384 14170.44 Pass + 32768 18607.68 Pass + 65536 21993.95 Pass + 131072 23082.11 Pass + 262144 23546.09 Pass + 524288 23745.05 Pass + 1048576 23879.79 Pass + 2097152 23947.23 Pass + 4194304 23980.15 Pass + ``` + +=== "GPU-to-GPU intra-node" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.91 Pass + 2 1.83 Pass + 4 3.73 Pass + 8 7.47 Pass + 16 14.99 Pass + 32 29.98 Pass + 64 59.72 Pass + 128 119.13 Pass + 256 241.88 Pass + 512 481.52 Pass + 1024 963.60 Pass + 2048 1917.15 Pass + 4096 3840.96 Pass + 8192 6942.05 Pass + 16384 13911.45 Pass + 32768 18379.14 Pass + 65536 21761.73 Pass + 131072 23069.72 Pass + 262144 23543.98 Pass + 524288 23750.83 Pass + 1048576 23882.44 Pass + 2097152 23951.34 Pass + 4194304 23989.44 Pass + ``` + + +Next is the all to all latency test `osu_alltoall`, for 8 ranks spread over nodes (4 ranks per node, 1 rank per GPU). + +=== "CPU-to-CPU" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 12.46 Pass + 2 12.05 Pass + 4 11.99 Pass + 8 11.84 Pass + 16 11.87 Pass + 32 11.84 Pass + 64 11.95 Pass + 128 12.22 Pass + 256 13.21 Pass + 512 13.23 Pass + 1024 13.37 Pass + 2048 13.52 Pass + 4096 13.88 Pass + 8192 17.32 Pass + 16384 18.98 Pass + 32768 23.72 Pass + 65536 36.53 Pass + 131072 62.96 Pass + 262144 119.44 Pass + 524288 236.43 Pass + 1048576 519.85 Pass + ``` + +=== "CPU-to-CPU (*)" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 137.85 Pass + 2 133.47 Pass + 4 134.03 Pass + 8 131.14 Pass + 16 134.45 Pass + 32 135.35 Pass + 64 137.21 Pass + 128 137.03 Pass + 256 139.90 Pass + 512 140.70 Pass + 1024 165.05 Pass + 2048 197.14 Pass + 4096 255.02 Pass + 8192 335.75 Pass + 16384 543.12 Pass + 32768 928.81 Pass + 65536 782.28 Pass + 131072 1812.95 Pass + 262144 2284.26 Pass + 524288 3213.63 Pass + 1048576 5688.27 Pass + ``` + +=== "GPU-to-GPU" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation -d cuda + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 22.26 Pass + 2 22.08 Pass + 4 22.15 Pass + 8 22.19 Pass + 16 22.25 Pass + 32 22.11 Pass + 64 22.22 Pass + 128 21.98 Pass + 256 22.19 Pass + 512 22.20 Pass + 1024 22.37 Pass + 2048 22.58 Pass + 4096 22.99 Pass + 8192 27.22 Pass + 16384 28.55 Pass + 32768 32.60 Pass + 65536 44.88 Pass + 131072 70.15 Pass + 262144 123.30 Pass + 524288 234.89 Pass + 1048576 486.89 Pass + ``` + +=== "GPU-to-GPU (*)" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation -d cuda + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 186.92 Pass + 2 180.80 Pass + 4 180.72 Pass + 8 179.45 Pass + 16 209.53 Pass + 32 181.73 Pass + 64 182.20 Pass + 128 182.84 Pass + 256 188.29 Pass + 512 189.35 Pass + 1024 237.31 Pass + 2048 231.73 Pass + 4096 298.73 Pass + 8192 396.10 Pass + 16384 589.72 Pass + 32768 983.72 Pass + 65536 786.48 Pass + 131072 1127.39 Pass + 262144 2144.57 Pass + 524288 3107.62 Pass + 1048576 5545.28 Pass + ``` diff --git a/docs/software/communication/rccl.md b/docs/software/communication/rccl.md deleted file mode 100644 index 4e33fb3a..00000000 --- a/docs/software/communication/rccl.md +++ /dev/null @@ -1,14 +0,0 @@ -[](){#ref-communication-rccl} -# RCCL - -[RCCL](https://rocmdocs.amd.com/projects/rccl/en/latest/) is an optimized inter-GPU communication library for AMD GPUs. -It provides equivalent functionality to [NCCL][ref-communication-nccl] for AMD GPUs. - -!!! todo - - high level description - - libfabric/aws-ofi-rccl plugin - - configuration options - -!!! info - RCCL uses many of the same [configuration options as NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html), with the `NCCL` prefix, not `RCCL`. - Refer to NCCL documentation to tune RCCL. diff --git a/docs/software/index.md b/docs/software/index.md index a20f1955..ec072434 100644 --- a/docs/software/index.md +++ b/docs/software/index.md @@ -28,6 +28,8 @@ These pages provided documentation for all supported software, and installation - :fontawesome-solid-bullhorn: [__Communication Libraries__][ref-software-communication] + [:octicons-arrow-right-24: Cray MPICH][ref-communication-libfabric] + [:octicons-arrow-right-24: Cray MPICH][ref-communication-cray-mpich] [:octicons-arrow-right-24: MPICH][ref-communication-mpich] @@ -36,7 +38,6 @@ These pages provided documentation for all supported software, and installation [:octicons-arrow-right-24: NCCL][ref-communication-nccl] - [:octicons-arrow-right-24: RCCL][ref-communication-rccl] [:octicons-arrow-right-24: libfabric][ref-communication-libfabric] diff --git a/mkdocs.yml b/mkdocs.yml index 8fdd4627..1559e1a1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -109,12 +109,12 @@ nav: - 'netcdf-tools': software/cw/netcdf-tools.md - 'Communication Libraries': - software/communication/index.md + - 'libfabric': software/communication/libfabric.md - 'Cray MPICH': software/communication/cray-mpich.md - 'MPICH': software/communication/mpich.md - 'OpenMPI': software/communication/openmpi.md - 'NCCL': software/communication/nccl.md - - 'RCCL': software/communication/rccl.md - - 'libfabric': software/communication/libfabric.md + - 'NVSHMEM': software/communication/nvshmem.md - 'Commercial software': - software/commercial/index.md - 'Matlab': software/commercial/matlab.md