diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index ede72019..47a48bd9 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -17,9 +17,11 @@ CWP CXI Ceph Containerfile +Containerfiles DNS Dockerfiles Dufourspitze +EFA EMPA ETHZ Ehrenfest @@ -76,6 +78,8 @@ MeteoSwiss NAMD NICs NVMe +NVSHMEM +NVLINK Nordend OpenFabrics OAuth @@ -102,6 +106,7 @@ ROCm RPA Roboto Roothaan +SHMEM SSHService STMV Scopi diff --git a/docs/software/communication/cray-mpich.md b/docs/software/communication/cray-mpich.md index 8bac8559..ec753c56 100644 --- a/docs/software/communication/cray-mpich.md +++ b/docs/software/communication/cray-mpich.md @@ -28,7 +28,7 @@ This means that Cray MPICH will automatically be linked to the GTL library, whic $ ldd myexecutable | grep gtl libmpi_gtl_cuda.so => /user-environment/linux-sles15-neoverse_v2/gcc-13.2.0/cray-gtl-8.1.30-fptqzc5u6t4nals5mivl75nws2fb5vcq/lib/libmpi_gtl_cuda.so (0x0000ffff82aa0000) ``` - + The path may be different, but the `libmpi_gtl_cuda.so` library should be printed when using CUDA. In ROCm environments the `libmpi_gtl_hsa.so` library should be linked. If the GTL library is not linked, nothing will be printed. @@ -40,7 +40,7 @@ See [this page][ref-slurm-gh200] for more information on configuring Slurm to us !!! warning "Segmentation faults when trying to communicate GPU buffers without `MPICH_GPU_SUPPORT_ENABLED=1`" If you attempt to communicate GPU buffers through MPI without setting `MPICH_GPU_SUPPORT_ENABLED=1`, it will lead to segmentation faults, usually without any specific indication that it is the communication that fails. Make sure that the option is set if you are communicating GPU buffers through MPI. - + !!! warning "Error: "`GPU_SUPPORT_ENABLED` is requested, but GTL library is not linked"" If `MPICH_GPU_SUPPORT_ENABLED` is set to `1` and your application does not link against one of the GTL libraries you will get an error similar to the following during MPI initialization: ```bash diff --git a/docs/software/communication/dockerfiles/base b/docs/software/communication/dockerfiles/base new file mode 100644 index 00000000..030fb891 --- /dev/null +++ b/docs/software/communication/dockerfiles/base @@ -0,0 +1,36 @@ +ARG ubuntu_version=24.04 +ARG cuda_version=12.8.1 +FROM docker.io/nvidia/cuda:${cuda_version}-cudnn-devel-ubuntu${ubuntu_version} + +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + build-essential \ + ca-certificates \ + pkg-config \ + automake \ + autoconf \ + libtool \ + cmake \ + gdb \ + strace \ + wget \ + git \ + bzip2 \ + python3 \ + gfortran \ + rdma-core \ + numactl \ + libconfig-dev \ + libuv1-dev \ + libfuse-dev \ + libfuse3-dev \ + libyaml-dev \ + libnl-3-dev \ + libnuma-dev \ + libsensors-dev \ + libcurl4-openssl-dev \ + libjson-c-dev \ + libibverbs-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* diff --git a/docs/software/communication/dockerfiles/libfabric b/docs/software/communication/dockerfiles/libfabric new file mode 100644 index 00000000..0fe4fad4 --- /dev/null +++ b/docs/software/communication/dockerfiles/libfabric @@ -0,0 +1,20 @@ +ARG gdrcopy_version=2.5.1 +RUN git clone --depth 1 --branch v${gdrcopy_version} https://github.com/NVIDIA/gdrcopy.git \ + && cd gdrcopy \ + && export CUDA_PATH=/usr/local/cuda \ + && make CC=gcc CUDA=$CUDA_PATH lib \ + && make lib_install \ + && cd ../ && rm -rf gdrcopy + +# Install libfabric +ARG libfabric_version=1.22.0 +RUN git clone --branch 
v${libfabric_version} --depth 1 https://github.com/ofiwg/libfabric.git \ + && cd libfabric \ + && ./autogen.sh \ + && ./configure --prefix=/usr --with-cuda=/usr/local/cuda --enable-cuda-dlopen \ + --enable-gdrcopy-dlopen --enable-efa \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf libfabric diff --git a/docs/software/communication/dockerfiles/nccl-tests b/docs/software/communication/dockerfiles/nccl-tests new file mode 100644 index 00000000..43447a5f --- /dev/null +++ b/docs/software/communication/dockerfiles/nccl-tests @@ -0,0 +1,7 @@ +ARG nccl_tests_version=2.17.1 +RUN wget -O nccl-tests-${nccl_tests_version}.tar.gz https://github.com/NVIDIA/nccl-tests/archive/refs/tags/v${nccl_tests_version}.tar.gz \ + && tar xf nccl-tests-${nccl_tests_version}.tar.gz \ + && cd nccl-tests-${nccl_tests_version} \ + && MPI=1 make -j$(nproc) \ + && cd .. \ + && rm -rf nccl-tests-${nccl_tests_version}.tar.gz diff --git a/docs/software/communication/dockerfiles/nvshmem b/docs/software/communication/dockerfiles/nvshmem new file mode 100644 index 00000000..d3b03568 --- /dev/null +++ b/docs/software/communication/dockerfiles/nvshmem @@ -0,0 +1,54 @@ +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y \ + python3-venv \ + python3-dev \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* \ + && rm /usr/lib/python3.12/EXTERNALLY-MANAGED + +# Build NVSHMEM from source +ARG nvshmem_version=3.4.5 +RUN wget -q https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/source/nvshmem_src_cuda12-all-all-${nvshmem_version}.tar.gz \ + && tar -xvf nvshmem_src_cuda12-all-all-${nvshmem_version}.tar.gz \ + && cd nvshmem_src \ + && NVSHMEM_BUILD_EXAMPLES=0 \ + NVSHMEM_BUILD_TESTS=1 \ + NVSHMEM_DEBUG=0 \ + NVSHMEM_DEVEL=0 \ + NVSHMEM_DEFAULT_PMI2=0 \ + NVSHMEM_DEFAULT_PMIX=1 \ + NVSHMEM_DISABLE_COLL_POLL=1 \ + NVSHMEM_ENABLE_ALL_DEVICE_INLINING=0 \ + NVSHMEM_GPU_COLL_USE_LDST=0 \ + NVSHMEM_LIBFABRIC_SUPPORT=1 \ + NVSHMEM_MPI_SUPPORT=1 \ + NVSHMEM_MPI_IS_OMPI=1 \ + NVSHMEM_NVTX=1 \ + NVSHMEM_PMIX_SUPPORT=1 \ + NVSHMEM_SHMEM_SUPPORT=1 \ + NVSHMEM_TEST_STATIC_LIB=0 \ + NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ + NVSHMEM_TRACE=0 \ + NVSHMEM_USE_DLMALLOC=0 \ + NVSHMEM_USE_NCCL=1 \ + NVSHMEM_USE_GDRCOPY=1 \ + NVSHMEM_VERBOSE=0 \ + NVSHMEM_DEFAULT_UCX=0 \ + NVSHMEM_UCX_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT=0 \ + NVSHMEM_IBGDA_SUPPORT_GPUMEM_ONLY=0 \ + NVSHMEM_IBDEVX_SUPPORT=0 \ + NVSHMEM_IBRC_SUPPORT=0 \ + LIBFABRIC_HOME=/usr \ + NCCL_HOME=/usr \ + GDRCOPY_HOME=/usr/local \ + MPI_HOME=/usr \ + SHMEM_HOME=/usr \ + NVSHMEM_HOME=/usr \ + cmake . \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -r nvshmem_src nvshmem_src_cuda12-all-all-${nvshmem_version}.tar.gz diff --git a/docs/software/communication/dockerfiles/openmpi b/docs/software/communication/dockerfiles/openmpi new file mode 100644 index 00000000..534ba5df --- /dev/null +++ b/docs/software/communication/dockerfiles/openmpi @@ -0,0 +1,12 @@ +ARG OMPI_VER=5.0.8 +RUN wget -q https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-${OMPI_VER}.tar.gz \ + && tar xf openmpi-${OMPI_VER}.tar.gz \ + && cd openmpi-${OMPI_VER} \ + && ./configure --prefix=/usr --with-ofi=/usr --with-ucx=/usr \ + --enable-oshmem --with-cuda=/usr/local/cuda \ + --with-cuda-libdir=/usr/local/cuda/lib64/stubs \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. 
\ + && rm -rf openmpi-${OMPI_VER}.tar.gz openmpi-${OMPI_VER} diff --git a/docs/software/communication/dockerfiles/osu b/docs/software/communication/dockerfiles/osu new file mode 100644 index 00000000..dadf20d3 --- /dev/null +++ b/docs/software/communication/dockerfiles/osu @@ -0,0 +1,16 @@ +ARG omb_version=7.5.1 +RUN wget -q http://mvapich.cse.ohio-state.edu/download/mvapich/osu-micro-benchmarks-${omb_version}.tar.gz \ + && tar xf osu-micro-benchmarks-${omb_version}.tar.gz \ + && cd osu-micro-benchmarks-${omb_version} \ + && ldconfig /usr/local/cuda/targets/sbsa-linux/lib/stubs \ + && ./configure --prefix=/usr/local CC=$(which mpicc) CFLAGS="-O3 -lcuda -lnvidia-ml" \ + --enable-cuda --with-cuda-include=/usr/local/cuda/include \ + --with-cuda-libpath=/usr/local/cuda/lib64 \ + CXXFLAGS="-lmpi -lcuda" \ + && make -j$(nproc) \ + && make install \ + && ldconfig \ + && cd .. \ + && rm -rf osu-micro-benchmarks-${omb_version} osu-micro-benchmarks-${omb_version}.tar.gz + +WORKDIR /usr/local/libexec/osu-micro-benchmarks/mpi diff --git a/docs/software/communication/dockerfiles/ucx b/docs/software/communication/dockerfiles/ucx new file mode 100644 index 00000000..9ef632ab --- /dev/null +++ b/docs/software/communication/dockerfiles/ucx @@ -0,0 +1,13 @@ +# Install UCX +ARG UCX_VERSION=1.19.0 +RUN wget https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}/ucx-${UCX_VERSION}.tar.gz \ + && tar xzf ucx-${UCX_VERSION}.tar.gz \ + && cd ucx-${UCX_VERSION} \ + && mkdir build \ + && cd build \ + && ../configure --prefix=/usr --with-cuda=/usr/local/cuda --with-gdrcopy=/usr/local \ + --enable-mt --enable-devel-headers \ + && make -j$(nproc) \ + && make install \ + && cd ../.. \ + && rm -rf ucx-${UCX_VERSION}.tar.gz ucx-${UCX_VERSION} diff --git a/docs/software/communication/index.md b/docs/software/communication/index.md index 5d961d77..51ec9cd7 100644 --- a/docs/software/communication/index.md +++ b/docs/software/communication/index.md @@ -1,20 +1,67 @@ [](){#ref-software-communication} # Communication Libraries -CSCS provides common communication libraries optimized for the [Slingshot 11 network on Alps][ref-alps-hsn]. +Communication libraries, like MPI and NCCL, are one of the building blocks for high performance scientific and ML workloads. +Broadly speaking, there are two levels of communication: + +* **Intra-node** communication between two processes on the same node. +* **Inter-node** communication between different nodes, over the [Slingshot 11 network][ref-alps-hsn] that connects nodes on Alps. + +To get the best inter-node performance on Alps, they need to be configured to use the [libfabric][ref-communication-libfabric] library that has an optimised back end for the Slingshot 11 network on Alps. + +As such, communication libraries are part of the "base layer" of libraries and tools used by all workloads to fully utilize the hardware on Alps. +They comprise the *network* layer in the following stack: + +* **CPU**: compilers with support for building applications optimized for the CPU architecture on the node. +* **GPU**: CUDA and ROCM provide compilers and runtime libraries for NVIDIA and AMD GPUs respectively. +* **Network**: libfabric, MPI, NCCL, NVSHMEM, need to be configured for the Slingshot network. + +CSCS provides communication libraries optimised for libfabric and Slingshot in uenv, and guidance on how to create container images that use them. +This section of the documentation provides advice on how to build and install software to use these libraries, and how to deploy them. 
For most scientific applications relying on MPI, [Cray MPICH][ref-communication-cray-mpich] is recommended.
[MPICH][ref-communication-mpich] and [OpenMPI][ref-communication-openmpi] may also be used, with limitations.
Cray MPICH, MPICH, and OpenMPI make use of [libfabric][ref-communication-libfabric] to interact with the underlying network.
-Most machine learning applications rely on [NCCL][ref-communication-nccl] or [RCCL][ref-communication-rccl] for high-performance implementations of collectives.
-NCCL and RCCL have to be configured with a plugin using [libfabric][ref-communication-libfabric] to make full use of the Slingshot network.
+Most machine learning applications rely on [NCCL][ref-communication-nccl] for high-performance implementations of collectives.
+NCCL has to be configured with a plugin using [libfabric][ref-communication-libfabric] to make full use of the Slingshot network.
See the individual pages for each library for information on how to use and best configure the libraries.
-* [Cray MPICH][ref-communication-cray-mpich]
-* [MPICH][ref-communication-mpich]
-* [OpenMPI][ref-communication-openmpi]
-* [NCCL][ref-communication-nccl]
-* [RCCL][ref-communication-rccl]
-* [libfabric][ref-communication-libfabric]
+<div class="grid cards" markdown>
+
+- __Low Level__
+
+    Learn about the low-level networking library libfabric, and how to use it in uenv and containers.
+
+    [:octicons-arrow-right-24: libfabric][ref-communication-libfabric]
+
+</div>
+
+
+- __MPI__
+
+    Cray MPICH is the most optimized and best-tested MPI implementation on Alps, and is the implementation provided in uenv.
+
+    [:octicons-arrow-right-24: Cray MPICH][ref-communication-cray-mpich]
+
+    For compatibility in containers:
+
+    [:octicons-arrow-right-24: MPICH][ref-communication-mpich]
+
+    OpenMPI can also be built in containers or in uenv:
+
+    [:octicons-arrow-right-24: OpenMPI][ref-communication-openmpi]
+
+</div>
+
+ +- __Machine Learning__ + + Communication libraries used by ML tools like Torch, and some simulation codes. + + [:octicons-arrow-right-24: NCCL][ref-communication-nccl] + + [:octicons-arrow-right-24: NVSHMEM][ref-communication-nvshmem] + +
diff --git a/docs/software/communication/libfabric.md b/docs/software/communication/libfabric.md index a8dd80d8..c6403a3a 100644 --- a/docs/software/communication/libfabric.md +++ b/docs/software/communication/libfabric.md @@ -1,24 +1,77 @@ [](){#ref-communication-libfabric} # Libfabric -[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low level networking library that abstracts away various networking backends. -It is used by Cray MPICH, and can be used together with OpenMPI, NCCL, and RCCL to make use of the [Slingshot network on Alps][ref-alps-hsn]. +[Libfabric](https://ofiwg.github.io/libfabric/), or Open Fabrics Interfaces (OFI), is a low-level networking library that provides an abstract interface for networks. +Libfabric has backends for different network types, and is the interface chosen by HPE for the [Slingshot network on Alps][ref-alps-hsn], and by AWS for their [EFA network interface](https://aws.amazon.com/hpc/efa/). +To fully take advantage of the network on Alps: + +* libfabric and its dependencies must be available in your environment (uenv or container); +* and, communication libraries in your environment like Cray MPICH, OpenMPI, NCCL, and NVSHMEM have to be built or configured to use libfabric. + +!!! question "What about UCX?" + [Unified Communication X (UCX)](https://openucx.org/) is a low level library that targets the same layer as libfabric. + Specifically, it provides an open, standards-based, networking API. + By targeting UCX and libfabric, MPI and NCCL do not need to implement low-level support for each network hardware. + + **There is no UCX back end for the Slingshot network on Alps**, and pre-built software (for example conda packages and containers) often provides versions of MPI built for UCX only. + Running these images and packages on Alps will lead to very poor network performance or errors. + +[](){#ref-communication-libfabric-using} ## Using libfabric +[](){#ref-communication-libfabric-uenv} +### uenv + If you are using a uenv provided by CSCS, such as [prgenv-gnu][ref-uenv-prgenv-gnu], [Cray MPICH][ref-communication-cray-mpich] is linked to libfabric and the high speed network will be used. No changes are required in applications. -If you are using containers, the system libfabric can be loaded into your container using the [CXI hook provided by the container engine][ref-ce-cxi-hook]. -Using the hook is essential to make full use of the Alps network. +[](){#ref-communication-libfabric-ce} +### Containers + +The approach is to install libfabric inside the container, along with MPI and NCCL implementations linked against it. +At runtime, the [container engine][ref-container-engine] [CXI hook][ref-ce-cxi-hook] will replace the libfabric libraries inside the container with the corresponding libraries on the host system. +This will ensure access to the Slingshot interconnect. + + +!!! note "Use NVIDIA containers for the gh200 nodes" + Container images provided by NVIDIA, which come with CUDA, NCCL and other commonly used libraries are recommended as the base layer for building a container environment on the [gh200][ref-alps-gh200-node] and [a100][ref-alps-a100-node] nodes. + + The version of CUDA, NCCL and compilers in the container can be used once libfabric has been installed. + Other communication libraries, like MPI and NVSHMEM, provided in the containers can't be used directly. + Instead, they have to be installed in the container and linked against libfabric. +!!! 
example "Installing libfabric in a container for NVIDIA nodes" + The following lines demonstrate how to configure and install libfabric in a Containerfile. + Communication frameworks are built with explicit support for CUDA and GDRCopy. + + Some additional features are enabled to increase the portability of the container to non-Alps systems: + + - The libfabric [EFA](https://aws.amazon.com/hpc/efa/) provider is configured with the `--enable-efa` flag, for compatibility with AWS infrastructure. + - The UCX communication framework is added to facilitate building a broader set of software (e.g. some OpenSHMEM implementations) and for optimized infiniband network support. + + Note that it is assumed that CUDA has already been installed on the system. + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + ``` + + An example Containerfile that installs libfabric in an NVIDIA container can be expanded below: + + ??? note "The full Containerfile for GH200" + The Containerfile below is based on an NVIDIA CUDA image, which provides a complete CUDA installation and NCCL. + + ``` + --8<-- "docs/software/communication/dockerfiles/base" + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + ``` + +[](){#ref-communication-libfabric-performance} ## Tuning libfabric -Tuning libfabric (particularly together with [Cray MPICH][ref-communication-cray-mpich], [OpenMPI][ref-communication-openmpi], [NCCL][ref-communication-nccl], and [RCCL][ref-communication-rccl]) depends on many factors, including the application, workload, and system. +Tuning libfabric (particularly together with [Cray MPICH][ref-communication-cray-mpich], [OpenMPI][ref-communication-openmpi], and [NCCL][ref-communication-nccl]) depends on many factors, including the application, workload, and system. For a comprehensive overview libfabric options for the CXI provider (the provider for the Slingshot network), see the [`fi_cxi` man pages](https://ofiwg.github.io/libfabric/v2.1.0/man/fi_cxi.7.html). Note that the exact version deployed on Alps may differ, and not all options may be applicable on Alps. See the [Cray MPICH known issues page][ref-communication-cray-mpich-known-issues] for issues when using Cray MPICH together with libfabric. - -!!! todo - More options? diff --git a/docs/software/communication/mpich.md b/docs/software/communication/mpich.md index 8e60cb3c..3e8597d6 100644 --- a/docs/software/communication/mpich.md +++ b/docs/software/communication/mpich.md @@ -4,7 +4,17 @@ MPICH is an open-source MPI implementation actively developed in this [github repository](https://github.com/pmodels/mpich). It can be installed inside containers directly from the source code manually, or using Spack or similar package managers. -## MPICH inside containers +[](){#ref-communication-mpich-using} +## Using MPICH + +[](){#ref-communication-mpich-ce} +### uenv + +MPICH is not provided in any uenv images, which instead use the [Cray MPICH][ref-communication-cray-mpich] distribution which is optimised for the Alps network. 
+ +[](){#ref-communication-mpich-ce} +### Containers + MPICH can be built inside containers, however for native Slingshot performance special care has to be taken to ensure that communication is optimal for all cases: * Intra-node communication (this is via shared memory, especially `xpmem`) @@ -136,13 +146,25 @@ They are explicit and building manually the necessary packages, however for prod RUN rm /etc/ld.so.conf.d/cuda_stubs.conf && ldconfig ``` -!!! important "GPU-to-GPU inter-node communication" +Once the container is built and pushed to a registry, one can create a [container environment][ref-container-engine]. + +!!! note "GPU-to-GPU inter-node communication" To make sure that GPU-to-GPU performance is good for inter-node communication one must set the variable ```console $ export MPIR_CVAR_CH4_OFI_ENABLE_HMEM=1 ``` -Once the container is built and pushed to a registry, one can create a [container environment][ref-container-engine]. +!!! note "Use PMI-2" + By default MPICH uses [PMI-2](https://www.mcs.anl.gov/papers/P1760.pdf) for wire-up and communication between ranks. + Hence, when launching containers that use MPICH through Slurm, PMI-2 must be used for application launching. + This is done with the `--mpi` flag of `srun`: + ```bash + srun --mpi=pmi2 ... + ``` + +[](){#ref-communication-mpich-performance} +## MPICH Performance + To verify performance, one can run the `osu_bw` benchmark, which is doing a bandwidth benchmark for different message sizes between two ranks. For reference this is the expected performance for different memory residency, with inter-node and intra-node communication: === "CPU-to-CPU memory intra-node" diff --git a/docs/software/communication/nccl.md b/docs/software/communication/nccl.md index 7a979566..9353ab22 100644 --- a/docs/software/communication/nccl.md +++ b/docs/software/communication/nccl.md @@ -4,8 +4,15 @@ [NCCL](https://developer.nvidia.com/nccl) is an optimized inter-GPU communication library for NVIDIA GPUs. It is commonly used in machine learning frameworks, but traditional scientific applications can also benefit from NCCL. +[](){#ref-communication-nccl-using} ## Using NCCL +!!! info "Further reading" + [_Demystifying NCCL: An In-depth Analysis of GPU Communication Protocols and Algorithms_](https://arxiv.org/abs/2507.04786v2) contains detailed information about NCCL algorithms and protocols, which can be helpful for deciding if your application could benefit from an alternative configuration. + +[](){#ref-communication-nccl-uenv} +### uenv + To use the Slingshot network on Alps, the [`aws-ofi-nccl`](https://github.com/aws/aws-ofi-nccl) plugin must be used. With the container engine, the [AWS OFI NCCL hook][ref-ce-aws-ofi-hook] can be used to load the plugin into the container and configure NCCL to use it. @@ -20,7 +27,74 @@ While the container engine sets these automatically when using the NCCL hook, th --8<-- "docs/software/communication/nccl_env_vars" ``` -[_Demystifying NCCL: An In-depth Analysis of GPU Communication Protocols and Algorithms_](https://arxiv.org/abs/2507.04786v2) contains detailed information about NCCL algorithms and protocols, which can be helpful for deciding if your application could benefit from an alternative configuration. +[](){#ref-communication-nccl-ce} +### Containers + +To use NCCL in a container, we suggest using a container provided by NVIDIA that already contains CUDA and NCCL as the starting point. 
+Then install libfabric as documented in the [libfabric container documentation][ref-communication-libfabric-ce], and use the [AWS OFI hook][ref-ce-aws-ofi-hook] to configure NCCL to use [libfabric][ref-communication-libfabric] optimised for the Alps network. + +!!! example "Installing the NCCL benchmarks in a container for NVIDIA nodes" + To test whether NCCL inside a container has been set up correctly for optimal performance, add the NCCL test suite to the container. + + Use the following as a template for installing the tests: + + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/nccl-tests" + ``` + + Expand the box below to see the full Containerfile that installs the NCCL tests on top of the example in the [libfabric][ref-communication-libfabric-ce] documentation. + + ??? note "The full Containerfile" + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/base" + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + --8<-- "docs/software/communication/dockerfiles/nccl-tests" + ``` + +To use NCCL in a container, enable the [AWS OFI hook][ref-ce-aws-ofi-hook] in the [EDF file][ref-ce-edf-reference]. + +```toml +[env] +PMIX_MCA_psec="native" # (1)! + +[annotations] +com.hooks.aws_ofi_nccl.enabled = "true" # (2)! +com.hooks.aws_ofi_nccl.variant = "cuda12" # (3)! +``` + +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. +2. Enable the AWS OFI plugin. +3. Take care to match the major CUDA version installed in the container. + +Because NCCL uses OpenMPI in the container to perform initial setup, which in turn uses [PMIx](https://pmix.org/) for wire-up, pass the `--mpi=pmix` option to `srun` when launching jobs. + +```console +$ srun --mpi=pmix -n8 -N2 --environment=nccl-test /nccl-tests-2.17.1/build/all_reduce_perf +``` + +[](){#ref-communication-nccl-issues} +## Known issues + +!!! warning "Do not use `NCCL_NET_PLUGIN="ofi"` with uenvs" + NCCL has an alternative way of specifying what plugin to use: `NCCL_NET_PLUGIN`. + When using uenvs, do not set `NCCL_NET_PLUGIN="ofi"` instead of, or in addition to, `NCCL_NET="AWS Libfabric"`. + If you do, your application will fail to start since NCCL will: + + 1. fail to find the plugin because of the name of the shared library in the uenv, and + 2. prefer `NCCL_NET_PLUGIN` over `NCCL_NET`, so it will fail to find the plugin even if `NCCL_NET="AWS Libfabric"` is correctly set. + + When both environment variables are set the error message, with `NCCL_DEBUG=WARN`, will look similar to when the plugin isn't available: + ```console + nid006365:179857:179897 [1] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. + ``` + + With `NCCL_DEBUG=INFO`, NCCL will print: + ```console + nid006365:180142:180163 [0] NCCL INFO NET/Plugin: Could not find: ofi libnccl-net-ofi.so. Using internal network plugin. + ... + nid006365:180142:180163 [0] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. + ``` In addition to the above variables, setting `NCCL_NCHANNELS_PER_NET_PEER` can improve point-to-point performance (operations based directly on send/recv): @@ -42,9 +116,6 @@ The option is undocumented, but [this issue](https://github.com/NVIDIA/nccl/issu export FI_CXI_RDZV_EAGER_SIZE=0 ``` -!!! warning "Using NCCL with uenvs" - The environment variables listed above are not set automatically when using uenvs. - !!! 
warning "GPU-aware MPI with NCCL" Using GPU-aware MPI together with NCCL [can easily lead to deadlocks](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/mpi.html#inter-gpu-communication-with-cuda-aware-mpi). Unless care is taken to ensure that the two methods of communication are not used concurrently, we recommend not using GPU-aware MPI with NCCL. @@ -68,24 +139,120 @@ The option is undocumented, but [this issue](https://github.com/NVIDIA/nccl/issu nid006352:34610:34631 [0] NCCL INFO Using network AWS Libfabric ``` -!!! warning "Do not use `NCCL_NET_PLUGIN="ofi"` with uenvs" - NCCL has an alternative way of specifying what plugin to use: `NCCL_NET_PLUGIN`. - When using uenvs, do not set `NCCL_NET_PLUGIN="ofi"` instead of, or in addition to, `NCCL_NET="AWS Libfabric"`. - If you do, your application will fail to start since NCCL will: +[](){#ref-communication-nccl-performance} +## NCCL Performance - 1. fail to find the plugin because of the name of the shared library in the uenv, and - 2. prefer `NCCL_NET_PLUGIN` over `NCCL_NET`, so it will fail to find the plugin even if `NCCL_NET="AWS Libfabric"` is correctly set. - - When both environment variables are set the error message, with `NCCL_DEBUG=WARN`, will look similar to when the plugin isn't available: +!!! warning "no version information available" + The following warning message was generated by each rank running the benchmarks below, and can safely be ignored. + ``` + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + ``` + +!!! note "Impact of disabling the CXI hook" + On many Alps vClusters, the Container Engine is configured with the [CXI hook][ref-ce-cxi-hook] enabled by default, enabling transparent access to the Slingshot interconnect. + + The inter node tests marked with `(*)` were run with the CXI container hook disabled, to demonstrate the effect of not using an optimised network configuration. + If you see similar performance degradation in your tests, the first thing to investigate is whether your setup is using the libfabric optimised back end. + +Below are the results of of running the collective all reduce latency test on 2 nodes with 8 GPUs total (the `all_reduce_perf` test). + +=== "All-reduce latency" ```console - nid006365:179857:179897 [1] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. 
+ $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 204199 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 204200 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 204201 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 204202 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 155254 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 155255 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 155256 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 155257 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 17.93 0.00 0.00 0 17.72 0.00 0.00 0 + 16 4 float sum -1 17.65 0.00 0.00 0 17.63 0.00 0.00 0 + 32 8 float sum -1 17.54 0.00 0.00 0 17.43 0.00 0.00 0 + 64 16 float sum -1 19.27 0.00 0.01 0 19.21 0.00 0.01 0 + 128 32 float sum -1 18.86 0.01 0.01 0 18.67 0.01 0.01 0 + 256 64 float sum -1 18.83 0.01 0.02 0 19.02 0.01 0.02 0 + 512 128 float sum -1 19.72 0.03 0.05 0 19.40 0.03 0.05 0 + 1024 256 float sum -1 20.35 0.05 0.09 0 20.32 0.05 0.09 0 + 2048 512 float sum -1 22.07 0.09 0.16 0 21.72 0.09 0.17 0 + 4096 1024 float sum -1 31.97 0.13 0.22 0 31.58 0.13 0.23 0 + 8192 2048 float sum -1 37.21 0.22 0.39 0 35.84 0.23 0.40 0 + 16384 4096 float sum -1 37.29 0.44 0.77 0 36.53 0.45 0.78 0 + 32768 8192 float sum -1 39.61 0.83 1.45 0 37.09 0.88 1.55 0 + 65536 16384 float sum -1 61.03 1.07 1.88 0 68.45 0.96 1.68 0 + 131072 32768 float sum -1 81.41 1.61 2.82 0 72.94 1.80 3.14 0 + 262144 65536 float sum -1 127.0 2.06 3.61 0 108.9 2.41 4.21 0 + 524288 131072 float sum -1 170.3 3.08 5.39 0 349.6 1.50 2.62 0 + 1048576 262144 float sum -1 164.3 6.38 11.17 0 187.7 5.59 9.77 0 + 2097152 524288 float sum -1 182.1 11.51 20.15 0 180.6 11.61 20.32 0 + 4194304 1048576 float sum -1 292.7 14.33 25.08 0 295.4 14.20 24.85 0 + 8388608 2097152 float sum -1 344.5 24.35 42.61 0 345.7 24.27 42.47 0 + 16777216 4194304 float sum -1 461.7 36.34 63.59 0 454.0 36.95 64.67 0 + 33554432 8388608 float sum -1 686.5 48.88 85.54 0 686.6 48.87 85.52 0 + 67108864 16777216 float sum -1 1090.5 61.54 107.69 0 1083.5 61.94 108.39 0 + 134217728 33554432 float sum -1 1916.4 70.04 122.57 0 1907.8 70.35 123.11 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 19.7866 + # + # Collective test concluded: all_reduce_perf ``` - - With `NCCL_DEBUG=INFO`, NCCL will print: + +=== "All-reduce latency (*)" ```console - nid006365:180142:180163 [0] NCCL INFO NET/Plugin: Could not find: ofi libnccl-net-ofi.so. Using internal network plugin. - ... - nid006365:180142:180163 [0] net.cc:626 NCCL WARN Error: network AWS Libfabric not found. 
+ $ srun -N2 -t5 --mpi=pmix --ntasks-per-node=4 --environment=nccl-test-ompi /nccl-tests-2.17.1/build/all_reduce_perf -b 8 -e 128M -f 2 + # Collective test starting: all_reduce_perf + # nThread 1 nGpus 1 minBytes 8 maxBytes 134217728 step: 2(factor) warmup iters: 1 iters: 20 agg iters: 1 validation: 1 graph: 0 + # + # Using devices + # Rank 0 Group 0 Pid 202829 on nid005471 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 1 Group 0 Pid 202830 on nid005471 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 2 Group 0 Pid 202831 on nid005471 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 3 Group 0 Pid 202832 on nid005471 device 3 [0039:01:00] NVIDIA GH200 120GB + # Rank 4 Group 0 Pid 154517 on nid005487 device 0 [0009:01:00] NVIDIA GH200 120GB + # Rank 5 Group 0 Pid 154518 on nid005487 device 1 [0019:01:00] NVIDIA GH200 120GB + # Rank 6 Group 0 Pid 154519 on nid005487 device 2 [0029:01:00] NVIDIA GH200 120GB + # Rank 7 Group 0 Pid 154520 on nid005487 device 3 [0039:01:00] NVIDIA GH200 120GB + # + # out-of-place in-place + # size count type redop root time algbw busbw #wrong time algbw busbw #wrong + # (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s) + 8 2 float sum -1 85.47 0.00 0.00 0 53.44 0.00 0.00 0 + 16 4 float sum -1 52.41 0.00 0.00 0 51.11 0.00 0.00 0 + 32 8 float sum -1 50.45 0.00 0.00 0 50.40 0.00 0.00 0 + 64 16 float sum -1 62.58 0.00 0.00 0 50.70 0.00 0.00 0 + 128 32 float sum -1 50.94 0.00 0.00 0 50.77 0.00 0.00 0 + 256 64 float sum -1 50.76 0.01 0.01 0 51.77 0.00 0.01 0 + 512 128 float sum -1 163.2 0.00 0.01 0 357.5 0.00 0.00 0 + 1024 256 float sum -1 373.0 0.00 0.00 0 59.31 0.02 0.03 0 + 2048 512 float sum -1 53.22 0.04 0.07 0 52.58 0.04 0.07 0 + 4096 1024 float sum -1 55.95 0.07 0.13 0 56.63 0.07 0.13 0 + 8192 2048 float sum -1 58.52 0.14 0.24 0 58.62 0.14 0.24 0 + 16384 4096 float sum -1 108.7 0.15 0.26 0 107.8 0.15 0.27 0 + 32768 8192 float sum -1 184.1 0.18 0.31 0 183.5 0.18 0.31 0 + 65536 16384 float sum -1 325.0 0.20 0.35 0 325.4 0.20 0.35 0 + 131072 32768 float sum -1 592.7 0.22 0.39 0 591.5 0.22 0.39 0 + 262144 65536 float sum -1 942.0 0.28 0.49 0 941.4 0.28 0.49 0 + 524288 131072 float sum -1 1143.1 0.46 0.80 0 1138.0 0.46 0.81 0 + 1048576 262144 float sum -1 1502.2 0.70 1.22 0 1478.9 0.71 1.24 0 + 2097152 524288 float sum -1 921.8 2.28 3.98 0 899.8 2.33 4.08 0 + 4194304 1048576 float sum -1 1443.1 2.91 5.09 0 1432.7 2.93 5.12 0 + 8388608 2097152 float sum -1 2437.7 3.44 6.02 0 2417.0 3.47 6.07 0 + 16777216 4194304 float sum -1 5036.9 3.33 5.83 0 5003.6 3.35 5.87 0 + 33554432 8388608 float sum -1 17388 1.93 3.38 0 17275 1.94 3.40 0 + 67108864 16777216 float sum -1 21253 3.16 5.53 0 21180 3.17 5.54 0 + 134217728 33554432 float sum -1 43293 3.10 5.43 0 43396 3.09 5.41 0 + # Out of bounds values : 0 OK + # Avg bus bandwidth : 1.58767 + # + # Collective test concluded: all_reduce_perf ``` - - If you only set `NCCL_NET="ofi"`, NCCL may silently fail to load the plugin but fall back to the default implementation. + diff --git a/docs/software/communication/nvshmem.md b/docs/software/communication/nvshmem.md new file mode 100644 index 00000000..e815c82c --- /dev/null +++ b/docs/software/communication/nvshmem.md @@ -0,0 +1,196 @@ +[](){#ref-communication-nvshmem} +# NVSHMEM + +[NVSHMEM](https://developer.nvidia.com/nvshmem) is a parallel programming interface based on OpenSHMEM that provides efficient and scalable communication for NVIDIA GPU clusters. 
+NVSHMEM creates a global address space for data that spans the memory of multiple GPUs and can be accessed with fine-grained GPU-initiated operations, CPU-initiated operations, and operations on CUDA streams.
+
+[](){#ref-communication-nvshmem-using}
+## Using NVSHMEM
+
+[](){#ref-communication-nvshmem-uenv}
+### uenv
+
+Version 2.8 of the [PyTorch uenv][ref-uenv-pytorch] is currently the only uenv that provides NVSHMEM.
+
+CSCS is working on providing a build of NVSHMEM in uenv that runs efficiently on the Alps network, and will update these docs when it is available.
+
+[](){#ref-communication-nvshmem-ce}
+### Containers
+
+To use NVSHMEM, we recommend first installing OpenMPI with libfabric support in the container, or starting with an image that contains OpenMPI with libfabric.
+
+The image recipe described here is based on the [OpenMPI image for NVIDIA][ref-communication-openmpi], and is thus suited for hosts with NVIDIA GPUs, such as the Alps GH200 nodes.
+
+!!! warning "Be careful with NVSHMEM provided by NVIDIA containers"
+    Containers provided by NVIDIA on NGC typically provide NVSHMEM as part of the NVHPC SDK in the image, however this version is built for and linked against the OpenMPI and UCX installations in the container, which are not compatible with the Slingshot network on Alps.
+
+NVSHMEM is built from source in the container, from a source tarball provided by NVIDIA.
+
+- Notice that NVSHMEM is configured with support for libfabric explicitly enabled: `NVSHMEM_LIBFABRIC_SUPPORT=1`
+- NVSHMEM is built without support for UCX and Infiniband components, because they are not needed on Alps.
+- Since this image uses OpenMPI (which provides PMIx) as the MPI implementation, NVSHMEM is also configured to default to PMIx for bootstrapping (`NVSHMEM_PMIX_SUPPORT=1`).
+
+!!! example "Installing NVSHMEM in a container for NVIDIA nodes"
+    The following example demonstrates how to download and install NVSHMEM from source in a Containerfile.
+
+    ```dockerfile
+    --8<-- "docs/software/communication/dockerfiles/nvshmem"
+    ```
+    !!! note
+        The image also installs the NVSHMEM performance tests, `NVSHMEM_BUILD_TESTS=1`, to demonstrate performance below.
+        The performance tests, in turn, require the installation of Python dependencies.
+        When building images intended solely for production purposes, you may exclude both of these elements.
+
+    Expand the box below to see an example of a complete Containerfile that installs NVSHMEM and all of its dependencies in an NVIDIA container.
+
+    ??? note "The full Containerfile"
+        ```dockerfile
+        --8<-- "docs/software/communication/dockerfiles/base"
+        --8<-- "docs/software/communication/dockerfiles/libfabric"
+        --8<-- "docs/software/communication/dockerfiles/ucx"
+        --8<-- "docs/software/communication/dockerfiles/openmpi"
+        --8<-- "docs/software/communication/dockerfiles/nvshmem"
+        ```
+
+!!! example "Running the NVSHMEM container"
+    The following EDF file sets the required environment variables and container hooks for NVSHMEM.
+    It uses a pre-built container hosted on the [Quay.io](https://quay.io/) registry at the following reference: `quay.io/ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8`.
+
+    ```toml
+    image = "quay.io#ethcscs/nvshmem:3.4.5-ompi5.0.8-ofi1.22-cuda12.8"
+
+    [env]
+    PMIX_MCA_psec="native" # (1)!
+    NVSHMEM_REMOTE_TRANSPORT="libfabric"
+    NVSHMEM_LIBFABRIC_PROVIDER="cxi"
+    NVSHMEM_DISABLE_CUDA_VMM="1" # (2)!
+
+    [annotations]
+    com.hooks.aws_ofi_nccl.enabled = "true" # (3)!
+    com.hooks.aws_ofi_nccl.variant = "cuda12"
+    ```
+
+    1. Ensures PMIx uses the same security domain as Slurm.
+       Otherwise PMIx will print warnings at startup.
+    2. NVSHMEM's `libfabric` transport does not support VMM yet, so VMM must be disabled by setting the environment variable `NVSHMEM_DISABLE_CUDA_VMM=1`.
+    3. NCCL requires the presence of the [AWS OFI NCCL plugin](https://github.com/aws/aws-ofi-nccl) in order to correctly interface with Libfabric and (through the latter) the Slingshot interconnect.
+       Therefore, for optimal performance the [related CE hook][ref-ce-aws-ofi-hook] must be enabled and set to match the CUDA version in the container.
+
+       Libfabric itself is usually injected by the [CXI hook][ref-ce-cxi-hook], which is enabled by default on several Alps vClusters.
+
+    ```bash
+    srun -N2 --ntasks-per-node=4 \
+        --mpi=pmix \ # (1)!
+        --environment=nvshmem \
+        /usr/local/nvshmem/bin/perftest/device/coll/alltoall_latency
+    ```
+
+    1. Since NVSHMEM has been configured in the Containerfile to use PMIx for bootstrapping, the `srun` option `--mpi=pmix` must be used with this image to run multi-rank jobs successfully.
+
+       Other bootstrapping methods (including different PMI implementations) can be specified for NVSHMEM through the related [environment variables](https://docs.nvidia.com/nvshmem/api/gen/env.html#bootstrap-options).
+       When bootstrapping through PMI or MPI via Slurm, ensure that the PMI implementation used by Slurm (i.e. the `srun --mpi` option) matches the one expected by NVSHMEM or the MPI library.
+
+[](){#ref-communication-nvshmem-performance}
+## NVSHMEM Performance
+
+Below are the results of running the `alltoall_latency` benchmark from the NVSHMEM performance tests, built in the example container [above][ref-communication-nvshmem-ce].
+
+```console
+$ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=nvshmem /usr/local/nvshmem/bin/perftest/device/coll/alltoall_latency
+Runtime options after parsing command line arguments
+min_size: 4, max_size: 4194304, step_factor: 2, iterations: 10, warmup iterations: 5, number of ctas: 32, threads per cta: 256 stride: 1, datatype: int, reduce_op: sum, threadgroup_scope: all_scopes, atomic_op: inc, dir: write, report_msgrate: 0, bidirectional: 0, putget_issue :on_stream, use_graph: 0, use_mmap: 0, mem_handle_type: 0, use_egm: 0
+Note: Above is full list of options, any given test will use only a subset of these variables.
+mype: 6 mype_node: 2 device name: NVIDIA GH200 120GB bus id: 1 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +32 8 32-bit thread 116.220796 0.000 0.000 +64 16 32-bit thread 112.700796 0.001 0.000 +128 32 32-bit thread 113.571203 0.001 0.001 +256 64 32-bit thread 111.123204 0.002 0.002 +512 128 32-bit thread 111.075199 0.005 0.004 +1024 256 32-bit thread 110.131204 0.009 0.008 +2048 512 32-bit thread 111.030400 0.018 0.016 +4096 1024 32-bit thread 110.985601 0.037 0.032 +8192 2048 32-bit thread 111.039996 0.074 0.065 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +32 8 32-bit warp 89.801598 0.000 0.000 +64 16 32-bit warp 90.563202 0.001 0.001 +128 32 32-bit warp 89.830399 0.001 0.001 +256 64 32-bit warp 88.863999 0.003 0.003 +512 128 32-bit warp 89.686400 0.006 0.005 +1024 256 32-bit warp 88.908798 0.012 0.010 +2048 512 32-bit warp 88.819200 0.023 0.020 +4096 1024 32-bit warp 89.670402 0.046 0.040 +8192 2048 32-bit warp 88.889599 0.092 0.081 +16384 4096 32-bit warp 88.972801 0.184 0.161 +32768 8192 32-bit warp 89.564800 0.366 0.320 +65536 16384 32-bit warp 89.888000 0.729 0.638 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +32 8 32-bit block 89.747202 0.000 0.000 +64 16 32-bit block 88.086402 0.001 0.001 +128 32 32-bit block 87.254399 0.001 0.001 +256 64 32-bit block 87.401599 0.003 0.003 +512 128 32-bit block 88.095999 0.006 0.005 +1024 256 32-bit block 87.273598 0.012 0.010 +2048 512 32-bit block 88.086402 0.023 0.020 +4096 1024 32-bit block 88.940799 0.046 0.040 +8192 2048 32-bit block 88.095999 0.093 0.081 +16384 4096 32-bit block 87.247998 0.188 0.164 +32768 8192 32-bit block 88.976002 0.368 0.322 +65536 16384 32-bit block 88.121599 0.744 0.651 +131072 32768 32-bit block 90.579200 1.447 1.266 +262144 65536 32-bit block 91.360003 2.869 2.511 +524288 131072 32-bit block 101.145601 5.183 4.536 +1048576 262144 32-bit block 111.052799 9.442 8.262 +2097152 524288 32-bit block 137.164795 15.289 13.378 +4194304 1048576 32-bit block 183.171201 22.898 20.036 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +64 8 64-bit thread 111.955202 0.001 0.001 +128 16 64-bit thread 113.420796 0.001 0.001 +256 32 64-bit thread 108.508801 0.002 0.002 +512 64 64-bit thread 110.204804 0.005 0.004 +1024 128 64-bit thread 109.487998 0.009 0.008 +2048 256 64-bit thread 109.462404 0.019 0.016 +4096 512 64-bit thread 110.156798 0.037 0.033 +8192 1024 64-bit thread 109.401596 0.075 0.066 +16384 2048 64-bit thread 108.591998 0.151 0.132 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +64 8 64-bit warp 88.896000 0.001 0.001 +128 16 64-bit warp 89.679998 0.001 0.001 +256 32 64-bit warp 88.950402 0.003 0.003 +512 64 64-bit warp 89.606398 0.006 0.005 +1024 128 64-bit warp 89.775997 0.011 0.010 +2048 256 64-bit warp 88.838398 0.023 0.020 +4096 512 64-bit warp 90.671998 0.045 0.040 +8192 1024 64-bit warp 89.699203 0.091 0.080 +16384 2048 64-bit warp 89.011198 0.184 0.161 +32768 4096 64-bit warp 89.622402 0.366 0.320 +65536 8192 64-bit warp 88.905603 0.737 0.645 +131072 16384 64-bit warp 89.766401 1.460 1.278 +#alltoall_device +size(B) count type scope latency(us) algbw(GB/s) busbw(GB/s) +64 8 64-bit block 89.788800 0.001 0.001 +128 16 64-bit block 88.012803 0.001 0.001 +256 32 64-bit block 87.353599 0.003 0.003 +512 64 64-bit block 88.000000 0.006 0.005 +1024 128 64-bit block 87.225598 0.012 0.010 +2048 256 64-bit block 87.225598 0.023 0.021 +4096 512 64-bit 
block 87.168002 0.047 0.041 +8192 1024 64-bit block 88.067198 0.093 0.081 +16384 2048 64-bit block 88.863999 0.184 0.161 +32768 4096 64-bit block 88.723201 0.369 0.323 +65536 8192 64-bit block 87.993598 0.745 0.652 +131072 16384 64-bit block 88.783997 1.476 1.292 +262144 32768 64-bit block 91.366398 2.869 2.511 +524288 65536 64-bit block 102.060795 5.137 4.495 +1048576 131072 64-bit block 111.846399 9.375 8.203 +2097152 262144 64-bit block 137.107205 15.296 13.384 +4194304 524288 64-bit block 183.100796 22.907 20.044 +``` diff --git a/docs/software/communication/openmpi.md b/docs/software/communication/openmpi.md index 9c45c0da..018fecce 100644 --- a/docs/software/communication/openmpi.md +++ b/docs/software/communication/openmpi.md @@ -1,22 +1,23 @@ [](){#ref-communication-openmpi} # OpenMPI -[Cray MPICH][ref-communication-cray-mpich] is the recommended MPI implementation on Alps. +[Cray MPICH][ref-communication-cray-mpich] is the recommended MPI implementation on Alps, particularly if you are using [uenv][ref-uenv]. + However, [OpenMPI](https://www.open-mpi.org/) can be used as an alternative in some cases, with limited support from CSCS. +OpenMPI is available for use in both uenv and containers. To use OpenMPI on Alps, it must be built against [libfabric][ref-communication-libfabric] with support for the [Slingshot 11 network][ref-alps-hsn]. +[](){#ref-communication-openmpi-using} ## Using OpenMPI -!!! warning - Building and using OpenMPI on Alps is still [work in progress](https://eth-cscs.github.io/cray-network-stack/). - The instructions found on this page may be inaccurate, but are a good starting point to using OpenMPI on Alps. +[](){#ref-communication-openmpi-uenv} +### uenv -!!! todo - Deploy experimental uenv. +!!! under-construction + Building and using OpenMPI in uenv on Alps is work in progress. -!!! todo - Document OpenMPI uenv next to prgenv-gnu, prgenv-nvfortran, and linalg? + The instructions found on this page may be inaccurate, but are a good starting point to using OpenMPI on Alps. OpenMPI is provided through a [uenv][ref-uenv] similar to [`prgenv-gnu`][ref-uenv-prgenv-gnu]. Once the uenv is loaded, compiling and linking with OpenMPI and libfabric is transparent. @@ -31,9 +32,9 @@ srun --mpi=pmix ... Additionally, the following environment variables should be set: ```bash export PMIX_MCA_psec="native" # (1)! -export FI_PROVIDER="cxi" # (2)! -export OMPI_MCA_pml="^ucx" # (3)! -export OMPI_MCA_mtl="ofi" # (4)! +export FI_PROVIDER="cxi" # (2)! +export OMPI_MCA_pml="^ucx" # (3)! +export OMPI_MCA_mtl="ofi" # (4)! ``` 1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. @@ -50,9 +51,389 @@ export OMPI_MCA_mtl="ofi" # (4)! To use the LINKx provider, set the following, instead of `FI_PROVIDER=cxi`: ```bash - export FI_PROVIDER="lnx" # (1)! + export FI_PROVIDER="lnx" # (1)! export FI_LNX_PROV_LINKS="shm+cxi" # (2)! ``` 1. Use the libfabric LINKx provider, to allow using different libfabric providers for inter- and intra-node communication. 2. Use the shared memory provider for intra-node communication and the CXI (Slingshot) provider for inter-node communication. + +[](){#ref-communication-openmpi-ce} +### Containers + +To install OpenMPI in a container, libfabric (and possibly UCX if the container should be portable to other centers), should be installed. +Then OpenMPI is built, and configured to use at least libfabric. +Note that OpenMPI v5 is the first version with full support for libfabric, required for good performance. 
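+
+As a quick sanity check after the build, you can list the components OpenMPI was compiled with; if libfabric support was picked up, `ofi` components should appear in the output (a sketch, the exact component list depends on the build):
+
+```console
+$ ompi_info | grep -i ofi    # expect ofi entries, e.g. for the mtl and btl frameworks
+```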
+ +!!! note + The version of MPI in the containers provided by NVIDIA is OpenMPI v4 provided by NVIDIA's [HPC-X](https://developer.nvidia.com/networking/hpc-x) toolkit. + This version is not suitable for use on Alps for two reasons: + + * OpenMPI version 5 is required for full libfabric support. + * It is linked against UCX only, and can't be modified to use the system libfabric. + + See the [performance section][ref-communication-openmpi-performance] below for examples of the level of performance loss caused by using HPC-X. + + +!!! example "Installing OpenMPI in a container for NVIDIA nodes" + The following Dockerfile instructions install OpenMPI from source in an Ubuntu image that already contains CUDA, libfabric and UCX. + + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/openmpi" + ``` + + * The `--with-ofi` and `--with-ucx` flags configure OpenMPI with the libfabric and UCX back ends respectively. + * The `--enable-oshmem` flag builds OpenSHMEM as part of the OpenMPI installation, which is useful to support SHMEM implementations like [NVSHMEM][ref-communication-nvshmem]. + + Expand the box below to see an example of a full Containerfile that can be used to create an OpenMPI container on the gh200 nodes of Alps: + + ??? note "The full Containerfile" + This is an example of a complete Containerfile that installs OpenMPI based on the a "base image" that provides gdrcopy, libfabric and UCX on top of an NVIDIA container that provides CUDA: + + ```Dockerfile + --8<-- "docs/software/communication/dockerfiles/base" + --8<-- "docs/software/communication/dockerfiles/libfabric" + --8<-- "docs/software/communication/dockerfiles/ucx" + --8<-- "docs/software/communication/dockerfiles/openmpi" + --8<-- "docs/software/communication/dockerfiles/osu" + ``` + + * The container also installs the [OSU MPI micro-benchmarks](https://mvapich.cse.ohio-state.edu/benchmarks) so that the implementation can be tested. + +The EDF file for the container should contain the following: + +```toml +[env] +PMIX_MCA_psec="native" # (1)! +``` + +1. Ensures PMIx uses the same security domain as Slurm. Otherwise PMIx will print warnings at startup. + +[](){#ref-communication-openmpi-performance} +## OpenMPI Performance + +We present some performance numbers for OpenMPI, obtained using the OSU benchmarks compiled in the above image. + +!!! warning "no version information available" + The following warning message was generated by each rank running the benchmarks below, and can safely be ignored. + ``` + /usr/local/libexec/osu-micro-benchmarks/mpi/./collective/osu_alltoall: /usr/lib/aarch64-linux-gnu/libnl-3.so.200: no version information available (required by /usr/lib64/libcxi.so.1) + ``` + +The first performance benchmarks are for the OSU point-to-point bandwidth test `osu_bw`. + +* inter-node tests place the two ranks on different nodes, so that all communication is over the Slingshot network +* intra-node tests place two ranks on the same node, so that communication is via NVLINK or memory copies in the CPU-CPU case + +!!! note "impact of disabling the CXI hook" + On many Alps vClusters, the Container Engine is configured with the [CXI hook][ref-ce-cxi-hook] enabled by default, enabling transparent access to the Slingshot interconnect. + + The inter node tests marked with `(*)` were run with the CXI container hook disabled, to demonstrate the effect of not using an optimised network configuration. 
+ If you see similar performance degradation in your tests, the first thing to investigate is whether your setup is using the libfabric optimised back end. + +=== "CPU-to-CPU inter-node" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.95 Pass + 2 1.90 Pass + 4 3.80 Pass + 8 7.61 Pass + 16 15.21 Pass + 32 30.47 Pass + 64 60.72 Pass + 128 121.56 Pass + 256 242.28 Pass + 512 484.54 Pass + 1024 968.30 Pass + 2048 1943.99 Pass + 4096 3870.29 Pass + 8192 6972.95 Pass + 16384 13922.36 Pass + 32768 18835.52 Pass + 65536 22049.82 Pass + 131072 23136.20 Pass + 262144 23555.35 Pass + 524288 23758.39 Pass + 1048576 23883.95 Pass + 2097152 23949.94 Pass + 4194304 23982.18 Pass + ``` + +=== "CPU-to-CPU inter-node (*)" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.16 Pass + 2 0.32 Pass + 4 0.65 Pass + 8 1.31 Pass + 16 2.59 Pass + 32 5.26 Pass + 64 10.37 Pass + 128 20.91 Pass + 256 41.49 Pass + 512 74.26 Pass + 1024 123.99 Pass + 2048 213.82 Pass + 4096 356.13 Pass + 8192 468.55 Pass + 16384 505.89 Pass + 32768 549.59 Pass + 65536 2170.64 Pass + 131072 2137.95 Pass + 262144 2469.63 Pass + 524288 2731.85 Pass + 1048576 2919.18 Pass + 2097152 3047.21 Pass + 4194304 3121.42 Pass + ``` + +=== "GPU-to-GPU inter-node" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.90 Pass + 2 1.82 Pass + 4 3.65 Pass + 8 7.30 Pass + 16 14.56 Pass + 32 29.03 Pass + 64 57.49 Pass + 128 118.30 Pass + 256 227.18 Pass + 512 461.26 Pass + 1024 926.30 Pass + 2048 1820.46 Pass + 4096 3611.70 Pass + 8192 6837.89 Pass + 16384 13361.25 Pass + 32768 18037.71 Pass + 65536 22019.46 Pass + 131072 23104.58 Pass + 262144 23542.71 Pass + 524288 23758.69 Pass + 1048576 23881.02 Pass + 2097152 23955.49 Pass + 4194304 23989.54 Pass + ``` + +=== "GPU-to-GPU inter-node (*)" + ```console + $ srun -N2 --mpi=pmix --environment=omb-ompi-no-cxi ./pt2pt/osu_bw --validation D D + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.06 Pass + 2 0.12 Pass + 4 0.24 Pass + 8 0.48 Pass + 16 0.95 Pass + 32 1.91 Pass + 64 3.85 Pass + 128 7.57 Pass + 256 15.28 Pass + 512 19.87 Pass + 1024 53.06 Pass + 2048 97.29 Pass + 4096 180.73 Pass + 8192 343.75 Pass + 16384 473.72 Pass + 32768 530.81 Pass + 65536 1268.51 Pass + 131072 1080.83 Pass + 262144 1435.36 Pass + 524288 1526.12 Pass + 1048576 1727.31 Pass + 2097152 1755.61 Pass + 4194304 1802.75 Pass + ``` + + + +=== "CPU-to-CPU intra-node" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation + # OSU MPI Bandwidth Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Bandwidth (MB/s) Validation + 1 0.96 Pass + 2 1.92 Pass + 4 3.85 Pass + 8 7.68 Pass + 16 15.40 Pass + 32 30.78 Pass + 64 61.26 Pass + 128 122.23 Pass + 256 240.96 Pass + 512 483.12 Pass + 1024 966.52 Pass + 2048 1938.09 Pass + 4096 3873.67 Pass + 8192 7100.56 Pass + 16384 14170.44 Pass + 32768 18607.68 Pass + 65536 21993.95 Pass + 131072 23082.11 Pass + 262144 23546.09 Pass + 524288 23745.05 Pass + 1048576 23879.79 Pass + 2097152 23947.23 Pass + 4194304 23980.15 Pass + ``` + +=== "GPU-to-GPU intra-node" + ```console + $ srun -N1 -n2 --mpi=pmix --environment=omb-ompi ./pt2pt/osu_bw --validation D D + # OSU MPI-CUDA Bandwidth Test v7.5 + # Datatype: MPI_CHAR. + # Size Bandwidth (MB/s) Validation + 1 0.91 Pass + 2 1.83 Pass + 4 3.73 Pass + 8 7.47 Pass + 16 14.99 Pass + 32 29.98 Pass + 64 59.72 Pass + 128 119.13 Pass + 256 241.88 Pass + 512 481.52 Pass + 1024 963.60 Pass + 2048 1917.15 Pass + 4096 3840.96 Pass + 8192 6942.05 Pass + 16384 13911.45 Pass + 32768 18379.14 Pass + 65536 21761.73 Pass + 131072 23069.72 Pass + 262144 23543.98 Pass + 524288 23750.83 Pass + 1048576 23882.44 Pass + 2097152 23951.34 Pass + 4194304 23989.44 Pass + ``` + + +Next is the all to all latency test `osu_alltoall`, for 8 ranks spread over nodes (4 ranks per node, 1 rank per GPU). + +=== "CPU-to-CPU" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 12.46 Pass + 2 12.05 Pass + 4 11.99 Pass + 8 11.84 Pass + 16 11.87 Pass + 32 11.84 Pass + 64 11.95 Pass + 128 12.22 Pass + 256 13.21 Pass + 512 13.23 Pass + 1024 13.37 Pass + 2048 13.52 Pass + 4096 13.88 Pass + 8192 17.32 Pass + 16384 18.98 Pass + 32768 23.72 Pass + 65536 36.53 Pass + 131072 62.96 Pass + 262144 119.44 Pass + 524288 236.43 Pass + 1048576 519.85 Pass + ``` + +=== "CPU-to-CPU (*)" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation + # OSU MPI All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 137.85 Pass + 2 133.47 Pass + 4 134.03 Pass + 8 131.14 Pass + 16 134.45 Pass + 32 135.35 Pass + 64 137.21 Pass + 128 137.03 Pass + 256 139.90 Pass + 512 140.70 Pass + 1024 165.05 Pass + 2048 197.14 Pass + 4096 255.02 Pass + 8192 335.75 Pass + 16384 543.12 Pass + 32768 928.81 Pass + 65536 782.28 Pass + 131072 1812.95 Pass + 262144 2284.26 Pass + 524288 3213.63 Pass + 1048576 5688.27 Pass + ``` + +=== "GPU-to-GPU" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi ./collective/osu_alltoall --validation -d cuda + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. + # Size Avg Latency(us) Validation + 1 22.26 Pass + 2 22.08 Pass + 4 22.15 Pass + 8 22.19 Pass + 16 22.25 Pass + 32 22.11 Pass + 64 22.22 Pass + 128 21.98 Pass + 256 22.19 Pass + 512 22.20 Pass + 1024 22.37 Pass + 2048 22.58 Pass + 4096 22.99 Pass + 8192 27.22 Pass + 16384 28.55 Pass + 32768 32.60 Pass + 65536 44.88 Pass + 131072 70.15 Pass + 262144 123.30 Pass + 524288 234.89 Pass + 1048576 486.89 Pass + ``` + +=== "GPU-to-GPU (*)" + ```console + $ srun -N2 --ntasks-per-node=4 --mpi=pmix --environment=omb-ompi-no-cxi ./collective/osu_alltoall --validation -d cuda + # OSU MPI-CUDA All-to-All Personalized Exchange Latency Test v7.5 + # Datatype: MPI_CHAR. 
+ # Size Avg Latency(us) Validation + 1 186.92 Pass + 2 180.80 Pass + 4 180.72 Pass + 8 179.45 Pass + 16 209.53 Pass + 32 181.73 Pass + 64 182.20 Pass + 128 182.84 Pass + 256 188.29 Pass + 512 189.35 Pass + 1024 237.31 Pass + 2048 231.73 Pass + 4096 298.73 Pass + 8192 396.10 Pass + 16384 589.72 Pass + 32768 983.72 Pass + 65536 786.48 Pass + 131072 1127.39 Pass + 262144 2144.57 Pass + 524288 3107.62 Pass + 1048576 5545.28 Pass + ``` diff --git a/docs/software/communication/rccl.md b/docs/software/communication/rccl.md deleted file mode 100644 index 4e33fb3a..00000000 --- a/docs/software/communication/rccl.md +++ /dev/null @@ -1,14 +0,0 @@ -[](){#ref-communication-rccl} -# RCCL - -[RCCL](https://rocmdocs.amd.com/projects/rccl/en/latest/) is an optimized inter-GPU communication library for AMD GPUs. -It provides equivalent functionality to [NCCL][ref-communication-nccl] for AMD GPUs. - -!!! todo - - high level description - - libfabric/aws-ofi-rccl plugin - - configuration options - -!!! info - RCCL uses many of the same [configuration options as NCCL](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html), with the `NCCL` prefix, not `RCCL`. - Refer to NCCL documentation to tune RCCL. diff --git a/docs/software/index.md b/docs/software/index.md index a20f1955..ec072434 100644 --- a/docs/software/index.md +++ b/docs/software/index.md @@ -28,6 +28,8 @@ These pages provided documentation for all supported software, and installation - :fontawesome-solid-bullhorn: [__Communication Libraries__][ref-software-communication] + [:octicons-arrow-right-24: Cray MPICH][ref-communication-libfabric] + [:octicons-arrow-right-24: Cray MPICH][ref-communication-cray-mpich] [:octicons-arrow-right-24: MPICH][ref-communication-mpich] @@ -36,7 +38,6 @@ These pages provided documentation for all supported software, and installation [:octicons-arrow-right-24: NCCL][ref-communication-nccl] - [:octicons-arrow-right-24: RCCL][ref-communication-rccl] [:octicons-arrow-right-24: libfabric][ref-communication-libfabric] diff --git a/mkdocs.yml b/mkdocs.yml index 8fdd4627..1559e1a1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -109,12 +109,12 @@ nav: - 'netcdf-tools': software/cw/netcdf-tools.md - 'Communication Libraries': - software/communication/index.md + - 'libfabric': software/communication/libfabric.md - 'Cray MPICH': software/communication/cray-mpich.md - 'MPICH': software/communication/mpich.md - 'OpenMPI': software/communication/openmpi.md - 'NCCL': software/communication/nccl.md - - 'RCCL': software/communication/rccl.md - - 'libfabric': software/communication/libfabric.md + - 'NVSHMEM': software/communication/nvshmem.md - 'Commercial software': - software/commercial/index.md - 'Matlab': software/commercial/matlab.md