From de3182997b5d61da1891635a922bd47410010fce Mon Sep 17 00:00:00 2001
From: Ernst Hellbar
Date: Fri, 26 Sep 2025 15:51:05 +0200
Subject: [PATCH] dpl-workflow.sh: add env variable to use full MI100 serialization in online

---
 prodtests/full-system-test/dpl-workflow.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh
index 996ab70da8f6c..9fc6ce5507168 100755
--- a/prodtests/full-system-test/dpl-workflow.sh
+++ b/prodtests/full-system-test/dpl-workflow.sh
@@ -278,8 +278,13 @@ if [[ $GPUTYPE == "HIP" ]]; then
     GPU_CONFIG+=" --environment \"ROCR_VISIBLE_DEVICES={timeslice${TIMESLICEOFFSET}}\""
   fi
   # serialization workaround for MI100 nodes: remove it again if the problem will be fixed in ROCm, then also remove the DISABLE_MI100_SERIALIZATION flag in the O2DPG parse script
-  [[ $EPNSYNCMODE == 1 ]] && [[ ${EPN_NODE_MI100:-} == "1" ]] && [[ ${DISABLE_MI100_SERIALIZATION:-0} != 1 ]] && GPU_CONFIG_KEY+="GPU_proc.amdMI100SerializationWorkaround=1;"
-  [[ -n ${OPTIMIZED_PARALLEL_ASYNC:-} ]] && [[ ${EPN_NODE_MI100:-} == "1" ]] && [[ ${DISABLE_MI100_SERIALIZATION:-0} != 1 ]] && GPU_CONFIG_KEY+="GPU_proc.serializeGPU=3;"
+  if [[ ${EPN_NODE_MI100:-} == "1" && ${DISABLE_MI100_SERIALIZATION:-0} != 1 ]]; then
+    if [[ -n ${OPTIMIZED_PARALLEL_ASYNC:-} ]] || [[ $EPNSYNCMODE == 1 && ${FULL_MI100_SERIALIZATION:-0} == 1 ]]; then
+      GPU_CONFIG_KEY+="GPU_proc.serializeGPU=3;"
+    elif [[ $EPNSYNCMODE == 1 ]]; then
+      GPU_CONFIG_KEY+="GPU_proc.amdMI100SerializationWorkaround=1;"
+    fi
+  fi
   #export HSA_TOOLS_LIB=/opt/rocm/lib/librocm-debug-agent.so.2
 else
   GPU_CONFIG_KEY+="GPU_proc.deviceNum=-2;"