From 5cb893445970948fc5b80f5c7d4f6a97d17e477f Mon Sep 17 00:00:00 2001
From: Suffian Khan
Date: Tue, 23 Mar 2021 13:36:04 -0700
Subject: [PATCH] update Dockerfile for workaround for issue in RCCL for rocm4.0 (#7108)

---
Reviewer notes (commentary only; placed between the "---" separator and the
first "diff --git" line so it is not part of the change itself):

  * Purpose: the single hunk below inserts one line,
    `ENV RCCL_ALLTOALL_KERNEL_DISABLE=1`, immediately after
    `ENV NCCL_DEBUG=INFO` in the section commented
    "Distributed training related environment variables" of
    orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch.
  * Why: per the subject line, this is a workaround for an issue in RCCL
    on rocm4.0. The patch carries no further description of the issue.
  * NOTE(review): the variable name suggests it turns off an RCCL
    all-to-all kernel path -- confirm against the RCCL documentation, and
    confirm whether the workaround is still needed on later ROCm releases.
  * No existing line is modified or removed (1 insertion, 0 deletions).

 orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch | 1 +
 1 file changed, 1 insertion(+)

diff --git a/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch b/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch
index c4fb50aa56..c05888bfc4 100644
--- a/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm4.0.pytorch
@@ -190,6 +190,7 @@ ENV HSA_NO_SCRATCH_RECLAIM=1
 # Distributed training related environment variables
 ENV HSA_FORCE_FINE_GRAIN_PCIE=1
 ENV NCCL_DEBUG=INFO
+ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
 # ENV NCCL_DEBUG_SUBSYS=INIT,COLL
 
 WORKDIR ${WORKSPACE_DIR}/script