update Dockerfile for workaround for issue in RCCL for rocm4.0 (#7108)

This commit is contained in:
Suffian Khan 2021-03-23 13:36:04 -07:00 committed by GitHub
parent c0994fdfbb
commit 5cb8934459
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -190,6 +190,7 @@ ENV HSA_NO_SCRATCH_RECLAIM=1
# Environment variables used for distributed training.
# RCCL_ALLTOALL_KERNEL_DISABLE=1 is a workaround for an issue in RCCL on
# ROCm 4.0 (see #7108).
# NOTE(review): presumably removable once a fixed RCCL ships — confirm before dropping.
ENV HSA_FORCE_FINE_GRAIN_PCIE=1 \
    NCCL_DEBUG=INFO \
    RCCL_ALLTOALL_KERNEL_DISABLE=1
# Left disabled; presumably narrows NCCL_DEBUG logging to the listed subsystems — verify before enabling.
# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
WORKDIR ${WORKSPACE_DIR}/script