diff --git a/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch b/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch
new file mode 100644
index 0000000000..243896ee89
--- /dev/null
+++ b/orttraining/tools/amdgpu/Dockerfile.rocm4.2.pytorch
@@ -0,0 +1,171 @@
+# Build from the directory that contains model/ and script/ (both are added to the image further down):
+#   docker build --network=host --file Dockerfile.rocm4.2.pytorch --tag ort:rocm4.2-pytorch .
+FROM rocm/pytorch:rocm4.2_ubuntu18.04_py3.6_pytorch_1.8.1
+
+# ROCm apt repository: refresh the index before installing gpg-agent, import the signing key over HTTPS, add the 4.2 source
+RUN apt-get -y update && apt-get -y install gpg-agent
+RUN wget -q -O - https://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
+RUN echo 'deb [arch=amd64] http://repo.radeon.com/rocm/apt/4.2/ xenial main' | tee /etc/apt/sources.list.d/rocm.list
+# Update the index and install in the same layer (no stale cached index); autoremove needs -y when run non-interactively
+RUN apt-get -y update && apt-get -y install apt-utils && \
+    apt-get -y install build-essential autotools-dev \
+    make git curl vim wget rsync jq openssh-server openssh-client sudo \
+    iputils-ping net-tools ethtool libcap2 \
+    automake autoconf libtool flex doxygen \
+    perl lsb-release iproute2 pciutils graphviz \
+    bc tar bash pbzip2 pv bzip2 unzip cabextract \
+    g++ gcc \
+    && apt-get -y autoremove
+
+# Make /bin/sh a symlink to bash, so shell-form instructions after this point are run by bash
+RUN ln -sf /bin/bash /bin/sh
+
+# Image metadata
+LABEL "description"="This docker sets up the environment to run ORT Training with AMD GPU"
+
+# CMake: stream the prebuilt Linux x86_64 release tarball straight into /usr/local and put its bin/ first on PATH
+ENV CMAKE_VERSION=3.18.2
+RUN wget -q -O - https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz \
+    | tar -xzf - -C /usr/local
+ENV PATH=/usr/local/cmake-${CMAKE_VERSION}-Linux-x86_64/bin:${PATH}
+
+# Workspace root; WORKDIR creates the directory if it does not exist, so no separate mkdir layer is needed
+ENV WORKSPACE_DIR=/workspace
+WORKDIR $WORKSPACE_DIR
+
+# Infiniband setup, openmpi installed under /usr/mpi/gcc/openmpi-4.0.4rc3 doesn't support multi-thread
+# /usr/bin is put first on PATH with a shell export scoped to the RUN below instead of ENV:
+# an ENV value cannot be unset afterwards ("ENV unset X" just defines a variable named "unset"),
+# so saving and restoring PATH through ENV would leave helper variables in the image environment.
+ENV MOFED_VERSION=5.1-0.6.6.0
+ENV MOFED_OS=ubuntu18.04
+ENV MOFED_FILENAME=MLNX_OFED_LINUX-${MOFED_VERSION}-${MOFED_OS}-x86_64
+# Download, install and delete the bundle in one layer so the unpacked installer is not stored in the image
+RUN export PATH=/usr/bin:${PATH} && \
+    curl -fSsL https://www.mellanox.com/downloads/ofed/MLNX_OFED-${MOFED_VERSION}/${MOFED_FILENAME}.tgz | tar -zxpf - && \
+    cd ${MOFED_FILENAME} && \
+    ./mlnxofedinstall --force --user-space-only --without-fw-update --hpc && \
+    cd .. && \
+    rm -r ${MOFED_FILENAME}
+
+# python env: upgrade setuptools, then install helper packages plus the pinned numpy/onnx (no pip cache kept in the image)
+RUN pip3 install --no-cache-dir --upgrade setuptools
+ARG NUMPY_VERSION=1.18.5
+ARG ONNX_VERSION=1.7.0
+RUN pip3 install --no-cache-dir wheel tqdm boto3 requests six ipdb h5py html2text nltk progressbar pyyaml \
+        git+https://github.com/NVIDIA/dllogger \
+        numpy=="${NUMPY_VERSION}" \
+        onnx=="${ONNX_VERSION}"
+
+# Checkout directory for sources built below; created by the first WORKDIR $GITHUB_DIR, so no mkdir layer is needed
+ENV GITHUB_DIR=$WORKSPACE_DIR/github
+
+# UCX: install libnuma-dev, then build the v${UCX_VERSION} revision from source and install it into ${UCX_DIR}
+WORKDIR ${GITHUB_DIR}
+RUN apt-get update -y && apt-get install -y --no-install-recommends libnuma-dev
+ARG UCX_VERSION=1.9.0-rc3
+ENV UCX_DIR=${WORKSPACE_DIR}/ucx-${UCX_VERSION}
+RUN git clone https://github.com/openucx/ucx.git && \
+    cd ucx && \
+    git checkout "v${UCX_VERSION}" && \
+    ./autogen.sh && \
+    mkdir build && \
+    cd build && \
+    ../contrib/configure-opt --prefix="${UCX_DIR}" --without-rocm --without-knem --without-cuda && \
+    make -j"$(nproc)" && \
+    make install && \
+    cd .. && \
+    rm -rf build
+
+# OpenMPI
+# note: require --enable-orterun-prefix-by-default for Azure machine learning compute
+# note: disable verbs as we use ucx middleware and don't want btl openib warnings
+WORKDIR $GITHUB_DIR
+ARG OPENMPI_BASEVERSION=4.0
+ARG OPENMPI_VERSION=${OPENMPI_BASEVERSION}.5
+ENV OPENMPI_DIR=$WORKSPACE_DIR/openmpi-${OPENMPI_VERSION}
+# Clone directly at the version tag so --recursive resolves submodules for that tag, not for the default branch
+RUN git clone --recursive --branch v$OPENMPI_VERSION https://github.com/open-mpi/ompi.git \
+  && cd ompi \
+  && ./autogen.pl \
+  && mkdir build \
+  && cd build \
+  && ../configure --prefix=$OPENMPI_DIR --with-ucx=$UCX_DIR --without-verbs \
+                  --enable-mpirun-prefix-by-default --enable-orterun-prefix-by-default \
+                  --enable-mca-no-build=btl-uct --disable-mpi-fortran \
+  && make -j"$(nproc)" \
+  && make install \
+  && cd .. \
+  && rm -rf build \
+  && ldconfig \
+  && test -f ${OPENMPI_DIR}/bin/mpic++
+# Expose the build; ':+' avoids a trailing ':' (an empty entry means the current directory) if LD_LIBRARY_PATH was unset
+ENV PATH=$OPENMPI_DIR/bin:${PATH}
+ENV LD_LIBRARY_PATH=$OPENMPI_DIR/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
+
+# Wrap mpirun so --allow-run-as-root is always passed; the wrapper execs the real binary by absolute path (no PATH lookup, no extra shell process)
+RUN mv $OPENMPI_DIR/bin/mpirun $OPENMPI_DIR/bin/mpirun.real && \
+    echo '#!/bin/bash' > $OPENMPI_DIR/bin/mpirun && \
+    echo "exec $OPENMPI_DIR/bin/mpirun.real --allow-run-as-root \"\$@\"" >> $OPENMPI_DIR/bin/mpirun && \
+    chmod a+x $OPENMPI_DIR/bin/mpirun
+
+# install mpi4py from source (--no-binary) so it is compiled with the mpicc on PATH ($OPENMPI_DIR/bin is prepended earlier in this file)
+RUN CC=mpicc MPICC=mpicc pip install --no-cache-dir mpi4py --no-binary mpi4py
+
+# NOTE(review): CACHE_DATA is not referenced by any command; presumably it is bumped to invalidate the build cache and redo the clone - confirm
+ARG CACHE_DATA=2021-05-18
+# ONNX Runtime: clone, build the ROCm training wheel, check that build/Release/onnxruntime_training_bert exists, install the wheel
+WORKDIR ${GITHUB_DIR}
+ENV ORT_DIR=${GITHUB_DIR}/onnxruntime
+RUN git clone --recursive https://github.com/microsoft/onnxruntime.git && \
+    cd onnxruntime && \
+    python3 tools/ci_build/build.py \
+        --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) \
+        --build_dir build \
+        --config Release \
+        --parallel \
+        --skip_tests \
+        --build_wheel \
+        --use_rocm --rocm_home /opt/rocm \
+        --mpi_home ${OPENMPI_DIR} \
+        --nccl_home /opt/rocm \
+        --enable_training && \
+    test -f ${ORT_DIR}/build/Release/onnxruntime_training_bert && \
+    pip install ${ORT_DIR}/build/Release/dist/*.whl && \
+    ldconfig
+
+# Additional Python packages (no versions pinned), installed without keeping the pip cache
+RUN pip3 install --no-cache-dir GPUtil azureml azureml-core datasets tokenizers ninja cerberus sympy sacremoses sacrebleu
+# Hugging Face examples: clone the microsoft/huggingface-transformers repository under ${GITHUB_DIR}
+WORKDIR ${GITHUB_DIR}
+RUN git clone https://github.com/microsoft/huggingface-transformers.git huggingface-transformers
+
+# Enable ssh access without password needed: root login, empty passwords, public keys, relaxed StrictModes
+RUN sed -i -e 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/g' \
+           -e 's/#StrictModes yes/StrictModes no/g' \
+           -e 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/g' \
+           -e 's/#PermitEmptyPasswords no/PermitEmptyPasswords yes/g' /etc/ssh/sshd_config
+
+# Start or restart sshd, then hand over to bash (exec form + exec: bash replaces the launcher shell and receives signals directly)
+ENTRYPOINT ["/bin/bash", "-c", "service ssh restart && exec /bin/bash"]
+
+# Add model and scripts (COPY: a plain local copy, without ADD's archive-extraction and URL behaviour)
+COPY model ${WORKSPACE_DIR}/model
+COPY script ${WORKSPACE_DIR}/script
+RUN chmod a+x ${WORKSPACE_DIR}/script/run_bert.sh
+
+# add locale en_US.UTF-8 (index refreshed in the same layer, so the install does not depend on apt lists from earlier layers)
+RUN apt-get -y update && apt-get install -y locales && \
+    locale-gen en_US.UTF-8
+
+# Work around an issue in the AMD compiler, which generates poor GPU ISA
+# when a kernel parameter is a structure and "pass-by-value" is used
+# ENV HSA_NO_SCRATCH_RECLAIM=1
+
+# Distributed training related environment variables (only the first is set; the commented-out entries stay disabled)
+ENV HSA_FORCE_FINE_GRAIN_PCIE=1
+# ENV NCCL_DEBUG=INFO
+# ENV RCCL_ALLTOALL_KERNEL_DISABLE=1
+# ENV NCCL_DEBUG_SUBSYS=INIT,COLL
+# Containers start in the script directory, which holds run_bert.sh (made executable earlier in this file)
+WORKDIR ${WORKSPACE_DIR}/script