From a20f8037f6d1e512d33e4e1d6a8598e3e71f2889 Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 16 Sep 2020 09:53:30 -0700
Subject: [PATCH] Install ssh in builder image, fix segfault in TrainingRunnerTest.Basic. (#5186)

---
 dockerfiles/Dockerfile.training                     | 3 ++-
 .../training_ops/cuda/communication/nccl_service.cc | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/dockerfiles/Dockerfile.training b/dockerfiles/Dockerfile.training
index efcb574a31..5c8f0b497c 100644
--- a/dockerfiles/Dockerfile.training
+++ b/dockerfiles/Dockerfile.training
@@ -19,12 +19,13 @@ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as builder
 # set location for builds
 WORKDIR /stage
 
-# install curl and git
+# install curl, git, ssh (required by MPI when running ORT tests)
 RUN apt-get -y update &&\
     apt-get -y --no-install-recommends install \
     curl \
     git \
     language-pack-en \
+    openssh-client \
     unattended-upgrades
 
 # update existing packages to minimize security vulnerabilities
diff --git a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc
index d67c093505..8c4554db80 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc
+++ b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc
@@ -262,7 +262,8 @@ void NcclService::Launch() {
     {
       std::lock_guard guard(mutex_);
       // All tasks must be ready with a valid time.
-      if (time_ > schedule_.size() - 1 ||
+      if (schedule_.empty() ||
+          time_ > schedule_.size() - 1 ||
           !schedule_[time_].IsAllTasksEqueued() ||
           schedule_[time_].IsAllTasksFinished()) {
         continue;
@@ -337,7 +338,7 @@ void NcclService::Terminate() {
   WaitForLaunch();
   {
     std::unique_lock lock(mutex_);
-    cv_.wait(lock, [this] { return total_time_ > 0 && time_ == 0; });
+    cv_.wait(lock, [this] { return schedule_.empty() || total_time_ > 0 && time_ == 0; });
   }
 
   is_running_ = false;