Install ssh in builder image, fix segfault in TrainingRunnerTest.Basic. (#5186)

This commit is contained in:
edgchen1 2020-09-16 09:53:30 -07:00 committed by GitHub
parent 400ac85565
commit a20f8037f6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 5 additions and 3 deletions

View file

@ -19,12 +19,13 @@ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04 as builder
# set location for builds
WORKDIR /stage
# install curl and git
# install curl, git, ssh (required by MPI when running ORT tests)
RUN apt-get -y update &&\
apt-get -y --no-install-recommends install \
curl \
git \
language-pack-en \
openssh-client \
unattended-upgrades
# update existing packages to minimize security vulnerabilities

View file

@ -262,7 +262,8 @@ void NcclService::Launch() {
{
std::lock_guard<std::mutex> guard(mutex_);
// All tasks must be ready with a valid time.
if (time_ > schedule_.size() - 1 ||
if (schedule_.empty() ||
time_ > schedule_.size() - 1 ||
!schedule_[time_].IsAllTasksEqueued() ||
schedule_[time_].IsAllTasksFinished()) {
continue;
@ -337,7 +338,7 @@ void NcclService::Terminate() {
WaitForLaunch();
{
std::unique_lock<std::mutex> lock(mutex_);
cv_.wait(lock, [this] { return total_time_ > 0 && time_ == 0; });
cv_.wait(lock, [this] { return schedule_.empty() || total_time_ > 0 && time_ == 0; });
}
is_running_ = false;