diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml index e474982625..d27918c73b 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml @@ -14,7 +14,7 @@ jobs: --enable_training --enable_training_torch_interop --config $(buildConfig) - --use_cuda --cuda_version=11.1 --cuda_home=/usr/local/cuda-11.1 --cudnn_home=/usr/local/cuda-11.1 + --use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 --build_wheel --enable_nvtx_profile --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70 diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-test-ci-pipeline.yml index 78808e6701..b0313522e1 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-distributed-test-ci-pipeline.yml @@ -17,7 +17,7 @@ jobs: -o ubuntu20.04 -d gpu \ -t onnxruntime_distributed_tests_image \ -x " \ - --use_cuda --cuda_version=11.1 --cuda_home=/usr/local/cuda-11.1 --cudnn_home=/usr/local/cuda-11.1 \ + --use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \ --config RelWithDebInfo \ --enable_training \ --update --build \ diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml index 8cf65d7d0b..9f66116d52 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml @@ -18,7 +18,7 @@ jobs: -t onnxruntime_ortmodule_distributed_tests_image \ -x " \ --config RelWithDebInfo \ - --use_cuda --cuda_version=11.1 --cuda_home=/usr/local/cuda-11.1 --cudnn_home=/usr/local/cuda-11.1 \ + --use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \ --enable_training \ --enable_training_torch_interop \ --update --build \ diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml index 1aa2cd6a09..5e7960661e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-ortmodule-test-ci-pipeline.yml @@ -33,7 +33,7 @@ steps: --volume /bert_data:/bert_data \ --volume /hf_models_cache:/hf_models_cache \ ${{ parameters.DockerImageTag }} \ - bash -c "python3 -m pip uninstall -y -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/requirements.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.1.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt && rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build" \ + bash -c "python3 -m pip uninstall -y -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/requirements.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.3.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt && rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build" \ displayName: 'Run orttraining_ortmodule_tests.py' condition: succeededOrFailed() timeoutInMinutes: 60 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 index 17d7ebc216..d74bf2c740 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2014_training_cuda11_3 @@ -1,4 +1,4 @@ -ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3-cudnn8-devel-centos7 +ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-centos7 ARG POLICY=manylinux2014 ARG PLATFORM=x86_64 ARG DEVTOOLSET_ROOTPATH= @@ -192,7 +192,7 @@ RUN cd /tmp/scripts && \ /tmp/scripts/manylinux/install_centos.sh && \ /tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \ /tmp/scripts/install_ninja.sh && \ - /tmp/scripts/install_python_deps.sh -d gpu -v 11.1 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ + /tmp/scripts/install_python_deps.sh -d gpu -v 11.3 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \ rm -rf /tmp/scripts ARG BUILD_UID=1001 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training index 1b5e6e6a4a..f6742b8c37 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_gpu_training @@ -1,4 +1,4 @@ -ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04 +ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04 FROM $BASEIMAGE diff --git a/tools/ci_build/github/linux/run_dockerbuild.sh b/tools/ci_build/github/linux/run_dockerbuild.sh index d0748e96ae..a54f75d38d 100755 --- a/tools/ci_build/github/linux/run_dockerbuild.sh +++ b/tools/ci_build/github/linux/run_dockerbuild.sh @@ -91,8 +91,9 @@ elif [ $BUILD_DEVICE = "gpu" ]; then if [[ $ORTMODULE_BUILD = true ]]; then INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -u" fi + INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -v 11.3" $GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \ - --docker-build-args="--build-arg BASEIMAGE=nvcr.io/nvidia/cuda:11.1.1-cudnn8-devel-${BUILD_OS} --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA} --network=host" \ + --docker-build-args="--build-arg BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-${BUILD_OS} --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA} --network=host" \ --dockerfile Dockerfile.ubuntu_gpu_training --context . elif [[ $BUILD_DEVICE = "tensorrt"* ]]; then if [ $BUILD_DEVICE = "tensorrt-v7.1" ]; then