Update orttraining packaging and CI pipelines to use CUDA 11.3 (#10252)

This commit is contained in:
Baiju Meswani 2022-01-13 13:36:33 -08:00 committed by GitHub
parent 4b205eb2b3
commit 2affd6e71e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 9 additions and 8 deletions

View file

@ -14,7 +14,7 @@ jobs:
--enable_training
--enable_training_torch_interop
--config $(buildConfig)
--use_cuda --cuda_version=11.1 --cuda_home=/usr/local/cuda-11.1 --cudnn_home=/usr/local/cuda-11.1
--use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3
--build_wheel
--enable_nvtx_profile
--cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=70

View file

@ -17,7 +17,7 @@ jobs:
-o ubuntu20.04 -d gpu \
-t onnxruntime_distributed_tests_image \
-x " \
--use_cuda --cuda_version=11.1 --cuda_home=/usr/local/cuda-11.1 --cudnn_home=/usr/local/cuda-11.1 \
--use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \
--config RelWithDebInfo \
--enable_training \
--update --build \

View file

@ -18,7 +18,7 @@ jobs:
-t onnxruntime_ortmodule_distributed_tests_image \
-x " \
--config RelWithDebInfo \
--use_cuda --cuda_version=11.1 --cuda_home=/usr/local/cuda-11.1 --cudnn_home=/usr/local/cuda-11.1 \
--use_cuda --cuda_version=11.3 --cuda_home=/usr/local/cuda-11.3 --cudnn_home=/usr/local/cuda-11.3 \
--enable_training \
--enable_training_torch_interop \
--update --build \

View file

@ -33,7 +33,7 @@ steps:
--volume /bert_data:/bert_data \
--volume /hf_models_cache:/hf_models_cache \
${{ parameters.DockerImageTag }} \
bash -c "python3 -m pip uninstall -y -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/requirements.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.1.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt && rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build" \
bash -c "python3 -m pip uninstall -y -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/requirements.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch1.10.0_cu11.3.txt && python3 -m pip install -r /onnxruntime_src/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt && rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw --transformers_cache /hf_models_cache/huggingface/transformers' --cwd /build" \
displayName: 'Run orttraining_ortmodule_tests.py'
condition: succeededOrFailed()
timeoutInMinutes: 60

View file

@ -1,4 +1,4 @@
ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3-cudnn8-devel-centos7
ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-centos7
ARG POLICY=manylinux2014
ARG PLATFORM=x86_64
ARG DEVTOOLSET_ROOTPATH=
@ -192,7 +192,7 @@ RUN cd /tmp/scripts && \
/tmp/scripts/manylinux/install_centos.sh && \
/tmp/scripts/install_os_deps.sh -d gpu $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_ninja.sh && \
/tmp/scripts/install_python_deps.sh -d gpu -v 11.1 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
/tmp/scripts/install_python_deps.sh -d gpu -v 11.3 -p $PYTHON_VERSION -h $TORCH_VERSION $INSTALL_DEPS_EXTRA_ARGS && \
rm -rf /tmp/scripts
ARG BUILD_UID=1001

View file

@ -1,4 +1,4 @@
ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.1.1-cudnn8-devel-ubuntu18.04
ARG BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-ubuntu18.04
FROM $BASEIMAGE

View file

@ -91,8 +91,9 @@ elif [ $BUILD_DEVICE = "gpu" ]; then
if [[ $ORTMODULE_BUILD = true ]]; then
INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -u"
fi
INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -v 11.3"
$GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \
--docker-build-args="--build-arg BASEIMAGE=nvcr.io/nvidia/cuda:11.1.1-cudnn8-devel-${BUILD_OS} --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA} --network=host" \
--docker-build-args="--build-arg BASEIMAGE=nvcr.io/nvidia/cuda:11.3.1-cudnn8-devel-${BUILD_OS} --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA} --network=host" \
--dockerfile Dockerfile.ubuntu_gpu_training --context .
elif [[ $BUILD_DEVICE = "tensorrt"* ]]; then
if [ $BUILD_DEVICE = "tensorrt-v7.1" ]; then