# onnxruntime/cmake/onnxruntime_python.cmake

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
include(pybind11)
# ---[ Python + Numpy
set(onnxruntime_pybind_srcs_pattern
"${ONNXRUNTIME_ROOT}/python/*.cc"
"${ONNXRUNTIME_ROOT}/python/*.h"
)
if (onnxruntime_ENABLE_TRAINING)
list(APPEND onnxruntime_pybind_srcs_pattern
"${ORTTRAINING_ROOT}/orttraining/python/*.cc"
"${ORTTRAINING_ROOT}/orttraining/python/*.h"
)
endif()
file(GLOB onnxruntime_pybind_srcs CONFIGURE_DEPENDS
${onnxruntime_pybind_srcs_pattern}
)
if(onnxruntime_ENABLE_TRAINING)
list(REMOVE_ITEM onnxruntime_pybind_srcs ${ONNXRUNTIME_ROOT}/python/onnxruntime_pybind_module.cc)
endif()
# Add PyTorch as a library.
if (onnxruntime_ENABLE_LAZY_TENSOR)
# Lazy Tensor requires PyTorch as a library.
list(APPEND CMAKE_PREFIX_PATH ${onnxruntime_PREBUILT_PYTORCH_PATH})
# The following line may change ${CUDA_NVCC_FLAGS} and ${CMAKE_CUDA_FLAGS}
# if PyTorch was built from source. For example,
# pytorch/cmake/public/cuda.cmake and
# pytorch/torch/share/cmake/Caffe2/public/cuda.cmake both define
# ONNX_NAMESPACE in both CUDA_NVCC_FLAGS and CMAKE_CUDA_FLAGS.
# That ONNX_NAMESPACE may later conflict with the ONNX_NAMESPACE set by ORT.
find_package(Torch REQUIRED)
# Remove Torch's ONNX_NAMESPACE definition from the CUDA flags.
list(FILTER CUDA_NVCC_FLAGS EXCLUDE REGEX "-DONNX_NAMESPACE=.+")
string(REGEX REPLACE "-DONNX_NAMESPACE=.+ " " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
endif()
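# A minimal sketch of the scrub above, with an assumed flag value for illustration:
#   set(CMAKE_CUDA_FLAGS "-DONNX_NAMESPACE=onnx_torch -O2")
#   string(REGEX REPLACE "-DONNX_NAMESPACE=.+ " " " CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS}")
#   # CMAKE_CUDA_FLAGS is now " -O2"; ORT is free to define its own ONNX_NAMESPACE.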
# Support ORT as a backend in Pytorch's LazyTensor.
if (onnxruntime_ENABLE_LAZY_TENSOR)
file(GLOB onnxruntime_lazy_tensor_extension_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_ROOT}/orttraining/lazy_tensor/*.cc")
file(GLOB onnxruntime_lazy_tensor_extension_headers CONFIGURE_DEPENDS
"${ORTTRAINING_ROOT}/orttraining/lazy_tensor/*.h")
if(NOT MSVC)
set_source_files_properties(${onnxruntime_lazy_tensor_extension_srcs} PROPERTIES COMPILE_FLAGS -Wno-unused-parameter)
set_source_files_properties(${onnxruntime_lazy_tensor_extension_headers} PROPERTIES COMPILE_FLAGS -Wno-unused-parameter)
endif()
list(APPEND onnxruntime_pybind_srcs
${onnxruntime_lazy_tensor_extension_srcs})
endif()
# onnxruntime_ENABLE_LAZY_TENSOR and onnxruntime_ENABLE_EAGER_MODE
# need the DLPack code to pass tensors across the ORT/PyTorch boundary.
# TODO: consider making the DLPack code a standalone library.
if (onnxruntime_ENABLE_LAZY_TENSOR)
# If DLPack code is not built, add it to ORT's pybind target.
if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
list(APPEND onnxruntime_pybind_srcs
"${ORTTRAINING_ROOT}/orttraining/core/framework/torch/dlpack_python.cc")
endif()
endif()
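# Build the pybind11 extension module from all the sources gathered above.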
onnxruntime_add_shared_library_module(onnxruntime_pybind11_state ${onnxruntime_pybind_srcs})
if(MSVC)
target_compile_options(onnxruntime_pybind11_state PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /utf-8>" "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>")
if(onnxruntime_ENABLE_TRAINING)
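# /bigobj raises MSVC's per-object-file section limit (~65k sections by default),
# which the heavily templated training sources can exceed.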
target_compile_options(onnxruntime_pybind11_state PRIVATE "/bigobj")
endif()
endif()
if(HAS_CAST_FUNCTION_TYPE)
target_compile_options(onnxruntime_pybind11_state PRIVATE "-Wno-cast-function-type")
endif()
# We export symbols via the linker, and the compiler knows nothing about it.
# There is a problem with classes that have pybind types as members.
# See https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
if (NOT MSVC)
target_compile_options(onnxruntime_pybind11_state PRIVATE "-fvisibility=hidden")
endif()
if(onnxruntime_PYBIND_EXPORT_OPSCHEMA)
target_compile_definitions(onnxruntime_pybind11_state PRIVATE onnxruntime_PYBIND_EXPORT_OPSCHEMA)
endif()
if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
#TODO: fix the warnings
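# /wd4244 silences C4244 ("conversion from 'type1' to 'type2', possible loss of data"),
# which 32-bit builds currently trigger.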
target_compile_options(onnxruntime_pybind11_state PRIVATE "/wd4244")
endif()
onnxruntime_add_include_to_target(onnxruntime_pybind11_state Python::Module Python::NumPy)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${pybind11_INCLUDE_DIRS})
if(onnxruntime_USE_CUDA)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${CUDNN_INCLUDE_DIR})
endif()
if(onnxruntime_USE_CANN)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include)
endif()
if(onnxruntime_USE_ROCM)
target_compile_options(onnxruntime_pybind11_state PUBLIC -D__HIP_PLATFORM_AMD__=1 -D__HIP_PLATFORM_HCC__=1)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_ROCM_HOME}/hipfft/include ${onnxruntime_ROCM_HOME}/include ${onnxruntime_ROCM_HOME}/hiprand/include ${onnxruntime_ROCM_HOME}/rocrand/include ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/amdgpu/orttraining)
endif()
if (onnxruntime_USE_NCCL)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${NCCL_INCLUDE_DIRS})
endif()
if(APPLE)
set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker -exported_symbols_list -Xlinker ${ONNXRUNTIME_ROOT}/python/exported_symbols.lst")
elseif(UNIX)
if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script_expose_onnx_protobuf.lds -Xlinker --gc-sections")
else()
set(ONNXRUNTIME_SO_LINK_FLAG "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/python/version_script.lds -Xlinker --gc-sections")
endif()
else()
set(ONNXRUNTIME_SO_LINK_FLAG "-DEF:${ONNXRUNTIME_ROOT}/python/pybind.def")
endif()
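# For reference, a GNU linker version script such as version_script.lds restricts
# the symbols the module exports. Illustrative contents (not the actual file):
#   VERS_1.0 {
#     global:
#       PyInit_onnxruntime_pybind11_state;
#     local:
#       *;
#   };
# Python only needs the module's PyInit_* entry point; hiding everything else keeps
# the dynamic symbol table small and helps --gc-sections drop unused code.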
if (onnxruntime_ENABLE_ATEN)
target_compile_definitions(onnxruntime_pybind11_state PRIVATE ENABLE_ATEN)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${dlpack_SOURCE_DIR}/include)
endif()
if (onnxruntime_ENABLE_TRAINING)
target_include_directories(onnxruntime_pybind11_state PRIVATE ${ORTTRAINING_ROOT})
target_link_libraries(onnxruntime_pybind11_state PRIVATE onnxruntime_training)
endif()
# Eager mode and LazyTensor are both PyTorch backends, so their
# dependencies are set together below.
if (onnxruntime_ENABLE_LAZY_TENSOR)
# Set library dependencies shared by the aforementioned backends.
# TODO: the prebuilt PyTorch may use a different version of the protobuf headers;
# force the build to find the protobuf headers that ORT is using.
target_include_directories(onnxruntime_pybind11_state PRIVATE
"${REPO_ROOT}/cmake/external/protobuf/src"
${TORCH_INCLUDE_DIRS})
# For eager mode, the Torch build pulls in an MKL dependency through Torch's CMake config.
# Link to the Torch libraries directly to avoid this unnecessary MKL dependency.
target_include_directories(onnxruntime_pybind11_state PRIVATE "${TORCH_INSTALL_PREFIX}/include" "${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include")
find_library(LIBTORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib")
find_library(LIBTORCH_CPU_LIBRARY torch_cpu PATHS "${TORCH_INSTALL_PREFIX}/lib")
find_library(LIBC10_LIBRARY c10 PATHS "${TORCH_INSTALL_PREFIX}/lib")
# Explicitly link torch_python to workaround https://github.com/pytorch/pytorch/issues/38122#issuecomment-694203281
find_library(TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib")
target_link_libraries(onnxruntime_pybind11_state PRIVATE ${LIBTORCH_LIBRARY} ${LIBTORCH_CPU_LIBRARY} ${LIBC10_LIBRARY} ${TORCH_PYTHON_LIBRARY})
if (onnxruntime_USE_CUDA)
find_library(LIBTORCH_CUDA_LIBRARY torch_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib")
find_library(LIBC10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib")
target_link_libraries(onnxruntime_pybind11_state PRIVATE ${LIBTORCH_CUDA_LIBRARY} ${LIBC10_CUDA_LIBRARY})
endif()
if (MSVC)
target_compile_options(onnxruntime_pybind11_state PRIVATE "/wd4100" "/wd4324" "/wd4458" "/wd4127" "/wd4193" "/wd4624" "/wd4702")
target_compile_options(onnxruntime_pybind11_state PRIVATE "/bigobj" "/wd4275" "/wd4244" "/wd4267" "/wd4067")
endif()
endif()
target_link_libraries(onnxruntime_pybind11_state PRIVATE
onnxruntime_session
${onnxruntime_libs}
${PROVIDERS_TVM}
${PROVIDERS_NNAPI}
${PROVIDERS_XNNPACK}
${PROVIDERS_COREML}
${PROVIDERS_RKNPU}
${PROVIDERS_DML}
${PROVIDERS_ACL}
${PROVIDERS_ARMNN}
${PROVIDERS_AZURE}
${PROVIDERS_QNN}
onnxruntime_optimizer
onnxruntime_providers
onnxruntime_util
${onnxruntime_tvm_libs}
onnxruntime_framework
onnxruntime_graph
${ONNXRUNTIME_MLAS_LIBS}
onnxruntime_common
onnxruntime_flatbuffers
${pybind11_lib}
)
set(onnxruntime_pybind11_state_dependencies
${onnxruntime_EXTERNAL_DEPENDENCIES}
${pybind11_dep}
)
set_property(TARGET onnxruntime_pybind11_state APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS})
add_dependencies(onnxruntime_pybind11_state ${onnxruntime_pybind11_state_dependencies})
if (MSVC)
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "${ONNXRUNTIME_SO_LINK_FLAG}")
# With MSVC, pybind11 undefines _DEBUG in pybind11/detail/common.h, which causes the pragma in pyconfig.h
# from the Python installation to require the release version of the lib,
# e.g. from a python 3.10 install:
# if defined(_DEBUG)
# pragma comment(lib,"python310_d.lib")
# elif defined(Py_LIMITED_API)
# pragma comment(lib,"python3.lib")
# else
# pragma comment(lib,"python310.lib")
# endif /* _DEBUG */
#
# See https://github.com/pybind/pybind11/issues/3403 for more background info.
#
# Explicitly use the release version of the python library to make the project file consistent with this.
target_link_libraries(onnxruntime_pybind11_state PRIVATE ${Python_LIBRARY_RELEASE})
elseif (APPLE)
set_target_properties(onnxruntime_pybind11_state PROPERTIES LINK_FLAGS "${ONNXRUNTIME_SO_LINK_FLAG} -Xlinker -undefined -Xlinker dynamic_lookup")
set_target_properties(onnxruntime_pybind11_state PROPERTIES
INSTALL_RPATH "@loader_path"
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE)
else()
set_property(TARGET onnxruntime_pybind11_state APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN")
endif()
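# Note: "@loader_path" (macOS) and "$ORIGIN" (Linux) both make the dynamic loader
# search the directory containing onnxruntime_pybind11_state itself, so dependent
# shared libraries shipped next to it (onnxruntime/capi in the wheel layout) are
# found without LD_LIBRARY_PATH/DYLD_LIBRARY_PATH tweaks.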
if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
set(onnxruntime_CUSTOM_EXTERNAL_LIBRARIES "${onnxruntime_EXTERNAL_LIBRARIES}")
list(FIND onnxruntime_CUSTOM_EXTERNAL_LIBRARIES onnx ONNX_INDEX)
list(FIND onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${PROTOBUF_LIB} PROTOBUF_INDEX)
MATH(EXPR PROTOBUF_INDEX_NEXT "${PROTOBUF_INDEX} + 1")
if (ONNX_INDEX GREATER_EQUAL 0 AND PROTOBUF_INDEX GREATER_EQUAL 0)
# Expect protobuf to follow onnx in the list, since onnx depends on it
list(INSERT onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${ONNX_INDEX} "-Wl,--no-as-needed")
list(INSERT onnxruntime_CUSTOM_EXTERNAL_LIBRARIES ${PROTOBUF_INDEX_NEXT} "-Wl,--as-needed")
else()
message(FATAL_ERROR "Required external libraries onnx and protobuf are not found in onnxruntime_EXTERNAL_LIBRARIES")
endif()
target_link_libraries(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUSTOM_EXTERNAL_LIBRARIES})
else()
target_link_libraries(onnxruntime_pybind11_state PRIVATE ${onnxruntime_EXTERNAL_LIBRARIES})
endif()
set_target_properties(onnxruntime_pybind11_state PROPERTIES PREFIX "")
set_target_properties(onnxruntime_pybind11_state PROPERTIES FOLDER "ONNXRuntime")
if(onnxruntime_ENABLE_LTO)
set_target_properties(onnxruntime_pybind11_state PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE)
set_target_properties(onnxruntime_pybind11_state PROPERTIES INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO TRUE)
set_target_properties(onnxruntime_pybind11_state PROPERTIES INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL TRUE)
endif()
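# INTERPROCEDURAL_OPTIMIZATION enables link-time optimization per configuration
# (e.g. -flto with GCC/Clang, /GL plus /LTCG with MSVC).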
if (MSVC)
set_target_properties(onnxruntime_pybind11_state PROPERTIES SUFFIX ".pyd")
else()
set_target_properties(onnxruntime_pybind11_state PROPERTIES SUFFIX ".so")
endif()
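# CPython loads extension modules named <module>.pyd on Windows and <module>.so
# elsewhere; together with the empty PREFIX set above, this yields
# onnxruntime_pybind11_state.pyd / onnxruntime_pybind11_state.so.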
# Generate version_info.py for Windows builds.
# This has to be done before onnxruntime_python_srcs is set.
if (WIN32)
set(VERSION_INFO_FILE "${ONNXRUNTIME_ROOT}/python/version_info.py")
if (onnxruntime_USE_CUDA)
file(WRITE "${VERSION_INFO_FILE}" "use_cuda = True\n")
if(onnxruntime_CUDNN_HOME)
file(GLOB CUDNN_DLL_PATH "${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll")
if (NOT CUDNN_DLL_PATH)
message(FATAL_ERROR "cuDNN not found in ${onnxruntime_CUDNN_HOME}")
endif()
else()
file(GLOB CUDNN_DLL_PATH "${onnxruntime_CUDA_HOME}/bin/cudnn64_*.dll")
if (NOT CUDNN_DLL_PATH)
message(FATAL_ERROR "cuDNN not found in ${onnxruntime_CUDA_HOME}")
endif()
endif()
get_filename_component(CUDNN_DLL_NAME ${CUDNN_DLL_PATH} NAME_WE)
string(REPLACE "cudnn64_" "" CUDNN_VERSION "${CUDNN_DLL_NAME}")
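# Example of the two steps above, assuming cuDNN 9 is installed:
#   cudnn64_9.dll -> NAME_WE -> "cudnn64_9" -> REPLACE -> CUDNN_VERSION "9"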
if(NOT onnxruntime_CUDA_VERSION)
set(onnxruntime_CUDA_VERSION ${CUDAToolkit_VERSION})
message("onnxruntime_CUDA_VERSION=${onnxruntime_CUDA_VERSION}")
endif()
file(APPEND "${VERSION_INFO_FILE}"
"cuda_version = \"${onnxruntime_CUDA_VERSION}\"\n"
"cudnn_version = \"${CUDNN_VERSION}\"\n"
)
else()
file(WRITE "${VERSION_INFO_FILE}" "use_cuda = False\n")
endif()
if ("${MSVC_TOOLSET_VERSION}" STREQUAL "142")
file(APPEND "${VERSION_INFO_FILE}" "vs2019 = True\n")
else()
file(APPEND "${VERSION_INFO_FILE}" "vs2019 = False\n")
endif()
endif()
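# For illustration, a CUDA build with cuDNN 9 and VS2022 would generate a
# version_info.py similar to (values are examples, not fixed):
#   use_cuda = True
#   cuda_version = "12.2"
#   cudnn_version = "9"
#   vs2019 = False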
file(GLOB onnxruntime_backend_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/backend/*.py"
)
if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/*.py"
"${ORTTRAINING_SOURCE_DIR}/python/*.py"
)
else()
file(GLOB onnxruntime_python_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/*.py"
)
endif()
# Generate _pybind_state.py from _pybind_state.py.in, replacing the template macros
# with either sys.setdlopenflags(...) calls or empty strings.
if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
set(ONNXRUNTIME_SETDLOPENFLAGS_GLOBAL "sys.setdlopenflags(os.RTLD_GLOBAL|os.RTLD_NOW|os.RTLD_DEEPBIND)")
set(ONNXRUNTIME_SETDLOPENFLAGS_LOCAL "sys.setdlopenflags(os.RTLD_LOCAL|os.RTLD_NOW|os.RTLD_DEEPBIND)")
else()
set(ONNXRUNTIME_SETDLOPENFLAGS_GLOBAL "")
set(ONNXRUNTIME_SETDLOPENFLAGS_LOCAL "")
endif()
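# RTLD_GLOBAL makes the symbols of the pybind module (including the ONNX/protobuf
# schema registry it links in) visible to libraries dlopen'ed afterwards, which is
# what external custom-op schema libraries rely on; RTLD_DEEPBIND makes each library
# prefer its own symbol definitions over already-loaded global ones.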
if (onnxruntime_ENABLE_LAZY_TENSOR)
# Import torch so that onnxruntime's pybind can see its DLLs.
set(ONNXRUNTIME_IMPORT_PYTORCH_TO_RESOLVE_DLLS "import torch")
else()
set(ONNXRUNTIME_IMPORT_PYTORCH_TO_RESOLVE_DLLS "")
endif()
configure_file(${ONNXRUNTIME_ROOT}/python/_pybind_state.py.in
${CMAKE_BINARY_DIR}/onnxruntime/capi/_pybind_state.py)
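# configure_file copies the .py.in template to the build tree, substituting the
# variables set above. Illustrative expansion, assuming the template references the
# variable as @ONNXRUNTIME_SETDLOPENFLAGS_GLOBAL@:
#   @ONNXRUNTIME_SETDLOPENFLAGS_GLOBAL@
# becomes
#   sys.setdlopenflags(os.RTLD_GLOBAL|os.RTLD_NOW|os.RTLD_DEEPBIND)
# when onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS is ON, and an empty line otherwise.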
if (onnxruntime_ENABLE_TRAINING)
file(GLOB onnxruntime_python_root_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/*.py"
)
file(GLOB onnxruntime_python_amp_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/amp/*.py"
)
file(GLOB onnxruntime_python_experimental_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/experimental/*.py"
)
file(GLOB onnxruntime_python_gradient_graph_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/experimental/gradient_graph/*.py"
)
file(GLOB onnxruntime_python_optim_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/optim/*.py"
)
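# Note: CONFIGURE_DEPENDS asks CMake to re-evaluate these globs at build time
# and re-run the configure step when the matched file set changes, so newly
# added .py files are picked up without a manual reconfigure. (Not every
# generator honors this reliably; Ninja and Makefile generators do.)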
file(GLOB onnxruntime_python_ortmodule_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/*.py"
)
file(GLOB onnxruntime_python_ortmodule_experimental_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/experimental/*.py"
)
file(GLOB onnxruntime_python_ortmodule_experimental_json_config_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/experimental/json_config/*.py"
)
file(GLOB onnxruntime_python_ortmodule_experimental_hierarchical_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/experimental/hierarchical_ortmodule/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/*.py"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/torch_cpp_extensions/aten_op_executor/*"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_torch_interop_utils_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/*"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_torch_gpu_allocator_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/*"
)
file(GLOB onnxruntime_python_ortmodule_torch_cpp_ext_fused_ops_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/*"
)
file(GLOB onnxruntime_python_ortmodule_graph_optimizers_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/graph_optimizers/*"
)
file(GLOB onnxruntime_python_ortmodule_pipe_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ortmodule/experimental/pipe/*"
)
file(GLOB onnxruntime_python_ort_triton_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ort_triton/*.py"
)
file(GLOB onnxruntime_python_ort_triton_kernel_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/ort_triton/kernel/*.py"
)
file(GLOB onnxruntime_python_utils_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/utils/*.py"
)
file(GLOB onnxruntime_python_utils_data_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/utils/data/*"
)
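# training/utils/hooks ships the activation-statistics subscribers
# (SubscriberManager, StatisticsSubscriber) and the merge_activation_summary
# tool used to compare per-step summaries when debugging ORTModule
# convergence against a PyTorch baseline.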
file(GLOB onnxruntime_python_utils_hooks_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/utils/hooks/*"
)
if (onnxruntime_ENABLE_TRAINING_APIS)
file(GLOB onnxruntime_python_onnxblock_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/onnxblock/*"
)
file(GLOB onnxruntime_python_onnxblock_loss_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/onnxblock/loss/*"
)
file(GLOB onnxruntime_python_api_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/api/*"
)
file(GLOB onnxruntime_python_onnxblock_optim_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_SOURCE_DIR}/python/training/onnxblock/optim/*"
)
endif()
endif()
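# The glob lists above are only collected here; a later section of this file
# stages them into the Python package layout next to the built pybind module.
# A minimal sketch of that pattern (illustrative, not the verbatim commands
# used below; the optim destination directory is assumed):
#
#   add_custom_command(
#     TARGET onnxruntime_pybind11_state POST_BUILD
#     COMMAND ${CMAKE_COMMAND} -E make_directory
#             $<TARGET_FILE_DIR:onnxruntime_pybind11_state>/onnxruntime/training/optim/
#     COMMAND ${CMAKE_COMMAND} -E copy ${onnxruntime_python_optim_srcs}
#             $<TARGET_FILE_DIR:onnxruntime_pybind11_state>/onnxruntime/training/optim/
#   )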
if (onnxruntime_BUILD_UNIT_TESTS)
file(GLOB onnxruntime_python_test_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/test/python/*.py"
"${ORTTRAINING_SOURCE_DIR}/test/python/*.py"
"${ORTTRAINING_SOURCE_DIR}/test/python/*.json"
)
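# These test sources are likewise staged beside the build output later in
# this file so the Python test suites can run directly from the build tree.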
file(GLOB onnxruntime_python_quantization_test_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/test/python/quantization/*.py"
)
file(GLOB onnxruntime_python_transformers_test_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/test/python/transformers/*.py"
)
file(GLOB onnxruntime_python_transformers_testdata_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/test/python/transformers/test_data/models/*.onnx"
)
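# The Whisper models below back the attention/multi-head-attention fusion
# tests for the Whisper export path.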
file(GLOB onnxruntime_python_transformers_testdata_whisper CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/test/python/transformers/test_data/models/whisper/*.onnx"
)
file(GLOB onnxruntime_python_transformers_testdata_conformer CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/test/python/transformers/test_data/models/conformer/*.onnx"
)
endif()
file(GLOB onnxruntime_python_tools_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/*.py"
)
file(GLOB onnxruntime_python_quantization_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/quantization/*.py"
)
file(GLOB onnxruntime_python_quantization_operators_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/quantization/operators/*.py"
)
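# CalTableFlatBuffers contains FlatBuffers-generated Python bindings used by
# the quantization tool to serialize calibration tables.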
file(GLOB onnxruntime_python_quantization_cal_table_flatbuffers_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/quantization/CalTableFlatBuffers/*.py"
)
file(GLOB onnxruntime_python_quantization_fusions_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/quantization/fusions/*.py"
)
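# execution_providers/qnn provides QNN-EP-specific quantization helpers such
# as get_qnn_qdq_config, which pairs with quantize() to produce QDQ models
# suited to QNN.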
file(GLOB onnxruntime_python_quantization_ep_qnn_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/quantization/execution_providers/qnn/*.py"
)
file(GLOB onnxruntime_python_transformers_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/*.py"
)
file(GLOB onnxruntime_python_transformers_models_bart_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/bart/*.py"
)
file(GLOB onnxruntime_python_transformers_models_bert_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/bert/*.py"
)
file(GLOB onnxruntime_python_transformers_models_gpt2_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/gpt2/*.py"
)
file(GLOB onnxruntime_python_transformers_models_llama_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/llama/*.py"
)
file(GLOB onnxruntime_python_transformers_models_longformer_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/longformer/*.py"
)
file(GLOB onnxruntime_python_transformers_models_phi2_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/phi2/*.py"
)
file(GLOB onnxruntime_python_transformers_models_stable_diffusion_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/stable_diffusion/*.py"
)
file(GLOB onnxruntime_python_transformers_models_t5_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/t5/*.py"
)
Signed-off-by: George Nash <george.nash@intel.com> * Code changes to address code review - Simplify iteration code using `auto` keyword - remove C style cast that was not needed - remove instance variable that was not needed [relugrad.h] - added the execution providers to `ComputeGradientErrorInternal()` and `ComputeTheoreticalJacobianTranspose()` instead of using a pointer to an instance varaible [gradient_checker.h/.cc] Signed-off-by: George Nash <george.nash@intel.com> * Combined the default gradient ops test and dnnl gradient ops test for ConvGrad and MaxPoolGrad into one function with the help of a helper function. This will reduce repeated code. Signed-off-by: Palangotu Keshava, Chethan's avatarChethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> * Replaced the stack used by convgrad to vector so that the vector(used as stack) can be easily cleared everytime the graph is created. This will prevent memory leak from convolution kernels being pushed constantly onto the stack. Signed-off-by: chethan.palangotu.keshava@intel.com * Code clean up and formating updates - Removed empty else statment - updated indentation of code that was causing double curly brackets to look unususal - Changed check for NumDimensions to Size in Relu and ReluGrad error checking code. - isolated training code Signed-off-by: George Nash <george.nash@intel.com> * Restore inadvertantly removed ConvGrad tests When combining the DNNL and CPU version of the ConvGrad tests two test were inadvertantly excluded. This adds back the Conv3d and Conv3d with strides test cases. Signed-off-by: George Nash <george.nash@intel.com> * Add validation to ConvGrad This validates the dimensions of the ConvGrad match the passed in Convolution forward primitive description. The current code for DNNL ConvGrad makes the assumption that the ConvGrad nodes will be visited in the reverse order from the corresponding Conv nodes The added validation will return an error if this assumption is not true. Signed-off-by: George Nash <george.nash@intel.com> * Do not create new execution providers in provider_test_utils This removes the code that generated new execution providers in the OpTester::Run function. This was added because the std::move was leaving the `entry` value empty so subsequent calls would cause a segfault. Problem is this potentially changed the execution_provider because it would create the default provider dropping any custom arguments. When the now removed code was originally added the std::move was causing crashes when the GradientChecker unit tests were run. However, it is no longer causing problems even with the code removed. Signed-off-by: George Nash <george.nash@intel.com> * Change the forward conv stack to a forward conv map This changes how the forward conv kernel is mapped to the bwd ConvGrad kernel the problematic stack is no longer used. The convolution stack made the assumption that the corresponding ConvGrad operator would be visited in reverse order of the forward Conv operators. This was always problematic and was unlikely to work for inception models. Important changes: - The weight_name is added to the ConvGrad dnnl_node making it possible to use the weight_name as a lookup key to find the Conv forward Kernel - the `std::vector fwd_conv_stack_` has been replaced with a `std::map fwd_conv_kernel_map_` - Although it is not needed lock_guards were added when writing to and reading from the fwd_conv_kernel_map_ as well as the fwd_kernel_map_. 
These should always be accessed by a single thread when preparing the dnnl subgraphs so the guard should not be needed but its added just in case. - Updated the comments ConvGrad.h code to no longer mention the stack. The error check is not removed. It will be good to verify there are no errors as we continue to test against more models. Signed-off-by: George Nash <george.nash@intel.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> Co-authored-by: unknown <63478620+jeyblu@users.noreply.github.com> * Lochi/refactor yolov3 quantization (#6290) * Refactor the code and move data reader, preprocessing, evaluation to E2E_example_mode * Refactor the code. Move data reader, preprocessing, evaluation to model specific example under E2E_example_mode * refactor code * Move yolov3 example to specific folder and add additional pre/post processing * Print a warning message for using newer c_api header on old binary (#6507) * Fix issues with ArmNN build setup (#6495) * ArmNN build fixes * Update BUILD.md to document that the ACL paths must be specified to build ArmNN * Fix CUDA build error. We don't setup the link libraries correctly/consistently so improve that. * Fix Windows CI builds by updating test scripts to work with numpy 1.20. (#6518) * Update onnxruntime_test_python.py to work with numpy 1.20. Some aliases are deprecated in favor of the built-in python types. See https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations np.array with bytes for entries and dtype of np.void no longer automatically pads. Change a test to adjust for that. * Fix another test script * Fix ORTModule branch for orttraining-* pipelines * Update pytorch nightly version dependency Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: George Wu <jywu@microsoft.com> Co-authored-by: Cecilia Liu <ziyue.liu7@gmail.com> Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Co-authored-by: George Nash <george.nash@intel.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Yateng Hong <toothache9010@gmail.com> Co-authored-by: stevenlix <38092805+stevenlix@users.noreply.github.com> Co-authored-by: Derek Murray <Derek.Murray@microsoft.com> Co-authored-by: ashbhandare <ash.bhandare@gmail.com> Co-authored-by: Scott McKay <skottmckay@gmail.com> Co-authored-by: Changming Sun <chasun@microsoft.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: Juliana Franco <jufranc@microsoft.com> Co-authored-by: Pranav Sharma <prs@microsoft.com> Co-authored-by: Tixxx <tix@microsoft.com> Co-authored-by: Jay Rodge <jayrodge@live.com> Co-authored-by: Du Li <duli1@microsoft.com> Co-authored-by: Du Li <duli@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Yufeng Li <liyufeng1987@gmail.com> Co-authored-by: baijumeswani <bmeswani@microsoft.com> Co-authored-by: Sergii Dymchenko <sedymche@microsoft.com> Co-authored-by: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Co-authored-by: satyajandhyala <satya.k.jandhyala@gmail.com> Co-authored-by: S. 
Manohar Karlapalem <manohar.karlapalem@intel.com> Co-authored-by: Weixing Zhang <weixingzhang@users.noreply.github.com> Co-authored-by: Suffian Khan <sukha@microsoft.com> Co-authored-by: Olivia Jain <oljain@microsoft.com> Co-authored-by: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Co-authored-by: Hariharan Seshadri <shariharan91@gmail.com> Co-authored-by: Ryan Lai <rylai@microsoft.com> Co-authored-by: Jesse Benson <jesseb@microsoft.com> Co-authored-by: sfatimar <64512376+sfatimar@users.noreply.github.com> Co-authored-by: suryasidd <surya.siddharth.pemmaraju@intel.com> Co-authored-by: sfatimar <sahar.fatima@intel/com> Co-authored-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: mohdansx <mohdx.ansari@intel.com> Co-authored-by: Xavier Dupré <xadupre@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin@vols.utk.edu> Co-authored-by: Michael Giba <michaelgiba@gmail.com> Co-authored-by: William Tambellini <wtambellini@sdl.com> Co-authored-by: Hector Li <hecli@microsoft.com> Co-authored-by: Aishwarya <aibhanda@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: liqunfu <liqfu@microsoft.com> Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: pengwa <pengwa@microsoft.com> Co-authored-by: Tang, Cheng <souptc@gmail.com> Co-authored-by: Cheng Tang <chenta@microsoft.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Chun-Wei Chen <jacky82226@gmail.com> Co-authored-by: Vincent Wang <wangwchpku@outlook.com> Co-authored-by: Vincent Wang <weicwang@microsoft.com> Co-authored-by: Luyao Ren <375833274@qq.com> Co-authored-by: Zhang Lei <zhang.huanning@hotmail.com> Co-authored-by: Tim Harris <tiharr@microsoft.com> Co-authored-by: Ashwini Khade <askhade@microsoft.com> Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com> Co-authored-by: Alberto Magni <49027342+alberto-magni@users.noreply.github.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: wezuo <49965641+wezuo@users.noreply.github.com> Co-authored-by: Jesse Benson <benson.jesse@gmail.com> Co-authored-by: Wei Zuo <wezuo@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-mgtbby.eastus.cloudapp.azure.com> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-yclzsf.eastus.cloudapp.azure.com> Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Co-authored-by: Martin Man <supermt@gmail.com> Co-authored-by: M. 
Zeeshan Siddiqui <mzs@microsoft.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Ubuntu <OrtTrainingDev3@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sheil Kumar <smk2007@gmail.com> Co-authored-by: Sheil Kumar <sheilk@microsoft.com> Co-authored-by: Ryota Tomioka <ryoto@microsoft.com> Co-authored-by: Adam Pocock <adam.pocock@oracle.com> Co-authored-by: Yulong Wang <f.s@qq.com> Co-authored-by: Faith Xu <faxu@microsoft.com> Co-authored-by: Xiang Zhang <xianz@microsoft.com> Co-authored-by: suryasidd <48925384+suryasidd@users.noreply.github.com> Co-authored-by: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> Co-authored-by: unknown <63478620+jeyblu@users.noreply.github.com>
2021-02-02 16:59:56 +00:00
)
file(GLOB onnxruntime_python_transformers_models_whisper_src CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/tools/transformers/models/whisper/*.py"
)
file(GLOB onnxruntime_python_datasets_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/datasets/*.py"
)
file(GLOB onnxruntime_python_datasets_data CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/datasets/*.pb"
"${ONNXRUNTIME_ROOT}/python/datasets/*.onnx"
)
# ORT Mobile helpers to convert an ONNX model to ORT format, analyze a model's suitability for
# mobile scenarios, and assist with export from PyTorch.
set(onnxruntime_mobile_util_srcs
${REPO_ROOT}/tools/python/util/check_onnx_model_mobile_usability.py
${REPO_ROOT}/tools/python/util/convert_onnx_models_to_ort.py
${REPO_ROOT}/tools/python/util/file_utils.py
${REPO_ROOT}/tools/python/util/logger.py
${REPO_ROOT}/tools/python/util/make_dynamic_shape_fixed.py
${REPO_ROOT}/tools/python/util/onnx_model_utils.py
${REPO_ROOT}/tools/python/util/optimize_onnx_model.py
${REPO_ROOT}/tools/python/util/pytorch_export_helpers.py
${REPO_ROOT}/tools/python/util/reduced_build_config_parser.py
${REPO_ROOT}/tools/python/util/update_onnx_opset.py
)
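# Once packaged, these helpers are runnable as modules (a sketch; exact flags vary by script), e.g.:
#   python -m onnxruntime.tools.convert_onnx_models_to_ort /path/to/model.onnx
#   python -m onnxruntime.tools.check_onnx_model_mobile_usability /path/to/model.onnx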
file(GLOB onnxruntime_ort_format_model_srcs CONFIGURE_DEPENDS
${REPO_ROOT}/tools/python/util/ort_format_model/*.py
)
file(GLOB onnxruntime_mobile_helpers_srcs CONFIGURE_DEPENDS
${REPO_ROOT}/tools/python/util/mobile_helpers/*.py
${REPO_ROOT}/tools/ci_build/github/android/nnapi_supported_ops.md
${REPO_ROOT}/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
${REPO_ROOT}/tools/ci_build/github/apple/coreml_supported_neuralnetwork_ops.md
)
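# The NNAPI/CoreML supported-ops markdown files are packaged alongside the helper scripts; the
# mobile usability checker reads them to report per-EP operator coverage.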
file(GLOB onnxruntime_qdq_helper_srcs CONFIGURE_DEPENDS
${REPO_ROOT}/tools/python/util/qdq_helpers/*.py
)
if (onnxruntime_USE_OPENVINO)
file(GLOB onnxruntime_python_openvino_python_srcs CONFIGURE_DEPENDS
${REPO_ROOT}/tools/python/util/add_openvino_win_libs.py
)
endif()
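# add_openvino_win_libs.py is packaged only in OpenVINO EP builds. It lets users of the
# onnxruntime-openvino wheel on Windows put the OpenVINO DLLs from the openvino pypi package
# on the DLL search path:
#   import onnxruntime.tools.add_openvino_win_libs as utils
#   utils.add_openvino_libs_to_path()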
set(build_output_target onnxruntime_common)
if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS)
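# Stage a runnable Python package layout under the build output directory
# ($<TARGET_FILE_DIR:...> of ${build_output_target}) by creating the package tree and
# copying the scripts, data files, and the pybind module into it.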
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/backend
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/training
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/datasets
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/mobile_helpers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/qdq_helpers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model/ort_flatbuffers_py
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bart
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bert
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/gpt2
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/llama
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/longformer
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/phi2
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/stable_diffusion
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/t5
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/whisper
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/operators
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/CalTableFlatBuffers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/fusions
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/execution_providers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/execution_providers/qnn
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/quantization
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/transformers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/transformers/test_data/models
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/transformers/test_data/models/whisper
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/eager_test
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/transformers/test_data/models/conformer
COMMAND ${CMAKE_COMMAND} -E copy
${ONNXRUNTIME_ROOT}/__init__.py
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
COMMAND ${CMAKE_COMMAND} -E copy
${REPO_ROOT}/requirements.txt
$<TARGET_FILE_DIR:${build_output_target}>
COMMAND ${CMAKE_COMMAND} -E copy
${REPO_ROOT}/ThirdPartyNotices.txt
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
COMMAND ${CMAKE_COMMAND} -E copy
${REPO_ROOT}/docs/Privacy.md
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
COMMAND ${CMAKE_COMMAND} -E copy
${REPO_ROOT}/LICENSE
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_backend_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/backend/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${CMAKE_BINARY_DIR}/onnxruntime/capi/_pybind_state.py
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
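# _pybind_state.py is generated into the build tree at configure time; copy_if_different keeps
# the staged copy in sync while avoiding needless timestamp churn.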
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_pybind11_state>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_datasets_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/datasets/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_datasets_data}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/datasets/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_tools_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_mobile_util_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/
# Append the /tools/python/util imports to the __init__.py that came from /onnxruntime/tools.
# We're aggregating scripts from two different locations, but only include selected functionality
# from /tools/python/util. Due to that we take the full __init__.py from /onnxruntime/tools and
# append the required content from /tools/python/util/__init__append.py.
COMMAND ${CMAKE_COMMAND} -E cat
${REPO_ROOT}/tools/python/util/__init__append.py >>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/__init__.py
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_qdq_helper_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/qdq_helpers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_mobile_helpers_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/mobile_helpers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_ort_format_model_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model/
COMMAND ${CMAKE_COMMAND} -E copy_directory
${ONNXRUNTIME_ROOT}/core/flatbuffers/ort_flatbuffers_py
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/ort_format_model/ort_flatbuffers_py
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_quantization_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_quantization_operators_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/operators/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_quantization_cal_table_flatbuffers_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/CalTableFlatBuffers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_quantization_fusions_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/fusions/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_quantization_ep_qnn_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/quantization/execution_providers/qnn/
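# Among these QNN quantization helpers is get_qnn_qdq_config (added in #18465):
#   from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config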
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_bart_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bart/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_bert_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/bert/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_gpt2_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/gpt2/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_llama_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/llama/
* Pipeline Parallel Experimental Python API (#5815) * Add create session to WinML telemetry to track WinML Usage (#6356) * Fix one more SDL warning (#6359) * fix -Wdangling-gsl (#6357) * Add python example of TensorRT INT8 inference on ResNet model (#6255) * add trt int8 example on resnet model * Update e2e_tensorrt_resnet_example.py * remove keras dependency and update class names * move ImageNetDataReader and ImageClassificationEvaluator to tensorrt resnet example * simplify e2e_tensorrt_resnet_example.py * Update preprocessing.py * merge tensorrt_calibrate * Update calibrate.py * Update calibrate.py * generalize calibrate * Update calibrate.py * fix issues * fix formating * remove augment_all * This added telemetry isn't needed (#6363) * Wezuo/memory analysis (#5658) * merged alloc_plan * pass compilation * Start running, incorrect allocation memory info * add in comments * fix a bug of recording pattern too early. * debugging lifetime * fix lifetime * passed mnist * in process of visualization * Add code to generate chrome trace for allocations. * in process of collecting fragmentation * before rebuild * passed mnist * passed bert tiny * fix the inplace reuse * fix the exception of weight in pinned memory * add guards to ensure the tensor is in AllocPlan * add customized profiling * debugging * debugging * fix the reuse of differnt location type * add rank * add the rank * add fragmentation * add time_step_trace * Add summary for each execution step (total bytes, used/free bytes). * add top k * change type of top k parameter * remove prints * change heap to set{ * add the name pattern * add the useage for pattern * add partition * change to static class * add custom group * remove const * update memory_info * in process of adding it as runtime config * change the memory profiling to be an argument * add some comments * add checks to recored meomry_info in traaining session * set the "local rank setting" to correct argument. * addressing comments * format adjustment * formatting * remove alloc_interval * update memory_info.cc to skip session when there is no tensor for a particular memory type * fix memory_info multiple iteration seg-fault * consolidate mainz changes * fixed some minor errors * guard by ORT_MINIMAL_BUILD * add ORT_MEMORY_PROFILE flag * added compiler flag to turn on/off memory profiling related code * clean up the code regarding comments * add comments * revoke the onnx version * clean up the code to match master * clean up the code to match master * clean up the code to match master Co-authored-by: Jesse Benson <benson.jesse@gmail.com> Co-authored-by: Wei Zuo <wezuo@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-mgtbby.eastus.cloudapp.azure.com> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-yclzsf.eastus.cloudapp.azure.com> * Support MLFloat16 in CumSum Cuda op for Opset 14 (#6355) * Add CumSum-14 for Cuda * fix convert_common version retrival (#6382) * Refine auto_pad based pad computation in ConvTranspose (#6305) * Fix SDL warning (#6390) * Add max_norm for gradient clipping. 
(#6289) * add max_norm as user option for gradient clipping * add adam and lamb test cases for clip norm * add frontend tests * Add the custom op project information (#6334) * Dont use default string marshalling in C# (#6219) * Fix Windows x86 compiler warnings in the optimizers project (#6377) * [Perf] Optimize Tile CPU and CUDA kernels for a corner case (#6376) * Unblock Android CI code coverage failure (#6393) * fix build on cuda11 (#6394) Co-authored-by: Vincent Wang <weicwang@microsoft.com> * Load the model path correctly (#6369) * Fix some compile warnings (#6316) * OpenVino docker file changes to bypass privileged mode Description: Builds and installs libusb without UDEV support, which is used for communicating with the VPU device. Motivation and Context This enables the resulting docker container to be run without '--privileged' and '--network host' options which may not be suitable in deployment environments. * Megatron checkpointing (#6293) * Add bart fairseq run script * Add frontend change to enable megatron * Initial changes for checkpointing * Megatron optim state loading, checkpoint aggregation, frontend distributed tests for H, D+H * Add load_checkpoint changes * Fix CI * Cleanup * Fix CI * review comments * review comments * review comments: * Fix generate_submodule_cgmanifest.py Windows issues. (#6404) * Continue memory planning when unknown shape tensor is encountered. (#6413) * Reintroduce experimental api changes and fix remote build break (#6385) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Add support for custom ops to minimal build. (#6228) * Add support for custom ops to minimal build. Cost is only ~8KB so including in base minimal build. * enable pipeline to run quantization tests (#6416) * enable pipeline to run quantization tests setup test pipeline for quantization * Minor cmake change (#6431) * Liqun/liqun/enable pipeline parallel test2 (#6399) * enable data and pipeline parallism test Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> * Farewell TrainableDropout (#5793) * Deprecate TrainableDropout kernel. * Update bert_toy_postprocessed.onnx to opset 12. * Add more dropout tests. * Fix BiasDropout kernel. 
Co-authored-by: Ubuntu <OrtTrainingDev3@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sergii Dymchenko <sedymche@microsoft.com> * fix null dereference warning (#6437) * Expose graph ModelPath to TensorRT shared library (#6353) * Update graph_viewer.cc * Update tensorrt_execution_provider.cc * Update graph_viewer.h * Update tensorrt_execution_provider.cc * Update tensorrt_execution_provider.cc * Update provider_api.h * Update provider_bridge_ort.cc * Update provider_interfaces.h * Update provider_interfaces.h * expose GraphViewer ModelPath API to TRT shared lib * add modelpath to compile * update * add model_path to onnx tensorrt parser * use GenerateMetaDefId to generate unique TRT kernel name * use GenerateMetaDefId to generate unique TRT engine name * fix issue * Update tensorrt_execution_provider.cc * remove GetVecHash * Update tensorrt_execution_provider.h * convert wchar_t to char for tensorrt parser * update tensorrt parser to include latest changes * fix issues * Update tensorrt_execution_provider.cc * merge trt parser latest change * add PROVIDER_DISALLOW_ALL(Path) * add tool for generating test data for longformer (#6415) * only build experimental api in redist (#6465) Co-authored-by: Sheil Kumar <sheilk@microsoft.com> * Add an option to save the training graph after optimization (#6410) * expose optimized_model_filepath in SessionOptions as `debug.graph_save_paths.model_with_training_graph_after_optimization_path` in `ORTTrainerOptions` * Share allocator between CUDA EP & TRT EP. (#6332) * Share allocator between CUDA EP & TRT EP. limitation: 1. Does not cover the per-thread allocator created by CUDA EP, still need to figure out the way to remove it 2. Need to have more identifiers to make it able to share CPU allocator across all EPs * fix max norm clipping test in python packaging pipeline test (#6468) * fix python packaging pipeline * make clip norm test compatabile with both V100 and M60 GPUs * Initial version of CoreML EP (#6392) * Bug 31463811: Servicing: Redist (Nuget) conflicts with Microsoft.AI.MachineLearning starting 21H1+ (#6460) * update load library code to have the fullly qualified path * make it work for syswow32 * git Revert "make it work for syswow32" This reverts commit b9f594341b7cf07241b18d0c376af905edcabae3. Co-authored-by: Sheil Kumar <sheilk@microsoft.com> * dequantize 1st input of lstm back if it is quantized (#6444) * [java] Adds support for OrtEnvironment thread pools (#6406) * Updates for Gradle 7. * Adding support for OrtThreadingOptions into the Java API. * Fixing a typo in the JNI code. * Adding a test for the environment's thread pool. * Fix cuda test, add comment to failure. * Updating build.gradle * fix SDL native rule warning #6246 (#6461) * fix SDL rule (#6464) * use tickcount64 (#6447) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Update pypi package metadata (#6354) * Update setup file data * add missing comma * remove python 3.5 * fix typo bracket * Delete nuget extra configs (#6477) * Op kernel type reduction infrastructure. (#6466) Add infrastructure to support type reduction in Op kernel implementations. Update Cast and IsInf CPU kernels to use it. * Fixing a leak in OnnxSequences with String keys or values. 
(#6473) * Increase the distributes tests pipeline timeout to 120 minutes (#6479) * [CoreML EP] Add CI for CoreML EP (macOS) and add coreml_flags for EP options (#6481) * Add macos coreml CI and coreml_flags * Move save debuggubg model to use environment var * Move pipeline off from macos CI template * Fix an issue building using unix make, add parallel to build script * Fixed build break for shared_lib and cmpile warning * Fix a compile warning * test * Revert the accidental push from another branch This reverts commit 472029ba25d50f9508474c9eeceb3454cead7877. * Add ability to track per operator types in reduced build config. (#6428) * Add ability to generate configuration that includes required types for individual operators, to allow build size reduction based on that. - Add python bindings for ORT format models - Add script to update bindings and help info - Add parsing of ORT format models - Add ability to enable type reduction to config generation - Update build.py to only allow operator/type reduction via config - simpler to require config to be generated first - can't mix a type aware (ORT format model only) and non-type aware config as that may result in insufficient types being enabled - Add script to create reduced build config - Update CIs * merge e2e with distributed pipeline (#6443) merge e2e with distributed pipeline * Fix test breaks in Windows ingestion pipeline (#6476) * fix various build breaks with Windows build * fix runtime errors loading libraries from system32 * add build_inbox check to winml_test_common * use raw string * cleanup * fix dll load Co-authored-by: Sheil Kumar <sheilk@microsoft.com> * Speed up the Mac CI runs (#6483) * expose learningmodelpixelrange property (#5877) * Fix of support api version bug for [de]quantize (#6492) * SDL fixes: add proper casts/format specifiers (#6446) * SDL annotation fixes (#6448) Co-authored-by: Ori Levari <orlevari@microsoft.com> * [OpenVINO-EP] Remove support for OpenVINO 2020.2 (#6493) * Removed OpenVINO 2020.2 support * Updated documentation and build.py * Removed unnecessary libraries from setup.py * Support pad operator in quantization and quantized nhwc transformer. Fix Pad operator bug. (#6325) Support pad operator in quantization tool. Support pad operator in quantized nhwc transformer. Fix pad() operator bug when pad input's inner(right) most axis value is zero for Edge and Reflect mode, it copied wrong value to the cells to be padded. Note the Constant mode will not trigger this bug, as Edge/Reflect need copy value from the already copied array while Constant mode only fill specified value. Add more test cases to cover pad() operator bug fixed here. Fix quantization tools uint8/int8 value overflow issue when quantize weights in python. * Improve work distribution for Expand operator, and sharded LoopCounter configuration (#6454) Description: This PR makes two changes identified while looking at a PGAN model. First, it uses ThreadPool::TryParallelFor for the main parallel loops in the Expand operator. This lets the thread pool decide on the granularity at which to distribute work (unlike TrySimpleParallelFor). Profiling showed high costs when running "simple" loops with 4M iterations each of which copied only 4 bytes. Second, it updates the sharded loop counter in the thread pool so that the number of shards is capped by the number of threads. This helps make the performance of any other high-contention "simple" loops more robust at low thread counts by letting each thread work on its own "home" shard for longer. 
Motivation and Context Profiling showed a PGAN model taking 2x+ longer with the non-OpenMP build. The root cause was that the OpenMP build uses simple static scheduling of loop iterations, while the non-OpenMP build uses dynamic scheduling. The combination of large numbers of tiny iterations is less significant with static scheduling --- although still desirable to avoid, given that each iteration incurs a std::function invocation. * Update document of transformer optimization (#6487) * nuphar test to avoid test data download to improve passing rate (#6467) nuphar test to avoid test data download to improve passing rate * Fuse cuda conv with activation (#6351) * optimize cuda conv by fused activation * remove needless print out * exclude test from cpu * handle status error from cudnn 8.x * add reference to base class * add hipify * [CoreML EP] Add support for some activations/Transpose, move some shared helpers from NNAPI to shared space (#6498) * Init change * Move some helper from nnapi ep to shared * Add transpose support * Fix trt ci build break * Refine transformers profiler output (#6502) * output nodes in the original order; grouped by node name * add document for profiler * Update to match new test setup. (#6496) * Update to match new test setup. * Add Gemm(7) manually for now. Will fix properly on Monday. It's used by mnist.ort as that is created by optimizing mnist.onnx to level 1 causing 2 nodes to be replaced by a Gemm and the op to be missing from the required list as that is created using the original onnx model. * Enable dense sequence optimized version of Pytorch exported BERT-L on AMD GPU (#6504) * Permit dense seq optimization on BERT-L pytorch export by enabling ReduceSumTraining, Equal, and NonZero on AMD * enable Equal tests * enable fast_matrix_reduction test case * Optimize GatherGrad for AMD GPU (#6381) * optimize gathergrad * address comments Co-authored-by: Weixing Zhang <wezhan@microsoft.com> * add explicit barriers for buffer overread and overrwrite (#6484) Co-authored-by: Ori Levari <orlevari@microsoft.com> * fix sdl bugs for uninitialized variables and returns (#6450) Co-authored-by: Ori Levari <orlevari@microsoft.com> * handle hr error conditions (#6449) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Dnnl training (#6045) * Add ReluGrad and ConvGrad ops for the dnnl provider * the mnist sample is updated to add the --use_dnnl option that will cause the sample to use the dnnl execution provider for nodes that exist in dnnl provider. * Added the ability to find forward ops. Dnnl backward gradient ops require the forward primitive description and workspace from the forward operation. * Enable specifying the execution provider for Gradient Checker Tests * Prevent memory leak when running dnnl_provider in training mode Prevent creating a SubgraphPrimitivePool when the code is built with the ENABLE_TRAINING build flag. Instead create a SubgraphPrimitive directly. The SubgraphPrimitivePool was causing a pool of SubgraphPrimitives to be stashed in a map for reuse. Due to the way the Training Loop uses threads the pool of SubgraphPrimitives were not being reuse instead a new pool of SubgraphPrimitives being created each run. The old pool was not instantly freed. This behavior could be a language error when using thread_local memory. Signed-off-by: George Nash <george.nash@intel.com> * Added fixes to maxpoolgrad and memory leak. Maxpoolgrad will now pass all unit tests. 
With the conv and convgrad disabled for dnnl, mnist is able to train till 95% Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> * Fixed misc issues when testing training code with dnnl provider * fix conv_grad dnnl tests with dilation to run dnnl execution provider * update mnist training sample to accept convolution type models convolution models require the input shape to be {1, 28, 28} instead of the flat {728} image that is used for the gemm models this will enable models that require the different shape by adding `--model_type conv` to the command line when running the mnist sample. (while testing a workaround was used see #4762) * Disable weight caching in dnnl conv operator when using training When training we can not use cached weights because the weight will be updated each run. This re-enables dnnl Conv and ConvGrad Ops. The weight caching was the source of the error from Conv when training. * Fix issues found when building grad ops on Linux * The dnnl_convgrad code was over using the scope operator causing a compilation problem. * The dnnl_maxpoolgrad code had a logic error that is was comparing with the source description when it should have been comparing with the destination despription. * Update BUILD.md so it shows DNNL for training * Updated the table of contents. Since the same providers are listed twice. Once for Infrance and again for Training an HTML anchor was added to distinguish the second header from the first for the TOC. * Fix build failure when not using --enable-training build option * reorganize the gradient operators so they are grouped together * Fix issues found when running onnx_backend_test_series.py * Pooling code only supports 2 outputs when built with --enable-training * Address code review feedback * class member variables end in underscore_ * use dst instead of dist to match pattern use elsewhere in DNNL code. * Remove workaround that was introduced to handle problems running convolution based training models. See issue #4762 Signed-off-by: George Nash <george.nash@intel.com> * Isolate training code and code cleanup * Do not build if dnnl_gpu_runtime if enable_training is set training code does not support dnnl_gpu_runtime yet. * Isolated Training code inside ifdefs so that they wont affect project if built without training enabled * Inadvertant changes in whitespace were removed to make code review simpler * Undid some code reordering that was not needed * comments added to closing #endif statments to simplify reading complex ifdefs * Modified the GetPrimitiveDesc functions to return shared_ptr instead of raw pointer. This matches what was done in Pool code and is safer memory code. Signed-off-by: George Nash <george.nash@intel.com> * Address code review issues - whitespace changes caused by running clang-format on the code - Several spelling errors fixed - Removed/changed some ifdefs to improve readability - other misc. changes in responce to code review. 
Signed-off-by: George Nash <george.nash@intel.com> * Code changes to address code review - Simplify iteration code using `auto` keyword - remove C style cast that was not needed - remove instance variable that was not needed [relugrad.h] - added the execution providers to `ComputeGradientErrorInternal()` and `ComputeTheoreticalJacobianTranspose()` instead of using a pointer to an instance varaible [gradient_checker.h/.cc] Signed-off-by: George Nash <george.nash@intel.com> * Combined the default gradient ops test and dnnl gradient ops test for ConvGrad and MaxPoolGrad into one function with the help of a helper function. This will reduce repeated code. Signed-off-by: Palangotu Keshava, Chethan's avatarChethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> * Replaced the stack used by convgrad to vector so that the vector(used as stack) can be easily cleared everytime the graph is created. This will prevent memory leak from convolution kernels being pushed constantly onto the stack. Signed-off-by: chethan.palangotu.keshava@intel.com * Code clean up and formating updates - Removed empty else statment - updated indentation of code that was causing double curly brackets to look unususal - Changed check for NumDimensions to Size in Relu and ReluGrad error checking code. - isolated training code Signed-off-by: George Nash <george.nash@intel.com> * Restore inadvertantly removed ConvGrad tests When combining the DNNL and CPU version of the ConvGrad tests two test were inadvertantly excluded. This adds back the Conv3d and Conv3d with strides test cases. Signed-off-by: George Nash <george.nash@intel.com> * Add validation to ConvGrad This validates the dimensions of the ConvGrad match the passed in Convolution forward primitive description. The current code for DNNL ConvGrad makes the assumption that the ConvGrad nodes will be visited in the reverse order from the corresponding Conv nodes The added validation will return an error if this assumption is not true. Signed-off-by: George Nash <george.nash@intel.com> * Do not create new execution providers in provider_test_utils This removes the code that generated new execution providers in the OpTester::Run function. This was added because the std::move was leaving the `entry` value empty so subsequent calls would cause a segfault. Problem is this potentially changed the execution_provider because it would create the default provider dropping any custom arguments. When the now removed code was originally added the std::move was causing crashes when the GradientChecker unit tests were run. However, it is no longer causing problems even with the code removed. Signed-off-by: George Nash <george.nash@intel.com> * Change the forward conv stack to a forward conv map This changes how the forward conv kernel is mapped to the bwd ConvGrad kernel the problematic stack is no longer used. The convolution stack made the assumption that the corresponding ConvGrad operator would be visited in reverse order of the forward Conv operators. This was always problematic and was unlikely to work for inception models. Important changes: - The weight_name is added to the ConvGrad dnnl_node making it possible to use the weight_name as a lookup key to find the Conv forward Kernel - the `std::vector fwd_conv_stack_` has been replaced with a `std::map fwd_conv_kernel_map_` - Although it is not needed lock_guards were added when writing to and reading from the fwd_conv_kernel_map_ as well as the fwd_kernel_map_. 
These should always be accessed by a single thread when preparing the dnnl subgraphs so the guard should not be needed but its added just in case. - Updated the comments ConvGrad.h code to no longer mention the stack. The error check is not removed. It will be good to verify there are no errors as we continue to test against more models. Signed-off-by: George Nash <george.nash@intel.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> Co-authored-by: unknown <63478620+jeyblu@users.noreply.github.com> * Lochi/refactor yolov3 quantization (#6290) * Refactor the code and move data reader, preprocessing, evaluation to E2E_example_mode * Refactor the code. Move data reader, preprocessing, evaluation to model specific example under E2E_example_mode * refactor code * Move yolov3 example to specific folder and add additional pre/post processing * Print a warning message for using newer c_api header on old binary (#6507) * Fix issues with ArmNN build setup (#6495) * ArmNN build fixes * Update BUILD.md to document that the ACL paths must be specified to build ArmNN * Fix CUDA build error. We don't setup the link libraries correctly/consistently so improve that. * Fix Windows CI builds by updating test scripts to work with numpy 1.20. (#6518) * Update onnxruntime_test_python.py to work with numpy 1.20. Some aliases are deprecated in favor of the built-in python types. See https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations np.array with bytes for entries and dtype of np.void no longer automatically pads. Change a test to adjust for that. * Fix another test script * Fix ORTModule branch for orttraining-* pipelines * Update pytorch nightly version dependency Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: George Wu <jywu@microsoft.com> Co-authored-by: Cecilia Liu <ziyue.liu7@gmail.com> Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Co-authored-by: George Nash <george.nash@intel.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Yateng Hong <toothache9010@gmail.com> Co-authored-by: stevenlix <38092805+stevenlix@users.noreply.github.com> Co-authored-by: Derek Murray <Derek.Murray@microsoft.com> Co-authored-by: ashbhandare <ash.bhandare@gmail.com> Co-authored-by: Scott McKay <skottmckay@gmail.com> Co-authored-by: Changming Sun <chasun@microsoft.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: Juliana Franco <jufranc@microsoft.com> Co-authored-by: Pranav Sharma <prs@microsoft.com> Co-authored-by: Tixxx <tix@microsoft.com> Co-authored-by: Jay Rodge <jayrodge@live.com> Co-authored-by: Du Li <duli1@microsoft.com> Co-authored-by: Du Li <duli@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Yufeng Li <liyufeng1987@gmail.com> Co-authored-by: baijumeswani <bmeswani@microsoft.com> Co-authored-by: Sergii Dymchenko <sedymche@microsoft.com> Co-authored-by: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Co-authored-by: satyajandhyala <satya.k.jandhyala@gmail.com> Co-authored-by: S. 
Manohar Karlapalem <manohar.karlapalem@intel.com> Co-authored-by: Weixing Zhang <weixingzhang@users.noreply.github.com> Co-authored-by: Suffian Khan <sukha@microsoft.com> Co-authored-by: Olivia Jain <oljain@microsoft.com> Co-authored-by: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Co-authored-by: Hariharan Seshadri <shariharan91@gmail.com> Co-authored-by: Ryan Lai <rylai@microsoft.com> Co-authored-by: Jesse Benson <jesseb@microsoft.com> Co-authored-by: sfatimar <64512376+sfatimar@users.noreply.github.com> Co-authored-by: suryasidd <surya.siddharth.pemmaraju@intel.com> Co-authored-by: sfatimar <sahar.fatima@intel/com> Co-authored-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: mohdansx <mohdx.ansari@intel.com> Co-authored-by: Xavier Dupré <xadupre@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin@vols.utk.edu> Co-authored-by: Michael Giba <michaelgiba@gmail.com> Co-authored-by: William Tambellini <wtambellini@sdl.com> Co-authored-by: Hector Li <hecli@microsoft.com> Co-authored-by: Aishwarya <aibhanda@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: liqunfu <liqfu@microsoft.com> Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: pengwa <pengwa@microsoft.com> Co-authored-by: Tang, Cheng <souptc@gmail.com> Co-authored-by: Cheng Tang <chenta@microsoft.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Chun-Wei Chen <jacky82226@gmail.com> Co-authored-by: Vincent Wang <wangwchpku@outlook.com> Co-authored-by: Vincent Wang <weicwang@microsoft.com> Co-authored-by: Luyao Ren <375833274@qq.com> Co-authored-by: Zhang Lei <zhang.huanning@hotmail.com> Co-authored-by: Tim Harris <tiharr@microsoft.com> Co-authored-by: Ashwini Khade <askhade@microsoft.com> Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com> Co-authored-by: Alberto Magni <49027342+alberto-magni@users.noreply.github.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: wezuo <49965641+wezuo@users.noreply.github.com> Co-authored-by: Jesse Benson <benson.jesse@gmail.com> Co-authored-by: Wei Zuo <wezuo@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-mgtbby.eastus.cloudapp.azure.com> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-yclzsf.eastus.cloudapp.azure.com> Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Co-authored-by: Martin Man <supermt@gmail.com> Co-authored-by: M. 
Zeeshan Siddiqui <mzs@microsoft.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Ubuntu <OrtTrainingDev3@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sheil Kumar <smk2007@gmail.com> Co-authored-by: Sheil Kumar <sheilk@microsoft.com> Co-authored-by: Ryota Tomioka <ryoto@microsoft.com> Co-authored-by: Adam Pocock <adam.pocock@oracle.com> Co-authored-by: Yulong Wang <f.s@qq.com> Co-authored-by: Faith Xu <faxu@microsoft.com> Co-authored-by: Xiang Zhang <xianz@microsoft.com> Co-authored-by: suryasidd <48925384+suryasidd@users.noreply.github.com> Co-authored-by: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> Co-authored-by: unknown <63478620+jeyblu@users.noreply.github.com>
2021-02-02 16:59:56 +00:00
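# Stage the per-model transformer helper scripts (longformer, phi2,
# stable_diffusion, t5, whisper) into the Python package layout under
# onnxruntime/transformers/models/.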
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_longformer_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/longformer/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_phi2_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/phi2/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_stable_diffusion_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/stable_diffusion/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_t5_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/t5/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_models_whisper_src}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/transformers/models/whisper/
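# Ship the repository VERSION_NUMBER file at the root of the staged package.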
COMMAND ${CMAKE_COMMAND} -E copy
${REPO_ROOT}/VERSION_NUMBER
$<TARGET_FILE_DIR:${build_output_target}>
)
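# The staging steps below all follow the same pattern: a POST_BUILD custom
# command on the pybind target that copies files into the Python package
# layout. A minimal sketch of the pattern (target and file names here are
# placeholders, not real targets in this build):
#   add_custom_command(
#     TARGET my_target POST_BUILD
#     COMMAND ${CMAKE_COMMAND} -E copy my_script.py $<TARGET_FILE_DIR:my_target>/
#   )
# When the core library is built as a shared object, it is copied into
# onnxruntime/capi/ next to the pybind module so the package can load it at
# import time.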
if (onnxruntime_BUILD_SHARED_LIB)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
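# For OpenVINO builds, stage the OpenVINO-specific Python helpers into
# onnxruntime/tools/ (e.g. the Windows utility that adds the OpenVINO DLL
# directories to the search path so users do not need to run setupvars.bat).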
if (onnxruntime_USE_OPENVINO)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_openvino_python_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/tools/
)
endif()
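# When external custom op schemas are enabled, expose the protobuf and ONNX
# headers through symlinks under onnxruntime/external/include/ and stage the
# external custom op test project next to the package.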
if (onnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/external/include/
COMMAND ${CMAKE_COMMAND} -E create_symlink
$<TARGET_FILE_DIR:${build_output_target}>/include/google
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/external/include/google
COMMAND ${CMAKE_COMMAND} -E create_symlink
$<TARGET_FILE_DIR:${build_output_target}>/external/onnx/onnx
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/external/include/onnx
COMMAND ${CMAKE_COMMAND} -E copy_directory
${ORTTRAINING_ROOT}/orttraining/test/external_custom_ops
$<TARGET_FILE_DIR:${build_output_target}>/external_custom_ops
)
endif()
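# Full desktop builds also stage the onnxruntime_providers_shared library into
# onnxruntime/capi/. Minimal/extended-minimal builds and platforms where that
# library is not produced (Apple, Android, ROCm, Emscripten) skip this step.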
if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin|iOS|visionOS"
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android"
AND NOT onnxruntime_USE_ROCM
AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
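# Stage the Python unit test scripts and their test data (quantization and
# transformers tests) alongside the build output so they can run in place.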
if (onnxruntime_BUILD_UNIT_TESTS)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_test_srcs}
$<TARGET_FILE_DIR:${build_output_target}>
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_quantization_test_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/quantization/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_test_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/transformers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_testdata_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/transformers/test_data/models/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_testdata_whisper}
$<TARGET_FILE_DIR:${build_output_target}>/transformers/test_data/models/whisper/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_transformers_testdata_conformer}
$<TARGET_FILE_DIR:${build_output_target}>/transformers/test_data/models/conformer/
)
endif()
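# Eager-mode test scripts are staged into a separate eager_test/ directory.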
if (onnxruntime_BUILD_UNIT_TESTS AND onnxruntime_ENABLE_EAGER_MODE)
file(GLOB onnxruntime_eager_test_srcs CONFIGURE_DEPENDS
"${ORTTRAINING_ROOT}/orttraining/eager/test/*.py"
)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_eager_test_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/eager_test/
)
endif()
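# Training builds additionally ship the onnxruntime.training Python package.
# Create its full directory tree first; the copy commands that follow populate
# each subpackage.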
if (onnxruntime_ENABLE_TRAINING)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/amp
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/experimental
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/experimental/gradient_graph
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/optim
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/json_config
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/hierarchical_ortmodule
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/aten_op_executor
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/pipe
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/kernel
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils/data/
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils/hooks/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_root_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_amp_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/amp/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_experimental_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/experimental/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_gradient_graph_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/experimental/gradient_graph/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_optim_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/optim/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_experimental_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_experimental_json_config_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/json_config/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_experimental_hierarchical_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/hierarchical_ortmodule/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_aten_op_executor_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/aten_op_executor/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_torch_interop_utils_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_torch_gpu_allocator_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/torch_gpu_allocator/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_torch_cpp_ext_fused_ops_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/torch_cpp_extensions/cuda/fused_ops/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_graph_optimizers_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/graph_optimizers/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ortmodule_pipe_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ortmodule/experimental/pipe/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ort_triton_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_ort_triton_kernel_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/ort_triton/kernel/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_utils_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_utils_data_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils/data/
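# utils/hooks carries the SubscriberManager/StatisticsSubscriber tooling used
# for ORTModule convergence investigations (see
# docs/ORTModule_Convergence_Notes.md).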
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_utils_hooks_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/utils/hooks/
)
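# The on-device training APIs additionally ship the onnxblock model-building
# helpers (with their loss and optim subpackages) and the training API
# bindings, staged into matching subdirectories below.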
if (onnxruntime_ENABLE_TRAINING_APIS)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/onnxblock
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/onnxblock/loss
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/onnxblock/optim
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/api
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_onnxblock_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/onnxblock/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_onnxblock_loss_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/onnxblock/loss/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_onnxblock_optim_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/onnxblock/optim/
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_api_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/training/api/
)
endif()
endif()
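# Execution providers that are built as shared libraries must sit next to the
# pybind module in onnxruntime/capi/ so they can be located at import time;
# the blocks below copy each enabled provider plus the shared provider bridge.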
if (onnxruntime_USE_DNNL)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${DNNL_DLL_PATH} $<TARGET_FILE:onnxruntime_providers_dnnl>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
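# The Vitis AI provider is staged the same way as DNNL above.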
if (onnxruntime_USE_VITISAI)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_vitisai>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
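# Likewise for the TensorRT provider.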
if (onnxruntime_USE_TENSORRT)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_tensorrt>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
(#6473) * Increase the distributes tests pipeline timeout to 120 minutes (#6479) * [CoreML EP] Add CI for CoreML EP (macOS) and add coreml_flags for EP options (#6481) * Add macos coreml CI and coreml_flags * Move save debuggubg model to use environment var * Move pipeline off from macos CI template * Fix an issue building using unix make, add parallel to build script * Fixed build break for shared_lib and cmpile warning * Fix a compile warning * test * Revert the accidental push from another branch This reverts commit 472029ba25d50f9508474c9eeceb3454cead7877. * Add ability to track per operator types in reduced build config. (#6428) * Add ability to generate configuration that includes required types for individual operators, to allow build size reduction based on that. - Add python bindings for ORT format models - Add script to update bindings and help info - Add parsing of ORT format models - Add ability to enable type reduction to config generation - Update build.py to only allow operator/type reduction via config - simpler to require config to be generated first - can't mix a type aware (ORT format model only) and non-type aware config as that may result in insufficient types being enabled - Add script to create reduced build config - Update CIs * merge e2e with distributed pipeline (#6443) merge e2e with distributed pipeline * Fix test breaks in Windows ingestion pipeline (#6476) * fix various build breaks with Windows build * fix runtime errors loading libraries from system32 * add build_inbox check to winml_test_common * use raw string * cleanup * fix dll load Co-authored-by: Sheil Kumar <sheilk@microsoft.com> * Speed up the Mac CI runs (#6483) * expose learningmodelpixelrange property (#5877) * Fix of support api version bug for [de]quantize (#6492) * SDL fixes: add proper casts/format specifiers (#6446) * SDL annotation fixes (#6448) Co-authored-by: Ori Levari <orlevari@microsoft.com> * [OpenVINO-EP] Remove support for OpenVINO 2020.2 (#6493) * Removed OpenVINO 2020.2 support * Updated documentation and build.py * Removed unnecessary libraries from setup.py * Support pad operator in quantization and quantized nhwc transformer. Fix Pad operator bug. (#6325) Support pad operator in quantization tool. Support pad operator in quantized nhwc transformer. Fix pad() operator bug when pad input's inner(right) most axis value is zero for Edge and Reflect mode, it copied wrong value to the cells to be padded. Note the Constant mode will not trigger this bug, as Edge/Reflect need copy value from the already copied array while Constant mode only fill specified value. Add more test cases to cover pad() operator bug fixed here. Fix quantization tools uint8/int8 value overflow issue when quantize weights in python. * Improve work distribution for Expand operator, and sharded LoopCounter configuration (#6454) Description: This PR makes two changes identified while looking at a PGAN model. First, it uses ThreadPool::TryParallelFor for the main parallel loops in the Expand operator. This lets the thread pool decide on the granularity at which to distribute work (unlike TrySimpleParallelFor). Profiling showed high costs when running "simple" loops with 4M iterations each of which copied only 4 bytes. Second, it updates the sharded loop counter in the thread pool so that the number of shards is capped by the number of threads. This helps make the performance of any other high-contention "simple" loops more robust at low thread counts by letting each thread work on its own "home" shard for longer. 
Motivation and Context Profiling showed a PGAN model taking 2x+ longer with the non-OpenMP build. The root cause was that the OpenMP build uses simple static scheduling of loop iterations, while the non-OpenMP build uses dynamic scheduling. The combination of large numbers of tiny iterations is less significant with static scheduling --- although still desirable to avoid, given that each iteration incurs a std::function invocation. * Update document of transformer optimization (#6487) * nuphar test to avoid test data download to improve passing rate (#6467) nuphar test to avoid test data download to improve passing rate * Fuse cuda conv with activation (#6351) * optimize cuda conv by fused activation * remove needless print out * exclude test from cpu * handle status error from cudnn 8.x * add reference to base class * add hipify * [CoreML EP] Add support for some activations/Transpose, move some shared helpers from NNAPI to shared space (#6498) * Init change * Move some helper from nnapi ep to shared * Add transpose support * Fix trt ci build break * Refine transformers profiler output (#6502) * output nodes in the original order; grouped by node name * add document for profiler * Update to match new test setup. (#6496) * Update to match new test setup. * Add Gemm(7) manually for now. Will fix properly on Monday. It's used by mnist.ort as that is created by optimizing mnist.onnx to level 1 causing 2 nodes to be replaced by a Gemm and the op to be missing from the required list as that is created using the original onnx model. * Enable dense sequence optimized version of Pytorch exported BERT-L on AMD GPU (#6504) * Permit dense seq optimization on BERT-L pytorch export by enabling ReduceSumTraining, Equal, and NonZero on AMD * enable Equal tests * enable fast_matrix_reduction test case * Optimize GatherGrad for AMD GPU (#6381) * optimize gathergrad * address comments Co-authored-by: Weixing Zhang <wezhan@microsoft.com> * add explicit barriers for buffer overread and overrwrite (#6484) Co-authored-by: Ori Levari <orlevari@microsoft.com> * fix sdl bugs for uninitialized variables and returns (#6450) Co-authored-by: Ori Levari <orlevari@microsoft.com> * handle hr error conditions (#6449) Co-authored-by: Ori Levari <orlevari@microsoft.com> * Dnnl training (#6045) * Add ReluGrad and ConvGrad ops for the dnnl provider * the mnist sample is updated to add the --use_dnnl option that will cause the sample to use the dnnl execution provider for nodes that exist in dnnl provider. * Added the ability to find forward ops. Dnnl backward gradient ops require the forward primitive description and workspace from the forward operation. * Enable specifying the execution provider for Gradient Checker Tests * Prevent memory leak when running dnnl_provider in training mode Prevent creating a SubgraphPrimitivePool when the code is built with the ENABLE_TRAINING build flag. Instead create a SubgraphPrimitive directly. The SubgraphPrimitivePool was causing a pool of SubgraphPrimitives to be stashed in a map for reuse. Due to the way the Training Loop uses threads the pool of SubgraphPrimitives were not being reuse instead a new pool of SubgraphPrimitives being created each run. The old pool was not instantly freed. This behavior could be a language error when using thread_local memory. Signed-off-by: George Nash <george.nash@intel.com> * Added fixes to maxpoolgrad and memory leak. Maxpoolgrad will now pass all unit tests. 
With the conv and convgrad disabled for dnnl, mnist is able to train till 95% Signed-off-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> * Fixed misc issues when testing training code with dnnl provider * fix conv_grad dnnl tests with dilation to run dnnl execution provider * update mnist training sample to accept convolution type models convolution models require the input shape to be {1, 28, 28} instead of the flat {728} image that is used for the gemm models this will enable models that require the different shape by adding `--model_type conv` to the command line when running the mnist sample. (while testing a workaround was used see #4762) * Disable weight caching in dnnl conv operator when using training When training we can not use cached weights because the weight will be updated each run. This re-enables dnnl Conv and ConvGrad Ops. The weight caching was the source of the error from Conv when training. * Fix issues found when building grad ops on Linux * The dnnl_convgrad code was over using the scope operator causing a compilation problem. * The dnnl_maxpoolgrad code had a logic error that is was comparing with the source description when it should have been comparing with the destination despription. * Update BUILD.md so it shows DNNL for training * Updated the table of contents. Since the same providers are listed twice. Once for Infrance and again for Training an HTML anchor was added to distinguish the second header from the first for the TOC. * Fix build failure when not using --enable-training build option * reorganize the gradient operators so they are grouped together * Fix issues found when running onnx_backend_test_series.py * Pooling code only supports 2 outputs when built with --enable-training * Address code review feedback * class member variables end in underscore_ * use dst instead of dist to match pattern use elsewhere in DNNL code. * Remove workaround that was introduced to handle problems running convolution based training models. See issue #4762 Signed-off-by: George Nash <george.nash@intel.com> * Isolate training code and code cleanup * Do not build if dnnl_gpu_runtime if enable_training is set training code does not support dnnl_gpu_runtime yet. * Isolated Training code inside ifdefs so that they wont affect project if built without training enabled * Inadvertant changes in whitespace were removed to make code review simpler * Undid some code reordering that was not needed * comments added to closing #endif statments to simplify reading complex ifdefs * Modified the GetPrimitiveDesc functions to return shared_ptr instead of raw pointer. This matches what was done in Pool code and is safer memory code. Signed-off-by: George Nash <george.nash@intel.com> * Address code review issues - whitespace changes caused by running clang-format on the code - Several spelling errors fixed - Removed/changed some ifdefs to improve readability - other misc. changes in responce to code review. 
Signed-off-by: George Nash <george.nash@intel.com> * Code changes to address code review - Simplify iteration code using `auto` keyword - remove C style cast that was not needed - remove instance variable that was not needed [relugrad.h] - added the execution providers to `ComputeGradientErrorInternal()` and `ComputeTheoreticalJacobianTranspose()` instead of using a pointer to an instance varaible [gradient_checker.h/.cc] Signed-off-by: George Nash <george.nash@intel.com> * Combined the default gradient ops test and dnnl gradient ops test for ConvGrad and MaxPoolGrad into one function with the help of a helper function. This will reduce repeated code. Signed-off-by: Palangotu Keshava, Chethan's avatarChethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> * Replaced the stack used by convgrad to vector so that the vector(used as stack) can be easily cleared everytime the graph is created. This will prevent memory leak from convolution kernels being pushed constantly onto the stack. Signed-off-by: chethan.palangotu.keshava@intel.com * Code clean up and formating updates - Removed empty else statment - updated indentation of code that was causing double curly brackets to look unususal - Changed check for NumDimensions to Size in Relu and ReluGrad error checking code. - isolated training code Signed-off-by: George Nash <george.nash@intel.com> * Restore inadvertantly removed ConvGrad tests When combining the DNNL and CPU version of the ConvGrad tests two test were inadvertantly excluded. This adds back the Conv3d and Conv3d with strides test cases. Signed-off-by: George Nash <george.nash@intel.com> * Add validation to ConvGrad This validates the dimensions of the ConvGrad match the passed in Convolution forward primitive description. The current code for DNNL ConvGrad makes the assumption that the ConvGrad nodes will be visited in the reverse order from the corresponding Conv nodes The added validation will return an error if this assumption is not true. Signed-off-by: George Nash <george.nash@intel.com> * Do not create new execution providers in provider_test_utils This removes the code that generated new execution providers in the OpTester::Run function. This was added because the std::move was leaving the `entry` value empty so subsequent calls would cause a segfault. Problem is this potentially changed the execution_provider because it would create the default provider dropping any custom arguments. When the now removed code was originally added the std::move was causing crashes when the GradientChecker unit tests were run. However, it is no longer causing problems even with the code removed. Signed-off-by: George Nash <george.nash@intel.com> * Change the forward conv stack to a forward conv map This changes how the forward conv kernel is mapped to the bwd ConvGrad kernel the problematic stack is no longer used. The convolution stack made the assumption that the corresponding ConvGrad operator would be visited in reverse order of the forward Conv operators. This was always problematic and was unlikely to work for inception models. Important changes: - The weight_name is added to the ConvGrad dnnl_node making it possible to use the weight_name as a lookup key to find the Conv forward Kernel - the `std::vector fwd_conv_stack_` has been replaced with a `std::map fwd_conv_kernel_map_` - Although it is not needed lock_guards were added when writing to and reading from the fwd_conv_kernel_map_ as well as the fwd_kernel_map_. 
These should always be accessed by a single thread when preparing the dnnl subgraphs so the guard should not be needed but its added just in case. - Updated the comments ConvGrad.h code to no longer mention the stack. The error check is not removed. It will be good to verify there are no errors as we continue to test against more models. Signed-off-by: George Nash <george.nash@intel.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> Co-authored-by: unknown <63478620+jeyblu@users.noreply.github.com> * Lochi/refactor yolov3 quantization (#6290) * Refactor the code and move data reader, preprocessing, evaluation to E2E_example_mode * Refactor the code. Move data reader, preprocessing, evaluation to model specific example under E2E_example_mode * refactor code * Move yolov3 example to specific folder and add additional pre/post processing * Print a warning message for using newer c_api header on old binary (#6507) * Fix issues with ArmNN build setup (#6495) * ArmNN build fixes * Update BUILD.md to document that the ACL paths must be specified to build ArmNN * Fix CUDA build error. We don't setup the link libraries correctly/consistently so improve that. * Fix Windows CI builds by updating test scripts to work with numpy 1.20. (#6518) * Update onnxruntime_test_python.py to work with numpy 1.20. Some aliases are deprecated in favor of the built-in python types. See https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations np.array with bytes for entries and dtype of np.void no longer automatically pads. Change a test to adjust for that. * Fix another test script * Fix ORTModule branch for orttraining-* pipelines * Update pytorch nightly version dependency Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> Co-authored-by: George Wu <jywu@microsoft.com> Co-authored-by: Cecilia Liu <ziyue.liu7@gmail.com> Co-authored-by: Ryan Hill <38674843+RyanUnderhill@users.noreply.github.com> Co-authored-by: George Nash <george.nash@intel.com> Co-authored-by: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Co-authored-by: Yateng Hong <toothache9010@gmail.com> Co-authored-by: stevenlix <38092805+stevenlix@users.noreply.github.com> Co-authored-by: Derek Murray <Derek.Murray@microsoft.com> Co-authored-by: ashbhandare <ash.bhandare@gmail.com> Co-authored-by: Scott McKay <skottmckay@gmail.com> Co-authored-by: Changming Sun <chasun@microsoft.com> Co-authored-by: Tracy Sharpe <42477615+tracysh@users.noreply.github.com> Co-authored-by: Juliana Franco <jufranc@microsoft.com> Co-authored-by: Pranav Sharma <prs@microsoft.com> Co-authored-by: Tixxx <tix@microsoft.com> Co-authored-by: Jay Rodge <jayrodge@live.com> Co-authored-by: Du Li <duli1@microsoft.com> Co-authored-by: Du Li <duli@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Yufeng Li <liyufeng1987@gmail.com> Co-authored-by: baijumeswani <bmeswani@microsoft.com> Co-authored-by: Sergii Dymchenko <sedymche@microsoft.com> Co-authored-by: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Co-authored-by: satyajandhyala <satya.k.jandhyala@gmail.com> Co-authored-by: S. 
Manohar Karlapalem <manohar.karlapalem@intel.com> Co-authored-by: Weixing Zhang <weixingzhang@users.noreply.github.com> Co-authored-by: Suffian Khan <sukha@microsoft.com> Co-authored-by: Olivia Jain <oljain@microsoft.com> Co-authored-by: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Co-authored-by: Hariharan Seshadri <shariharan91@gmail.com> Co-authored-by: Ryan Lai <rylai@microsoft.com> Co-authored-by: Jesse Benson <jesseb@microsoft.com> Co-authored-by: sfatimar <64512376+sfatimar@users.noreply.github.com> Co-authored-by: suryasidd <surya.siddharth.pemmaraju@intel.com> Co-authored-by: sfatimar <sahar.fatima@intel/com> Co-authored-by: MaajidKhan <n.maajidkhan@gmail.com> Co-authored-by: mohdansx <mohdx.ansari@intel.com> Co-authored-by: Xavier Dupré <xadupre@users.noreply.github.com> Co-authored-by: Michael Goin <mgoin@vols.utk.edu> Co-authored-by: Michael Giba <michaelgiba@gmail.com> Co-authored-by: William Tambellini <wtambellini@sdl.com> Co-authored-by: Hector Li <hecli@microsoft.com> Co-authored-by: Aishwarya <aibhanda@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: liqunfu <liqfu@microsoft.com> Co-authored-by: liqun <liqun@OrtTrainingDev4.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: pengwa <pengwa@microsoft.com> Co-authored-by: Tang, Cheng <souptc@gmail.com> Co-authored-by: Cheng Tang <chenta@microsoft.com> Co-authored-by: Tianlei Wu <tlwu@microsoft.com> Co-authored-by: Ye Wang <52801275+wangyems@users.noreply.github.com> Co-authored-by: Chun-Wei Chen <jacky82226@gmail.com> Co-authored-by: Vincent Wang <wangwchpku@outlook.com> Co-authored-by: Vincent Wang <weicwang@microsoft.com> Co-authored-by: Luyao Ren <375833274@qq.com> Co-authored-by: Zhang Lei <zhang.huanning@hotmail.com> Co-authored-by: Tim Harris <tiharr@microsoft.com> Co-authored-by: Ashwini Khade <askhade@microsoft.com> Co-authored-by: Dmitri Smirnov <yuslepukhin@users.noreply.github.com> Co-authored-by: Alberto Magni <49027342+alberto-magni@users.noreply.github.com> Co-authored-by: Wei-Sheng Chin <wschin@outlook.com> Co-authored-by: wezuo <49965641+wezuo@users.noreply.github.com> Co-authored-by: Jesse Benson <benson.jesse@gmail.com> Co-authored-by: Wei Zuo <wezuo@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-mgtbby.eastus.cloudapp.azure.com> Co-authored-by: wezuo <wezuo@az-eus-v100-32gb-5-worker-yclzsf.eastus.cloudapp.azure.com> Co-authored-by: Wenbing Li <10278425+wenbingl@users.noreply.github.com> Co-authored-by: Martin Man <supermt@gmail.com> Co-authored-by: M. 
Zeeshan Siddiqui <mzs@microsoft.com> Co-authored-by: Ori Levari <ori.levari@microsoft.com> Co-authored-by: Ori Levari <orlevari@microsoft.com> Co-authored-by: Ubuntu <OrtTrainingDev3@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sherlock Huang <bahuang@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: Sheil Kumar <smk2007@gmail.com> Co-authored-by: Sheil Kumar <sheilk@microsoft.com> Co-authored-by: Ryota Tomioka <ryoto@microsoft.com> Co-authored-by: Adam Pocock <adam.pocock@oracle.com> Co-authored-by: Yulong Wang <f.s@qq.com> Co-authored-by: Faith Xu <faxu@microsoft.com> Co-authored-by: Xiang Zhang <xianz@microsoft.com> Co-authored-by: suryasidd <48925384+suryasidd@users.noreply.github.com> Co-authored-by: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Co-authored-by: Weixing Zhang <wezhan@microsoft.com> Co-authored-by: Chethan Palangotu Keshava <chethan.palangotu.keshava@intel.com> Co-authored-by: unknown <63478620+jeyblu@users.noreply.github.com>
2021-02-02 16:59:56 +00:00
$<TARGET_FILE:onnxruntime_providers_tensorrt>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
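# Copy the MIGraphX EP and shared provider-bridge libraries into onnxruntime/capi
# so the Python bindings can locate them at import time.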
if (onnxruntime_USE_MIGRAPHX)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_migraphx>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
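# Copy the OpenVINO EP and shared provider-bridge libraries into onnxruntime/capi.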
if (onnxruntime_USE_OPENVINO)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_openvino>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
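# If OPENVINO_MANYLINUX is set in the environment (presumably by the manylinux
# packaging pipeline), also copy the OpenVINO setup scripts into onnxruntime/capi.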
if (DEFINED ENV{OPENVINO_MANYLINUX})
file(GLOB onnxruntime_python_openvino_python_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/core/providers/openvino/scripts/*"
)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_openvino_python_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
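# Copy the CUDA EP and shared provider-bridge libraries into onnxruntime/capi.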
if (onnxruntime_USE_CUDA)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_cuda>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
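# Copy the CANN EP and shared provider-bridge libraries into onnxruntime/capi.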
if (onnxruntime_USE_CANN)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_cann>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
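# Copy the ROCm EP and shared provider-bridge libraries into onnxruntime/capi.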
if (onnxruntime_USE_ROCM)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_rocm>
$<TARGET_FILE:onnxruntime_providers_shared>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
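# Stage the TVM EP's Python sources under onnxruntime/providers/tvm in the build output.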
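# The TVM EP additionally ships Python helper sources, which are staged under
# onnxruntime/providers/tvm alongside the provider library itself.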
if (onnxruntime_USE_TVM)
file(GLOB onnxruntime_python_providers_tvm_srcs CONFIGURE_DEPENDS
"${ONNXRUNTIME_ROOT}/python/providers/tvm/*.py"
)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers
COMMAND ${CMAKE_COMMAND} -E make_directory $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers/tvm
COMMAND ${CMAKE_COMMAND} -E copy
${onnxruntime_python_providers_tvm_srcs}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers/tvm
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_tvm>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
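  # Build a TVM Python wheel from the fetched TVM sources (tvm_SOURCE_DIR).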
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
WORKING_DIRECTORY ${tvm_SOURCE_DIR}/python
COMMAND ${Python_EXECUTABLE} setup.py bdist_wheel
)
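  # Run extend_python_file.py against _ld_preload.py; presumably it appends
  # TVM preload imports so TVM is loaded before the native ORT bindings.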
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${Python_EXECUTABLE}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/providers/tvm/extend_python_file.py
--target_file $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/_ld_preload.py
)
endif()
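# DirectML ships as a prebuilt DLL; pick the path based on whether a custom
# DirectML package is in use, then copy it alongside the bindings.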
if (onnxruntime_USE_DML)
if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
set(dml_shared_lib_path ${DML_PACKAGE_DIR}/bin/${onnxruntime_target_platform}-win/${DML_SHARED_LIB})
else()
set(dml_shared_lib_path ${DML_PACKAGE_DIR}/bin/${DML_SHARED_LIB})
endif()
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${dml_shared_lib_path}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
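# Copy the NNAPI EP library into the capi directory.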
if (onnxruntime_USE_NNAPI_BUILTIN)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_nnapi>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
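# Copy the CoreML EP library into the capi directory.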
if (onnxruntime_USE_COREML)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
$<TARGET_FILE:onnxruntime_providers_coreml>
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
endif()
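# Copy the QNN runtime libraries (listed in QNN_LIB_FILES) into the capi directory.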
if (onnxruntime_USE_QNN)
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${QNN_LIB_FILES}
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
)
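  # If the Qualcomm AI Hub proprietary license is present in the QNN SDK,
  # bundle it with the package.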
if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
add_custom_command(
TARGET onnxruntime_pybind11_state POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
"${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf"
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
)
endif()
endif()
endif()