From f4f49535a4ec6fa5b2fff108b45aade25c8ba8ea Mon Sep 17 00:00:00 2001 From: Sumit Agarwal Date: Mon, 12 Aug 2024 07:02:00 -0700 Subject: [PATCH] [ORT 1.18.2] Cherry Pick Pad Optimizations + Update DML to 1.15.1 (#21670) ### Description This change cherry-picks 2 Pad fusion optimization: https://github.com/microsoft/onnxruntime/pull/21640 and https://github.com/microsoft/onnxruntime/pull/21556. It also has to cherry-pick 2 extra changes to unblock pipeline and dependency failure: https://github.com/microsoft/onnxruntime/pull/21300 and https://github.com/microsoft/onnxruntime/pull/21662 (didn't include test which are part of 1.18.1 payload). Also uploaded new version of [onnxruntime_build_dependencies:10.177](https://dev.azure.com/onnxruntime/onnxruntime/_artifacts/feed/onnxruntime/UPack/onnxruntime_build_dependencies/overview/1.0.177) and updated the same in `download-deps.yml`. Additionally it also updates DML binary to 1.15.1. ### Motivation and Context --------- Co-authored-by: Changming Sun Co-authored-by: Tianlei Wu --- .pipelines/nuget_config/x64/packages.config | 2 +- .pipelines/nuget_config/x86/packages.config | 2 +- VERSION_NUMBER | 2 +- cgmanifests/generated/cgmanifest.json | 2 +- cmake/CMakeLists.txt | 21 +++-- cmake/deps.txt | 2 +- cmake/external/dml.cmake | 2 +- cmake/patches/abseil/absl_windows.patch | 51 ++++++++-- docs/python/README.rst | 5 + js/common/lib/version.ts | 2 +- js/common/package-lock.json | 4 +- js/common/package.json | 2 +- js/node/lib/version.ts | 2 +- js/node/package-lock.json | 6 +- js/node/package.json | 2 +- js/react_native/lib/version.ts | 2 +- js/react_native/package.json | 2 +- js/react_native/yarn.lock | 2 +- js/web/lib/version.ts | 2 +- js/web/package-lock.json | 6 +- js/web/package.json | 2 +- onnxruntime/__init__.py | 2 +- onnxruntime/core/optimizer/pad_fusion.cc | 94 +++++++++++++------ onnxruntime/core/optimizer/pad_fusion.h | 2 +- .../migraphx/migraphx_execution_provider.cc | 1 + .../core/providers/qnn/builder/qnn_utils.cc | 3 + onnxruntime/core/session/onnxruntime_c_api.cc | 2 +- onnxruntime/test/onnx/TestCase.cc | 4 + .../python/orttraining_test_ortmodule_api.py | 4 +- packages.config | 2 +- .../orttraining-linux-ci-pipeline.yml | 1 - .../orttraining-linux-gpu-ci-pipeline.yml | 1 - ...ortmodule-distributed-test-ci-pipeline.yml | 3 +- .../templates/download-deps.yml | 4 +- ...orttraining-linux-gpu-test-ci-pipeline.yml | 4 +- .../nuget/generate_nuspec_for_native_nuget.py | 2 +- 36 files changed, 170 insertions(+), 82 deletions(-) diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config index 9066e13ee1..96bb053a13 100644 --- a/.pipelines/nuget_config/x64/packages.config +++ b/.pipelines/nuget_config/x64/packages.config @@ -1,6 +1,6 @@  - + diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config index a8e5b35b28..6bf842ac18 100644 --- a/.pipelines/nuget_config/x86/packages.config +++ b/.pipelines/nuget_config/x86/packages.config @@ -1,6 +1,6 @@  - + diff --git a/VERSION_NUMBER b/VERSION_NUMBER index ec6d649be6..b57fc7228b 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.18.1 +1.18.2 diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index eb74178b3e..148a3ba61f 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -36,7 +36,7 @@ "component": { "type": "git", "git": { - "commitHash": "4a2c63365eff8823a5221db86ef490e828306f9d", + "commitHash": "f46495ea96f68fc3f6c394f099b2992743f6ff7f", "repositoryUrl": "https://github.com/abseil/abseil-cpp.git" }, "comments": "abseil_cpp" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 8a1333206c..fa907e24f6 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -652,6 +652,12 @@ else() check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE) check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST) check_cxx_compiler_flag(-Wstringop-overflow HAS_STRINGOP_OVERFLOW) + if(onnxruntime_ENABLE_TRAINING_APIS) + check_cxx_compiler_flag(-Wdangling-reference HAS_DANGLING_REFERENCE) + if(HAS_DANGLING_REFERENCE) + list(APPEND ORT_WARNING_FLAGS -Wno-dangling-reference) + endif() + endif() check_function_exists(reallocarray HAS_REALLOCARRAY) if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64") check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16) @@ -819,8 +825,8 @@ if (onnxruntime_USE_QNN) file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so" "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll") if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc" OR ${QNN_ARCH_ABI} STREQUAL "arm64x-windows-msvc") file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so" - "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so" - "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat") + "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so" + "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat") list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB}) endif() message(STATUS "QNN lib files: " ${QNN_LIB_FILES}) @@ -1031,6 +1037,9 @@ function(onnxruntime_set_compile_flags target_name) foreach(FLAG ${ORT_WARNING_FLAGS}) target_compile_options(${target_name} PRIVATE "$<$:${FLAG}>") endforeach() + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 13 AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + target_compile_options(${target_name} PRIVATE "$<$:-Wno-maybe-uninitialized>") + endif() if (onnxruntime_USE_CUDA) foreach(FLAG ${ORT_WARNING_FLAGS}) target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options ${FLAG}>") @@ -1172,11 +1181,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 if (onnxruntime_USE_ACL_2002) add_definitions(-DACL_2002=1) else() - if (onnxruntime_USE_ACL_2308) - add_definitions(-DACL_2308=1) - else() + if (onnxruntime_USE_ACL_2308) + add_definitions(-DACL_2308=1) + else() add_definitions(-DACL_1905=1) - endif() + endif() endif() endif() endif() diff --git a/cmake/deps.txt b/cmake/deps.txt index d213b09034..62adbf53e2 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -12,7 +12,7 @@ # NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI. # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29 # -abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240116.0.zip;bc2cec6baaad67fcb6c0c38972b687d4797927e9 +abseil_cpp;https://github.com/abseil/abseil-cpp/archive/f46495ea96f68fc3f6c394f099b2992743f6ff7f.zip;0e2b6d1dc7f0a808d1e23f7dd985f7bc18d52cbc coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake index f74b694471..8b5f602643 100644 --- a/cmake/external/dml.cmake +++ b/cmake/external/dml.cmake @@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML) set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config) set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config) get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE) - set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.14.1) + set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.1) # Restore nuget packages, which will pull down the DirectML redist package. add_custom_command( diff --git a/cmake/patches/abseil/absl_windows.patch b/cmake/patches/abseil/absl_windows.patch index 584c49d612..8298364652 100644 --- a/cmake/patches/abseil/absl_windows.patch +++ b/cmake/patches/abseil/absl_windows.patch @@ -1,8 +1,43 @@ +diff --git a/absl/base/attributes.h b/absl/base/attributes.h +index 5ea5ee3e..f4949898 100644 +--- a/absl/base/attributes.h ++++ b/absl/base/attributes.h +@@ -559,7 +559,7 @@ + #undef ABSL_ATTRIBUTE_UNUSED + #define ABSL_ATTRIBUTE_UNUSED __attribute__((__unused__)) + #else +-#define ABSL_ATTRIBUTE_UNUSED ++#define ABSL_ATTRIBUTE_UNUSED [[maybe_unused]] + #endif + + // ABSL_ATTRIBUTE_INITIAL_EXEC +diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h +index d4fe8f5c..27418d13 100644 +--- a/absl/container/internal/raw_hash_set.h ++++ b/absl/container/internal/raw_hash_set.h +@@ -1924,7 +1924,7 @@ HashtablezInfoHandle SampleHashtablezInfo(size_t sizeof_slot, size_t sizeof_key, + // In SOO, we sample on the first insertion so if this is an empty SOO case + // (e.g. when reserve is called), then we still need to sample. + if (kSooEnabled && was_soo && c.size() == 0) { +- return Sample(sizeof_slot, sizeof_key, sizeof_value, SooCapacity()); ++ return Sample(sizeof_slot, sizeof_key, sizeof_value, (int16_t)SooCapacity()); + } + // For non-SOO cases, we sample whenever the capacity is increasing from zero + // to non-zero. +@@ -3525,7 +3525,7 @@ class raw_hash_set { + assert(is_soo()); + if (!ShouldSampleHashtablezInfo()) return HashtablezInfoHandle{}; + return Sample(sizeof(slot_type), sizeof(key_type), sizeof(value_type), +- SooCapacity()); ++ (int16_t)SooCapacity()); + } + + inline void destroy_slots() { diff --git a/absl/copts/GENERATED_AbseilCopts.cmake b/absl/copts/GENERATED_AbseilCopts.cmake -index a4ab1aa2..dfd13fd7 100644 +index da2282fe..4c7fc26f 100644 --- a/absl/copts/GENERATED_AbseilCopts.cmake +++ b/absl/copts/GENERATED_AbseilCopts.cmake -@@ -129,8 +129,6 @@ list(APPEND ABSL_MSVC_FLAGS +@@ -181,8 +181,6 @@ list(APPEND ABSL_MSVC_FLAGS "/wd4005" "/wd4068" "/wd4180" @@ -10,12 +45,12 @@ index a4ab1aa2..dfd13fd7 100644 - "/wd4267" "/wd4503" "/wd4800" - ) + "/DNOMINMAX" diff --git a/absl/copts/GENERATED_copts.bzl b/absl/copts/GENERATED_copts.bzl -index a6efc98e..8c4de8e7 100644 +index b9e0071e..dd8410ec 100644 --- a/absl/copts/GENERATED_copts.bzl +++ b/absl/copts/GENERATED_copts.bzl -@@ -130,8 +130,6 @@ ABSL_MSVC_FLAGS = [ +@@ -182,8 +182,6 @@ ABSL_MSVC_FLAGS = [ "/wd4005", "/wd4068", "/wd4180", @@ -23,12 +58,12 @@ index a6efc98e..8c4de8e7 100644 - "/wd4267", "/wd4503", "/wd4800", - ] + "/DNOMINMAX", diff --git a/absl/copts/copts.py b/absl/copts/copts.py -index e6e11949..0aa7d868 100644 +index 2d85ac74..4875d668 100644 --- a/absl/copts/copts.py +++ b/absl/copts/copts.py -@@ -115,10 +115,6 @@ MSVC_WARNING_FLAGS = [ +@@ -118,10 +118,6 @@ MSVC_WARNING_FLAGS = [ "/wd4068", # unknown pragma # qualifier applied to function type has no meaning; ignored "/wd4180", diff --git a/docs/python/README.rst b/docs/python/README.rst index de54b120da..2830df1460 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime `_ or the `Github project `_. """ -__version__ = "1.18.1" +__version__ = "1.18.2" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/core/optimizer/pad_fusion.cc b/onnxruntime/core/optimizer/pad_fusion.cc index a1c7f8de9e..3391e20cf0 100644 --- a/onnxruntime/core/optimizer/pad_fusion.cc +++ b/onnxruntime/core/optimizer/pad_fusion.cc @@ -8,26 +8,9 @@ namespace onnxruntime { -/* - * It matches following pattern: - * Pad - * | - * Conv/MaxPool - */ -bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const { - // if Pad has input axis, don't fuse it. - if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {1, 2, 11, 13, 18, 19}) || - node.GetOutputEdgesCount() != 1 || - node.InputDefs().size() > 3) { - return false; - } - - if (graph.NodeProducesGraphOutput(node)) { - return false; - } - - const Node& child_node = *node.OutputNodesBegin(); +bool VerifyNotCastChild(const Node& child_node) { if (!graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "Conv", {1, 11}) && + !graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "AveragePool", {1, 7, 10, 11, 19}) && !graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "MaxPool", {1, 8, 10, 11, 12})) { return false; } @@ -53,6 +36,45 @@ bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const log return false; } + return true; +} + +void UpdatePaddingAttribute(Node& child_node, const std::vector& pads_values, const uint32_t pads_size) { + auto child_pads = child_node.GetMutableAttributes()["pads"].mutable_ints(); + uint32_t child_pads_size = static_cast(child_pads->size()); + + for (uint32_t pads_index = 2, child_index = 0; pads_index < pads_size / 2; pads_index++, child_index++) { + child_pads->Set(child_index, child_pads->Get(child_index) + pads_values[pads_index]); + uint32_t mirrored_child_index = child_index + (child_pads_size / 2); + uint32_t mirrored_pad_index = pads_index + (pads_size / 2); + child_pads->Set(mirrored_child_index, child_pads->Get(mirrored_child_index) + pads_values[mirrored_pad_index]); + } +} +/* + * Before: + * Pad + * | + * Cast (Optional) + * | + * Conv/MaxPool/AveragePool + * + * After: + * Cast (Optional) + * | + * Conv/MaxPool/AveragePool + */ +bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const { + // if Pad has input axis, don't fuse it. + if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {1, 2, 11, 13, 18, 19}) || + node.GetOutputEdgesCount() != 1 || + node.InputDefs().size() > 3) { + return false; + } + + if (graph.NodeProducesGraphOutput(node)) { + return false; + } + const NodeAttributes& pad_attributes = node.GetAttributes(); if (pad_attributes.find("mode") != pad_attributes.end() && pad_attributes.at("mode").s() != "constant") { @@ -82,7 +104,19 @@ bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const log } } - return true; + const Node& child_node = *node.OutputNodesBegin(); + if (graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "Cast", {1, 6, 9, 13})) { + if (child_node.GetOutputEdgesCount() != 1) { + return false; + } + + if (graph.NodeProducesGraphOutput(child_node)) { + return false; + } + return VerifyNotCastChild(*child_node.OutputNodesBegin()); + } else { + return VerifyNotCastChild(child_node); + } } /* @@ -99,8 +133,6 @@ Status PadFusion::Apply(Graph& graph, Node& pad_node, RewriteRuleEffect& rule_ef pads_values.assign(pad_node.GetAttributes().at("pads").ints().begin(), pad_node.GetAttributes().at("pads").ints().end()); } - assert(static_cast(pads_values.size()) == (2 * static_cast(pad_node.InputDefs()[0]->Shape()->dim_size()))); - uint32_t pads_size = static_cast(pads_values.size()); // check if padding is applied only on feature dims if (pads_values[0] != 0 || pads_values[1] != 0 || pads_values[pads_size / 2] != 0 || @@ -114,18 +146,18 @@ Status PadFusion::Apply(Graph& graph, Node& pad_node, RewriteRuleEffect& rule_ef } Node& child_node = *graph.GetNode(pad_node.OutputNodesBegin()->Index()); - auto child_pads = child_node.GetMutableAttributes()["pads"].mutable_ints(); - uint32_t child_pads_size = static_cast(child_pads->size()); - - for (uint32_t pads_index = 2, child_index = 0; pads_index < pads_size / 2; pads_index++, child_index++) { - child_pads->Set(child_index, child_pads->Get(child_index) + pads_values[pads_index]); - uint32_t mirrored_child_index = child_index + (child_pads_size / 2); - uint32_t mirrored_pad_index = pads_index + (pads_size / 2); - child_pads->Set(mirrored_child_index, child_pads->Get(mirrored_child_index) + pads_values[mirrored_pad_index]); - } + // We don't need to cast the pad_constant_value because this fusion requires that constant_pad_value + // to be zero. See PadFusion::SatisfyCondition for details. + Node& target_padding_node = (child_node.OpType() == "Cast") ? *graph.GetNode(child_node.OutputNodesBegin()->Index()) : child_node; + UpdatePaddingAttribute(target_padding_node, pads_values, pads_size); graph_utils::RemoveNodeOutputEdges(graph, pad_node); graph_utils::ReplaceNodeInput(child_node, 0, *pad_node.MutableInputDefs()[0]); + // Un-pad the output shape of Cast node + if (child_node.OpType() == "Cast") { + auto* cast_output_node_arg = child_node.MutableOutputDefs()[0]; + cast_output_node_arg->SetShape(*pad_node.MutableInputDefs()[0]->Shape()); + } graph.RemoveNode(pad_node.Index()); rule_effect = RewriteRuleEffect::kRemovedCurrentNode; return Status::OK(); diff --git a/onnxruntime/core/optimizer/pad_fusion.h b/onnxruntime/core/optimizer/pad_fusion.h index a1b6978a83..ca05d219b7 100644 --- a/onnxruntime/core/optimizer/pad_fusion.h +++ b/onnxruntime/core/optimizer/pad_fusion.h @@ -8,7 +8,7 @@ namespace onnxruntime { /* * This fusion submerges a Pad operator to it's child - * Conv or MaxPool operator, if and only if PadFusion::SatisfyCondition() + * Conv or MaxPool or AveragePool operator, if and only if PadFusion::SatisfyCondition() * is true. */ class PadFusion : public RewriteRule { diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 50782569ee..182dba0ca4 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -16,6 +16,7 @@ #include "hip_allocator.h" #include "gpu_data_transfer.h" #include "migraphx_inc.h" +#include // TODO: find a better way to share this #include "core/providers/rocm/rocm_stream_handle.h" diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index 43a46e1788..aed334c778 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -319,6 +319,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_Tensor_t& tensor) { } out << ")"; out << " memType=" << GetQnnTensorMemType(tensor); +// TODO: the code below has compilation errors with the latest ABSL +#if 0 if (GetQnnTensorMemType(tensor) == QNN_TENSORMEMTYPE_RAW) { if (GetQnnTensorDataType(tensor) == QNN_DATATYPE_FLOAT_32) { operator<< (out, GetQnnTensorClientBuf(tensor)); @@ -335,6 +337,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_Tensor_t& tensor) { operator<< (out, GetQnnTensorClientBuf(tensor)); } } +#endif out << " quantizeParams:" << GetQnnTensorQParams(tensor); return out; } diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 069251c4de..82bee3d788 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -2763,7 +2763,7 @@ static_assert(offsetof(OrtApi, SessionOptionsAppendExecutionProvider_OpenVINO_V2 static_assert(offsetof(OrtApi, AddExternalInitializersFromFilesInMemory) / sizeof(void*) == 279, "Size of version 18 API cannot change"); // So that nobody forgets to finish an API version, this check will serve as a reminder: -static_assert(std::string_view(ORT_VERSION) == "1.18.1", +static_assert(std::string_view(ORT_VERSION) == "1.18.2", "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly"); // 1. Update the hardcoded version string in above static_assert to silence it // 2. If there were any APIs added to ort_api_1_to_18 above: diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index e12e940141..c9bd0d51e8 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1030,6 +1030,10 @@ std::unique_ptr> GetBrokenTests(const std::string& provider // std::set broken_tests_keyword_set = {}; if (provider_name == "cuda") { +#ifdef ENABLE_TRAINING_CORE + // cudnn frontend exception in orttraining-linux-gpu-ci-pipeline. + broken_tests->insert({"keras_lotus_resnet3D", "Temporarily disabled pending investigation", {}}); +#endif #ifdef _WIN32 broken_tests->insert({"LSTM_Seq_lens_unpacked", "this test fails with new image since Aug 25."}); broken_tests->insert({"bidaf", "this test fails with new image since Aug 25."}); diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index 24c637bd77..e1edf7767d 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -769,6 +769,8 @@ def test_scatternd_correctness(device, indices): @pytest.mark.parametrize("input_requires_grad", [False, True]) @pytest.mark.parametrize("conv_algo_search", [None, "EXHAUSTIVE", "HEURISTIC"]) def test_gradient_correctness_conv1d(use_fp16, input_requires_grad, conv_algo_search): + pytest.skip("Temporarily disabled pending investigation (might be related to cudnn frontend).") + class NeuralNetConv1D(torch.nn.Module): def __init__(self, in_channels, out_channels, kernel_size, padding=0, groups=1): super().__init__() @@ -6013,7 +6015,7 @@ def test_e2e_padding_elimination(): torch.manual_seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) - torch.backends.cudnn.determinstic = True + torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False class OneLayer(torch.nn.Module): diff --git a/packages.config b/packages.config index 3f3e4f5298..24289f3668 100644 --- a/packages.config +++ b/packages.config @@ -1,6 +1,6 @@  - + diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml index 96e2e0a758..1cc3eb816d 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml @@ -16,7 +16,6 @@ pr: branches: include: - main - - rel-* paths: exclude: - docs/** diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml index 2d2719fef8..ba31bb340e 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml @@ -16,7 +16,6 @@ pr: branches: include: - main - - rel-* paths: exclude: - docs/** diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml index 2c6b6183a9..5d37f58e96 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml @@ -16,7 +16,6 @@ pr: branches: include: - main - - rel-* paths: exclude: - docs/** @@ -71,7 +70,7 @@ stages: --volume $(Build.BinariesDirectory):/build \ --volume $(Agent.TempDirectory)/mnist:/mnist \ onnxruntime_ortmodule_distributed_tests_image \ - bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \ + bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && echo temporarily skip /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \ displayName: 'Run orttraining_ortmodule_distributed_tests.py' condition: succeededOrFailed() timeoutInMinutes: 30 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 85722c1cb8..0bbdd6463e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.164 + version: 1.0.177 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.164 + version: 1.0.177 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml index f832315c1f..5f07343326 100644 --- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml @@ -21,7 +21,7 @@ steps: --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ --volume $(Agent.TempDirectory)/mnist:/mnist \ ${{ parameters.DockerImageTag }} \ - bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw' --cwd /build" \ + bash -c "rm -rf /build/onnxruntime/ && python3 -m pip show torch && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw' --cwd /build" \ displayName: 'Run orttraining_ortmodule_tests.py' condition: succeededOrFailed() timeoutInMinutes: 60 @@ -35,7 +35,7 @@ steps: --volume $(Build.SourcesDirectory):/onnxruntime_src \ --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ ${{ parameters.DockerImageTag }} \ - bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \ + bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \ displayName: 'Run ORT Training APIs Tests' condition: succeededOrFailed() timeoutInMinutes: 120 diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index d200a2f666..52adb0a333 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -219,7 +219,7 @@ def add_common_dependencies(xml_text, package_name, version): def generate_dependencies(xml_text, package_name, version): - dml_dependency = '' + dml_dependency = '' if package_name == "Microsoft.AI.MachineLearning": xml_text.append("")