From f4f49535a4ec6fa5b2fff108b45aade25c8ba8ea Mon Sep 17 00:00:00 2001
From: Sumit Agarwal <sumitagarwal330@gmail.com>
Date: Mon, 12 Aug 2024 07:02:00 -0700
Subject: [PATCH] [ORT 1.18.2] Cherry Pick Pad Optimizations + Update DML to
 1.15.1 (#21670)

### Description
This change cherry-picks 2 Pad fusion optimization:
https://github.com/microsoft/onnxruntime/pull/21640 and
https://github.com/microsoft/onnxruntime/pull/21556.

It also has to cherry-pick 2 extra changes to unblock pipeline and
dependency failure: https://github.com/microsoft/onnxruntime/pull/21300
and https://github.com/microsoft/onnxruntime/pull/21662 (didn't include
test which are part of 1.18.1 payload).

Also uploaded new version of
[onnxruntime_build_dependencies:10.177](https://dev.azure.com/onnxruntime/onnxruntime/_artifacts/feed/onnxruntime/UPack/onnxruntime_build_dependencies/overview/1.0.177)
and updated the same in `download-deps.yml`.

Additionally it also updates DML binary to 1.15.1.



### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

---------

Co-authored-by: Changming Sun <chasun@microsoft.com>
Co-authored-by: Tianlei Wu <tlwu@microsoft.com>
---
 .pipelines/nuget_config/x64/packages.config   |  2 +-
 .pipelines/nuget_config/x86/packages.config   |  2 +-
 VERSION_NUMBER                                |  2 +-
 cgmanifests/generated/cgmanifest.json         |  2 +-
 cmake/CMakeLists.txt                          | 21 +++--
 cmake/deps.txt                                |  2 +-
 cmake/external/dml.cmake                      |  2 +-
 cmake/patches/abseil/absl_windows.patch       | 51 ++++++++--
 docs/python/README.rst                        |  5 +
 js/common/lib/version.ts                      |  2 +-
 js/common/package-lock.json                   |  4 +-
 js/common/package.json                        |  2 +-
 js/node/lib/version.ts                        |  2 +-
 js/node/package-lock.json                     |  6 +-
 js/node/package.json                          |  2 +-
 js/react_native/lib/version.ts                |  2 +-
 js/react_native/package.json                  |  2 +-
 js/react_native/yarn.lock                     |  2 +-
 js/web/lib/version.ts                         |  2 +-
 js/web/package-lock.json                      |  6 +-
 js/web/package.json                           |  2 +-
 onnxruntime/__init__.py                       |  2 +-
 onnxruntime/core/optimizer/pad_fusion.cc      | 94 +++++++++++++------
 onnxruntime/core/optimizer/pad_fusion.h       |  2 +-
 .../migraphx/migraphx_execution_provider.cc   |  1 +
 .../core/providers/qnn/builder/qnn_utils.cc   |  3 +
 onnxruntime/core/session/onnxruntime_c_api.cc |  2 +-
 onnxruntime/test/onnx/TestCase.cc             |  4 +
 .../python/orttraining_test_ortmodule_api.py  |  4 +-
 packages.config                               |  2 +-
 .../orttraining-linux-ci-pipeline.yml         |  1 -
 .../orttraining-linux-gpu-ci-pipeline.yml     |  1 -
 ...ortmodule-distributed-test-ci-pipeline.yml |  3 +-
 .../templates/download-deps.yml               |  4 +-
 ...orttraining-linux-gpu-test-ci-pipeline.yml |  4 +-
 .../nuget/generate_nuspec_for_native_nuget.py |  2 +-
 36 files changed, 170 insertions(+), 82 deletions(-)
diff --git a/.pipelines/nuget_config/x64/packages.config b/.pipelines/nuget_config/x64/packages.config
index 9066e13ee1..96bb053a13 100644
--- a/.pipelines/nuget_config/x64/packages.config
+++ b/.pipelines/nuget_config/x64/packages.config
@@ -1,6 +1,6 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <packages>
   <package id="python" version="3.9.7" targetFramework="native" />
-  <package id="Microsoft.AI.DirectML" version="1.14.1" targetFramework="native" />
+  <package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
   <package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
 </packages>
diff --git a/.pipelines/nuget_config/x86/packages.config b/.pipelines/nuget_config/x86/packages.config
index a8e5b35b28..6bf842ac18 100644
--- a/.pipelines/nuget_config/x86/packages.config
+++ b/.pipelines/nuget_config/x86/packages.config
@@ -1,6 +1,6 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <packages>
   <package id="pythonx86" version="3.9.7" targetFramework="native" />
-  <package id="Microsoft.AI.DirectML" version="1.14.1" targetFramework="native" />
+  <package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
   <package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
 </packages>
diff --git a/VERSION_NUMBER b/VERSION_NUMBER
index ec6d649be6..b57fc7228b 100644
--- a/VERSION_NUMBER
+++ b/VERSION_NUMBER
@@ -1 +1 @@
-1.18.1
+1.18.2
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index eb74178b3e..148a3ba61f 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -36,7 +36,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "4a2c63365eff8823a5221db86ef490e828306f9d",
+          "commitHash": "f46495ea96f68fc3f6c394f099b2992743f6ff7f",
           "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
         },
         "comments": "abseil_cpp"
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8a1333206c..fa907e24f6 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -652,6 +652,12 @@ else()
   check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE)
   check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST)
   check_cxx_compiler_flag(-Wstringop-overflow HAS_STRINGOP_OVERFLOW)
+  if(onnxruntime_ENABLE_TRAINING_APIS)
+    check_cxx_compiler_flag(-Wdangling-reference HAS_DANGLING_REFERENCE)
+    if(HAS_DANGLING_REFERENCE)
+      list(APPEND ORT_WARNING_FLAGS -Wno-dangling-reference)
+    endif()
+  endif()
   check_function_exists(reallocarray HAS_REALLOCARRAY)
   if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64")
    check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16)
@@ -819,8 +825,8 @@ if (onnxruntime_USE_QNN)
       file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so" "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll")
       if (${QNN_ARCH_ABI} STREQUAL "aarch64-windows-msvc" OR ${QNN_ARCH_ABI} STREQUAL "arm64x-windows-msvc")
         file(GLOB EXTRA_HTP_LIB LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/hexagon-v68/unsigned/libQnnHtpV68Skel.so"
-		                                       "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so"
-		                                       "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat")
+                                               "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libQnnHtpV73Skel.so"
+                                               "${onnxruntime_QNN_HOME}/lib/hexagon-v73/unsigned/libqnnhtpv73.cat")
         list(APPEND QNN_LIB_FILES ${EXTRA_HTP_LIB})
       endif()
       message(STATUS "QNN lib files: " ${QNN_LIB_FILES})
@@ -1031,6 +1037,9 @@ function(onnxruntime_set_compile_flags target_name)
     foreach(FLAG ${ORT_WARNING_FLAGS})
       target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${FLAG}>")
     endforeach()
+    if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 13 AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 12)
+      target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-maybe-uninitialized>")
+    endif()
     if (onnxruntime_USE_CUDA)
       foreach(FLAG ${ORT_WARNING_FLAGS})
         target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options ${FLAG}>")
@@ -1172,11 +1181,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905
       if (onnxruntime_USE_ACL_2002)
         add_definitions(-DACL_2002=1)
       else()
-	if (onnxruntime_USE_ACL_2308)
-	  add_definitions(-DACL_2308=1)
-	else()
+    if (onnxruntime_USE_ACL_2308)
+      add_definitions(-DACL_2308=1)
+    else()
           add_definitions(-DACL_1905=1)
-	endif()
+    endif()
       endif()
     endif()
   endif()
diff --git a/cmake/deps.txt b/cmake/deps.txt
index d213b09034..62adbf53e2 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -12,7 +12,7 @@
 # NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI.
 # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29
 #
-abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240116.0.zip;bc2cec6baaad67fcb6c0c38972b687d4797927e9
+abseil_cpp;https://github.com/abseil/abseil-cpp/archive/f46495ea96f68fc3f6c394f099b2992743f6ff7f.zip;0e2b6d1dc7f0a808d1e23f7dd985f7bc18d52cbc
 coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a
 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0
 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159
diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake
index f74b694471..8b5f602643 100644
--- a/cmake/external/dml.cmake
+++ b/cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
   set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
   set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
   get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
-  set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.14.1)
+  set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.1)
 
   # Restore nuget packages, which will pull down the DirectML redist package.
   add_custom_command(
diff --git a/cmake/patches/abseil/absl_windows.patch b/cmake/patches/abseil/absl_windows.patch
index 584c49d612..8298364652 100644
--- a/cmake/patches/abseil/absl_windows.patch
+++ b/cmake/patches/abseil/absl_windows.patch
@@ -1,8 +1,43 @@
+diff --git a/absl/base/attributes.h b/absl/base/attributes.h
+index 5ea5ee3e..f4949898 100644
+--- a/absl/base/attributes.h
++++ b/absl/base/attributes.h
+@@ -559,7 +559,7 @@
+ #undef ABSL_ATTRIBUTE_UNUSED
+ #define ABSL_ATTRIBUTE_UNUSED __attribute__((__unused__))
+ #else
+-#define ABSL_ATTRIBUTE_UNUSED
++#define ABSL_ATTRIBUTE_UNUSED [[maybe_unused]]
+ #endif
+ 
+ // ABSL_ATTRIBUTE_INITIAL_EXEC
+diff --git a/absl/container/internal/raw_hash_set.h b/absl/container/internal/raw_hash_set.h
+index d4fe8f5c..27418d13 100644
+--- a/absl/container/internal/raw_hash_set.h
++++ b/absl/container/internal/raw_hash_set.h
+@@ -1924,7 +1924,7 @@ HashtablezInfoHandle SampleHashtablezInfo(size_t sizeof_slot, size_t sizeof_key,
+   // In SOO, we sample on the first insertion so if this is an empty SOO case
+   // (e.g. when reserve is called), then we still need to sample.
+   if (kSooEnabled && was_soo && c.size() == 0) {
+-    return Sample(sizeof_slot, sizeof_key, sizeof_value, SooCapacity());
++    return Sample(sizeof_slot, sizeof_key, sizeof_value, (int16_t)SooCapacity());
+   }
+   // For non-SOO cases, we sample whenever the capacity is increasing from zero
+   // to non-zero.
+@@ -3525,7 +3525,7 @@ class raw_hash_set {
+     assert(is_soo());
+     if (!ShouldSampleHashtablezInfo<CharAlloc>()) return HashtablezInfoHandle{};
+     return Sample(sizeof(slot_type), sizeof(key_type), sizeof(value_type),
+-                  SooCapacity());
++                  (int16_t)SooCapacity());
+   }
+ 
+   inline void destroy_slots() {
 diff --git a/absl/copts/GENERATED_AbseilCopts.cmake b/absl/copts/GENERATED_AbseilCopts.cmake
-index a4ab1aa2..dfd13fd7 100644
+index da2282fe..4c7fc26f 100644
 --- a/absl/copts/GENERATED_AbseilCopts.cmake
 +++ b/absl/copts/GENERATED_AbseilCopts.cmake
-@@ -129,8 +129,6 @@ list(APPEND ABSL_MSVC_FLAGS
+@@ -181,8 +181,6 @@ list(APPEND ABSL_MSVC_FLAGS
      "/wd4005"
      "/wd4068"
      "/wd4180"
@@ -10,12 +45,12 @@ index a4ab1aa2..dfd13fd7 100644
 -    "/wd4267"
      "/wd4503"
      "/wd4800"
- )
+     "/DNOMINMAX"
 diff --git a/absl/copts/GENERATED_copts.bzl b/absl/copts/GENERATED_copts.bzl
-index a6efc98e..8c4de8e7 100644
+index b9e0071e..dd8410ec 100644
 --- a/absl/copts/GENERATED_copts.bzl
 +++ b/absl/copts/GENERATED_copts.bzl
-@@ -130,8 +130,6 @@ ABSL_MSVC_FLAGS = [
+@@ -182,8 +182,6 @@ ABSL_MSVC_FLAGS = [
      "/wd4005",
      "/wd4068",
      "/wd4180",
@@ -23,12 +58,12 @@ index a6efc98e..8c4de8e7 100644
 -    "/wd4267",
      "/wd4503",
      "/wd4800",
- ]
+     "/DNOMINMAX",
 diff --git a/absl/copts/copts.py b/absl/copts/copts.py
-index e6e11949..0aa7d868 100644
+index 2d85ac74..4875d668 100644
 --- a/absl/copts/copts.py
 +++ b/absl/copts/copts.py
-@@ -115,10 +115,6 @@ MSVC_WARNING_FLAGS = [
+@@ -118,10 +118,6 @@ MSVC_WARNING_FLAGS = [
      "/wd4068",  # unknown pragma
      # qualifier applied to function type has no meaning; ignored
      "/wd4180",
diff --git a/docs/python/README.rst b/docs/python/README.rst
index de54b120da..2830df1460 100644
--- a/docs/python/README.rst
+++ b/docs/python/README.rst
@@ -8,6 +8,11 @@ For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://ak
 Changes
 -------
 
+1.18.2
+^^^^^^
+
+Release Notes : https://github.com/Microsoft/onnxruntime/releases/tag/v1.18.2
+
 1.18.1
 ^^^^^^
 
diff --git a/js/common/lib/version.ts b/js/common/lib/version.ts
index 9d230e4398..e605ca526b 100644
--- a/js/common/lib/version.ts
+++ b/js/common/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.18.1';
+export const version = '1.18.2';
diff --git a/js/common/package-lock.json b/js/common/package-lock.json
index 4ad0c584e0..081e2b0ffc 100644
--- a/js/common/package-lock.json
+++ b/js/common/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-common",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-common",
-      "version": "1.18.1",
+      "version": "1.18.2",
       "license": "MIT",
       "devDependencies": {
         "typedoc": "^0.25.7"
diff --git a/js/common/package.json b/js/common/package.json
index b047ef97cc..61fd770630 100644
--- a/js/common/package.json
+++ b/js/common/package.json
@@ -2,7 +2,7 @@
   "license": "MIT",
   "type": "module",
   "name": "onnxruntime-common",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "repository": {
     "url": "https://github.com/Microsoft/onnxruntime.git",
     "type": "git"
diff --git a/js/node/lib/version.ts b/js/node/lib/version.ts
index 9d230e4398..e605ca526b 100644
--- a/js/node/lib/version.ts
+++ b/js/node/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.18.1';
+export const version = '1.18.2';
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index 4fd2267daf..15ee5ef18b 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-node",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-node",
-      "version": "1.18.1",
+      "version": "1.18.2",
       "hasInstallScript": true,
       "license": "MIT",
       "os": [
@@ -29,7 +29,7 @@
     },
     "../common": {
       "name": "onnxruntime-common",
-      "version": "1.18.1",
+      "version": "1.18.2",
       "license": "MIT",
       "devDependencies": {
         "typedoc": "^0.25.7"
diff --git a/js/node/package.json b/js/node/package.json
index 9953e361ba..f844e6d085 100644
--- a/js/node/package.json
+++ b/js/node/package.json
@@ -13,7 +13,7 @@
       3
     ]
   },
-  "version": "1.18.1",
+  "version": "1.18.2",
   "dependencies": {
     "onnxruntime-common": "file:../common",
     "tar": "^7.0.1"
diff --git a/js/react_native/lib/version.ts b/js/react_native/lib/version.ts
index 9d230e4398..e605ca526b 100644
--- a/js/react_native/lib/version.ts
+++ b/js/react_native/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.18.1';
+export const version = '1.18.2';
diff --git a/js/react_native/package.json b/js/react_native/package.json
index 87275da85d..1bedfeed9b 100644
--- a/js/react_native/package.json
+++ b/js/react_native/package.json
@@ -36,7 +36,7 @@
     "registry": "https://registry.npmjs.org/"
   },
   "source": "lib/index",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "main": "dist/commonjs/index",
   "homepage": "https://github.com/microsoft/onnxruntime/blob/main/js/react_native/README.md",
   "files": [
diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock
index 953f93b74a..9df73e00c0 100644
--- a/js/react_native/yarn.lock
+++ b/js/react_native/yarn.lock
@@ -5254,7 +5254,7 @@ onetime@^5.1.0, onetime@^5.1.2:
     mimic-fn "^2.1.0"
 
 "onnxruntime-common@file:../common":
-  version "1.18.1"
+  version "1.18.2"
 
 open@^6.2.0:
   version "6.4.0"
diff --git a/js/web/lib/version.ts b/js/web/lib/version.ts
index 9d230e4398..e605ca526b 100644
--- a/js/web/lib/version.ts
+++ b/js/web/lib/version.ts
@@ -4,4 +4,4 @@
 // This file is generated by /js/scripts/update-version.ts
 // Do not modify file content manually.
 
-export const version = '1.18.1';
+export const version = '1.18.2';
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index 17e55dfeab..0f5c4515bd 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "onnxruntime-web",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "onnxruntime-web",
-      "version": "1.18.1",
+      "version": "1.18.2",
       "license": "MIT",
       "dependencies": {
         "flatbuffers": "^1.12.0",
@@ -49,7 +49,7 @@
     },
     "../common": {
       "name": "onnxruntime-common",
-      "version": "1.18.1",
+      "version": "1.18.2",
       "license": "MIT",
       "devDependencies": {
         "typedoc": "^0.25.7"
diff --git a/js/web/package.json b/js/web/package.json
index b06a3cf9a2..c1f790d5df 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -7,7 +7,7 @@
     "type": "git"
   },
   "author": "fs-eire",
-  "version": "1.18.1",
+  "version": "1.18.2",
   "jsdelivr": "dist/ort.min.js",
   "dependencies": {
     "flatbuffers": "^1.12.0",
diff --git a/onnxruntime/__init__.py b/onnxruntime/__init__.py
index 040b497f90..37eba94ee8 100644
--- a/onnxruntime/__init__.py
+++ b/onnxruntime/__init__.py
@@ -7,7 +7,7 @@ ONNX Runtime is a performance-focused scoring engine for Open Neural Network Exc
 For more information on ONNX Runtime, please see `aka.ms/onnxruntime <https://aka.ms/onnxruntime/>`_
 or the `Github project <https://github.com/microsoft/onnxruntime/>`_.
 """
-__version__ = "1.18.1"
+__version__ = "1.18.2"
 __author__ = "Microsoft"
 
 # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package).
diff --git a/onnxruntime/core/optimizer/pad_fusion.cc b/onnxruntime/core/optimizer/pad_fusion.cc
index a1c7f8de9e..3391e20cf0 100644
--- a/onnxruntime/core/optimizer/pad_fusion.cc
+++ b/onnxruntime/core/optimizer/pad_fusion.cc
@@ -8,26 +8,9 @@
 
 namespace onnxruntime {
 
-/*
- * It matches following pattern:
- *     Pad
- *      |
- *   Conv/MaxPool
- */
-bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const {
-  // if Pad has input axis, don't fuse it.
-  if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {1, 2, 11, 13, 18, 19}) ||
-      node.GetOutputEdgesCount() != 1 ||
-      node.InputDefs().size() > 3) {
-    return false;
-  }
-
-  if (graph.NodeProducesGraphOutput(node)) {
-    return false;
-  }
-
-  const Node& child_node = *node.OutputNodesBegin();
+bool VerifyNotCastChild(const Node& child_node) {
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "Conv", {1, 11}) &&
+      !graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "AveragePool", {1, 7, 10, 11, 19}) &&
       !graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "MaxPool", {1, 8, 10, 11, 12})) {
     return false;
   }
@@ -53,6 +36,45 @@ bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const log
     return false;
   }
 
+  return true;
+}
+
+void UpdatePaddingAttribute(Node& child_node, const std::vector<int64_t>& pads_values, const uint32_t pads_size) {
+  auto child_pads = child_node.GetMutableAttributes()["pads"].mutable_ints();
+  uint32_t child_pads_size = static_cast<uint32_t>(child_pads->size());
+
+  for (uint32_t pads_index = 2, child_index = 0; pads_index < pads_size / 2; pads_index++, child_index++) {
+    child_pads->Set(child_index, child_pads->Get(child_index) + pads_values[pads_index]);
+    uint32_t mirrored_child_index = child_index + (child_pads_size / 2);
+    uint32_t mirrored_pad_index = pads_index + (pads_size / 2);
+    child_pads->Set(mirrored_child_index, child_pads->Get(mirrored_child_index) + pads_values[mirrored_pad_index]);
+  }
+}
+/*
+ * Before:
+ *     Pad
+ *      |
+ *    Cast (Optional)
+ *      |
+ *   Conv/MaxPool/AveragePool
+ *
+ * After:
+ *    Cast (Optional)
+ *      |
+ *   Conv/MaxPool/AveragePool
+ */
+bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const {
+  // if Pad has input axis, don't fuse it.
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {1, 2, 11, 13, 18, 19}) ||
+      node.GetOutputEdgesCount() != 1 ||
+      node.InputDefs().size() > 3) {
+    return false;
+  }
+
+  if (graph.NodeProducesGraphOutput(node)) {
+    return false;
+  }
+
   const NodeAttributes& pad_attributes = node.GetAttributes();
   if (pad_attributes.find("mode") != pad_attributes.end() &&
       pad_attributes.at("mode").s() != "constant") {
@@ -82,7 +104,19 @@ bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const log
     }
   }
 
-  return true;
+  const Node& child_node = *node.OutputNodesBegin();
+  if (graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "Cast", {1, 6, 9, 13})) {
+    if (child_node.GetOutputEdgesCount() != 1) {
+      return false;
+    }
+
+    if (graph.NodeProducesGraphOutput(child_node)) {
+      return false;
+    }
+    return VerifyNotCastChild(*child_node.OutputNodesBegin());
+  } else {
+    return VerifyNotCastChild(child_node);
+  }
 }
 
 /*
@@ -99,8 +133,6 @@ Status PadFusion::Apply(Graph& graph, Node& pad_node, RewriteRuleEffect& rule_ef
     pads_values.assign(pad_node.GetAttributes().at("pads").ints().begin(), pad_node.GetAttributes().at("pads").ints().end());
   }
 
-  assert(static_cast<uint32_t>(pads_values.size()) == (2 * static_cast<uint32_t>(pad_node.InputDefs()[0]->Shape()->dim_size())));
-
   uint32_t pads_size = static_cast<uint32_t>(pads_values.size());
   // check if padding is applied only on feature dims
   if (pads_values[0] != 0 || pads_values[1] != 0 || pads_values[pads_size / 2] != 0 ||
@@ -114,18 +146,18 @@ Status PadFusion::Apply(Graph& graph, Node& pad_node, RewriteRuleEffect& rule_ef
   }
 
   Node& child_node = *graph.GetNode(pad_node.OutputNodesBegin()->Index());
-  auto child_pads = child_node.GetMutableAttributes()["pads"].mutable_ints();
-  uint32_t child_pads_size = static_cast<uint32_t>(child_pads->size());
-
-  for (uint32_t pads_index = 2, child_index = 0; pads_index < pads_size / 2; pads_index++, child_index++) {
-    child_pads->Set(child_index, child_pads->Get(child_index) + pads_values[pads_index]);
-    uint32_t mirrored_child_index = child_index + (child_pads_size / 2);
-    uint32_t mirrored_pad_index = pads_index + (pads_size / 2);
-    child_pads->Set(mirrored_child_index, child_pads->Get(mirrored_child_index) + pads_values[mirrored_pad_index]);
-  }
+  // We don't need to cast the pad_constant_value because this fusion requires that constant_pad_value
+  // to be zero. See PadFusion::SatisfyCondition for details.
+  Node& target_padding_node = (child_node.OpType() == "Cast") ? *graph.GetNode(child_node.OutputNodesBegin()->Index()) : child_node;
+  UpdatePaddingAttribute(target_padding_node, pads_values, pads_size);
 
   graph_utils::RemoveNodeOutputEdges(graph, pad_node);
   graph_utils::ReplaceNodeInput(child_node, 0, *pad_node.MutableInputDefs()[0]);
+  // Un-pad the output shape of Cast node
+  if (child_node.OpType() == "Cast") {
+    auto* cast_output_node_arg = child_node.MutableOutputDefs()[0];
+    cast_output_node_arg->SetShape(*pad_node.MutableInputDefs()[0]->Shape());
+  }
   graph.RemoveNode(pad_node.Index());
   rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
   return Status::OK();
diff --git a/onnxruntime/core/optimizer/pad_fusion.h b/onnxruntime/core/optimizer/pad_fusion.h
index a1b6978a83..ca05d219b7 100644
--- a/onnxruntime/core/optimizer/pad_fusion.h
+++ b/onnxruntime/core/optimizer/pad_fusion.h
@@ -8,7 +8,7 @@
 namespace onnxruntime {
 /*
  *   This fusion submerges a Pad operator to it's child
- *   Conv or MaxPool operator, if and only if PadFusion::SatisfyCondition()
+ *   Conv or MaxPool or AveragePool operator, if and only if PadFusion::SatisfyCondition()
  *   is true.
  */
 class PadFusion : public RewriteRule {
diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index 50782569ee..182dba0ca4 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -16,6 +16,7 @@
 #include "hip_allocator.h"
 #include "gpu_data_transfer.h"
 #include "migraphx_inc.h"
+#include <hip/hip_version.h>
 
 // TODO: find a better way to share this
 #include "core/providers/rocm/rocm_stream_handle.h"
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index 43a46e1788..aed334c778 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -319,6 +319,8 @@ std::ostream& operator<<(std::ostream& out, const Qnn_Tensor_t& tensor) {
   }
   out << ")";
   out << " memType=" << GetQnnTensorMemType(tensor);
+// TODO: the code below has compilation errors with the latest ABSL
+#if 0
   if (GetQnnTensorMemType(tensor) == QNN_TENSORMEMTYPE_RAW) {
     if (GetQnnTensorDataType(tensor) == QNN_DATATYPE_FLOAT_32) {
       operator<< <float>(out, GetQnnTensorClientBuf(tensor));
@@ -335,6 +337,7 @@ std::ostream& operator<<(std::ostream& out, const Qnn_Tensor_t& tensor) {
       operator<< <int8_t>(out, GetQnnTensorClientBuf(tensor));
     }
   }
+#endif
   out << " quantizeParams:" << GetQnnTensorQParams(tensor);
   return out;
 }
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 069251c4de..82bee3d788 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -2763,7 +2763,7 @@ static_assert(offsetof(OrtApi, SessionOptionsAppendExecutionProvider_OpenVINO_V2
 static_assert(offsetof(OrtApi, AddExternalInitializersFromFilesInMemory) / sizeof(void*) == 279, "Size of version 18 API cannot change");
 
 // So that nobody forgets to finish an API version, this check will serve as a reminder:
-static_assert(std::string_view(ORT_VERSION) == "1.18.1",
+static_assert(std::string_view(ORT_VERSION) == "1.18.2",
               "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly");
 // 1. Update the hardcoded version string in above static_assert to silence it
 // 2. If there were any APIs added to ort_api_1_to_18 above:
diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc
index e12e940141..c9bd0d51e8 100644
--- a/onnxruntime/test/onnx/TestCase.cc
+++ b/onnxruntime/test/onnx/TestCase.cc
@@ -1030,6 +1030,10 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider
   // std::set<std::string> broken_tests_keyword_set = {};
 
   if (provider_name == "cuda") {
+#ifdef ENABLE_TRAINING_CORE
+    // cudnn frontend exception in orttraining-linux-gpu-ci-pipeline.
+    broken_tests->insert({"keras_lotus_resnet3D", "Temporarily disabled pending investigation", {}});
+#endif
 #ifdef _WIN32
     broken_tests->insert({"LSTM_Seq_lens_unpacked", "this test fails with new image since Aug 25."});
     broken_tests->insert({"bidaf", "this test fails with new image since Aug 25."});
diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
index 24c637bd77..e1edf7767d 100644
--- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
@@ -769,6 +769,8 @@ def test_scatternd_correctness(device, indices):
 @pytest.mark.parametrize("input_requires_grad", [False, True])
 @pytest.mark.parametrize("conv_algo_search", [None, "EXHAUSTIVE", "HEURISTIC"])
 def test_gradient_correctness_conv1d(use_fp16, input_requires_grad, conv_algo_search):
+    pytest.skip("Temporarily disabled pending investigation (might be related to cudnn frontend).")
+
     class NeuralNetConv1D(torch.nn.Module):
         def __init__(self, in_channels, out_channels, kernel_size, padding=0, groups=1):
             super().__init__()
@@ -6013,7 +6015,7 @@ def test_e2e_padding_elimination():
     torch.manual_seed(seed)
     torch.cuda.manual_seed(seed)
     torch.cuda.manual_seed_all(seed)
-    torch.backends.cudnn.determinstic = True
+    torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
     class OneLayer(torch.nn.Module):
diff --git a/packages.config b/packages.config
index 3f3e4f5298..24289f3668 100644
--- a/packages.config
+++ b/packages.config
@@ -1,6 +1,6 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <packages>
-  <package id="Microsoft.AI.DirectML" version="1.14.1" targetFramework="native" />
+  <package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
   <package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
   <package id="google.protobuf.tools" version="3.21.12" targetFramework="native" />
 </packages>
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml
index 96e2e0a758..1cc3eb816d 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-ci-pipeline.yml
@@ -16,7 +16,6 @@ pr:
   branches:
     include:
     - main
-    - rel-*
   paths:
     exclude:
     - docs/**
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
index 2d2719fef8..ba31bb340e 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ci-pipeline.yml
@@ -16,7 +16,6 @@ pr:
   branches:
     include:
     - main
-    - rel-*
   paths:
     exclude:
     - docs/**
diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml
index 2c6b6183a9..5d37f58e96 100644
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml
@@ -16,7 +16,6 @@ pr:
   branches:
     include:
     - main
-    - rel-*
   paths:
     exclude:
     - docs/**
@@ -71,7 +70,7 @@ stages:
           --volume $(Build.BinariesDirectory):/build \
           --volume $(Agent.TempDirectory)/mnist:/mnist \
           onnxruntime_ortmodule_distributed_tests_image \
-            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \
+            bash -c "rm -rf /build/RelWithDebInfo/onnxruntime/ && python3 -m pip install /build/RelWithDebInfo/dist/onnxruntime*.whl && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && echo temporarily skip /build/RelWithDebInfo/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_distributed_tests.py --mnist /mnist' --cwd /build/RelWithDebInfo" \
       displayName: 'Run orttraining_ortmodule_distributed_tests.py'
       condition: succeededOrFailed()
       timeoutInMinutes: 30
diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
index 85722c1cb8..0bbdd6463e 100644
--- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
@@ -11,7 +11,7 @@ steps:
       packageType: upack
       feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
       definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0'
-      version: 1.0.164
+      version: 1.0.177
       downloadPath: $(Build.BinariesDirectory)/deps
 
 # The private ADO project
@@ -22,7 +22,7 @@ steps:
       packageType: upack
       feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
       definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
-      version: 1.0.164
+      version: 1.0.177
       downloadPath: $(Build.BinariesDirectory)/deps
 
 # You can add more ADO accounts at here.
diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
index f832315c1f..5f07343326 100644
--- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml
@@ -21,7 +21,7 @@ steps:
       --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \
       --volume $(Agent.TempDirectory)/mnist:/mnist \
       ${{ parameters.DockerImageTag }} \
-        bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw' --cwd /build" \
+        bash -c "rm -rf /build/onnxruntime/ && python3 -m pip show torch && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw' --cwd /build" \
   displayName: 'Run orttraining_ortmodule_tests.py'
   condition: succeededOrFailed()
   timeoutInMinutes: 60
@@ -35,7 +35,7 @@ steps:
       --volume $(Build.SourcesDirectory):/onnxruntime_src \
       --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \
       ${{ parameters.DockerImageTag }} \
-        bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \
+        bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \
   displayName: 'Run ORT Training APIs Tests'
   condition: succeededOrFailed()
   timeoutInMinutes: 120
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index d200a2f666..52adb0a333 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -219,7 +219,7 @@ def add_common_dependencies(xml_text, package_name, version):
 
 
 def generate_dependencies(xml_text, package_name, version):
-    dml_dependency = '<dependency id="Microsoft.AI.DirectML" version="1.14.1"/>'
+    dml_dependency = '<dependency id="Microsoft.AI.DirectML" version="1.15.1"/>'
 
     if package_name == "Microsoft.AI.MachineLearning":
         xml_text.append("<dependencies>")