From 628c0e00c476645f74f40eaac6918115e5e15180 Mon Sep 17 00:00:00 2001
From: Jian Chen <cjian@microsoft.com>
Date: Tue, 21 Jan 2025 20:07:20 -0500
Subject: [PATCH 01/37] Change macOS-13 to Ubuntu for
 android-java-api-aar-test.yml. (#23444)

### Description
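Run the `Final_AAR_Testing_Android` job from `android-java-api-aar-test.yml`
on the `onnxruntime-Ubuntu2204-AMD-CPU` pool instead of the `macOS-13` hosted
image.

The template no longer takes a `job_name_suffix` parameter and no longer
declares a job-level `dependsOn`. Instead, each caller (`c-api-cpu.yml` and
`ondevice-training-cpu-packaging-pipeline.yml`) wraps the template in its own
testing stage that depends on the matching AAR packaging stage.
`NuGet_Packaging_Training_CPU` now depends on
`Final_AAR_Testing_Android_Training_Full` rather than on the packaging stage.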


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 .../templates/android-java-api-aar-test.yml       | 15 +++------------
 .../azure-pipelines/templates/c-api-cpu.yml       |  8 ++++++--
 .../ondevice-training-cpu-packaging-pipeline.yml  |  6 ++++--
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
index d44952690f..ede9ec1a08 100644
--- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml
@@ -4,11 +4,6 @@ parameters:
   type: string
   default: ''
 
-- name: job_name_suffix
-  displayName: job name
-  type: string
-  default: ''
-
 - name: packageName
   displayName: Package Name
   type: string
@@ -25,17 +20,13 @@ parameters:
   default: '2.30.0.250109'
 
 jobs:
-- job: Final_AAR_Testing_Android_${{ parameters.job_name_suffix }}
+- job: Final_AAR_Testing_Android
+  pool: 'onnxruntime-Ubuntu2204-AMD-CPU'
   workspace:
     clean: all
-  pool:
-    vmImage: 'macOS-13'
   variables:
-  - name: runCodesignValidationInjection
-    value: false
+    runCodesignValidationInjection: false
   timeoutInMinutes: 90
-  dependsOn:
-    - Android_Java_API_AAR_Packaging_${{ parameters.job_name_suffix }}
   steps:
   - template: set-version-number-variables-step.yml
 
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
index 98206bcb69..1ab4fd2a8e 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
@@ -82,10 +82,12 @@ stages:
       packageName: 'onnxruntime-android'
       ReleaseVersionSuffix: $(ReleaseVersionSuffix)
 
+- stage: Android_Java_API_AAR_Testing_Full
+  dependsOn: Android_Java_API_AAR_Packaging_Full
+  jobs:
   - template: android-java-api-aar-test.yml
     parameters:
       artifactName: 'onnxruntime-android-full-aar'
-      job_name_suffix: 'Full'
       ReleaseVersionSuffix: $(ReleaseVersionSuffix)
 
 - stage: Android_Java_API_AAR_Packaging_QNN
@@ -105,10 +107,12 @@ stages:
       ReleaseVersionSuffix: $(ReleaseVersionSuffix)
       QnnSDKVersion: ${{ parameters.QnnSDKVersion }}
 
+- stage: Final_AAR_Testing_Android_QNN
+  dependsOn: Android_Java_API_AAR_Packaging_QNN
+  jobs:
   - template: android-java-api-aar-test.yml
     parameters:
       artifactName: 'onnxruntime-android-qnn-aar'
-      job_name_suffix: 'QNN'
       packageName: 'onnxruntime-android-qnn'
       QnnSDKVersion: ${{ parameters.QnnSDKVersion }}
 
diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
index 8bbe8f8253..523f3ab58b 100644
--- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
@@ -102,10 +102,12 @@ stages:
       packageName: onnxruntime-training-android
       enable_code_sign: true
 
+- stage: Final_AAR_Testing_Android_Training_Full
+  dependsOn: Android_Java_API_AAR_Packaging_Training_Full
+  jobs:
   - template: android-java-api-aar-test.yml
     parameters:
       artifactName: 'onnxruntime-training-android-full-aar'
-      job_name_suffix: 'Training_Full'
       packageName: onnxruntime-training-android
 
 - stage: NuGet_Packaging_Training_CPU
@@ -115,7 +117,7 @@ stages:
   - Windows_Packaging_Training_CPU_x86_${{ parameters.BuildVariant }}
   - Windows_Packaging_Training_CPU_x64_${{ parameters.BuildVariant }}
   - Windows_Packaging_Training_CPU_arm64_${{ parameters.BuildVariant }}
-  - Android_Java_API_AAR_Packaging_Training_Full
+  - Final_AAR_Testing_Android_Training_Full
   condition: succeeded()
   jobs:
   - job: NuGet_Packaging_Training_CPU

From 368e2431945d338917a4379a773291809038688d Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Tue, 21 Jan 2025 17:17:47 -0800
Subject: [PATCH 02/37] Make ORT and Dawn use the same protobuf/abseil source
 code (#23447)

### Description
Make ORT and Dawn use the same protobuf/abseil source code
---
 cmake/external/abseil-cpp.cmake                | 8 +++++++-
 cmake/external/onnxruntime_external_deps.cmake | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/cmake/external/abseil-cpp.cmake b/cmake/external/abseil-cpp.cmake
index 7b6e2141ee..6b4404a124 100644
--- a/cmake/external/abseil-cpp.cmake
+++ b/cmake/external/abseil-cpp.cmake
@@ -32,7 +32,13 @@ FetchContent_Declare(
 
 onnxruntime_fetchcontent_makeavailable(abseil_cpp)
 FetchContent_GetProperties(abseil_cpp)
-set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR})
+if(abseil_cpp_SOURCE_DIR)
+  set(ABSEIL_SOURCE_DIR ${abseil_cpp_SOURCE_DIR})
+  if(onnxruntime_USE_WEBGPU)
+    set(DAWN_ABSEIL_DIR ${abseil_cpp_SOURCE_DIR})
+  endif()
+endif()
+
 # abseil_cpp_SOURCE_DIR is non-empty if we build it from source
 message(STATUS "Abseil source dir:" ${ABSEIL_SOURCE_DIR})
 # abseil_cpp_VERSION  is non-empty if we find a preinstalled ABSL
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index b8e90026b4..761ce47582 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -222,6 +222,11 @@ onnxruntime_fetchcontent_makeavailable(Protobuf)
 if(Protobuf_FOUND)
   message(STATUS "Protobuf version: ${Protobuf_VERSION}")
 else()
+  if(protobuf_SOURCE_DIR)
+    if(onnxruntime_USE_WEBGPU)
+      set(DAWN_PROTOBUF_DIR ${protobuf_SOURCE_DIR})
+    endif()
+  endif()
   # Adjust warning flags
   if (TARGET libprotoc)
     if (NOT MSVC)

From f4dc9655220b5c0c72743811561e0611efe22099 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 21 Jan 2025 17:18:39 -0800
Subject: [PATCH 03/37] Bump vite from 6.0.7 to 6.0.11 in
 /js/web/test/e2e/exports/testcases/vite-default (#23446)

Bumps [vite](https://github.com/vitejs/vite/tree/HEAD/packages/vite)
from 6.0.7 to 6.0.11.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/vitejs/vite/releases">vite's
releases</a>.</em></p>
<blockquote>
<h2>v6.0.11</h2>
<p>Please refer to <a
href="https://github.com/vitejs/vite/blob/v6.0.11/packages/vite/CHANGELOG.md">CHANGELOG.md</a>
for details.</p>
<h2>v6.0.10</h2>
<p>Please refer to <a
href="https://github.com/vitejs/vite/blob/v6.0.10/packages/vite/CHANGELOG.md">CHANGELOG.md</a>
for details.</p>
<h2>v6.0.9</h2>
<p>This version contains a breaking change due to security fixes. See <a
href="https://github.com/vitejs/vite/security/advisories/GHSA-vg6x-rcgg-rjx6">https://github.com/vitejs/vite/security/advisories/GHSA-vg6x-rcgg-rjx6</a>
for more details.</p>
<p>Please refer to <a
href="https://github.com/vitejs/vite/blob/v6.0.9/packages/vite/CHANGELOG.md">CHANGELOG.md</a>
for details.</p>
<h2>v6.0.8</h2>
<p>Please refer to <a
href="https://github.com/vitejs/vite/blob/v6.0.8/packages/vite/CHANGELOG.md">CHANGELOG.md</a>
for details.</p>
</blockquote>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/vitejs/vite/blob/main/packages/vite/CHANGELOG.md">vite's
changelog</a>.</em></p>
<blockquote>
<h2><!-- raw HTML omitted -->6.0.11 (2025-01-21)<!-- raw HTML omitted
--></h2>
<ul>
<li>fix: <code>preview.allowedHosts</code> with specific values was not
respected (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19246">#19246</a>)
(<a
href="https://github.com/vitejs/vite/commit/aeb3ec84a288d6be227a1284607f13428a4f14a1">aeb3ec8</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19246">#19246</a></li>
<li>fix: allow CORS from loopback addresses by default (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19249">#19249</a>)
(<a
href="https://github.com/vitejs/vite/commit/3d038997377a30022b6a6b7916e0b4b5d8b9a363">3d03899</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19249">#19249</a></li>
</ul>
<h2><!-- raw HTML omitted -->6.0.10 (2025-01-20)<!-- raw HTML omitted
--></h2>
<ul>
<li>fix: try parse <code>server.origin</code> URL (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19241">#19241</a>)
(<a
href="https://github.com/vitejs/vite/commit/2495022420fda05ee389c2dcf26921b21e2aed3b">2495022</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19241">#19241</a></li>
</ul>
<h2><!-- raw HTML omitted -->6.0.9 (2025-01-20)<!-- raw HTML omitted
--></h2>
<ul>
<li>fix!: check host header to prevent DNS rebinding attacks and
introduce <code>server.allowedHosts</code> (<a
href="https://github.com/vitejs/vite/commit/bd896fb5f312fc0ff1730166d1d142fc0d34ba6d">bd896fb</a>)</li>
<li>fix!: default <code>server.cors: false</code> to disallow fetching
from untrusted origins (<a
href="https://github.com/vitejs/vite/commit/b09572acc939351f4e4c50ddf793017a92c678b1">b09572a</a>)</li>
<li>fix: verify token for HMR WebSocket connection (<a
href="https://github.com/vitejs/vite/commit/029dcd6d77d3e3ef10bc38e9a0829784d9760fdb">029dcd6</a>)</li>
</ul>
<h2><!-- raw HTML omitted -->6.0.8 (2025-01-20)<!-- raw HTML omitted
--></h2>
<ul>
<li>fix: avoid SSR HMR for HTML files (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19193">#19193</a>)
(<a
href="https://github.com/vitejs/vite/commit/3bd55bcb7e831d2c4f66c90d7bbb3e1fbf7a02b6">3bd55bc</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19193">#19193</a></li>
<li>fix: build time display 7m 60s (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19108">#19108</a>)
(<a
href="https://github.com/vitejs/vite/commit/cf0d2c8e232a1af716c71cdd2218d180f7ecc02b">cf0d2c8</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19108">#19108</a></li>
<li>fix: don't resolve URL starting with double slash (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19059">#19059</a>)
(<a
href="https://github.com/vitejs/vite/commit/35942cde11fd8a68fa89bf25f7aa1ddb87d775b2">35942cd</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19059">#19059</a></li>
<li>fix: ensure <code>server.close()</code> only called once (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19204">#19204</a>)
(<a
href="https://github.com/vitejs/vite/commit/db81c2dada961f40c0882b5182adf2f34bb5c178">db81c2d</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19204">#19204</a></li>
<li>fix: resolve.conditions in ResolvedConfig was
<code>defaultServerConditions</code> (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19174">#19174</a>)
(<a
href="https://github.com/vitejs/vite/commit/ad75c56dce5618a3a416e18f9a5c3880d437a107">ad75c56</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19174">#19174</a></li>
<li>fix: tree shake stringified JSON imports (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19189">#19189</a>)
(<a
href="https://github.com/vitejs/vite/commit/f2aed62d0bf1b66e870ee6b4aab80cd1702793ab">f2aed62</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19189">#19189</a></li>
<li>fix: use shared sigterm callback (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19203">#19203</a>)
(<a
href="https://github.com/vitejs/vite/commit/47039f4643179be31a8d7c7fbff83c5c13deb787">47039f4</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19203">#19203</a></li>
<li>fix(deps): update all non-major dependencies (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19098">#19098</a>)
(<a
href="https://github.com/vitejs/vite/commit/8639538e6498d1109da583ad942c1472098b5919">8639538</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19098">#19098</a></li>
<li>fix(optimizer): use correct default install state path for yarn PnP
(<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19119">#19119</a>)
(<a
href="https://github.com/vitejs/vite/commit/e690d8bb1e5741e81df5b7a6a5c8c3c1c971fa41">e690d8b</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19119">#19119</a></li>
<li>fix(types): improve <code>ESBuildOptions.include / exclude</code>
type to allow <code>readonly (string | RegExp)[]</code> (<a
href="https://github.com/vitejs/vite/commit/ea53e7095297ea4192490fd58556414cc59a8975">ea53e70</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19146">#19146</a></li>
<li>chore(deps): update dependency pathe to v2 (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19139">#19139</a>)
(<a
href="https://github.com/vitejs/vite/commit/71506f0a8deda5254cb49c743cd439dfe42859ce">71506f0</a>),
closes <a
href="https://redirect.github.com/vitejs/vite/issues/19139">#19139</a></li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/vitejs/vite/commit/a0ed4057c90a1135aa58d06305f446e232f63e2a"><code>a0ed405</code></a>
release: v6.0.11</li>
<li><a
href="https://github.com/vitejs/vite/commit/3d038997377a30022b6a6b7916e0b4b5d8b9a363"><code>3d03899</code></a>
fix: allow CORS from loopback addresses by default (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19249">#19249</a>)</li>
<li><a
href="https://github.com/vitejs/vite/commit/aeb3ec84a288d6be227a1284607f13428a4f14a1"><code>aeb3ec8</code></a>
fix: <code>preview.allowedHosts</code> with specific values was not
respected (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19246">#19246</a>)</li>
<li><a
href="https://github.com/vitejs/vite/commit/9654348258eaa0883171533a2b74b4e2825f5fb6"><code>9654348</code></a>
release: v6.0.10</li>
<li><a
href="https://github.com/vitejs/vite/commit/2495022420fda05ee389c2dcf26921b21e2aed3b"><code>2495022</code></a>
fix: try parse <code>server.origin</code> URL (<a
href="https://github.com/vitejs/vite/tree/HEAD/packages/vite/issues/19241">#19241</a>)</li>
<li><a
href="https://github.com/vitejs/vite/commit/a55f8ba3e43108de340610d4d021dcd926be5876"><code>a55f8ba</code></a>
release: v6.0.9</li>
<li><a
href="https://github.com/vitejs/vite/commit/bd896fb5f312fc0ff1730166d1d142fc0d34ba6d"><code>bd896fb</code></a>
fix!: check host header to prevent DNS rebinding attacks and introduce
`serve...</li>
<li><a
href="https://github.com/vitejs/vite/commit/029dcd6d77d3e3ef10bc38e9a0829784d9760fdb"><code>029dcd6</code></a>
fix: verify token for HMR WebSocket connection</li>
<li><a
href="https://github.com/vitejs/vite/commit/b09572acc939351f4e4c50ddf793017a92c678b1"><code>b09572a</code></a>
fix!: default <code>server.cors: false</code> to disallow fetching from
untrusted origins</li>
<li><a
href="https://github.com/vitejs/vite/commit/c0f72a695c5308cba605e3db4f851f4f6692e50c"><code>c0f72a6</code></a>
release: v6.0.8</li>
<li>Additional commits viewable in <a
href="https://github.com/vitejs/vite/commits/v6.0.11/packages/vite">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=vite&package-manager=npm_and_yarn&previous-version=6.0.7&new-version=6.0.11)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)
You can disable automated security fix PRs for this repo from the
[Security Alerts
page](https://github.com/microsoft/onnxruntime/network/alerts).

</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .../e2e/exports/testcases/vite-default/package-lock.json  | 8 ++++----
 .../test/e2e/exports/testcases/vite-default/package.json  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json
index 96c19af947..891b40710f 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/package-lock.json
+++ b/js/web/test/e2e/exports/testcases/vite-default/package-lock.json
@@ -12,7 +12,7 @@
       },
       "devDependencies": {
         "@vitejs/plugin-vue": "^5.2.1",
-        "vite": "^6.0.5"
+        "vite": "^6.0.11"
       }
     },
     "node_modules/@babel/helper-string-parser": {
@@ -1069,9 +1069,9 @@
       }
     },
     "node_modules/vite": {
-      "version": "6.0.7",
-      "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.7.tgz",
-      "integrity": "sha512-RDt8r/7qx9940f8FcOIAH9PTViRrghKaK2K1jY3RaAURrEUbm9Du1mJ72G+jlhtG3WwodnfzY8ORQZbBavZEAQ==",
+      "version": "6.0.11",
+      "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.11.tgz",
+      "integrity": "sha512-4VL9mQPKoHy4+FE0NnRE/kbY51TOfaknxAjt3fJbGJxhIpBZiqVzlZDEesWWsuREXHwNdAoOFZ9MkPEVXczHwg==",
       "dev": true,
       "license": "MIT",
       "dependencies": {
diff --git a/js/web/test/e2e/exports/testcases/vite-default/package.json b/js/web/test/e2e/exports/testcases/vite-default/package.json
index 7a1f370885..9e204875a1 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/package.json
+++ b/js/web/test/e2e/exports/testcases/vite-default/package.json
@@ -13,6 +13,6 @@
   },
   "devDependencies": {
     "@vitejs/plugin-vue": "^5.2.1",
-    "vite": "^6.0.5"
+    "vite": "^6.0.11"
   }
 }

From 18a54284c8abc316416440593c13a7b378d1703c Mon Sep 17 00:00:00 2001
From: Wanming Lin <wanming.lin@intel.com>
Date: Wed, 22 Jan 2025 09:20:19 +0800
Subject: [PATCH 04/37] [WebNN] Remove workarounds for TFLite backend (#23406)

The WebNN CPU device type may now target different backends, such as
CoreML. The legacy workarounds specific to the TFLite backend are removed;
the affected cases are now allowed to fail as is, since they are backend
implementation issues.

Additionally, the WebNN EP should adhere to WebNN API conformance. We
assume that all WebNN ops are supported, so the per-device-type op support
status is removed from webnn-operators.md as well.
---
 js/web/docs/webnn-operators.md                | 208 +++++++++---------
 .../builders/impl/activation_op_builder.cc    |  28 ---
 .../webnn/builders/impl/binary_op_builder.cc  |  29 ---
 .../webnn/builders/impl/clip_op_builder.cc    |  22 +-
 .../webnn/builders/impl/conv_op_builder.cc    |  16 --
 5 files changed, 106 insertions(+), 197 deletions(-)

diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md
index 5ad2311ef8..a6a2ecdf6f 100644
--- a/js/web/docs/webnn-operators.md
+++ b/js/web/docs/webnn-operators.md
@@ -6,108 +6,110 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim
 
 (**Note**: ONNX Runtime only *guarantees* support for models stamped with opset version 7 or above for opset domain 'ai.onnx'.)
 
-[WebNN API](https://webmachinelearning.github.io/webnn) provides two device types `cpu` and `gpu` to leverage different on-device accelerators. WebNN API implementation in Chromium uses TFLite XNNPack delegate backend for `cpu` device type and DirectML backend for `gpu` device type. [The op support status](https://webmachinelearning.github.io/webnn-status/) behind these two backends is inconsistent.
+The [WebNN API](https://webmachinelearning.github.io/webnn) is available in the latest versions of Chrome and Edge on Windows,
+Linux, macOS, Android, and ChromeOS behind an <i>"Enables WebNN API"</i> flag. The operator support status may vary across these
+platforms. Check the [WebNN status](https://webmachinelearning.github.io/webnn-status/) for the latest implementation details.
 
 
-| Operator | Opset | WebNN API | WebNN CPU | WebNN GPU | Comments |
-|:------:|:------:|:------:|:-:|:-:|:------|
-| Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | |
-| Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | |
-| And | ai.onnx(7+) | logicalAnd | ✗ | ✓ | |
-| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | |
-| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | |
-| AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 |
-| BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✓ | ✓ | Only supports 'training_mode' value is 0, one output |
-| Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✓ | ✓ | WebNN CPU backend doesn't support casting to uint64 data type |
-| Ceil | ai.onnx(7-12, 13+) | ceil | ✓ | ✓ | |
-| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0] (Chromium issue: https://issues.chromium.org/issues/326156496) |
-| Concat | ai.onnx(7-10, 11-12, 13+) | concat | ✓ | ✓ | |
-| Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight) |
-| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). WebNN CPU backend only supports default dilations and group |
-| Cos | ai.onnx(7+) | cos | ✓ | ✓ | |
-| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | ✓ | ✓ | 'axis' input should be a constant |
-| Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | |
-| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input |
-| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode |
-| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | |
-| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 |
-| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✓ | ✓ | |
-| Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✓ | ✓ | |
-| Exp | ai.onnx(7-12, 13+) | exp | ✓ | ✓ | |
-| Expand | ai.onnx(8-12, 13+) | expand | ✓ | ✓ | 'shape' input should be a constant |
-| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | |
-| Floor | ai.onnx(7-12, 13+) | floor | ✓ | ✓ | |
-| Gather | ai.onnx(7-10, 11-12, 13+) | gather | ✓ | ✓ | |
-| GatherElements | ai.onnx(11-12, 13+) | gatherElements | ✗ | ✓ | |
-| GatherND | ai.onnx(11, 12, 13+) | gatherND | ✓ | ✓ | Only supports 'batch_dims' == 0 |
-| Gelu | ai.onnx(20+) | gelu | ✓ | ✓ | |
-| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | ✓ | ✓ | Only supports 1-D 'C' input |
-| GlobalAveragePool | ai.onnx(7+) | averagePool2d | ✓ | ✓ | Only supports 4-D input |
-| GlobalMaxPool | ai.onnx(7+) | maxPool2d | ✓ | ✓ | Only supports 4-D input |
-| GlobalLpPool| ai.onnx(7+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 'p' value is 2 |
-| Greater | ai.onnx(7-8, 9-12, 13+) | greater | ✓ | ✓ | |
-| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | ✓ | ✓ | |
-| GRU | ai.onnx(7-13, 14-21, 22+) | gru | ✓ | ✓ | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
-| HardSigmoid | ai.onnx(7+) | hardSigmoid | ✓ | ✓ | |
-| HardSwish | ai.onnx(14+) | hardSwish | ✓ | ✓ | |
-| Identity | ai.onnx(7-13, 14-15, 16-18, 19-20, 21+) | identity | ✓ | ✓ | |
-| InstanceNormalization | ai.onnx(7+) | instanceNormalization | ✓ | ✓ | |
-| LayerNormalization | ai.onnx(7-16, 17+) | layerNormalization | ✓ | ✓ | |
-| LeakyRelu | ai.onnx(7-15, 16+) | leakyRelu | ✓ | ✓ | |
-| Less | ai.onnx(7-8, 9-12, 13+) | lesser | ✓ | ✓ | |
-| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✓ | ✓ | |
-| Log | ai.onnx(7-12, 13+) | log | ✓ | ✓ | |
-| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 |
-| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | ✓ | ✓ | |
-| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | ✓ | ✓ | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
-| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | |
-| Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | |
-| MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output |
-| Min | ai.onnx(7, 8-11, 12, 13+) | min | ✓ | ✓ | |
-| Mul | ai.onnx(7-12, 13, 14+) | mul | ✓ | ✓ | |
-| Neg | ai.onnx(7-12, 13+) | neg | ✓ | ✓ | |
-| Not | ai.onnx(7+) | logicalNot | ✓ | ✓ | |
-| Or | ai.onnx(7+) | logicalOr | ✗ | ✓ | |
-| Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | ✓ | ✓ | modes == 'wrap' is not supported |
-| Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | ✓ | ✓ | |
-| PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | ✓ | ✓ | WebNN CPU backend restricts the last dimension of input and slope to be same (Chromium issue: https://issues.chromium.org/issues/335517470) |
-| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | ✓ | ✓ | The shape of x_scale should be a subsample of the shape of input |
-| Reciprocal | ai.onnx(7-12, 13+) | reciprocal | ✓ | ✓ | |
-| ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceLogSum| ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSum| ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceLogSumExp | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSumExp | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceMax | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMax | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceMean | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceMean | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceMin | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMin | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceProd | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceProduct | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceSum | ai.onnx(7-10, 11-12, 13+) | reduceSum | ✓ | ✓ | Input 'axes' if present should be a constant |
-| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✓ | ✓ | Input 'axes' if present should be a constant |
-| Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | |
-| Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported |
-| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
-| RotaryEmbedding | com.microsoft(1+) | add, concat, gather, mul, reshape, split | ✓ | ✓ | |
-| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | ✗ | ✓ | Only supports 'reduction' == 'none' |
-| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | ✗ | ✓ | Only supports 'reduction' == 'none' |
-| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | |
-| SimplifiedLayerNormalization | ai.onnx(1+) | pow, reduceMean, add, sqrt, div, mul | ✓ | ✓ | |
-| Sigmoid | ai.onnx(7-12, 13+) | sigmoid | ✓ | ✓ | |
-| Sign | ai.onnx(9-12, 13+) | sign | ✓ | ✓ | |
-| SkipSimplifiedLayerNormalization | com.microsoft(1+) | pow, reduceMean, add, sqrt, div, mul | ✓ | ✓ | |
-| Softplus | ai.onnx(7+) | softplus | ✓ | ✓ | |
-| Softsign | ai.onnx(7+) | softsign | ✓ | ✓ | |
-| Sin | ai.onnx(7+) | sin | ✓ | ✓ | |
-| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant |
-| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | |
-| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant |
-| Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | |
-| Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | Input 'axes' if present should be a constant |
-| Sub | ai.onnx(7-12, 13, 14+) | sub | ✓ | ✓ | |
-| Tan | ai.onnx(7+) | tan | ✓ | ✓ | |
-| Tanh | ai.onnx(7-12, 13+) | tanh | ✓ | ✓ | |
-| Tile | ai.onnx(7-12, 13+) | tile | ✗ | ✓ | Input 'repeats' should be a constant |
-| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose | ✓ | ✓ | |
-| Trilu | ai.onnx(14+) | triangular | ✓ | ✓ | Input 'k' (option 'diagonal' for WebNN) if present should be a constant |
-| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | |
-| Where | ai.onnx(7-8, 9-15, 16+) | where | ✓ | ✓ | |
-| Xor | ai.onnx(7+) | logicalXor | ✗ | ✓ | |
+| Operator | Opset | WebNN API | Comments |
+|:------:|:------:|:------:|:------|
+| Abs | ai.onnx(7-12, 13+) | abs | |
+| Add | ai.onnx(7-12, 13, 14+) | add | |
+| And | ai.onnx(7+) | logicalAnd | |
+| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | |
+| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | |
+| AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 |
+| BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | Only supports 'training_mode' value is 0, one output |
+| Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | |
+| Ceil | ai.onnx(7-12, 13+) | ceil | |
+| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | |
+| Concat | ai.onnx(7-10, 11-12, 13+) | concat | |
+| Conv | ai.onnx(7-10, 11+) | conv2d | Only supports 3-D or 4-D input and 'W' (weight) |
+| ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | Only supports 3-D or 4-D input and 'W' (weight) |
+| Cos | ai.onnx(7+) | cos | |
+| CumSum | ai.onnx(11-13, 14+) | cumulativeSum | 'axis' input should be a constant |
+| Div | ai.onnx(7-12, 13, 14+) | div | |
+| DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | The shape of x_scale should be a subsample of the shape of input |
+| Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | Only supports test mode |
+| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | |
+| Elu | ai.onnx(7+) | elu | |
+| Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | |
+| Erf | ai.onnx(7-9, 10-12, 13+) | erf | |
+| Exp | ai.onnx(7-12, 13+) | exp | |
+| Expand | ai.onnx(8-12, 13+) | expand | 'shape' input should be a constant |
+| Flatten | ai.onnx(7-8, 9-10, 11-12, 13-20, 21+) | reshape | |
+| Floor | ai.onnx(7-12, 13+) | floor | |
+| Gather | ai.onnx(7-10, 11-12, 13+) | gather | |
+| GatherElements | ai.onnx(11-12, 13+) | gatherElements | |
+| GatherND | ai.onnx(11, 12, 13+) | gatherND | Only supports 'batch_dims' == 0 |
+| Gelu | ai.onnx(20+) | gelu | |
+| Gemm | ai.onnx(7-8, 9-10, 11-12, 13+) | gemm | Only supports 1-D 'C' input |
+| GlobalAveragePool | ai.onnx(7+) | averagePool2d | Only supports 4-D input |
+| GlobalMaxPool | ai.onnx(7+) | maxPool2d | Only supports 4-D input |
+| GlobalLpPool| ai.onnx(7+) | l2Pool2d | Only supports 4-D input, 'p' value is 2 |
+| Greater | ai.onnx(7-8, 9-12, 13+) | greater | |
+| GreaterOrEqual | ai.onnx(12-15, 16+) | greaterOrEqual | |
+| GRU | ai.onnx(7-13, 14-21, 22+) | gru | Only supports 'layout' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
+| HardSigmoid | ai.onnx(7+) | hardSigmoid | |
+| HardSwish | ai.onnx(14+) | hardSwish | |
+| Identity | ai.onnx(7-13, 14-15, 16-18, 19-20, 21+) | identity | |
+| InstanceNormalization | ai.onnx(7+) | instanceNormalization | |
+| LayerNormalization | ai.onnx(7-16, 17+) | layerNormalization | |
+| LeakyRelu | ai.onnx(7-15, 16+) | leakyRelu | |
+| Less | ai.onnx(7-8, 9-12, 13+) | lesser | |
+| LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | |
+| Log | ai.onnx(7-12, 13+) | log | |
+| LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 |
+| LRN | ai.onnx(7-12, 13+) | pad, averagePool2d, transpose, add, mul, pow, div | |
+| LSTM | ai.onnx(7-13, 14-21, 22+) | lstm | Only supports 'layout' == 0, 'input_forget' == 0. 'clip' is not supported. The activation functions in 'activations' must be one of 'Relu', 'Tanh', 'Sigmoid'. Forward and backward activations must be the same if bidirectional. 'sequence_lens' if present should be constant with values equal to the first dimension length of input 'X' |
+| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | |
+| Max | ai.onnx(7, 8-11, 12, 13+) | max | |
+| MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output |
+| Min | ai.onnx(7, 8-11, 12, 13+) | min | |
+| Mul | ai.onnx(7-12, 13, 14+) | mul | |
+| Neg | ai.onnx(7-12, 13+) | neg | |
+| Not | ai.onnx(7+) | logicalNot | |
+| Or | ai.onnx(7+) | logicalOr | |
+| Pad | ai.onnx(7-10, 11-12, 13-17, 18, 19-20, 21+) | pad | modes == 'wrap' is not supported |
+| Pow | ai.onnx(7-11, 12, 13-14, 15+) | pow | |
+| PRelu | ai.onnx(7-8, 9-15, 16+) | prelu | |
+| QuantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | quantizeLinear | The shape of x_scale should be a subsample of the shape of input |
+| Reciprocal | ai.onnx(7-12, 13+) | reciprocal | |
+| ReduceL1 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL1 | Input 'axes' if present should be a constant |
+| ReduceL2 | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceL2 | Input 'axes' if present should be a constant |
+| ReduceLogSum| ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSum | Input 'axes' if present should be a constant |
+| ReduceLogSumExp | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceLogSumExp | Input 'axes' if present should be a constant |
+| ReduceMax | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMax | Input 'axes' if present should be a constant |
+| ReduceMean | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceMean | Input 'axes' if present should be a constant |
+| ReduceMin | ai.onnx(7-10, 11, 12, 13-17, 18-19, 20+) | reduceMin | Input 'axes' if present should be a constant |
+| ReduceProd | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceProduct | Input 'axes' if present should be a constant |
+| ReduceSum | ai.onnx(7-10, 11-12, 13+) | reduceSum | Input 'axes' if present should be a constant |
+| ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | Input 'axes' if present should be a constant |
+| Relu | ai.onnx(7-12, 13, 14+) | relu | |
+| Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported |
+| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | Only supports 4-D input, antialias == 0, exclude_outside == 0, keep_aspect_ratio_policy == 'stretch', 'linear' and 'nearest' modes, input 'scales' and 'sizes' if present must be a constant |
+| RotaryEmbedding | com.microsoft(1+) | add, concat, gather, mul, reshape, split | |
+| ScatterElements | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterElements | Only supports 'reduction' == 'none' |
+| ScatterND | ai.onnx(11-12, 13-15, 16-17, 18+) | scatterND | Only supports 'reduction' == 'none' |
+| Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | |
+| SimplifiedLayerNormalization | ai.onnx(1+) | pow, reduceMean, add, sqrt, div, mul | |
+| Sigmoid | ai.onnx(7-12, 13+) | sigmoid | |
+| Sign | ai.onnx(9-12, 13+) | sign | |
+| SkipSimplifiedLayerNormalization | com.microsoft(1+) | pow, reduceMean, add, sqrt, div, mul | |
+| Softplus | ai.onnx(7+) | softplus | |
+| Softsign | ai.onnx(7+) | softsign | |
+| Sin | ai.onnx(7+) | sin | |
+| Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice, reverse | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant |
+| Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | |
+| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | Input 'split' if present should be a constant |
+| Sqrt | ai.onnx(7-12, 13+) | sqrt | |
+| Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | Input 'axes' if present should be a constant |
+| Sub | ai.onnx(7-12, 13, 14+) | sub | |
+| Tan | ai.onnx(7+) | tan | |
+| Tanh | ai.onnx(7-12, 13+) | tanh | |
+| Tile | ai.onnx(7-12, 13+) | tile | Input 'repeats' should be a constant |
+| Transpose | ai.onnx(7-12, 13-20, 21+) | transpose | |
+| Trilu | ai.onnx(14+) | triangular | Input 'k' (option 'diagonal' for WebNN) if present should be a constant |
+| Unsqueeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | |
+| Where | ai.onnx(7-8, 9-15, 16+) | where | |
+| Xor | ai.onnx(7+) | logicalXor | |
diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc
index 781ddcb896..585fddfd1f 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc
@@ -17,10 +17,6 @@ class ActivationOpBuilder : public BaseOpBuilder {
  private:
   Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node,
                                const logging::Logger& logger) const override ORT_MUST_USE_RESULT;
-
-  // Operator support related.
-  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
-                         WebnnDeviceType device_type, const logging::Logger& logger) const override;
 };
 
 // Add operator related.
@@ -68,30 +64,6 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder,
   return Status::OK();
 }
 
-// Operator support related.
-bool ActivationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */,
-                                            const Node& node,
-                                            WebnnDeviceType device_type,
-                                            const logging::Logger& logger) const {
-  const auto& input_defs = node.InputDefs();
-  const auto& op_type = node.OpType();
-
-  std::vector<int64_t> input_shape;
-  if (!GetShape(*input_defs[0], input_shape, logger))
-    return false;
-
-  if (op_type == "Elu" && device_type == WebnnDeviceType::CPU) {
-    NodeAttrHelper helper(node);
-    float alpha = helper.Get("alpha", 1.0f);
-    if (alpha != 1.0f) {
-      LOGS(logger, VERBOSE) << "WebNN CPU backend only supports Elu's alpha == 1.0";
-      return false;
-    }
-  }
-
-  return true;
-}
-
 void CreateActivationOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
   if (op_registrations.op_builder_map.count(op_type) > 0)
     return;
diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc
index e14507e8f5..c5493f97fd 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc
@@ -20,8 +20,6 @@ class BinaryOpBuilder : public BaseOpBuilder {
                                const logging::Logger& logger) const override ORT_MUST_USE_RESULT;
 
   // Operator support related.
-  bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node,
-                         const WebnnDeviceType device_type, const logging::Logger& logger) const override;
   bool HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node,
                               const emscripten::val& wnn_limits, const logging::Logger& logger) const override;
 };
@@ -59,33 +57,6 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const
   return Status::OK();
 }
 
-bool BinaryOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers,
-                                        const Node& node,
-                                        const WebnnDeviceType device_type,
-                                        const logging::Logger& logger) const {
-  const auto& input_defs = node.InputDefs();
-  const auto& op_type = node.OpType();
-
-  std::vector<int64_t> input0_shape;
-  std::vector<int64_t> input1_shape;
-  if (!GetShape(*input_defs[0], input0_shape, logger) ||
-      !GetShape(*input_defs[1], input1_shape, logger)) {
-    return false;
-  }
-
-  // 'prelu' op in WebNN CPU backend restricts the last dimension of input and slope to be same.
-  // TODO: Remove this workaround once the associated issue is resolved in Chromium:
-  // https://issues.chromium.org/issues/335517470.
-  if (op_type == "PRelu" && device_type == WebnnDeviceType::CPU) {
-    if (input0_shape.back() != input1_shape.back()) {
-      LOGS(logger, VERBOSE) << "The last dimension of input and slope for PRelu must be same for WebNN CPU backend.";
-      return false;
-    }
-  }
-
-  return true;
-}
-
 bool BinaryOpBuilder::HasSupportedInputsImpl(const InitializedTensorSet& /* initializers */, const Node& node,
                                              const emscripten::val& wnn_limits, const logging::Logger& logger) const {
   const auto& input_defs = node.InputDefs();
diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc
index 374143c886..a244efdd9b 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc
@@ -69,27 +69,7 @@ bool ClipOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers,
   // can ensure initializers are constant. See #19401 for details of how this update was made to the NNAPI EP.
   // GetClipMinMax(graph_viewer, node, minValue, maxValue, logger)
   float min, max;
-  if (GetClipMinMax(initializers, node, min, max, logger)) {
-    // WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0].
-    // TODO: Remove this workaround once the associated issue is resolved in Chromium:
-    // https://issues.chromium.org/issues/326156496.
-    if (device_type == WebnnDeviceType::CPU) {
-      if ((min == 0.0f && max == std::numeric_limits<float>::infinity()) ||
-          (min == -1.0f && max == 1.0f) ||
-          (min == 0.0f && max == 6.0f)) {
-        return true;
-      } else {
-        LOGS(logger, VERBOSE) << "Clip min and max values ("
-                              << min << ", "
-                              << max << ") are not supported for WebNN CPU backend";
-        return false;
-      }
-    }
-
-    return true;
-  } else {
-    return false;
-  };
+  return GetClipMinMax(initializers, node, min, max, logger);
 }
 
 void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
index 548e718b87..e623590e3b 100644
--- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc
@@ -378,22 +378,6 @@ bool ConvOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers,
     return false;
   }
 
-  // WebNN CPU backend (TFLite) only supports default dilations and group.
-  // https://source.chromium.org/chromium/chromium/src/+/main:services/webnn/tflite/graph_builder_tflite.cc;l=1040
-  if (device_type == WebnnDeviceType::CPU && op_type == "ConvTranspose") {
-    NodeAttrHelper helper(node);
-    const auto dilations = helper.Get("dilations", std::vector<int64_t>{1, 1});
-    const auto group = helper.Get("group", 1);
-    if (dilations[0] != 1 || (dilations.size() > 1 && dilations[1] != 1)) {
-      LOGS(logger, VERBOSE) << op_type << " for WebNN CPU backend only supports default dilation 1.";
-      return false;
-    }
-    if (group != 1) {
-      LOGS(logger, VERBOSE) << op_type << " for WebNN CPU backend only supports default group 1.";
-      return false;
-    }
-  }
-
   return true;
 }
 

From 87582ba9b7729a8f84977550f65d2baa653735d4 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 21 Jan 2025 17:21:21 -0800
Subject: [PATCH 05/37] Bump ruff from 0.9.1 to 0.9.2 (#23427)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bumps [ruff](https://github.com/astral-sh/ruff) from 0.9.1 to 0.9.2.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/astral-sh/ruff/releases">ruff's
releases</a>.</em></p>
<blockquote>
<h2>0.9.2</h2>
<h2>Release Notes</h2>
<h3>Preview features</h3>
<ul>
<li>[<code>airflow</code>] Fix typo &quot;security_managr&quot; to
&quot;security_manager&quot; (<code>AIR303</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15463">#15463</a>)</li>
<li>[<code>airflow</code>] extend and fix AIR302 rules (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15525">#15525</a>)</li>
<li>[<code>fastapi</code>] Handle parameters with <code>Depends</code>
correctly (<code>FAST003</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15364">#15364</a>)</li>
<li>[<code>flake8-pytest-style</code>] Implement pytest.warns
diagnostics (<code>PT029</code>, <code>PT030</code>, <code>PT031</code>)
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15444">#15444</a>)</li>
<li>[<code>flake8-pytest-style</code>] Test function parameters with
default arguments (<code>PT028</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15449">#15449</a>)</li>
<li>[<code>flake8-type-checking</code>] Avoid false positives for
<code>|</code> in <code>TC008</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15201">#15201</a>)</li>
</ul>
<h3>Rule changes</h3>
<ul>
<li>[<code>flake8-todos</code>] Allow VSCode GitHub PR extension style
links in <code>missing-todo-link</code> (<code>TD003</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15519">#15519</a>)</li>
<li>[<code>pyflakes</code>] Show syntax error message for
<code>F722</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15523">#15523</a>)</li>
</ul>
<h3>Formatter</h3>
<ul>
<li>Fix curly bracket spacing around f-string expressions containing
curly braces (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15471">#15471</a>)</li>
<li>Fix joining of f-strings with different quotes when using quote
style <code>Preserve</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15524">#15524</a>)</li>
</ul>
<h3>Server</h3>
<ul>
<li>Avoid indexing the same workspace multiple times (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15495">#15495</a>)</li>
<li>Display context for <code>ruff.configuration</code> errors (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15452">#15452</a>)</li>
</ul>
<h3>Configuration</h3>
<ul>
<li>Remove <code>flatten</code> to improve deserialization error
messages (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15414">#15414</a>)</li>
</ul>
<h3>Bug fixes</h3>
<ul>
<li>Parse triple-quoted string annotations as if parenthesized (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15387">#15387</a>)</li>
<li>[<code>fastapi</code>] Update <code>Annotated</code> fixes
(<code>FAST002</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15462">#15462</a>)</li>
<li>[<code>flake8-bandit</code>] Check for <code>builtins</code> instead
of <code>builtin</code> (<code>S102</code>, <code>PTH123</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15443">#15443</a>)</li>
<li>[<code>flake8-pathlib</code>] Fix <code>--select</code> for
<code>os-path-dirname</code> (<code>PTH120</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15446">#15446</a>)</li>
<li>[<code>ruff</code>] Fix false positive on global keyword
(<code>RUF052</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15235">#15235</a>)</li>
</ul>
<h2>Contributors</h2>
<ul>
<li><a
href="https://github.com/AlexWaygood"><code>@​AlexWaygood</code></a></li>
<li><a
href="https://github.com/BurntSushi"><code>@​BurntSushi</code></a></li>
<li><a
href="https://github.com/Daverball"><code>@​Daverball</code></a></li>
<li><a
href="https://github.com/Garrett-R"><code>@​Garrett-R</code></a></li>
<li><a
href="https://github.com/Glyphack"><code>@​Glyphack</code></a></li>
<li><a
href="https://github.com/InSyncWithFoo"><code>@​InSyncWithFoo</code></a></li>
<li><a href="https://github.com/Lee-W"><code>@​Lee-W</code></a></li>
<li><a
href="https://github.com/MichaReiser"><code>@​MichaReiser</code></a></li>
<li><a
href="https://github.com/cake-monotone"><code>@​cake-monotone</code></a></li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md">ruff's
changelog</a>.</em></p>
<blockquote>
<h2>0.9.2</h2>
<h3>Preview features</h3>
<ul>
<li>[<code>airflow</code>] Fix typo &quot;security_managr&quot; to
&quot;security_manager&quot; (<code>AIR303</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15463">#15463</a>)</li>
<li>[<code>airflow</code>] extend and fix AIR302 rules (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15525">#15525</a>)</li>
<li>[<code>fastapi</code>] Handle parameters with <code>Depends</code>
correctly (<code>FAST003</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15364">#15364</a>)</li>
<li>[<code>flake8-pytest-style</code>] Implement pytest.warns
diagnostics (<code>PT029</code>, <code>PT030</code>, <code>PT031</code>)
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15444">#15444</a>)</li>
<li>[<code>flake8-pytest-style</code>] Test function parameters with
default arguments (<code>PT028</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15449">#15449</a>)</li>
<li>[<code>flake8-type-checking</code>] Avoid false positives for
<code>|</code> in <code>TC008</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15201">#15201</a>)</li>
</ul>
<h3>Rule changes</h3>
<ul>
<li>[<code>flake8-todos</code>] Allow VSCode GitHub PR extension style
links in <code>missing-todo-link</code> (<code>TD003</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15519">#15519</a>)</li>
<li>[<code>pyflakes</code>] Show syntax error message for
<code>F722</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15523">#15523</a>)</li>
</ul>
<h3>Formatter</h3>
<ul>
<li>Fix curly bracket spacing around f-string expressions containing
curly braces (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15471">#15471</a>)</li>
<li>Fix joining of f-strings with different quotes when using quote
style <code>Preserve</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15524">#15524</a>)</li>
</ul>
<h3>Server</h3>
<ul>
<li>Avoid indexing the same workspace multiple times (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15495">#15495</a>)</li>
<li>Display context for <code>ruff.configuration</code> errors (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15452">#15452</a>)</li>
</ul>
<h3>Configuration</h3>
<ul>
<li>Remove <code>flatten</code> to improve deserialization error
messages (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15414">#15414</a>)</li>
</ul>
<h3>Bug fixes</h3>
<ul>
<li>Parse triple-quoted string annotations as if parenthesized (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15387">#15387</a>)</li>
<li>[<code>fastapi</code>] Update <code>Annotated</code> fixes
(<code>FAST002</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15462">#15462</a>)</li>
<li>[<code>flake8-bandit</code>] Check for <code>builtins</code> instead
of <code>builtin</code> (<code>S102</code>, <code>PTH123</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15443">#15443</a>)</li>
<li>[<code>flake8-pathlib</code>] Fix <code>--select</code> for
<code>os-path-dirname</code> (<code>PTH120</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15446">#15446</a>)</li>
<li>[<code>ruff</code>] Fix false positive on global keyword
(<code>RUF052</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15235">#15235</a>)</li>
</ul>
</blockquote>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/astral-sh/ruff/commit/0a393483811e0999578b5655d82e2c03238296f3"><code>0a39348</code></a>
Include build binaries</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/027f8009e557e9b5c21b812c9b70874bf02b590b"><code>027f800</code></a>
Comment out non-npm-publish jobs</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/425870df7666ef1c8e7d033cbc3195f43a54213e"><code>425870d</code></a>
Upload npm publish logs when failed</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/c20255abe4013866173ba0515ad9a3190bdfac51"><code>c20255a</code></a>
Bump version to 0.9.2 (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15529">#15529</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/420365811f27d597ea33a62270667ce9cee1bb5f"><code>4203658</code></a>
Fix joining of f-strings with different quotes when using quote style
`Preser...</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/fc9dd63d64ebc18cdca2e9648264704da43b902e"><code>fc9dd63</code></a>
[airflow] extend and fix AIR302 rules (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15525">#15525</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/79e52c7fdf90597d933aea771a9cde0ad510bba6"><code>79e52c7</code></a>
[<code>pyflakes</code>] Show syntax error message for <code>F722</code>
(<a
href="https://redirect.github.com/astral-sh/ruff/issues/15523">#15523</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/cf4ab7cba16b25f42d9d6b2464e22eb57df0fa8c"><code>cf4ab7c</code></a>
Parse triple quoted string annotations as if parenthesized (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15387">#15387</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/d2656e88a3c17ca3351cd5069642253ac22490f5"><code>d2656e8</code></a>
[<code>flake8-todos</code>] Allow VSCode GitHub PR extension style links
in `missing-tod...</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/c53ee608a1df4e471f0089e4f5d2881291e085be"><code>c53ee60</code></a>
Typeshed-sync workflow: add appropriate labels, link directly to failing
run ...</li>
<li>Additional commits viewable in <a
href="https://github.com/astral-sh/ruff/compare/0.9.1...0.9.2">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ruff&package-manager=pip&previous-version=0.9.1&new-version=0.9.2)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-lintrunner.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt
index 406d0b7f19..4aef423f4b 100644
--- a/requirements-lintrunner.txt
+++ b/requirements-lintrunner.txt
@@ -3,6 +3,6 @@
 lintrunner==0.12.5
 lintrunner-adapters==0.12.4
 # RUFF
-ruff==0.9.1
+ruff==0.9.2
 # CLANGFORMAT
 clang-format==19.1.7

From c9614fbf9041c2486f9883030fa81aac3a09e8d5 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Tue, 21 Jan 2025 17:37:08 -0800
Subject: [PATCH 06/37] Suppress some strict-aliasing related warnings in
 WebGPU EP (#23454)

### Description
Suppress some strict-aliasing related warnings in WebGPU EP

For example:
```
/home/chasun/src/onnxruntime/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc:208:30: error: dereferencing type-punned pointer will break strict-aliasing rules [-Werror=strict-aliasing]
  208 |       float encoded_value = *reinterpret_cast<const float*>(attr);
      |                              ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```

This PR does not fix the underlying problems; it only suppresses the
warnings so that the build passes. Some of the strict-aliasing issues
could be fixed properly with std::bit_cast, but that requires C++20.


### Motivation and Context
Building the code on Azure Linux 3 fails. To reproduce the issue, get
an Azure Linux 3 machine and run:
```
 python3 tools/ci_build/build.py --update --build  --build_wheel --use_xnnpack --build_nodejs   --use_webgpu --build_dir b --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --config Release
```
---
 onnxruntime/core/providers/webgpu/generator/range.cc     | 9 +++++++++
 .../core/providers/webgpu/math/unary_elementwise_ops.cc  | 7 +++++++
 onnxruntime/core/providers/webgpu/program.h              | 9 +++++++++
 onnxruntime/core/providers/webgpu/shader_variable.h      | 8 ++++++++
 onnxruntime/core/providers/webgpu/webgpu_context.cc      | 8 ++++++++
 5 files changed, 41 insertions(+)

diff --git a/onnxruntime/core/providers/webgpu/generator/range.cc b/onnxruntime/core/providers/webgpu/generator/range.cc
index ee7c67ec24..a0b65f08a5 100644
--- a/onnxruntime/core/providers/webgpu/generator/range.cc
+++ b/onnxruntime/core/providers/webgpu/generator/range.cc
@@ -25,6 +25,11 @@ Status Range<T>::ComputeInternal(ComputeContext& context) const {
 
   uint32_t output_size = gsl::narrow<uint32_t>(n);
   RangeProgram program{};
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
   program.AddOutput({output_tensor, ProgramTensorMetadataDependency::Type})
       .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
       .AddUniformVariables({
@@ -33,6 +38,10 @@ Status Range<T>::ComputeInternal(ComputeContext& context) const {
           *reinterpret_cast<uint32_t*>(&delta),
       });
 
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
   return context.RunProgram(program);
 }
 
diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
index 8dcf636710..eaaad206eb 100644
--- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc
@@ -194,6 +194,10 @@ class Clip final : public UnaryElementwise {
                          "Clip",
                          std::is_same_v<T, MLFloat16> ? ClipF16Impl : ClipImpl,
                          "", ShaderUsage::UseElementTypeAlias} {}
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
 
   Status ConfigureProgram(const ComputeContext& context, UnaryElementwiseProgram& program) const override {
     const auto* clip_min_tensor = context.Input<Tensor>(1);
@@ -214,6 +218,9 @@ class Clip final : public UnaryElementwise {
     }
     return Status::OK();
   }
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 
   // uniforms.attr is a f32 value. It is encoded as a float for 2 f16 values.
   // bitcast<vec2<f16>>(uniforms.attr)[0] is clip_min, bitcast<vec2<f16>>(uniforms.attr)[1] is clip_max
diff --git a/onnxruntime/core/providers/webgpu/program.h b/onnxruntime/core/providers/webgpu/program.h
index 1562ec158b..7bfd9e8800 100644
--- a/onnxruntime/core/providers/webgpu/program.h
+++ b/onnxruntime/core/providers/webgpu/program.h
@@ -150,6 +150,11 @@ enum class ProgramTensorMetadataDependency : int {
 };
 std::ostream& operator<<(std::ostream& os, ProgramTensorMetadataDependency);
 
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
 inline ProgramTensorMetadataDependency operator|(ProgramTensorMetadataDependency a, ProgramTensorMetadataDependency b) {
   return (ProgramTensorMetadataDependency)((int&)a | (int&)b);
 }
@@ -163,6 +168,10 @@ inline ProgramTensorMetadataDependency& operator&=(ProgramTensorMetadataDependen
   return (ProgramTensorMetadataDependency&)((int&)a &= (int&)b);
 }
 
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
 constexpr SafeInt<uint32_t> WORKGROUP_SIZE = 64;
 
 // data type of variable
diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h
index 4c87bc9158..2aba2a59d1 100644
--- a/onnxruntime/core/providers/webgpu/shader_variable.h
+++ b/onnxruntime/core/providers/webgpu/shader_variable.h
@@ -189,6 +189,10 @@ class ShaderVariableHelper : public ShaderIndicesHelper {
 
   friend class ShaderHelper;
 };
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
 
 inline ShaderUsage operator|(ShaderUsage a, ShaderUsage b) {
   return (uint32_t)a.usage | (uint32_t)b.usage;
@@ -205,6 +209,10 @@ inline ShaderUsage& operator&=(ShaderUsage& a, ShaderUsage b) {
   return a;
 }
 
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+
 namespace detail {
 template <typename T, typename = std::enable_if_t<std::is_integral_v<T>>>
 std::string pass_as_string(T&& v) {
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index 1c9a16bf36..f7d9420701 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -4,12 +4,20 @@
 #include <memory>
 #include <cmath>
 
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif
+
 #if !defined(__wasm__)
 #include "dawn/dawn_proc.h"
 #if !defined(USE_EXTERNAL_DAWN)
 #include "dawn/native/DawnNative.h"
 #endif
 #endif
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 
 #include "core/common/common.h"
 #include "core/common/path_string.h"

From ff8465eda45a5ab1259fc5f1091fd38a33b596c4 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Tue, 21 Jan 2025 20:25:12 -0800
Subject: [PATCH 07/37] Use onnx_protobuf.h to suppress some GCC warnings
 (#23453)

### Description
Use onnx_protobuf.h to suppress some GCC warnings.

All the changes are autogenerated by a shell command.
```bash
find . -type f -exec sed -i 's/#include\s\+<onnx\/onnx_pb.h>/#include "core\/graph\/onnx_protobuf.h"/g' {} \;
```

### Motivation and Context
This PR is needed to make vcpkg work (without disabling all warnings).
This PR is split from another bigger PR per request from a reviewer.
---
 include/onnxruntime/core/graph/node_arg.h                       | 2 +-
 {onnxruntime => include/onnxruntime}/core/graph/onnx_protobuf.h | 0
 onnxruntime/core/graph/function_template.h                      | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc         | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc   | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc      | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc        | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc        | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc      | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc        | 2 +-
 .../nnapi_builtin/builders/impl/depthtospace_op_builder.cc      | 2 +-
 .../nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc  | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc         | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc     | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc      | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc        | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc    | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc   | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc      | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc         | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc        | 2 +-
 .../nnapi_builtin/builders/impl/quantizelinear_op_builder.cc    | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc   | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc        | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc     | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc      | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc       | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc     | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/split_op_builder.cc       | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc     | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc   | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc       | 2 +-
 .../nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc   | 2 +-
 .../core/providers/nnapi/nnapi_builtin/builders/model_builder.h | 2 +-
 onnxruntime/core/providers/rknpu/node_attr_helper.h             | 2 +-
 onnxruntime/core/providers/rknpu/onnx_converter.h               | 2 +-
 onnxruntime/test/fuzzing/include/OnnxPrediction.h               | 2 +-
 onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp    | 2 +-
 onnxruntime/test/providers/coreml/coreml_basic_test.cc          | 2 +-
 onnxruntime/test/providers/qnn/argmaxmin_op_test.cc             | 2 +-
 onnxruntime/test/providers/qnn/average_pool_test.cc             | 2 +-
 onnxruntime/test/providers/qnn/cast_test.cc                     | 2 +-
 onnxruntime/test/providers/qnn/clip_op_test.cc                  | 2 +-
 onnxruntime/test/providers/qnn/flatten_op_test.cc               | 2 +-
 onnxruntime/test/providers/qnn/gather_elems_op_test.cc          | 2 +-
 onnxruntime/test/providers/qnn/gemm_op_test.cc                  | 2 +-
 onnxruntime/test/providers/qnn/logical_comp_ops_test.cc         | 2 +-
 onnxruntime/test/providers/qnn/lrn_op_test.cc                   | 2 +-
 onnxruntime/test/providers/qnn/matmul_test.cpp                  | 2 +-
 onnxruntime/test/providers/qnn/max_min_op_test.cc               | 2 +-
 onnxruntime/test/providers/qnn/pad_op_test.cpp                  | 2 +-
 onnxruntime/test/providers/qnn/pool_op_test.cpp                 | 2 +-
 onnxruntime/test/providers/qnn/reshape_expand_op_test.cc        | 2 +-
 onnxruntime/test/providers/qnn/resize_test.cc                   | 2 +-
 onnxruntime/test/providers/qnn/split_op_test.cc                 | 2 +-
 onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc     | 2 +-
 onnxruntime/test/providers/qnn/tile_op_test.cc                  | 2 +-
 onnxruntime/test/providers/qnn/topk_op_test.cc                  | 2 +-
 58 files changed, 57 insertions(+), 57 deletions(-)
 rename {onnxruntime => include/onnxruntime}/core/graph/onnx_protobuf.h (100%)

diff --git a/include/onnxruntime/core/graph/node_arg.h b/include/onnxruntime/core/graph/node_arg.h
index 921bff59fb..0ddf1a2b9d 100644
--- a/include/onnxruntime/core/graph/node_arg.h
+++ b/include/onnxruntime/core/graph/node_arg.h
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/graph/basic_types.h"
 #include "core/common/status.h"
diff --git a/onnxruntime/core/graph/onnx_protobuf.h b/include/onnxruntime/core/graph/onnx_protobuf.h
similarity index 100%
rename from onnxruntime/core/graph/onnx_protobuf.h
rename to include/onnxruntime/core/graph/onnx_protobuf.h
diff --git a/onnxruntime/core/graph/function_template.h b/onnxruntime/core/graph/function_template.h
index 978174d943..0d3fee18d5 100644
--- a/onnxruntime/core/graph/function_template.h
+++ b/onnxruntime/core/graph/function_template.h
@@ -2,7 +2,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 namespace onnxruntime {
 
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc
index 91cad034d8..fd1720d69e 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/LRN_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc
index 75a66d3a14..5874eb1e7d 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/batchnorm_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc
index 5599fbdc69..91d1a38e71 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/binary_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc
index 9059de817e..03329b9159 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/cast_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc
index 9821d9267c..becd677e32 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/clip_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc
index a8394faec5..fa5e292be0 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/concat_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc
index 5477cd16f9..a7a837ae21 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/conv_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc
index ef8709641e..039d8510bb 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/depthtospace_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc
index 7d0e04fbd7..ed9062f894 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/dequantizelinear_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc
index 218c41d6f0..fc2348951e 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/elu_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc
index b5e9c01199..986ce78fb1 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/flatten_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc
index d6da9181b5..ccd3f8b571 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gather_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc
index 66eefcd6e4..cff96c2f1f 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/gemm_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc
index d7b35572e6..250b190091 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/identity_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc
index 6a633c443c..e3dcee1e3d 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/leakyrelu_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/framework/tensorprotoutils.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc
index aeadbd1705..a80742aef9 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/minmax_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc
index b0404ebec0..8127de0a0f 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pad_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc
index a2a4786b72..10c5efb84e 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/pool_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc
index d13b81c2a1..eb81f5e3f5 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/quantizelinear_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc
index a6da290753..fbb353f949 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reduction_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc
index c8641093ee..d65c069851 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/relu_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc
index f2f9165d2f..fad5d8289c 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/reshape_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc
index 44403010c9..af5aeba6c8 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc
index facdc7132d..52b075b027 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/slice_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc
index a2a8b4512b..8fa915de95 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/softmax_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc
index edee298ad1..7509fd15f1 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/split_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 #include <algorithm>
 
 #include "core/common/logging/logging.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc
index fb3ca5e617..44510c33c0 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/squeeze_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc
index 6fe5ca32fe..4a9e3eb00a 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/transpose_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc
index dbd960ee55..77df9d2fd7 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unary_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc
index 95cd813800..b9ebbace8d 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/unsqueeze_op_builder.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
index 4db335afa9..3cbf7d1ee4 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/model_builder.h
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #pragma once
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 #include <unordered_set>
 
 #include "core/common/inlined_containers_fwd.h"
diff --git a/onnxruntime/core/providers/rknpu/node_attr_helper.h b/onnxruntime/core/providers/rknpu/node_attr_helper.h
index 6ab8f8c6bb..76a0c721f7 100644
--- a/onnxruntime/core/providers/rknpu/node_attr_helper.h
+++ b/onnxruntime/core/providers/rknpu/node_attr_helper.h
@@ -4,7 +4,7 @@
 
 #pragma once
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 #include <vector>
 #include <string>
 
diff --git a/onnxruntime/core/providers/rknpu/onnx_converter.h b/onnxruntime/core/providers/rknpu/onnx_converter.h
index e90efd75b9..10cc09a9db 100644
--- a/onnxruntime/core/providers/rknpu/onnx_converter.h
+++ b/onnxruntime/core/providers/rknpu/onnx_converter.h
@@ -2,7 +2,7 @@
 
 #pragma once
 
-#include <onnx/onnx_pb.h>
+#include "core/graph/onnx_protobuf.h"
 
 #include <map>
 #include <memory>
diff --git a/onnxruntime/test/fuzzing/include/OnnxPrediction.h b/onnxruntime/test/fuzzing/include/OnnxPrediction.h
index c169aaa16f..c99120dc45 100644
--- a/onnxruntime/test/fuzzing/include/OnnxPrediction.h
+++ b/onnxruntime/test/fuzzing/include/OnnxPrediction.h
@@ -20,7 +20,7 @@
 #include <filesystem>
 
 #include "BetaDistribution.h"
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "onnxruntime_cxx_api.h"
 
 #include "testlog.h"
diff --git a/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp b/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp
index 607d9cfd9c..472122be58 100644
--- a/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp
+++ b/onnxruntime/test/fuzzing/ort_libfuzzer/OrtProtoLibfuzzer.cpp
@@ -5,7 +5,7 @@
 #include "OnnxPrediction.h"
 #include "onnxruntime_session_options_config_keys.h"
 #include "src/libfuzzer/libfuzzer_macro.h"
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include <type_traits>
 
diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
index 302ad57fb8..a9aa78b7a3 100644
--- a/onnxruntime/test/providers/coreml/coreml_basic_test.cc
+++ b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -15,7 +15,7 @@
 #include "test/util/include/inference_session_wrapper.h"
 #include "test/util/include/test_environment.h"
 #include "test/util/include/test_utils.h"
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #if !defined(ORT_MINIMAL_BUILD)
 // if this is a full build we need the provider test utils
diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
index c514cf16b2..da6eda1317 100644
--- a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
+++ b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
@@ -9,7 +9,7 @@
 #include "core/graph/node_attr_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc
index 1a0f9bfcba..f897a08da6 100644
--- a/onnxruntime/test/providers/qnn/average_pool_test.cc
+++ b/onnxruntime/test/providers/qnn/average_pool_test.cc
@@ -11,7 +11,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc
index 9b83dd281a..e2e4b0d714 100644
--- a/onnxruntime/test/providers/qnn/cast_test.cc
+++ b/onnxruntime/test/providers/qnn/cast_test.cc
@@ -9,7 +9,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc
index cfa77a4621..21bd6fcc98 100644
--- a/onnxruntime/test/providers/qnn/clip_op_test.cc
+++ b/onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -8,7 +8,7 @@
 #include "test/providers/qnn/qnn_test_utils.h"
 #include "core/graph/node_attr_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc
index 637d3257dd..b33f8f9c00 100644
--- a/onnxruntime/test/providers/qnn/flatten_op_test.cc
+++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -8,7 +8,7 @@
 #include "test/providers/qnn/qnn_test_utils.h"
 #include "core/graph/node_attr_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc
index 81c0887306..85dc792666 100644
--- a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc
@@ -11,7 +11,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc
index da0c7f2c36..0c1146ba22 100644
--- a/onnxruntime/test/providers/qnn/gemm_op_test.cc
+++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -9,7 +9,7 @@
 #include "test/providers/qnn/qnn_test_utils.h"
 #include "core/graph/node_attr_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc
index 5910513678..522b781379 100644
--- a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc
+++ b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc
@@ -9,7 +9,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc
index a99cba66bf..4b26ed0da9 100644
--- a/onnxruntime/test/providers/qnn/lrn_op_test.cc
+++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc
@@ -9,7 +9,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp
index f3f584f24a..dec9369b81 100644
--- a/onnxruntime/test/providers/qnn/matmul_test.cpp
+++ b/onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -8,7 +8,7 @@
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/max_min_op_test.cc b/onnxruntime/test/providers/qnn/max_min_op_test.cc
index 3deff121f3..9a45d11b7e 100644
--- a/onnxruntime/test/providers/qnn/max_min_op_test.cc
+++ b/onnxruntime/test/providers/qnn/max_min_op_test.cc
@@ -7,7 +7,7 @@
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp
index a6b8664c6c..4ce6db7fac 100644
--- a/onnxruntime/test/providers/qnn/pad_op_test.cpp
+++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp
@@ -10,7 +10,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp
index 5dd3a6aaa3..f0ca355719 100644
--- a/onnxruntime/test/providers/qnn/pool_op_test.cpp
+++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp
@@ -10,7 +10,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc
index 3964edc114..b66547a939 100644
--- a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc
+++ b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc
@@ -8,7 +8,7 @@
 #include "test/providers/qnn/qnn_test_utils.h"
 #include "core/graph/node_attr_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc
index 15612e3267..651f55bc05 100644
--- a/onnxruntime/test/providers/qnn/resize_test.cc
+++ b/onnxruntime/test/providers/qnn/resize_test.cc
@@ -9,7 +9,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 
 #include "gtest/gtest.h"
 
diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc
index 6dc721edb4..23682f7e93 100644
--- a/onnxruntime/test/providers/qnn/split_op_test.cc
+++ b/onnxruntime/test/providers/qnn/split_op_test.cc
@@ -7,7 +7,7 @@
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
index 33d2f64c03..abc1b3a89d 100644
--- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc
@@ -7,7 +7,7 @@
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/tile_op_test.cc b/onnxruntime/test/providers/qnn/tile_op_test.cc
index 2b35c730ee..85541efe56 100644
--- a/onnxruntime/test/providers/qnn/tile_op_test.cc
+++ b/onnxruntime/test/providers/qnn/tile_op_test.cc
@@ -8,7 +8,7 @@
 #include "test/providers/qnn/qnn_test_utils.h"
 #include "core/graph/node_attr_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/test/providers/qnn/topk_op_test.cc b/onnxruntime/test/providers/qnn/topk_op_test.cc
index 5a9351b936..354a5d1e3b 100644
--- a/onnxruntime/test/providers/qnn/topk_op_test.cc
+++ b/onnxruntime/test/providers/qnn/topk_op_test.cc
@@ -8,7 +8,7 @@
 #include "test/providers/qnn/qnn_test_utils.h"
 #include "core/graph/node_attr_utils.h"
 
-#include "onnx/onnx_pb.h"
+#include "core/graph/onnx_protobuf.h"
 #include "gtest/gtest.h"
 
 namespace onnxruntime {

From 25f427466e6d590c55d7e252fef7e495ec709465 Mon Sep 17 00:00:00 2001
From: Jiajia Qin <jiajiaqin@microsoft.com>
Date: Thu, 23 Jan 2025 00:59:17 +0800
Subject: [PATCH 08/37] [js/webgpu] Optimize ConvTranspose (Continue) (#23429)

BUG #23273

This PR makes the following optimizations:
1. When the number of output channels is one, 1) calculate the offsets
before the input-channel loop to reduce indices-to-offset calculations, and
2) split `inputChannelsPerGroup` into `inputChannelsPerGroupInt` and
`inputChannelsRemainder` parts so that we can always access 4 elements at
a time for `inputChannelsPerGroupInt`.
2. Use a precise initial value to avoid useless loop iterations. Thanks to
@jiangzhaoming for the suggestion.

With this PR, ConvTranspose becomes 3.7s from 8.4s on Intel Meteor Lake.
On NV RTX 2000 Ada, it becomes 1.6s from 2.7s.
---
 .../ops/3rd-party/conv_backprop_webgpu.ts     | 118 ++++++++++----
 js/web/test/data/ops/conv-transpose.jsonc     | 146 ++++++++++++++++++
 2 files changed, 238 insertions(+), 26 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
index 0aa3ad6c4c..097e255256 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_webgpu.ts
@@ -46,6 +46,11 @@ export const createConvTranspose2DProgramInfo = (
   const inputChannelsPerGroup = wShape[2] / group;
   const outputChannelsPerGroup = wShape[3];
   const aComponents = isChannelsLast ? getMaxComponents(inputChannelsPerGroup) : 1;
+  const packInputAs4 = isChannelsLast && outputChannelsPerGroup === 1;
+  const inputChannelsPerGroupInt = packInputAs4
+    ? Math.floor(inputChannelsPerGroup / 4) * 4
+    : Math.floor(inputChannelsPerGroup / aComponents) * aComponents;
+  const inputChannelsRemainder = inputChannelsPerGroup - inputChannelsPerGroupInt;
   const components = isChannelsLast ? getMaxComponents(outputChannelsPerGroup) : 1;
   const bComponents = isChannelsLast ? (outputChannelsPerGroup === 1 ? aComponents : components) : 1;
   const outputSize = ShapeUtil.size(outputShape) / components;
@@ -78,7 +83,7 @@ export const createConvTranspose2DProgramInfo = (
     { type: DataType.uint32, data: dilations },
     { type: DataType.uint32, data: effectiveFilterDims },
     { type: DataType.int32, data: pads },
-    { type: DataType.uint32, data: inputChannelsPerGroup },
+    { type: DataType.uint32, data: inputChannelsPerGroupInt },
     { type: DataType.uint32, data: outputChannelsPerGroup },
     ...createTensorShapeVariables(inputs[0].dims, inputs[1].dims),
   ];
@@ -114,16 +119,40 @@ export const createConvTranspose2DProgramInfo = (
 
     const calculateResult = (): string => {
       let calcStr = '';
-      if (aComponents === 1) {
-        calcStr += `
-        let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)};
-        let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)};
-        dotProd = dotProd + xValue * wValue;`;
-      } else {
-        if (outputChannelsPerGroup === 1) {
+      if (packInputAs4) {
+        if (aComponents === 4) {
           calcStr += `
-          let wValue = ${w.getByOffset(`${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)} / ${bComponents}`)};
-          dotProd = dotProd + dot(xValue, wValue);`;
+        let xValue = ${dy.getByOffset('x_offset')};
+        let wValue = ${w.getByOffset('w_offset')};
+        dotProd = dotProd + dot(xValue, wValue);
+        x_offset += 1u;
+        w_offset += 1u;`;
+        } else if (aComponents === 2) {
+          calcStr += `
+          dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}));
+          x_offset += 2u;
+          w_offset += 2u;`;
+        } else if (aComponents === 1) {
+          calcStr += `
+          dotProd = dotProd + dot(vec4<${dataType}>(${dy.getByOffset('x_offset')}, ${dy.getByOffset('x_offset + 1u')}, ${dy.getByOffset('x_offset + 2u')}, ${dy.getByOffset('x_offset + 3u')}), vec4<${dataType}>(${w.getByOffset('w_offset')}, ${w.getByOffset('w_offset + 1u')}, ${w.getByOffset('w_offset + 2u')}, ${w.getByOffset('w_offset + 3u')}));
+          x_offset += 4u;
+          w_offset += 4u;`;
+        }
+      } else {
+        calcStr += `
+                  let xValue = ${
+                    isChannelsLast
+                      ? dy.getByOffset(
+                          `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`,
+                        )
+                      : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
+                  };
+        `;
+        if (aComponents === 1) {
+          calcStr += `
+          let w_offset = ${w.indicesToOffset(`${w.type.indices}(u32(wRPerm), u32(wCPerm), inputChannel, wOutChannel)`)};
+          let wValue = ${w.getByOffset(`w_offset / ${bComponents}`)};
+          dotProd = dotProd + xValue * wValue;`;
         } else {
           for (let c = 0; c < aComponents; c++) {
             calcStr += `
@@ -134,6 +163,32 @@ export const createConvTranspose2DProgramInfo = (
       }
       return calcStr;
     };
+    const calculateRemainder = (): string => {
+      if (inputChannelsRemainder === 0) {
+        return ''; // no leftover input channels, so no extra shader code is emitted
+      }
+      if (!packInputAs4) {
+        throw new Error(`packInputAs4 ${packInputAs4} is not true.`); // leftovers need the packed path
+      }
+      let calcStr = ''; // WGSL placed after the main channel loop to accumulate the leftover channels
+      if (aComponents === 1) {
+        calcStr += 'dotProd = dotProd'; // one "+ x * w" term is appended per leftover channel
+        for (let i = 0; i < inputChannelsRemainder; i++) {
+          calcStr += `
+            + ${dy.getByOffset(`x_offset + ${i}`)} * ${w.getByOffset(`w_offset + ${i}`)}`;
+        }
+        calcStr += ';';
+      } else if (aComponents === 2) {
+        if (inputChannelsRemainder !== 2) {
+          throw new Error(`Invalid inputChannelsRemainder ${inputChannelsRemainder}.`); // must be exactly 2
+        }
+        calcStr += `
+          let xValue = ${dy.getByOffset('x_offset')};
+          let wValue = ${w.getByOffset('w_offset')};
+          dotProd = dotProd + dot(xValue, wValue);`;
+      }
+      return calcStr; // still '' when aComponents is neither 1 nor 2
+    };
     const codeSnippet = `
             let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)};
             let batch = ${output.indicesGet('outputIndices', 0)};
@@ -148,7 +203,12 @@ export const createConvTranspose2DProgramInfo = (
             // Convolve dy(?, ?, d2) with w(:, :, d1, d2) to compute dx(xR, xC, d1).
             // ? = to be determined. : = across all values in that axis.
             var dotProd = ${output.type.value}(0.0);
-            for (var wR: u32 = 0; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
+            var wR: u32 = 0;
+            if (uniforms.dilations.x == 1) {
+              // Minimum wR >= 0 that satisfies (dyRCorner + wR) % (uniforms.strides.x) == 0
+              wR = u32(((dyRCorner + i32(uniforms.strides.x) - 1) / i32(uniforms.strides.x)) * i32(uniforms.strides.x) - dyRCorner);
+            }
+            for (; wR < uniforms.effective_filter_dims.x; wR = wR + 1) {
               if (wR % uniforms.dilations.x != 0) {
                 continue;
               }
@@ -158,10 +218,13 @@ export const createConvTranspose2DProgramInfo = (
                   wRPerm < 0) {
                 continue;
               }
-              wR = wR + uniforms.strides[0] - 1;
               let idyR: u32 = u32(dyR);
-
-              for (var wC: u32 = 0; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
+              var wC: u32 = 0;
+              if (uniforms.dilations.y == 1) {
+                // Minimum wC >= 0 that satisfies (dyCCorner + wC) % (uniforms.strides.y) == 0
+                wC = u32(((dyCCorner + i32(uniforms.strides.y) - 1) / i32(uniforms.strides.y)) * i32(uniforms.strides.y) - dyCCorner);
+              }
+              for (; wC < uniforms.effective_filter_dims.y; wC = wC + 1) {
                 if (wC % uniforms.dilations.y != 0) {
                   continue;
                 }
@@ -171,21 +234,24 @@ export const createConvTranspose2DProgramInfo = (
                     fract(dyC) > 0.0 || wCPerm < 0) {
                   continue;
                 }
-                wC = wC + uniforms.strides.y - 1;
                 let idyC: u32 = u32(dyC);
                 var inputChannel = groupId * uniforms.input_channels_per_group;
-                for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${aComponents}) {
-                  let xValue = ${
-                    isChannelsLast
-                      ? dy.getByOffset(
-                          `${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents}`,
-                        )
-                      : dy.get('batch', 'inputChannel', 'idyR', 'idyC')
-                  };
-                  ${calculateResult()}
-                  inputChannel = inputChannel + ${aComponents};
+                ${
+                  packInputAs4
+                    ? `
+                var x_offset = ${dy.indicesToOffset(`${dy.type.indices}(batch, idyR, idyC, inputChannel)`)} / ${aComponents};
+                var w_offset = ${w.indicesToOffset(`${w.type.indices}(wRPerm, wCPerm, inputChannel, wOutChannel)`)} / ${bComponents};
+                  `
+                    : ''
                 }
+                for (var d2: u32 = 0; d2 < uniforms.input_channels_per_group; d2 = d2 + ${packInputAs4 ? 4 : aComponents}) {
+                  ${calculateResult()}
+                  inputChannel = inputChannel + ${packInputAs4 ? 4 : aComponents};
+                }
+                ${calculateRemainder()}
+                wC = wC + uniforms.strides.y - 1;
               }
+              wR = wR + uniforms.strides[0] - 1;
             }
             let value = dotProd${hasBias ? ` + bias[d1 / ${components}]` : ''};
             ${output.setByOffset('global_idx', 'value')};
@@ -201,7 +267,7 @@ export const createConvTranspose2DProgramInfo = (
   return {
     name: 'ConvTranspose2D',
     shaderCache: {
-      hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}`,
+      hint: `${attributes.cacheKey};${aComponents}${bComponents}${components}${outputChannelsPerGroup === 1}${inputChannelsRemainder}`,
       inputDependencies,
     },
     getRunData: () => ({
diff --git a/js/web/test/data/ops/conv-transpose.jsonc b/js/web/test/data/ops/conv-transpose.jsonc
index f827601b3a..a6a799dcce 100644
--- a/js/web/test/data/ops/conv-transpose.jsonc
+++ b/js/web/test/data/ops/conv-transpose.jsonc
@@ -458,6 +458,152 @@
       }
     ]
   },
+  {
+    "name": "ConvTranspose with output channels = 1",
+    "operator": "ConvTranspose",
+    "inputShapeDefinitions": "rankOnly",
+    "opset": { "domain": "", "version": 17 },
+    "attributes": [
+      { "name": "kernel_shape", "data": [2, 2], "type": "ints" },
+      { "name": "strides", "data": [2, 2], "type": "ints" }
+    ],
+    "cases": [
+      {
+        "name": "inChannels = 5",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45
+            ],
+            "dims": [1, 5, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8],
+            "dims": [5, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              437, 532, 458, 558, 479, 584, 627, 722, 658, 758, 689, 794, 500, 610, 521, 636, 542, 662, 720, 830, 751,
+              866, 782, 902, 563, 688, 584, 714, 605, 740, 813, 938, 844, 974, 875, 1010
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      },
+      {
+        "name": "inChannels = 6",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9
+            ],
+            "dims": [1, 6, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4],
+            "dims": [6, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              438, 534, 460, 562, 482, 590, 630, 726, 664, 766, 698, 806, 504, 618, 526, 646, 548, 674, 732, 846, 766,
+              886, 800, 926, 570, 702, 592, 730, 614, 758, 834, 966, 868, 1006, 902, 1046
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      },
+      {
+        "name": "inChannels = 7",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+              14, 15, 16, 17, 18
+            ],
+            "dims": [1, 7, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8],
+            "dims": [7, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              488, 594, 515, 628, 542, 662, 700, 806, 741, 854, 782, 902, 569, 696, 596, 730, 623, 764, 823, 950, 864,
+              998, 905, 1046, 650, 798, 677, 832, 704, 866, 946, 1094, 987, 1142, 1028, 1190
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      },
+      {
+        "name": "inChannels = 8",
+        "inputs": [
+          {
+            "data": [
+              1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+              30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+              14, 15, 16, 17, 18, 1, 2, 3, 4, 5, 6, 7, 8, 9
+            ],
+            "dims": [1, 8, 3, 3],
+            "type": "float32"
+          },
+          {
+            "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4],
+            "dims": [8, 1, 2, 2],
+            "type": "float32"
+          },
+          {
+            "data": [2],
+            "dims": [1],
+            "type": "float32"
+          }
+        ],
+        "outputs": [
+          {
+            "data": [
+              489, 596, 517, 632, 545, 668, 703, 810, 747, 862, 791, 914, 573, 704, 601, 740, 629, 776, 835, 966, 879,
+              1018, 923, 1070, 657, 812, 685, 848, 713, 884, 967, 1122, 1011, 1174, 1055, 1226
+            ],
+            "dims": [1, 1, 6, 6],
+            "type": "float32"
+          }
+        ]
+      }
+    ]
+  },
   {
     "name": "ConvTranspose without bias addition C",
     "operator": "ConvTranspose",

From 9f9fcf74ff55d7239803d45ca3a989f225da136b Mon Sep 17 00:00:00 2001
From: Caroline Zhu <wolfivyaura@gmail.com>
Date: Wed, 22 Jan 2025 10:57:09 -0800
Subject: [PATCH 09/37] [Mobile] Add BrowserStack Android MAUI Test (#23383)

### Description
Add test project that will perform an automated UI test that runs the
unit tests on Android.

### Motivation
- Enables end-to-end on-device MAUI unit testing which we want to add to
the packaging pipelines

### Context
Microsoft.ML.OnnxRuntime.Tests.MAUI uses DeviceRunners.VisualRunners to
allow running the unit tests (found in
Microsoft.ML.OnnxRuntime.Tests.Common) across multiple devices.
DeviceRunners.VisualRunners provides a simple UI with a button that will
run the unit tests and a panel with the unit test results.

In order to automate the process of running the unit tests across mobile
devices, Appium is used for UI testing orchestration (it provides a way
to interact with the UI), and BrowserStack automatically runs these
Appium tests across different mobile devices.

This project does not include the capability to start an Appium server
locally or attach to a local emulator or device.

## Build & run instructions
### Requirements
* A BrowserStack account with access to App Automate
* You can set BrowserStack credentials as environment variables as shown
[here](https://www.browserstack.com/docs/app-automate/appium/getting-started/c-sharp/nunit/integrate-your-tests#CLI)
* ONNXRuntime NuGet package
1. You can either download the [stable NuGet
package](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime) then
follow the instructions from [NativeLibraryInclude.props
file](../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props)
to use the downloaded .nupkg file
2. Or follow the [build
instructions](https://onnxruntime.ai/docs/build/android.html) to build
the Android package locally
* The dotnet workloads for maui and maui-android, which will not always
automatically install correctly
    1. `dotnet workload install maui`
    2. `dotnet workload install maui-android`
* [Appium](https://appium.io/docs/en/latest/quickstart/) and the
[UiAutomator2
driver](https://appium.io/docs/en/latest/quickstart/uiauto2-driver/)

### Run instructions
1. Build the Microsoft.ML.OnnxRuntime.Tests.MAUI project into a signed
APK.
1. Run the following: `dotnet publish -c Release -f net8.0-android` in
the Microsoft.ML.OnnxRuntime.Tests.MAUI directory.
2. Search for the APK files generated. They should be located in
`bin\Release\net8.0-android\publish`.
3. If they're in a different location, edit the `browserstack.yml` file
to target the path to the signed APK.
2. Ensure you've set the BrowserStack credentials as environment
variables.
3. Run the following in the
Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android directory: `dotnet
test`
4. Navigate to the [BrowserStack App Automate
dashboard](https://app-automate.browserstack.com/dashboard/v2/builds) to
see your test running!
---
 .../.config/dotnet-tools.json                 |  13 ++
 .../BrowserStackTest.cs                       |  68 +++++++
 ...xRuntime.Tests.BrowserStack.Android.csproj |  22 +++
 .../README.md                                 |  48 +++++
 .../RunAllTest.cs                             | 123 ++++++++++++
 .../browserstack.yml                          |  13 ++
 ...Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj | 186 +++++++++---------
 7 files changed, 383 insertions(+), 90 deletions(-)
 create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json
 create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs
 create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj
 create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md
 create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs
 create mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml

diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json
new file mode 100644
index 0000000000..67d39c423d
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json
@@ -0,0 +1,13 @@
+{
+  "version": 1,
+  "isRoot": true,
+  "tools": {
+    "browserstack-sdk": {
+      "version": "1.16.13",
+      "commands": [
+        "browserstack-sdk"
+      ],
+      "rollForward": false
+    }
+  }
+}
\ No newline at end of file
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs
new file mode 100644
index 0000000000..84377d65d1
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs
@@ -0,0 +1,68 @@
+﻿using Newtonsoft.Json;
+using NUnit.Framework.Interfaces;
+using NUnit.Framework;
+using OpenQA.Selenium;
+using OpenQA.Selenium.Appium;
+using OpenQA.Selenium.Appium.Android;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android
+{
+    /// <summary>NUnit fixture: opens an Appium Android session per test and reports its result.</summary>
+    public class BrowserStackTest
+    {
+        public AndroidDriver driver;
+        public BrowserStackTest() {}
+
+        /// <summary>Creates the UIAutomator2 Android driver for http://127.0.0.1:4723/wd/hub.</summary>
+        [SetUp]
+        public void Init()
+        {
+            var androidOptions = new AppiumOptions {
+                AutomationName = "UIAutomator2",
+                PlatformName = "Android",
+            };
+            driver = new AndroidDriver(new Uri("http://127.0.0.1:4723/wd/hub"), androidOptions);
+        }
+
+        /// <summary>
+        /// Passes the correct test status to BrowserStack and ensures the driver quits.
+        /// </summary>
+        [TearDown]
+        public void Dispose()
+        {
+            try
+            {
+                // According to
+                // https://www.browserstack.com/docs/app-automate/appium/set-up-tests/mark-tests-as-pass-fail
+                // BrowserStack doesn't know whether test assertions have passed or failed. Below handles
+                // passing the test status to BrowserStack along with any relevant information.
+                if (TestContext.CurrentContext.Result.Outcome.Status == TestStatus.Failed)
+                {
+                    // Concatenate instead of String.Format: the literal JSON braces are not valid format
+                    // items and would throw FormatException. JsonConvert.ToString quotes and escapes the text.
+                    String failureMessage = TestContext.CurrentContext.Result.Message;
+                    String jsonToSendFailure =
+                        "browserstack_executor: {\"action\": \"setSessionStatus\", \"arguments\": " +
+                        "{\"status\":\"failed\", \"reason\": " + JsonConvert.ToString(failureMessage) + "}}";
+                    ((IJavaScriptExecutor)driver).ExecuteScript(jsonToSendFailure);
+                }
+                else
+                {
+                    ((IJavaScriptExecutor)driver)
+                        .ExecuteScript("browserstack_executor: {\"action\": \"setSessionStatus\", \"arguments\": " +
+                                       "{\"status\":\"passed\", \"reason\": \"\"}}");
+                }
+            }
+            finally
+            {
+                // will run even if exception is thrown by previous block
+                ((AndroidDriver)driver).Quit();
+            }
+        }
+    }
+}
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj
new file mode 100644
index 0000000000..9b9028d30c
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj
@@ -0,0 +1,22 @@
+<Project Sdk="Microsoft.NET.Sdk">
+	
+	<PropertyGroup>
+		<TargetFramework>net8.0</TargetFramework>
+		<ImplicitUsings>enable</ImplicitUsings>
+		<Nullable>enable</Nullable>
+
+		<IsPackable>false</IsPackable>
+		<IsTestProject>true</IsTestProject>
+	</PropertyGroup>
+
+	<ItemGroup>
+		<PackageReference Include="Appium.WebDriver" Version="5.0.0-rc.5" />
+		<PackageReference Include="BrowserStack.TestAdapter" Version="0.13.13" />
+		<PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
+		<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
+		<PackageReference Include="NUnit" Version="3.13.0" />
+		<PackageReference Include="NUnit.Analyzers" Version="3.3.0" />
+		<PackageReference Include="NUnit3TestAdapter" Version="4.3.0" />
+	</ItemGroup>
+	
+</Project>
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md
new file mode 100644
index 0000000000..9c4e2307d8
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md
@@ -0,0 +1,48 @@
+# BrowserStack Android test
+This project will run the Android MAUI tests on BrowserStack, which allows you to run automated tests on a variety of mobile devices.
+
+## Context
+Microsoft.ML.OnnxRuntime.Tests.MAUI uses DeviceRunners.VisualRunners to allow running the unit tests (found in Microsoft.ML.OnnxRuntime.Tests.Common) across multiple devices. DeviceRunners.VisualRunners provides a simple UI with a button that will run the unit tests and a panel with the unit test results. 
+
+In order to automate the process of running the unit tests across mobile devices, Appium is used for UI testing orchestration (it provides a way to interact with the UI), and BrowserStack automatically runs these Appium tests across different mobile devices.
+
+This project does not include the capability to start an Appium server locally or attach to a local emulator or device. 
+
+## Build & run instructions
+### Requirements
+* A BrowserStack account with access to App Automate
+    * You can set BrowserStack credentials as environment variables as shown [here](https://www.browserstack.com/docs/app-automate/appium/getting-started/c-sharp/nunit/integrate-your-tests#CLI)
+* ONNXRuntime NuGet package
+    1. You can either download the [stable NuGet package](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime) then follow the instructions from [NativeLibraryInclude.props file](../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props) to use the downloaded .nupkg file
+    2. Or follow the [build instructions](https://onnxruntime.ai/docs/build/android.html) to build the Android package locally
+* The dotnet workloads for maui and maui-android, which will not always automatically install correctly
+    1. `dotnet workload install maui`
+    2. `dotnet workload install maui-android`
+* [Appium](https://appium.io/docs/en/latest/quickstart/) and the [UiAutomator2 driver](https://appium.io/docs/en/latest/quickstart/uiauto2-driver/)
+
+### Run instructions
+1. Build the Microsoft.ML.OnnxRuntime.Tests.MAUI project into a signed APK.
+    1. Run the following: `dotnet publish -c Release -f net8.0-android` in the Microsoft.ML.OnnxRuntime.Tests.MAUI directory.
+    2. Search for the APK files generated. They should be located in `bin\Release\net8.0-android\publish`. 
+    3. If they're in a different location, edit the `browserstack.yml` file to target the path to the signed APK.
+2. Ensure you've set the BrowserStack credentials as environment variables.
+3. Run the following in the Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android directory: `dotnet test`
+4. Navigate to the [BrowserStack App Automate dashboard](https://app-automate.browserstack.com/dashboard/v2/builds) to see your test running!
+
+## Troubleshooting & Resources
+### BrowserStack Resources
+- [Configuration docs](https://www.browserstack.com/docs/app-automate/appium/sdk-params#test-context) for browserstack.yml
+- [Configuration generator](https://www.browserstack.com/docs/app-automate/capabilities) for browserstack.yml
+- [Integration guide](https://www.browserstack.com/docs/app-automate/appium/getting-started/c-sharp/nunit/integrate-your-tests#CLI)
+
+### Troubleshooting
+- Issues building the MAUI app: 
+    - Make sure that the maui and maui-android workloads are installed correctly by running `dotnet workload list`
+    - If you believe the issues are workload related, you can also try running `dotnet workload repair` (this has personally never worked for me)
+    - Try running `dotnet clean`. However, this does not fully remove all the previous intermediaries. If you're still running into the errors, manually deleting the bin and obj folders can sometimes resolve them. 
+- After building the MAUI app, try installing on an emulator and clicking the "Run All" button to ensure that everything is working. (If you are missing the ONNXRuntime package, it will not show up as an error until you click "Run All".)
+    - Running the MAUI app from Visual Studio will not replicate running it through BrowserStack. Instead, use `adb install [path to signed apk]` to install the app then use the emulator to launch the app.
+- Issues with the BrowserStack.Android test app: there is an Appium Doctor package on npm -- run `npm install @appium/doctor --location=global` then `appium-doctor --android` and follow the directed instructions. Some errors with Appium on Android will not appear until runtime.
+- Connection refused by Appium server: this can happen if you already have an Appium server running locally. If you do, stop the Appium server then try `dotnet test` again.
+- App is crashing on BrowserStack or it emits an error that it cannot run this APK file: make sure that you are passing in the correct signed APK from the publish folder. 
+- It appears that a test runs on CLI but a build is not launched on BrowserStack: this happens when the BrowserStack Test Adapter cannot find the browserstack.yml file (which has to be named "browserstack.yml" -- do not be tricked by BrowserStack's article on custom-named configuration files)
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs
new file mode 100644
index 0000000000..5db3dc9957
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs
@@ -0,0 +1,123 @@
+﻿using OpenQA.Selenium.Appium;
+using OpenQA.Selenium;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android
+{
+    /// <summary>
+    /// This class contains a single test: RunAll, which interacts with the UI from
+    /// https://github.com/mattleibow/DeviceRunners/tree/main by clicking the "Run All" button and checking the number
+    /// of passed and failed tests.
+    ///
+    /// It searches for elements on the page using Appium's WebDriver. These searches use the XPath attributes.
+    ///
+    /// Launching the MAUI test app in Appium Inspector will allow you to see the exact XPath attributes for each
+    /// element.
+    /// </summary>
+    [TestFixture]
+    public class RunAllTest : BrowserStackTest
+    {
+        // Upper bound for the on-device run. NOTE(review): 60 minutes is a generous guess -- tune as needed.
+        private static readonly TimeSpan RunAllTimeout = TimeSpan.FromMinutes(60);
+
+        /// <summary>Returns the first element matching the XPath query whose text contains the given text.</summary>
+        /// <exception cref="Exception">Thrown when no such element is on the page.</exception>
+        public AppiumElement FindAppiumElement(String xpathQuery, String text)
+        {
+            AppiumElement? match = driver.FindElements(By.XPath(xpathQuery)).FirstOrDefault(e => e.Text.Contains(text));
+            return match ?? throw new Exception(String.Format("Could not find {0}: {1} on the page.", xpathQuery, text));
+        }
+
+        /// <summary>Finds an element with <see cref="FindAppiumElement"/>, clicks it, and returns it.</summary>
+        public AppiumElement FindAppiumElementThenClick(String xpathQuery, String text)
+        {
+            AppiumElement appiumElement = FindAppiumElement(xpathQuery, text);
+            appiumElement.Click();
+            return appiumElement;
+        }
+
+        /// <summary>
+        /// Reads the passed/failed totals from the page: each total is the TextView that directly
+        /// follows its marker label ("✔" for passed, "⛔" for failed).
+        /// </summary>
+        /// <returns>(numPassed, numFailed). An assertion fails if either total cannot be read.</returns>
+        public (int, int) GetPassFailCount()
+        {
+            int numPassed = -1;
+            int numFailed = -1;
+            List<AppiumElement> labels = driver.FindElements(By.XPath("//android.widget.TextView")).ToList();
+
+            // Total right after the marker at index; -1 if it is missing or not a number (the asserts below report it).
+            int CountAfter(int index) =>
+                index + 1 < labels.Count && int.TryParse(labels[index + 1].Text, out int count) ? count : -1;
+
+            for (int i = 0; i < labels.Count; i++)
+            {
+                string marker = labels[i].Text;
+                if (marker.Equals("✔"))
+                {
+                    numPassed = CountAfter(i);
+                }
+                else if (marker.Equals("⛔"))
+                {
+                    numFailed = CountAfter(i);
+                    break;
+                }
+            }
+
+            Assert.That(numPassed, Is.GreaterThanOrEqualTo(0), "Could not find number passed label.");
+            Assert.That(numFailed, Is.GreaterThanOrEqualTo(0), "Could not find number failed label.");
+
+            return (numPassed, numFailed);
+        }
+
+        /// <summary>Clicks "Run All", waits for completion, and fails with the results listing if any test failed.</summary>
+        [Test]
+        public async Task ClickRunAllTest()
+        {
+            // XAML for the main page:
+            // https://github.com/mattleibow/DeviceRunners/blob/cba7644e07b305ba64dc930b01c3eee55ef2b93d/src/DeviceRunners.VisualRunners.Maui/App/Pages/HomePage.xaml
+            AppiumElement runAllButton = FindAppiumElementThenClick("//android.widget.Button", "Run All");
+
+            // Poll until the button is enabled again; the deadline keeps a hung app from stalling this test forever.
+            DateTime deadline = DateTime.UtcNow + RunAllTimeout;
+            while (!runAllButton.Enabled)
+            {
+                Assert.That(DateTime.UtcNow, Is.LessThan(deadline), "Timed out waiting for the unit tests to finish.");
+                await Task.Delay(500);
+            }
+
+            var (numPassed, numFailed) = GetPassFailCount();
+            if (numFailed == 0)
+            {
+                return;
+            }
+
+            // click into test results if tests have failed
+            FindAppiumElementThenClick("//android.widget.TextView", "⛔");
+            await Task.Delay(500);
+
+            // Brings you to the test assembly page
+            // XAML for test assembly page:
+            // https://github.com/mattleibow/DeviceRunners/blob/cba7644e07b305ba64dc930b01c3eee55ef2b93d/src/DeviceRunners.VisualRunners.Maui/App/Pages/TestAssemblyPage.xaml
+            FindAppiumElementThenClick("//android.widget.EditText", "All");
+            await Task.Delay(100);
+            FindAppiumElementThenClick("//android.widget.TextView", "Failed");
+            await Task.Delay(500);
+
+            StringBuilder sb = new StringBuilder();
+            sb.AppendLine("PASSED TESTS: " + numPassed + " | FAILED TESTS: " + numFailed);
+            foreach (var element in driver.FindElements(By.XPath("//android.widget.TextView")))
+            {
+                sb.AppendLine(element.Text);
+            }
+
+            Assert.That(numFailed, Is.EqualTo(0), sb.ToString());
+        }
+    }
+}
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml
new file mode 100644
index 0000000000..9efbc9fc6a
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml
@@ -0,0 +1,13 @@
+app: ..\Microsoft.ML.OnnxRuntime.Tests.MAUI\bin\Release\net8.0-android\publish\ORT.CSharp.Tests.MAUI-Signed.apk  # signed APK from `dotnet publish` (see README.md)
+platforms:
+  - platformName: android
+    deviceName: Samsung Galaxy S22 Ultra
+    platformVersion: '12.0'  # quoted so it stays the string "12.0" instead of being typed as a float
+browserstackLocal: true
+buildName: ORT android test
+buildIdentifier: ${BUILD_NUMBER}
+projectName: ORT-UITests
+debug: true
+networkLogs: false
+testContextOptions:
+  skipSessionStatus: true  # the test code reports pass/fail itself via the setSessionStatus executor call
\ No newline at end of file
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj
index e07448daee..652da8899f 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj
@@ -1,125 +1,131 @@
 ﻿<Project Sdk="Microsoft.NET.Sdk">
-    <PropertyGroup>
-        <OnnxRuntimeRoot>$(ProjectDir)..\..\..</OnnxRuntimeRoot>
-    </PropertyGroup>
+	<PropertyGroup>
+		<OnnxRuntimeRoot>$(ProjectDir)..\..\..</OnnxRuntimeRoot>
+	</PropertyGroup>
 
-    <Import Project="../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props" />
+	<Import Project="../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props" />
 
-    <!-- General app properties -->
-    <PropertyGroup>
-        <TargetFrameworks>net8.0-android;net8.0-ios;net8.0-maccatalyst</TargetFrameworks>
-        <TargetFrameworks Condition="$([MSBuild]::IsOSPlatform('windows'))">$(TargetFrameworks);net8.0-windows10.0.19041.0</TargetFrameworks>
+	<!-- General app properties -->
+	<PropertyGroup>
+		<TargetFrameworks>net8.0-android;net8.0-ios;net8.0-maccatalyst</TargetFrameworks>
+		<TargetFrameworks Condition="$([MSBuild]::IsOSPlatform('windows'))">$(TargetFrameworks);net8.0-windows10.0.19041.0</TargetFrameworks>
 
-        <!-- Note for MacCatalyst:
+		<!-- Note for MacCatalyst:
         The default runtime is maccatalyst-x64, except in Release config, in which case the default is maccatalyst-x64;maccatalyst-arm64.
         When specifying both architectures, use the plural <RuntimeIdentifiers> instead of the singular <RuntimeIdentifier>.
         The Mac App Store will NOT accept apps with ONLY maccatalyst-arm64 indicated;
         either BOTH runtimes must be indicated or ONLY macatalyst-x64. -->
-        <!-- For example: <RuntimeIdentifiers>maccatalyst-x64;maccatalyst-arm64</RuntimeIdentifiers> -->
+		<!-- For example: <RuntimeIdentifiers>maccatalyst-x64;maccatalyst-arm64</RuntimeIdentifiers> -->
 
-        <OutputType>Exe</OutputType>
-        <RootNamespace>Microsoft.ML.OnnxRuntime.Tests.MAUI</RootNamespace>
-        <UseMaui>true</UseMaui>
-        <SingleProject>true</SingleProject>
-        <ImplicitUsings>enable</ImplicitUsings>
-        <Nullable>enable</Nullable>
-        <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
-        <!-- some of the helper packages don't have strong named assemblies. -->
-        <NoWarn>8002</NoWarn>
+		<RootNamespace>Microsoft.ML.OnnxRuntime.Tests.MAUI</RootNamespace>
+		<UseMaui>true</UseMaui>
+		<SingleProject>true</SingleProject>
+		<ImplicitUsings>enable</ImplicitUsings>
+		<Nullable>enable</Nullable>
+		<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+		<!-- some of the helper packages don't have strong named assemblies. -->
+		<NoWarn>8002</NoWarn>
 
-        <!-- These are copied from the sample. TBD what we really need. -->
-        <DefineConstants Condition="'$(CI)' != 'true'">$(DefineConstants);INCLUDE_FAILING_TESTS</DefineConstants>
-        <DefineConstants Condition="'$(TestingMode)' == 'NonInteractiveVisual'">$(DefineConstants);MODE_NON_INTERACTIVE_VISUAL</DefineConstants>
-        <DefineConstants Condition="'$(TestingMode)' == 'XHarness'">$(DefineConstants);MODE_XHARNESS</DefineConstants>
+		<!-- These are copied from the sample. TBD what we really need. -->
+		<DefineConstants Condition="'$(CI)' != 'true'">$(DefineConstants);INCLUDE_FAILING_TESTS</DefineConstants>
+		<DefineConstants Condition="'$(TestingMode)' == 'NonInteractiveVisual'">$(DefineConstants);MODE_NON_INTERACTIVE_VISUAL</DefineConstants>
+		<DefineConstants Condition="'$(TestingMode)' == 'XHarness'">$(DefineConstants);MODE_XHARNESS</DefineConstants>
 
-        <!-- Display name -->
-        <ApplicationTitle>Microsoft.ML.OnnxRuntime.Tests.MAUI</ApplicationTitle>
+		<!-- Display name -->
+		<ApplicationTitle>Microsoft.ML.OnnxRuntime.Tests.MAUI</ApplicationTitle>
 
-        <!-- App Identifier. MUST be short or you get a misleading error about not being able to deploy the app -->
-        <ApplicationId>ORT.CSharp.Tests.MAUI</ApplicationId>
+		<!-- App Identifier. MUST be short or you get a misleading error about not being able to deploy the app -->
+		<ApplicationId>ORT.CSharp.Tests.MAUI</ApplicationId>
 
-        <!-- Versions -->
-        <ApplicationDisplayVersion>1.0</ApplicationDisplayVersion>
-        <ApplicationVersion>1</ApplicationVersion>
+		<!-- Versions -->
+		<ApplicationDisplayVersion>1.0</ApplicationDisplayVersion>
+		<ApplicationVersion>1</ApplicationVersion>
 
-        <SupportedOSPlatformVersion Condition="'$(IsIOSTarget)' == 'true'">15.0</SupportedOSPlatformVersion>
-        <SupportedOSPlatformVersion Condition="'$(IsMacCatalystTarget)' == 'true'">13.1</SupportedOSPlatformVersion>
-        <SupportedOSPlatformVersion Condition="'$(IsAndroidTarget)' == 'true'">30.0</SupportedOSPlatformVersion>
-        <SupportedOSPlatformVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</SupportedOSPlatformVersion>
-        <TargetPlatformMinVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</TargetPlatformMinVersion>
+		<SupportedOSPlatformVersion Condition="'$(IsIOSTarget)' == 'true'">15.0</SupportedOSPlatformVersion>
+		<SupportedOSPlatformVersion Condition="'$(IsMacCatalystTarget)' == 'true'">13.1</SupportedOSPlatformVersion>
+		<SupportedOSPlatformVersion Condition="'$(IsAndroidTarget)' == 'true'">30.0</SupportedOSPlatformVersion>
+		<SupportedOSPlatformVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</SupportedOSPlatformVersion>
+		<TargetPlatformMinVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</TargetPlatformMinVersion>
 
-        <SignAssembly>true</SignAssembly>
-        <AssemblyOriginatorKeyFile>..\..\OnnxRuntime.snk</AssemblyOriginatorKeyFile>
-    </PropertyGroup>
+		<SignAssembly>true</SignAssembly>
+		<AssemblyOriginatorKeyFile>..\..\OnnxRuntime.snk</AssemblyOriginatorKeyFile>
+	</PropertyGroup>
 
-    <ItemGroup>
-        <!-- App Icon -->
-        <MauiIcon Include="Resources\AppIcon\appicon.svg" ForegroundFile="Resources\AppIcon\appiconfg.svg" Color="#512BD4" />
+	<ItemGroup>
+		<!-- App Icon -->
+		<MauiIcon Include="Resources\AppIcon\appicon.svg" ForegroundFile="Resources\AppIcon\appiconfg.svg" Color="#512BD4" />
 
-        <!-- Splash Screen -->
-        <MauiSplashScreen Include="Resources\Splash\splash.svg" Color="#512BD4" BaseSize="128,128" />
+		<!-- Splash Screen -->
+		<MauiSplashScreen Include="Resources\Splash\splash.svg" Color="#512BD4" BaseSize="128,128" />
 
-        <!-- Images -->
-        <MauiImage Include="Resources\Images\*" />
-        <MauiImage Update="Resources\Images\dotnet_bot.png" Resize="True" BaseSize="300,185" />
+		<!-- Images -->
+		<MauiImage Include="Resources\Images\*" />
+		<MauiImage Update="Resources\Images\dotnet_bot.png" Resize="True" BaseSize="300,185" />
 
-        <!-- Custom Fonts -->
-        <MauiFont Include="Resources\Fonts\*" />
+		<!-- Custom Fonts -->
+		<MauiFont Include="Resources\Fonts\*" />
 
-        <!-- Raw Assets (also remove the "Resources\Raw" prefix) -->
-        <MauiAsset Include="Resources\Raw\**" LogicalName="%(RecursiveDir)%(Filename)%(Extension)" />
-    </ItemGroup>
+		<!-- Raw Assets (also remove the "Resources\Raw" prefix) -->
+		<MauiAsset Include="Resources\Raw\**" LogicalName="%(RecursiveDir)%(Filename)%(Extension)" />
+	</ItemGroup>
 
-    <!-- NOTE: The xUnit framework doesn't pickup the tests defined within the referenced
+	<!-- NOTE: The xUnit framework doesn't pickup the tests defined within the referenced
     Microsoft.ML.OnnxRuntime.Tests.Common project -->
-    <ItemGroup>
-        <Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\InferenceTest.cs">
-            <Link>InferenceTest.cs</Link>
-        </Compile>
-        <Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\OrtIoBindingAllocationTest.cs">
-            <Link>OrtIoBindingAllocationTest.cs</Link>
-        </Compile>
-        <Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\Tensors\TensorTests.cs">
-            <Link>TensorTests.cs</Link>
-        </Compile>
-    </ItemGroup>
+	<ItemGroup>
+		<Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\InferenceTest.cs">
+			<Link>InferenceTest.cs</Link>
+		</Compile>
+		<Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\OrtIoBindingAllocationTest.cs">
+			<Link>OrtIoBindingAllocationTest.cs</Link>
+		</Compile>
+		<Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\Tensors\TensorTests.cs">
+			<Link>TensorTests.cs</Link>
+		</Compile>
+	</ItemGroup>
 
-    <ItemGroup>
-        <ProjectReference
+	<ItemGroup>
+		<ProjectReference
             Include="..\..\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj"
             name="Microsoft.ML.OnnxRuntime" />
-        <ProjectReference
+		<ProjectReference
             Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\Microsoft.ML.OnnxRuntime.Tests.Common.csproj"
             name="Microsoft.ML.OnnxRuntime.Tests.Common" />
-        <ProjectReference
+		<ProjectReference
             Include="..\Microsoft.ML.OnnxRuntime.Tests.Devices\Microsoft.ML.OnnxRuntime.Tests.Devices.csproj"
             name="Microsoft.ML.OnnxRuntime.Tests.Devices" />
-    </ItemGroup>
+	</ItemGroup>
 
-    <ItemGroup>
-        <PackageReference Include="DeviceRunners.VisualRunners.Maui" Version="0.1.0-preview.2" />
-        <PackageReference Include="DeviceRunners.VisualRunners.Xunit" Version="0.1.0-preview.2" />
-        <PackageReference Include="DeviceRunners.XHarness.Maui" Version="0.1.0-preview.2" />
-        <PackageReference Include="DeviceRunners.XHarness.Xunit" Version="0.1.0-preview.2" />
-        <PackageReference Include="Microsoft.DotNet.XHarness.TestRunners.Xunit" Version="9.0.0-prerelease.24374.1" />
-        <PackageReference Include="Microsoft.Maui.Controls" Version="8.0.70" />
-        <PackageReference Include="Microsoft.Maui.Controls.Compatibility" Version="8.0.70" />
-        <PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
-        <PackageReference Include="xunit" Version="2.9.0" />
-        <PackageReference Include="xunit.runner.utility" Version="2.9.0" />
-    </ItemGroup>
+	<ItemGroup>
+		<PackageReference Include="DeviceRunners.VisualRunners.Maui" Version="0.1.0-preview.2" />
+		<PackageReference Include="DeviceRunners.VisualRunners.Xunit" Version="0.1.0-preview.2" />
+		<PackageReference Include="DeviceRunners.XHarness.Maui" Version="0.1.0-preview.2" />
+		<PackageReference Include="DeviceRunners.XHarness.Xunit" Version="0.1.0-preview.2" />
+		<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
+		<PackageReference Include="Microsoft.DotNet.XHarness.TestRunners.Xunit" Version="9.0.0-prerelease.24374.1" />
+		<PackageReference Include="Microsoft.Maui.Controls" Version="8.0.70" />
+		<PackageReference Include="Microsoft.Maui.Controls.Compatibility" Version="8.0.70" />
+		<PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
+		<PackageReference Include="xunit" Version="2.9.0" />
+		<PackageReference Include="xunit.runner.utility" Version="2.9.0" />
+	</ItemGroup>
 
-    <ItemGroup Condition="$(IsIOSTarget)=='true' OR $(IsMacCatalystTarget)=='true'">
-        <!-- need the dummy ORT Extensions package to resolve the RegisterCustomOps symbol. -->
-        <PackageReference Include="Microsoft.ML.OnnxRuntime.Extensions.Dummy" Version="0.12.0" />
-    </ItemGroup>
+	<ItemGroup Condition="$(IsIOSTarget)=='true' OR $(IsMacCatalystTarget)=='true'">
+		<!-- need the dummy ORT Extensions package to resolve the RegisterCustomOps symbol. -->
+		<PackageReference Include="Microsoft.ML.OnnxRuntime.Extensions.Dummy" Version="0.12.0" />
+	</ItemGroup>
 
-    <Target Name="RemoveVisualStudioTestRunner" BeforeTargets="_ComputeAppxPackagePayload">
-        <ItemGroup>
-            <_VisualStudioTestRunnerFiles
+	<Target Name="RemoveVisualStudioTestRunner" BeforeTargets="_ComputeAppxPackagePayload">
+		<ItemGroup>
+			<_VisualStudioTestRunnerFiles
                 Include="@(PackagingOutputs)"
                 Condition="$([System.String]::Copy('%(PackagingOutputs.FullPath)').Contains('xunit.runner.visualstudio'))" />
-            <PackagingOutputs Remove="@(_VisualStudioTestRunnerFiles)" />
-        </ItemGroup>
-    </Target>
+			<PackagingOutputs Remove="@(_VisualStudioTestRunnerFiles)" />
+		</ItemGroup>
+	</Target>
+
+	<PropertyGroup Condition="'$(IsAndroidTarget)' !='true'">
+		<GenerateProgramFile>false</GenerateProgramFile> <!-- NOTE(review): presumably suppresses the entry point auto-generated by the Microsoft.NET.Test.Sdk reference added above; confirm, and confirm why Android is excluded. -->
+		<DefaultLanguage>en</DefaultLanguage>
+	</PropertyGroup>
+
 </Project>

From 3dcc90119b3836a549d9745ef5a62dca282f226c Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Wed, 22 Jan 2025 11:48:38 -0800
Subject: [PATCH 10/37] Update the compile flags for vcpkg packages (#23455)

### Description

This PR updates the triplets files that manage the compile flags for
vcpkg packages.
All the changes are autogenerated except for the gen.py file in this PR.

Main changes:
1. Enable debug info for all Linux build configs (Release and Debug)
2. Set CMAKE_CXX_STANDARD in each triplet. The value is set to 20 for
macOS targets and 17 for the others.
3. Only set _FORTIFY_SOURCE in release build. This is to address a build
issue on some platforms with the following glibc change:
"Warn if user requests __FORTIFY_SOURCE but it is disabled"

https://sourceware.org/git/?p=glibc.git;a=commit;f=include/features.h;h=05c2c9618f583ea4acd69b3fe5ae2a2922dd2ddc


### Motivation and Context
Address a Linux build error.
---
 cmake/vcpkg-triplets/asan/arm64-linux.cmake   |  9 +++--
 cmake/vcpkg-triplets/asan/arm64-osx.cmake     |  9 +++--
 .../asan/arm64-windows-static-md.cmake        |  2 +-
 .../asan/arm64-windows-static.cmake           |  2 +-
 .../asan/arm64ec-windows-static-md.cmake      |  2 +-
 .../asan/arm64ec-windows-static.cmake         |  2 +-
 .../vcpkg-triplets/asan/universal2-osx.cmake  |  9 +++--
 cmake/vcpkg-triplets/asan/x64-linux.cmake     |  9 +++--
 cmake/vcpkg-triplets/asan/x64-osx.cmake       |  9 +++--
 .../asan/x64-windows-static-md.cmake          |  2 +-
 .../asan/x64-windows-static.cmake             |  2 +-
 .../asan/x86-windows-static-md.cmake          |  2 +-
 .../asan/x86-windows-static.cmake             |  2 +-
 .../asan_nortti/arm64-linux.cmake             |  9 +++--
 .../asan_nortti/arm64-osx.cmake               |  9 +++--
 .../asan_nortti/arm64-windows-static-md.cmake |  2 +-
 .../asan_nortti/arm64-windows-static.cmake    |  2 +-
 .../arm64ec-windows-static-md.cmake           |  2 +-
 .../asan_nortti/arm64ec-windows-static.cmake  |  2 +-
 .../asan_nortti/universal2-osx.cmake          |  9 +++--
 .../asan_nortti/x64-linux.cmake               |  9 +++--
 .../vcpkg-triplets/asan_nortti/x64-osx.cmake  |  9 +++--
 .../asan_nortti/x64-windows-static-md.cmake   |  2 +-
 .../asan_nortti/x64-windows-static.cmake      |  2 +-
 .../asan_nortti/x86-windows-static-md.cmake   |  2 +-
 .../asan_nortti/x86-windows-static.cmake      |  2 +-
 .../vcpkg-triplets/binskim/arm64-linux.cmake  |  9 +++--
 cmake/vcpkg-triplets/binskim/arm64-osx.cmake  |  8 +++--
 .../binskim/arm64-windows-static-md.cmake     |  2 +-
 .../binskim/arm64-windows-static.cmake        |  2 +-
 .../binskim/arm64ec-windows-static-md.cmake   |  2 +-
 .../binskim/arm64ec-windows-static.cmake      |  2 +-
 .../binskim/universal2-osx.cmake              |  8 +++--
 cmake/vcpkg-triplets/binskim/x64-linux.cmake  |  9 +++--
 cmake/vcpkg-triplets/binskim/x64-osx.cmake    |  8 +++--
 .../binskim/x64-windows-static-md.cmake       |  2 +-
 .../binskim/x64-windows-static.cmake          |  2 +-
 .../binskim/x86-windows-static-md.cmake       |  2 +-
 .../binskim/x86-windows-static.cmake          |  2 +-
 .../binskim_nortti/arm64-linux.cmake          |  9 +++--
 .../binskim_nortti/arm64-osx.cmake            |  8 +++--
 .../arm64-windows-static-md.cmake             |  2 +-
 .../binskim_nortti/arm64-windows-static.cmake |  2 +-
 .../arm64ec-windows-static-md.cmake           |  2 +-
 .../arm64ec-windows-static.cmake              |  2 +-
 .../binskim_nortti/universal2-osx.cmake       |  8 +++--
 .../binskim_nortti/x64-linux.cmake            |  9 +++--
 .../binskim_nortti/x64-osx.cmake              |  8 +++--
 .../x64-windows-static-md.cmake               |  2 +-
 .../binskim_nortti/x64-windows-static.cmake   |  2 +-
 .../x86-windows-static-md.cmake               |  2 +-
 .../binskim_nortti/x86-windows-static.cmake   |  2 +-
 .../vcpkg-triplets/default/arm64-linux.cmake  |  8 +++--
 cmake/vcpkg-triplets/default/arm64-osx.cmake  |  8 +++--
 .../default/arm64-windows-static-md.cmake     |  2 +-
 .../default/arm64-windows-static.cmake        |  2 +-
 .../default/arm64ec-windows-static-md.cmake   |  2 +-
 .../default/arm64ec-windows-static.cmake      |  2 +-
 .../default/universal2-osx.cmake              |  8 +++--
 cmake/vcpkg-triplets/default/x64-linux.cmake  |  8 +++--
 cmake/vcpkg-triplets/default/x64-osx.cmake    |  8 +++--
 .../default/x64-windows-static-md.cmake       |  2 +-
 .../default/x64-windows-static.cmake          |  2 +-
 .../default/x86-windows-static-md.cmake       |  2 +-
 .../default/x86-windows-static.cmake          |  2 +-
 cmake/vcpkg-triplets/gen.py                   | 36 ++++++++++++++-----
 cmake/vcpkg-triplets/nortti/arm64-linux.cmake |  8 +++--
 cmake/vcpkg-triplets/nortti/arm64-osx.cmake   |  8 +++--
 .../nortti/arm64-windows-static-md.cmake      |  2 +-
 .../nortti/arm64-windows-static.cmake         |  2 +-
 .../nortti/arm64ec-windows-static-md.cmake    |  2 +-
 .../nortti/arm64ec-windows-static.cmake       |  2 +-
 .../nortti/universal2-osx.cmake               |  8 +++--
 cmake/vcpkg-triplets/nortti/x64-linux.cmake   |  8 +++--
 cmake/vcpkg-triplets/nortti/x64-osx.cmake     |  8 +++--
 .../nortti/x64-windows-static-md.cmake        |  2 +-
 .../nortti/x64-windows-static.cmake           |  2 +-
 .../nortti/x86-windows-static-md.cmake        |  2 +-
 .../nortti/x86-windows-static.cmake           |  2 +-
 79 files changed, 255 insertions(+), 131 deletions(-)

diff --git a/cmake/vcpkg-triplets/asan/arm64-linux.cmake b/cmake/vcpkg-triplets/asan/arm64-linux.cmake
index 6875a03064..9f5c9997da 100644
--- a/cmake/vcpkg-triplets/asan/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64-osx.cmake b/cmake/vcpkg-triplets/asan/arm64-osx.cmake
index 4ac6bd8097..ba56684949 100644
--- a/cmake/vcpkg-triplets/asan/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64-osx.cmake
@@ -3,13 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake
index c03c9e718f..79e10ad9e4 100644
--- a/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake b/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake
index 184001d423..d0a3305b1f 100644
--- a/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
index 36176fe040..05a9718835 100644
--- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
index aa086c1220..e0f4b2e1e4 100644
--- a/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/universal2-osx.cmake b/cmake/vcpkg-triplets/asan/universal2-osx.cmake
index de2c8cee48..d74494d578 100644
--- a/cmake/vcpkg-triplets/asan/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/universal2-osx.cmake
@@ -3,13 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-linux.cmake b/cmake/vcpkg-triplets/asan/x64-linux.cmake
index dd1d066eb3..64ba6b2216 100644
--- a/cmake/vcpkg-triplets/asan/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-osx.cmake b/cmake/vcpkg-triplets/asan/x64-osx.cmake
index 5f1442c1d5..bbcaff4c39 100644
--- a/cmake/vcpkg-triplets/asan/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-osx.cmake
@@ -3,13 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address")
-set(VCPKG_CXX_FLAGS "-fsanitize=address")
+set(VCPKG_C_FLAGS "-g -fsanitize=address")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
index 27f7a0190a..c0edb9ca31 100644
--- a/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x64-windows-static.cmake b/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
index 23b8082fbd..3370987c55 100644
--- a/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
index cb9c639049..429a4ac7ce 100644
--- a/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan/x86-windows-static.cmake b/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
index 0667f5f0ea..404cb3fbd0 100644
--- a/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-linux.cmake
index 77f35ebada..3d78741ebc 100644
--- a/cmake/vcpkg-triplets/asan_nortti/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/arm64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-osx.cmake
index 5cc70905e6..b25f8f8ebb 100644
--- a/cmake/vcpkg-triplets/asan_nortti/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/arm64-osx.cmake
@@ -3,13 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake
index cb0957791f..c4ba82b7ca 100644
--- a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake
index 2d38883062..3b028c4e40 100644
--- a/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake
index 4cc7102bf3..d2d4bda334 100644
--- a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake
index d84533c8de..8e986eb139 100644
--- a/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
index cacbfa7516..6181e6d1c1 100644
--- a/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/universal2-osx.cmake
@@ -3,13 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
index b53e668a64..d7103ff250 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
index 9f4adb513e..191dfb3d35 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-osx.cmake
@@ -3,13 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -fsanitize=address -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-fsanitize=address")
+set(VCPKG_LINKER_FLAGS "-fsanitize=address -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
index 2812ed9419..ae3f00b851 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
index ccdb919b3e..d64f20d3ce 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
index 7a6b45666a..24ddfa43c0 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
index 96b2a2ad74..53fcb44313 100644
--- a/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/asan_nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /fsanitize=address /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-linux.cmake b/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
index 4b738553e0..8a3cf645d7 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-osx.cmake b/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
index 4b6999874b..9892a3eac8 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
index 89dfae4bcb..3818356b5c 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake b/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
index 28ef65c4d1..ab38e9f9a9 100644
--- a/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
index 0c087aa1b5..6937aea847 100644
--- a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
index 8c7de3b8a9..84c0531033 100644
--- a/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
index 60826f1ede..da4c6abb39 100644
--- a/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/universal2-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-linux.cmake b/cmake/vcpkg-triplets/binskim/x64-linux.cmake
index 8d7aeb2342..e3d4d34326 100644
--- a/cmake/vcpkg-triplets/binskim/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-osx.cmake b/cmake/vcpkg-triplets/binskim/x64-osx.cmake
index e391ab9eae..426a35e33f 100644
--- a/cmake/vcpkg-triplets/binskim/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
index ef67223cd0..0f600d7931 100644
--- a/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake b/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
index 62948a156c..17d41775c9 100644
--- a/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
index 8ac022c7ee..cb981c264a 100644
--- a/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake b/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
index 8fd2d29dc3..53342263d5 100644
--- a/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
index c9787f460b..203c85fa3a 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
index f5866d6863..c57a2401e4 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
index 927b110c98..9963cfb66f 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
index b0419c9a0d..0f4948ff07 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake
index aa8b7a5f0e..6a5c8b9f10 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake
index 96da5d9b13..668d4fb4dc 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
index f4ef6f0c65..1956daf30e 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/universal2-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
index 8fe977fb86..da17e00739 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-linux.cmake
@@ -3,12 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
-set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack")
+set(VCPKG_LINKER_FLAGS "-Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
index 196018d7cf..c74e60bc7c 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3 -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
index 38b5cbdde2..6491d31ae4 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
index bea970b669..011999df2a 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
index e75d0c645c..bf843c3e95 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
index 6de6f80d97..21e0858066 100644
--- a/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/binskim_nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /guard:cf /Qspectre /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 set(VCPKG_LINKER_FLAGS "/profile /DYNAMICBASE")
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
diff --git a/cmake/vcpkg-triplets/default/arm64-linux.cmake b/cmake/vcpkg-triplets/default/arm64-linux.cmake
index 581367931b..120865a5b0 100644
--- a/cmake/vcpkg-triplets/default/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-linux.cmake
@@ -3,11 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-osx.cmake b/cmake/vcpkg-triplets/default/arm64-osx.cmake
index 4d74306ba4..02e8a34304 100644
--- a/cmake/vcpkg-triplets/default/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
index 135dc6ed6f..6d5cf67665 100644
--- a/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64-windows-static.cmake b/cmake/vcpkg-triplets/default/arm64-windows-static.cmake
index 56e1aebfe6..19ca6f16cd 100644
--- a/cmake/vcpkg-triplets/default/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/default/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake
index 9256f07f54..d7982158f3 100644
--- a/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/default/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake
index bbdfed06fb..fb14ad71c1 100644
--- a/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/default/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/universal2-osx.cmake b/cmake/vcpkg-triplets/default/universal2-osx.cmake
index 64b19451dd..57386c423c 100644
--- a/cmake/vcpkg-triplets/default/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/default/universal2-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/x64-linux.cmake b/cmake/vcpkg-triplets/default/x64-linux.cmake
index 57114dd5fc..30c7b1b786 100644
--- a/cmake/vcpkg-triplets/default/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/default/x64-linux.cmake
@@ -3,11 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/x64-osx.cmake b/cmake/vcpkg-triplets/default/x64-osx.cmake
index dd50e62267..7af622e135 100644
--- a/cmake/vcpkg-triplets/default/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/default/x64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "")
-set(VCPKG_CXX_FLAGS "")
+set(VCPKG_C_FLAGS "-g")
+set(VCPKG_CXX_FLAGS "-g")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake
index 5339a03371..bec5f2724d 100644
--- a/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/default/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/x64-windows-static.cmake b/cmake/vcpkg-triplets/default/x64-windows-static.cmake
index 579740efb6..3f62418071 100644
--- a/cmake/vcpkg-triplets/default/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/default/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake
index 34223c67e8..d93d87b328 100644
--- a/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/default/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/default/x86-windows-static.cmake b/cmake/vcpkg-triplets/default/x86-windows-static.cmake
index fc95d409f8..727b35cd1f 100644
--- a/cmake/vcpkg-triplets/default/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/default/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/gen.py b/cmake/vcpkg-triplets/gen.py
index 615ca66fc8..bec1a87a0a 100644
--- a/cmake/vcpkg-triplets/gen.py
+++ b/cmake/vcpkg-triplets/gen.py
@@ -88,9 +88,11 @@ for enable_rtti in [True, False]:
                             # Disable RTTI and turn usage of dynamic_cast and typeid into errors
                             cxxflags += ["/GR-", "/we4541"]
                         # TODO: should it be a cmake list separated by semicolons?
-                        f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags)))
-                        f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags)))
-                        f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)\n")
+                        if len(cflags) >= 1:
+                            f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags)))
+                        if len(cxxflags) >= 1:
+                            f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags)))
+                        f.write("list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)\n")
                         if ldflags:
                             f.write('set(VCPKG_LINKER_FLAGS "{}")\n'.format(" ".join(ldflags)))
                         add_port_configs(f)
@@ -135,27 +137,34 @@ for os_name in ["linux", "osx"]:
                                 f.write(f"set(VCPKG_TARGET_ARCHITECTURE {target_abi})\n")
                             f.write(f"set(VCPKG_CRT_LINKAGE {crt_linkage})\n")
                             f.write("set(VCPKG_LIBRARY_LINKAGE static)\n")
+
+
                             if enable_binskim and os_name == "linux":
                                 ldflags = [
                                     "-Wl,-Bsymbolic-functions",
                                     "-Wl,-z,relro",
                                     "-Wl,-z,now",
-                                    "-Wl,-z,noexecstack",
+                                    "-Wl,-z,noexecstack"
                                 ]
                             else:
                                 ldflags = []
-                            cflags = []
+                            # Enable debug info for all build configs
+                            cflags = ["-g"]
+                            cflags_release = ["-DNDEBUG", "-O3"]
                             if enable_binskim:
-                                cflags += [
+                                # A warning may be generated from include/features.h if the _FORTIFY_SOURCE flag was used in a debug build
+                                cflags_release += [
                                     "-Wp,-D_FORTIFY_SOURCE=2",
                                     "-Wp,-D_GLIBCXX_ASSERTIONS",
                                     "-fstack-protector-strong",
                                 ]
                                 if target_abi == "x64":
-                                    cflags += ["-fstack-clash-protection", "-fcf-protection"]
+                                    cflags_release += ["-fstack-clash-protection", "-fcf-protection"]
                             elif enable_asan:
                                 cflags += ["-fsanitize=address"]
                                 ldflags += ["-fsanitize=address"]
+                            # Enable debug info for all build configs
+                            ldflags.append('-g')
                             # Avoid unboundTypeError for WebNN EP since unbound type names are illegal with RTTI disabled
                             # in Embind API, relevant issue: https://github.com/emscripten-core/emscripten/issues/7001
                             if not enable_rtti:
@@ -163,8 +172,13 @@ for os_name in ["linux", "osx"]:
                             cxxflags = cflags.copy()
                             if not enable_rtti:
                                 cxxflags.append("-fno-rtti")
-                            f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags)))
-                            f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags)))
+                            if len(cflags) >= 1:
+                                f.write('set(VCPKG_C_FLAGS "{}")\n'.format(" ".join(cflags)))
+                            if len(cxxflags) >= 1:
+                                f.write('set(VCPKG_CXX_FLAGS "{}")\n'.format(" ".join(cxxflags)))
+                            if len(cflags_release) >= 1:
+                                f.write('set(VCPKG_C_FLAGS_RELEASE "{}")\n'.format(" ".join(cflags_release)))
+                                f.write('set(VCPKG_CXX_FLAGS_RELEASE "{}")\n'.format(" ".join(cflags_release)))
                             if os_name == "linux":
                                 f.write("set(VCPKG_CMAKE_SYSTEM_NAME Linux)\n")
                             else:
@@ -184,4 +198,8 @@ for os_name in ["linux", "osx"]:
 
                             if ldflags:
                                 f.write('set(VCPKG_LINKER_FLAGS "{}")\n'.format(" ".join(ldflags)))
+                            if os_name == 'osx':
+                                f.write('list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)\n')
+                            else:
+                                f.write('list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)\n')
                             add_port_configs(f)
diff --git a/cmake/vcpkg-triplets/nortti/arm64-linux.cmake b/cmake/vcpkg-triplets/nortti/arm64-linux.cmake
index 4bd974a112..f9035fc299 100644
--- a/cmake/vcpkg-triplets/nortti/arm64-linux.cmake
+++ b/cmake/vcpkg-triplets/nortti/arm64-linux.cmake
@@ -3,11 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/arm64-osx.cmake b/cmake/vcpkg-triplets/nortti/arm64-osx.cmake
index fd8be60b71..d8971e8122 100644
--- a/cmake/vcpkg-triplets/nortti/arm64-osx.cmake
+++ b/cmake/vcpkg-triplets/nortti/arm64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE arm64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake
index 45b24baa2c..9d3c86ce64 100644
--- a/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/nortti/arm64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake b/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake
index 947fe9b61b..238f7405ec 100644
--- a/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/nortti/arm64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake
index ea5741fa42..da314824ca 100644
--- a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake
index 2b354ba511..0c7fb60401 100644
--- a/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake
+++ b/cmake/vcpkg-triplets/nortti/arm64ec-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/universal2-osx.cmake b/cmake/vcpkg-triplets/nortti/universal2-osx.cmake
index 7111bb87c9..febc002c04 100644
--- a/cmake/vcpkg-triplets/nortti/universal2-osx.cmake
+++ b/cmake/vcpkg-triplets/nortti/universal2-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64;arm64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/x64-linux.cmake b/cmake/vcpkg-triplets/nortti/x64-linux.cmake
index 34fcc968e6..c1dac19d33 100644
--- a/cmake/vcpkg-triplets/nortti/x64-linux.cmake
+++ b/cmake/vcpkg-triplets/nortti/x64-linux.cmake
@@ -3,11 +3,15 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/x64-osx.cmake b/cmake/vcpkg-triplets/nortti/x64-osx.cmake
index 0a7fcc08e0..242d34a358 100644
--- a/cmake/vcpkg-triplets/nortti/x64-osx.cmake
+++ b/cmake/vcpkg-triplets/nortti/x64-osx.cmake
@@ -3,12 +3,16 @@
 set(VCPKG_TARGET_ARCHITECTURE x64)
 set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
-set(VCPKG_C_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
-set(VCPKG_CXX_FLAGS "-DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0")
+set(VCPKG_CXX_FLAGS "-g -DEMSCRIPTEN_HAS_UNBOUND_TYPE_NAMES=0 -fno-rtti")
+set(VCPKG_C_FLAGS_RELEASE "-DNDEBUG -O3")
+set(VCPKG_CXX_FLAGS_RELEASE "-DNDEBUG -O3")
 set(VCPKG_CMAKE_SYSTEM_NAME Darwin)
 set(VCPKG_OSX_ARCHITECTURES "x86_64")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DBENCHMARK_ENABLE_WERROR=OFF)
+set(VCPKG_LINKER_FLAGS "-g")
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS -DCMAKE_CXX_STANDARD=20)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake
index 5c62c4263f..a8d2441583 100644
--- a/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/nortti/x64-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake b/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake
index deceefcd95..688ed230fd 100644
--- a/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake
+++ b/cmake/vcpkg-triplets/nortti/x64-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake b/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake
index cb1b3cd887..1d3de9c142 100644
--- a/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake
+++ b/cmake/vcpkg-triplets/nortti/x86-windows-static-md.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE dynamic)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"
diff --git a/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake b/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake
index 0320217450..3a856c2679 100644
--- a/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake
+++ b/cmake/vcpkg-triplets/nortti/x86-windows-static.cmake
@@ -5,7 +5,7 @@ set(VCPKG_CRT_LINKAGE static)
 set(VCPKG_LIBRARY_LINKAGE static)
 set(VCPKG_C_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000")
 set(VCPKG_CXX_FLAGS "/MP /DWIN32 /D_WINDOWS /DWINAPI_FAMILY=100 /DWINVER=0x0A00 /D_WIN32_WINNT=0x0A00 /DNTDDI_VERSION=0x0A000000 /Zc:__cplusplus /GR- /we4541")
-list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error)
+list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS --compile-no-warning-as-error -DCMAKE_CXX_STANDARD=17)
 if(PORT MATCHES "onnx")
     list(APPEND VCPKG_CMAKE_CONFIGURE_OPTIONS
         "-DONNX_DISABLE_STATIC_REGISTRATION=ON"

From 77adf4b04025adf13bb0e0cff530a3fd28fbd813 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Wed, 22 Jan 2025 11:49:16 -0800
Subject: [PATCH 11/37] Add custom vcpkg ports (#23456)

### Description
Add custom vcpkg ports for the following packages:
1. cpuinfo
2. onnx
3. pthreadpool
4. xnnpack

Because:

- The cpuinfo/pthreadpool/xnnpack packages in the official vcpkg repo
are too old.
   - XNNPACK's version is updated from 2022-12-22 to 2025-01-17
   - cpuinfo's version is updated from 2022-07-19 to 2024-12-09
   - pthreadpool's version is updated from 2020-04-10 to 2024-12-17,
     and the source code location is changed from
     https://github.com/Maratyszcza/pthreadpool to
     https://github.com/google/pthreadpool
- The onnx package in the official repo requires building Python from
source, which in turn requires a lot of additional dependencies to be
installed. The custom port in this PR removes those dependencies.
- Added a disable_gcc_warning.patch file for xnnpack for addressing the
issue reported in https://github.com/google/XNNPACK/issues/7650. I will
remove this patch when the issue is fully addressed.
- Added "-DONNX_DISABLE_STATIC_REGISTRATION=ON" to ONNX's config
options.
---
 cmake/vcpkg-ports/cpuinfo/portfile.cmake      | 63 ++++++++++++++
 cmake/vcpkg-ports/cpuinfo/vcpkg.json          | 25 ++++++
 cmake/vcpkg-ports/onnx/fix-cmakelists.patch   | 67 +++++++++++++++
 .../onnx/fix-dependency-protobuf.patch        | 28 +++++++
 cmake/vcpkg-ports/onnx/portfile.cmake         | 83 +++++++++++++++++++
 cmake/vcpkg-ports/onnx/vcpkg.json             | 23 +++++
 .../pthreadpool/fix-cmakelists.patch          | 82 ++++++++++++++++++
 cmake/vcpkg-ports/pthreadpool/portfile.cmake  | 25 ++++++
 cmake/vcpkg-ports/pthreadpool/vcpkg.json      | 17 ++++
 .../xnnpack/disable_gcc_warning.patch         | 12 +++
 cmake/vcpkg-ports/xnnpack/fix-build.patch     | 71 ++++++++++++++++
 cmake/vcpkg-ports/xnnpack/portfile.cmake      | 39 +++++++++
 cmake/vcpkg-ports/xnnpack/vcpkg.json          | 17 ++++
 13 files changed, 552 insertions(+)
 create mode 100644 cmake/vcpkg-ports/cpuinfo/portfile.cmake
 create mode 100644 cmake/vcpkg-ports/cpuinfo/vcpkg.json
 create mode 100644 cmake/vcpkg-ports/onnx/fix-cmakelists.patch
 create mode 100644 cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch
 create mode 100644 cmake/vcpkg-ports/onnx/portfile.cmake
 create mode 100644 cmake/vcpkg-ports/onnx/vcpkg.json
 create mode 100644 cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch
 create mode 100644 cmake/vcpkg-ports/pthreadpool/portfile.cmake
 create mode 100644 cmake/vcpkg-ports/pthreadpool/vcpkg.json
 create mode 100644 cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch
 create mode 100644 cmake/vcpkg-ports/xnnpack/fix-build.patch
 create mode 100644 cmake/vcpkg-ports/xnnpack/portfile.cmake
 create mode 100644 cmake/vcpkg-ports/xnnpack/vcpkg.json

diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake b/cmake/vcpkg-ports/cpuinfo/portfile.cmake
new file mode 100644
index 0000000000..e61308bf64
--- /dev/null
+++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake
@@ -0,0 +1,63 @@
+# On Windows, we can get a cpuinfo.dll, but it exports no symbols.
+if(VCPKG_TARGET_IS_WINDOWS)
+    vcpkg_check_linkage(ONLY_STATIC_LIBRARY)
+endif()
+
+vcpkg_from_github(
+    OUT_SOURCE_PATH SOURCE_PATH
+    REPO pytorch/cpuinfo
+    REF 8a1772a0c5c447df2d18edf33ec4603a8c9c04a6
+    SHA512 b94ccbfa886221d6bb16513d074675af0a72928a9dd9485dcacdc1124a8a60aacbbe91913a1579e766dfb024f0be1d52eeead40342004ff0238a8b94a095ed08
+    HEAD_REF master
+)
+
+vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
+    FEATURES
+        tools CPUINFO_BUILD_TOOLS
+)
+
+set(LINK_OPTIONS "")
+if(VCPKG_LIBRARY_LINKAGE STREQUAL "dynamic")
+    list(APPEND LINK_OPTIONS -DCPUINFO_LIBRARY_TYPE=shared)
+else()
+    list(APPEND LINK_OPTIONS -DCPUINFO_LIBRARY_TYPE=static)
+endif()
+
+if(VCPKG_CRT_LINKAGE STREQUAL "dynamic")
+    list(APPEND LINK_OPTIONS -DCPUINFO_RUNTIME_TYPE=shared)
+else()
+    list(APPEND LINK_OPTIONS -DCPUINFO_RUNTIME_TYPE=static)
+endif()
+
+vcpkg_cmake_configure(
+    SOURCE_PATH "${SOURCE_PATH}"
+    OPTIONS
+        ${FEATURE_OPTIONS}
+        ${LINK_OPTIONS}
+        -DCPUINFO_BUILD_UNIT_TESTS=OFF
+        -DCPUINFO_BUILD_MOCK_TESTS=OFF
+        -DCPUINFO_BUILD_BENCHMARKS=OFF
+    OPTIONS_DEBUG
+        -DCPUINFO_LOG_LEVEL=debug
+    OPTIONS_RELEASE
+        -DCPUINFO_LOG_LEVEL=default
+)
+vcpkg_cmake_install()
+vcpkg_cmake_config_fixup()
+vcpkg_copy_pdbs()
+vcpkg_fixup_pkgconfig() # pkg_check_modules(libcpuinfo)
+
+file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include")
+
+if("tools" IN_LIST FEATURES)
+    set(additional_tools "")
+    if(EXISTS "${CURRENT_PACKAGES_DIR}/bin/cpuid-dump${VCPKG_TARGET_EXECUTABLE_SUFFIX}")
+        list(APPEND additional_tools "cpuid-dump")
+    endif()
+    vcpkg_copy_tools(
+        TOOL_NAMES cache-info cpu-info isa-info ${additional_tools}
+        AUTO_CLEAN
+    )
+endif()
+
+file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright)
diff --git a/cmake/vcpkg-ports/cpuinfo/vcpkg.json b/cmake/vcpkg-ports/cpuinfo/vcpkg.json
new file mode 100644
index 0000000000..ce93591dba
--- /dev/null
+++ b/cmake/vcpkg-ports/cpuinfo/vcpkg.json
@@ -0,0 +1,25 @@
+{
+  "name": "cpuinfo",
+  "version-date": "2024-12-09",
+  "port-version": 3,
+  "description": "CPU INFOrmation library (x86/x86-64/ARM/ARM64, Linux/Windows/Android/macOS/iOS)",
+  "homepage": "https://github.com/pytorch/cpuinfo",
+  "license": "BSD-2-Clause",
+  "supports": "!(uwp & arm32)",
+  "dependencies": [
+    {
+      "name": "vcpkg-cmake",
+      "host": true
+    },
+    {
+      "name": "vcpkg-cmake-config",
+      "host": true
+    }
+  ],
+  "features": {
+    "tools": {
+      "description": "Build cpuinfo command-line tools",
+      "supports": "!uwp"
+    }
+  }
+}
diff --git a/cmake/vcpkg-ports/onnx/fix-cmakelists.patch b/cmake/vcpkg-ports/onnx/fix-cmakelists.patch
new file mode 100644
index 0000000000..f8d300103a
--- /dev/null
+++ b/cmake/vcpkg-ports/onnx/fix-cmakelists.patch
@@ -0,0 +1,67 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4dd56b6..2ff3e29 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -65,6 +65,27 @@ endif()
+ 
+ include(GNUInstallDirs)
+ 
++# install protobuf files
++install(FILES ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-data.proto
++              ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-data.proto3
++              ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-ml.proto
++              ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-ml.proto3
++              ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-operators-ml.proto
++              ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx-operators-ml.proto3
++        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnx
++)
++# install python files
++if(BUILD_ONNX_PYTHON)
++  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_data_pb.py
++                ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_data_pb2.py
++                ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_ml_pb2.py
++                ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_operators_ml_pb2.py
++                ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_operators_pb.py
++                ${CMAKE_CURRENT_BINARY_DIR}/onnx/onnx_pb.py
++          DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnx
++  )
++endif()
++
+ set(ONNX_ROOT ${PROJECT_SOURCE_DIR})
+ 
+ # Read ONNX version
+@@ -116,7 +137,8 @@ endif()
+ # find_package Python has replaced PythonInterp and PythonLibs since cmake 3.12
+ # Use the following command in the future; now this is only compatible with the latest pybind11
+ # find_package(Python ${PY_VERSION} COMPONENTS Interpreter Development REQUIRED)
+-find_package(PythonInterp ${PY_VERSION} REQUIRED)
++find_package(Python3 ${PY_VERSION} COMPONENTS Interpreter REQUIRED)
++set(PYTHON_EXECUTABLE ${Python3_EXECUTABLE})
+ if(BUILD_ONNX_PYTHON)
+   find_package(PythonLibs ${PY_VERSION})
+ endif()
+@@ -434,6 +456,7 @@ target_link_libraries(onnx PUBLIC onnx_proto)
+ add_onnx_global_defines(onnx)
+ 
+ if(BUILD_ONNX_PYTHON)
++  find_package(Python3 ${PY_VERSION} COMPONENTS Development REQUIRED)
+   if("${PY_EXT_SUFFIX}" STREQUAL "")
+     if(MSVC)
+       set(PY_EXT_SUFFIX ".pyd")
+@@ -452,10 +475,14 @@ if(BUILD_ONNX_PYTHON)
+   target_include_directories(onnx_cpp2py_export PRIVATE
+                              $<BUILD_INTERFACE:${ONNX_ROOT}>
+                              $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+-                             $<INSTALL_INTERFACE:include>)
++                             ${Python3_INCLUDE_DIRS})
++  target_link_directories(onnx_cpp2py_export PRIVATE
++                          ${Python3_LIBRARY_DIRS})
++  target_link_libraries(onnx_cpp2py_export PRIVATE
++                        ${Python3_LIBRARIES})
+ 
+   # pybind11 is a header only lib
+-  find_package(pybind11 2.2 CONFIG)
++  find_package(pybind11 2.2 CONFIG REQUIRED)
+   if(NOT pybind11_FOUND)
+     if(EXISTS "${ONNX_ROOT}/third_party/pybind11/include/pybind11/pybind11.h")
+       add_subdirectory("${ONNX_ROOT}/third_party/pybind11")
diff --git a/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch b/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch
new file mode 100644
index 0000000000..c435922d01
--- /dev/null
+++ b/cmake/vcpkg-ports/onnx/fix-dependency-protobuf.patch
@@ -0,0 +1,28 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index d81ac1d..9f97998 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -149,6 +149,7 @@ if(ONNX_BUILD_TESTS)
+   set(googletest_STATIC_LIBRARIES GTest::gtest)
+ endif()
+ 
++find_package(protobuf CONFIG REQUIRED)
+ if((ONNX_USE_LITE_PROTO AND TARGET protobuf::libprotobuf-lite) OR ((NOT ONNX_USE_LITE_PROTO) AND TARGET protobuf::libprotobuf))
+   # Sometimes we need to use protoc compiled for host architecture while linking
+   # libprotobuf against target architecture. See https://github.com/caffe2/caffe
+diff --git a/cmake/ONNXConfig.cmake.in b/cmake/ONNXConfig.cmake.in
+index d588f8a..dbd4398 100644
+--- a/cmake/ONNXConfig.cmake.in
++++ b/cmake/ONNXConfig.cmake.in
+@@ -6,9 +6,8 @@
+ # library version information
+ set(ONNX_VERSION "@ONNX_VERSION@")
+ 
+-list(APPEND CMAKE_PREFIX_PATH "@PROTOBUF_DIR@")
+-set(Protobuf_INCLUDE_DIR "@PROTOBUF_INCLUDE_DIR@")
+-find_package(Protobuf REQUIRED)
++include(CMakeFindDependencyMacro)
++find_dependency(protobuf CONFIG)
+ 
+ # import targets
+ include ("${CMAKE_CURRENT_LIST_DIR}/ONNXTargets.cmake")
diff --git a/cmake/vcpkg-ports/onnx/portfile.cmake b/cmake/vcpkg-ports/onnx/portfile.cmake
new file mode 100644
index 0000000000..a0c9978038
--- /dev/null
+++ b/cmake/vcpkg-ports/onnx/portfile.cmake
@@ -0,0 +1,83 @@
+vcpkg_check_linkage(ONLY_STATIC_LIBRARY)
+
+vcpkg_from_github(
+    OUT_SOURCE_PATH SOURCE_PATH
+    REPO onnx/onnx
+    REF "v${VERSION}"
+    SHA512 5a18e2b19ec9c18c8b115fb7e12ed98eddaa581c95f15c4dd420cd6c86e7caa04f9a393da589e76b89cf9b3544abd3749a8c77c2446782f37502eb74e9b1f661
+    PATCHES
+        fix-cmakelists.patch
+        fix-dependency-protobuf.patch
+)
+
+string(COMPARE EQUAL "${VCPKG_CRT_LINKAGE}" "static" USE_STATIC_RUNTIME)
+
+# ONNX_USE_PROTOBUF_SHARED_LIBS: locate the installed protobuf library (REQUIRED: configuration stops if it is missing)
+find_library(PROTOBUF_LIBPATH NAMES protobuf PATHS "${CURRENT_INSTALLED_DIR}/bin" "${CURRENT_INSTALLED_DIR}/lib" REQUIRED)
+get_filename_component(PROTOBUF_LIBNAME "${PROTOBUF_LIBPATH}" NAME)
+
+# NOTE(review): PROTOBUF_LIBNAME is not read anywhere else in this portfile; shared-protobuf
+# linkage is unconditionally disabled below, regardless of the extension of the library found.
+set(USE_PROTOBUF_SHARED OFF)
+
+# Like protoc, python is required for codegen.
+vcpkg_find_acquire_program(PYTHON3)
+
+# PATH for .bat scripts so it can find 'python'
+get_filename_component(PYTHON_DIR "${PYTHON3}" PATH)
+vcpkg_add_to_path(PREPEND "${PYTHON_DIR}")
+
+vcpkg_cmake_configure(
+    SOURCE_PATH "${SOURCE_PATH}"
+    OPTIONS
+        ${FEATURE_OPTIONS}
+        -DPython3_EXECUTABLE=${PYTHON3}
+        -DONNX_ML=ON
+        -DONNX_GEN_PB_TYPE_STUBS=ON
+        -DONNX_USE_PROTOBUF_SHARED_LIBS=${USE_PROTOBUF_SHARED}
+        -DONNX_USE_LITE_PROTO=OFF
+        -DONNX_USE_MSVC_STATIC_RUNTIME=${USE_STATIC_RUNTIME}
+        -DONNX_BUILD_TESTS=OFF
+        -DONNX_BUILD_BENCHMARKS=OFF
+        -DONNX_DISABLE_STATIC_REGISTRATION=ON
+    MAYBE_UNUSED_VARIABLES
+        ONNX_USE_MSVC_STATIC_RUNTIME
+)
+
+vcpkg_cmake_install()
+vcpkg_cmake_config_fixup(CONFIG_PATH lib/cmake/ONNX)
+
+vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE")
+
+file(REMOVE_RECURSE
+    "${CURRENT_PACKAGES_DIR}/debug/include"
+    "${CURRENT_PACKAGES_DIR}/debug/share"
+    # the others are empty
+    "${CURRENT_PACKAGES_DIR}/include/onnx/backend"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/bin"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/controlflow"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/generator"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/image"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/logical"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/math"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/nn"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/object_detection"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/optional"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/quantization"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/reduction"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/rnn"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/sequence"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/text"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/traditionalml"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/defs/training"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/examples"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/frontend"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_cpp2py_export"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/test"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/tools"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_ml"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_data"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/onnx_operators_ml"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/reference/ops"
+    "${CURRENT_PACKAGES_DIR}/include/onnx/reference"
+)
diff --git a/cmake/vcpkg-ports/onnx/vcpkg.json b/cmake/vcpkg-ports/onnx/vcpkg.json
new file mode 100644
index 0000000000..7d2bbd84c0
--- /dev/null
+++ b/cmake/vcpkg-ports/onnx/vcpkg.json
@@ -0,0 +1,23 @@
+{
+  "name": "onnx",
+  "version-semver": "1.17.0",
+  "description": "Open standard for machine learning interoperability",
+  "homepage": "https://onnx.ai",
+  "license": "Apache-2.0",
+  "supports": "!uwp",
+  "dependencies": [
+    "protobuf",
+    {
+      "name": "protobuf",
+      "host": true
+    },
+    {
+      "name": "vcpkg-cmake",
+      "host": true
+    },
+    {
+      "name": "vcpkg-cmake-config",
+      "host": true
+    }
+  ]
+}
diff --git a/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch b/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch
new file mode 100644
index 0000000000..97fd1ac7a2
--- /dev/null
+++ b/cmake/vcpkg-ports/pthreadpool/fix-cmakelists.patch
@@ -0,0 +1,82 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index f06aada..3c6c6e2 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -31,8 +31,6 @@ IF(CCACHE_BINARY)
+ ENDIF()
+ 
+ # ---[ Options.
+-SET(PTHREADPOOL_LIBRARY_TYPE "default" CACHE STRING "Type of library (shared, static, or default) to build")
+-SET_PROPERTY(CACHE PTHREADPOOL_LIBRARY_TYPE PROPERTY STRINGS default static shared)
+ OPTION(PTHREADPOOL_ALLOW_DEPRECATED_API "Enable deprecated API functions" ON)
+ SET(PTHREADPOOL_SYNC_PRIMITIVE "default" CACHE STRING "Synchronization primitive (condvar, futex, gcd, event, or default) for worker threads")
+ SET_PROPERTY(CACHE PTHREADPOOL_SYNC_PRIMITIVE PROPERTY STRINGS default condvar futex gcd event)
+@@ -41,7 +39,7 @@ IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i[3-6]86|AMD64|x86(_64)?)$")
+ ELSE()
+   OPTION(PTHREADPOOL_ENABLE_FASTPATH "Enable fast path using atomic decrement instead of atomic compare-and-swap" OFF)
+ ENDIF()
+-IF("${CMAKE_SOURCE_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}")
++IF(FALSE)
+   OPTION(PTHREADPOOL_BUILD_TESTS "Build pthreadpool unit tests" ON)
+   OPTION(PTHREADPOOL_BUILD_BENCHMARKS "Build pthreadpool micro-benchmarks" ON)
+ ELSE()
+@@ -67,7 +65,8 @@ MACRO(PTHREADPOOL_TARGET_ENABLE_CXX11 target)
+ ENDMACRO()
+ 
+ # ---[ Download deps
+-IF(NOT DEFINED FXDIV_SOURCE_DIR)
++find_path(FXDIV_INCLUDE_DIRS "fxdiv.h")
++IF(FALSE)
+   MESSAGE(STATUS "Downloading FXdiv to ${CMAKE_BINARY_DIR}/FXdiv-source (define FXDIV_SOURCE_DIR to avoid it)")
+   CONFIGURE_FILE(cmake/DownloadFXdiv.cmake "${CMAKE_BINARY_DIR}/FXdiv-download/CMakeLists.txt")
+   EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
+@@ -118,21 +117,13 @@ ELSE()
+ ENDIF()
+ 
+ ADD_LIBRARY(pthreadpool_interface INTERFACE)
+-TARGET_INCLUDE_DIRECTORIES(pthreadpool_interface INTERFACE include)
++TARGET_INCLUDE_DIRECTORIES(pthreadpool_interface INTERFACE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+ IF(NOT PTHREADPOOL_ALLOW_DEPRECATED_API)
+   TARGET_COMPILE_DEFINITIONS(pthreadpool_interface INTERFACE PTHREADPOOL_NO_DEPRECATED_API=1)
+ ENDIF()
+ INSTALL(FILES include/pthreadpool.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+ 
+-IF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "default")
+-  ADD_LIBRARY(pthreadpool ${PTHREADPOOL_SRCS})
+-ELSEIF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "shared")
+-  ADD_LIBRARY(pthreadpool SHARED ${PTHREADPOOL_SRCS})
+-ELSEIF(PTHREADPOOL_LIBRARY_TYPE STREQUAL "static")
+-  ADD_LIBRARY(pthreadpool STATIC ${PTHREADPOOL_SRCS})
+-ELSE()
+-  MESSAGE(FATAL_ERROR "Unsupported library type ${PTHREADPOOL_LIBRARY_TYPE}")
+-ENDIF()
++ADD_LIBRARY(pthreadpool ${PTHREADPOOL_SRCS})
+ 
+ IF(PTHREADPOOL_SYNC_PRIMITIVE STREQUAL "condvar")
+   TARGET_COMPILE_DEFINITIONS(pthreadpool PRIVATE PTHREADPOOL_USE_FUTEX=0)
+@@ -181,18 +172,22 @@ IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ ENDIF()
+ 
+ # ---[ Configure FXdiv
+-IF(NOT TARGET fxdiv)
++IF(FALSE)
+   SET(FXDIV_BUILD_TESTS OFF CACHE BOOL "")
+   SET(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "")
+   ADD_SUBDIRECTORY(
+     "${FXDIV_SOURCE_DIR}"
+     "${CMAKE_BINARY_DIR}/FXdiv")
+ ENDIF()
+-TARGET_LINK_LIBRARIES(pthreadpool PRIVATE fxdiv)
++TARGET_INCLUDE_DIRECTORIES(pthreadpool PRIVATE ${FXDIV_INCLUDE_DIRS})
+ 
+-INSTALL(TARGETS pthreadpool
++INSTALL(TARGETS pthreadpool pthreadpool_interface
++  EXPORT unofficial-pthreadpool-config
++  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+   LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+   ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
++install(EXPORT unofficial-pthreadpool-config NAMESPACE unofficial::
++  DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/unofficial-${PROJECT_NAME}) # share/unofficial-pthreadpool
+ 
+ IF(PTHREADPOOL_BUILD_TESTS)
+   # ---[ Build google test
diff --git a/cmake/vcpkg-ports/pthreadpool/portfile.cmake b/cmake/vcpkg-ports/pthreadpool/portfile.cmake
new file mode 100644
index 0000000000..9400e5e886
--- /dev/null
+++ b/cmake/vcpkg-ports/pthreadpool/portfile.cmake
@@ -0,0 +1,25 @@
+if(VCPKG_TARGET_IS_WINDOWS)
+    vcpkg_check_linkage(ONLY_STATIC_LIBRARY)
+endif()
+
+vcpkg_from_github(
+    OUT_SOURCE_PATH SOURCE_PATH
+    REPO google/pthreadpool
+    REF 4e80ca24521aa0fb3a746f9ea9c3eaa20e9afbb0
+    SHA512 776017cc5d2aa94337292f2f4fbd54d099ef29abf736ab8147f07f98f12b7654cbd2fe38d34646a479a519c261ac253bbaf19c6dcbb0ec4cc0859de70f7e6472
+    PATCHES
+        fix-cmakelists.patch
+)
+
+vcpkg_cmake_configure(
+    SOURCE_PATH "${SOURCE_PATH}"
+    OPTIONS
+        -DPTHREADPOOL_BUILD_TESTS=OFF
+        -DPTHREADPOOL_BUILD_BENCHMARKS=OFF
+)
+vcpkg_cmake_install()
+vcpkg_copy_pdbs()
+vcpkg_cmake_config_fixup(PACKAGE_NAME unofficial-${PORT})
+# Install the upstream license as share/${PORT}/copyright so the port ships a copyright file.
+file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright)
+file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include")
diff --git a/cmake/vcpkg-ports/pthreadpool/vcpkg.json b/cmake/vcpkg-ports/pthreadpool/vcpkg.json
new file mode 100644
index 0000000000..16c0bea5b7
--- /dev/null
+++ b/cmake/vcpkg-ports/pthreadpool/vcpkg.json
@@ -0,0 +1,17 @@
+{
+  "name": "pthreadpool",
+  "version-date": "2024-12-17",
+  "description": "Portable (POSIX/Windows/Emscripten) thread pool for C/C++",
+  "homepage": "https://github.com/google/pthreadpool",
+  "dependencies": [
+    "fxdiv",
+    {
+      "name": "vcpkg-cmake",
+      "host": true
+    },
+    {
+      "name": "vcpkg-cmake-config",
+      "host": true
+    }
+  ]
+}
diff --git a/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch b/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch
new file mode 100644
index 0000000000..a7c5e0e254
--- /dev/null
+++ b/cmake/vcpkg-ports/xnnpack/disable_gcc_warning.patch
@@ -0,0 +1,12 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 4a9fad59a..2713cded3 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -334,6 +334,7 @@ ENDIF()
+ IF(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+   # Disable "note: parameter passing for argument of type ... changed/will change in ..."
+   ADD_COMPILE_OPTIONS("-Wno-psabi")
++  ADD_COMPILE_OPTIONS("-Wno-incompatible-pointer-types")
+ ENDIF()
+ 
+ # ---[ Build flags
diff --git a/cmake/vcpkg-ports/xnnpack/fix-build.patch b/cmake/vcpkg-ports/xnnpack/fix-build.patch
new file mode 100644
index 0000000000..b867377d2f
--- /dev/null
+++ b/cmake/vcpkg-ports/xnnpack/fix-build.patch
@@ -0,0 +1,71 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index f0b3410ae..ba54c3bfe 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -1047,9 +1047,11 @@ ENDIF()
+ IF(XNNPACK_BUILD_ALL_MICROKERNELS)
+   TARGET_INCLUDE_DIRECTORIES(microkernels-all PRIVATE include src)
+ ENDIF()
++
+ TARGET_INCLUDE_DIRECTORIES(datatype PRIVATE include src)
+ TARGET_INCLUDE_DIRECTORIES(microkernels-prod PRIVATE include src)
+-TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src ${CPUINFO_SOURCE_DIR}/include)
++TARGET_INCLUDE_DIRECTORIES(hardware-config PRIVATE include src)
++
+ TARGET_INCLUDE_DIRECTORIES(indirection PRIVATE include src)
+ TARGET_INCLUDE_DIRECTORIES(microparams-init PRIVATE include src)
+ TARGET_INCLUDE_DIRECTORIES(normalization PRIVATE include src)
+@@ -1104,14 +1106,9 @@ IF(NOT TARGET cpuinfo)
+       "${CPUINFO_SOURCE_DIR}"
+       "${CMAKE_BINARY_DIR}/cpuinfo")
+   ELSE()
+-    ADD_LIBRARY(cpuinfo SHARED IMPORTED)
+-    FIND_LIBRARY(CPUINFO_LIBRARY cpuinfo PATHS "${CPUINFO_SOURCE_DIR}/lib")
+-    IF(NOT CPUINFO_LIBRARY)
+-      MESSAGE(FATAL_ERROR "Cannot find cpuinfo")
+-    ENDIF()
+-    TARGET_INCLUDE_DIRECTORIES(cpuinfo INTERFACE "${CPUINFO_SOURCE_DIR}/include")
+-    SET_PROPERTY(TARGET cpuinfo PROPERTY IMPORTED_LOCATION "${CPUINFO_LIBRARY}")
+-    SET_PROPERTY(TARGET cpuinfo PROPERTY IMPORTED_IMPLIB "${CPUINFO_LIBRARY}")
++    ADD_LIBRARY(cpuinfo INTERFACE)
++    FIND_PACKAGE(cpuinfo CONFIG REQUIRED)
++    TARGET_LINK_LIBRARIES(cpuinfo INTERFACE cpuinfo::cpuinfo)
+   ENDIF()
+ ENDIF()
+ IF(XNNPACK_BUILD_LIBRARY)
+@@ -1129,16 +1126,12 @@ IF(NOT TARGET pthreadpool)
+       "${PTHREADPOOL_SOURCE_DIR}"
+       "${CMAKE_BINARY_DIR}/pthreadpool")
+   ELSE()
++    find_package(unofficial-pthreadpool CONFIG REQUIRED)
+     ADD_LIBRARY(pthreadpool SHARED IMPORTED)
+-    FIND_LIBRARY(PTHREADPOOL_LIBRARY pthreadpool PATHS "${PTHREADPOOL_SOURCE_DIR}/lib")
+-    IF(NOT PTHREADPOOL_LIBRARY)
+-      MESSAGE(FATAL_ERROR "Cannot find pthreadpool")
+-    ENDIF()
++    FIND_LIBRARY(PTHREADPOOL_LIBRARY NAMES pthreadpool REQUIRED)
+     FIND_PACKAGE(Threads REQUIRED)
+-    TARGET_INCLUDE_DIRECTORIES(pthreadpool INTERFACE "${PTHREADPOOL_SOURCE_DIR}/include")
+-    TARGET_LINK_LIBRARIES(pthreadpool INTERFACE Threads::Threads)
++    TARGET_LINK_LIBRARIES(pthreadpool INTERFACE Threads::Threads unofficial::pthreadpool unofficial::pthreadpool_interface)
+     SET_PROPERTY(TARGET pthreadpool PROPERTY IMPORTED_LOCATION "${PTHREADPOOL_LIBRARY}")
+-    SET_PROPERTY(TARGET pthreadpool PROPERTY IMPORTED_IMPLIB "${PTHREADPOOL_LIBRARY}")
+   ENDIF()
+ ENDIF()
+ TARGET_LINK_LIBRARIES(xnnpack-base INTERFACE pthreadpool)
+@@ -1152,12 +1145,12 @@ IF(NOT TARGET fxdiv)
+       "${FXDIV_SOURCE_DIR}"
+       "${CMAKE_BINARY_DIR}/FXdiv")
+   ELSE()
+-    FIND_FILE(FXDIV_HDR fxdiv.h PATH_SUFFIXES include PATHS "${FXDIV_SOURCE_DIR}")
++    FIND_PATH(FXDIV_HDR fxdiv.h PATH_SUFFIXES include)
+     IF(NOT FXDIV_HDR)
+       MESSAGE(FATAL_ERROR "Cannot find fxdiv")
+     ENDIF()
+-    ADD_LIBRARY(fxdiv STATIC "${FXDIV_HDR}")
+-    TARGET_INCLUDE_DIRECTORIES(fxdiv INTERFACE "${FXDIV_SOURCE_DIR}/include")
++    ADD_LIBRARY(fxdiv INTERFACE IMPORTED)
++    target_include_directories(fxdiv INTERFACE "${FXDIV_HDR}")
+     SET_PROPERTY(TARGET fxdiv PROPERTY LINKER_LANGUAGE C)
+   ENDIF()
+ ENDIF()
diff --git a/cmake/vcpkg-ports/xnnpack/portfile.cmake b/cmake/vcpkg-ports/xnnpack/portfile.cmake
new file mode 100644
index 0000000000..b07da3186b
--- /dev/null
+++ b/cmake/vcpkg-ports/xnnpack/portfile.cmake
@@ -0,0 +1,39 @@
+if(VCPKG_TARGET_IS_WINDOWS)
+    vcpkg_check_linkage(ONLY_STATIC_LIBRARY)
+endif()
+
+vcpkg_from_github(
+    OUT_SOURCE_PATH SOURCE_PATH
+    REPO google/XNNPACK
+    REF 854b343f9cad36bd596e4390959ca3648208e048
+    SHA512 f37384b43022cb74bf87bd99c2e82e51d48fe4e0e4642611fcbc10cbb86ff2468b67964027f13f82a715dc7201c490d88d5020fb565ad236187b9dd219f3f644
+    HEAD_REF master
+    PATCHES
+        fix-build.patch  # resolves cpuinfo / pthreadpool / fxdiv from installed packages
+        disable_gcc_warning.patch  # adds -Wno-incompatible-pointer-types for GNU compilers
+)
+vcpkg_find_acquire_program(PYTHON3)
+
+vcpkg_cmake_configure(
+    SOURCE_PATH "${SOURCE_PATH}"
+    WINDOWS_USE_MSBUILD
+    OPTIONS
+        "-DPython3_EXECUTABLE=${PYTHON3}"  # interpreter acquired via vcpkg_find_acquire_program(PYTHON3)
+        "-DPython_EXECUTABLE=${PYTHON3}"  # same interpreter, passed under the FindPython variable name as well
+        -DXNNPACK_USE_SYSTEM_LIBS=ON
+        -DXNNPACK_ENABLE_AVXVNNI=OFF
+        -DXNNPACK_ENABLE_ASSEMBLY=ON
+        -DXNNPACK_ENABLE_MEMOPT=ON
+        -DXNNPACK_ENABLE_SPARSE=ON
+        -DXNNPACK_ENABLE_KLEIDIAI=OFF
+        -DXNNPACK_BUILD_TESTS=OFF
+        -DXNNPACK_BUILD_BENCHMARKS=OFF
+)
+vcpkg_cmake_install()
+vcpkg_copy_pdbs()
+
+file(INSTALL "${SOURCE_PATH}/LICENSE" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}" RENAME copyright)
+file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/debug/include"
+                    "${CURRENT_PACKAGES_DIR}/debug/bin"
+                    "${CURRENT_PACKAGES_DIR}/debug/share"
+)
diff --git a/cmake/vcpkg-ports/xnnpack/vcpkg.json b/cmake/vcpkg-ports/xnnpack/vcpkg.json
new file mode 100644
index 0000000000..5e383c0b37
--- /dev/null
+++ b/cmake/vcpkg-ports/xnnpack/vcpkg.json
@@ -0,0 +1,17 @@
+{
+  "name": "xnnpack",
+  "version-date": "2025-01-17",
+  "description": "High-efficiency floating-point neural network inference operators for mobile, server, and Web",
+  "homepage": "https://github.com/google/XNNPACK",
+  "license": "BSD-3-Clause",
+  "supports": "!(arm & windows) & !uwp & !arm32",
+  "dependencies": [
+    "cpuinfo",
+    "fxdiv",
+    "pthreadpool",
+    {
+      "name": "vcpkg-cmake",
+      "host": true
+    }
+  ]
+}

From 3b4c7df4e9f13827538caaad48e1586dda4d18f3 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Wed, 22 Jan 2025 12:11:00 -0800
Subject: [PATCH 12/37] [QNN EP] Make QNN EP a shared library (#23120)

### Description
- Makes QNN EP a shared library **by default** when building with
  `--use_qnn` or `--use_qnn shared_lib`. Generates the following build
  artifacts:
  - **Windows**: `onnxruntime_providers_qnn.dll` and
    `onnxruntime_providers_shared.dll`
  - **Linux**: `libonnxruntime_providers_qnn.so` and
    `libonnxruntime_providers_shared.so`
  - **Android**: Not supported. Must build QNN EP as a static library.
- Allows QNN EP to still be built as a static library with
  `--use_qnn static_lib`. This is primarily for the Android QNN AAR package.
- Unit tests run for both the static and shared QNN EP builds.

### Detailed changes
- Updates Java bindings to support both shared and static QNN EP builds.
- Provider bridge API:
  - Adds logging sink ETW to the provider bridge. Allows EPs to register
    ETW callbacks for ORT logging.
  - Adds a variety of methods for onnxruntime objects that are needed by
    QNN EP.
- QNN EP:
  - Adds `ort_api.h` and `ort_api.cc` that encapsulate the API provided
    by ORT in a manner that allows the EP to be built as either a shared or
    static library.
  - Adds custom function to transpose weights for Conv and Gemm (instead
    of adding util to provider bridge API).
  - Adds custom function to quantize data for LeakyRelu (instead of adding
    util to provider bridge API).
  - Adds custom ETW tracing for QNN profiling events:
    - shared library: defines its own TraceLogging provider handle
    - static library: uses ORT's TraceLogging provider handle and existing
      telemetry provider.
- ORT-QNN Packages:
  - **Python**: Pipelines build QNN EP as a shared library by default.
    User can build a local python wheel with QNN EP as a static library by
    passing `--use_qnn static_lib`.
  - **NuGet**: Pipelines build QNN EP as a shared library by default.
    `build.py` currently enforces QNN EP to be built as a shared library.
    Can add support for building a QNN NuGet package with static later if
    deemed necessary.
  - **Android**: Pipelines build QNN EP as a **static library**.
    `build.py` enforces QNN EP to be built as a static library. Packaging
    multiple shared libraries into an Android AAR package is not currently
    supported due to the added need to also distribute a shared libcpp.so
    library.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 cmake/CMakeLists.txt                          |   1 +
 cmake/onnxruntime.cmake                       |  19 +-
 cmake/onnxruntime_java.cmake                  |  10 +-
 cmake/onnxruntime_providers.cmake             |   3 -
 cmake/onnxruntime_providers_cpu.cmake         |   4 +-
 cmake/onnxruntime_providers_qnn.cmake         | 122 ++++++---
 cmake/onnxruntime_python.cmake                |  25 +-
 cmake/onnxruntime_unittests.cmake             |  24 +-
 .../main/java/ai/onnxruntime/OnnxRuntime.java |  22 +-
 .../main/java/ai/onnxruntime/OrtSession.java  |   4 +
 .../core/platform/windows/logging/etw_sink.cc |  18 ++
 .../core/platform/windows/logging/etw_sink.h  |  31 +++
 .../qnn/builder/onnx_ctx_model_helper.cc      |  25 +-
 .../qnn/builder/onnx_ctx_model_helper.h       |   8 +-
 .../core/providers/qnn/builder/op_builder.h   |   4 +-
 .../qnn/builder/op_builder_factory.cc         |   2 -
 .../opbuilder/argmax_min_op_builder.cc        |   7 +-
 .../qnn/builder/opbuilder/base_op_builder.cc  | 208 ++++++++++++---
 .../qnn/builder/opbuilder/base_op_builder.h   |  45 ++--
 .../opbuilder/batch_norm_op_builder.cc        |   7 +-
 .../qnn/builder/opbuilder/cast_op_builder.cc  |   3 +-
 .../qnn/builder/opbuilder/clip_op_builder.cc  |   5 +-
 .../qnn/builder/opbuilder/conv_op_builder.cc  |  40 ++-
 .../builder/opbuilder/expand_op_builder.cc    |   6 +-
 .../builder/opbuilder/gather_op_builder.cc    |   6 +-
 .../qnn/builder/opbuilder/gemm_op_builder.cc  |   6 +-
 .../opbuilder/instance_norm_op_builder.cc     |   8 +-
 .../opbuilder/layer_norm_op_builder.cc        |   8 +-
 .../qnn/builder/opbuilder/lrn_op_builder.cc   |   4 +-
 .../builder/opbuilder/matmul_op_builder.cc    |   4 +-
 .../qnn/builder/opbuilder/pad_op_builder.cc   |   8 +-
 .../qnn/builder/opbuilder/pool_op_builder.cc  |   8 +-
 .../builder/opbuilder/reduce_op_builder.cc    |  20 +-
 .../builder/opbuilder/reshape_op_builder.cc   |   6 +-
 .../builder/opbuilder/resize_op_builder.cc    |  11 +-
 .../builder/opbuilder/simple_op_builder.cc    |  25 +-
 .../qnn/builder/opbuilder/slice_op_builder.cc |  29 +-
 .../builder/opbuilder/softmax_op_builder.cc   |   7 +-
 .../qnn/builder/opbuilder/split_op_builder.cc |   7 +-
 .../qnn/builder/opbuilder/tile_op_builder.cc  |   7 +-
 .../providers/qnn/builder/opbuilder/topk.cc   |   2 +-
 .../builder/opbuilder/transpose_op_builder.cc |   5 +-
 .../qnn/builder/qnn_backend_manager.cc        |  72 +++--
 .../qnn/builder/qnn_backend_manager.h         |   5 +-
 .../qnn/builder/qnn_configs_helper.h          |  30 ++-
 .../builder/qnn_context_mem_handle_manager.cc |   2 +-
 .../builder/qnn_context_mem_handle_manager.h  |   5 +-
 .../core/providers/qnn/builder/qnn_def.h      |   3 +-
 .../core/providers/qnn/builder/qnn_model.cc   |   7 +-
 .../core/providers/qnn/builder/qnn_model.h    |   5 +-
 .../qnn/builder/qnn_model_wrapper.cc          |  37 +--
 .../providers/qnn/builder/qnn_model_wrapper.h |   7 +-
 .../providers/qnn/builder/qnn_node_group.h    |   3 +-
 .../qnn/builder/qnn_node_group/dq_q_fusion.cc |   5 +-
 .../qnn/builder/qnn_node_group/dq_q_fusion.h  |   3 +-
 .../qnn_node_group/hardsigmoid_mul_fusion.cc  |   5 +-
 .../qnn_node_group/hardsigmoid_mul_fusion.h   |   3 +-
 .../builder/qnn_node_group/qnn_node_group.cc  |   3 +-
 .../qnn/builder/qnn_node_group/utils.cc       |   3 +-
 .../qnn/builder/qnn_node_group/utils.h        |   3 +-
 .../qnn/builder/qnn_quant_params_wrapper.h    |   6 +-
 .../core/providers/qnn/builder/qnn_utils.cc   | 216 ++++++++++++---
 .../core/providers/qnn/builder/qnn_utils.h    | 131 ++++++++-
 onnxruntime/core/providers/qnn/ort_api.cc     | 211 +++++++++++++++
 onnxruntime/core/providers/qnn/ort_api.h      | 178 +++++++++++++
 .../core/providers/qnn/qnn_allocator.cc       |   7 +-
 .../core/providers/qnn/qnn_allocator.h        |   6 +-
 .../providers/qnn/qnn_execution_provider.cc   | 251 +++++++++---------
 .../providers/qnn/qnn_execution_provider.h    |  27 +-
 .../providers/qnn/qnn_provider_factory.cc     |  52 +++-
 .../qnn/qnn_provider_factory_creator.h        |   3 +
 .../core/providers/qnn/qnn_telemetry.cc       | 211 +++++++++++++++
 .../core/providers/qnn/qnn_telemetry.h        |  98 +++++++
 .../core/providers/qnn/rpcmem_library.cc      |  10 +-
 .../core/providers/qnn/rpcmem_library.h       |   2 +-
 .../core/providers/qnn/shared_context.h       |   2 +-
 onnxruntime/core/providers/qnn/symbols.def    |   2 +
 .../core/providers/qnn/version_script.lds     |   9 +
 .../providers/shared_library/provider_api.h   |  52 ++++
 .../provider_bridge_provider.cc               |   5 +-
 .../shared_library/provider_interfaces.h      |  72 ++++-
 .../shared_library/provider_wrappedtypes.h    |  77 +++++-
 .../core/session/provider_bridge_ort.cc       | 157 ++++++++++-
 .../test/providers/qnn/qnn_basic_test.cc      |   8 +-
 .../test/providers/qnn/qnn_ep_context_test.cc |  29 +-
 onnxruntime/test/qnn_ctx_gen/main.cc          |  31 ++-
 setup.py                                      |  10 +-
 tools/ci_build/build.py                       |  27 +-
 .../github/android/build_aar_package.py       |   6 +-
 .../default_qnn_aar_build_settings.json       |   2 +-
 ...arm64-v8a-QNN-crosscompile-ci-pipeline.yml |   3 +-
 .../azure-pipelines/linux-qnn-ci-pipeline.yml |  13 +-
 .../templates/py-win-arm64-qnn.yml            |   1 +
 .../templates/py-win-arm64ec-qnn.yml          |   1 +
 .../templates/py-win-x64-qnn.yml              |   1 +
 .../azure-pipelines/templates/qnn-ep-win.yml  |   8 +-
 .../win-qnn-arm64-ci-pipeline.yml             |  18 +-
 .../azure-pipelines/win-qnn-ci-pipeline.yml   |  14 +-
 .../linux/build_linux_python_package.sh       |   2 +-
 .../nuget/generate_nuspec_for_native_nuget.py |  19 ++
 100 files changed, 2299 insertions(+), 694 deletions(-)
 create mode 100644 onnxruntime/core/providers/qnn/ort_api.cc
 create mode 100644 onnxruntime/core/providers/qnn/ort_api.h
 create mode 100644 onnxruntime/core/providers/qnn/qnn_telemetry.cc
 create mode 100644 onnxruntime/core/providers/qnn/qnn_telemetry.h
 create mode 100644 onnxruntime/core/providers/qnn/symbols.def
 create mode 100644 onnxruntime/core/providers/qnn/version_script.lds

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 304236743f..b332583035 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -93,6 +93,7 @@ option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
 option(onnxruntime_USE_COREML "Build with CoreML support" OFF)
 option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF)
 option(onnxruntime_USE_QNN "Build with QNN support" OFF)
+option(onnxruntime_BUILD_QNN_EP_STATIC_LIB "Build with QNN EP as a static library" OFF)
 option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
 option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index d72b61a085..78edb4179f 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -199,17 +199,12 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android" AND onnxruntime_BUILD_JAVA)
   endforeach()
 endif()
 
-# This list is a reversed topological ordering of library dependencies.
-# Earlier entries may depend on later ones. Later ones should not depend on earlier ones.
-set(onnxruntime_INTERNAL_LIBRARIES
-  onnxruntime_session
-  ${onnxruntime_libs}
+set(onnxruntime_INTERNAL_PROVIDER_LIBRARIES
   ${PROVIDERS_ACL}
   ${PROVIDERS_ARMNN}
   ${PROVIDERS_COREML}
   ${PROVIDERS_DML}
   ${PROVIDERS_NNAPI}
-  ${PROVIDERS_QNN}
   ${PROVIDERS_SNPE}
   ${PROVIDERS_RKNPU}
   ${PROVIDERS_VSINPU}
@@ -218,6 +213,18 @@ set(onnxruntime_INTERNAL_LIBRARIES
   ${PROVIDERS_WEBNN}
   ${PROVIDERS_AZURE}
   ${PROVIDERS_INTERNAL_TESTING}
+)
+
+if (onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+  list(APPEND onnxruntime_INTERNAL_PROVIDER_LIBRARIES onnxruntime_providers_qnn)
+endif()
+
+# This list is a reversed topological ordering of library dependencies.
+# Earlier entries may depend on later ones. Later ones should not depend on earlier ones.
+set(onnxruntime_INTERNAL_LIBRARIES
+  onnxruntime_session
+  ${onnxruntime_libs}
+  ${onnxruntime_INTERNAL_PROVIDER_LIBRARIES}
   ${onnxruntime_winml}
   onnxruntime_optimizer
   onnxruntime_providers
diff --git a/cmake/onnxruntime_java.cmake b/cmake/onnxruntime_java.cmake
index b15b9632e9..1227264e59 100644
--- a/cmake/onnxruntime_java.cmake
+++ b/cmake/onnxruntime_java.cmake
@@ -148,7 +148,7 @@ if (WIN32)
   if(NOT onnxruntime_ENABLE_STATIC_ANALYSIS)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime>)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime4j_jni> ${JAVA_PACKAGE_JNI_DIR}/$<TARGET_FILE_NAME:onnxruntime4j_jni>)
-    if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT)
+    if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB))
       add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_shared> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_shared>)
     endif()
     if (onnxruntime_USE_CUDA)
@@ -163,11 +163,14 @@ if (WIN32)
     if (onnxruntime_USE_TENSORRT)
       add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_tensorrt> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_tensorrt>)
     endif()
+    if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+      add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_qnn> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_FILE_NAME:onnxruntime_providers_qnn>)
+    endif()
   endif()
 else()
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:onnxruntime>)
   add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime4j_jni> ${JAVA_PACKAGE_JNI_DIR}/$<TARGET_LINKER_FILE_NAME:onnxruntime4j_jni>)
-  if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT)
+  if (onnxruntime_USE_CUDA OR onnxruntime_USE_DNNL OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_TENSORRT OR (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB))
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_shared> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:onnxruntime_providers_shared>)
   endif()
   if (onnxruntime_USE_CUDA)
@@ -182,6 +185,9 @@ else()
   if (onnxruntime_USE_TENSORRT)
     add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_tensorrt> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:onnxruntime_providers_tensorrt>)
   endif()
+  if (onnxruntime_USE_QNN AND NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+    add_custom_command(TARGET onnxruntime4j_jni POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:onnxruntime_providers_qnn> ${JAVA_PACKAGE_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:onnxruntime_providers_qnn>)
+  endif()
 endif()
 
 # run the build process (this copies the results back into CMAKE_CURRENT_BINARY_DIR)
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 582491de95..67fa48b282 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -74,9 +74,6 @@ endif()
 if(onnxruntime_USE_JSEP)
   set(PROVIDERS_JS onnxruntime_providers_js)
 endif()
-if(onnxruntime_USE_QNN)
-  set(PROVIDERS_QNN onnxruntime_providers_qnn)
-endif()
 if(onnxruntime_USE_RKNPU)
   set(PROVIDERS_RKNPU onnxruntime_providers_rknpu)
 endif()
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index 91a2b13002..4ae89a3922 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -239,7 +239,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD
   set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst")
   elseif(UNIX)
     if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
-      set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections")
+      target_link_options(onnxruntime_providers_shared PRIVATE
+                          "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds"
+                          "LINKER:--gc-sections")
     endif()
   elseif(WIN32)
   set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def")
diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake
index b68d84c23b..3030201458 100644
--- a/cmake/onnxruntime_providers_qnn.cmake
+++ b/cmake/onnxruntime_providers_qnn.cmake
@@ -3,41 +3,89 @@
 
   add_compile_definitions(USE_QNN=1)
 
-  # These are shared utils,
-  # TODO, move to a separate lib when used by EPs other than QNN, NNAPI and CoreML
-  file(GLOB onnxruntime_providers_shared_utils_cc_srcs CONFIGURE_DEPENDS
-    "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.h"
-    "${ONNXRUNTIME_ROOT}/core/providers/shared/utils/utils.cc"
-  )
-
-  file(GLOB_RECURSE
-    onnxruntime_providers_qnn_ep_cc_srcs CONFIGURE_DEPENDS
-    "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h"
-    "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc"
-  )
-
-  file(GLOB_RECURSE
-    onnxruntime_providers_qnn_builder_cc_srcs CONFIGURE_DEPENDS
-    "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.h"
-    "${ONNXRUNTIME_ROOT}/core/providers/qnn/builder/*.cc"
-  )
-
-  set(onnxruntime_providers_qnn_cc_srcs
-    ${onnxruntime_providers_shared_utils_cc_srcs}
-    ${onnxruntime_providers_qnn_ep_cc_srcs}
-    ${onnxruntime_providers_qnn_builder_cc_srcs}
-  )
-
-  source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_cc_srcs})
-  onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_cc_srcs})
-  onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers::flatbuffers Boost::mp11)
-  target_link_libraries(onnxruntime_providers_qnn)
-  add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES})
-  set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON)
-  set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime")
-  target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_QNN_HOME}/include/QNN ${onnxruntime_QNN_HOME}/include)
-  set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX)
-  # ignore the warning unknown-pragmas on "pragma region"
-  if(NOT MSVC)
-    target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas")
+  if(onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+    add_compile_definitions(BUILD_QNN_EP_STATIC_LIB=1)
+  endif()
+
+  file(GLOB_RECURSE
+       onnxruntime_providers_qnn_ep_srcs CONFIGURE_DEPENDS
+       "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.h"
+       "${ONNXRUNTIME_ROOT}/core/providers/qnn/*.cc"
+  )
+
+  if(onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+    #
+    # Build QNN EP as a static library
+    #
+    set(onnxruntime_providers_qnn_srcs ${onnxruntime_providers_qnn_ep_srcs})
+    source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_srcs})
+    onnxruntime_add_static_library(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_srcs})
+    onnxruntime_add_include_to_target(onnxruntime_providers_qnn onnxruntime_common onnxruntime_framework onnx
+                                                                onnx_proto protobuf::libprotobuf-lite
+                                                                flatbuffers::flatbuffers Boost::mp11)
+    add_dependencies(onnxruntime_providers_qnn onnx ${onnxruntime_EXTERNAL_DEPENDENCIES})
+    set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON)
+    set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime")
+    target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT}
+                                                                 ${onnxruntime_QNN_HOME}/include/QNN
+                                                                 ${onnxruntime_QNN_HOME}/include)
+    set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX)
+
+    # ignore the warning unknown-pragmas on "pragma region"
+    if(NOT MSVC)
+      target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas")
+    endif()
+  else()
+    #
+    # Build QNN EP as a shared library
+    #
+    file(GLOB_RECURSE
+         onnxruntime_providers_qnn_shared_lib_srcs CONFIGURE_DEPENDS
+         "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h"
+         "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc"
+    )
+    set(onnxruntime_providers_qnn_srcs ${onnxruntime_providers_qnn_ep_srcs}
+                                       ${onnxruntime_providers_qnn_shared_lib_srcs})
+
+    source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_qnn_srcs})
+    onnxruntime_add_shared_library_module(onnxruntime_providers_qnn ${onnxruntime_providers_qnn_srcs})
+    onnxruntime_add_include_to_target(onnxruntime_providers_qnn ${ONNXRUNTIME_PROVIDERS_SHARED} ${GSL_TARGET} onnx
+                                                                onnxruntime_common Boost::mp11 safeint_interface)
+    target_link_libraries(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_PROVIDERS_SHARED} ${ABSEIL_LIBS} ${CMAKE_DL_LIBS})
+    add_dependencies(onnxruntime_providers_qnn onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES})
+    target_include_directories(onnxruntime_providers_qnn PRIVATE ${ONNXRUNTIME_ROOT}
+                                                                 ${CMAKE_CURRENT_BINARY_DIR}
+                                                                 ${onnxruntime_QNN_HOME}/include/QNN
+                                                                 ${onnxruntime_QNN_HOME}/include)
+
+    # Set linker flags for function(s) exported by EP DLL
+    if(UNIX)
+      target_link_options(onnxruntime_providers_qnn PRIVATE
+                          "LINKER:--version-script=${ONNXRUNTIME_ROOT}/core/providers/qnn/version_script.lds"
+                          "LINKER:--gc-sections"
+                          "LINKER:-rpath=\$ORIGIN"
+      )
+    elseif(WIN32)
+      set_property(TARGET onnxruntime_providers_qnn APPEND_STRING PROPERTY LINK_FLAGS
+                   "-DEF:${ONNXRUNTIME_ROOT}/core/providers/qnn/symbols.def")
+    else()
+      message(FATAL_ERROR "onnxruntime_providers_qnn unknown platform, need to specify shared library exports for it")
+    endif()
+
+    # Set compile options
+    if(MSVC)
+      target_compile_options(onnxruntime_providers_qnn PUBLIC /wd4099 /wd4005)
+    else()
+      # ignore the warning unknown-pragmas on "pragma region"
+      target_compile_options(onnxruntime_providers_qnn PRIVATE "-Wno-unknown-pragmas")
+    endif()
+
+    set_target_properties(onnxruntime_providers_qnn PROPERTIES LINKER_LANGUAGE CXX)
+    set_target_properties(onnxruntime_providers_qnn PROPERTIES CXX_STANDARD_REQUIRED ON)
+    set_target_properties(onnxruntime_providers_qnn PROPERTIES FOLDER "ONNXRuntime")
+
+    install(TARGETS onnxruntime_providers_qnn
+            ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR})
   endif()
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 5b29d1093a..15a2862ced 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -169,9 +169,7 @@ if (onnxruntime_ENABLE_LAZY_TENSOR)
   endif()
 endif()
 
-target_link_libraries(onnxruntime_pybind11_state PRIVATE
-    onnxruntime_session
-    ${onnxruntime_libs}
+set(onnxruntime_pybind11_state_static_providers
     ${PROVIDERS_NNAPI}
     ${PROVIDERS_VSINPU}
     ${PROVIDERS_XNNPACK}
@@ -183,7 +181,16 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
     ${PROVIDERS_XNNPACK}
     ${PROVIDERS_WEBGPU}
     ${PROVIDERS_AZURE}
-    ${PROVIDERS_QNN}
+)
+
+if(onnxruntime_BUILD_QNN_EP_STATIC_LIB)  # static QNN EP is linked into onnxruntime_pybind11_state via this list
+  list(APPEND onnxruntime_pybind11_state_static_providers onnxruntime_providers_qnn)
+endif()
+
+target_link_libraries(onnxruntime_pybind11_state PRIVATE
+    onnxruntime_session
+    ${onnxruntime_libs}
+    ${onnxruntime_pybind11_state_static_providers}
     onnxruntime_optimizer
     onnxruntime_providers
     onnxruntime_util
@@ -1000,6 +1007,16 @@ if (onnxruntime_USE_COREML)
 endif()
 
 if (onnxruntime_USE_QNN)
+  if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+    add_custom_command(
+      TARGET onnxruntime_pybind11_state POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy
+        $<TARGET_FILE:onnxruntime_providers_qnn>
+        $<TARGET_FILE:onnxruntime_providers_shared>
+        $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
+    )
+  endif()
+
   add_custom_command(
     TARGET onnxruntime_pybind11_state POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 7c1b9ddc15..c727f4b7e3 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -619,16 +619,13 @@ if(onnxruntime_USE_ARMNN)
   list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_armnn)
 endif()
 
-set(ONNXRUNTIME_TEST_LIBS
-    onnxruntime_session
-    ${ONNXRUNTIME_INTEROP_TEST_LIBS}
-    ${onnxruntime_libs}
-    # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime
+set(ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS
+    # CUDA, ROCM, TENSORRT, MIGRAPHX, DNNL, and OpenVINO are dynamically loaded at runtime.
+    # QNN EP can be built as either a dynamic or a static library.
     ${PROVIDERS_NNAPI}
     ${PROVIDERS_VSINPU}
     ${PROVIDERS_JS}
     ${PROVIDERS_WEBGPU}
-    ${PROVIDERS_QNN}
     ${PROVIDERS_SNPE}
     ${PROVIDERS_RKNPU}
     ${PROVIDERS_DML}
@@ -637,6 +634,17 @@ set(ONNXRUNTIME_TEST_LIBS
     ${PROVIDERS_COREML}
     ${PROVIDERS_XNNPACK}
     ${PROVIDERS_AZURE}
+)
+
+if (onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+  list(APPEND ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS onnxruntime_providers_qnn)
+endif()
+
+set(ONNXRUNTIME_TEST_LIBS
+    onnxruntime_session
+    ${ONNXRUNTIME_INTEROP_TEST_LIBS}
+    ${onnxruntime_libs}
+    ${ONNXRUNTIME_TEST_STATIC_PROVIDER_LIBS}
     onnxruntime_optimizer
     onnxruntime_providers
     onnxruntime_util
@@ -700,7 +708,9 @@ if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_RED
   list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/*)
   list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_qnn)
   list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn)
-  list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_qnn)
+  if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB)
+    list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_shared)
+  endif()
 endif()
 
 if(onnxruntime_USE_SNPE)
diff --git a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
index b80debdde4..c28c79f1e7 100644
--- a/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
+++ b/java/src/main/java/ai/onnxruntime/OnnxRuntime.java
@@ -76,6 +76,9 @@ final class OnnxRuntime {
   /** The short name of the ONNX runtime TensorRT provider library */
   static final String ONNXRUNTIME_LIBRARY_TENSORRT_NAME = "onnxruntime_providers_tensorrt";
 
+  /** The short name of the ONNX runtime QNN provider library */
+  static final String ONNXRUNTIME_LIBRARY_QNN_NAME = "onnxruntime_providers_qnn";
+
   /** The OS & CPU architecture string */
   private static final String OS_ARCH_STR = initOsArch();
 
@@ -159,8 +162,11 @@ final class OnnxRuntime {
       // the ONNX Runtime native library will load it
       extractProviderLibrary(ONNXRUNTIME_LIBRARY_SHARED_NAME);
 
-      load(ONNXRUNTIME_LIBRARY_NAME);
+      if (!isAndroid()) {
+        load(ONNXRUNTIME_LIBRARY_NAME);
+      }
       load(ONNXRUNTIME_JNI_LIBRARY_NAME);
+
       ortApiHandle = initialiseAPIBase(ORT_API_VERSION_14);
       if (ortApiHandle == 0L) {
         throw new IllegalStateException(
@@ -252,6 +258,16 @@ final class OnnxRuntime {
     return extractProviderLibrary(ONNXRUNTIME_LIBRARY_TENSORRT_NAME);
   }
 
+  /**
+   * Extracts the QNN provider library from the classpath resources if present, or checks to see if
+   * the QNN provider library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}.
+   *
+   * @return True if the QNN provider library is ready for loading, false otherwise.
+   */
+  static boolean extractQNN() {
+    return extractProviderLibrary(ONNXRUNTIME_LIBRARY_QNN_NAME);
+  }
+
   /**
    * Extracts a shared provider library from the classpath resources if present, or checks to see if
    * that library is in the directory specified by {@link #ONNXRUNTIME_NATIVE_PATH}.
@@ -260,7 +276,7 @@ final class OnnxRuntime {
    * @return True if the library is ready for loading by ORT's native code, false otherwise.
    */
   static synchronized boolean extractProviderLibrary(String libraryName) {
-    // Android does not need to extract library and it has no shared provider library
+    // Android does not need to extract provider libraries.
     if (isAndroid()) {
       return false;
     }
@@ -312,7 +328,7 @@ final class OnnxRuntime {
   private static void load(String library) throws IOException {
     // On Android, we simply use System.loadLibrary
     if (isAndroid()) {
-      System.loadLibrary("onnxruntime4j_jni");
+      System.loadLibrary(library);
       return;
     }
 
diff --git a/java/src/main/java/ai/onnxruntime/OrtSession.java b/java/src/main/java/ai/onnxruntime/OrtSession.java
index 32dc9d9f84..bd988e2bb7 100644
--- a/java/src/main/java/ai/onnxruntime/OrtSession.java
+++ b/java/src/main/java/ai/onnxruntime/OrtSession.java
@@ -1320,6 +1320,10 @@ public class OrtSession implements AutoCloseable {
      */
     public void addQnn(Map<String, String> providerOptions) throws OrtException {
       String qnnProviderName = "QNN";
+
+      // QNN can either be built as a shared or static library. extractQNN() will extract the
+      // (lib)onnxruntime_providers_qnn(.so/.dll) from classpath resources if present.
+      OnnxRuntime.extractQNN();
       addExecutionProvider(qnnProviderName, providerOptions);
     }
 
diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc
index 950ac247a2..489cd19b11 100644
--- a/onnxruntime/core/platform/windows/logging/etw_sink.cc
+++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc
@@ -64,6 +64,10 @@ EtwRegistrationManager& EtwRegistrationManager::Instance() {
   return instance;
 }
 
+bool EtwRegistrationManager::SupportsETW() {
+  return true;
+}
+
 bool EtwRegistrationManager::IsEnabled() const {
   std::lock_guard<std::mutex> lock(provider_change_mutex_);
   return is_enabled_;
@@ -248,5 +252,19 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id,
 }
 }  // namespace logging
 }  // namespace onnxruntime
+#else
+// ETW is not supported on this platform, but a dummy EtwRegistrationManager is still defined
+// so that it can be used in the EP provider bridge.
+namespace onnxruntime {
+namespace logging {
+EtwRegistrationManager& EtwRegistrationManager::Instance() {
+  static EtwRegistrationManager instance;
+  return instance;
+}
 
+bool EtwRegistrationManager::SupportsETW() {
+  return false;
+}
+}  // namespace logging
+}  // namespace onnxruntime
 #endif  // ETW_TRACE_LOGGING_SUPPORTED
diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h
index 2a798a28f1..62b762886c 100644
--- a/onnxruntime/core/platform/windows/logging/etw_sink.h
+++ b/onnxruntime/core/platform/windows/logging/etw_sink.h
@@ -60,6 +60,9 @@ class EtwRegistrationManager {
   // Singleton instance access
   static EtwRegistrationManager& Instance();
 
+  // Returns true if ETW is supported at all.
+  static bool SupportsETW();
+
   // Check if ETW logging is enabled
   bool IsEnabled() const;
 
@@ -110,5 +113,33 @@ class EtwRegistrationManager {
 
 }  // namespace logging
 }  // namespace onnxruntime
+#else
+// ETW is not supported on this platform, but a dummy EtwRegistrationManager is still defined
+// so that it can be used in the EP provider bridge.
+#include "core/common/logging/severity.h"
 
+namespace onnxruntime {
+namespace logging {
+class EtwRegistrationManager {
+ public:
+  using EtwInternalCallback = std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level,
+                                                 ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword,
+                                                 PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
+
+  static EtwRegistrationManager& Instance();
+  static bool SupportsETW();
+  bool IsEnabled() const { return false; }
+  UCHAR Level() const { return 0; }
+  Severity MapLevelToSeverity() { return Severity::kFATAL; }
+  uint64_t Keyword() const { return 0; }
+  HRESULT Status() const { return 0; }
+  void RegisterInternalCallback(const EtwInternalCallback& callback) {}
+  void UnregisterInternalCallback(const EtwInternalCallback& callback) {}
+
+ private:
+  EtwRegistrationManager() = default;
+  ~EtwRegistrationManager() = default;
+};
+}  // namespace logging
+}  // namespace onnxruntime
 #endif  // ETW_TRACE_LOGGING_SUPPORTED
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
index 79674fd706..3df231e53e 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.cc
@@ -2,13 +2,15 @@
 // Licensed under the MIT License.
 
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
-#include "core/graph/constants.h"
-#include "core/providers/qnn/builder/qnn_model.h"
 
 #include <iostream>
 #include <fstream>
 #include <filesystem>
 
+#include "core/providers/qnn/ort_api.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
+#include "core/providers/qnn/builder/qnn_model.h"
+
 namespace onnxruntime {
 namespace qnn {
 
@@ -51,9 +53,9 @@ Status GetMainContextNode(const std::vector<IExecutionProvider::FusedNodeAndGrap
     // There is only one EPContext node in one filtered graph -- this is guaranteed by GetCapability
     const onnxruntime::GraphViewer& graph_viewer(fused_nodes_and_graphs[i].filtered_graph);
     ORT_RETURN_IF(graph_viewer.NumberOfNodes() != 1, "One filtered graph should has only one EPContext node!");
-    const auto& ep_context_node = graph_viewer.Nodes().begin();
-    ORT_RETURN_IF_NOT(EPCONTEXT_OP == ep_context_node->OpType(), "Should only filter in the EPContext node.");
-    NodeAttrHelper node_helper(*ep_context_node);
+    const Node& ep_context_node = *graph_viewer.Nodes().begin();
+    ORT_RETURN_IF_NOT(EPCONTEXT_OP == ep_context_node.OpType(), "Should only filter in the EPContext node.");
+    NodeAttrHelper node_helper(ep_context_node);
     int64_t is_main_context = node_helper.Get(MAIN_CONTEXT, static_cast<int64_t>(0));
     if (1 == is_main_context) {
       main_context_pos.push_back(static_cast<int>(i));
@@ -68,17 +70,16 @@ Status CreateNodeArgs(const std::vector<std::string>& names,
                       const std::unordered_map<std::string, OnnxTensorInfo>& tensor_info_table,
                       std::vector<NodeArg*>& node_args,
                       onnxruntime::Graph& graph) {
-  using namespace ONNX_NAMESPACE;
   for (size_t i = 0; i < names.size(); ++i) {
     std::string name = names[i];
     ORT_RETURN_IF(tensor_info_table.find(name) == tensor_info_table.end(), "Tensor name: ", name, " not found in tensor_info_table");
     const OnnxTensorInfo& tensor_info = tensor_info_table.at(name);
-    TypeProto tensor_type;
-    tensor_type.mutable_tensor_type()->set_elem_type(tensor_info.data_type_);
+    std::unique_ptr<ONNX_NAMESPACE::TypeProto> tensor_type = Factory<ONNX_NAMESPACE::TypeProto>::Create();
+    tensor_type->mutable_tensor_type()->set_elem_type(tensor_info.data_type_);
     for (size_t j = 0; j < tensor_info.shape_.size(); ++j) {
-      tensor_type.mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]);
+      tensor_type->mutable_tensor_type()->mutable_shape()->add_dim()->set_dim_value(tensor_info.shape_[j]);
     }
-    auto& input_arg = graph.GetOrCreateNodeArg(name, &tensor_type);
+    auto& input_arg = graph.GetOrCreateNodeArg(name, tensor_type.get());
     node_args.push_back(&input_arg);
   }
   return Status::OK();
@@ -161,8 +162,8 @@ Status TryGetMaxSpillFillSize(const std::vector<IExecutionProvider::FusedNodeAnd
     auto index = main_context_pos_list[i];
     const onnxruntime::GraphViewer& main_ctx_graph_viewer(fused_nodes_and_graphs[index].filtered_graph);
     ORT_RETURN_IF(main_ctx_graph_viewer.NumberOfNodes() != 1, "One filtered graph should has only one EPContext node!");
-    const auto& ep_context_node = main_ctx_graph_viewer.Nodes().begin();
-    NodeAttrHelper node_helper(*ep_context_node);
+    const Node& ep_context_node = *main_ctx_graph_viewer.Nodes().begin();
+    NodeAttrHelper node_helper(ep_context_node);
     int64_t max_size = node_helper.Get(MAX_SIZE, static_cast<int64_t>(0));
     if (max_size > max_spill_fill_size) {
       max_spill_fill_size = max_size;
diff --git a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h
index 92c5391b40..3dfa0ae210 100644
--- a/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h
+++ b/onnxruntime/core/providers/qnn/builder/onnx_ctx_model_helper.h
@@ -6,12 +6,8 @@
 #include <string>
 #include <vector>
 
-#include "qnn_def.h"
-#include "core/common/logging/logging.h"
-#include "core/graph/graph_viewer.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/graph/model.h"
-#include "core/framework/execution_provider.h"
+#include "core/providers/qnn/builder/qnn_def.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime {
 
diff --git a/onnxruntime/core/providers/qnn/builder/op_builder.h b/onnxruntime/core/providers/qnn/builder/op_builder.h
index 05398c3f22..0846275496 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/op_builder.h
@@ -3,9 +3,7 @@
 
 #pragma once
 
-#include "core/graph/graph_viewer.h"
-#include "core/framework/node_unit.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
index e411c2a6bf..3d66003fb2 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
@@ -5,8 +5,6 @@
 #include <unordered_map>
 #include <string>
 
-#include <core/graph/graph.h>
-
 #include "op_builder_factory.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
index c685fa065e..e3a6141c29 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/argmax_min_op_builder.cc
@@ -1,14 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
index ed70111087..cd1ee72e00 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc
@@ -2,15 +2,9 @@
 // Licensed under the MIT License.
 
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
+#include <utility>
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include <core/providers/common.h>
-
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
-#include "core/providers/cpu/tensor/transpose.h"
-#include "core/common/safeint.h"
-
 namespace onnxruntime {
 namespace qnn {
 
@@ -271,37 +265,189 @@ Status BaseOpBuilder::SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper&
   return Status::OK();
 }
 
-Status BaseOpBuilder::TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper,
-                                           const onnx::TensorProto& initializer,
-                                           const std::vector<size_t>& perm,
-                                           std::vector<uint8_t>& transposed_data) const {
-  const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer.data_type())->GetElementType();
-  const auto tensor_shape_dims = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
-  TensorShape tensor_shape{tensor_shape_dims};
-  AllocatorPtr cpu_allocator = std::make_shared<CPUAllocator>();
-  Tensor in_tensor = Tensor(tensor_dtype, tensor_shape, cpu_allocator);
+static Status GetTransposeStrides(const TensorShape& input_shape,  // shape before the transpose
+                                  gsl::span<const size_t> perm,  // must hold `rank` entries
+                                  gsl::span<size_t> input_strides,  // out: strides of input_shape
+                                  gsl::span<size_t> output_strides) {  // out: strides of the permuted shape
+  const size_t rank = input_shape.NumDimensions();
+  ORT_RETURN_IF_NOT(perm.size() == rank, "Expected perm size of ", rank);
+  ORT_RETURN_IF_NOT(input_strides.size() == rank, "Expected input_strides size of ", rank);
+  ORT_RETURN_IF_NOT(output_strides.size() == rank, "Expected output_strides size of ", rank);
+  std::vector<int64_t> output_shape_dims(rank);  // filled by PermuteShape below
+  ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape<int64_t, size_t>(input_shape.GetDims(), perm, output_shape_dims)));
+  const TensorShape output_shape = TensorShape::FromExistingBuffer(output_shape_dims);
 
-  auto rank = perm.size();
-  std::vector<int64_t> new_tensor_shape_dims;
-  std::vector<size_t> permutations;
-  new_tensor_shape_dims.reserve(rank);
-  permutations.reserve(rank);
-  for (int64_t p : perm) {
-    permutations.push_back(p);
-    new_tensor_shape_dims.push_back(tensor_shape_dims[p]);
+  for (size_t i = 0; i < rank; ++i) {  // input strides, counted in elements
+    int64_t stride = (i < rank - 1) ? input_shape.SizeFromDimension(i + 1) : 1;  // last axis has stride 1
+    ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides.");
+    input_strides[i] = static_cast<size_t>(stride);  // stride > 0 was checked above
   }
 
-  TensorShape new_tensor_shape(new_tensor_shape_dims);
-  Tensor out_tensor = Tensor(tensor_dtype, new_tensor_shape, cpu_allocator);
-  ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor(
-      Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), initializer, in_tensor));
-  ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutations, in_tensor, out_tensor));
-  onnx::TensorProto new_tensor_proto = onnxruntime::utils::TensorToTensorProto(out_tensor, "test");
-  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(new_tensor_proto, transposed_data));
+  for (size_t i = 0; i < rank; ++i) {  // same computation for the permuted shape
+    int64_t stride = (i < rank - 1) ? output_shape.SizeFromDimension(i + 1) : 1;  // last axis has stride 1
+    ORT_RETURN_IF_NOT(stride > 0, "Expected positive shape dims when computing strides.");
+    output_strides[i] = static_cast<size_t>(stride);  // stride > 0 was checked above
+  }
 
   return Status::OK();
 }
 
+// Internal function to transpose data of rank 5 with the given permutation. Elements are copied one at a time;
+// strides are in elements. Example: transpose input from either (N,C,H,W,D) or (C,N,H,W,D) to (H,W,D,C,N).
+static Status TransposeDataRank5(const TensorShape& input_shape,  // must be rank 5
+                                 gsl::span<const size_t> perm,  // must hold 5 entries
+                                 size_t elem_byte_size,  // bytes per element
+                                 gsl::span<const uint8_t> input_buffer,  // source bytes
+                                 gsl::span<uint8_t> output_buffer) {  // destination bytes
+  std::array<size_t, 5> input_strides = {};  // fixed size 5: GetTransposeStrides rejects other ranks
+  std::array<size_t, 5> output_strides = {};
+  ORT_RETURN_IF_ERROR(GetTransposeStrides(input_shape, perm, input_strides, output_strides));
+
+  std::vector<size_t> perm_inverse(perm.size());  // used below as: input axis -> output axis
+  ORT_RETURN_IF_ERROR(qnn::utils::InvertPerm<size_t>(perm, perm_inverse));
+
+  for (int64_t d0 = 0; d0 < input_shape[0]; ++d0) {  // visit every input coordinate (d0..d4)
+    for (int64_t d1 = 0; d1 < input_shape[1]; ++d1) {
+      for (int64_t d2 = 0; d2 < input_shape[2]; ++d2) {
+        for (int64_t d3 = 0; d3 < input_shape[3]; ++d3) {
+          for (int64_t d4 = 0; d4 < input_shape[4]; ++d4) {
+            const size_t src_elem_index = ((d0 * input_strides[0]) +  // element offset in the input
+                                           (d1 * input_strides[1]) +
+                                           (d2 * input_strides[2]) +
+                                           (d3 * input_strides[3]) +
+                                           (d4 * input_strides[4]));
+            const size_t dst_elem_index = ((d0 * output_strides[perm_inverse[0]]) +  // offset in the output
+                                           (d1 * output_strides[perm_inverse[1]]) +
+                                           (d2 * output_strides[perm_inverse[2]]) +
+                                           (d3 * output_strides[perm_inverse[3]]) +
+                                           (d4 * output_strides[perm_inverse[4]]));
+
+            const size_t src_byte_index = src_elem_index * elem_byte_size;  // element offset -> byte offset
+            const size_t dst_byte_index = dst_elem_index * elem_byte_size;
+            assert(src_byte_index < input_buffer.size());  // active only when asserts are compiled in
+            assert(dst_byte_index < output_buffer.size());
+
+            std::memcpy(&output_buffer[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size);
+          }
+        }
+      }
+    }
+  }
+
+  return Status::OK();
+}
+
+Status BaseOpBuilder::TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper,
+                                            std::vector<uint32_t>& data_shape,  // in: 2D shape; out: transposed
+                                            const onnx::TensorProto& initializer,  // tensor to transpose
+                                            std::vector<uint8_t>& transposed_data) const {  // out: result bytes
+  ORT_RETURN_IF_NOT(data_shape.size() == 2, "Expected shape of rank 2");
+
+  std::array<size_t, 2> perm = {1, 0};  // swap the two axes
+  std::vector<uint32_t> output_shape(data_shape.size());
+  ORT_RETURN_IF_ERROR((qnn::utils::PermuteShape<uint32_t, size_t>(data_shape, perm, output_shape)));
+
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type());
+  const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type);
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Can't get element byte size from given ONNX type");
+
+  std::vector<uint8_t> input_buffer;  // receives the initializer's unpacked bytes
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer));
+  transposed_data.resize(input_buffer.size());  // same byte count as the source
+
+  for (size_t row = 0; row < data_shape[0]; row++) {  // data_shape is still the input shape here
+    for (size_t col = 0; col < data_shape[1]; col++) {
+      const size_t src_elem_index = (row * data_shape[1] + col);  // (row, col) in the input
+      const size_t dst_elem_index = (col * output_shape[1] + row);  // (col, row) in the output
+      const size_t src_byte_index = src_elem_index * elem_byte_size;
+      const size_t dst_byte_index = dst_elem_index * elem_byte_size;
+      assert(src_byte_index < input_buffer.size());  // active only when asserts are compiled in
+      assert(dst_byte_index < transposed_data.size());
+
+      std::memcpy(&transposed_data[dst_byte_index], &input_buffer[src_byte_index], elem_byte_size);
+    }
+  }
+
+  data_shape = std::move(output_shape);  // Update parameter with final transposed shape
+  return Status::OK();
+}
+
+Status BaseOpBuilder::TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
+                                              const onnx::TensorProto& initializer,  // tensor to transpose
+                                              std::vector<uint8_t>& transposed_data,  // out: transposed bytes
+                                              bool is_3d) const {  // true: rank-5 input, false: rank-4
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type());
+  const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type);  // checked (!= 0) by the overload
+  std::vector<int64_t> input_shape = qnn::utils::GetInitializerShape<int64_t>(initializer);
+  std::vector<uint8_t> input_buffer;  // receives the initializer's unpacked bytes
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer));
+  transposed_data.resize(input_buffer.size());  // the overload requires equal buffer sizes
+  return TransposeFromNchwToHwcn(std::move(input_shape), elem_byte_size, input_buffer, transposed_data, is_3d);
+}
+
+Status BaseOpBuilder::TransposeFromNchwToHwcn(std::vector<int64_t>&& original_input_shape_dims,  // consumed
+                                              size_t elem_byte_size,  // bytes per element; must be non-zero
+                                              gsl::span<const uint8_t> input_buffer,  // source bytes
+                                              gsl::span<uint8_t> output_buffer,  // must match input_buffer's size
+                                              bool is_3d) const {  // true: expect rank 5, false: expect rank 4
+  std::vector<int64_t> input_shape_dims = std::move(original_input_shape_dims);  // owned here; may grow below
+  const size_t rank = input_shape_dims.size();
+  ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Only support input of rank 4 or 5 but got rank ",
+                    rank);
+  ORT_RETURN_IF_NOT(output_buffer.size() == input_buffer.size(),
+                    "Expected output buffer's size to equal the input buffer's size: ",
+                    output_buffer.size(), " != ", input_buffer.size());
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Invalid element byte size due to potentially unsupported type");
+
+  if (!is_3d) {
+    input_shape_dims.push_back(1);  // Make it 3D by making shape (N,C,H,W,1)
+  }
+
+  return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims),  // always rank 5 here
+                            nchw2hwcn_perm_3d,  // the rank-5 permutation is used for both cases
+                            elem_byte_size,
+                            input_buffer,
+                            output_buffer);
+}
+
+Status BaseOpBuilder::TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
+                                              const onnx::TensorProto& initializer,  // tensor to transpose
+                                              std::vector<uint8_t>& transposed_data,  // out: transposed bytes
+                                              bool is_3d) const {  // true: rank-5 input, false: rank-4
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer.data_type());
+  const size_t elem_byte_size = qnn::utils::GetElementSizeByType(onnx_type);  // checked (!= 0) by the overload
+  std::vector<int64_t> input_shape = qnn::utils::GetInitializerShape<int64_t>(initializer);
+  std::vector<uint8_t> input_buffer;  // receives the initializer's unpacked bytes
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(initializer, input_buffer));
+  transposed_data.resize(input_buffer.size());  // the overload requires equal buffer sizes
+  return TransposeFromCnhwToHwcn(std::move(input_shape), elem_byte_size, input_buffer, transposed_data, is_3d);
+}
+
+Status BaseOpBuilder::TransposeFromCnhwToHwcn(std::vector<int64_t>&& original_input_shape_dims,  // consumed
+                                              size_t elem_byte_size,  // bytes per element; must be non-zero
+                                              gsl::span<const uint8_t> input_buffer,  // source bytes
+                                              gsl::span<uint8_t> output_buffer,  // must match input_buffer's size
+                                              bool is_3d) const {  // true: expect rank 5, false: expect rank 4
+  std::vector<int64_t> input_shape_dims = std::move(original_input_shape_dims);  // owned here; may grow below
+  const size_t rank = input_shape_dims.size();
+  ORT_RETURN_IF_NOT((is_3d && rank == 5) || (!is_3d && rank == 4), "Only support input of rank 4 or 5 but got rank ",
+                    rank);
+  ORT_RETURN_IF_NOT(output_buffer.size() == input_buffer.size(),
+                    "Expected output buffer's size to equal the input buffer's size: ",
+                    output_buffer.size(), " != ", input_buffer.size());
+  ORT_RETURN_IF_NOT(elem_byte_size != 0, "Invalid element byte size due to potentially unsupported type");
+
+  if (!is_3d) {
+    input_shape_dims.push_back(1);  // Make it 3D by making shape (C,N,H,W,1)
+  }
+
+  return TransposeDataRank5(TensorShape::FromExistingBuffer(input_shape_dims),  // always rank 5 here
+                            cnhw2hwcn_perm_3d,  // the rank-5 permutation is used for both cases
+                            elem_byte_size,
+                            input_buffer,
+                            output_buffer);
+}
+
 Status BaseOpBuilder::ProcessAxisAttribute(const QnnModelWrapper& qnn_model_wrapper,
                                            const NodeUnit& node_unit,
                                            Qnn_Scalar_t& axis_qnn_scalar,
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index 055c0f6ccf..8e34b5d87c 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -3,11 +3,11 @@
 
 #pragma once
 
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/ort_api.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder.h"
 #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h"
-#include "core/framework/allocator.h"
 
 #include "QnnOpDef.h"
 
@@ -215,7 +215,8 @@ class BaseOpBuilder : public IOpBuilder {
   }
 
   // NCHW shape to channel last
-  Status NchwShapeToNhwc(const std::vector<uint32_t>& nchw_shape, std::vector<uint32_t>& nhwc_shape) const {
+  template <typename T>
+  Status NchwShapeToNhwc(gsl::span<const T> nchw_shape, gsl::span<T> nhwc_shape) const {
     ORT_RETURN_IF_NOT(nchw_shape.size() == 4, "shape should have 4 dimension NCHW.");
     nhwc_shape[0] = nchw_shape[0];
     nhwc_shape[1] = nchw_shape[2];
@@ -226,7 +227,8 @@ class BaseOpBuilder : public IOpBuilder {
   }
 
   // NCHW shape to HWCN shape, required for Conv weight
-  Status NchwShapeToHwcn(const std::vector<uint32_t>& nchw_shape, std::vector<uint32_t>& hwcn_shape) const {
+  template <typename T>
+  Status NchwShapeToHwcn(gsl::span<const T> nchw_shape, gsl::span<T> hwcn_shape) const {
     if (nchw_shape.size() == 4) {
       hwcn_shape[0] = nchw_shape[2];
       hwcn_shape[1] = nchw_shape[3];
@@ -246,7 +248,8 @@ class BaseOpBuilder : public IOpBuilder {
   }
 
   // CNHW shape to HWCN shape, required for Conv weight
-  Status CnhwShapeToHwcn(const std::vector<uint32_t>& cnhw_shape, std::vector<uint32_t>& hwcn_shape) const {
+  template <typename T>
+  Status CnhwShapeToHwcn(gsl::span<const T> cnhw_shape, gsl::span<T> hwcn_shape) const {
     if (cnhw_shape.size() == 4) {
       hwcn_shape[0] = cnhw_shape[2];
       hwcn_shape[1] = cnhw_shape[3];
@@ -264,37 +267,31 @@ class BaseOpBuilder : public IOpBuilder {
 
     return Status::OK();
   }
-  Status TransposeInitializer(const QnnModelWrapper& qnn_model_wrapper,
-                              const onnx::TensorProto& initializer,
-                              const std::vector<size_t>& perm,
-                              std::vector<uint8_t>& transposed_data) const;
 
   Status TransposeFromNchwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
                                  const onnx::TensorProto& initializer,
                                  std::vector<uint8_t>& transposed_data,
-                                 bool is_3d = false) const {
-    auto& perm = is_3d ? nchw2hwcn_perm_3d : nchw2hwcn_perm;
-    return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data);
-  }
+                                 bool is_3d = false) const;
+  Status TransposeFromNchwToHwcn(std::vector<int64_t>&& input_shape_dims,
+                                 size_t elem_byte_size,
+                                 gsl::span<const uint8_t> input_buffer,
+                                 gsl::span<uint8_t> output_buffer,
+                                 bool is_3d = false) const;
 
   Status TransposeFromCnhwToHwcn(const QnnModelWrapper& qnn_model_wrapper,
                                  const onnx::TensorProto& initializer,
                                  std::vector<uint8_t>& transposed_data,
-                                 bool is_3d = false) const {
-    auto& perm = is_3d ? cnhw2hwcn_perm_3d : cnhw2hwcn_perm;
-    return TransposeInitializer(qnn_model_wrapper, initializer, perm, transposed_data);
-  }
+                                 bool is_3d = false) const;
+  Status TransposeFromCnhwToHwcn(std::vector<int64_t>&& input_shape_dims,
+                                 size_t elem_byte_size,
+                                 gsl::span<const uint8_t> input_buffer,
+                                 gsl::span<uint8_t> output_buffer,
+                                 bool is_3d = false) const;
 
   Status TwoDimensionTranspose(const QnnModelWrapper& qnn_model_wrapper,
                                std::vector<uint32_t>& data_shape,
                                const onnx::TensorProto& initializer,
-                               std::vector<uint8_t>& transposed_data) const {
-    auto tmp = data_shape[0];
-    data_shape[0] = data_shape[1];
-    data_shape[1] = tmp;
-    std::vector<size_t> two_dim_trans_perm{1, 0};
-    return TransposeInitializer(qnn_model_wrapper, initializer, two_dim_trans_perm, transposed_data);
-  }
+                               std::vector<uint8_t>& transposed_data) const;
 
   // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end]
   void ReArranagePads(std::vector<uint32_t>& pads) const {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc
index 07abcf1c7b..14f50fa78c 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc
@@ -5,16 +5,11 @@
 #include <cmath>
 #include <utility>
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/float16.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 class BatchNormOpBuilder : public BaseOpBuilder {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc
index d3bdee0243..3139c05378 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cast_op_builder.cc
@@ -4,12 +4,11 @@
 #include <string>
 #include <vector>
 
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
index e5dc4d04af..23b3dfb063 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc
@@ -4,14 +4,11 @@
 #include <cassert>
 #include <limits>
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 class ClipOpBuilder : public BaseOpBuilder {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
index 12887f0fb7..0f92778252 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc
@@ -1,16 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
@@ -211,9 +206,9 @@ Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper,
 
     // Change shape to HWCN, it could be initializer or normal input
     if (conv_type == OnnxConvType::kConv) {
-      ORT_RETURN_IF_ERROR(NchwShapeToHwcn(input_info.shape, actual_shape));
+      ORT_RETURN_IF_ERROR(NchwShapeToHwcn<uint32_t>(input_info.shape, actual_shape));
     } else if (conv_type == OnnxConvType::kConvTranspose) {
-      ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(input_info.shape, actual_shape));
+      ORT_RETURN_IF_ERROR(CnhwShapeToHwcn<uint32_t>(input_info.shape, actual_shape));
     } else {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str());
     }
@@ -413,9 +408,9 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
 
     // Create the final shape after the weights are transposed to HWCN.
     if (conv_type == OnnxConvType::kConv) {
-      ORT_RETURN_IF_ERROR(NchwShapeToHwcn(shape_2d, final_shape));
+      ORT_RETURN_IF_ERROR(NchwShapeToHwcn<uint32_t>(shape_2d, final_shape));
     } else if (conv_type == OnnxConvType::kConvTranspose) {
-      ORT_RETURN_IF_ERROR(CnhwShapeToHwcn(shape_2d, final_shape));
+      ORT_RETURN_IF_ERROR(CnhwShapeToHwcn<uint32_t>(shape_2d, final_shape));
     } else {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str());
     }
@@ -434,16 +429,6 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
         return static_cast<int64_t>(dim);
       });
 
-      const TensorShape tensor_shape = TensorShape::FromExistingBuffer(shape_2d_int64);  // Does not own shape data.
-      const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(
-                                             input_info.initializer_tensor->data_type())
-                                             ->GetElementType();
-      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor, unpacked_tensor));
-
-      Tensor tensor_2d(tensor_dtype, tensor_shape, unpacked_tensor.data(), OrtMemoryInfo{});  // Does not own data.
-      ONNX_NAMESPACE::TensorProto reshaped_initializer = onnxruntime::utils::TensorToTensorProto(tensor_2d,
-                                                                                                 reshape_output);
-
       // The reshape (unsqueeze) may require us to shift the quant parameter's axis.
       if (input_info.quant_param.IsPerChannel()) {
         ORT_RETURN_IF_ERROR(input_info.quant_param.HandleUnsqueeze<uint32_t>(input_info.shape, shape_2d));
@@ -452,10 +437,21 @@ Status ConvOpBuilder::ProcessConv1DInputs(QnnModelWrapper& qnn_model_wrapper,
       //
       // Get transposed initializer bytes.
       //
+      std::vector<uint8_t> original_tensor_bytes;
+      ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*input_info.initializer_tensor,
+                                                                  original_tensor_bytes));
+      unpacked_tensor.resize(original_tensor_bytes.size());
+      const size_t elem_byte_size = qnn::utils::GetElementSizeByType(
+          static_cast<ONNX_NAMESPACE::TensorProto_DataType>(input_info.initializer_tensor->data_type()));
+      ORT_RETURN_IF(elem_byte_size == 0, "Can't get element byte size from given ONNX type for initializer ",
+                    input1_name.c_str());
+
       if (conv_type == OnnxConvType::kConv) {
-        ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor));
+        ORT_RETURN_IF_ERROR(TransposeFromNchwToHwcn(std::move(shape_2d_int64), elem_byte_size, original_tensor_bytes,
+                                                    unpacked_tensor, /*is_3d*/ false));
       } else if (conv_type == OnnxConvType::kConvTranspose) {
-        ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(qnn_model_wrapper, reshaped_initializer, unpacked_tensor));
+        ORT_RETURN_IF_ERROR(TransposeFromCnhwToHwcn(std::move(shape_2d_int64), elem_byte_size, original_tensor_bytes,
+                                                    unpacked_tensor, /*is_3d*/ false));
       } else {
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unexpected convolution op type: ", node_unit.OpType().c_str());
       }
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc
index 64f676aaa9..2bae345219 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc
@@ -1,14 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc
index 5549716751..d25ec3f333 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gather_op_builder.cc
@@ -2,14 +2,10 @@
 // Licensed under the MIT License.
 
 #include <cassert>
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc
index eeee26c177..76bc766d2b 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/gemm_op_builder.cc
@@ -1,14 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc
index 4b8d079c00..d77d9534bf 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/instance_norm_op_builder.cc
@@ -1,16 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-#include "onnx/defs/data_type_utils.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc
index d1a0e88686..fc92f42b37 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc
@@ -2,16 +2,10 @@
 // Licensed under the MIT License.
 
 #include <cassert>
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-#include "onnx/defs/data_type_utils.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc
index 2f66069b66..3c9bdf0e7f 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lrn_op_builder.cc
@@ -2,11 +2,9 @@
 // Licensed under the MIT License.
 
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-#include "onnx/defs/data_type_utils.h"
 
 #include "QnnOpDef.h"  // From QNN SDK: contains QNN constants (e.g., op names, param values).
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
index 850fd28758..5a158af8d5 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/matmul_op_builder.cc
@@ -1,13 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/common/safeint.h"
-#include "core/providers/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/providers/shared/utils/utils.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc
index 5fc6d42a8a..40e0ccdd4a 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc
@@ -1,15 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/providers/cpu/tensor/slice_helper.h"
-#include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-
-#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc
index ef1990ad8e..795886fa25 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc
@@ -1,16 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-#include "onnx/defs/data_type_utils.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc
index 77bc58bd6f..a98110bc96 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reduce_op_builder.cc
@@ -2,16 +2,14 @@
 // Licensed under the MIT License.
 
 #include <algorithm>
-#include <string>
 #include <array>
+#include <cstring>
+#include <set>
+#include <string>
 #include <vector>
 
-#include "core/common/safeint.h"
-#include "onnx/defs/data_type_utils.h"
-#include "core/providers/common.h"
-#include "core/framework/endian_utils.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
@@ -71,7 +69,7 @@ class ReduceOpBuilder : public BaseOpBuilder {
   using AxesQnnIntType = uint32_t;
 
   Status GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
-                    InlinedHashSet<AxesOnnxIntType>& axes_set) const;
+                    std::set<AxesOnnxIntType>& axes_set) const;
 
   // Maps an operator type to the opset in which "axes" became an input instead of an attribute.
   static const std::array<int, REDUCE_OP_TYPE_COUNT> opset_with_axes_as_input;
@@ -87,7 +85,7 @@ const std::array<int, REDUCE_OP_TYPE_COUNT> ReduceOpBuilder::opset_with_axes_as_
 };
 
 Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
-                                   InlinedHashSet<AxesOnnxIntType>& axes_set) const {
+                                   std::set<AxesOnnxIntType>& axes_set) const {
   ReduceOpType reduce_op_type = GetReduceOpType(node_unit.OpType());
   if (reduce_op_type == ReduceOpType::REDUCE_OP_TYPE_UNKNOWN) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unknown reduce operator ", node_unit.OpType());
@@ -146,10 +144,12 @@ Status ReduceOpBuilder::GetAxesSet(QnnModelWrapper& qnn_model_wrapper, const Nod
       auto src_span = gsl::make_span(axes_bytes.data(), axes_bytes.size());
       auto dst_span = gsl::make_span(reduce_axes.data(), reduce_axes.size());
 
-      // Copy initializer bytes (stored in little-endian order) to vector of int64_t.
-      // ReadLittleEndian returns a status error if the source and destination spans do not have
-      // matching byte sizes.
-      ORT_RETURN_IF_ERROR(onnxruntime::utils::ReadLittleEndian(src_span, dst_span));
+      // Copy initializer bytes (stored in little-endian order) to vector of int64_t.
+      // std::memcpy does no bounds checking, so first verify that the source and destination spans
+      // have matching byte sizes.
+      ORT_RETURN_IF_NOT(src_span.size_bytes() == dst_span.size_bytes(),
+                        "QNN EP: Size mismatch when copying Reduce axes initializer data.");
+      std::memcpy(dst_span.data(), src_span.data(), src_span.size_bytes());
     }
   }
 
@@ -218,7 +218,7 @@ Status ReduceOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
   //
   // Handle axes param.
   //
-  InlinedHashSet<AxesOnnxIntType> axes_set;
+  std::set<AxesOnnxIntType> axes_set;
   ORT_RETURN_IF_ERROR(GetAxesSet(qnn_model_wrapper, node_unit, axes_set));
   const size_t num_axes = axes_set.size();
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc
index b6f414da95..6fd67a72b6 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/reshape_op_builder.cc
@@ -1,15 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc
index c62fca88b6..5e173b7aff 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc
@@ -5,17 +5,10 @@
 #include <cassert>
 #include <unordered_map>
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
-#include "core/providers/qnn/builder/qnn_model_wrapper.h"
-#include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/providers/cpu/tensor/slice_helper.h"
-#include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
+#include "core/providers/qnn/builder/qnn_model_wrapper.h"
+#include "core/providers/qnn/builder/op_builder_factory.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
index cf87266754..48c637cd2e 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/simple_op_builder.cc
@@ -1,16 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-#include "core/util/qmath.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
@@ -170,15 +164,16 @@ Status ProcessAlphaAttributeAsInput(QnnModelWrapper& qnn_model_wrapper,
   // Check LeakyRelu input 0 to see if it's quantized tensor
   bool is_quantized_tensor = node_unit.Outputs()[0].quant_param.has_value();
   if (is_quantized_tensor) {
-    float scale;
-    uint8_t zero_point;
-    int64_t num_of_elements = 1;
-    concurrency::ThreadPool* thread_pool = nullptr;
-    GetQuantizationParameter(&tensor_data.alpha, num_of_elements, scale, zero_point, thread_pool);
-    unpacked_data.resize(1);
-    ParQuantizeLinearStd(&tensor_data.alpha, unpacked_data.data(), num_of_elements, scale, zero_point, thread_pool);
-    quantize_param = QnnQuantParamsWrapper(scale, static_cast<int32_t>(zero_point));
     qnn_data_type = QNN_DATATYPE_UFIXED_POINT_8;
+    std::array<float, 1> scales = {1.0f};
+    std::array<int32_t, 1> offsets = {0};
+    std::array<uint32_t, 1> shape = {1};
+    auto float_data = gsl::make_span<const float>(&tensor_data.alpha, 1);
+    ORT_RETURN_IF_ERROR(qnn::utils::GetDataQuantParams(float_data, shape, scales, offsets, qnn_data_type));
+
+    unpacked_data.resize(1);
+    ORT_RETURN_IF_ERROR(qnn::utils::QuantizeData(float_data, shape, scales, offsets, unpacked_data, qnn_data_type));
+    quantize_param = QnnQuantParamsWrapper(scales[0], static_cast<int32_t>(offsets[0]));
   } else {
     const auto& inputs = node_unit.Inputs();
     TensorInfo input_info = {};
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
index b033c8723e..fcc7d27c3a 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc
@@ -1,17 +1,12 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/cpu/tensor/slice_helper.h"
 
-#include "core/framework/tensorprotoutils.h"
-
-#include "base_op_builder.h"
-
 namespace onnxruntime {
 namespace qnn {
 
@@ -86,26 +81,22 @@ static Status GetInitializerInputData(const NodeUnitIODef& input, const QnnModel
   ORT_RETURN_IF_NOT(initializer_proto->has_data_type(), "Expected initializer ", input_name.c_str(),
                     " to have a proto data type.");
 
-  // Create empty Tensor.
-  const auto* dtype = DataTypeImpl::TensorTypeFromONNXEnum(initializer_proto->data_type())->GetElementType();
-  TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*initializer_proto);
-  Tensor tensor(dtype, shape, std::make_shared<CPUAllocator>());
-
-  // Deserialize initializer into Tensor.
-  ORT_RETURN_IF_ERROR(onnxruntime::utils::TensorProtoToTensor(
-      onnxruntime::Env::Default(), qnn_model_wrapper.GetGraphViewer().ModelPath(), *initializer_proto, tensor));
+  // Deserialize initializer into byte buffer
+  std::vector<uint8_t> initializer_bytes;
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*initializer_proto, initializer_bytes));
 
   Status status;
 
   // Copy Tensor of int32_t or int64_t elems into output (int64_ts).
-  if (tensor.IsDataType<int64_t>()) {
-    gsl::span<const int64_t> tensor_elems = tensor.DataAsSpan<int64_t>();
+  auto onnx_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(initializer_proto->data_type());
+  if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+    gsl::span<const int64_t> tensor_elems = ReinterpretAsSpan<int64_t, uint8_t>(initializer_bytes);
     output.insert(output.end(), tensor_elems.begin(), tensor_elems.end());
-  } else if (tensor.IsDataType<int32_t>()) {
-    gsl::span<const int32_t> tensor_elems = tensor.DataAsSpan<int32_t>();
+  } else if (onnx_type == ONNX_NAMESPACE::TensorProto_DataType_INT32) {
+    gsl::span<const int32_t> tensor_elems = ReinterpretAsSpan<int32_t, uint8_t>(initializer_bytes);
     output.insert(output.end(), tensor_elems.begin(), tensor_elems.end());
   } else {
-    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", DataTypeImpl::ToString(dtype),
+    status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Data type ", onnx_type,
                              " is not supported for Slice initializer input ", input.node_arg.Name().c_str());
   }
 
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc
index b62534bacf..7326523737 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/softmax_op_builder.cc
@@ -1,15 +1,10 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
-#include "core/framework/tensorprotoutils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
index ba5ad2cf03..1db9a8f1e3 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/split_op_builder.cc
@@ -1,16 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/cpu/tensor/slice_helper.h"
-#include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-
-#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc
index 851ca84dce..1d518c3ed5 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc
@@ -1,16 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/common.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/cpu/tensor/slice_helper.h"
-#include "core/providers/qnn/builder/op_builder_factory.h"
-#include "core/common/safeint.h"
-
-#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
index d22c081168..adaa13912a 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/topk.cc
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 #include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
-#include "core/framework/utils.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 namespace onnxruntime {
 namespace qnn {
 const int TOPK_MIN_INPUT = 2;
diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc
index a42d7312f0..bcd8a6d0f7 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/transpose_op_builder.cc
@@ -4,12 +4,11 @@
 #include <string>
 #include <vector>
 
+#include "core/providers/qnn/ort_api.h"
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
-#include "core/common/safeint.h"
-
-#include "base_op_builder.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 077e89a6c7..8df4e5bb3b 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -17,18 +17,14 @@
 #include "HTP/QnnHtpSystemContext.h"
 #include "Saver/QnnSaver.h"
 #include <gsl/gsl>
-#include "core/framework/endian_utils.h"
-#include "core/common/logging/capture.h"
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/qnn_allocator.h"
+#include "core/providers/qnn/qnn_telemetry.h"
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
 #include "core/providers/qnn/builder/qnn_configs_helper.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
-#ifdef _WIN32
-#include <winmeta.h>
-#include "core/platform/tracing.h"
-#endif
-
 // Flag to determine if Backend should do node validation for each opNode added
 #define DO_GRAPH_NODE_VALIDATIONS 1
 
@@ -262,12 +258,12 @@ void QnnLogging(const char* format,
   const auto data_type = ::onnxruntime::logging::DataType::SYSTEM;
 
   if (logger.OutputIsEnabled(severity, data_type)) {
-    ::onnxruntime::logging::Capture(logger,
-                                    severity,
-                                    ::onnxruntime::logging::Category::onnxruntime,
-                                    data_type,
-                                    ORT_WHERE)
-        .ProcessPrintf(format, argument_parameter);
+    auto log_capture = Factory<logging::Capture>::Create(logger,
+                                                         severity,
+                                                         logging::Category::onnxruntime,
+                                                         data_type,
+                                                         ORT_WHERE);
+    log_capture->ProcessPrintf(format, argument_parameter);
   }
 }
 
@@ -408,25 +404,25 @@ Status QnnBackendManager::CreateDevice() {
     // Set SoC Model. The *enum* Qnn_SocModel_t is deprecated and will not be updated in the future. Therefore,
     // must use the latest SDK documentation to get the SoC model of the latest HW.
     if (soc_model_ != QNN_SOC_MODEL_UNKNOWN) {
-      QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig();
-      custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC;
-      custom_config.socModel = soc_model_;
+      gsl::not_null<QnnHtpDevice_CustomConfig_t*> custom_config = device_configs_builder.PushCustomConfig();
+      custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC;
+      custom_config->socModel = soc_model_;
 
-      QnnDevice_Config_t& device_config = device_configs_builder.PushConfig();
-      device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
-      device_config.customConfig = &custom_config;
+      gsl::not_null<QnnDevice_Config_t*> device_config = device_configs_builder.PushConfig();
+      device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
+      device_config->customConfig = custom_config;
     }
 
     // Set the minimum HTP architecture. The driver will use ops that are compatible with this minimum architecture.
     if (htp_arch_ != QNN_HTP_DEVICE_ARCH_NONE) {
-      QnnHtpDevice_CustomConfig_t& custom_config = device_configs_builder.PushCustomConfig();
-      custom_config.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH;
-      custom_config.arch.arch = htp_arch_;
-      custom_config.arch.deviceId = device_id_;
+      gsl::not_null<QnnHtpDevice_CustomConfig_t*> custom_config = device_configs_builder.PushCustomConfig();
+      custom_config->option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH;
+      custom_config->arch.arch = htp_arch_;
+      custom_config->arch.deviceId = device_id_;
 
-      QnnDevice_Config_t& device_config = device_configs_builder.PushConfig();
-      device_config.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
-      device_config.customConfig = &custom_config;
+      gsl::not_null<QnnDevice_Config_t*> device_config = device_configs_builder.PushConfig();
+      device_config->option = QNN_DEVICE_CONFIG_OPTION_CUSTOM;
+      device_config->customConfig = custom_config;
     }
   }
 
@@ -1163,15 +1159,16 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() {
   }
 
   bool tracelogging_provider_ep_enabled = false;
-  const Env& env = Env::Default();
-  auto& provider = env.GetTelemetryProvider();
-  auto level = provider.Level();
+#ifdef _WIN32
+  auto& provider = QnnTelemetry::Instance();
   if (provider.IsEnabled()) {
+    auto level = provider.Level();
     auto keyword = provider.Keyword();
     if ((keyword & static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0 && level >= 5) {
       tracelogging_provider_ep_enabled = true;
     }
   }
+#endif  // defined(_WIN32)
 
   // ETW disabled previously, but enabled now
   if (ProfilingLevel::INVALID == profiling_level_etw_ && tracelogging_provider_ep_enabled) {
@@ -1389,18 +1386,8 @@ void QnnBackendManager::LogQnnProfileEventAsTraceLogging(
     const std::string& timingSource,
     const std::string& eventLevel,
     const char* eventIdentifier) {
-  TraceLoggingWrite(
-      telemetry_provider_handle,
-      "QNNProfilingEvent",
-      TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)),
-      TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE),
-      TraceLoggingValue(timestamp, "Timestamp"),
-      TraceLoggingString(message.c_str(), "Message"),
-      TraceLoggingString(qnnScalarValue.c_str(), "Value"),
-      TraceLoggingString(unit.c_str(), "Unit of Measurement"),
-      TraceLoggingString(timingSource.c_str(), "Timing Source"),
-      TraceLoggingString(eventLevel.c_str(), "Event Level"),
-      TraceLoggingString(eventIdentifier, "Event Identifier"));
+  QnnTelemetry& qnn_telemetry = QnnTelemetry::Instance();
+  qnn_telemetry.LogQnnProfileEvent(timestamp, message, qnnScalarValue, unit, timingSource, eventLevel, eventIdentifier);
 }
 #endif
 
@@ -1552,7 +1539,8 @@ void* QnnBackendManager::LoadLib(const char* file_name, int flags, std::string&
   auto file_path = std::filesystem::path(file_name);
   if (!file_path.is_absolute()) {
     // construct an absolute path from ORT runtime path + file_name and check whether it exists.
-    auto pathstring = Env::Default().GetRuntimePath() + ToPathString(file_name);
+    const Env& env = GetDefaultEnv();
+    auto pathstring = env.GetRuntimePath() + ToPathString(file_name);
     auto absolute_path = pathstring.c_str();
     if (std::filesystem::exists(std::filesystem::path(absolute_path))) {
       // load library from absolute path and search for dependencies there.
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
index 685e03f17c..4a69859a7e 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
@@ -22,9 +22,8 @@
 #include "QnnLog.h"
 #include "QnnTypes.h"
 #include "System/QnnSystemInterface.h"
-#include "core/common/status.h"
-#include "core/common/logging/logging.h"
-#include "core/common/path_string.h"
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_context_mem_handle_manager.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
index 9dd9bbaa08..b581cd9053 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_configs_helper.h
@@ -3,7 +3,9 @@
 
 #pragma once
 
-#include <core/common/inlined_containers_fwd.h>
+#include <gsl/gsl>
+#include <memory>
+#include <vector>
 
 namespace onnxruntime {
 namespace qnn {
@@ -49,9 +51,9 @@ class QnnConfigsBuilder {
    *
-   * \return A reference to a default CustomConfigType object.
+   * \return A non-null pointer to a default CustomConfigType object owned by this builder.
    */
-  CustomConfigType& PushCustomConfig() {
-    custom_configs_.push_back(custom_config_init_);
-    return custom_configs_.back();
+  gsl::not_null<CustomConfigType*> PushCustomConfig() {
+    custom_configs_.push_back(std::make_unique<CustomConfigType>(custom_config_init_));
+    return custom_configs_.back().get();
   }
 
   /**
@@ -60,15 +62,15 @@ class QnnConfigsBuilder {
    *
-   * \return A reference to a default BaseConfigType object.
+   * \return A non-null pointer to a default BaseConfigType object owned by this builder.
    */
-  BaseConfigType& PushConfig() {
-    configs_.push_back(base_config_init_);
-    BaseConfigType& config = configs_.back();
+  gsl::not_null<BaseConfigType*> PushConfig() {
+    configs_.push_back(std::make_unique<BaseConfigType>(base_config_init_));
+    BaseConfigType* config = configs_.back().get();
 
     // Add pointer to this new config to the list of config pointers.
     if (IsNullTerminated()) {
-      config_ptrs_.back() = &config;  // Replace last nullptr entry.
+      config_ptrs_.back() = config;  // Replace last nullptr entry.
     } else {
-      config_ptrs_.push_back(&config);
+      config_ptrs_.push_back(config);
     }
 
     return config;
@@ -81,9 +83,14 @@ class QnnConfigsBuilder {
 
   BaseConfigType base_config_init_;
   CustomConfigType custom_config_init_;
-  InlinedVector<CustomConfigType> custom_configs_;
-  InlinedVector<BaseConfigType> configs_;
-  InlinedVector<const BaseConfigType*> config_ptrs_;
+
+  // Store elements as unique_ptrs instead of by value because std::vector reallocation would change the
+  // location of elements in memory. BaseConfigType objects may contain pointers to CustomConfigType objects,
+  // so we need to make sure that pointers to these objects are stable in memory.
+  std::vector<std::unique_ptr<CustomConfigType>> custom_configs_;
+  std::vector<std::unique_ptr<BaseConfigType>> configs_;
+
+  std::vector<const BaseConfigType*> config_ptrs_;
 };
 
 }  // namespace qnn
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc
index 22bbc2d48e..4d868c6ab9 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.cc
@@ -5,7 +5,7 @@
 
 #include "HTP/QnnHtpMem.h"
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/qnn_allocator.h"
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h
index 397ea8bad6..0dd8a8466d 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_context_mem_handle_manager.h
@@ -9,10 +9,7 @@
 
 #include "QnnInterface.h"
 
-#include "core/common/common.h"
-#include "core/common/inlined_containers.h"
-#include "core/common/logging/logging.h"
-#include "core/common/status.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_def.h b/onnxruntime/core/providers/qnn/builder/qnn_def.h
index f0619eb218..148fa115d4 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_def.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_def.h
@@ -9,8 +9,7 @@
 #include <memory>
 #include <climits>
 #include <type_traits>
-#include "core/graph/basic_types.h"
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
index 5f8b7f35ee..a9ccb9cc15 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -7,15 +7,12 @@
 #include <gsl/gsl>
 #include "QnnOpDef.h"
 
-#include "core/framework/utils.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/qnn_allocator.h"
 #include "core/providers/qnn/shared_context.h"
-#include "core/providers/shared/utils/utils.h"
 
 namespace onnxruntime {
 namespace qnn {
@@ -104,7 +101,7 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer,
   // valid throughout the lifetime of the ModelBuilder
   std::vector<std::unique_ptr<NodeUnit>> node_unit_holder;
   std::unordered_map<const Node*, const NodeUnit*> node_unit_map;
-  std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger);
+  std::tie(node_unit_holder, node_unit_map) = GetQDQNodeUnits(graph_viewer, logger);
 
   // This name must be same with the EPContext node name
   const auto& graph_name = fused_node.Name();
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h
index 2f220e708c..3a2a080aa3 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -6,14 +6,11 @@
 #include <mutex>
 #include <vector>
 
-#include "core/common/status.h"
-#include "core/framework/node_unit.h"
-#include "core/graph/graph_viewer.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
 #include "core/providers/qnn/rpcmem_library.h"
-#include "core/session/onnxruntime_cxx_api.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
index 79f8f176a2..6bd12959af 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/providers/qnn/builder/qnn_model_wrapper.h"
+
 #include <algorithm>
 #include <cstdlib>
 #include <cstring>
@@ -8,10 +10,7 @@
 #include <utility>
 #include <vector>
 
-#include "qnn_model_wrapper.h"
-#include "core/common/safeint.h"
-#include "core/framework/tensorprotoutils.h"
-#include "core/providers/shared/utils/utils.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 
 namespace onnxruntime {
@@ -461,7 +460,7 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&
   ORT_RETURN_IF(iter == graph_initializers.end(), "Unable to find initializer for scale(s): ",
                 scale_name.c_str());
   gsl::not_null<const onnx::TensorProto*> scale_tensor_proto = iter->second;
-  TensorShape scale_shape = onnxruntime::utils::GetTensorShapeFromTensorProto(*scale_tensor_proto);
+  TensorShape scale_shape(qnn::utils::GetInitializerShape<int64_t>(*scale_tensor_proto));
 
   // Check the number of scale values to determine if the tensor is per-channel.
   // This is consistent with CPU EP's Quant/Dequant logic. We can't use the presence of an axis because even a
@@ -636,29 +635,13 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto&
 
   // If this is an int4, we need to unpack it because QNN treats int4 as a full int8.
   if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT4) {
-    TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
-    const size_t num_elems = shape.Size();
-    std::vector<uint8_t> packed_int4_bytes = std::move(unpacked_tensor);
-    unpacked_tensor = std::vector<uint8_t>(num_elems);
-
-    auto dst = gsl::make_span(reinterpret_cast<int8_t*>(unpacked_tensor.data()), unpacked_tensor.size());
-    auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size());
-    ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor<Int4x2> for QNN");
-
-    // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug.
-    // Docs explicitly state that masking off top 4 bits should not be required.
-    for (size_t i = 0; i < dst.size(); i++) {
-      dst[i] &= 0x0F;  // -3 (0b1111_1101) becomes 13 (0b0000_1101)
-    }
+    TensorShape shape(qnn::utils::GetInitializerShape<int64_t>(initializer));
+    const size_t num_int4_elems = shape.Size();
+    ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8<true>(num_int4_elems, unpacked_tensor));
   } else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) {
-    TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
-    const size_t num_elems = shape.Size();
-    std::vector<uint8_t> packed_int4_bytes = std::move(unpacked_tensor);
-    unpacked_tensor = std::vector<uint8_t>(num_elems);
-
-    auto dst = gsl::make_span(reinterpret_cast<uint8_t*>(unpacked_tensor.data()), unpacked_tensor.size());
-    auto src = gsl::make_span(reinterpret_cast<const UInt4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size());
-    ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor<UInt4x2> for QNN");
+    TensorShape shape(qnn::utils::GetInitializerShape<int64_t>(initializer));
+    const size_t num_uint4_elems = shape.Size();
+    ORT_RETURN_IF_ERROR(qnn::utils::UnpackInt4ToInt8<false>(num_uint4_elems, unpacked_tensor));
   }
 
   return Status::OK();
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
index d018ca12d6..203250204d 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h
@@ -7,13 +7,10 @@
 #include <string>
 #include <vector>
 
-#include "core/common/status.h"
 #include "QnnInterface.h"
 #include "qnn_def.h"
-#include "core/common/logging/logging.h"
-#include "core/framework/node_unit.h"
-#include "core/graph/graph_viewer.h"
-#include "core/providers/shared/utils/utils.h"
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_quant_params_wrapper.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h
index f9ef014113..276fbaae3b 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group.h
@@ -8,8 +8,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "core/common/logging/logging.h"
-#include "core/framework/node_unit.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc
index caf4725626..3af2fdd1f0 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.cc
@@ -6,9 +6,8 @@
 #include <limits>
 #include <optional>
 #include <utility>
-#include "core/graph/graph_utils.h"
-#include "core/framework/node_unit.h"
-#include "core/providers/shared/utils/utils.h"
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_node_group/utils.h"
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h
index 90fe44c3af..d3d552bc17 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h
@@ -7,8 +7,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "core/common/common.h"
-#include "core/framework/node_unit.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc
index 76b1726646..5094ad9672 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.cc
@@ -6,9 +6,8 @@
 #include <limits>
 #include <optional>
 #include <utility>
-#include "core/graph/graph_utils.h"
-#include "core/framework/node_unit.h"
-#include "core/providers/shared/utils/utils.h"
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h
index 3b67f13492..0a1b16d24f 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h
@@ -7,8 +7,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "core/common/common.h"
-#include "core/framework/node_unit.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc
index c398d1fae5..e947da1a60 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc
@@ -10,8 +10,7 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "core/graph/graph_utils.h"
-#include "core/framework/node_unit.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc
index 5548d7d37c..93b2fca296 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc
@@ -4,8 +4,7 @@
 #include <string_view>
 #include <unordered_map>
 
-#include "core/graph/graph_viewer.h"
-#include "core/framework/node_unit.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h
index 0d11d21906..c4cf4e8a20 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h
@@ -7,8 +7,7 @@
 #include <string_view>
 #include <unordered_map>
 
-#include "core/graph/graph_viewer.h"
-#include "core/framework/node_unit.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
index 23330f5616..01c15cf4be 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h
@@ -4,10 +4,10 @@
 #pragma once
 #include <memory>
 #include <vector>
-#include "QnnTypes.h"
-#include "core/common/common.h"
 #include <gsl/gsl>
-#include "core/framework/node_unit.h"
+
+#include "core/providers/qnn/ort_api.h"
+#include "QnnTypes.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index 08d3120260..56c3d3e803 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -5,14 +5,13 @@
 
 #include <algorithm>
 #include <functional>
+#include <limits>
 #include <map>
 #include <numeric>
 #include <string>
 #include <vector>
 
-#include "core/common/common.h"
-#include "core/common/safeint.h"
-#include "core/framework/data_types.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 
 namespace onnxruntime {
@@ -66,6 +65,42 @@ size_t GetElementSizeByType(ONNXTensorElementDataType elem_type) {
   return pos->second;
 }
 
+size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type) {
+  // Maps an ONNX tensor element type to the byte size of one stored element.
+  // Element types without a case below yield 0.
+  switch (onnx_type) {
+    // 4-bit types report the size of their Int4x2/UInt4x2 container type.
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:
+      return sizeof(Int4x2);
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:
+      return sizeof(UInt4x2);
+    // Signed and unsigned fixed-width integers of equal width have equal size.
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
+      return sizeof(uint8_t);
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16:
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT16:
+      return sizeof(uint16_t);
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT32:
+      return sizeof(uint32_t);
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT64:
+      return sizeof(uint64_t);
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:
+      return 2;  // 16-bit float: two bytes.
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
+      return sizeof(float);
+    case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:
+      return sizeof(double);
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:
+      return sizeof(bool);
+    default:
+      // No size is defined here for any other element type.
+      return 0;
+  }
+}
+
 size_t GetQnnTensorDataSizeInBytes(gsl::span<const uint32_t> shape, Qnn_DataType_t element_type) {
   ORT_ENFORCE(!shape.empty(), "Empty shape not allowed.");  // TODO can we just treat empty shape as a scalar?
   SafeInt<size_t> data_length = GetElementSizeByType(element_type);
@@ -507,39 +542,22 @@ bool OnnxDataTypeToQnnDataType(const int32_t onnx_data_type, Qnn_DataType_t& qnn
 }
 
 std::pair<float, float> CheckMinMax(float rmin, float rmax) {
-  // Ensure a minimum range of 0.0001 (required by QNN)
-  rmax = std::max(rmax, rmin + 0.0001f);
-
   // Both QNN and ORT require the range to include 0.0f
   rmin = std::min(rmin, 0.0f);
   rmax = std::max(rmax, 0.0f);
 
+  // Ensure a minimum range of 0.0001 (required by QNN)
+  rmax = std::max(rmax, rmin + 0.0001f);
+
   return std::make_pair(rmin, rmax);
 }
 
-template <typename T>
-Status GetQminQmax(const Qnn_DataType_t qnn_data_type,
-                   T& qmin,
-                   T& qmax) {
-  if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) {
-    qmin = static_cast<T>(std::numeric_limits<int8_t>::min());
-    qmax = static_cast<T>(std::numeric_limits<int8_t>::max());
-  } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) {
-    qmin = static_cast<T>(std::numeric_limits<uint8_t>::min());
-    qmax = static_cast<T>(std::numeric_limits<uint8_t>::max());
-  } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) {
-    qmin = static_cast<T>(std::numeric_limits<int16_t>::min());
-    qmax = static_cast<T>(std::numeric_limits<int16_t>::max());
-  } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
-    qmin = static_cast<T>(std::numeric_limits<uint16_t>::min());
-    qmax = static_cast<T>(std::numeric_limits<uint16_t>::max());
-  } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) {
-    qmin = static_cast<T>(std::numeric_limits<int32_t>::min());
-    qmax = static_cast<T>(std::numeric_limits<int32_t>::max());
-  } else {
-    ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type);
+inline float RoundHalfToEven(float input) {
+  if (!std::isfinite(input)) {
+    return input;
   }
-  return Status::OK();
+  // std::remainder returns x - n, where n is the integral value nearest to x. When |x - n| = 0.5, n is chosen to be even
+  return input - std::remainderf(input, 1.f);
 }
 
 Status GetQuantParams(float rmin,
@@ -555,20 +573,22 @@ Status GetQuantParams(float rmin,
     rmin = -abs_max;
   }
 
-  float qmin = 0.0f;
-  float qmax = 255.0f;
-  ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax));
+  double rmin_dbl = static_cast<double>(rmin);
+  double rmax_dbl = static_cast<double>(rmax);
+  double qmin = 0.0;
+  double qmax = 0.0;
+  ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax, symmetric));
 
-  scale = (rmax - rmin) / (qmax - qmin);
-  float initial_zero_point = 0.0f;
+  double scale_dbl = (rmax_dbl - rmin_dbl) / (qmax - qmin);
+  double initial_zero_point = 0.0;
   if (symmetric) {
-    initial_zero_point = std::round(rmin + rmax) / 2;
+    initial_zero_point = std::round(rmin_dbl + rmax_dbl) / 2;
   } else {
-    initial_zero_point = qmin - (rmin / scale);
+    initial_zero_point = qmin - (rmin_dbl / scale_dbl);
   }
-  zero_point = static_cast<int32_t>(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point)));
-  // To match QNN quantization definition
-  zero_point = 0 - zero_point;
+  zero_point = static_cast<int32_t>(RoundHalfToEven(static_cast<float>(Saturate(qmax, qmin, initial_zero_point))));
+  zero_point = -zero_point;  // Negate to match QNN quantization definition.
+  scale = static_cast<float>(scale_dbl);
   return Status::OK();
 }
 
@@ -590,6 +610,126 @@ Status Quantize(const double double_value,
   return Status::OK();
 }
 
+size_t ShapeSizeCalc(gsl::span<const uint32_t> shape, size_t start, size_t end) {
+  size_t product = 1;  // Returned as-is when the range [start, end) is empty.
+  while (start < end) {
+    product *= shape[start++];  // Multiply in each dimension of the half-open range.
+  }
+
+  return product;
+}
+
+Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const uint32_t> shape,
+                          /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets,
+                          Qnn_DataType_t data_type, bool symmetric, std::optional<int64_t> axis) {
+  // Computes one (scale, offset) pair for the whole tensor, or one pair per index along `axis`.
+  const size_t num_dims = shape.size();
+  const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims);
+  ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize");
+
+  // Per-tensor by default: a single channel spanning all of `data`.
+  size_t block_count = 1;
+  size_t broadcast_dim = 1;
+  size_t block_size = num_elems;
+  if (axis.has_value()) {  // Per-channel: treat data as [block_count, broadcast_dim, block_size].
+    size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis);
+    ORT_RETURN_IF_NOT(axis_no_neg < num_dims, "Quantization axis is out of range for the given shape");
+    block_count = ShapeSizeCalc(shape, 0, axis_no_neg);
+    broadcast_dim = shape[axis_no_neg];
+    block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims);
+  }
+
+  ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer");
+  ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer");
+
+  // A channel's range must cover its elements in every outer block. Iterating channel-first
+  // keeps a later block from overwriting parameters derived from earlier blocks.
+  for (size_t bd = 0; bd < broadcast_dim; bd++) {
+    float rmin = std::numeric_limits<float>::max();
+    float rmax = std::numeric_limits<float>::lowest();
+    for (size_t n = 0; n < block_count; n++) {
+      const size_t block_start = (n * broadcast_dim + bd) * block_size;
+      for (size_t j = 0; j < block_size; j++) {
+        rmin = std::min(rmin, data[block_start + j]);
+        rmax = std::max(rmax, data[block_start + j]);
+      }
+    }
+    scales[bd] = 1.0f;
+    offsets[bd] = 0;
+    ORT_RETURN_IF_ERROR(GetQuantParams(rmin, rmax, data_type, scales[bd], offsets[bd], symmetric));
+  }
+  return Status::OK();
+}
+
+Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t> shape,
+                    gsl::span<const float> scales, gsl::span<const int32_t> offsets,
+                    /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type,
+                    std::optional<int64_t> axis) {
+  // Quantizes `data` into `quant_bytes`, using scales[c]/offsets[c] for index c along `axis`
+  // (or the single scale/offset pair when no axis is given).
+  const size_t num_dims = shape.size();
+  const size_t num_elems = ShapeSizeCalc(shape, 0, num_dims);
+  ORT_RETURN_IF_NOT(num_elems == data.size(), "Shape mismatch with data to quantize");
+  size_t expected_num_quant_bytes = GetElementSizeByType(data_type) * data.size();
+  ORT_RETURN_IF_NOT(quant_bytes.size() == expected_num_quant_bytes,
+                    "Cannot quantize data because output buffer is not the correct size");
+
+  size_t block_count = 1;
+  size_t broadcast_dim = 1;
+  size_t block_size = num_elems;
+
+  if (axis.has_value()) {  // Per-channel: treat data as [block_count, broadcast_dim, block_size].
+    size_t axis_no_neg = *axis < 0 ? static_cast<size_t>(*axis) + num_dims : static_cast<size_t>(*axis);
+    ORT_RETURN_IF_NOT(axis_no_neg < num_dims, "Quantization axis is out of range for the given shape");
+    block_count = ShapeSizeCalc(shape, 0, axis_no_neg);
+    broadcast_dim = shape[axis_no_neg];
+    block_size = ShapeSizeCalc(shape, axis_no_neg + 1, num_dims);
+  }
+
+  ORT_RETURN_IF_NOT(scales.size() == broadcast_dim, "Unexpected size of scales output buffer");
+  ORT_RETURN_IF_NOT(offsets.size() == broadcast_dim, "Unexpected size of offsets output buffer");
+
+  size_t i = 0;  // Index of the first element of the current contiguous block.
+  for (size_t n = 0; n < block_count; n++) {
+    for (size_t bd = 0; bd < broadcast_dim; bd++) {
+      // subspan() remains valid for zero-length blocks, unlike taking the address of data[i].
+      const auto input_span = data.subspan(i, block_size);
+      switch (data_type) {
+        case QNN_DATATYPE_SFIXED_POINT_8: {
+          auto output_span = quant_bytes.subspan(i * sizeof(int8_t), sizeof(int8_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<int8_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_UFIXED_POINT_8: {
+          auto output_span = quant_bytes.subspan(i * sizeof(uint8_t), sizeof(uint8_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<uint8_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_SFIXED_POINT_16: {
+          auto output_span = quant_bytes.subspan(i * sizeof(int16_t), sizeof(int16_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<int16_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_UFIXED_POINT_16: {
+          auto output_span = quant_bytes.subspan(i * sizeof(uint16_t), sizeof(uint16_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<uint16_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        case QNN_DATATYPE_SFIXED_POINT_32: {
+          auto output_span = quant_bytes.subspan(i * sizeof(int32_t), sizeof(int32_t) * block_size);
+          ORT_RETURN_IF_ERROR(QuantizeData<int32_t>(input_span, scales[bd], offsets[bd], output_span));
+          break;
+        }
+        default:
+          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported quantization data type for QuantizeData");
+      }
+      i += block_size;
+    }
+  }
+  assert(i == data.size());
+
+  return Status::OK();
+}
+
 std::string_view GetQnnErrorMessage(const QNN_INTERFACE_VER_TYPE& qnn_interface, Qnn_ErrorHandle_t qnn_error_handle) {
   // From QNN SDK: The memory is statically owned and should not be freed by the caller.
   const char* error_msg = nullptr;
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
index 950f349c50..853debb61a 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h
@@ -2,11 +2,13 @@
 // Licensed under the MIT License.
 #pragma once
 
+#include <algorithm>
 #include <functional>
 #include <numeric>
 #include <string>
 #include <string_view>
 #include <type_traits>
+#include <utility>
 #include <vector>
 
 #include <gsl/gsl>
@@ -14,9 +16,7 @@
 #include "QnnInterface.h"
 #include "QnnTypes.h"
 
-#include "core/session/onnxruntime_cxx_api.h"
-#include "core/framework/node_unit.h"
-#include "core/util/qmath.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime {
 namespace qnn {
@@ -27,6 +27,8 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type);
 
 size_t GetElementSizeByType(ONNXTensorElementDataType elem_type);
 
+size_t GetElementSizeByType(ONNX_NAMESPACE::TensorProto_DataType onnx_type);
+
 size_t GetQnnTensorDataSizeInBytes(gsl::span<const uint32_t> shape, Qnn_DataType_t element_data_type);
 
 bool QnnTensorHasDynamicShape(const Qnn_Tensor_t& tensor);
@@ -83,7 +85,30 @@ static bool ArrayHasString(const std::array<std::string_view, N>& strings, std::
 std::pair<float, float> CheckMinMax(float rmin, float rmax);
 
 template <typename T>
-Status GetQminQmax(const Qnn_DataType_t qnn_data_type, T& qmin, T& qmax);
+Status GetQminQmax(const Qnn_DataType_t qnn_data_type,  // QNN fixed-point type whose integer range is wanted
+                   T& qmin,                             // out: lowest quantized value, cast to T
+                   T& qmax,                             // out: highest quantized value, cast to T
+                   bool symmetric = false) {            // true: signed types start at min() + 1
+  if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) {
+    qmin = static_cast<T>(std::numeric_limits<int8_t>::min() + static_cast<int8_t>(symmetric));
+    qmax = static_cast<T>(std::numeric_limits<int8_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_8) {
+    qmin = static_cast<T>(std::numeric_limits<uint8_t>::min());
+    qmax = static_cast<T>(std::numeric_limits<uint8_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_16) {
+    qmin = static_cast<T>(std::numeric_limits<int16_t>::min() + static_cast<int16_t>(symmetric));
+    qmax = static_cast<T>(std::numeric_limits<int16_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) {
+    qmin = static_cast<T>(std::numeric_limits<uint16_t>::min());
+    qmax = static_cast<T>(std::numeric_limits<uint16_t>::max());
+  } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) {
+    qmin = static_cast<T>(std::numeric_limits<int32_t>::min() + static_cast<int32_t>(symmetric));
+    qmax = static_cast<T>(std::numeric_limits<int32_t>::max());
+  } else {  // Unsupported type. Message arguments are streamed together, so no printf specifiers.
+    ORT_RETURN_IF(true, "Qnn Data Type: ", qnn_data_type, " not supported yet.");
+  }
+  return Status::OK();
+}
 
 template <typename T>
 inline T Saturate(const T qmax,
@@ -113,6 +138,104 @@ Status Quantize(const double double_value,
                 const Qnn_DataType_t qnn_data_type,
                 int& quant_value);
 
+size_t ShapeSizeCalc(gsl::span<const uint32_t> shape, size_t start, size_t end);
+
+// Computes the quantization parameters (scales and offsets) for the given data.
+// Supports both per-tensor and per-channel quantization. Must provide an axis argument
+// for per-channel quantization.
+// The offsets use the QNN convention where offset = -zero_point.
+Status GetDataQuantParams(gsl::span<const float> data, gsl::span<const uint32_t> shape,
+                          /*out*/ gsl::span<float> scales, /*out*/ gsl::span<int32_t> offsets,
+                          Qnn_DataType_t data_type, bool symmetric = false,
+                          std::optional<int64_t> axis = std::nullopt);
+
+// Quantizes the given float data using the provided quantization parameters (scales and offsets).
+// Supports both per-tensor and per-channel quantization. Must provide an axis argument
+// for per-channel quantization.
+// The provided offsets must use the QNN convention where offset = -zero_point.
+Status QuantizeData(gsl::span<const float> data, gsl::span<const uint32_t> shape,
+                    gsl::span<const float> scales, gsl::span<const int32_t> offsets,
+                    /*out*/ gsl::span<uint8_t> quant_bytes, Qnn_DataType_t data_type,
+                    std::optional<int64_t> axis = std::nullopt);
+
+// Quantizes (per-tensor) the given float data using the provided scale and offset.
+// The provided offset must use the QNN convention where offset = -zero_point.
+template <typename QuantType>
+inline Status QuantizeData(gsl::span<const float> data, float scale, int32_t offset,
+                           /*out*/ gsl::span<uint8_t> quant_bytes) {
+  const size_t num_elems = data.size();
+  const size_t expected_output_bytes = sizeof(QuantType) * num_elems;
+  ORT_RETURN_IF_NOT(expected_output_bytes == quant_bytes.size(),
+                    "Output buffer is not large enough to hold quantized bytes.");
+  // Scale, offset and clamp bounds are the same for every element, so widen them to double once.
+  const double scale_dbl = static_cast<double>(scale);
+  const double offset_dbl = static_cast<double>(offset);
+  const double lowest = static_cast<double>(std::numeric_limits<QuantType>::lowest());
+  const double highest = static_cast<double>(std::numeric_limits<QuantType>::max());
+
+  QuantType* quantized = reinterpret_cast<QuantType*>(quant_bytes.data());
+  for (size_t idx = 0; idx < num_elems; ++idx) {
+    // The offset follows the QNN convention (offset = -zero_point), hence the subtraction.
+    const double shifted = std::nearbyint(static_cast<double>(data[idx]) / scale_dbl) - offset_dbl;
+    quantized[idx] = static_cast<QuantType>(std::clamp(shifted, lowest, highest));
+  }
+  return Status::OK();
+}
+
+// Re-writes a buffer of packed 4-bit elements to a buffer of unpacked 8-bit elements.
+// QNN requires that 4-bit weights are unpacked to 8-bit.
+template <bool Signed>
+Status UnpackInt4ToInt8(size_t num_int4_elems, std::vector<uint8_t>& data_bytes) {
+  // Move the packed bytes aside, then resize `data_bytes` to one byte per 4-bit element so it
+  // can receive the unpacked values.
+  std::vector<uint8_t> packed_bytes = std::move(data_bytes);
+  data_bytes = std::vector<uint8_t>(num_int4_elems);
+
+  if constexpr (!Signed) {
+    // UINT4: no masking step is applied after unpacking.
+    auto dst = gsl::make_span(reinterpret_cast<uint8_t*>(data_bytes.data()), data_bytes.size());
+    auto src = gsl::make_span(reinterpret_cast<const UInt4x2*>(packed_bytes.data()), packed_bytes.size());
+    ORT_RETURN_IF_NOT(UInt4x2::Unpack(dst, src), "Failed to unpack Tensor<UInt4x2> for QNN");
+  } else {  // INT4
+    auto dst = gsl::make_span(reinterpret_cast<int8_t*>(data_bytes.data()), data_bytes.size());
+    auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_bytes.data()), packed_bytes.size());
+    ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor<Int4x2> for QNN");
+
+    // NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug.
+    // Docs explicitly state that masking off top 4 bits should not be required, but we have to do it.
+    for (int8_t& unpacked_value : dst) {
+      unpacked_value &= 0x0F;  // -3 (0b1111_1101) becomes 13 (0b0000_1101)
+    }
+  }
+
+  return Status::OK();
+}
+
+template <typename T>
+std::vector<T> GetInitializerShape(const ONNX_NAMESPACE::TensorProto& tensor_proto) {
+  const auto& dims = tensor_proto.dims();
+  std::vector<T> shape;
+  shape.reserve(static_cast<size_t>(dims.size()));
+  for (int d = 0; d < dims.size(); ++d) {
+    shape.push_back(static_cast<T>(dims[d]));  // Unchecked conversion of each dimension to T.
+  }
+  return shape;
+}
+
+template <typename T, typename P>
+Status PermuteShape(gsl::span<const T> input_shape, gsl::span<const P> perm, gsl::span<T> output_shape) {
+  // Writes output_shape[i] = input_shape[perm[i]]; all three spans must have the same length.
+  const size_t rank = input_shape.size();
+  ORT_RETURN_IF_NOT(rank == perm.size() && rank == output_shape.size(),
+                    "PermuteShape(): expect all arguments to have the same rank.");
+  for (size_t i = 0; i < rank; ++i) {
+    size_t p = static_cast<size_t>(perm[i]);  // A negative entry (signed P) wraps to a huge value.
+    ORT_RETURN_IF_NOT(p < rank, "PermuteShape(): perm entry is out of range for the input rank.");
+    output_shape[i] = input_shape[p];
+  }
+  return Status::OK();
+}
+
 // Gets error message associated with QNN error handle value.
 std::string_view GetQnnErrorMessage(const QNN_INTERFACE_VER_TYPE& qnn_interface,
                                     Qnn_ErrorHandle_t qnn_error_handle);
diff --git a/onnxruntime/core/providers/qnn/ort_api.cc b/onnxruntime/core/providers/qnn/ort_api.cc
new file mode 100644
index 0000000000..809593b409
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/ort_api.cc
@@ -0,0 +1,211 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/qnn/ort_api.h"
+
+#include <algorithm>
+#include <memory>
+#include <utility>
+
+namespace onnxruntime {
+
+#if BUILD_QNN_EP_STATIC_LIB
+static std::unique_ptr<std::vector<std::function<void()>>> s_run_on_unload_;
+
+void RunOnUnload(std::function<void()> function) {  // Queues `function` for ~OnUnload() to call.
+  static std::mutex mutex;  // Guards the lazy creation of, and appends to, the callback list.
+  std::lock_guard<std::mutex> guard(mutex);
+  if (!s_run_on_unload_) {  // First registration: create the list.
+    s_run_on_unload_ = std::make_unique<std::vector<std::function<void()>>>();
+  }
+  s_run_on_unload_->push_back(std::move(function));
+}
+
+struct OnUnload {
+  ~OnUnload() {  // Calls every callback queued via RunOnUnload(), in registration order.
+    if (!s_run_on_unload_)
+      return;  // Nothing was ever registered.
+
+    for (auto& function : *s_run_on_unload_)
+      function();
+
+    s_run_on_unload_.reset();  // Destroy the callback list once all callbacks have run.
+  }
+
+} g_on_unload;  // Namespace-scope instance; its destructor performs the calls above.
+#endif  // BUILD_QNN_EP_STATIC_LIB
+
+std::vector<const Node*> Graph__Nodes(const Graph& graph) {  // Returns the graph's nodes as const pointers.
+#if BUILD_QNN_EP_STATIC_LIB
+  std::vector<const Node*> nodes;
+  nodes.reserve(graph.NumberOfNodes());
+
+  for (const Node& node : graph.Nodes()) {  // Static build: Nodes() iterates references; store addresses.
+    nodes.push_back(&node);
+  }
+
+  return nodes;
+#else
+  return graph.Nodes();  // NOTE(review): presumably the shared-library Graph already returns this type; confirm.
+#endif
+}
+
+#if BUILD_QNN_EP_STATIC_LIB
+#define NODE_ATTR_ITER_VAL(iter) (iter)->second
+#else
+#define NODE_ATTR_ITER_VAL(iter) (iter)->second()
+#endif
+
+NodeAttrHelper::NodeAttrHelper(const onnxruntime::Node& node)
+    : node_attributes_(node.GetAttributes()) {}  // Reads attributes directly from `node`.
+
+NodeAttrHelper::NodeAttrHelper(const NodeUnit& node_unit)
+    : node_attributes_(node_unit.GetNode().GetAttributes()) {}  // Reads attributes from node_unit.GetNode().
+
+float NodeAttrHelper::Get(const std::string& key, float def_val) const {  // Float attribute `key`, else def_val.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).f();
+  }
+
+  return def_val;
+}
+
+int32_t NodeAttrHelper::Get(const std::string& key, int32_t def_val) const {  // Int attribute as int32_t, else def_val.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<int32_t>(NODE_ATTR_ITER_VAL(entry).i());  // NOTE(review): narrow<> presumably rejects overflow.
+  }
+
+  return def_val;
+}
+
+uint32_t NodeAttrHelper::Get(const std::string& key, uint32_t def_val) const {  // Int attribute as uint32_t.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return narrow<uint32_t>(NODE_ATTR_ITER_VAL(entry).i());  // NOTE(review): narrow<> presumably rejects overflow.
+  }
+
+  return def_val;
+}
+
+int64_t NodeAttrHelper::Get(const std::string& key, int64_t def_val) const {  // Int attribute `key`, else def_val.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).i();
+  }
+
+  return def_val;
+}
+
+const std::string& NodeAttrHelper::Get(const std::string& key, const std::string& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    return NODE_ATTR_ITER_VAL(entry).s();  // String attribute `key`, returned by reference.
+  }
+  // NOTE(review): returns a reference to def_val itself, so def_val must outlive any use of the result.
+  return def_val;
+}
+
+std::vector<int32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();  // Integer-list attribute, read as int64_t values.
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<int32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> int32_t { return narrow<int32_t>(val); });  // Convert each element via narrow<>.
+    return v;
+  }
+  // Attribute absent: return a copy of the caller's default.
+  return def_val;
+}
+
+std::vector<uint32_t> NodeAttrHelper::Get(const std::string& key, const std::vector<uint32_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();  // Integer-list attribute, read as int64_t values.
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    std::vector<uint32_t> v;
+    v.reserve(static_cast<size_t>(values.size()));
+    std::transform(cbegin, cend, std::back_inserter(v),
+                   [](int64_t val) -> uint32_t { return narrow<uint32_t>(val); });  // Convert each element via narrow<>.
+    return v;
+  }
+  // Attribute absent: return a copy of the caller's default.
+  return def_val;
+}
+
+std::vector<int64_t> NodeAttrHelper::Get(const std::string& key, const std::vector<int64_t>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    return std::vector<int64_t>{cbegin, cend};  // Copies the pointer range [cbegin, cend).
+  }
+  // Attribute absent: return a copy of the caller's default.
+  return def_val;
+}
+
+std::vector<float> NodeAttrHelper::Get(const std::string& key, const std::vector<float>& def_val) const {
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    return std::vector<float>{cbegin, cend};  // Copies the pointer range [cbegin, cend).
+  }
+  // Attribute absent: return a copy of the caller's default.
+  return def_val;
+}
+
+std::optional<float> NodeAttrHelper::GetFloat(const std::string& key) const {
+  std::optional<float> result;  // Stays empty when the attribute is absent.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).f();
+  }
+
+  return result;
+}
+
+std::optional<int64_t> NodeAttrHelper::GetInt64(const std::string& key) const {
+  std::optional<int64_t> result;  // Stays empty when the attribute is absent.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).i();
+  }
+
+  return result;
+}
+
+std::optional<std::vector<float>> NodeAttrHelper::GetFloats(const std::string& key) const {
+  std::optional<std::vector<float>> result;  // Stays empty when the attribute is absent.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).floats();
+    const float* cbegin = values.data();
+    const float* cend = values.data() + values.size();
+    result = std::vector<float>(cbegin, cend);  // Copies the pointer range [cbegin, cend).
+  }
+
+  return result;
+}
+
+std::optional<std::vector<int64_t>> NodeAttrHelper::GetInt64s(const std::string& key) const {
+  std::optional<std::vector<int64_t>> result;  // Stays empty when the attribute is absent.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    const auto& values = NODE_ATTR_ITER_VAL(entry).ints();
+    const int64_t* cbegin = values.data();
+    const int64_t* cend = values.data() + values.size();
+    result = std::vector<int64_t>(cbegin, cend);  // Copies the pointer range [cbegin, cend).
+  }
+
+  return result;
+}
+
+std::optional<std::string> NodeAttrHelper::GetString(const std::string& key) const {
+  std::optional<std::string> result;  // Stays empty when the attribute is absent.
+  if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) {
+    result = NODE_ATTR_ITER_VAL(entry).s();
+  }
+
+  return result;
+}
+
+bool NodeAttrHelper::HasAttr(const std::string& key) const {  // True when an attribute named `key` exists.
+  return node_attributes_.find(key) != node_attributes_.end();
+}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h
new file mode 100644
index 0000000000..030ebbb54c
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/ort_api.h
@@ -0,0 +1,178 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License
+
+#pragma once
+
+// This compilation unit (ort_api.h/.cc) encapsulates the interface between the EP and ORT in a manner
+// that allows QNN EP to be built either as a static library or a dynamic shared library.
+// The preprocessor macro `BUILD_QNN_EP_STATIC_LIB` is defined and set to 1 if QNN EP
+// is built as a static library.
+
+#if BUILD_QNN_EP_STATIC_LIB
+// Includes when building QNN EP statically
+#ifdef _WIN32
+#include <Windows.h>
+#include <winmeta.h>
+#include "core/platform/tracing.h"
+#include "core/platform/windows/logging/etw_sink.h"
+#endif
+
+#include "onnx/defs/data_type_utils.h"
+#include "core/common/common.h"
+#include "core/common/status.h"
+#include "core/common/safeint.h"
+#include "core/common/logging/logging.h"
+#include "core/common/logging/capture.h"
+#include "core/common/path_string.h"
+#include "core/platform/env.h"
+#include "core/framework/data_types.h"
+#include "core/framework/float16.h"
+#include "core/framework/run_options.h"
+#include "core/framework/execution_provider.h"
+#include "core/framework/model_metadef_id_generator.h"
+#include "core/framework/compute_capability.h"
+#include "core/framework/tensor_shape.h"
+#include "core/framework/node_unit.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/framework/utils.h"
+#include "core/graph/constants.h"
+#include "core/graph/basic_types.h"
+#include "core/graph/model.h"
+#include "core/graph/graph_viewer.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
+#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
+#include "core/providers/common.h"
+#include "core/providers/partitioning_utils.h"
+#include "core/session/onnxruntime_cxx_api.h"
+#else
+// Includes when building QNN EP as a shared library
+#include "core/providers/shared_library/provider_api.h"
+#define ORT_API_MANUAL_INIT
+#include "core/session/onnxruntime_cxx_api.h"
+#endif
+
+#include "core/common/inlined_containers.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "core/session/onnxruntime_run_options_config_keys.h"
+
+#include <memory>
+#include <vector>
+
+namespace onnxruntime {
+#if BUILD_QNN_EP_STATIC_LIB
+using Node_EdgeEnd = Node::EdgeEnd;
+#endif
+
+#if BUILD_QNN_EP_STATIC_LIB
+void RunOnUnload(std::function<void()> function);
+inline const Env& GetDefaultEnv() { return Env::Default(); }
+#endif
+
+inline void InitOrtCppApi() {
+#if BUILD_QNN_EP_STATIC_LIB
+  // Static build: nothing to do. Including "onnxruntime_cxx_api.h" normally initializes the global api_ object.
+#else
+  // Shared library build: call the util function in the provider bridge that initializes the global api_ object.
+  InitProviderOrtApi();
+#endif
+}
+
+/// <summary>
+/// Creates an onnxruntime or onnx object. Works for both static and shared library builds of QNN EP.
+/// <!-- Example: auto model = Factory<Model>::Create(/* args ... */); -->
+/// Example: auto model = Factory&lt;Model&gt;::Create(/* args ... */);
+/// </summary>
+/// <typeparam name="T">Type of the object to create</typeparam>
+template <typename T>
+struct Factory {
+  template <typename... Params>
+  static inline std::unique_ptr<T> Create(Params&&... params) {
+#if BUILD_QNN_EP_STATIC_LIB
+    return std::make_unique<T>(std::forward<Params>(params)...);  // static build: construct T directly
+#else
+    return T::Create(std::forward<Params>(params)...);  // shared library build: forward to T's static Create()
+#endif
+  }
+};
+
+inline const ConfigOptions& RunOptions__GetConfigOptions(const RunOptions& run_options) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return run_options.config_options;  // static build: read the member directly
+#else
+  return run_options.GetConfigOptions();  // shared library build: go through the accessor
+#endif
+}
+
+inline std::unique_ptr<IndexedSubGraph>& ComputeCapability__SubGraph(ComputeCapability& compute_capability) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return compute_capability.sub_graph;  // static build: expose the member directly
+#else
+  return compute_capability.SubGraph();  // shared library build: go through the accessor
+#endif
+}
+
+inline std::vector<NodeIndex>& IndexedSubGraph__Nodes(IndexedSubGraph& indexed_sub_graph) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return indexed_sub_graph.nodes;  // static build: expose the member directly
+#else
+  return indexed_sub_graph.Nodes();  // shared library build: go through the accessor
+#endif
+}
+
+std::vector<const Node*> Graph__Nodes(const Graph& graph);
+
+inline std::pair<std::vector<std::unique_ptr<NodeUnit>>, std::unordered_map<const Node*, const NodeUnit*>>
+GetQDQNodeUnits(const GraphViewer& graph_viewer, const logging::Logger& logger) {
+#if BUILD_QNN_EP_STATIC_LIB
+  return QDQ::GetAllNodeUnits(graph_viewer, logger);  // static build: the viewer is passed by reference
+#else
+  return QDQ::GetAllNodeUnits(&graph_viewer, logger);  // shared library build: the viewer is passed by pointer
+#endif
+}
+
+/**
+ * Wrapping onnxruntime::Node for retrieving attribute values
+ */
+class NodeAttrHelper {
+ public:
+  explicit NodeAttrHelper(const Node& node);
+
+  // Get the attributes from the target node of the node_unit
+  explicit NodeAttrHelper(const NodeUnit& node_unit);
+
+  /*
+   * Get with default
+   */
+  float Get(const std::string& key, float def_val) const;
+  std::vector<float> Get(const std::string& key, const std::vector<float>& def_val) const;
+
+  int64_t Get(const std::string& key, int64_t def_val) const;
+  std::vector<int64_t> Get(const std::string& key, const std::vector<int64_t>& def_val) const;
+
+  const std::string& Get(const std::string& key, const std::string& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to int32_t
+  int32_t Get(const std::string& key, int32_t def_val) const;
+  std::vector<int32_t> Get(const std::string& key, const std::vector<int32_t>& def_val) const;
+
+  // Convert the i() or ints() of the attribute from int64_t to uint32_t
+  uint32_t Get(const std::string& key, uint32_t def_val) const;
+  std::vector<uint32_t> Get(const std::string& key, const std::vector<uint32_t>& def_val) const;
+
+  /*
+   * Get without default. Each returns std::nullopt when the node has no attribute named `key`.
+   */
+  std::optional<float> GetFloat(const std::string& key) const;
+  std::optional<std::vector<float>> GetFloats(const std::string& key) const;
+
+  std::optional<int64_t> GetInt64(const std::string& key) const;
+  std::optional<std::vector<int64_t>> GetInt64s(const std::string& key) const;
+
+  std::optional<std::string> GetString(const std::string& key) const;
+
+  bool HasAttr(const std::string& key) const;  // true when an attribute named `key` exists
+
+ private:
+  const NodeAttributes& node_attributes_;  // non-owning reference; the attributes must outlive this helper
+};
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.cc b/onnxruntime/core/providers/qnn/qnn_allocator.cc
index 68dac68275..1fb8742f72 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.cc
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.cc
@@ -7,9 +7,7 @@
 #include <cstddef>
 #include <algorithm>
 
-#include "core/common/common.h"
-#include "core/common/safeint.h"
-#include "core/mlas/inc/mlas.h"  // for MlasGetPreferredBufferAlignment()
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
@@ -52,7 +50,8 @@ struct AllocationHeader {
 };
 
 size_t AllocationAlignment() {
-  return std::max(alignof(AllocationHeader), MlasGetPreferredBufferAlignment());
+  constexpr size_t min_allocation_alignment = 64;  // Equal to MlasGetPreferredBufferAlignment()
+  return std::max(alignof(AllocationHeader), min_allocation_alignment);
 }
 
 size_t DivRoundUp(size_t a, size_t b) {  // TODO is there already a helper function somewhere for this?
diff --git a/onnxruntime/core/providers/qnn/qnn_allocator.h b/onnxruntime/core/providers/qnn/qnn_allocator.h
index f642368697..e64f38f494 100644
--- a/onnxruntime/core/providers/qnn/qnn_allocator.h
+++ b/onnxruntime/core/providers/qnn/qnn_allocator.h
@@ -6,11 +6,7 @@
 #include <memory>
 #include <mutex>
 
-#include "core/common/common.h"
-#include "core/common/inlined_containers.h"
-#include "core/common/logging/logging.h"
-#include "core/common/status.h"
-#include "core/framework/allocator.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/rpcmem_library.h"
 
 namespace onnxruntime::qnn {
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index e9d6884b8c..b1555b6050 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -6,60 +6,22 @@
 #include <filesystem>
 #include <unordered_set>
 
-#include "core/framework/compute_capability.h"
-#include "core/framework/kernel_registry.h"
-#include "core/framework/run_options.h"
-#include "core/graph/graph_viewer.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
-#include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
-#include "core/platform/env.h"
-#include "core/providers/common.h"
-#include "core/providers/partitioning_utils.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
 #include "core/providers/qnn/builder/op_builder_factory.h"
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_node_group.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
 #include "core/providers/qnn/qnn_allocator.h"
+#include "core/providers/qnn/qnn_telemetry.h"
 #include "core/providers/qnn/rpcmem_library.h"
 #include "core/providers/qnn/shared_context.h"
-#include "core/session/onnxruntime_cxx_api.h"
-#include "core/session/onnxruntime_run_options_config_keys.h"
-#include "core/session/onnxruntime_session_options_config_keys.h"
-
-#ifdef _WIN32
-#include <Windows.h>
-#include "core/platform/windows/logging/etw_sink.h"
-#endif
 
 namespace onnxruntime {
 
 constexpr const char* QNN = "QNN";
 
-static std::unique_ptr<std::vector<std::function<void()>>> s_run_on_unload_;
-
-void RunOnUnload(std::function<void()> function) {
-  static std::mutex mutex;
-  std::lock_guard<std::mutex> guard(mutex);
-  if (!s_run_on_unload_) {
-    s_run_on_unload_ = std::make_unique<std::vector<std::function<void()>>>();
-  }
-  s_run_on_unload_->push_back(std::move(function));
-}
-
-struct OnUnload {
-  ~OnUnload() {
-    if (!s_run_on_unload_)
-      return;
-
-    for (auto& function : *s_run_on_unload_)
-      function();
-
-    s_run_on_unload_.reset();
-  }
-
-} g_on_unload;
-
 static void ParseProfilingLevel(std::string profiling_level_string,
                                 qnn::ProfilingLevel& profiling_level) {
   std::transform(profiling_level_string.begin(),
@@ -196,17 +158,20 @@ qnn::ProfilingLevel QNNExecutionProvider::GetProfilingLevelFromETWLevel(unsigned
 }
 
 QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_options_map,
-                                           const SessionOptions* session_options)
+                                           const ConfigOptions* config_options)
     : IExecutionProvider{onnxruntime::kQnnExecutionProvider} {
-  if (session_options) {
-    disable_cpu_ep_fallback_ = session_options->config_options.GetConfigOrDefault(
+  InitOrtCppApi();
+  metadef_id_generator_ = Factory<ModelMetadefIdGenerator>::Create();
+
+  if (config_options) {
+    disable_cpu_ep_fallback_ = config_options->GetConfigOrDefault(
                                    kOrtSessionOptionsDisableCPUEPFallback, "0") == "1";
 
-    context_cache_enabled_ = session_options->config_options.GetConfigOrDefault(
+    context_cache_enabled_ = config_options->GetConfigOrDefault(
                                  kOrtSessionOptionEpContextEnable, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "Context cache enable: " << context_cache_enabled_;
 
-    std::string embed_mode = session_options->config_options.GetConfigOrDefault(
+    std::string embed_mode = config_options->GetConfigOrDefault(
         kOrtSessionOptionEpContextEmbedMode, "0");
     if ("1" == embed_mode) {
       qnn_context_embed_mode_ = true;
@@ -217,18 +182,18 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     }
     LOGS_DEFAULT(VERBOSE) << "User specified context cache embed mode: " << qnn_context_embed_mode_;
 
-    context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+    context_cache_path_cfg_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
     LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_;
 
     // For the case that workaround QNN context PD memory limit, user need split the model into pieces and
     // generate the QNN context model separately.
     // It could happen that the generated EPContext node in separate graph has same node name.
     // User can set this context_node_name_prefix for each split pieces to avoid that happens.
-    context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, "");
+    context_node_name_prefix_ = config_options->GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, "");
     LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_;
 
     share_ep_contexts_ =
-        session_options->config_options.GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
+        config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_;
   }
 
@@ -249,8 +214,9 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
   // separate out the profiling level for ETW in case it gets disabled later when we extract the events
   // set to invalid to indicate that ETW is no enabled when we setup QNN
   qnn::ProfilingLevel profiling_level_etw = qnn::ProfilingLevel::INVALID;
-  const Env& env = Env::Default();
-  auto& provider = env.GetTelemetryProvider();
+
+#ifdef _WIN32
+  auto& provider = qnn::QnnTelemetry::Instance();
   if (provider.IsEnabled()) {
     auto level = provider.Level();
     auto keyword = provider.Keyword();
@@ -260,6 +226,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
       }
     }
   }
+#endif  // defined(_WIN32)
 
   // In case ETW gets disabled later
   auto profiling_level_pos = provider_options_map.find(PROFILING_LEVEL);
@@ -412,47 +379,53 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
                                    soc_model,
                                    enable_htp_weight_sharing});
 
-#ifdef _WIN32
-  auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance();
-  // Register callback for ETW capture state (rundown)
-  callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback(
-      [&etwRegistrationManager, this](
-          LPCGUID SourceId,
-          ULONG IsEnabled,
-          UCHAR Level,
-          ULONGLONG MatchAnyKeyword,
-          ULONGLONG MatchAllKeyword,
-          PEVENT_FILTER_DESCRIPTOR FilterData,
-          PVOID CallbackContext) {
-        ORT_UNUSED_PARAMETER(SourceId);
-        ORT_UNUSED_PARAMETER(MatchAnyKeyword);
-        ORT_UNUSED_PARAMETER(MatchAllKeyword);
-        ORT_UNUSED_PARAMETER(FilterData);
-        ORT_UNUSED_PARAMETER(CallbackContext);
+#if defined(_WIN32)
+  if (onnxruntime::logging::EtwRegistrationManager::SupportsETW()) {
+    auto& etwRegistrationManager = logging::EtwRegistrationManager::Instance();
+    // Register callback for ETW capture state (rundown)
+    callback_ETWSink_provider_ = onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback(
+        [&etwRegistrationManager, this](
+            LPCGUID SourceId,
+            ULONG IsEnabled,
+            UCHAR Level,
+            ULONGLONG MatchAnyKeyword,
+            ULONGLONG MatchAllKeyword,
+            PEVENT_FILTER_DESCRIPTOR FilterData,
+            PVOID CallbackContext) {
+          ORT_UNUSED_PARAMETER(SourceId);
+          ORT_UNUSED_PARAMETER(MatchAnyKeyword);
+          ORT_UNUSED_PARAMETER(MatchAllKeyword);
+          ORT_UNUSED_PARAMETER(FilterData);
+          ORT_UNUSED_PARAMETER(CallbackContext);
 
-        if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
-          if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
-            auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
-            (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
-          }
-          if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
-            if (Level != 0) {
-              // Commenting out Dynamic QNN Profiling for now
-              // There seems to be a crash in 3rd party QC QnnHtp.dll with this.
-              // Repro Scenario - start ETW tracing prior to session creation.
-              //    Then disable/enable ETW Tracing with the code below uncommented a few times
-              // auto profiling_level_etw = GetProfilingLevelFromETWLevel(Level);
-              // (void)qnn_backend_manager_->SetProfilingLevelETW(profiling_level_etw);
+          if (IsEnabled == EVENT_CONTROL_CODE_ENABLE_PROVIDER) {
+            if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Logs)) != 0) {
+              auto ortETWSeverity = etwRegistrationManager.MapLevelToSeverity();
+              (void)qnn_backend_manager_->ResetQnnLogLevel(ortETWSeverity);
+            }
+            if ((MatchAnyKeyword & static_cast<ULONGLONG>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)) != 0) {
+              if (Level != 0) {
+                // Commenting out Dynamic QNN Profiling for now
+                // There seems to be a crash in 3rd party QC QnnHtp.dll with this.
+                // Repro Scenario - start ETW tracing prior to session creation.
+                //    Then disable/enable ETW Tracing with the code below uncommented a few times
+                // auto profiling_level_etw = GetProfilingLevelFromETWLevel(Level);
+                // (void)qnn_backend_manager_->SetProfilingLevelETW(profiling_level_etw);
+                //
+                // NOTE(1/2/2025): It is possible that the above was not working in part because it is using the
+                // *logging ETW* subsystem to modify profiling, which should use an entirely different
+                // ETW provider (see QnnTelemetry). Should add callbacks for profiling to the QnnTelemetry ETW provider.
+              }
             }
           }
-        }
 
-        if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
-          // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
-          (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
-        }
-      });
-  etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);
+          if (IsEnabled == EVENT_CONTROL_CODE_DISABLE_PROVIDER) {
+            // (void)qnn_backend_manager_->SetProfilingLevelETW(qnn::ProfilingLevel::INVALID);
+            (void)qnn_backend_manager_->ResetQnnLogLevel(std::nullopt);
+          }
+        });
+    etwRegistrationManager.RegisterInternalCallback(callback_ETWSink_provider_);
+  }
 #endif
 }
 
@@ -466,7 +439,7 @@ QNNExecutionProvider::~QNNExecutionProvider() {
   }
 
   // Unregister the ETW callback
-#ifdef _WIN32
+#if defined(_WIN32)
   if (callback_ETWSink_provider_ != nullptr) {
     logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_);
   }
@@ -498,9 +471,10 @@ static void LogNodeSupport(const logging::Logger& logger,
     oss << "\tREASON : " << support_status.ErrorMessage() << std::endl;
   }
 
-  logging::Capture(logger, log_severity, logging::Category::onnxruntime,
-                   log_data_type, call_site)
-          .Stream()
+  auto log_capture = Factory<logging::Capture>::Create(logger, log_severity,
+                                                       logging::Category::onnxruntime,
+                                                       log_data_type, call_site);
+  log_capture->Stream()
       << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for " << num_nodes
       << " nodes in " << qnn_node_group.Type() << " (" << qnn_node_group.GetTargetNodeUnit()->OpType() << ") :"
       << std::endl
@@ -604,11 +578,11 @@ static bool EpSharedContextsHasAllGraphs(const std::vector<IExecutionProvider::F
                                          const logging::Logger& logger) {
   for (auto fused_node_and_graph : fused_nodes_and_graphs) {
     const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-    const auto& ep_context_node = graph_viewer.Nodes().begin();
-    NodeAttrHelper node_helper(*ep_context_node);
+    const Node& ep_context_node = *graph_viewer.Nodes().begin();
+    NodeAttrHelper node_helper(ep_context_node);
     std::string cache_source = node_helper.Get(qnn::SOURCE, "");
 
-    const std::string& graph_name = ep_context_node->Name();
+    const std::string& graph_name = ep_context_node.Name();
     bool has_shared_qnn_model = SharedContext::GetInstance().HasQnnModel(graph_name);
     if (!has_shared_qnn_model) {
       LOGS(logger, VERBOSE) << "Graph: " << graph_name << " from EpContext node not found from shared EP contexts.";
@@ -623,7 +597,7 @@ static bool EpSharedContextsHasAllGraphs(const std::vector<IExecutionProvider::F
 static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer,
                               const size_t num_nodes_in_graph,
                               std::vector<std::unique_ptr<ComputeCapability>>& result,
-                              const utils::GenerateMetadefNameFn& gen_metadef_name,
+                              const std::function<std::string()>& gen_metadef_name,
                               const logging::Logger& logger) {
   std::unordered_set<const Node*> supported_nodes{};
   std::vector<std::vector<const Node*>> supported_groups{};
@@ -683,7 +657,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
 
   const auto gen_metadef_name = [&]() {
     uint64_t model_hash;
-    int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash);
+    int metadef_id = metadef_id_generator_->GenerateId(graph_viewer, model_hash);
     return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id);
   };
 
@@ -734,7 +708,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
   std::vector<std::unique_ptr<NodeUnit>> node_unit_holder;
   std::unordered_map<const Node*, const NodeUnit*> node_unit_map;
 
-  std::tie(node_unit_holder, node_unit_map) = QDQ::GetAllNodeUnits(graph_viewer, logger);
+  std::tie(node_unit_holder, node_unit_map) = GetQDQNodeUnits(graph_viewer, logger);
 
   // remove is_qnn_ctx_model related code
   const auto supported_nodes = GetSupportedNodes(graph_viewer, node_unit_map,
@@ -777,11 +751,14 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer
     bool is_valid_partition = true;
     size_t nodes_in_partition = 0;
 
-    if (partition && partition->sub_graph) {
-      nodes_in_partition = partition->sub_graph->nodes.size();
+    if (partition && ComputeCapability__SubGraph(*partition)) {
+      const auto& subgraph = ComputeCapability__SubGraph(*partition);
+      const auto& subgraph_nodes = IndexedSubGraph__Nodes(*subgraph);
+
+      nodes_in_partition = subgraph_nodes.size();
 
       if (nodes_in_partition == 1 && !is_qnn_ctx_model) {
-        const Node* node = graph_viewer.GetNode(partition->sub_graph->nodes[0]);
+        const Node* node = graph_viewer.GetNode(subgraph_nodes[0]);
 
         if (!node) {
           LOGS(logger, ERROR) << "QNN EP: Invalid node in partition of one node.";
@@ -850,34 +827,34 @@ Status QNNExecutionProvider::CreateComputeFunc(std::vector<NodeComputeInfo>& nod
 void QNNExecutionProvider::InitQnnGraphConfigs(qnn::QnnConfigsBuilder<QnnGraph_Config_t, QnnHtpGraph_CustomConfig_t>& configs_builder) const {
   if (qnn_backend_manager_->GetQnnBackendType() == qnn::QnnBackendType::HTP) {
     if (htp_graph_finalization_opt_mode_ != qnn::HtpGraphFinalizationOptimizationMode::kDefault) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config = configs_builder.PushCustomConfig();
-      htp_graph_opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
-      htp_graph_opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
-      htp_graph_opt_config.optimizationOption.floatValue = static_cast<float>(htp_graph_finalization_opt_mode_);
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_opt_config = configs_builder.PushCustomConfig();
+      htp_graph_opt_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION;
+      htp_graph_opt_config->optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG;
+      htp_graph_opt_config->optimizationOption.floatValue = static_cast<float>(htp_graph_finalization_opt_mode_);
 
-      QnnGraph_Config_t& graph_opt_config = configs_builder.PushConfig();
-      graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_opt_config.customConfig = &htp_graph_opt_config;
+      gsl::not_null<QnnGraph_Config_t*> graph_opt_config = configs_builder.PushConfig();
+      graph_opt_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_opt_config->customConfig = htp_graph_opt_config;
     }
 
     if (vtcm_size_in_mb_ > 0) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
-      htp_graph_opt_config_vtcm.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
-      htp_graph_opt_config_vtcm.vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_opt_config_vtcm = configs_builder.PushCustomConfig();
+      htp_graph_opt_config_vtcm->option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE;
+      htp_graph_opt_config_vtcm->vtcmSizeInMB = static_cast<uint32_t>(vtcm_size_in_mb_);
 
-      QnnGraph_Config_t& graph_opt_config_vtcm = configs_builder.PushConfig();
-      graph_opt_config_vtcm.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_opt_config_vtcm.customConfig = &htp_graph_opt_config_vtcm;
+      gsl::not_null<QnnGraph_Config_t*> graph_opt_config_vtcm = configs_builder.PushConfig();
+      graph_opt_config_vtcm->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_opt_config_vtcm->customConfig = htp_graph_opt_config_vtcm;
     }
 
     if (enable_HTP_FP16_precision_) {
-      QnnHtpGraph_CustomConfig_t& htp_graph_precision_config = configs_builder.PushCustomConfig();
-      htp_graph_precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
-      htp_graph_precision_config.precision = QNN_PRECISION_FLOAT16;
+      gsl::not_null<QnnHtpGraph_CustomConfig_t*> htp_graph_precision_config = configs_builder.PushCustomConfig();
+      htp_graph_precision_config->option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION;
+      htp_graph_precision_config->precision = QNN_PRECISION_FLOAT16;
 
-      QnnGraph_Config_t& graph_precision_config = configs_builder.PushConfig();
-      graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
-      graph_precision_config.customConfig = &htp_graph_precision_config;
+      gsl::not_null<QnnGraph_Config_t*> graph_precision_config = configs_builder.PushConfig();
+      graph_precision_config->option = QNN_GRAPH_CONFIG_OPTION_CUSTOM;
+      graph_precision_config->customConfig = htp_graph_precision_config;
     }
   }
 }
@@ -933,10 +910,10 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
       if (EpSharedContextsHasAllGraphs(fused_nodes_and_graphs, logger)) {
         for (auto fused_node_and_graph : fused_nodes_and_graphs) {
           const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-          const auto& ep_context_node = graph_viewer.Nodes().begin();
+          const Node& ep_context_node = *graph_viewer.Nodes().begin();
           const Node& fused_node = fused_node_and_graph.fused_node;
           const std::string& graph_meta_id = fused_node.Name();
-          std::string key = ep_context_node->Name();
+          std::string key = ep_context_node.Name();
           auto qnn_model_shared = SharedContext::GetInstance().GetSharedQnnModel(key);
           ORT_RETURN_IF(nullptr == qnn_model_shared, "Graph: " + key + " not found from shared EP contexts.");
           ORT_RETURN_IF_ERROR(qnn_model_shared->SetGraphInputOutputInfo(graph_viewer, fused_node, logger));
@@ -978,10 +955,10 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
 
     for (auto fused_node_and_graph : fused_nodes_and_graphs) {
       const onnxruntime::GraphViewer& graph_viewer(fused_node_and_graph.filtered_graph);
-      const auto& ep_context_node = graph_viewer.Nodes().begin();
+      const Node& ep_context_node = *graph_viewer.Nodes().begin();
       const Node& fused_node = fused_node_and_graph.fused_node;
       const std::string& graph_meta_id = fused_node.Name();
-      std::string key = ep_context_node->Name();
+      std::string key = ep_context_node.Name();
       ORT_RETURN_IF(qnn_models.find(key) == qnn_models.end(), key + " key name not exist in table qnn_models.");
       auto qnn_model = std::move(qnn_models[key]);
       ORT_RETURN_IF_ERROR(qnn_model->SetGraphInputOutputInfo(graph_viewer, fused_node, logger));
@@ -1022,7 +999,7 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
                                                                           buffer_size,
                                                                           max_spill_fill_buffer_size));
     }
-    qnn_ep_context_model_ = std::make_unique<Model>("qnn_ep_context_model", false, logger);
+    qnn_ep_context_model_ = Factory<Model>::Create(std::string{"qnn_ep_context_model"}, false, logger);
     ORT_RETURN_IF_ERROR(qnn::CreateEPContextNodes(qnn_ep_context_model_.get(),
                                                   context_buffer.get(),
                                                   buffer_size,
@@ -1041,8 +1018,8 @@ const InlinedVector<const Node*> QNNExecutionProvider::GetEpContextNodes() const
   InlinedVector<const Node*> ep_context_nodes;
   if (qnn_ep_context_model_) {
     const auto& graph = qnn_ep_context_model_->MainGraph();
-    for (const auto& node : graph.Nodes()) {
-      ep_context_nodes.push_back(graph.GetNode(node.Index()));
+    for (gsl::not_null<const Node*> node : Graph__Nodes(graph)) {
+      ep_context_nodes.push_back(graph.GetNode(node->Index()));
     }
   }
 
@@ -1133,22 +1110,34 @@ void QNNExecutionProvider::ReleasePerThreadContext() const {
   per_thread_context_cache->erase(cached_context_it);
 }
 
+// Writes the config entry for `key` into `value` and returns true; returns false (value untouched) if unset.
+static bool TryGetConfigEntry(const ConfigOptions& config_options, const std::string& key, std::string& value) {
+  const std::optional<std::string> entry = config_options.GetConfigEntry(key);
+  const bool found = entry.has_value();
+  if (found) {
+    value = *entry;
+  }
+  return found;
+}
+
 Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) {
   auto backend_type = qnn_backend_manager_->GetQnnBackendType();
   if (qnn::QnnBackendType::HTP != backend_type && qnn::QnnBackendType::DSP != backend_type) {
     return Status::OK();
   }
 
+  const ConfigOptions& config_options = RunOptions__GetConfigOptions(run_options);
+
   std::string htp_perf_mode = "";
   qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfMode, htp_perf_mode)) {
     // set power mode
     ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode);
   }
 
   std::string rpc_latency = "";
   uint32_t rpc_control_latency = 0;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
     rpc_control_latency = static_cast<uint32_t>(std::stoul(rpc_latency));
     LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency;
   }
@@ -1174,9 +1163,11 @@ Status QNNExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxruntime::R
     return Status::OK();
   }
 
+  const ConfigOptions& config_options = RunOptions__GetConfigOptions(run_options);
+
   std::string htp_perf_mode = "";
   qnn::HtpPerformanceMode htp_performance_mode = qnn::HtpPerformanceMode::kHtpDefault;
-  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) {
+  if (TryGetConfigEntry(config_options, kOrtRunOptionsConfigQnnPerfModePostRun, htp_perf_mode)) {
     // set power mode
     ParseHtpPerformanceMode(htp_perf_mode, htp_performance_mode);
   }
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 317b34e66a..48f41c4da3 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -3,32 +3,25 @@
 
 #pragma once
 
-#include "core/framework/execution_provider.h"
-#include "core/framework/session_options.h"
-#include "core/framework/model_metadef_id_generator.h"
-#include "core/graph/model.h"
+#include <memory>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
 #include "core/providers/qnn/builder/qnn_model.h"
 #include "core/providers/qnn/builder/qnn_configs_helper.h"
 #include "core/providers/qnn/rpcmem_library.h"
 #include "HTP/QnnHtpGraph.h"
-#include <memory>
-#include <vector>
-#include <set>
-#include <string>
-#include <unordered_map>
-#ifdef _WIN32
-#include "core/platform/windows/logging/etw_sink.h"
-#endif
 
 namespace onnxruntime {
 
-void RunOnUnload(std::function<void()> function);
-
 // Logical device representation.
 class QNNExecutionProvider : public IExecutionProvider {
  public:
-  explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const SessionOptions* session_options);
+  explicit QNNExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
   virtual ~QNNExecutionProvider();
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QNNExecutionProvider);
 
@@ -90,14 +83,14 @@ class QNNExecutionProvider : public IExecutionProvider {
   bool qnn_context_embed_mode_ = true;
   int32_t vtcm_size_in_mb_ = 0;
   std::unique_ptr<onnxruntime::Model> qnn_ep_context_model_;
-  ModelMetadefIdGenerator metadef_id_generator_;
+  std::unique_ptr<ModelMetadefIdGenerator> metadef_id_generator_;
   uint32_t device_id_ = 0;
   qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault;
   uint32_t default_rpc_control_latency_ = 0;
   bool enable_HTP_FP16_precision_ = true;
   bool share_ep_contexts_ = false;
   bool enable_spill_fill_buffer_ = false;
-#ifdef _WIN32
+#if defined(_WIN32)
   onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr;
 #endif
   qnn::ModelSettings model_settings_ = {};
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
index 4095d7ff02..d4dd446751 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory.cc
@@ -2,32 +2,68 @@
 // Licensed under the MIT License
 
 #include "core/providers/qnn/qnn_provider_factory_creator.h"
-
-#include "core/session/abi_session_options_impl.h"
 #include "core/providers/qnn/qnn_execution_provider.h"
-#include "core/session/ort_apis.h"
 
 namespace onnxruntime {
 struct QNNProviderFactory : IExecutionProviderFactory {
-  QNNProviderFactory(const ProviderOptions& provider_options_map, const SessionOptions* session_options)
-      : provider_options_map_(provider_options_map), session_options_(session_options) {
+  QNNProviderFactory(const ProviderOptions& provider_options_map, const ConfigOptions* config_options)
+      : provider_options_map_(provider_options_map), config_options_(config_options) {
   }
 
   ~QNNProviderFactory() override {
   }
 
   std::unique_ptr<IExecutionProvider> CreateProvider() override {
-    return std::make_unique<QNNExecutionProvider>(provider_options_map_, session_options_);
+    return std::make_unique<QNNExecutionProvider>(provider_options_map_, config_options_);
   }
 
  private:
   ProviderOptions provider_options_map_;
-  const SessionOptions* session_options_;
+  const ConfigOptions* config_options_;
 };
 
+#if BUILD_QNN_EP_STATIC_LIB
 std::shared_ptr<IExecutionProviderFactory> QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map,
                                                                              const SessionOptions* session_options) {
-  return std::make_shared<onnxruntime::QNNProviderFactory>(provider_options_map, session_options);
+  const ConfigOptions* config_options = nullptr;
+  if (session_options != nullptr) {
+    config_options = &session_options->config_options;
+  }
+
+  return std::make_shared<onnxruntime::QNNProviderFactory>(provider_options_map, config_options);
 }
+#else
+struct QNN_Provider : Provider {
+  // `param` must point to a std::array<const void*, 2> holding {const ProviderOptions*, const ConfigOptions*}.
+  std::shared_ptr<IExecutionProviderFactory> CreateExecutionProviderFactory(const void* param) override {
+    if (param == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL options to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    const auto args = *static_cast<const std::array<const void*, 2>*>(param);
+    const auto* provider_options = static_cast<const ProviderOptions*>(args[0]);
+    const auto* config_options = static_cast<const ConfigOptions*>(args[1]);  // Forwarded as-is (not null-checked).
+    if (provider_options == nullptr) {
+      LOGS_DEFAULT(ERROR) << "[QNN EP] Passed NULL ProviderOptions to CreateExecutionProviderFactory()";
+      return nullptr;
+    }
+
+    return std::make_shared<onnxruntime::QNNProviderFactory>(*provider_options, config_options);
+  }
+
+  void Initialize() override {}
+  void Shutdown() override {}
+} g_provider;
+#endif  // BUILD_QNN_EP_STATIC_LIB
 
 }  // namespace onnxruntime
+
+#if !BUILD_QNN_EP_STATIC_LIB
+extern "C" {
+
+ORT_API(onnxruntime::Provider*, GetProvider) {
+  return &onnxruntime::g_provider;
+}
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
diff --git a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
index 80f9d99b80..46b6c15b40 100644
--- a/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
+++ b/onnxruntime/core/providers/qnn/qnn_provider_factory_creator.h
@@ -11,6 +11,9 @@
 namespace onnxruntime {
 struct SessionOptions;
 
+// Defined in core/session/provider_bridge_ort.cc if built as a shared library (default build config).
+// Defined in core/providers/qnn/qnn_provider_factory.cc if built as a static library.
+// The preprocessor macro `BUILD_QNN_EP_STATIC_LIB` is defined and set to 1 if QNN is built as a static library.
 struct QNNProviderFactoryCreator {
   static std::shared_ptr<IExecutionProviderFactory> Create(const ProviderOptions& provider_options_map,
                                                            const SessionOptions* session_options);
diff --git a/onnxruntime/core/providers/qnn/qnn_telemetry.cc b/onnxruntime/core/providers/qnn/qnn_telemetry.cc
new file mode 100644
index 0000000000..b2c8350bfe
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/qnn_telemetry.cc
@@ -0,0 +1,211 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/qnn/qnn_telemetry.h"
+
+#ifdef _WIN32
+#if !BUILD_QNN_EP_STATIC_LIB
+// ETW includes
+// need space after Windows.h to prevent clang-format re-ordering breaking the build.
+// TraceLoggingProvider.h must follow Windows.h
+#include <Windows.h>
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 26440)  // Warning C26440 from TRACELOGGING_DEFINE_PROVIDER
+#endif
+
+#include <TraceLoggingProvider.h>
+#include <evntrace.h>
+#include <winmeta.h>
+#include "core/platform/windows/TraceLoggingConfig.h"
+
+// Seems this workaround can be dropped when we drop support for VS2017 toolchains
+// https://developercommunity.visualstudio.com/content/problem/85934/traceloggingproviderh-is-incompatible-with-utf-8.html
+#ifdef _TlgPragmaUtf8Begin
+#undef _TlgPragmaUtf8Begin
+#define _TlgPragmaUtf8Begin
+#endif
+
+#ifdef _TlgPragmaUtf8End
+#undef _TlgPragmaUtf8End
+#define _TlgPragmaUtf8End
+#endif
+
+// Different versions of TraceLoggingProvider.h contain different macro variable names for the utf8 begin and end,
+// and we need to cover the lower case version as well.
+#ifdef _tlgPragmaUtf8Begin
+#undef _tlgPragmaUtf8Begin
+#define _tlgPragmaUtf8Begin
+#endif
+
+#ifdef _tlgPragmaUtf8End
+#undef _tlgPragmaUtf8End
+#define _tlgPragmaUtf8End
+#endif
+
+TRACELOGGING_DEFINE_PROVIDER(telemetry_provider_handle, "Microsoft.ML.ONNXRuntime",
+                             // {3a26b1ff-7484-7484-7484-15261f42614d}
+                             (0x3a26b1ff, 0x7484, 0x7484, 0x74, 0x84, 0x15, 0x26, 0x1f, 0x42, 0x61, 0x4d),
+                             TraceLoggingOptionMicrosoftTelemetry());
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+#include "core/providers/qnn/ort_api.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+#if !BUILD_QNN_EP_STATIC_LIB
+std::mutex QnnTelemetry::mutex_;
+std::mutex QnnTelemetry::provider_change_mutex_;
+uint32_t QnnTelemetry::global_register_count_ = 0;
+bool QnnTelemetry::enabled_ = true;
+UCHAR QnnTelemetry::level_ = 0;
+UINT64 QnnTelemetry::keyword_ = 0;
+std::vector<const QnnTelemetry::EtwInternalCallback*> QnnTelemetry::callbacks_;
+std::mutex QnnTelemetry::callbacks_mutex_;
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+QnnTelemetry::QnnTelemetry() {
+#if !BUILD_QNN_EP_STATIC_LIB
+  // A TraceLogging provider may only be registered once for the whole process, so track it with a global count.
+  std::lock_guard<std::mutex> guard(mutex_);
+  if (global_register_count_ != 0) {
+    return;
+  }
+  if (SUCCEEDED(TraceLoggingRegisterEx(telemetry_provider_handle, ORT_TL_EtwEnableCallback, nullptr))) {
+    ++global_register_count_;  // Counted only on success.
+  }
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+}
+
+QnnTelemetry::~QnnTelemetry() {
+#if !BUILD_QNN_EP_STATIC_LIB
+  // Release one registration; the provider is unregistered when the
+  // count of successful registrations drops back to zero.
+  std::lock_guard<std::mutex> registration_guard(mutex_);
+  if (global_register_count_ > 0 && --global_register_count_ == 0) {
+    TraceLoggingUnregister(telemetry_provider_handle);
+  }
+
+  // Drop any callback pointers that are still registered.
+  std::lock_guard<std::mutex> callbacks_guard(callbacks_mutex_);
+  callbacks_.clear();
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+}
+
+QnnTelemetry& QnnTelemetry::Instance() {
+  static QnnTelemetry instance;
+  return instance;
+}
+
+// Static library build: asks the default Env's telemetry provider. Otherwise: returns the locally tracked flag.
+bool QnnTelemetry::IsEnabled() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& default_env = GetDefaultEnv();
+  return default_env.GetTelemetryProvider().IsEnabled();
+#else
+  std::lock_guard<std::mutex> guard(provider_change_mutex_);
+  return enabled_;
+#endif
+}
+
+// Static library build: asks the default Env's telemetry provider. Otherwise: returns the locally tracked level.
+UCHAR QnnTelemetry::Level() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& default_env = GetDefaultEnv();
+  return default_env.GetTelemetryProvider().Level();
+#else
+  std::lock_guard<std::mutex> guard(provider_change_mutex_);
+  return level_;
+#endif
+}
+
+// Static library build: asks the default Env's telemetry provider. Otherwise: returns the locally tracked keyword.
+UINT64 QnnTelemetry::Keyword() const {
+#if BUILD_QNN_EP_STATIC_LIB
+  const Env& default_env = GetDefaultEnv();
+  return default_env.GetTelemetryProvider().Keyword();
+#else
+  std::lock_guard<std::mutex> guard(provider_change_mutex_);
+  return keyword_;
+#endif
+}
+
+void QnnTelemetry::LogQnnProfileEvent(uint64_t timestamp,
+                                      const std::string& message,
+                                      const std::string& qnnScalarValue,
+                                      const std::string& unit,
+                                      const std::string& timingSource,
+                                      const std::string& eventLevel,
+                                      const char* eventIdentifier) const {
+  TraceLoggingWrite(
+      telemetry_provider_handle,
+      "QNNProfilingEvent",
+      TraceLoggingKeyword(static_cast<uint64_t>(onnxruntime::logging::ORTTraceLoggingKeyword::Profiling)),
+      TraceLoggingLevel(WINEVENT_LEVEL_VERBOSE),
+      TraceLoggingValue(timestamp, "Timestamp"),
+      TraceLoggingString(message.c_str(), "Message"),
+      TraceLoggingString(qnnScalarValue.c_str(), "Value"),
+      TraceLoggingString(unit.c_str(), "Unit of Measurement"),
+      TraceLoggingString(timingSource.c_str(), "Timing Source"),
+      TraceLoggingString(eventLevel.c_str(), "Event Level"),
+      TraceLoggingString(eventIdentifier, "Event Identifier"));
+}
+
+void QnnTelemetry::RegisterInternalCallback(const EtwInternalCallback& callback) {
+#if BUILD_QNN_EP_STATIC_LIB
+  WindowsTelemetry::RegisterInternalCallback(callback);
+#else
+  std::lock_guard<std::mutex> lock_callbacks(callbacks_mutex_);
+  callbacks_.push_back(&callback);  // Only the address is stored: `callback` must stay alive until it is unregistered.
+#endif
+}
+
+// Removes `callback` from the set of callbacks notified on ETW enable/disable events.
+// Static library build: delegates to WindowsTelemetry.
+void QnnTelemetry::UnregisterInternalCallback(const EtwInternalCallback& callback) {
+#if BUILD_QNN_EP_STATIC_LIB
+  WindowsTelemetry::UnregisterInternalCallback(callback);
+#else
+  std::lock_guard<std::mutex> guard(callbacks_mutex_);
+  // Callbacks are stored and matched by address, so every entry pointing at `callback` is erased.
+  const auto is_target = [target = &callback](const EtwInternalCallback* registered) { return registered == target; };
+  callbacks_.erase(std::remove_if(callbacks_.begin(), callbacks_.end(), is_target), callbacks_.end());
+#endif
+}
+
+#if !BUILD_QNN_EP_STATIC_LIB
+// Callback given to TraceLoggingRegisterEx: records the new provider state, then notifies registered callbacks.
+void NTAPI QnnTelemetry::ORT_TL_EtwEnableCallback(
+    _In_ LPCGUID SourceId,
+    _In_ ULONG IsEnabled,
+    _In_ UCHAR Level,
+    _In_ ULONGLONG MatchAnyKeyword, _In_ ULONGLONG MatchAllKeyword,
+    _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+    _In_opt_ PVOID CallbackContext) {
+  std::unique_lock<std::mutex> state_lock(provider_change_mutex_);  // Guards only the state update below.
+  enabled_ = (IsEnabled != 0);
+  level_ = Level;
+  keyword_ = MatchAnyKeyword;
+  state_lock.unlock();  // Unlock first: a callback calling IsEnabled()/Level()/Keyword() would re-lock this mutex.
+  InvokeCallbacks(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
+}
+
+void QnnTelemetry::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                                   ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData,
+                                   PVOID CallbackContext) {
+  std::lock_guard<std::mutex> guard(callbacks_mutex_);  // Held while every callback runs.
+  for (const EtwInternalCallback* registered : callbacks_) {  // Vector order, i.e. registration order.
+    (*registered)(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
+  }
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
+}  // namespace qnn
+}  // namespace onnxruntime
+#endif  // defined(_WIN32)
diff --git a/onnxruntime/core/providers/qnn/qnn_telemetry.h b/onnxruntime/core/providers/qnn/qnn_telemetry.h
new file mode 100644
index 0000000000..a2d42c518c
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/qnn_telemetry.h
@@ -0,0 +1,98 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#ifdef _WIN32
+#include <Windows.h>
+
+#if !BUILD_QNN_EP_STATIC_LIB
+#include <TraceLoggingProvider.h>
+#endif
+
+#include <functional>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "core/providers/qnn/ort_api.h"
+
+#if !BUILD_QNN_EP_STATIC_LIB
+TRACELOGGING_DECLARE_PROVIDER(telemetry_provider_handle);
+#endif
+
+namespace onnxruntime {
+namespace qnn {
+
+/// <summary>
+/// Singleton class used to log QNN profiling events to the ONNX Runtime telemetry tracelogging provider.
+///
+/// When QNN EP is a DLL, we must define our own tracelogging provider handle via TRACELOGGING_DEFINE_PROVIDER.
+/// TraceLogging documentation states that separate DLLs cannot share the same tracelogging provider handle. See:
+/// https://learn.microsoft.com/en-us/windows/win32/api/traceloggingprovider/nf-traceloggingprovider-tracelogging_define_provider#remarks
+///
+/// When QNN EP is a static library, we use the tracelogging provider handle already defined
+/// in core/platform/windows/telemetry.h/.cc. In this case, we forward method calls to the
+/// ORT Env's telemetry provider.
+/// </summary>
+class QnnTelemetry {
+ public:
+  static QnnTelemetry& Instance();
+  bool IsEnabled() const;
+
+  // Get the current logging level
+  unsigned char Level() const;
+
+  // Get the current keyword
+  UINT64 Keyword() const;
+
+  // Logs QNN profiling event as trace logging event.
+  void LogQnnProfileEvent(uint64_t timestamp,
+                          const std::string& message,
+                          const std::string& qnnScalarValue,
+                          const std::string& unit,
+                          const std::string& timingSource,
+                          const std::string& eventLevel,
+                          const char* eventIdentifier) const;
+
+  using EtwInternalCallback = std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level,
+                                                 ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword,
+                                                 PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext)>;
+
+  static void RegisterInternalCallback(const EtwInternalCallback& callback);
+
+  static void UnregisterInternalCallback(const EtwInternalCallback& callback);
+
+ private:
+  QnnTelemetry();
+  ~QnnTelemetry();
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(QnnTelemetry);
+
+#if !BUILD_QNN_EP_STATIC_LIB
+  static std::mutex mutex_;
+  static uint32_t global_register_count_;
+  static bool enabled_;
+
+  static std::vector<const EtwInternalCallback*> callbacks_;
+  static std::mutex callbacks_mutex_;
+  static std::mutex provider_change_mutex_;
+  static UCHAR level_;
+  static ULONGLONG keyword_;
+
+  static void InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword,
+                              ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext);
+
+  static void NTAPI ORT_TL_EtwEnableCallback(
+      _In_ LPCGUID SourceId,
+      _In_ ULONG IsEnabled,
+      _In_ UCHAR Level,
+      _In_ ULONGLONG MatchAnyKeyword,
+      _In_ ULONGLONG MatchAllKeyword,
+      _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData,
+      _In_opt_ PVOID CallbackContext);
+#endif
+};
+
+}  // namespace qnn
+}  // namespace onnxruntime
+
+#endif  // defined(_WIN32)
diff --git a/onnxruntime/core/providers/qnn/rpcmem_library.cc b/onnxruntime/core/providers/qnn/rpcmem_library.cc
index 59e6cff925..93c5ed54ab 100644
--- a/onnxruntime/core/providers/qnn/rpcmem_library.cc
+++ b/onnxruntime/core/providers/qnn/rpcmem_library.cc
@@ -2,9 +2,7 @@
 // Licensed under the MIT License
 
 #include "core/providers/qnn/rpcmem_library.h"
-
-#include "core/common/logging/logging.h"
-#include "core/platform/env.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
@@ -25,7 +23,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symb
       return;
     }
 
-    const auto& env = Env::Default();
+    const auto& env = GetDefaultEnv();
     const auto unload_status = env.UnloadDynamicLibrary(library_handle);
 
     if (!unload_status.IsOK()) {
@@ -33,7 +31,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symb
     }
   };
 
-  const auto& env = Env::Default();
+  const auto& env = GetDefaultEnv();
   void* library_handle = nullptr;
 
   const auto load_status = env.LoadDynamicLibrary(path, global_symbols, &library_handle);
@@ -47,7 +45,7 @@ DynamicLibraryHandle LoadDynamicLibrary(const PathString& path, bool global_symb
 RpcMemApi CreateApi(void* library_handle) {
   RpcMemApi api{};
 
-  const auto& env = Env::Default();
+  const auto& env = GetDefaultEnv();
   ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "rpcmem_alloc", (void**)&api.alloc));
 
   ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(library_handle, "rpcmem_free", (void**)&api.free));
diff --git a/onnxruntime/core/providers/qnn/rpcmem_library.h b/onnxruntime/core/providers/qnn/rpcmem_library.h
index d5697ff298..0642c96798 100644
--- a/onnxruntime/core/providers/qnn/rpcmem_library.h
+++ b/onnxruntime/core/providers/qnn/rpcmem_library.h
@@ -6,7 +6,7 @@
 #include <cstdint>
 #include <memory>
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 
 namespace onnxruntime::qnn {
 
diff --git a/onnxruntime/core/providers/qnn/shared_context.h b/onnxruntime/core/providers/qnn/shared_context.h
index a111e57038..81de357dbe 100644
--- a/onnxruntime/core/providers/qnn/shared_context.h
+++ b/onnxruntime/core/providers/qnn/shared_context.h
@@ -5,7 +5,7 @@
 #include <mutex>
 #include <vector>
 
-#include "core/common/common.h"
+#include "core/providers/qnn/ort_api.h"
 #include "core/providers/qnn/builder/qnn_model.h"
 
 #pragma once
diff --git a/onnxruntime/core/providers/qnn/symbols.def b/onnxruntime/core/providers/qnn/symbols.def
new file mode 100644
index 0000000000..4ec2f7914c
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/symbols.def
@@ -0,0 +1,2 @@
+EXPORTS
+   GetProvider
diff --git a/onnxruntime/core/providers/qnn/version_script.lds b/onnxruntime/core/providers/qnn/version_script.lds
new file mode 100644
index 0000000000..094abb3329
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/version_script.lds
@@ -0,0 +1,9 @@
+#_init and _fini should be local
+VERS_1.0 {
+  global:
+    GetProvider;    
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h
index 45f81ed22b..6ff2572e5e 100644
--- a/onnxruntime/core/providers/shared_library/provider_api.h
+++ b/onnxruntime/core/providers/shared_library/provider_api.h
@@ -9,6 +9,11 @@
 #pragma once
 #define SHARED_PROVIDER 1
 
+#ifdef _WIN32
+#include <Windows.h>
+#include <evntrace.h>
+#endif  // defined(_WIN32)
+
 #include <vector>
 #include <string>
 #include <map>
@@ -136,6 +141,17 @@ enum class DataType {
   USER = 1     ///< Contains potentially sensitive user data.
 };
 
+enum class ORTTraceLoggingKeyword : uint64_t {
+  Session = 0x1,    // ORT Session TraceLoggingWrite
+  Logs = 0x2,       // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required
+  Reserved1 = 0x4,  // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses
+  Reserved2 = 0x8,
+  Reserved3 = 0x10,
+  Reserved4 = 0x20,
+  Reserved5 = 0x40,
+  Reserved6 = 0x80,
+  Profiling = 0x100  // Enables profiling. At higher levels >5 can impact inference performance
+};
 }  // namespace logging
 
 // OnnxRuntime Types (these are the internal types)
@@ -143,6 +159,13 @@ struct CPUIDInfo;
 namespace logging {
 struct Logger;
 struct Capture;
+#ifdef _WIN32
+struct EtwRegistrationManager;
+using EtwRegistrationManager_EtwInternalCallback = std::function<void(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level,
+                                                                      ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword,
+                                                                      PEVENT_FILTER_DESCRIPTOR FilterData,
+                                                                      PVOID CallbackContext)>;
+#endif
 }  // namespace logging
 struct ComputeCapability;
 struct ConfigOptions;
@@ -157,10 +180,12 @@ struct KernelRegistry;
 struct Function;
 struct Graph;
 class GraphViewer;
+struct ConstGraphNodes;
 enum class DataLayout;
 struct Model;
 struct Path;
 struct Node;
+struct Node_EdgeEnd;
 struct NodeArg;
 struct NodeAttributes;
 struct NodeUnitIODef;
@@ -215,6 +240,7 @@ using DeleteFunc = void (*)(void*);
 using NodeArgInfo = ONNX_NAMESPACE::ValueInfoProto;
 
 using NameMLValMap = std::unordered_map<std::string, OrtValue>;
+
 }  // namespace onnxruntime
 
 #include "core/platform/threadpool.h"
@@ -368,6 +394,28 @@ template <>
 constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<UInt4x2>() {
   return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT4;
 }
+
+inline std::vector<std::unique_ptr<ComputeCapability>>
+CreateSupportedPartitions(const GraphViewer& graph_viewer,
+                          const std::unordered_set<const Node*>& supported_nodes,
+                          const std::unordered_set<std::string>& stop_ops,
+                          const std::function<std::string()>& generate_metadef_name,
+                          const std::string& execution_provider_name,
+                          const std::string& execution_provider_type,
+                          const std::unordered_map<const Node*, const NodeUnit*>* node_unit_map,
+                          bool drop_constant_initializers = false) {
+  return g_host->Utils__CreateSupportedPartitions(graph_viewer, supported_nodes, stop_ops, generate_metadef_name,
+                                                  execution_provider_name, execution_provider_type, node_unit_map,
+                                                  drop_constant_initializers);
+}
+inline std::unique_ptr<ComputeCapability> MakeComputeCapability(const GraphViewer& graph_viewer,
+                                                                const std::vector<const Node*>& group,
+                                                                const std::function<std::string()>& generate_metadef_name,
+                                                                const std::string& execution_provider_name,
+                                                                bool drop_constant_initializers) {
+  return g_host->Utils__MakeComputeCapability(graph_viewer, group, generate_metadef_name,
+                                              execution_provider_name, drop_constant_initializers);
+}
 }  // namespace utils
 
 namespace QDQ {
@@ -381,6 +429,10 @@ GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger)
 // So the C API (and C++) becomes available when ORT_API_MANUAL_INIT is used.
 void InitProviderOrtApi();
 
+// This is a replacement for Env::Default(). Returns a reference to the default ORT Environment.
+inline Env& GetDefaultEnv() {
+  return g_host->Env__Default();
+}
 }  // namespace onnxruntime
 
 #define CREATE_MESSAGE(logger, severity, category, datatype) \
diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index aa8c367d25..4c05053445 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -505,6 +505,9 @@ Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, const st
                              /*out*/ std::vector<uint8_t>& unpacked_tensor) {
   return g_host->UnpackInitializerData(tensor, model_path, unpacked_tensor);
 }
+Status UnpackInitializerData(const ONNX_NAMESPACE::TensorProto& tensor, /*out*/ std::vector<uint8_t>& unpacked_tensor) {
+  return g_host->UnpackInitializerData(tensor, std::filesystem::path(), unpacked_tensor);
+}
 
 }  // namespace utils
 
@@ -788,5 +791,5 @@ std::string ToUTF8String(const std::wstring& s) {
 std::wstring ToWideString(const std::string& s) {
   return g_host->ToWideString(s);
 }
-#endif
+#endif  // _WIN32
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 5a179ec622..d22cb2ed00 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -120,11 +120,20 @@ struct Node__EdgeIterator {
   virtual bool operator!=(const Node__EdgeIterator& p) const = 0;
 
   virtual void operator++() = 0;
+  virtual const Node_EdgeEnd& operator*() const = 0;
   virtual const Node& GetNode() const = 0;
   virtual int GetSrcArgIndex() const = 0;
   virtual int GetDstArgIndex() const = 0;
 };
 
+struct ConstGraphNodes_Iterator {
+  virtual ~ConstGraphNodes_Iterator() {}
+
+  virtual bool operator!=(const ConstGraphNodes_Iterator& other) const = 0;
+  virtual void operator++() = 0;
+  virtual const Node& operator*() = 0;
+};
+
 // There are two ways to route a function, one is a virtual method and the other is a function pointer (or pointer to
 // member function).
 // The function pointers are nicer in that they directly call the target function, but they cannot be used in cases
@@ -273,20 +282,40 @@ struct ProviderHost {
 
   // logging::Logger
   virtual bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) = 0;
+  virtual logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) = 0;
 
   // logging::LoggingManager
   virtual const logging::Logger& logging__LoggingManager__DefaultLogger() = 0;
 
   // logging::Capture
-  virtual std::unique_ptr<logging::Capture> logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) = 0;
+  virtual std::unique_ptr<logging::Capture> logging__Capture__construct(const logging::Logger& logger,
+                                                                        logging::Severity severity,
+                                                                        const char* category,
+                                                                        logging::DataType data_type,
+                                                                        const CodeLocation& location) = 0;
   virtual void logging__Capture__operator_delete(logging::Capture* p) noexcept = 0;
   virtual std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept = 0;
+  virtual void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) = 0;
+
+#if defined(_WIN32)
+  // logging::EtwRegistrationManager
+  virtual logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() = 0;
+  virtual bool logging__EtwRegistrationManager__SupportsETW() = 0;
+  virtual logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) = 0;
+  virtual void logging__EtwRegistrationManager__RegisterInternalCallback(
+      logging::EtwRegistrationManager* p,
+      const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0;
+  virtual void logging__EtwRegistrationManager__UnregisterInternalCallback(
+      logging::EtwRegistrationManager* p,
+      const logging::EtwRegistrationManager_EtwInternalCallback& callback) = 0;
+#endif  // defined(_WIN32)
 
   // Env
   virtual Env& Env__Default() = 0;
 
   // Utils::DataTypeUtils
   virtual const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) = 0;
+  virtual const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) = 0;
 
   // int64s
   virtual int int64s__size(const ONNX_NAMESPACE::int64s* p) = 0;
@@ -328,6 +357,7 @@ struct ProviderHost {
   virtual bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0;
   virtual const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0;
   virtual ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) = 0;
+  virtual bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0;
   virtual int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) = 0;
   virtual void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) = 0;
 
@@ -342,6 +372,7 @@ struct ProviderHost {
   // TypeProto
   virtual std::unique_ptr<ONNX_NAMESPACE::TypeProto> TypeProto__construct() = 0;
   virtual void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) = 0;
+  virtual bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0;
   virtual const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) = 0;
   virtual ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) = 0;
 
@@ -462,6 +493,7 @@ struct ProviderHost {
   virtual bool TensorProto__has_raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0;
   virtual const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) = 0;
   virtual std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) = 0;
+  virtual bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) = 0;
   virtual int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) = 0;
   virtual void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) = 0;
   virtual void TensorProto__CopyFrom(ONNX_NAMESPACE::TensorProto* p, const ONNX_NAMESPACE::TensorProto* other) = 0;
@@ -495,6 +527,7 @@ struct ProviderHost {
   // TensorShapeProto_Dimensions
   virtual std::unique_ptr<TensorShapeProto_Dimension_Iterator> TensorShapeProto_Dimensions__begin(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0;
   virtual std::unique_ptr<TensorShapeProto_Dimension_Iterator> TensorShapeProto_Dimensions__end(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0;
+  virtual size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) = 0;
 
   // TensorShapeProto
   virtual int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) = 0;
@@ -823,6 +856,8 @@ struct ProviderHost {
 
   virtual const NodeAttributes& Node__GetAttributes(const Node* p) noexcept = 0;
   virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) = 0;
+  virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) = 0;
+  virtual void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) = 0;
   virtual size_t Node__GetInputEdgesCount(const Node* p) noexcept = 0;
   virtual size_t Node__GetOutputEdgesCount(const Node* p) noexcept = 0;
 
@@ -842,6 +877,11 @@ struct ProviderHost {
   virtual const std::unordered_map<std::string, gsl::not_null<Graph*>>& Node__GetAttributeNameToMutableSubgraphMap(Node* p) = 0;
   virtual std::unordered_map<std::string, gsl::not_null<const Graph*>> Node__GetAttributeNameToSubgraphMap(const Node* p) const = 0;
 
+  // Node_EdgeEnd
+  virtual const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) = 0;
+  virtual int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) = 0;
+  virtual int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) = 0;
+
   // NodeArg
   virtual const std::string& NodeArg__Name(const NodeArg* p) noexcept = 0;
   virtual const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) = 0;
@@ -872,6 +912,8 @@ struct ProviderHost {
   virtual void NodeAttributes__reserve(NodeAttributes* p, size_t size) = 0;
 
   // NodeUnit
+  virtual void NodeUnit__operator_delete(NodeUnit* p) noexcept = 0;
+
   virtual int NodeUnit__UnitType(const NodeUnit* p) noexcept = 0;
 
   virtual const std::vector<NodeUnitIODef>& NodeUnit__Inputs(const NodeUnit* p) noexcept = 0;
@@ -897,10 +939,29 @@ struct ProviderHost {
   virtual std::pair<std::vector<std::unique_ptr<NodeUnit>>, std::unordered_map<const Node*, const NodeUnit*>>
   QDQ__GetAllNodeUnits(const GraphViewer* graph_viewer, const logging::Logger& logger) = 0;
 
+  // Partitioning utils (forwarded to onnxruntime::utils::CreateSupportedPartitions / MakeComputeCapability)
+  virtual std::vector<std::unique_ptr<ComputeCapability>>
+  Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer,
+                                   const std::unordered_set<const Node*>& supported_nodes,
+                                   const std::unordered_set<std::string>& stop_ops,
+                                   const std::function<std::string()>& generate_metadef_name,
+                                   const std::string& execution_provider_name,
+                                   const std::string& execution_provider_type,
+                                   const std::unordered_map<const Node*, const NodeUnit*>* node_unit_map,
+                                   bool drop_constant_initializers) = 0;
+
+  virtual std::unique_ptr<ComputeCapability>
+  Utils__MakeComputeCapability(const GraphViewer& graph_viewer,
+                               const std::vector<const Node*>& group,
+                               const std::function<std::string()>& generate_metadef_name,
+                               const std::string& execution_provider_name,
+                               bool drop_constant_initializers) = 0;
   // Model
   virtual std::unique_ptr<Model> Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
                                                   const IOnnxRuntimeOpSchemaRegistryList* local_registries,
                                                   const logging::Logger& logger) = 0;
+  virtual std::unique_ptr<Model> Model__construct(const std::string& graph_name, bool is_onnx_domain_only,
+                                                  const logging::Logger& logger) = 0;
   virtual void Model__operator_delete(Model* p) = 0;
   virtual Graph& Model__MainGraph(Model* p) = 0;
   virtual std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToProto(Model* p) = 0;
@@ -974,6 +1035,7 @@ struct ProviderHost {
   virtual const std::string& GraphViewer__Name(const GraphViewer* p) noexcept = 0;
   virtual const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept = 0;
 
+  virtual const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept = 0;
   virtual const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) = 0;
   virtual const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) = 0;
 
@@ -989,6 +1051,7 @@ struct ProviderHost {
 
   virtual const std::vector<const NodeArg*>& GraphViewer__GetInputs(const GraphViewer* p) noexcept = 0;
   virtual const std::vector<const NodeArg*>& GraphViewer__GetOutputs(const GraphViewer* p) noexcept = 0;
+  virtual bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) = 0;
   virtual const std::unordered_set<const NodeArg*>& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept = 0;
 
   virtual const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) = 0;
@@ -1007,6 +1070,13 @@ struct ProviderHost {
   virtual const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const = 0;
   virtual IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const = 0;
 
+  // ConstGraphNodes
+  virtual std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__begin(const ConstGraphNodes* p) = 0;
+  virtual std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__end(const ConstGraphNodes* p) = 0;
+  virtual std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__cbegin(const ConstGraphNodes* p) = 0;
+  virtual std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__cend(const ConstGraphNodes* p) = 0;
+  virtual bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept = 0;
+
   // OpKernel
   virtual const Node& OpKernel__Node(const OpKernel* p) = 0;
 
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index 76b6d8063f..1b6c29e686 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -23,6 +23,9 @@ namespace logging {
 
 struct Logger final {
   bool OutputIsEnabled(Severity severity, DataType data_type) const noexcept { return g_host->logging__Logger__OutputIsEnabled(this, severity, data_type); }
+  Severity GetSeverity() const noexcept {
+    return g_host->logging__Logger__GetSeverity(this);
+  }
 
   PROVIDER_DISALLOW_ALL(Logger)
 };
@@ -35,15 +38,34 @@ struct LoggingManager final {
 
 struct Capture final {
   static std::unique_ptr<Capture> Create(const Logger& logger, logging::Severity severity, const char* category,
-                                         logging::DataType dataType, const CodeLocation& location) { return g_host->logging__Capture__construct(logger, severity, category, dataType, location); }
+                                         logging::DataType data_type, const CodeLocation& location) {
+    return g_host->logging__Capture__construct(logger, severity, category, data_type, location);
+  }
   static void operator delete(void* p) { g_host->logging__Capture__operator_delete(reinterpret_cast<Capture*>(p)); }
 
   std::ostream& Stream() noexcept { return g_host->logging__Capture__Stream(this); }
+  void ProcessPrintf(const char* format, va_list args) { g_host->logging__Capture__ProcessPrintf(this, format, args); }
 
   Capture() = delete;
   Capture(const Capture&) = delete;
   void operator=(const Capture&) = delete;
 };
+
+#if defined(_WIN32)
+struct EtwRegistrationManager final {  // Windows-only facade; every member forwards to the host through g_host.
+  using EtwInternalCallback = EtwRegistrationManager_EtwInternalCallback;
+  static EtwRegistrationManager& Instance() { return g_host->logging__EtwRegistrationManager__Instance(); }
+  static bool SupportsETW() { return g_host->logging__EtwRegistrationManager__SupportsETW(); }
+  Severity MapLevelToSeverity() { return g_host->logging__EtwRegistrationManager__MapLevelToSeverity(this); }
+  void RegisterInternalCallback(const EtwInternalCallback& callback) { g_host->logging__EtwRegistrationManager__RegisterInternalCallback(this, callback); }
+  void UnregisterInternalCallback(const EtwInternalCallback& callback) { g_host->logging__EtwRegistrationManager__UnregisterInternalCallback(this, callback); }
+
+  // `this` is passed to the host as-is, so it is only meaningful for the reference handed out by Instance().
+  // Apply the same PROVIDER_DISALLOW_ALL guard the other wrapped types in this header use.
+  PROVIDER_DISALLOW_ALL(EtwRegistrationManager)
+};
+#endif  // defined(_WIN32)
+
 }  // namespace logging
 }  // namespace onnxruntime
 
@@ -234,6 +256,7 @@ struct TensorProto final {
   const std::string& raw_data() const { return g_host->TensorProto__raw_data(this); }
   std::string* mutable_raw_data() { return g_host->TensorProto__mutable_raw_data(this); }
 
+  bool has_data_type() const { return g_host->TensorProto__has_data_type(this); }
   int32_t data_type() const { return g_host->TensorProto__data_type(this); }
   void set_data_type(int32_t type) { return g_host->TensorProto__set_data_type(this, type); }
 
@@ -286,6 +309,7 @@ struct TensorShapeProto_Dimension final {
 struct TensorShapeProto_Dimensions final {
   IteratorHolder<TensorShapeProto_Dimension_Iterator, const TensorShapeProto_Dimension> begin() const { return g_host->TensorShapeProto_Dimensions__begin(this); }
   IteratorHolder<TensorShapeProto_Dimension_Iterator, const TensorShapeProto_Dimension> end() const { return g_host->TensorShapeProto_Dimensions__end(this); }
+  size_t size() const { return g_host->TensorShapeProto_Dimensions__size(this); }
 
   PROVIDER_DISALLOW_ALL(TensorShapeProto_Dimensions)
 };
@@ -305,6 +329,7 @@ struct TypeProto_Tensor final {
   bool has_shape() const { return g_host->TypeProto_Tensor__has_shape(this); }
   const TensorShapeProto& shape() const { return g_host->TypeProto_Tensor__shape(this); }
   TensorShapeProto* mutable_shape() { return g_host->TypeProto_Tensor__mutable_shape(this); }
+  bool has_elem_type() const { return g_host->TypeProto_Tensor__has_elem_type(this); }
   int32_t elem_type() const { return g_host->TypeProto_Tensor__elem_type(this); }
   void set_elem_type(int32_t value) { g_host->TypeProto_Tensor__set_elem_type(this, value); }
 
@@ -339,6 +364,7 @@ struct TypeProto_Sequence final {
 struct TypeProto final {
   static std::unique_ptr<TypeProto> Create() { return g_host->TypeProto__construct(); }
 
+  bool has_tensor_type() const { return g_host->TypeProto__has_tensor_type(this); }
   const TypeProto_Tensor& tensor_type() const { return g_host->TypeProto__tensor_type(this); }
   TypeProto_Tensor* mutable_tensor_type() { return g_host->TypeProto__mutable_tensor_type(this); }
 
@@ -475,6 +501,7 @@ namespace Utils {
 
 struct DataTypeUtils final {
   static const std::string* ToType(const ONNX_NAMESPACE::TypeProto& type_proto) { return g_host->Utils__DataTypeUtils__ToType(type_proto); }
+  static const std::string* ToType(const std::string& type_str) { return g_host->Utils__DataTypeUtils__ToType(type_str); }
 
   PROVIDER_DISALLOW_ALL(DataTypeUtils)
 };
@@ -770,6 +797,14 @@ struct Function final {
   PROVIDER_DISALLOW_ALL(Function)
 };
 
+struct Node_EdgeEnd final {  // Provider-side facade for the host's Node::EdgeEnd.
+  const Node& GetNode() const { return g_host->Node_EdgeEnd__GetNode(this); }
+  int GetSrcArgIndex() const { return g_host->Node_EdgeEnd__GetSrcArgIndex(this); }
+  int GetDstArgIndex() const { return g_host->Node_EdgeEnd__GetDstArgIndex(this); }
+
+  PROVIDER_DISALLOW_ALL(Node_EdgeEnd)
+};
+
 struct Node final {
   enum class Type {
     Primitive = 0,
@@ -801,6 +836,12 @@ struct Node final {
   void AddAttribute(const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) {
     g_host->Node__AddAttribute(this, attr_name, value);
   }
+  void AddAttribute(const std::string& attr_name, const std::string& value) {
+    g_host->Node__AddAttribute(this, attr_name, value);
+  }
+  void AddAttribute(const std::string& attr_name, int64_t value) {
+    g_host->Node__AddAttribute(this, attr_name, value);
+  }
 
   size_t GetInputEdgesCount() const noexcept { return g_host->Node__GetInputEdgesCount(this); }
   size_t GetOutputEdgesCount() const noexcept { return g_host->Node__GetOutputEdgesCount(this); }
@@ -832,6 +873,7 @@ struct Node final {
     }
 
     void operator++() { impl_->operator++(); }
+    const Node_EdgeEnd& operator*() const { return impl_->operator*(); }  // Current edge end; const like operator->().
     const Node__EdgeIterator* operator->() const { return impl_.get(); }
 
     std::unique_ptr<Node__EdgeIterator> impl_;
@@ -906,6 +948,13 @@ struct NodeUnit final {
     QDQGroup,    // The NodeUnit contain a QDQ group of nodes, such as "DQ->Sigmoid->Q"
   };
 
+  NodeUnit() = delete;
+  NodeUnit(const NodeUnit&) = delete;
+  void operator=(const NodeUnit& v) = delete;
+
+  // Need delete because of APIs that return unique_ptr<NodeUnit>
+  static void operator delete(void* p) { g_host->NodeUnit__operator_delete(reinterpret_cast<NodeUnit*>(p)); }
+
   Type UnitType() const noexcept { return static_cast<Type>(g_host->NodeUnit__UnitType(this)); }
 
   const std::vector<NodeUnitIODef>& Inputs() const noexcept { return g_host->NodeUnit__Inputs(this); }
@@ -941,6 +990,9 @@ struct Model final {
                                        const IOnnxRuntimeOpSchemaRegistryList* local_registries, const logging::Logger& logger) {
     return g_host->Model__construct(std::move(model_proto), model_path, local_registries, logger);
   }
+  static std::unique_ptr<Model> Create(const std::string& graph_name, bool is_onnx_domain_only, const logging::Logger& logger) {
+    return g_host->Model__construct(graph_name, is_onnx_domain_only, logger);
+  }
   static void operator delete(void* p) { g_host->Model__operator_delete(reinterpret_cast<Model*>(p)); }
   static Status Load(const PathString& file_path, /*out*/ ONNX_NAMESPACE::ModelProto& model_proto) { return g_host->Model__Load(file_path, model_proto); }
 
@@ -1041,6 +1093,7 @@ class GraphViewer final {
   const std::string& Name() const noexcept { return g_host->GraphViewer__Name(this); }
   const std::filesystem::path& ModelPath() const noexcept { return g_host->GraphViewer__ModelPath(this); }
 
+  const ConstGraphNodes& Nodes() const noexcept { return g_host->GraphViewer__Nodes(this); }
   const Node* GetNode(NodeIndex node_index) const { return g_host->GraphViewer__GetNode(this, node_index); }
   const NodeArg* GetNodeArg(const std::string& name) const { return g_host->GraphViewer__GetNodeArg(this, name); }
 
@@ -1058,6 +1111,9 @@ class GraphViewer final {
 
   const std::vector<const NodeArg*>& GetInputs() const noexcept { return g_host->GraphViewer__GetInputs(this); }
   const std::vector<const NodeArg*>& GetOutputs() const noexcept { return g_host->GraphViewer__GetOutputs(this); }
+  bool NodeProducesGraphOutput(const Node& node) const {
+    return g_host->GraphViewer__NodeProducesGraphOutput(this, node);
+  }
   const std::unordered_set<const NodeArg*>& GetValueInfo() const noexcept { return g_host->GraphViewer__GetValueInfo(this); }
 
   const InitializedTensorSet& GetAllInitializedTensors() const noexcept { return g_host->GraphViewer__GetAllInitializedTensors(this); }
@@ -1085,6 +1141,25 @@ class GraphViewer final {
   void operator=(const GraphViewer&) = delete;
 };
 
+struct ConstGraphNodes final {  // Provider-side view of the collection returned by GraphViewer::Nodes().
+  IteratorHolder<ConstGraphNodes_Iterator, const Node> begin() const {
+    return g_host->ConstGraphNodes__begin(this);  // host hands back a unique_ptr<ConstGraphNodes_Iterator>
+  }
+  IteratorHolder<ConstGraphNodes_Iterator, const Node> end() const {
+    return g_host->ConstGraphNodes__end(this);
+  }
+  IteratorHolder<ConstGraphNodes_Iterator, const Node> cbegin() const {
+    return g_host->ConstGraphNodes__cbegin(this);
+  }
+  IteratorHolder<ConstGraphNodes_Iterator, const Node> cend() const {
+    return g_host->ConstGraphNodes__cend(this);
+  }
+  // Emptiness is answered by the host via ConstGraphNodes__empty; no iterator is created for this query.
+  bool empty() const noexcept { return g_host->ConstGraphNodes__empty(this); }
+
+  PROVIDER_DISALLOW_ALL(ConstGraphNodes)
+};
+
 struct OpKernelContext final {
   template <typename T>
   const T& RequiredInput(int index) const;
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index af39edae20..3239e2b6e3 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -37,7 +37,6 @@
 #include "core/framework/model_metadef_id_generator.h"
 #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
 #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
-#include "core/session/onnxruntime_session_options_config_keys.h"
 
 #include "core/session/onnxruntime_c_api.h"
 #include "core/common/string_helper.h"
@@ -62,6 +61,10 @@
 #include "orttraining/core/framework/distributed_run_context.h"
 #endif
 
+#ifdef _WIN32
+#include "core/platform/windows/logging/etw_sink.h"
+#endif
+
 namespace ONNX_NAMESPACE {
 // We use these names in the provider API because we don't have the protobuf definitions of the RepeatedField* types
 using int64s = google::protobuf::RepeatedField<int64_t>;
@@ -76,11 +79,18 @@ using FunctionProtos = google::protobuf::RepeatedPtrField<FunctionProto>;
 namespace onnxruntime {
 using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef;
 using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema;
+using Node_EdgeEnd = Node::EdgeEnd;
+#ifdef _WIN32
+namespace logging {
+using EtwRegistrationManager_EtwInternalCallback = EtwRegistrationManager::EtwInternalCallback;
+}  // namespace logging
+#endif  // _WIN32
 }  // namespace onnxruntime
 
 #include "core/common/cpuid_info.h"
 #include "core/common/logging/logging.h"
 #include "core/providers/shared_library/provider_interfaces.h"
+#include "core/providers/partitioning_utils.h"
 
 #include "core/providers/cuda/cuda_provider_factory_creator.h"
 #include "core/providers/cann/cann_provider_factory_creator.h"
@@ -90,6 +100,7 @@ using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema;
 #include "core/providers/openvino/openvino_provider_factory_creator.h"
 #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h"
 #include "core/providers/vitisai/vitisai_provider_factory_creator.h"
+#include "core/providers/qnn/qnn_provider_factory_creator.h"
 
 #include "core/providers/cuda/cuda_provider_factory.h"
 #include "core/providers/cann/cann_provider_factory.h"
@@ -181,6 +192,7 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator {
   bool operator!=(const Node__EdgeIterator& p) const override { return v_ != static_cast<const Node__EdgeIterator_Impl*>(&p)->v_; }
 
   void operator++() override { v_.operator++(); }
+  const Node_EdgeEnd& operator*() const override { return v_.operator*(); }
   const Node& GetNode() const override { return v_->GetNode(); }
   int GetSrcArgIndex() const override { return v_->GetSrcArgIndex(); }
   int GetDstArgIndex() const override { return v_->GetDstArgIndex(); }
@@ -188,6 +200,18 @@ struct Node__EdgeIterator_Impl : Node__EdgeIterator {
   Node::EdgeConstIterator v_;
 };
 
+struct ConstGraphNodes_Iterator_Impl : ConstGraphNodes_Iterator {  // Wraps ConstGraphNodes::ConstNodeIterator.
+  ConstGraphNodes_Iterator_Impl(ConstGraphNodes::ConstNodeIterator&& v) : v_{std::move(v)} {}
+  // Unchecked static_cast: assumes `other` is also a ConstGraphNodes_Iterator_Impl.
+  bool operator!=(const ConstGraphNodes_Iterator& other) const override {
+    return v_ != static_cast<const ConstGraphNodes_Iterator_Impl*>(&other)->v_;
+  }
+  void operator++() override { v_.operator++(); }
+  const Node& operator*() override { return *v_; }
+  // The wrapped iterator that all operations above delegate to.
+  ConstGraphNodes::ConstNodeIterator v_;
+};
+
 #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS)
 common::Status LoadDynamicLibraryFromProvider(onnxruntime::PathString library_name) {
   const auto& platform_env = onnxruntime::Env::Default();
@@ -367,22 +391,57 @@ struct ProviderHostImpl : ProviderHost {
 
   // logging::Logger (wrapped)
   bool logging__Logger__OutputIsEnabled(const logging::Logger* p, logging::Severity severity, logging::DataType data_type) override { return p->OutputIsEnabled(severity, data_type); }
+  logging::Severity logging__Logger__GetSeverity(const logging::Logger* p) override {
+    return p->GetSeverity();
+  }
 
   // logging::LoggingManager (wrapped)
   const logging::Logger& logging__LoggingManager__DefaultLogger() override { return logging::LoggingManager::DefaultLogger(); }
 
   // logging::Capture (wrapped)
-  std::unique_ptr<logging::Capture> logging__Capture__construct(const logging::Logger& logger, logging::Severity severity, const char* category, logging::DataType dataType, const CodeLocation& location) override {
-    return std::make_unique<logging::Capture>(logger, severity, category, dataType, location);
+  std::unique_ptr<logging::Capture> logging__Capture__construct(const logging::Logger& logger,
+                                                                logging::Severity severity, const char* category,
+                                                                logging::DataType data_type,
+                                                                const CodeLocation& location) override {
+    return std::make_unique<logging::Capture>(logger, severity, category, data_type, location);
   }
   void logging__Capture__operator_delete(logging::Capture* p) noexcept override { delete p; }
   std::ostream& logging__Capture__Stream(logging::Capture* p) noexcept override { return p->Stream(); }
+  void logging__Capture__ProcessPrintf(logging::Capture* p, const char* format, va_list args) override {
+    p->ProcessPrintf(format, args);
+  }
+
+#if defined(_WIN32)
+  // logging::EtwRegistrationManager (wrapped, Windows only)
+  logging::EtwRegistrationManager& logging__EtwRegistrationManager__Instance() override {
+    return logging::EtwRegistrationManager::Instance();
+  }
+  bool logging__EtwRegistrationManager__SupportsETW() override {
+    return logging::EtwRegistrationManager::SupportsETW();
+  }
+  logging::Severity logging__EtwRegistrationManager__MapLevelToSeverity(logging::EtwRegistrationManager* p) override {
+    return p->MapLevelToSeverity();
+  }
+  void logging__EtwRegistrationManager__RegisterInternalCallback(
+      logging::EtwRegistrationManager* p,
+      const logging::EtwRegistrationManager_EtwInternalCallback& callback) override {
+    p->RegisterInternalCallback(callback);
+  }
+  void logging__EtwRegistrationManager__UnregisterInternalCallback(
+      logging::EtwRegistrationManager* p,
+      const logging::EtwRegistrationManager_EtwInternalCallback& callback) override {
+    p->UnregisterInternalCallback(callback);
+  }
+#endif  // defined(_WIN32)
+
 
   // Env
   Env& Env__Default() override { return Env::Default(); }
 
   // Utils::DataTypeUtils (wrapped)
   const std::string* Utils__DataTypeUtils__ToType(const ONNX_NAMESPACE::TypeProto& type_proto) override { return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_proto); }
+  const std::string* Utils__DataTypeUtils__ToType(const std::string& type_str) override {
+    return ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(type_str);
+  }
 
   // int64s (wrapped)
   int int64s__size(const ONNX_NAMESPACE::int64s* p) override { return p->size(); }
@@ -424,6 +483,7 @@ struct ProviderHostImpl : ProviderHost {
   bool TypeProto_Tensor__has_shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_shape(); }
   const ONNX_NAMESPACE::TensorShapeProto& TypeProto_Tensor__shape(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->shape(); }
   ONNX_NAMESPACE::TensorShapeProto* TypeProto_Tensor__mutable_shape(ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->mutable_shape(); }
+  bool TypeProto_Tensor__has_elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->has_elem_type(); }
   int32_t TypeProto_Tensor__elem_type(const ONNX_NAMESPACE::TypeProto_Tensor* p) override { return p->elem_type(); }
   void TypeProto_Tensor__set_elem_type(ONNX_NAMESPACE::TypeProto_Tensor* p, int32_t value) override { p->set_elem_type(value); };
 
@@ -444,6 +504,7 @@ struct ProviderHostImpl : ProviderHost {
   // TypeProto (wrapped)
   std::unique_ptr<ONNX_NAMESPACE::TypeProto> TypeProto__construct() override { return std::make_unique<ONNX_NAMESPACE::TypeProto>(); }
   void TypeProto__CopyFrom(ONNX_NAMESPACE::TypeProto* p, const ONNX_NAMESPACE::TypeProto* other) override { p->CopyFrom(*other); }
+  bool TypeProto__has_tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->has_tensor_type(); }
   const ONNX_NAMESPACE::TypeProto_Tensor& TypeProto__tensor_type(const ONNX_NAMESPACE::TypeProto* p) override { return p->tensor_type(); }
   ONNX_NAMESPACE::TypeProto_Tensor* TypeProto__mutable_tensor_type(ONNX_NAMESPACE::TypeProto* p) override { return p->mutable_tensor_type(); }
   int TypeProto__value_case(const ONNX_NAMESPACE::TypeProto* p) override { return p->value_case(); }
@@ -572,6 +633,7 @@ struct ProviderHostImpl : ProviderHost {
   const std::string& TensorProto__raw_data(const ONNX_NAMESPACE::TensorProto* p) override { return p->raw_data(); }
   std::string* TensorProto__mutable_raw_data(ONNX_NAMESPACE::TensorProto* p) override { return p->mutable_raw_data(); }
 
+  bool TensorProto__has_data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->has_data_type(); }
   int32_t TensorProto__data_type(const ONNX_NAMESPACE::TensorProto* p) override { return p->data_type(); }
   void TensorProto__set_data_type(ONNX_NAMESPACE::TensorProto* p, int32_t type) override { p->set_data_type(type); }
 
@@ -610,6 +672,10 @@ struct ProviderHostImpl : ProviderHost {
     return std::make_unique<TensorShapeProto_Dimension_Iterator_Impl>(p->end());
   }
 
+  size_t TensorShapeProto_Dimensions__size(const ONNX_NAMESPACE::TensorShapeProto_Dimensions* p) override {
+    return p->size();
+  }
+
   // TensorShapeProto (wrapped)
   int TensorShapeProto__dim_size(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim_size(); }
   const ONNX_NAMESPACE::TensorShapeProto_Dimensions& TensorShapeProto__dim(const ONNX_NAMESPACE::TensorShapeProto* p) override { return p->dim(); }
@@ -960,6 +1026,12 @@ struct ProviderHostImpl : ProviderHost {
   void Node__AddAttribute(Node* p, const ::std::string& attr_name, const ONNX_NAMESPACE::GraphProto& value) override {
     p->AddAttribute(attr_name, value);
   }
+  void Node__AddAttribute(Node* p, const ::std::string& attr_name, const std::string& value) override {
+    p->AddAttribute(attr_name, value);
+  }
+  void Node__AddAttribute(Node* p, const ::std::string& attr_name, int64_t value) override {
+    p->AddAttribute(attr_name, value);
+  }
   size_t Node__GetInputEdgesCount(const Node* p) noexcept override { return p->GetInputEdgesCount(); }
   size_t Node__GetOutputEdgesCount(const Node* p) noexcept override { return p->GetOutputEdgesCount(); }
 
@@ -982,6 +1054,11 @@ struct ProviderHostImpl : ProviderHost {
   std::unordered_map<std::string, gsl::not_null<const Graph*>> Node__GetAttributeNameToSubgraphMap(const Node* p) const override { return p->GetAttributeNameToSubgraphMap(); }
   int Node__NodeType(const Node* p) const noexcept override { return int(p->NodeType()); }
 
+  // Node_EdgeEnd (wrapped). Maps to Node::EdgeEnd struct.
+  const Node& Node_EdgeEnd__GetNode(const Node_EdgeEnd* p) override { return p->GetNode(); }
+  int Node_EdgeEnd__GetSrcArgIndex(const Node_EdgeEnd* p) override { return p->GetSrcArgIndex(); }
+  int Node_EdgeEnd__GetDstArgIndex(const Node_EdgeEnd* p) override { return p->GetDstArgIndex(); }
+
   // NodeArg (wrapped)
   const std::string& NodeArg__Name(const NodeArg* p) noexcept override { return p->Name(); }
   const ONNX_NAMESPACE::TensorShapeProto* NodeArg__Shape(const NodeArg* p) override { return p->Shape(); }
@@ -1016,7 +1093,8 @@ struct ProviderHostImpl : ProviderHost {
   void NodeAttributes__insert_or_assign(NodeAttributes* p, const std::string& k, const ONNX_NAMESPACE::AttributeProto& v) override { p->insert_or_assign(k, v); }
   void NodeAttributes__reserve(NodeAttributes* p, size_t size) override { p->reserve(size); }
 
-  // NodeUnit (wrapped)
+  void NodeUnit__operator_delete(NodeUnit* p) noexcept override { delete p; }  // NodeUnit (wrapped) section starts here
+
   int NodeUnit__UnitType(const NodeUnit* p) noexcept override { return static_cast<int>(p->UnitType()); }
 
   const std::vector<NodeUnitIODef>& NodeUnit__Inputs(const NodeUnit* p) noexcept override {
@@ -1064,12 +1142,46 @@ struct ProviderHostImpl : ProviderHost {
     return QDQ::GetAllNodeUnits(*graph_viewer, logger);
   }
 
+  // Partitioning utils
+  // Exposes onnxruntime::utils::CreateSupportedPartitions through the ProviderHost interface.
+  // All arguments are passed through unchanged and the resulting capabilities are returned as-is.
+  // The callback parameter is spelled std::function<std::string()> so this override reads exactly like the
+  // pure virtual it implements in ProviderHost (and like Utils__MakeComputeCapability).
+  std::vector<std::unique_ptr<ComputeCapability>>
+  Utils__CreateSupportedPartitions(const GraphViewer& graph_viewer,
+                                   const std::unordered_set<const Node*>& supported_nodes,
+                                   const std::unordered_set<std::string>& stop_ops,
+                                   const std::function<std::string()>& generate_metadef_name,
+                                   const std::string& execution_provider_name,
+                                   const std::string& execution_provider_type,
+                                   const std::unordered_map<const Node*, const NodeUnit*>* node_unit_map,
+                                   bool drop_constant_initializers) override {
+    return onnxruntime::utils::CreateSupportedPartitions(graph_viewer, supported_nodes, stop_ops,
+                                                         generate_metadef_name,
+                                                         execution_provider_name, execution_provider_type,
+                                                         node_unit_map, drop_constant_initializers);
+  }
+
+  std::unique_ptr<ComputeCapability>  // result of onnxruntime::utils::MakeComputeCapability, returned as-is
+  Utils__MakeComputeCapability(const GraphViewer& graph_viewer,
+                               const std::vector<const Node*>& group,
+                               const std::function<std::string()>& generate_metadef_name,
+                               const std::string& execution_provider_name,
+                               bool drop_constant_initializers) override {
+    return onnxruntime::utils::MakeComputeCapability(graph_viewer, group, generate_metadef_name,
+                                                     execution_provider_name, drop_constant_initializers);
+  }
+
   // Model (wrapped)
   std::unique_ptr<Model> Model__construct(ONNX_NAMESPACE::ModelProto&& model_proto, const PathString& model_path,
                                           const IOnnxRuntimeOpSchemaRegistryList* local_registries,
                                           const logging::Logger& logger) override {
     return std::make_unique<Model>(model_proto, model_path, local_registries, logger);
   }
+  std::unique_ptr<Model> Model__construct(const std::string& graph_name, bool is_onnx_domain_only,
+                                          const logging::Logger& logger) override {
+    return std::make_unique<Model>(graph_name, is_onnx_domain_only, logger);  // overload that takes no ModelProto
+  }
   void Model__operator_delete(Model* p) override { delete p; }
   Graph& Model__MainGraph(Model* p) override { return p->MainGraph(); }
   std::unique_ptr<ONNX_NAMESPACE::ModelProto> Model__ToProto(Model* p) override { return std::make_unique<ONNX_NAMESPACE::ModelProto>(p->ToProto()); }
@@ -1179,6 +1291,7 @@ struct ProviderHostImpl : ProviderHost {
   const std::string& GraphViewer__Name(const GraphViewer* p) noexcept override { return p->Name(); }
   const std::filesystem::path& GraphViewer__ModelPath(const GraphViewer* p) noexcept override { return p->ModelPath(); }
 
+  const ConstGraphNodes& GraphViewer__Nodes(const GraphViewer* p) noexcept override { return p->Nodes(); }
   const Node* GraphViewer__GetNode(const GraphViewer* p, NodeIndex node_index) override { return p->GetNode(node_index); }
   const NodeArg* GraphViewer__GetNodeArg(const GraphViewer* p, const std::string& name) override { return p->GetNodeArg(name); }
 
@@ -1196,6 +1309,9 @@ struct ProviderHostImpl : ProviderHost {
 
   const std::vector<const NodeArg*>& GraphViewer__GetInputs(const GraphViewer* p) noexcept override { return p->GetInputs(); }
   const std::vector<const NodeArg*>& GraphViewer__GetOutputs(const GraphViewer* p) noexcept override { return p->GetOutputs(); }
+  bool GraphViewer__NodeProducesGraphOutput(const GraphViewer* p, const Node& node) override {
+    return p->NodeProducesGraphOutput(node);
+  }
   const std::unordered_set<const NodeArg*>& GraphViewer__GetValueInfo(const GraphViewer* p) noexcept override { return p->GetValueInfo(); }
 
   const InitializedTensorSet& GraphViewer__GetAllInitializedTensors(const GraphViewer* p) override { return p->GetAllInitializedTensors(); }
@@ -1224,6 +1340,21 @@ struct ProviderHostImpl : ProviderHost {
   const Node* GraphViewer__GetProducerNode(const GraphViewer* p, const std::string& node_arg_name) const override { return p->GetProducerNode(node_arg_name); }
   IOnnxRuntimeOpSchemaCollectionPtr GraphViewer__GetSchemaRegistry(const GraphViewer* p) const override { return p->GetSchemaRegistry(); }
 
+  // ConstGraphNodes
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__begin(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->begin());
+  }
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__end(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->end());
+  }
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__cbegin(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->cbegin());
+  }
+  std::unique_ptr<ConstGraphNodes_Iterator> ConstGraphNodes__cend(const ConstGraphNodes* p) override {
+    return std::make_unique<ConstGraphNodes_Iterator_Impl>(p->cend());
+  }
+  bool ConstGraphNodes__empty(const ConstGraphNodes* p) noexcept override { return p->empty(); }
+
   // OpKernel (direct)
   const Node& OpKernel__Node(const OpKernel* p) override { return p->OpKernel::Node(); }
 
@@ -1651,6 +1782,9 @@ static ProviderLibrary s_library_tensorrt(LIBRARY_PREFIX ORT_TSTR("onnxruntime_p
 );
 static ProviderLibrary s_library_migraphx(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_migraphx") LIBRARY_EXTENSION);
 
+// QNN EP can be built either as a static library or a shared library. Can safely define s_library_qnn even if static.
+static ProviderLibrary s_library_qnn(LIBRARY_PREFIX ORT_TSTR("onnxruntime_providers_qnn") LIBRARY_EXTENSION);
+
 void UnloadSharedProviders() {
   s_library_dnnl.Unload();
   s_library_vitisai.Unload();
@@ -1662,6 +1796,7 @@ void UnloadSharedProviders() {
   s_library_rocm.Unload();
   s_library_shared.Unload();
   s_library_migraphx.Unload();
+  s_library_qnn.Unload();
 }
 
 // Used by test code
@@ -1832,6 +1967,20 @@ ProviderOptions OrtOpenVINOProviderOptionsToOrtOpenVINOProviderOptionsV2(const O
   return ov_options_converted_map;
 }
 
+#if !BUILD_QNN_EP_STATIC_LIB
+std::shared_ptr<IExecutionProviderFactory> QNNProviderFactoryCreator::Create(const ProviderOptions& provider_options_map,
+                                                                             const SessionOptions* session_options) {
+  // The factory call below takes a single opaque pointer, so both option sets are packed into a
+  // two-element array: [0] = provider options map, [1] = session config options (nullptr if no session options).
+  const ConfigOptions* session_configs = (session_options != nullptr) ? &session_options->config_options : nullptr;
+  std::array<const void*, 2> packed_options = {&provider_options_map, session_configs};
+
+  // Any object pointer converts implicitly to const void*, so no cast is needed.
+  const void* factory_arg = &packed_options;
+  return s_library_qnn.Get().CreateExecutionProviderFactory(factory_arg);
+}
+#endif  // !BUILD_QNN_EP_STATIC_LIB
+
 std::shared_ptr<IExecutionProviderFactory> OpenVINOProviderFactoryCreator::Create(
     const ProviderOptions* provider_options_map, const SessionOptions* session_options) {
   // Append session options applicable for EP to EP Provider options.
diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
index 92ec4ba3b0..a6fb664728 100644
--- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -6,7 +6,9 @@
 #include <thread>
 
 #include "core/providers/cpu/cpu_provider_factory.h"  // For OrtSessionOptionsAppendExecutionProvider_CPU
-#include "core/providers/qnn/qnn_allocator.h"
+#if BUILD_QNN_EP_STATIC_LIB
+#include "core/providers/qnn/qnn_allocator.h"  // Used by QnnHTPBackendTests.UseHtpSharedMemoryAllocatorForInputs
+#endif
 #include "core/session/inference_session.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
@@ -1099,6 +1101,9 @@ TEST_F(QnnHTPBackendTests, EPOffloadsGraphIOQuantDequant) {
   }
 }
 
+// Only compile this test when QNN EP is built as a static library. When QNN EP is a shared library,
+// we cannot include internal QNN EP headers that use the provider-bridge API.
+#if BUILD_QNN_EP_STATIC_LIB
 TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) {
   ProviderOptions provider_options;
 #if defined(_WIN32)
@@ -1145,6 +1150,7 @@ TEST_F(QnnHTPBackendTests, UseHtpSharedMemoryAllocatorForInputs) {
                   ExpectedEPNodeAssignment::All,
                   0.008f);
 }
+#endif  // BUILD_QNN_EP_STATIC_LIB
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 #endif  // !defined(ORT_MINIMAL_BUILD)
diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
index a3f0ed55b8..38fde332ca 100644
--- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -7,7 +7,6 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "core/session/inference_session.h"
-#include "core/providers/shared/utils/utils.h"
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
@@ -25,6 +24,24 @@ namespace test {
 
 #if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
 
+// Reads the integer attribute `attr_name` from `node`.
+// Yields `default_val` when the node carries no attribute with that name.
+static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) {
+  const auto& attrs = node.GetAttributes();
+  const auto it = attrs.find(attr_name);
+
+  return (it != attrs.end()) ? it->second.i() : default_val;
+}
+
+// Returns a copy of the string attribute `attr_name` of `node`, or of `default_val` when the attribute is absent.
+static std::string GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) {
+  const auto& attributes = node.GetAttributes();
+  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
+    return entry->second.s();
+  }
+  return default_val;  // By value: callers pass "", so default_val is a temporary and a reference could dangle.
+}
+
 // Create a model with FusedMatMul + Add (quantized)
 // input1 -> Add -> Q -> DQ \
 //                           FusedMatMul -> Q -> DQ -> output
@@ -873,10 +890,9 @@ static void GetLastContextBinaryFileName(const std::string last_onnx_ctx_file,
   auto& ctx_graph = ctx_model->MainGraph();
   for (auto& node : ctx_graph.Nodes()) {
     if (node.OpType() == "EPContext") {
-      NodeAttrHelper node_helper(node);
-      int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
+      int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
       if (1 == is_main_context) {
-        last_ctx_bin_file = node_helper.Get("ep_cache_context", "");
+        last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", "");
         return;
       }
     }
@@ -899,10 +915,9 @@ static void UpdateEpContextModel(const std::vector<std::string>& ep_ctx_files,
 
     for (auto& node : ctx_graph.Nodes()) {
       if (node.OpType() == "EPContext") {
-        NodeAttrHelper node_helper(node);
-        int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
+        int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
         if (1 == is_main_context) {
-          std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", "");
+          std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", "");
           auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name);
           std::remove(file_path.string().c_str());
           node.ClearAttribute("ep_cache_context");
diff --git a/onnxruntime/test/qnn_ctx_gen/main.cc b/onnxruntime/test/qnn_ctx_gen/main.cc
index 3be0bd253c..bb5007b40b 100644
--- a/onnxruntime/test/qnn_ctx_gen/main.cc
+++ b/onnxruntime/test/qnn_ctx_gen/main.cc
@@ -16,7 +16,6 @@
 #include "core/common/logging/sinks/clog_sink.h"
 
 #include "core/graph/model.h"
-#include "core/providers/shared/utils/utils.h"
 #include "core/session/environment.h"
 #include "core/common/logging/logging.h"
 
@@ -31,6 +30,24 @@ static void CheckStatus(const Status& status) {
   }
 }
 
+// Reads the integer attribute `attr_name` from `node`.
+// Yields `default_val` when the node carries no attribute with that name.
+static int64_t GetNodeAttr(const Node& node, const std::string& attr_name, int64_t default_val) {
+  const auto& attrs = node.GetAttributes();
+  const auto it = attrs.find(attr_name);
+
+  return (it != attrs.end()) ? it->second.i() : default_val;
+}
+
+// Returns a copy of the string attribute `attr_name` of `node`, or of `default_val` when the attribute is absent.
+static std::string GetNodeAttr(const Node& node, const std::string& attr_name, const std::string& default_val) {
+  const auto& attributes = node.GetAttributes();
+  if (auto entry = attributes.find(attr_name); entry != attributes.end()) {
+    return entry->second.s();
+  }
+  return default_val;  // By value: callers pass "", so default_val is a temporary and a reference could dangle.
+}
+
 // from the last context cache Onnx model, find the EPContext node with main_context=1,
 // and get the QNN context binary file name, this context binary contains all graphs from all Onnx models
 // get the max spill fill buffer size
@@ -44,11 +61,10 @@ static void GetLastContextBinaryFileName(const std::basic_string<ORTCHAR_T> last
   auto& ctx_graph = ctx_model->MainGraph();
   for (auto& node : ctx_graph.Nodes()) {
     if (node.OpType() == "EPContext") {
-      NodeAttrHelper node_helper(node);
-      int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
-      max_size = node_helper.Get("max_size", static_cast<int64_t>(0));
+      int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
+      max_size = GetNodeAttr(node, "max_size", static_cast<int64_t>(0));
       if (1 == is_main_context) {
-        last_ctx_bin_file = node_helper.Get("ep_cache_context", "");
+        last_ctx_bin_file = GetNodeAttr(node, "ep_cache_context", "");
         return;
       }
     }
@@ -72,10 +88,9 @@ static void UpdateEpContextModel(const std::vector<std::basic_string<ORTCHAR_T>>
 
     for (auto& node : ctx_graph.Nodes()) {
       if (node.OpType() == "EPContext") {
-        NodeAttrHelper node_helper(node);
-        int64_t is_main_context = node_helper.Get("main_context", static_cast<int64_t>(0));
+        int64_t is_main_context = GetNodeAttr(node, "main_context", static_cast<int64_t>(0));
         if (1 == is_main_context) {
-          std::string old_qnn_ctx_binary_file_name = node_helper.Get("ep_cache_context", "");
+          std::string old_qnn_ctx_binary_file_name = GetNodeAttr(node, "ep_cache_context", "");
           auto file_path = path.replace_filename(old_qnn_ctx_binary_file_name);
           std::remove(file_path.string().c_str());
           node.ClearAttribute("ep_cache_context");
diff --git a/setup.py b/setup.py
index a2d50284b0..6481f58f69 100644
--- a/setup.py
+++ b/setup.py
@@ -315,17 +315,20 @@ providers_cuda_or_rocm = "onnxruntime_providers_" + ("rocm" if is_rocm else "cud
 providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt")
 providers_openvino = "onnxruntime_providers_openvino"
 providers_cann = "onnxruntime_providers_cann"
+providers_qnn = "onnxruntime_providers_qnn"
 
 if platform.system() == "Linux":
     providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so"
     providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so"
     providers_openvino = "lib" + providers_openvino + ".so"
     providers_cann = "lib" + providers_cann + ".so"
+    providers_qnn = "lib" + providers_qnn + ".so"
 elif platform.system() == "Windows":
     providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll"
     providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll"
     providers_openvino = providers_openvino + ".dll"
     providers_cann = providers_cann + ".dll"
+    providers_qnn = providers_qnn + ".dll"
 
 # Additional binaries
 dl_libs = []
@@ -345,8 +348,9 @@ if platform.system() == "Linux" or platform.system() == "AIX":
     dl_libs.append(providers_cuda_or_rocm)
     dl_libs.append(providers_tensorrt_or_migraphx)
     dl_libs.append(providers_cann)
+    dl_libs.append(providers_qnn)
     dl_libs.append("libonnxruntime.so*")
-    # DNNL, TensorRT & OpenVINO EPs are built as shared libs
+    # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs
     libs.extend(["libonnxruntime_providers_shared.so"])
     libs.extend(["libonnxruntime_providers_dnnl.so"])
     libs.extend(["libonnxruntime_providers_openvino.so"])
@@ -354,6 +358,7 @@ if platform.system() == "Linux" or platform.system() == "AIX":
     libs.append(providers_cuda_or_rocm)
     libs.append(providers_tensorrt_or_migraphx)
     libs.append(providers_cann)
+    libs.append(providers_qnn)
     # QNN
     qnn_deps = [
         "libQnnCpu.so",
@@ -392,13 +397,14 @@ else:
         providers_cann,
         "onnxruntime.dll",
     ]
-    # DNNL, TensorRT & OpenVINO EPs are built as shared libs
+    # DNNL, TensorRT, OpenVINO, and QNN EPs are built as shared libs
     libs.extend(["onnxruntime_providers_shared.dll"])
     libs.extend(["onnxruntime_providers_dnnl.dll"])
     libs.extend(["onnxruntime_providers_tensorrt.dll"])
     libs.extend(["onnxruntime_providers_openvino.dll"])
     libs.extend(["onnxruntime_providers_cuda.dll"])
     libs.extend(["onnxruntime_providers_vitisai.dll"])
+    libs.extend(["onnxruntime_providers_qnn.dll"])
     # DirectML Libs
     libs.extend(["DirectML.dll"])
     # QNN V68/V73 dependencies
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index e7d93aeabe..bce7552854 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -129,6 +129,17 @@ def _openvino_verify_device_type(device_read):
     return device_read
 
 
+def _qnn_verify_library_kind(library_kind):
+    valid_kinds = ["shared_lib", "static_lib"]
+    if library_kind in valid_kinds:
+        return library_kind  # accepted unchanged; argparse calls this as the `type=` callback of --use_qnn
+    print("\nYou have specified an invalid library kind for QNN EP.")
+    print(f"The invalid library kind was: {library_kind}")
+    print("Provide a library kind from the following options: ", valid_kinds)
+    print(f"Example: --use_qnn {valid_kinds[0]}")
+    sys.exit("Incorrect build configuration")
+
+
 def parse_arguments():
     class Parser(argparse.ArgumentParser):
         # override argument file line parsing behavior - allow multiple arguments per line and handle quotes
@@ -578,7 +589,14 @@ def parse_arguments():
     parser.add_argument("--use_jsep", action="store_true", help="Build with JavaScript kernels.")
     parser.add_argument("--use_webgpu", action="store_true", help="Build with WebGPU support.")
     parser.add_argument("--use_external_dawn", action="store_true", help="Treat Dawn as an external dependency.")
-    parser.add_argument("--use_qnn", action="store_true", help="Build with QNN support.")
+    parser.add_argument(
+        "--use_qnn",
+        nargs="?",
+        const="shared_lib",  # If --use_qnn is provided without a value, QNN EP is built as a shared library.
+        type=_qnn_verify_library_kind,
+        help="Build with QNN support. Specify 'shared_lib' or 'static_lib' to build QNN EP "
+        "as a shared or static library, respectively.",
+    )
     parser.add_argument("--qnn_home", help="Path to QNN SDK dir.")
     parser.add_argument("--use_rknpu", action="store_true", help="Build with RKNPU.")
     parser.add_argument("--use_preinstalled_eigen", action="store_true", help="Use pre-installed Eigen.")
@@ -1350,6 +1368,11 @@ def generate_build_tree(
             raise BuildError("qnn_home=" + qnn_home + " not valid." + " qnn_home paths must be specified and valid.")
         cmake_args += ["-Donnxruntime_USE_QNN=ON"]
 
+        if args.use_qnn == "static_lib":
+            cmake_args += ["-Donnxruntime_BUILD_QNN_EP_STATIC_LIB=ON"]
+        if args.android and args.use_qnn != "static_lib":
+            raise BuildError("Only support Android + QNN builds with QNN EP built as a static library.")
+
     if args.use_coreml:
         cmake_args += ["-Donnxruntime_USE_COREML=ON"]
 
@@ -2401,6 +2424,8 @@ def build_nuget_package(
     elif use_rocm:
         package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm"
     elif use_qnn:
+        if use_qnn != "shared_lib":
+            raise BuildError("Currently NuGet packages with QNN require QNN EP to be built as a shared library.")
         execution_provider = "/p:ExecutionProvider=qnn"
         package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN"
     elif any("OrtPackageId=" in x for x in msbuild_extra_options):
diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py
index e9f8fea951..c2bc5cba82 100644
--- a/tools/ci_build/github/android/build_aar_package.py
+++ b/tools/ci_build/github/android/build_aar_package.py
@@ -72,11 +72,15 @@ def _parse_build_settings(args):
     return build_settings
 
 
+def _is_qnn_android_build(build_settings):
+    return any(build_arg.startswith("--use_qnn") for build_arg in build_settings["build_params"])
+
+
 def _build_aar(args):
     build_settings = _parse_build_settings(args)
     build_dir = os.path.abspath(args.build_dir)
     ops_config_path = os.path.abspath(args.include_ops_by_config) if args.include_ops_by_config else None
-    qnn_android_build = "--use_qnn" in build_settings["build_params"]
+    qnn_android_build = _is_qnn_android_build(build_settings)
 
     # Setup temp environment for building
     temp_env = os.environ.copy()
diff --git a/tools/ci_build/github/android/default_qnn_aar_build_settings.json b/tools/ci_build/github/android/default_qnn_aar_build_settings.json
index 8c36244035..5ac49f582d 100644
--- a/tools/ci_build/github/android/default_qnn_aar_build_settings.json
+++ b/tools/ci_build/github/android/default_qnn_aar_build_settings.json
@@ -11,7 +11,7 @@
         "--cmake_generator=Ninja",
         "--build_java",
         "--build_shared_lib",
-        "--use_qnn",
+        "--use_qnn=static_lib",
         "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF",
         "--skip_tests"
 
diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml
index b89aa50171..f237ef37fe 100644
--- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml
@@ -72,7 +72,8 @@ jobs:
         --android_abi=x86_64 \
         --android_api=31 \
         --parallel \
-        --use_qnn \
+        --build_shared_lib \
+        --use_qnn static_lib \
         --qnn_home $(QnnSDKRootDir) \
         --cmake_generator=Ninja \
         --skip_tests
diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
index fb235bda24..093db011e4 100644
--- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml
@@ -41,7 +41,12 @@ jobs:
     timeoutInMinutes: 60
     workspace:
       clean: all
-
+    strategy:
+      matrix:
+        SHARED_LIB:
+          QnnLibKind: 'shared_lib'
+        STATIC_LIB:
+          QnnLibKind: 'static_lib'
     steps:
       - script: |
           ls -R /data/qnn_test_data
@@ -65,7 +70,8 @@ jobs:
             --config Release \
             --use_binskim_compliant_compile_flags \
             --build_java \
-            --use_qnn \
+            --build_shared_lib \
+            --use_qnn $(QnnLibKind) \
             --qnn_home $(QnnSDKRootDir) \
             --cmake_generator=Ninja \
             --update --build --parallel
@@ -77,7 +83,8 @@ jobs:
             --config Release \
             --use_binskim_compliant_compile_flags \
             --build_java \
-            --use_qnn \
+            --build_shared_lib \
+            --use_qnn $(QnnLibKind) \
             --qnn_home $(QnnSDKRootDir) \
             --cmake_generator=Ninja \
             --test
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml
index d1b85e64fa..ff2ecb0d3c 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml
@@ -94,6 +94,7 @@ jobs:
             --build_dir $(Build.BinariesDirectory)
             --skip_submodule_sync
             --cmake_generator "$(VSGenerator)"
+            --build_shared_lib
             --use_qnn
             --qnn_home $(QnnSDKRootDir)
             --enable_pybind
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml
index 8595a52cde..f382156c03 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml
@@ -92,6 +92,7 @@ jobs:
             --build_dir $(Build.BinariesDirectory)
             --skip_submodule_sync
             --cmake_generator "$(VSGenerator)"
+            --build_shared_lib
             --use_qnn
             --qnn_home $(QnnSDKRootDir)
             --enable_pybind
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml
index 979961d066..a5f2a481e6 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml
@@ -92,6 +92,7 @@ jobs:
             --build_dir $(Build.BinariesDirectory)
             --skip_submodule_sync
             --cmake_generator "$(VSGenerator)"
+            --build_shared_lib
             --use_qnn
             --qnn_home $(QnnSDKRootDir)
             --enable_pybind
diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
index 205bd0b5c3..5a74998ca4 100644
--- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
@@ -93,12 +93,18 @@ stages:
           workingFolder: '$(Build.BinariesDirectory)\${{ parameters.build_config }}'
           createLogFile: true
 
+      - task: CmdLine@2
+        displayName: 'Print contents of binaries directory'
+        inputs:
+          script: |
+            dir $(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}
+
       - template: win-esrp-dll.yml
         parameters:
           FolderPath: '$(Build.BinariesDirectory)\${{ parameters.build_config }}\${{ parameters.build_config }}'
           DisplayName: 'ESRP - Sign dlls'
           DoEsrp: ${{ parameters.DoEsrp }}
-          Pattern: 'onnxruntime.dll'
+          Pattern: 'onnxruntime*.dll'
 
       - task: MSBuild@1
         displayName: 'Restore NuGet Packages and create project.assets.json'
diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml
index 84b71b37d9..787c3ffe23 100644
--- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml
@@ -36,7 +36,7 @@ parameters:
   default: 2.30.0.250109
 
 jobs:
-- job: 'build'
+- job: 'BUILD_QNN_EP'
   pool: 'onnxruntime-qnn-windows-vs-2022-arm64'
   variables:
     DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
@@ -46,6 +46,12 @@ jobs:
   timeoutInMinutes: 240
   workspace:
     clean: all
+  strategy:
+    matrix:
+      SHARED_LIB:
+        QnnLibKind: 'shared_lib'
+      STATIC_LIB:
+        QnnLibKind: 'static_lib'
   steps:
 
   - script: |
@@ -79,7 +85,8 @@ jobs:
         --config $(BuildConfig)
         --build_dir $(Build.BinariesDirectory)
         --cmake_generator "Visual Studio 17 2022"
-        --use_qnn
+        --build_shared_lib
+        --use_qnn $(QnnLibKind)
         --qnn_home $(QnnSDKRootDir)
         --update --build --parallel
 
@@ -88,7 +95,8 @@ jobs:
         --config $(BuildConfig) ^
         --build_dir $(Build.BinariesDirectory) ^
         --cmake_generator "Visual Studio 17 2022" ^
-        --use_qnn ^
+        --build_shared_lib ^
+        --use_qnn $(QnnLibKind) ^
         --qnn_home $(QnnSDKRootDir) ^
         --test --enable_onnx_tests
     displayName: 'Run unit tests'
@@ -121,7 +129,7 @@ jobs:
       TargetFolder: '$(Build.ArtifactStagingDirectory)'
       CleanTargetFolder: true
       OverWrite: true
-    condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
+    condition: and(succeeded(), and(ne(variables['Build.Reason'], 'PullRequest'), eq(variables['QnnLibKind'], 'shared_lib')))
 
   - task: PublishBuildArtifacts@1
     displayName: 'Publish Artifact'
@@ -129,4 +137,4 @@ jobs:
       PathtoPublish: '$(Build.ArtifactStagingDirectory)'
       ArtifactName: 'internal_release'
       publishLocation: 'Container'
-    condition: and(succeeded(), ne(variables['Build.Reason'], 'PullRequest'))
+    condition: and(succeeded(), and(ne(variables['Build.Reason'], 'PullRequest'), eq(variables['QnnLibKind'], 'shared_lib')))
diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml
index e6792bc34a..28fbe4a109 100644
--- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml
@@ -36,7 +36,7 @@ parameters:
   default: 2.30.0.250109
 
 jobs:
-- job: 'build'
+- job: 'BUILD_QNN_EP'
   pool: 'Onnxruntime-QNNEP-Windows-2022-CPU'
   variables:
     MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary'
@@ -50,6 +50,12 @@ jobs:
   timeoutInMinutes: 120
   workspace:
     clean: all
+  strategy:
+    matrix:
+      SHARED_LIB:
+        QnnLibKind: 'shared_lib'
+      STATIC_LIB:
+        QnnLibKind: 'static_lib'
   steps:
 
   - task: UsePythonVersion@0
@@ -72,7 +78,8 @@ jobs:
         --build_dir $(Build.BinariesDirectory)
         --cmake_generator "Visual Studio 17 2022"
         --build_java
-        --use_qnn
+        --build_shared_lib
+        --use_qnn $(QnnLibKind)
         --qnn_home $(QnnSDKRootDir)
         --use_binskim_compliant_compile_flags
         --update --parallel
@@ -87,7 +94,8 @@ jobs:
         --build_dir $(Build.BinariesDirectory) ^
         --cmake_generator "Visual Studio 17 2022" ^
         --build_java ^
-        --use_qnn ^
+        --build_shared_lib ^
+        --use_qnn $(QnnLibKind) ^
         --qnn_home $(QnnSDKRootDir) ^
         --use_binskim_compliant_compile_flags ^
         --test --enable_onnx_tests
diff --git a/tools/ci_build/github/linux/build_linux_python_package.sh b/tools/ci_build/github/linux/build_linux_python_package.sh
index 6d86a57bd7..b5999da997 100755
--- a/tools/ci_build/github/linux/build_linux_python_package.sh
+++ b/tools/ci_build/github/linux/build_linux_python_package.sh
@@ -75,7 +75,7 @@ fi
 
 if [ "$BUILD_DEVICE" == "NPU" ]; then
     #Enable QNN EP
-    BUILD_ARGS+=("--use_qnn" "--qnn_home=/qnn_sdk")
+    BUILD_ARGS+=("--build_shared_lib" "--use_qnn" "--qnn_home=/qnn_sdk")
 fi
 
 export ONNX_ML=1
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index 0568ae864d..8ccb2c0549 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -382,6 +382,7 @@ def generate_files(line_list, args):
             "tensorrt_ep_shared_lib": "onnxruntime_providers_tensorrt.dll",
             "openvino_ep_shared_lib": "onnxruntime_providers_openvino.dll",
             "cuda_ep_shared_lib": "onnxruntime_providers_cuda.dll",
+            "qnn_ep_shared_lib": "onnxruntime_providers_qnn.dll",
             "onnxruntime_perf_test": "onnxruntime_perf_test.exe",
             "onnx_test_runner": "onnx_test_runner.exe",
         }
@@ -777,6 +778,24 @@ def generate_files(line_list, args):
             + '\\native" />'
         )
 
+    if args.execution_provider == "qnn" or (is_qnn_package and not is_ado_packaging_build):
+        files_list.append(
+            "<file src="
+            + '"'
+            + os.path.join(args.native_build_path, nuget_dependencies["providers_shared_lib"])
+            + runtimes_target
+            + args.target_architecture
+            + '\\native" />'
+        )
+        files_list.append(
+            "<file src="
+            + '"'
+            + os.path.join(args.native_build_path, nuget_dependencies["qnn_ep_shared_lib"])
+            + runtimes_target
+            + args.target_architecture
+            + '\\native" />'
+        )
+
     # process all other library dependencies
     if is_cpu_package or is_cuda_gpu_package or is_dml_package or is_mklml_package:
         # Process dnnl dependency

From 46dc0b5f2106e1db6146c1c7c0d9c64577eb78b7 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Wed, 22 Jan 2025 21:26:24 -0800
Subject: [PATCH 13/37] [QNN EP] Add LoggingManager::HasDefaultLogger() to
 provider bridge API (#23467)

### Description
Fixes QNN EP build failures caused by a function missing from the provider bridge API:
`logging::LoggingManager::HasDefaultLogger()`


### Motivation and Context
A [recent PR](https://github.com/microsoft/onnxruntime/pull/23120) made
QNN EP a shared library. A [different
PR](https://github.com/microsoft/onnxruntime/pull/23435) added use of a
new function to QNN EP that was not part of the provider bridge API. The
CI did not catch it because main was not merged into the first PR's branch
before that PR was merged.
---
 onnxruntime/core/providers/shared_library/provider_interfaces.h  | 1 +
 .../core/providers/shared_library/provider_wrappedtypes.h        | 1 +
 onnxruntime/core/session/provider_bridge_ort.cc                  | 1 +
 3 files changed, 3 insertions(+)

diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index d22cb2ed00..962d10d895 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -286,6 +286,7 @@ struct ProviderHost {
 
   // logging::LoggingManager
   virtual const logging::Logger& logging__LoggingManager__DefaultLogger() = 0;
+  virtual bool logging__LoggingManager__HasDefaultLogger() = 0;
 
   // logging::Capture
   virtual std::unique_ptr<logging::Capture> logging__Capture__construct(const logging::Logger& logger,
diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
index 1b6c29e686..e434935343 100644
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@@ -32,6 +32,7 @@ struct Logger final {
 
 struct LoggingManager final {
   static const Logger& DefaultLogger() { return g_host->logging__LoggingManager__DefaultLogger(); }
+  static bool HasDefaultLogger() { return g_host->logging__LoggingManager__HasDefaultLogger(); }
 
   PROVIDER_DISALLOW_ALL(LoggingManager)
 };
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 3239e2b6e3..d7c6dab72f 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -397,6 +397,7 @@ struct ProviderHostImpl : ProviderHost {
 
   // logging::LoggingManager (wrapped)
   const logging::Logger& logging__LoggingManager__DefaultLogger() override { return logging::LoggingManager::DefaultLogger(); }
+  bool logging__LoggingManager__HasDefaultLogger() override { return logging::LoggingManager::HasDefaultLogger(); }
 
   // logging::Capture (wrapped)
   std::unique_ptr<logging::Capture> logging__Capture__construct(const logging::Logger& logger,

From 06fc73b7d4d80bd97e140776590d98b868c7bc3a Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Thu, 23 Jan 2025 08:54:55 -0800
Subject: [PATCH 14/37] [TRT EP Perf Tool] Add annotations import to python
 script to support annotations on Python 3.8 (#23466)

### Description
Adds `from __future__ import annotations` to python script to support
annotations on Python 3.8.


### Motivation and Context
The pipeline that runs this script uses Ubuntu 20.04's default Python
version (3.8), which does not support these annotations unless
`annotations` is imported from `__future__`.
---
 onnxruntime/python/tools/tensorrt/perf/build/build_image.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
index 0384300b99..7f418af06a 100644
--- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
+++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py
@@ -6,6 +6,8 @@
 Builds an Ubuntu-based Docker image with TensorRT.
 """
 
+from __future__ import annotations
+
 import argparse
 import os
 import pty

From 8b1d3b3d57148afc41b788e9cdc6ec5033a08bc7 Mon Sep 17 00:00:00 2001
From: Ti-Tai Wang <titaiwang@microsoft.com>
Date: Thu, 23 Jan 2025 17:35:11 -0800
Subject: [PATCH 15/37] Align AvgPool ceil_mode on last value to torch (#16752)

Fix #16203

Prior to this PR, when `ceil_mode` was on, the calculation of a value
would divide by the full kernel size even when fewer pixels than the
kernel size remained, which caused a difference in this operator between
ORT and torch.

However, this fix only applies to the change in #15597, which only
supports AvgPool since opset 19. Older opset versions remain the same,
as they use the mlas files.

This PR also fixes the shape mismatch caused by a sliding window starting
in the padding. More detail: https://github.com/onnx/onnx/pull/6650
(this PR is also validated with the tests added in
https://github.com/onnx/onnx/pull/6650).
---
 .../core/providers/cpu/nn/pool_attributes.h   | 26 ++++++++++++-------
 .../core/providers/cpu/nn/pool_functors.h     |  6 +++++
 onnxruntime/test/onnx/TestCase.cc             |  1 +
 .../test/providers/cpu/nn/pool_op_test.cc     | 25 ++++++++++++++++++
 4 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/pool_attributes.h b/onnxruntime/core/providers/cpu/nn/pool_attributes.h
index 118cb4a3ba..fbbd427375 100644
--- a/onnxruntime/core/providers/cpu/nn/pool_attributes.h
+++ b/onnxruntime/core/providers/cpu/nn/pool_attributes.h
@@ -150,14 +150,14 @@ struct PoolAttributes {
         case AutoPadType::VALID:
           *pad_head = 0;
           *pad_tail = 0;
-          *out_size = ComputeOutputSize(in_size, stride, kernel, 0, dilation);
+          *out_size = ComputeOutputSize(in_size, stride, kernel, 0, 0, dilation);
           break;
         case AutoPadType::SAME_LOWER: {
           int64_t legacy_target_size = (in_size + stride - 1) / stride;
           int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
           *pad_head = (pad_needed + 1) / 2;
           *pad_tail = pad_needed - *pad_head;
-          *out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation);
+          *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation);
           break;
         }
         case AutoPadType::SAME_UPPER: {
@@ -165,7 +165,7 @@ struct PoolAttributes {
           int64_t pad_needed = (legacy_target_size - 1) * stride + kernel - in_size;
           *pad_head = pad_needed / 2;
           *pad_tail = pad_needed - *pad_head;
-          *out_size = ComputeOutputSize(in_size, stride, kernel, pad_needed, dilation);
+          *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation);
           break;
         }
         default: {
@@ -173,7 +173,7 @@ struct PoolAttributes {
         }
       }
     } else {
-      *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head + *pad_tail, dilation);
+      *out_size = ComputeOutputSize(in_size, stride, kernel, *pad_head, *pad_tail, dilation);
     }
   }
 #if defined(_MSC_VER) && !defined(__clang__)
@@ -184,13 +184,21 @@ struct PoolAttributes {
   int64_t ComputeOutputSize(int64_t in_size,
                             int64_t stride,
                             int64_t kernel,
-                            int64_t pad_needed,
+                            int64_t pad_head,
+                            int64_t pad_tail,
                             int64_t dilation) const {
-    if (ceil_mode == 0) {
-      return static_cast<int64_t>(static_cast<float>(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1);
+    int64_t numerator = in_size + pad_head + pad_tail - dilation * (kernel - 1) - 1;
+    int64_t out_size = numerator / stride + 1;
+
+    if (ceil_mode == 1) {
+      out_size = static_cast<int64_t>(std::ceil(static_cast<float>(numerator) / stride)) + 1;
+      // Ensure that the last pooling starts inside the image (at least 1 pixel)
+      // Reference: https://github.com/onnx/onnx/pull/5741
+      if ((out_size - 1) * stride >= in_size + pad_head) {
+        --out_size;
+      }
     }
-    return static_cast<int64_t>(
-        std::ceil(static_cast<float>(in_size + pad_needed - dilation * (kernel - 1) - 1) / stride + 1));
+    return out_size;
   }
 #if defined(_MSC_VER) && !defined(__clang__)
 #pragma warning(pop)
diff --git a/onnxruntime/core/providers/cpu/nn/pool_functors.h b/onnxruntime/core/providers/cpu/nn/pool_functors.h
index d3205278b7..476a9a0338 100644
--- a/onnxruntime/core/providers/cpu/nn/pool_functors.h
+++ b/onnxruntime/core/providers/cpu/nn/pool_functors.h
@@ -406,6 +406,7 @@ struct AveragePool1DTask final {
     for (int64_t ph = 0; ph < pooled_height; ++ph) {
       int64_t hstart = ph * stride_h - pads[0];
       int64_t hend = hstart + kernel_shape[0] * dilation_h;
+      hend = std::min(hend, height + pads[1]);
       y_d[ph] = 0;
       int total_elements = 0;
       for (int64_t h = hstart; h < hend; h += dilation_h) {
@@ -461,9 +462,11 @@ struct AveragePool2DTask final {
     for (int64_t ph = 0; ph < pooled_height; ++ph) {
       int64_t hstart = ph * stride_h - pads[0];
       int64_t hend = hstart + kernel_shape[0] * dilation_h;
+      hend = std::min(hend, height + pads[1]);
       for (int64_t pw = 0; pw < pooled_width; ++pw) {
         int64_t wstart = pw * stride_w - pads[1];
         int64_t wend = wstart + kernel_shape[1] * dilation_w;
+        wend = std::min(wend, width + pads[3]);
         const int64_t pool_index = ph * pooled_width + pw;
         y_d[pool_index] = 0;
         int total_elements = 0;
@@ -532,12 +535,15 @@ struct AveragePool3DTask {
     for (int64_t ph = 0; ph < pooled_height; ++ph) {
       int64_t hstart = ph * stride_h - pads[0];
       int64_t hend = hstart + kernel_shape[0] * dilation_h;
+      hend = std::min(hend, height + pads[1]);
       for (int64_t pw = 0; pw < pooled_width; ++pw) {
         int64_t wstart = pw * stride_w - pads[1];
         int64_t wend = wstart + kernel_shape[1] * dilation_w;
+        wend = std::min(wend, width + pads[3]);
         for (int64_t pd = 0; pd < pooled_depth; ++pd) {
           int64_t dstart = pd * stride_d - pads[2];
           int64_t dend = dstart + kernel_shape[2] * dilation_d;
+          dend = std::min(dend, depth + pads[5]);
           const int64_t pool_index = ph * pooled_width * pooled_depth + pw * pooled_depth + pd;
           y_d[pool_index] = 0;
           int total_elements = 0;
diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc
index b9b69fdc74..d44f098db6 100644
--- a/onnxruntime/test/onnx/TestCase.cc
+++ b/onnxruntime/test/onnx/TestCase.cc
@@ -961,6 +961,7 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider
       {"reduce_prod_empty_set", "unknown version", {}},
       {"reduce_sum_empty_set", "unknown version", {}},
       {"reduce_sum_square_empty_set_expanded", "unknown version", {}},
+      {"averagepool_3d_dilations_large_count_include_pad_is_1_ceil_mode_is_True", "TODO(titaiwang): enable this in the next ONNX release."},
 #ifdef ENABLE_TRAINING_CORE
       {"adagrad", "not a registered function/op", {}},                  // Op not registered.
       {"adagrad_multiple", "not a registered function/op", {}},         // Op not registered.
diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
index a340f975ec..24a8c8491b 100644
--- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc
@@ -1030,6 +1030,31 @@ TEST(PoolTest, AveragePool_19_dilation_2d) {
             kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider});
 }
 
+TEST(PoolTest, AveragePool_19_ceil_count_include_pad_1d) {
+  // TODO: Unskip when fixed #41968513
+  if (DefaultDmlExecutionProvider().get() != nullptr) {
+    GTEST_SKIP() << "Skipping because of the following error: MLOperatorAuthorImpl.cpp(2100): The parameter is incorrect.";
+  }
+
+  OpTester test("AveragePool", 19);
+
+  test.AddAttribute("auto_pad", "");
+  test.AddAttribute("strides", std::vector<int64_t>{3});
+  test.AddAttribute("pads", vector<int64_t>{3, 3});
+  test.AddAttribute("kernel_shape", vector<int64_t>{7});
+  test.AddAttribute("ceil_mode", (int64_t)1);
+  test.AddAttribute("count_include_pad", (int64_t)1);
+
+  std::vector<float> x_vals = {2.0903f, 4.6493f, 1.6320f, -3.2051f, 4.6975f, 4.7296f, 3.3653f, -1.5815f, -2.3832f, 0.9628f, -1.5899f, -2.6820f, 5.7529f, 7.7346f, -0.8910f, -2.0151f, 0.1313f, -0.5374f};
+  std::vector<int64_t> x_dims = {1, 2, 9};
+  std::vector<int64_t> expected_dims = {1, 2, 4};
+  std::vector<float> expected_vals = {0.73807144f, 2.5655572f, 0.8032287f, -0.09990001f, 0.34911433f, 1.0389f, 1.4536142f, -0.40353334f};
+
+  test.AddInput<float>("X", x_dims, x_vals);
+  test.AddOutput<float>("Y", expected_dims, expected_vals);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider});
+}
+
 TEST(PoolTest, GlobalAveragePool) {
   OpTester test("GlobalAveragePool");
 

From d00ae325cea6b7032493c4459061c53c5a2b43d9 Mon Sep 17 00:00:00 2001
From: Caroline Zhu <wolfivyaura@gmail.com>
Date: Thu, 23 Jan 2025 21:48:27 -0800
Subject: [PATCH 16/37] Revert "[Mobile] Add BrowserStack Android MAUI Test
 (#23383)" (#23474)

This reverts commit 9f9fcf74ff55d7239803d45ca3a989f225da136b.

### Motivation and Context
- NuGet packaging pipelines are failing with this error:
```
Files\dotnet\packs\Microsoft.NET.Runtime.MonoTargets.Sdk\8.0.12\Sdk\RuntimeComponentManifest.targets(3,5):
error : Empty ResolveFrameworkReference.RuntimePackPath while trying to
read runtime components manifest. ResolvedFrameworkReference available:
{ Microsoft.NETCore.App, RuntimePackPath: }
```
---
 .../.config/dotnet-tools.json                 |  13 --
 .../BrowserStackTest.cs                       |  68 -------
 ...xRuntime.Tests.BrowserStack.Android.csproj |  22 ---
 .../README.md                                 |  48 -----
 .../RunAllTest.cs                             | 123 ------------
 .../browserstack.yml                          |  13 --
 ...Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj | 186 +++++++++---------
 7 files changed, 90 insertions(+), 383 deletions(-)
 delete mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json
 delete mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs
 delete mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj
 delete mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md
 delete mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs
 delete mode 100644 csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml

diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json
deleted file mode 100644
index 67d39c423d..0000000000
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/.config/dotnet-tools.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "version": 1,
-  "isRoot": true,
-  "tools": {
-    "browserstack-sdk": {
-      "version": "1.16.13",
-      "commands": [
-        "browserstack-sdk"
-      ],
-      "rollForward": false
-    }
-  }
-}
\ No newline at end of file
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs
deleted file mode 100644
index 84377d65d1..0000000000
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/BrowserStackTest.cs
+++ /dev/null
@@ -1,68 +0,0 @@
-﻿using Newtonsoft.Json;
-using NUnit.Framework.Interfaces;
-using NUnit.Framework;
-using OpenQA.Selenium;
-using OpenQA.Selenium.Appium;
-using OpenQA.Selenium.Appium.Android;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-
-namespace Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android
-{
-    public class BrowserStackTest
-    {
-        public AndroidDriver driver;
-        public BrowserStackTest()
-        {}
-
-        [SetUp]
-        public void Init()
-        {
-            var androidOptions = new AppiumOptions {
-                AutomationName = "UIAutomator2",
-                PlatformName = "Android",
-            };
-
-            driver = new AndroidDriver(new Uri("http://127.0.0.1:4723/wd/hub"), androidOptions);
-        }
-
-        /// <summary>
-        /// Passes the correct test status to BrowserStack and ensures the driver quits.
-        /// </summary>
-        [TearDown]
-        public void Dispose()
-        {
-            try
-            {
-                // According to
-                // https://www.browserstack.com/docs/app-automate/appium/set-up-tests/mark-tests-as-pass-fail
-                // BrowserStack doesn't know whether test assertions have passed or failed. Below handles
-                // passing the test status to BrowserStack along with any relevant information.
-                if (TestContext.CurrentContext.Result.Outcome.Status == TestStatus.Failed)
-                {
-                    String failureMessage = TestContext.CurrentContext.Result.Message;
-                    String jsonToSendFailure =
-                        String.Format("browserstack_executor: {\"action\": \"setSessionStatus\", \"arguments\": " +
-                                      "{\"status\":\"failed\", \"reason\": {0}}}",
-                                      JsonConvert.ToString(failureMessage));
-
-                    ((IJavaScriptExecutor)driver).ExecuteScript(jsonToSendFailure);
-                }
-                else
-                {
-                    ((IJavaScriptExecutor)driver)
-                        .ExecuteScript("browserstack_executor: {\"action\": \"setSessionStatus\", \"arguments\": " +
-                                       "{\"status\":\"passed\", \"reason\": \"\"}}");
-                }
-            }
-            finally
-            {
-                // will run even if exception is thrown by previous block
-                ((AndroidDriver)driver).Quit();
-            }
-        }
-    }
-}
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj
deleted file mode 100644
index 9b9028d30c..0000000000
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android.csproj
+++ /dev/null
@@ -1,22 +0,0 @@
-<Project Sdk="Microsoft.NET.Sdk">
-	
-	<PropertyGroup>
-		<TargetFramework>net8.0</TargetFramework>
-		<ImplicitUsings>enable</ImplicitUsings>
-		<Nullable>enable</Nullable>
-
-		<IsPackable>false</IsPackable>
-		<IsTestProject>true</IsTestProject>
-	</PropertyGroup>
-
-	<ItemGroup>
-		<PackageReference Include="Appium.WebDriver" Version="5.0.0-rc.5" />
-		<PackageReference Include="BrowserStack.TestAdapter" Version="0.13.13" />
-		<PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
-		<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
-		<PackageReference Include="NUnit" Version="3.13.0" />
-		<PackageReference Include="NUnit.Analyzers" Version="3.3.0" />
-		<PackageReference Include="NUnit3TestAdapter" Version="4.3.0" />
-	</ItemGroup>
-	
-</Project>
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md
deleted file mode 100644
index 9c4e2307d8..0000000000
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/README.md
+++ /dev/null
@@ -1,48 +0,0 @@
-# BrowserStack Android test
-This project will run the Android MAUI tests on BrowserStack, which allows you to run automated tests on a variety of mobile devices.
-
-## Context
-Microsoft.ML.OnnxRuntime.Tests.MAUI uses DeviceRunners.VisualRunners to allow running the unit tests (found in Microsoft.ML.OnnxRuntime.Tests.Common) across multiple devices. DeviceRunners.VisualRunners provides a simple UI with a button that will run the unit tests and a panel with the unit test results. 
-
-In order to automate the process of running the unit tests across mobile devices, Appium is used for UI testing orchestration (it provides a way to interact with the UI), and BrowserStack automatically runs these Appium tests across different mobile devices.
-
-This project does not include the capability to start an Appium server locally or attach to a local emulator or device. 
-
-## Build & run instructions
-### Requirements
-* A BrowserStack account with access to App Automate
-    * You can set BrowserStack credentials as environment variables as shown [here](https://www.browserstack.com/docs/app-automate/appium/getting-started/c-sharp/nunit/integrate-your-tests#CLI)
-* ONNXRuntime NuGet package
-    1. You can either download the [stable NuGet package](https://www.nuget.org/packages/Microsoft.ML.OnnxRuntime) then follow the instructions from [NativeLibraryInclude.props file](../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props) to use the downloaded .nupkg file
-    2. Or follow the [build instructions](https://onnxruntime.ai/docs/build/android.html) to build the Android package locally
-* The dotnet workloads for maui and maui-android, which will not always automatically install correctly
-    1. `dotnet workload install maui`
-    2. `dotnet workload install maui-android`
-* [Appium](https://appium.io/docs/en/latest/quickstart/) and the [UiAutomator2 driver](https://appium.io/docs/en/latest/quickstart/uiauto2-driver/)
-
-### Run instructions
-1. Build the Microsoft.ML.OnnxRuntime.Tests.MAUI project into a signed APK.
-    1. Run the following: `dotnet publish -c Release -f net8.0-android` in the Microsoft.ML.OnnxRuntime.Tests.MAUI directory.
-    2. Search for the APK files generated. They should be located in `bin\Release\net8.0-android\publish`. 
-    3. If they're in a different location, edit the `browserstack.yml` file to target the path to the signed APK.
-2. Ensure you've set the BrowserStack credentials as environment variables.
-3. Run the following in the Microsoft.ML.OnnxRuntime.Tests.Android.BrowserStack directory: `dotnet test`
-4. Navigate to the [BrowserStack App Automate dashboard](https://app-automate.browserstack.com/dashboard/v2/builds) to see your test running!
-
-## Troubleshooting & Resources
-### BrowserStack Resources
-- [Configuration docs](https://www.browserstack.com/docs/app-automate/appium/sdk-params#test-context) for browserstack.yml
-- [Configuration generator](https://www.browserstack.com/docs/app-automate/capabilities) for browserstack.yml
-- [Integration guide](https://www.browserstack.com/docs/app-automate/appium/getting-started/c-sharp/nunit/integrate-your-tests#CLI)
-
-### Troubleshooting
-- Issues building the MAUI app: 
-    - Make sure that the maui and maui-android workloads are installed correctly by running `dotnet workload list`
-    - If you believe the issues are workload related, you can also try running `dotnet workload repair` (this has personally never worked for me)
-    - Try running `dotnet clean`. However, this does not fully remove all the previous intermediaries. If you're still running into the errors, manually deleting the bin and obj folders can sometimes resolve them. 
-- After building the MAUI app, try installing on an emulator and clicking the "Run All" button to ensure that everything is working. (If you are missing the ONNXRuntime package, it will not show up as an error until you click "Run All".)
-    - Running the MAUI app from Visual Studio will not replicate running it through BrowserStack. Instead, use `adb install [path to signed apk]` to install the app then use the emulator to launch the app.
-- Issues with the Android.BrowserStack test app: there is an Appium Doctor package on npm -- run `npm install @appium/doctor --location=global` then `appium-doctor --android` and follow the directed instructions. Some errors with Appium on Android will not appear until runtime.
-- Connection refused by Appium server: this can happen if you already have an Appium server running locally. If you do, stop the Appium server then try `dotnet test` again.
-- App is crashing on BrowserStack or it emits an error that it cannot run this APK file: make sure that you are passing in the correct signed APK from the publish folder. 
-- It appears that a test runs on CLI but a build is not launched on BrowserStack: this happens when the BrowserStack Test Adapter cannot find the browserstack.yml file (which has to be named "browserstack.yml" -- do not be tricked by BrowserStack's article on custom-named configuration files)
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs
deleted file mode 100644
index 5db3dc9957..0000000000
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/RunAllTest.cs
+++ /dev/null
@@ -1,123 +0,0 @@
-﻿using OpenQA.Selenium.Appium;
-using OpenQA.Selenium;
-using NUnit.Framework;
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-
-namespace Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android
-{
-    /// <summary>
-    /// This class contains a single test: RunAll, which interacts with the UI from
-    /// https://github.com/mattleibow/DeviceRunners/tree/main by clicking the "Run All" button and checking the number
-    /// of passed and failed tests.
-    ///
-    /// It searches for elements on the page using Appium's WebDriver. These searches use the XPath attributes.
-    ///
-    /// Launching the MAUI test app in Appium Inspector will allow you to see the exact XPath attributes for each
-    /// element.
-    /// </summary>
-    [TestFixture]
-    public class RunAllTest : BrowserStackTest
-    {
-        public AppiumElement FindAppiumElement(String xpathQuery, String text)
-        {
-            IReadOnlyCollection<AppiumElement> appiumElements = driver.FindElements(By.XPath(xpathQuery));
-
-            foreach (var element in appiumElements)
-            {
-                if (element.Text.Contains(text))
-                {
-                    return element;
-                }
-            }
-            // was unable to find given element
-            throw new Exception(String.Format("Could not find {0}: {1} on the page.", xpathQuery, text));
-        }
-
-        public AppiumElement FindAppiumElementThenClick(String xpathQuery, String text)
-        {
-            AppiumElement appiumElement = FindAppiumElement(xpathQuery, text);
-            appiumElement.Click();
-            return appiumElement;
-        }
-
-        public (int, int) GetPassFailCount()
-        {
-            int numPassed = -1;
-            int numFailed = -1;
-
-            IReadOnlyCollection<AppiumElement> labelElements =
-                driver.FindElements(By.XPath("//android.widget.TextView"));
-
-            for (int i = 0; i < labelElements.Count; i++)
-            {
-                AppiumElement element = labelElements.ElementAt(i);
-
-                if (element.Text.Equals("✔"))
-                {
-                    i++;
-                    numPassed = int.Parse(labelElements.ElementAt(i).Text);
-                }
-
-                if (element.Text.Equals("⛔"))
-                {
-                    i++;
-                    numFailed = int.Parse(labelElements.ElementAt(i).Text);
-                    break;
-                }
-            }
-
-            Assert.That(numPassed, Is.GreaterThanOrEqualTo(0), "Could not find number passed label.");
-            Assert.That(numFailed, Is.GreaterThanOrEqualTo(0), "Could not find number failed label.");
-
-            return (numPassed, numFailed);
-        }
-
-        [Test]
-        public async Task ClickRunAllTest()
-        {
-            // XAML for the main page:
-            // https://github.com/mattleibow/DeviceRunners/blob/cba7644e07b305ba64dc930b01c3eee55ef2b93d/src/DeviceRunners.VisualRunners.Maui/App/Pages/HomePage.xaml
-            AppiumElement runAllButton = FindAppiumElementThenClick("//android.widget.Button", "Run All");
-
-            while (!runAllButton.Enabled)
-            {
-                // waiting for unit tests to execute
-                await Task.Delay(500);
-            }
-
-            var (numPassed, numFailed) = GetPassFailCount();
-
-            if (numFailed == 0)
-            {
-                return;
-            }
-
-            // click into test results if tests have failed
-            FindAppiumElementThenClick("//android.widget.TextView", "⛔");
-            await Task.Delay(500);
-
-            // Brings you to the test assembly page
-            // XAML for test assembly page:
-            // https://github.com/mattleibow/DeviceRunners/blob/cba7644e07b305ba64dc930b01c3eee55ef2b93d/src/DeviceRunners.VisualRunners.Maui/App/Pages/TestAssemblyPage.xaml
-            FindAppiumElementThenClick("//android.widget.EditText", "All");
-            await Task.Delay(100);
-            FindAppiumElementThenClick("//android.widget.TextView", "Failed");
-            await Task.Delay(500);
-
-            StringBuilder sb = new StringBuilder();
-            sb.AppendLine("PASSED TESTS: " + numPassed + " | FAILED TESTS: " + numFailed);
-
-            IReadOnlyCollection<AppiumElement> textResults = driver.FindElements(By.XPath("//android.widget.TextView"));
-            foreach (var element in textResults)
-            {
-                sb.AppendLine(element.Text);
-            }
-
-            Assert.That(numFailed, Is.EqualTo(0), sb.ToString());
-        }
-    }
-}
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml
deleted file mode 100644
index 9efbc9fc6a..0000000000
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.BrowserStack.Android/browserstack.yml
+++ /dev/null
@@ -1,13 +0,0 @@
-app: ..\Microsoft.ML.OnnxRuntime.Tests.MAUI\bin\Release\net8.0-android\publish\ORT.CSharp.Tests.MAUI-Signed.apk
-platforms:
-  - platformName: android
-    deviceName: Samsung Galaxy S22 Ultra
-    platformVersion: 12.0
-browserstackLocal: true
-buildName: ORT android test
-buildIdentifier: ${BUILD_NUMBER}
-projectName: ORT-UITests
-debug: true
-networkLogs: false
-testContextOptions:
-    skipSessionStatus: true   
\ No newline at end of file
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj
index 652da8899f..e07448daee 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.MAUI/Microsoft.ML.OnnxRuntime.Tests.MAUI.csproj
@@ -1,131 +1,125 @@
 ﻿<Project Sdk="Microsoft.NET.Sdk">
-	<PropertyGroup>
-		<OnnxRuntimeRoot>$(ProjectDir)..\..\..</OnnxRuntimeRoot>
-	</PropertyGroup>
+    <PropertyGroup>
+        <OnnxRuntimeRoot>$(ProjectDir)..\..\..</OnnxRuntimeRoot>
+    </PropertyGroup>
 
-	<Import Project="../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props" />
+    <Import Project="../Microsoft.ML.OnnxRuntime.Tests.Common/NativeLibraryInclude.props" />
 
-	<!-- General app properties -->
-	<PropertyGroup>
-		<TargetFrameworks>net8.0-android;net8.0-ios;net8.0-maccatalyst</TargetFrameworks>
-		<TargetFrameworks Condition="$([MSBuild]::IsOSPlatform('windows'))">$(TargetFrameworks);net8.0-windows10.0.19041.0</TargetFrameworks>
+    <!-- General app properties -->
+    <PropertyGroup>
+        <TargetFrameworks>net8.0-android;net8.0-ios;net8.0-maccatalyst</TargetFrameworks>
+        <TargetFrameworks Condition="$([MSBuild]::IsOSPlatform('windows'))">$(TargetFrameworks);net8.0-windows10.0.19041.0</TargetFrameworks>
 
-		<!-- Note for MacCatalyst:
+        <!-- Note for MacCatalyst:
         The default runtime is maccatalyst-x64, except in Release config, in which case the default is maccatalyst-x64;maccatalyst-arm64.
         When specifying both architectures, use the plural <RuntimeIdentifiers> instead of the singular <RuntimeIdentifier>.
         The Mac App Store will NOT accept apps with ONLY maccatalyst-arm64 indicated;
         either BOTH runtimes must be indicated or ONLY macatalyst-x64. -->
-		<!-- For example: <RuntimeIdentifiers>maccatalyst-x64;maccatalyst-arm64</RuntimeIdentifiers> -->
+        <!-- For example: <RuntimeIdentifiers>maccatalyst-x64;maccatalyst-arm64</RuntimeIdentifiers> -->
 
-		<RootNamespace>Microsoft.ML.OnnxRuntime.Tests.MAUI</RootNamespace>
-		<UseMaui>true</UseMaui>
-		<SingleProject>true</SingleProject>
-		<ImplicitUsings>enable</ImplicitUsings>
-		<Nullable>enable</Nullable>
-		<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
-		<!-- some of the helper packages don't have strong named assemblies. -->
-		<NoWarn>8002</NoWarn>
+        <OutputType>Exe</OutputType>
+        <RootNamespace>Microsoft.ML.OnnxRuntime.Tests.MAUI</RootNamespace>
+        <UseMaui>true</UseMaui>
+        <SingleProject>true</SingleProject>
+        <ImplicitUsings>enable</ImplicitUsings>
+        <Nullable>enable</Nullable>
+        <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+        <!-- some of the helper packages don't have strong named assemblies. -->
+        <NoWarn>8002</NoWarn>
 
-		<!-- These are copied from the sample. TBD what we really need. -->
-		<DefineConstants Condition="'$(CI)' != 'true'">$(DefineConstants);INCLUDE_FAILING_TESTS</DefineConstants>
-		<DefineConstants Condition="'$(TestingMode)' == 'NonInteractiveVisual'">$(DefineConstants);MODE_NON_INTERACTIVE_VISUAL</DefineConstants>
-		<DefineConstants Condition="'$(TestingMode)' == 'XHarness'">$(DefineConstants);MODE_XHARNESS</DefineConstants>
+        <!-- These are copied from the sample. TBD what we really need. -->
+        <DefineConstants Condition="'$(CI)' != 'true'">$(DefineConstants);INCLUDE_FAILING_TESTS</DefineConstants>
+        <DefineConstants Condition="'$(TestingMode)' == 'NonInteractiveVisual'">$(DefineConstants);MODE_NON_INTERACTIVE_VISUAL</DefineConstants>
+        <DefineConstants Condition="'$(TestingMode)' == 'XHarness'">$(DefineConstants);MODE_XHARNESS</DefineConstants>
 
-		<!-- Display name -->
-		<ApplicationTitle>Microsoft.ML.OnnxRuntime.Tests.MAUI</ApplicationTitle>
+        <!-- Display name -->
+        <ApplicationTitle>Microsoft.ML.OnnxRuntime.Tests.MAUI</ApplicationTitle>
 
-		<!-- App Identifier. MUST be short or you get a misleading error about not being able to deploy the app -->
-		<ApplicationId>ORT.CSharp.Tests.MAUI</ApplicationId>
+        <!-- App Identifier. MUST be short or you get a misleading error about not being able to deploy the app -->
+        <ApplicationId>ORT.CSharp.Tests.MAUI</ApplicationId>
 
-		<!-- Versions -->
-		<ApplicationDisplayVersion>1.0</ApplicationDisplayVersion>
-		<ApplicationVersion>1</ApplicationVersion>
+        <!-- Versions -->
+        <ApplicationDisplayVersion>1.0</ApplicationDisplayVersion>
+        <ApplicationVersion>1</ApplicationVersion>
 
-		<SupportedOSPlatformVersion Condition="'$(IsIOSTarget)' == 'true'">15.0</SupportedOSPlatformVersion>
-		<SupportedOSPlatformVersion Condition="'$(IsMacCatalystTarget)' == 'true'">13.1</SupportedOSPlatformVersion>
-		<SupportedOSPlatformVersion Condition="'$(IsAndroidTarget)' == 'true'">30.0</SupportedOSPlatformVersion>
-		<SupportedOSPlatformVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</SupportedOSPlatformVersion>
-		<TargetPlatformMinVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</TargetPlatformMinVersion>
+        <SupportedOSPlatformVersion Condition="'$(IsIOSTarget)' == 'true'">15.0</SupportedOSPlatformVersion>
+        <SupportedOSPlatformVersion Condition="'$(IsMacCatalystTarget)' == 'true'">13.1</SupportedOSPlatformVersion>
+        <SupportedOSPlatformVersion Condition="'$(IsAndroidTarget)' == 'true'">30.0</SupportedOSPlatformVersion>
+        <SupportedOSPlatformVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</SupportedOSPlatformVersion>
+        <TargetPlatformMinVersion Condition="'$(IsWindowsTarget)' == 'true'">10.0.17763.0</TargetPlatformMinVersion>
 
-		<SignAssembly>true</SignAssembly>
-		<AssemblyOriginatorKeyFile>..\..\OnnxRuntime.snk</AssemblyOriginatorKeyFile>
-	</PropertyGroup>
+        <SignAssembly>true</SignAssembly>
+        <AssemblyOriginatorKeyFile>..\..\OnnxRuntime.snk</AssemblyOriginatorKeyFile>
+    </PropertyGroup>
 
-	<ItemGroup>
-		<!-- App Icon -->
-		<MauiIcon Include="Resources\AppIcon\appicon.svg" ForegroundFile="Resources\AppIcon\appiconfg.svg" Color="#512BD4" />
+    <ItemGroup>
+        <!-- App Icon -->
+        <MauiIcon Include="Resources\AppIcon\appicon.svg" ForegroundFile="Resources\AppIcon\appiconfg.svg" Color="#512BD4" />
 
-		<!-- Splash Screen -->
-		<MauiSplashScreen Include="Resources\Splash\splash.svg" Color="#512BD4" BaseSize="128,128" />
+        <!-- Splash Screen -->
+        <MauiSplashScreen Include="Resources\Splash\splash.svg" Color="#512BD4" BaseSize="128,128" />
 
-		<!-- Images -->
-		<MauiImage Include="Resources\Images\*" />
-		<MauiImage Update="Resources\Images\dotnet_bot.png" Resize="True" BaseSize="300,185" />
+        <!-- Images -->
+        <MauiImage Include="Resources\Images\*" />
+        <MauiImage Update="Resources\Images\dotnet_bot.png" Resize="True" BaseSize="300,185" />
 
-		<!-- Custom Fonts -->
-		<MauiFont Include="Resources\Fonts\*" />
+        <!-- Custom Fonts -->
+        <MauiFont Include="Resources\Fonts\*" />
 
-		<!-- Raw Assets (also remove the "Resources\Raw" prefix) -->
-		<MauiAsset Include="Resources\Raw\**" LogicalName="%(RecursiveDir)%(Filename)%(Extension)" />
-	</ItemGroup>
+        <!-- Raw Assets (also remove the "Resources\Raw" prefix) -->
+        <MauiAsset Include="Resources\Raw\**" LogicalName="%(RecursiveDir)%(Filename)%(Extension)" />
+    </ItemGroup>
 
-	<!-- NOTE: The xUnit framework doesn't pickup the tests defined within the referenced
+    <!-- NOTE: The xUnit framework doesn't pickup the tests defined within the referenced
     Microsoft.ML.OnnxRuntime.Tests.Common project -->
-	<ItemGroup>
-		<Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\InferenceTest.cs">
-			<Link>InferenceTest.cs</Link>
-		</Compile>
-		<Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\OrtIoBindingAllocationTest.cs">
-			<Link>OrtIoBindingAllocationTest.cs</Link>
-		</Compile>
-		<Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\Tensors\TensorTests.cs">
-			<Link>TensorTests.cs</Link>
-		</Compile>
-	</ItemGroup>
+    <ItemGroup>
+        <Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\InferenceTest.cs">
+            <Link>InferenceTest.cs</Link>
+        </Compile>
+        <Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\OrtIoBindingAllocationTest.cs">
+            <Link>OrtIoBindingAllocationTest.cs</Link>
+        </Compile>
+        <Compile Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\Tensors\TensorTests.cs">
+            <Link>TensorTests.cs</Link>
+        </Compile>
+    </ItemGroup>
 
-	<ItemGroup>
-		<ProjectReference
+    <ItemGroup>
+        <ProjectReference
             Include="..\..\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj"
             name="Microsoft.ML.OnnxRuntime" />
-		<ProjectReference
+        <ProjectReference
             Include="..\Microsoft.ML.OnnxRuntime.Tests.Common\Microsoft.ML.OnnxRuntime.Tests.Common.csproj"
             name="Microsoft.ML.OnnxRuntime.Tests.Common" />
-		<ProjectReference
+        <ProjectReference
             Include="..\Microsoft.ML.OnnxRuntime.Tests.Devices\Microsoft.ML.OnnxRuntime.Tests.Devices.csproj"
             name="Microsoft.ML.OnnxRuntime.Tests.Devices" />
-	</ItemGroup>
+    </ItemGroup>
 
-	<ItemGroup>
-		<PackageReference Include="DeviceRunners.VisualRunners.Maui" Version="0.1.0-preview.2" />
-		<PackageReference Include="DeviceRunners.VisualRunners.Xunit" Version="0.1.0-preview.2" />
-		<PackageReference Include="DeviceRunners.XHarness.Maui" Version="0.1.0-preview.2" />
-		<PackageReference Include="DeviceRunners.XHarness.Xunit" Version="0.1.0-preview.2" />
-		<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.12.0" />
-		<PackageReference Include="Microsoft.DotNet.XHarness.TestRunners.Xunit" Version="9.0.0-prerelease.24374.1" />
-		<PackageReference Include="Microsoft.Maui.Controls" Version="8.0.70" />
-		<PackageReference Include="Microsoft.Maui.Controls.Compatibility" Version="8.0.70" />
-		<PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
-		<PackageReference Include="xunit" Version="2.9.0" />
-		<PackageReference Include="xunit.runner.utility" Version="2.9.0" />
-	</ItemGroup>
+    <ItemGroup>
+        <PackageReference Include="DeviceRunners.VisualRunners.Maui" Version="0.1.0-preview.2" />
+        <PackageReference Include="DeviceRunners.VisualRunners.Xunit" Version="0.1.0-preview.2" />
+        <PackageReference Include="DeviceRunners.XHarness.Maui" Version="0.1.0-preview.2" />
+        <PackageReference Include="DeviceRunners.XHarness.Xunit" Version="0.1.0-preview.2" />
+        <PackageReference Include="Microsoft.DotNet.XHarness.TestRunners.Xunit" Version="9.0.0-prerelease.24374.1" />
+        <PackageReference Include="Microsoft.Maui.Controls" Version="8.0.70" />
+        <PackageReference Include="Microsoft.Maui.Controls.Compatibility" Version="8.0.70" />
+        <PackageReference Include="Microsoft.Extensions.Logging.Debug" Version="8.0.0" />
+        <PackageReference Include="xunit" Version="2.9.0" />
+        <PackageReference Include="xunit.runner.utility" Version="2.9.0" />
+    </ItemGroup>
 
-	<ItemGroup Condition="$(IsIOSTarget)=='true' OR $(IsMacCatalystTarget)=='true'">
-		<!-- need the dummy ORT Extensions package to resolve the RegisterCustomOps symbol. -->
-		<PackageReference Include="Microsoft.ML.OnnxRuntime.Extensions.Dummy" Version="0.12.0" />
-	</ItemGroup>
+    <ItemGroup Condition="$(IsIOSTarget)=='true' OR $(IsMacCatalystTarget)=='true'">
+        <!-- need the dummy ORT Extensions package to resolve the RegisterCustomOps symbol. -->
+        <PackageReference Include="Microsoft.ML.OnnxRuntime.Extensions.Dummy" Version="0.12.0" />
+    </ItemGroup>
 
-	<Target Name="RemoveVisualStudioTestRunner" BeforeTargets="_ComputeAppxPackagePayload">
-		<ItemGroup>
-			<_VisualStudioTestRunnerFiles
+    <Target Name="RemoveVisualStudioTestRunner" BeforeTargets="_ComputeAppxPackagePayload">
+        <ItemGroup>
+            <_VisualStudioTestRunnerFiles
                 Include="@(PackagingOutputs)"
                 Condition="$([System.String]::Copy('%(PackagingOutputs.FullPath)').Contains('xunit.runner.visualstudio'))" />
-			<PackagingOutputs Remove="@(_VisualStudioTestRunnerFiles)" />
-		</ItemGroup>
-	</Target>
-
-	<PropertyGroup Condition="'$(IsAndroidTarget)' !='true'">
-		<GenerateProgramFile>false</GenerateProgramFile>
-		<DefaultLanguage>en</DefaultLanguage>
-	</PropertyGroup>
-
+            <PackagingOutputs Remove="@(_VisualStudioTestRunnerFiles)" />
+        </ItemGroup>
+    </Target>
 </Project>

From c89a798b732719b9884595f2f4de0b64cf2a80d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9goire?= <gregoire.verdier@gmail.com>
Date: Fri, 24 Jan 2025 18:52:05 +0100
Subject: [PATCH 17/37] Enable optimizations on Microsoft.ML.OnnxRuntime with
 RelWithDebInfo config (#23463)

Microsoft.ML.OnnxRuntime is built with the RelWithDebInfo configuration
rather than Release. The MSBuild SDK does not recognize RelWithDebInfo,
so compiler optimizations are not enabled. One fix would be to force the
configuration to Release when building the .NET code, even if
RelWithDebInfo was passed on the command line, but I could not find an
easy way to do that. Instead, this change mimics the Release
configuration by setting the `Optimize` property to true.

With a simple model that sums its 3 inputs, I see roughly a 15%
performance improvement:
```csharp
using System.Buffers;
using System.Collections.Frozen;
using System.Net;
using System.Net.Sockets;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.RegularExpressions;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Running;
using Microsoft.ML.OnnxRuntime;

var config = DefaultConfig.Instance; //.WithOptions(ConfigOptions.DisableOptimizationsValidator);
BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args, config);

public class OnnxBench
{
    private const int Iterations = 100_000;
    private const int BatchSize = 50;

    private InferenceSession _session = default!;
    private string[] _inputNames = default!;
    private OrtValue[] _inputValues = default!;
    private RunOptions _runOptions = default!;

    [GlobalSetup]
    public void GlobalSetup()
    {
        using SessionOptions sessionOptions = new();
        sessionOptions.InterOpNumThreads = 1;
        sessionOptions.IntraOpNumThreads = 1;
        sessionOptions.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL;
        sessionOptions.ExecutionMode = ExecutionMode.ORT_SEQUENTIAL;

        _session = new InferenceSession(
            Convert.FromBase64String("CAo6cAoOCgFBCgFCEgFEIgNBZGQKDgoBQwoBRBIBWCIDQWRkEgJscloRCgFBEgwKCggBEgYKAAoCCAFaEQoBQhIMCgoIARIGCgAKAggBWhEKAUMSDAoKCAESBgoACgIIAWIRCgFYEgwKCggBEgYKAAoCCAFCBAoAEBU="),
            sessionOptions);
        _inputNames = ["A", "B", "C"];
        _inputValues =
        [
            OrtValue.CreateTensorValueFromMemory(new float[BatchSize], [BatchSize, 1]),
            OrtValue.CreateTensorValueFromMemory(new float[BatchSize], [BatchSize, 1]),
            OrtValue.CreateTensorValueFromMemory(new float[BatchSize], [BatchSize, 1]),
        ];
        _runOptions = new RunOptions();
    }

    [Benchmark(OperationsPerInvoke = Iterations)]
    public float Run()
    {
        var inputValues0Span = _inputValues[0].GetTensorMutableDataAsSpan<float>();
        var inputValues1Span = _inputValues[1].GetTensorMutableDataAsSpan<float>();
        var inputValues2Span = _inputValues[2].GetTensorMutableDataAsSpan<float>();
        for (int i = 0; i < BatchSize; i += 1)
        {
            inputValues0Span[i] = Random.Shared.NextSingle();
            inputValues1Span[i] = Random.Shared.NextSingle();
            inputValues2Span[i] = Random.Shared.NextSingle();
        }

        float sum = 0f;
        for (int i = 0; i < Iterations; i += 1)
        {
            using var output = _session.Run(_runOptions, _inputNames, _inputValues, _session.OutputNames);
            ReadOnlySpan<float> outputData = output[0].GetTensorDataAsSpan<float>();
            for (int j = 0; j < outputData.Length; j += 1)
            {
                sum += outputData[j];
            }
        }

        return sum;
    }
}
```

| Method | Mean     | Error     | StdDev    |
|------- |---------:|----------:|----------:|
| Before | 5.003 us | 0.0318 us | 0.0297 us |
| After   | 4.325 us | 0.0568 us | 0.0503 us |
---
 .../Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
index 63131d05c0..b9155e748f 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
@@ -127,6 +127,11 @@
     <OrtConstants>$(OrtConstants);__ENABLE_TRAINING_APIS__</OrtConstants>
   </PropertyGroup>
 
+  <!-- Enable same optimizations as configuration Release for the custom RelWithDebInfo -->
+  <PropertyGroup Condition="'$(Configuration)' == 'RelWithDebInfo'">
+    <Optimize>true</Optimize>
+  </PropertyGroup>
+
   <!--
     Properties that are used when creating the managed package using the Pack target.
   -->

From 13348c572a315806c7745280073aacdf384dcea9 Mon Sep 17 00:00:00 2001
From: Jing Fang <126209182+fajin-corp@users.noreply.github.com>
Date: Fri, 24 Jan 2025 15:25:24 -0800
Subject: [PATCH 18/37] [ARM CPU] hgemm optimized for gqa (#23107)

### Description
Add fp16 kernels for GQA matmul on ARM CPU.
The kernels are mlas hgemm for C = alpha * A x B' + beta * C


### Motivation and Context
Add fp16 support for GQA, speed up the operator and reduce memory usage.

__Token Generation__
| | HGEMM Runtime (ns) | SGEMM Runtime (ns) | Speed-up (%) |
|---------------------------------|--------------------|--------------------|--------------|
| M:1/N:4096/K:4096 | 251551 | 1775905 | 85.84 |
| M:1/N:11008/K:4096 | 892507 | 4649145 | 80.80 |
| M:1/N:4096/K:11008 | 866860 | 3240015 | 73.25 |
| M:1/N:11008/K:11008 | 2631615 | 8783877 | 70.04 |
__Prompting__
| | HGEMM Runtime (ns) | SGEMM Runtime (ns) | Speed-up (%) |
|---------------------------------|--------------------|--------------------|--------------|
| M:1024/N:4096/K:4096 | 90508701 | 111283029 | 18.67 |
| M:2048/N:4096/K:4096 | 181307522 | 240211107 | 24.52 |
| M:1024/N:11008/K:4096 | 241120234 | 307707933 | 21.64 |
| M:2048/N:11008/K:4096 | 481091232 | 648921367 | 25.86 |
| M:1024/N:4096/K:11008 | 241736343 | 310129880 | 22.05 |
| M:2048/N:4096/K:11008 | 480456703 | 644814999 | 25.49 |
| M:1024/N:11008/K:11008 | 642121440 | 847925766 | 24.27 |
| M:2048/N:11008/K:11008 | 1276097154 | 1731314509 | 26.29 |
---
 cmake/onnxruntime_mlas.cmake                  |    5 +
 .../contrib_ops/cpu/bert/gqa_attention_base.h |    6 +
 onnxruntime/core/mlas/inc/mlas.h              |  102 +-
 onnxruntime/core/mlas/lib/fp16_common.h       |   99 ++
 onnxruntime/core/mlas/lib/halfgemm.cpp        |  170 ++
 onnxruntime/core/mlas/lib/halfgemm.h          |  122 ++
 .../mlas/lib/halfgemm_kernel_neon_fp16.cpp    | 1572 +++++++++++++++++
 .../core/mlas/lib/hgemm_kernel_neon.cpp       |   28 +
 .../mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp  |   33 -
 onnxruntime/core/mlas/lib/mlasi.h             |   11 +
 onnxruntime/core/mlas/lib/platform.cpp        |    1 +
 onnxruntime/test/mlas/bench/bench_hgemm.cpp   |   86 +
 .../test/mlas/unittest/test_hgemm_neon.cpp    |  393 +++++
 13 files changed, 2594 insertions(+), 34 deletions(-)
 create mode 100644 onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp
 create mode 100644 onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp
 create mode 100644 onnxruntime/test/mlas/bench/bench_hgemm.cpp
 create mode 100644 onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp

diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 5124262ec0..ed3ad89247 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -95,6 +95,8 @@ function(setup_mlas_source_for_windows)
         ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
         ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
         ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
+        ${MLAS_SRC_DIR}/hgemm_kernel_neon.cpp
+        ${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp
       )
 
       set(mlas_platform_preprocess_srcs
@@ -374,6 +376,7 @@ else()
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.h
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon.cpp
+          ${MLAS_SRC_DIR}/hgemm_kernel_neon.cpp
         )
         set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
                                     PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
@@ -394,6 +397,7 @@ else()
             ${MLAS_SRC_DIR}/cast_kernel_neon.cpp
             ${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp
             ${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp
+            ${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp
           )
           set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
@@ -406,6 +410,7 @@ else()
           set_source_files_properties(${MLAS_SRC_DIR}/cast_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/hqnbitgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
           set_source_files_properties(${MLAS_SRC_DIR}/rotary_embedding_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+          set_source_files_properties(${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
         endif()
 
         if(ONNXRUNTIME_MLAS_MULTI_ARCH)
diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h
index ccaeb6654e..abb24e20a6 100644
--- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h
+++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h
@@ -75,6 +75,7 @@ class GQAAttentionBase {
     int seqlen_present_kv_cache = static_cast<int>(present_key->Shape().GetDims()[2]);
 
     // Compute the attention score.
+    // TODO(fajin): type depends on kernel supportability
     size_t bytes = SafeInt<size_t>(batch_size) * num_heads_ * sequence_length * seqlen_present_kv_cache * sizeof(float);
     auto attention_probs = allocator->Alloc(bytes);
     BufferUniquePtr scratch_buffer(attention_probs, BufferDeleter(allocator));
@@ -198,6 +199,11 @@ class GQAAttentionBase {
           math::GemmEx<float, ThreadPool>(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size, alpha, q,
                                           static_cast<int>(head_size), k, static_cast<int>(head_size), 0.0f /*bata*/,
                                           output, static_cast<int>(present_buffer_sequence_length), nullptr);
+          // TODO(fajin): update later
+          // } else if (MlasHGemmSupported(CblasNoTrans, CblasTrans)) {
+          //   MlasGemm(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size,
+          //            q, static_cast<int>(head_size), k, static_cast<int>(head_size), output,
+          //            static_cast<int>(present_buffer_sequence_length), alpha, 0.0f /*beta*/, nullptr);
         } else {
           size_t bytes = head_size * (sequence_length + total_seqlen) * sizeof(float);
           auto q_k_fp32 = allocator->Alloc(bytes);
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 207c058d89..7e0335cc66 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -1458,7 +1458,107 @@ MlasRotaryEmbedOneRow(
     T* output
 );
 
-    /**
+/**
+ * @brief Supply matrices data information to half precision gemm functions
+ */
+struct MLAS_HGEMM_DATA_PARAMS {
+    const MLAS_FP16* A; /**< Supplies the address of matrix A */
+    size_t lda;         /**< Supplies the first dimension of matrix A. */
+    const MLAS_FP16* B; /**< Supplies the address of matrix B */
+    size_t ldb;         /**< Supplies the first dimension of matrix B. */
+    MLAS_FP16* C;       /**< Supplies the address of matrix C */
+    size_t ldc;         /**< Supplies the first dimension of matrix C. */
+    uint16_t alpha;     /**< Supplies the scalar alpha multiplier (see GEMM definition). FP16 encoding. */
+    uint16_t beta;      /**< Supplies the scalar beta multiplier (see GEMM definition). FP16 encoding. */
+};
+
+/**
+ * @brief Check whether current CPU supports half precision gemm.
+ */
+bool
+MLASCALL
+MlasHGemmSupported(
+    CBLAS_TRANSPOSE TransA,
+    CBLAS_TRANSPOSE TransB
+    );
+
+/**
+ * @brief  Batched half precision matrix/matrix multiply operation (HGEMM)
+ *
+ * @param TransA     Supplies the transpose operation for matrix A.
+ * @param TransB     Supplies the transpose operation for matrix B.
+ * @param M          Supplies the number of rows of matrix A and matrix C.
+ * @param N          Supplies the number of columns of matrix B and matrix C.
+ * @param K          Supplies the number of columns of matrix A and the number of rows of matrix B.
+ * @param Data       An array of matrix data parameters, one per multiplication in the batch.
+ * @param BatchSize  Supplies number of multiplications in this batch
+ * @param ThreadPool Supplies the thread pool object to use, else nullptr if the
+                     base library threading support should be used.
+ */
+void
+MLASCALL
+MlasGemmBatch(
+    CBLAS_TRANSPOSE TransA,
+    CBLAS_TRANSPOSE TransB,
+    size_t M,
+    size_t N,
+    size_t K,
+    const MLAS_HGEMM_DATA_PARAMS* Data,
+    size_t BatchSize,
+    MLAS_THREADPOOL* ThreadPool
+    );
+
+/**
+ * @brief  half precision matrix/matrix multiply operation (HGEMM)
+ *         C = alpha * op(A) * op(B) + beta * C
+ *
+ * @param TransA  Supplies the transpose operation for matrix A. Currently only CblasNoTrans is supported.
+ * @param TransB  Supplies the transpose operation for matrix B. Currently only CblasTrans is supported.
+ * @param M       Supplies the number of rows of matrix A and matrix C.
+ * @param N       Supplies the number of columns of matrix B and matrix C.
+ * @param K       Supplies the number of columns of matrix A and the number of rows of matrix B.
+ * @param A       Supplies the address of matrix A
+ * @param lda     Supplies the first dimension of matrix A.
+ * @param B       Supplies the address of matrix B
+ * @param ldb     Supplies the first dimension of matrix B.
+ * @param C       Supplies the address of matrix C
+ * @param ldc     Supplies the first dimension of matrix C.
+ * @param alpha   Supplies the scalar alpha multiplier (see GEMM definition)
+ * @param beta    Supplies the scalar beta multiplier (see GEMM definition)
+ * @param ThreadPool Supplies the thread pool object to use, else nullptr if the base library threading support
+ *                   should be used.
+ */
+inline
+void
+MlasGemm(
+    CBLAS_TRANSPOSE TransA,
+    CBLAS_TRANSPOSE TransB,
+    size_t M,
+    size_t N,
+    size_t K,
+    const MLAS_FP16* A,
+    size_t lda,
+    const MLAS_FP16* B,
+    size_t ldb,
+    MLAS_FP16* C,
+    size_t ldc,
+    uint16_t alpha,
+    uint16_t beta,
+    MLAS_THREADPOOL* ThreadPool
+) {
+    MLAS_HGEMM_DATA_PARAMS Data;
+    Data.A = A;
+    Data.lda = lda;
+    Data.B = B;
+    Data.ldb = ldb;
+    Data.C = C;
+    Data.ldc = ldc;
+    Data.alpha = alpha;
+    Data.beta = beta;
+    MlasGemmBatch(TransA, TransB, M, N, K, &Data, 1, ThreadPool);
+}
+
+/**
  * @brief Whether current CPU supports FP16 acceleration.
 */
 bool MLASCALL
diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h
index f4c49905eb..acee567162 100644
--- a/onnxruntime/core/mlas/lib/fp16_common.h
+++ b/onnxruntime/core/mlas/lib/fp16_common.h
@@ -349,4 +349,103 @@ MlasBitwiseSelectFloat16x4(MLAS_UINT16X4 select, MLAS_FLOAT16X4 ones, MLAS_FLOAT
     return vbsl_f16(select, ones, zeros);
 }
 
+MLAS_FORCEINLINE
+void
+Transpose8x8(MLAS_FLOAT16X8& v0, MLAS_FLOAT16X8& v1, MLAS_FLOAT16X8& v2, MLAS_FLOAT16X8& v3,
+             MLAS_FLOAT16X8& v4, MLAS_FLOAT16X8& v5, MLAS_FLOAT16X8& v6, MLAS_FLOAT16X8& v7)
+{
+    // |v00|v01|v02|v03|v04|v05|v06|v07|
+    // |v10|v11|v12|v13|v14|v15|v16|v17|
+    // |v20|v21|v22|v23|v24|v25|v26|v27|
+    // |v30|v31|v32|v33|v34|v35|v36|v37|
+    // |v40|v41|v42|v43|v44|v45|v46|v47|
+    // |v50|v51|v52|v53|v54|v55|v56|v57|
+    // |v60|v61|v62|v63|v64|v65|v66|v67|
+    // |v70|v71|v72|v73|v74|v75|v76|v77|
+    float16x8x2_t t01 = vtrnq_f16(v0, v1);
+    float16x8x2_t t23 = vtrnq_f16(v2, v3);
+    float16x8x2_t t45 = vtrnq_f16(v4, v5);
+    float16x8x2_t t67 = vtrnq_f16(v6, v7);
+    // |v00|v10|v02|v12|v04|v14|v06|v16|
+    // |v01|v11|v03|v13|v05|v15|v07|v17|
+    // |v20|v30|v22|v32|v24|v34|v26|v36|
+    // |v21|v31|v23|v33|v25|v35|v27|v37|
+    // |v40|v50|v42|v52|v44|v54|v46|v56|
+    // |v41|v51|v43|v53|v45|v55|v47|v57|
+    // |v60|v70|v62|v72|v64|v74|v66|v76|
+    // |v61|v71|v63|v73|v65|v75|v67|v77|
+    float32x4x2_t t02 = vtrnq_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0]));
+    float32x4x2_t t13 = vtrnq_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1]));
+    float32x4x2_t t46 = vtrnq_f32(vreinterpretq_f32_f16(t45.val[0]), vreinterpretq_f32_f16(t67.val[0]));
+    float32x4x2_t t57 = vtrnq_f32(vreinterpretq_f32_f16(t45.val[1]), vreinterpretq_f32_f16(t67.val[1]));
+    // |v00|v10|v20|v30|v04|v14|v24|v34|
+    // |v01|v11|v21|v31|v05|v15|v25|v35|
+    // |v02|v12|v22|v32|v06|v16|v26|v36|
+    // |v03|v13|v23|v33|v07|v17|v27|v37|
+    // |v40|v50|v60|v70|v44|v54|v64|v74|
+    // |v41|v51|v61|v71|v45|v55|v65|v75|
+    // |v42|v52|v62|v72|v46|v56|v66|v76|
+    // |v43|v53|v63|v73|v47|v57|v67|v77|
+    v0 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t02.val[0]), vreinterpretq_f64_f32(t46.val[0])));
+    v4 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t02.val[0]), vreinterpretq_f64_f32(t46.val[0])));
+    v2 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t02.val[1]), vreinterpretq_f64_f32(t46.val[1])));
+    v6 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t02.val[1]), vreinterpretq_f64_f32(t46.val[1])));
+    v1 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t13.val[0]), vreinterpretq_f64_f32(t57.val[0])));
+    v5 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t13.val[0]), vreinterpretq_f64_f32(t57.val[0])));
+    v3 = vreinterpretq_f16_f64(vtrn1q_f64(vreinterpretq_f64_f32(t13.val[1]), vreinterpretq_f64_f32(t57.val[1])));
+    v7 = vreinterpretq_f16_f64(vtrn2q_f64(vreinterpretq_f64_f32(t13.val[1]), vreinterpretq_f64_f32(t57.val[1])));
+    // |v00|v10|v20|v30|v40|v50|v60|v70|
+    // |v01|v11|v21|v31|v41|v51|v61|v71|
+    // |v02|v12|v22|v32|v42|v52|v62|v72|
+    // |v03|v13|v23|v33|v43|v53|v63|v73|
+    // |v04|v14|v24|v34|v44|v54|v64|v74|
+    // |v05|v15|v25|v35|v45|v55|v65|v75|
+    // |v06|v16|v26|v36|v46|v56|v66|v76|
+    // |v07|v17|v27|v37|v47|v57|v67|v77|
+}
+
+MLAS_FORCEINLINE
+void
+Transpose4x8(MLAS_FLOAT16X8& v0, MLAS_FLOAT16X8& v1, MLAS_FLOAT16X8& v2, MLAS_FLOAT16X8& v3)
+{
+    // |v00|v01|v02|v03|v04|v05|v06|v07|
+    // |v10|v11|v12|v13|v14|v15|v16|v17|
+    // |v20|v21|v22|v23|v24|v25|v26|v27|
+    // |v30|v31|v32|v33|v34|v35|v36|v37|
+    //  =>
+    // |v00|v10|v20|v30|v04|v14|v24|v34|
+    // |v01|v11|v21|v31|v05|v15|v25|v35|
+    // |v02|v12|v22|v32|v06|v16|v26|v36|
+    // |v03|v13|v23|v33|v07|v17|v27|v37|
+    float16x8x2_t t01 = vtrnq_f16(v0, v1);
+    float16x8x2_t t23 = vtrnq_f16(v2, v3);
+
+    v0 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0])));
+    v2 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0])));
+    v1 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1])));
+    v3 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1])));
+}
+
+MLAS_FORCEINLINE
+void
+Transpose4x4(MLAS_FLOAT16X4& v0, MLAS_FLOAT16X4& v1, MLAS_FLOAT16X4& v2, MLAS_FLOAT16X4& v3)
+{
+    // |v00|v01|v02|v03|
+    // |v10|v11|v12|v13|
+    // |v20|v21|v22|v23|
+    // |v30|v31|v32|v33|
+    //  =>
+    // |v00|v10|v20|v30|
+    // |v01|v11|v21|v31|
+    // |v02|v12|v22|v32|
+    // |v03|v13|v23|v33|
+    float16x4x2_t t01 = vtrn_f16(v0, v1);
+    float16x4x2_t t23 = vtrn_f16(v2, v3);
+
+    v0 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0])));
+    v1 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1])));
+    v2 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0])));
+    v3 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1])));
+}
+
 #endif  // fp16 vector intrinsic supported
diff --git a/onnxruntime/core/mlas/lib/halfgemm.cpp b/onnxruntime/core/mlas/lib/halfgemm.cpp
index 49387d2fc9..65ab0e9ce4 100644
--- a/onnxruntime/core/mlas/lib/halfgemm.cpp
+++ b/onnxruntime/core/mlas/lib/halfgemm.cpp
@@ -324,6 +324,176 @@ MlasHalfGemmKernel<MLAS_HALF_GEMM_KERNEL_DEFAULT>(
     }
 }
 
+bool
+MLASCALL
+MlasHGemmSupported(
+    CBLAS_TRANSPOSE TransA,
+    CBLAS_TRANSPOSE TransB
+) {
+    // Half-precision GEMM kernel table of the current platform; may be null.
+    const auto* table = GetMlasPlatform().HGemmDispatch;
+    // Only A x Transpose(B) is implemented; every other combination is reported as unsupported.
+    if (TransA != CblasNoTrans || TransB != CblasTrans) {
+        return false;
+    }
+    // Supported only when the direct kernel, the B packing kernel and the packed kernel all exist.
+    return table && table->HGemmKernel_TransposedB && table->HPackBKernel_TransposedB &&
+           table->HGemmKernel_TransposedPackedB;
+}
+
+void
+HGemmOperation(
+    CBLAS_TRANSPOSE TransA,  // must be CblasNoTrans; anything else throws
+    CBLAS_TRANSPOSE TransB,  // must be CblasTrans; anything else throws
+    size_t K, // full K slice
+    const MLAS_HGEMM_DATA_PARAMS* DataParams,  // A, B, C, lda, ldb, ldc, alpha and beta of one GEMM
+    const size_t RangeStartM,  // first row of the C tile computed by this call
+    const size_t RangeCountM,  // number of rows in the C tile
+    const size_t RangeStartN,  // first column of the C tile
+    const size_t RangeCountN   // number of columns in the C tile
+) {
+    const size_t lda = DataParams->lda;
+    const size_t ldb = DataParams->ldb;
+    const size_t ldc = DataParams->ldc;
+    const _mlas_fp16_ alpha = DataParams->alpha;
+    const _mlas_fp16_ beta = DataParams->beta;
+    auto* dispatch = GetMlasPlatform().HGemmDispatch;
+    constexpr size_t StrideM = 2;
+    const auto beta_add = MLAS_FP16(1.0f);
+    constexpr size_t buffer_size = MLAS_HGEMM_STRIDEN * MLAS_HGEMM_STRIDEK;
+    MLAS_DECLSPEC_ALIGN(MLAS_FP16 PackedB[buffer_size], 16 * sizeof(_mlas_fp16_));
+    // Computes the requested M/N tile of C = alpha * A * Transpose(B) + beta * C through the platform kernels.
+    if (TransA == CblasNoTrans && TransB == CblasTrans) {
+        const auto* A = DataParams->A + RangeStartM * lda;
+        const auto* B = DataParams->B + RangeStartN * ldb;
+        auto* C = DataParams->C + RangeStartM * ldc + RangeStartN;
+
+        if (RangeCountM <= StrideM) {
+            if (!dispatch || !dispatch->HGemmKernel_TransposedB) {
+                MLAS_THROW_EX(std::runtime_error, "hgemm does not have A x Transposed(B) kernels");
+            }
+            // When M is small, B is visited once. The overhead of Pack(B') exceeds the benefits
+            // from A x Pack(B'). Therefore directly calculate A x B'.
+            // Without PackB, to utilize memory locality, iterate full K.
+            constexpr size_t StrideN = 16;
+            for (size_t n = 0, countN; n < RangeCountN; n += countN) {
+                countN = std::min(StrideN, RangeCountN - n);
+                dispatch->HGemmKernel_TransposedB(A, B, C, RangeCountM, countN, K, lda, ldb, ldc, alpha, beta);
+                B += countN * ldb;
+                C += countN;
+            }
+        } else {
+            if (!dispatch || !dispatch->HPackBKernel_TransposedB || !dispatch->HGemmKernel_TransposedPackedB) {
+                MLAS_THROW_EX(std::runtime_error, "hgemm does not have A x Transposed(B) kernels");
+            }
+            // 16N is the smallest pack unit. NOTE(review): K == 0 makes StrideK 0 and the division below undefined.
+            const size_t StrideK = std::min(K, size_t(MLAS_HGEMM_STRIDEK));
+            const size_t StrideN = buffer_size/StrideK & (~15); // >= MLAS_HGEMM_STRIDEN
+            for (size_t n = 0, countN; n < RangeCountN; n += countN) {
+                countN = std::min(StrideN, RangeCountN - n);
+                const MLAS_FP16* a = A;
+                const MLAS_FP16* b = B;
+                MLAS_FP16* c = C;
+                for (size_t k = 0, countK; k < K; k += countK) {
+                    countK = std::min(StrideK, K - k);
+                    dispatch->HPackBKernel_TransposedB(b, PackedB, countN, countK, ldb);
+                    const MLAS_FP16* aa = a;
+                    MLAS_FP16* cc = c;
+                    for (size_t m = 0, countM; m < RangeCountM; m += countM) {
+                        countM = std::min(StrideM, RangeCountM - m);
+                        // First K iteration, beta is applied to the whole C. In rest K iterations, use add mode.
+                        dispatch->HGemmKernel_TransposedPackedB(
+                            aa, PackedB, cc, countM, countN, countK, lda, ldc, alpha, k == 0 ? beta : beta_add.val);
+                        aa += countM * lda;
+                        cc += countM * ldc;
+                    }
+                    a += countK;
+                    b += countK;
+                }
+                B += countN * ldb;
+                C += countN;
+            }
+        }
+    } else {
+        MLAS_THROW_EX(std::runtime_error, "hgemm currently only supports A x Transpose(B)");
+    }
+}
+
+void
+MLASCALL
+MlasGemmBatch(
+    CBLAS_TRANSPOSE TransA,
+    CBLAS_TRANSPOSE TransB,
+    size_t M,  // rows of A and C, shared by every GEMM in the batch
+    size_t N,  // columns of C, shared by every GEMM in the batch
+    size_t K,  // full K extent, passed through to HGemmOperation
+    const MLAS_HGEMM_DATA_PARAMS* Data,  // BatchSize parameter blocks, one per GEMM
+    size_t BatchSize,
+    MLAS_THREADPOOL* ThreadPool  // may be null, in which case every GEMM runs on the calling thread
+) {
+    if (!ThreadPool) {
+        for (size_t gemm_i = 0; gemm_i < BatchSize; gemm_i++) {
+            HGemmOperation(TransA, TransB, K, &Data[gemm_i], 0, M, 0, N);
+        }
+        return;
+    }
+
+    // An empty problem has no C tiles to schedule. Returning here also keeps the divisions by
+    // BatchSize and StrideN below from dividing by zero.
+    if (M == 0 || N == 0 || BatchSize == 0) {
+        return;
+    }
+
+    const double Complexity = double(M) * double(N) * double(K) * double(BatchSize);
+    ptrdiff_t TargetThreadCount;
+    if (Complexity < double(MLAS_HGEMM_THREAD_COMPLEXITY) * GetMlasPlatform().MaximumThreadCount) {
+        TargetThreadCount = ptrdiff_t(Complexity / double(MLAS_HGEMM_THREAD_COMPLEXITY)) + 1;
+    } else {
+        TargetThreadCount = GetMlasPlatform().MaximumThreadCount;
+    }
+
+    ptrdiff_t MaximumThreadCount = MlasGetMaximumThreadCount(ThreadPool);
+    if (TargetThreadCount >= MaximumThreadCount) {
+        TargetThreadCount = MaximumThreadCount;
+    }
+
+    // Segment the operation across multiple threads.
+    ptrdiff_t ThreadsPerGemm = TargetThreadCount / BatchSize;
+    if (ThreadsPerGemm < 1) {
+        ThreadsPerGemm = 1;
+    }
+
+    constexpr size_t StrideM = 128;
+    size_t nc = N;
+    if (ThreadsPerGemm > 1) {
+        // more than one thread per GEMM
+        const size_t BlockedM = MlasDivRoundup(M, StrideM);
+        const size_t max_nc = MlasDivRoundup(N * BlockedM, ThreadsPerGemm);
+        if (max_nc < nc) {
+            nc = std::min(
+                nc, MlasDivRoundup(max_nc, MLAS_HGEMM_STRIDEN_THREAD_ALIGN) * MLAS_HGEMM_STRIDEN_THREAD_ALIGN);
+        }
+    }
+    const size_t StrideN = nc;
+
+    const size_t ThreadCountM = MlasDivRoundup(M, StrideM);
+    const size_t ThreadCountN = MlasDivRoundup(N, StrideN);
+    ThreadsPerGemm = ThreadCountM * ThreadCountN;
+
+    MlasTrySimpleParallel(ThreadPool, ThreadsPerGemm * static_cast<ptrdiff_t>(BatchSize), [&](ptrdiff_t tid) {
+        const auto gemm_i = tid / ThreadsPerGemm;
+        const auto blk_i = tid % ThreadsPerGemm;
+        const ptrdiff_t ThreadIdN = blk_i / ThreadCountM;
+        const ptrdiff_t ThreadIdM = blk_i % ThreadCountM;
+
+        const size_t RangeStartM = ThreadIdM * StrideM;
+        const size_t RangeCountM = std::min(M - RangeStartM, (size_t)StrideM);
+        const size_t RangeStartN = ThreadIdN * StrideN;
+        const size_t RangeCountN = std::min(N - RangeStartN, (size_t)StrideN);
+
+        HGemmOperation(TransA, TransB, K, &Data[gemm_i], RangeStartM, RangeCountM, RangeStartN, RangeCountN);
+    });
+}
 
 const MLAS_HALFGEMM_DISPATCH MlasHalfGemmDispatchDefault = {
     MlasHalfGemmOperation<MLAS_HALF_GEMM_KERNEL_DEFAULT>,
diff --git a/onnxruntime/core/mlas/lib/halfgemm.h b/onnxruntime/core/mlas/lib/halfgemm.h
index 61e2fbb0af..e280e6d409 100644
--- a/onnxruntime/core/mlas/lib/halfgemm.h
+++ b/onnxruntime/core/mlas/lib/halfgemm.h
@@ -513,3 +513,125 @@ MlasHalfGemmGetDispatch()
     return &MlasHalfGemmDispatchDefault;
 #endif
 }
+
+namespace hgemm_neon {
+// Kernel declarations; each has the same signature as the matching MLAS_HGEMM_DISPATCH function type.
+void HPackB_TransposedB_Kernel(  // matches MLAS_HGEMM_DISPATCH::HPackBKernel_TransposedB_Fn
+    const MLAS_FP16* B,
+    MLAS_FP16* PackedB,
+    size_t CountN,
+    size_t CountK,
+    size_t ldb
+);
+
+void HGemm_TransposedB_Kernel(  // matches MLAS_HGEMM_DISPATCH::HGemmKernel_TransposedB_Fn
+    const MLAS_FP16* A,
+    const MLAS_FP16* B,
+    MLAS_FP16* C,
+    size_t CountM,
+    size_t CountN,
+    size_t CountK,
+    size_t lda,
+    size_t ldb,
+    size_t ldc,
+    _mlas_fp16_ alpha,
+    _mlas_fp16_ beta
+);
+
+void HGemm_TransposedPackedB_Kernel(  // matches MLAS_HGEMM_DISPATCH::HGemmKernel_TransposedPackedB_Fn
+    const MLAS_FP16* A,
+    const MLAS_FP16* PackedB,
+    MLAS_FP16* C,
+    size_t CountM,
+    size_t CountN,
+    size_t CountK,
+    size_t lda,
+    size_t ldc,
+    _mlas_fp16_ alpha,
+    _mlas_fp16_ beta
+);
+
+}  // namespace hgemm_neon
+
+struct MLAS_HGEMM_DISPATCH {  // fp16 GEMM kernel table; every entry defaults to nullptr (kernel not provided)
+    /**
+     * @brief Pack the B matrix segment. B is column-major. Elements from CountK rows x CountN columns are packed
+     *        contiguously in row-major. Columns are packed in groups of 16 while at least 16 remain, then one
+     *        group of 8; a final group of < 8 columns is padded with 0 to 8 columns.
+     * @param      B                   the first element of the B matrix segment. Column major.
+     * @param[out] PackedB             the first element of the packed B matrix segment.
+     * @param      CountN              the number of columns of B chunk.
+     * @param      CountK              the number of rows of B chunk.
+     * @param      ldb                 the leading dimension of B.
+     */
+    typedef void(HPackBKernel_TransposedB_Fn) (
+        const MLAS_FP16* B,
+        MLAS_FP16* PackedB,
+        size_t CountN,
+        size_t CountK,
+        size_t ldb
+    );
+
+    HPackBKernel_TransposedB_Fn* HPackBKernel_TransposedB = nullptr;
+
+    /**
+     * @brief C = alpha * A * Transpose(B) + beta * C. CountM <= 2. B is not packed. Used when M is small.
+     *
+     * @param       A                   first row of the A matrix segment. Row major.
+     * @param       B                   first column of the B matrix segment. Column major.
+     * @param[out]  C                   first element of the output matrix segment. Row major.
+     * @param       CountM              the number of rows of A chunk.
+     * @param       CountN              the number of columns of B chunk.
+     * @param       CountK              the number of columns of A chunk and the number of rows of B chunk.
+     * @param       lda                 the leading dimension of A.
+     * @param       ldb                 the leading dimension of B.
+     * @param       ldc                 the leading dimension of C.
+     * @param       alpha               the alpha scalar value.
+     * @param       beta                the beta scalar value.
+     */
+    typedef void(HGemmKernel_TransposedB_Fn)(
+        const MLAS_FP16* A,
+        const MLAS_FP16* B,
+        MLAS_FP16* C,
+        size_t CountM,
+        size_t CountN,
+        size_t CountK,
+        size_t lda,
+        size_t ldb,
+        size_t ldc,
+        _mlas_fp16_ alpha,
+        _mlas_fp16_ beta
+    );
+
+    HGemmKernel_TransposedB_Fn* HGemmKernel_TransposedB = nullptr;
+
+     /**
+     * @brief C = alpha * A * Transpose(B) + beta * C. CountM <= 2. B has been packed using HPackBKernel_TransposedB_Fn.
+     *        Used when M is large.
+     *
+     * @param       A                   first row of the A matrix segment. Row major.
+     * @param       PackedB             first element of the packed B buffer.
+     * @param[out]  C                   first element of the output matrix segment. Row major.
+     * @param       CountM              the number of rows of A chunk.
+     * @param       CountN              the number of columns of B chunk.
+     * @param       CountK              the number of columns of A chunk and the number of rows of B chunk.
+     * @param       lda                 the leading dimension of A.
+     * @param       ldc                 the leading dimension of C.
+     * @param       alpha               the alpha scalar value.
+     * @param       beta                the beta scalar value.
+     */
+    typedef void(HGemmKernel_TransposedPackedB_Fn)(
+        const MLAS_FP16* A,
+        const MLAS_FP16* PackedB,
+        MLAS_FP16* C,
+        size_t CountM,
+        size_t CountN,
+        size_t CountK,
+        size_t lda,
+        size_t ldc,
+        _mlas_fp16_ alpha,
+        _mlas_fp16_ beta
+    );
+
+    HGemmKernel_TransposedPackedB_Fn* HGemmKernel_TransposedPackedB = nullptr;
+};
diff --git a/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp
new file mode 100644
index 0000000000..02ce38fcb2
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/halfgemm_kernel_neon_fp16.cpp
@@ -0,0 +1,1572 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    halfgemm_kernel_neon_fp16.cpp
+
+Abstract:
+
+    This module implements half precision GEMM kernel for neon.
+
+--*/
+
+#include <arm_neon.h>
+
+#include "halfgemm.h"
+#include "fp16_common.h"
+
+namespace hgemm_neon {
+
+void HPackB_TransposedB_Kernel(
+    const MLAS_FP16* B,
+    MLAS_FP16* PackedB,
+    size_t CountN,
+    size_t CountK,
+    size_t ldb
+) {
+    const _mlas_fp16_* B_data = reinterpret_cast<const _mlas_fp16_*>(B);
+    _mlas_fp16_* PackedB_data = reinterpret_cast<_mlas_fp16_*>(PackedB);
+    // Packs B (column n starts at B + n * ldb) so that, per column group, the values of one k are contiguous.
+    for (; CountN >= 16; CountN -= 16, B_data += 16 * ldb) {  // full groups of 16 columns
+        const _mlas_fp16_* b = B_data;
+        size_t k = CountK;
+        constexpr size_t step = 8 * 16; // 8 k rows x 16 columns written per iteration
+        for (; k >= 8; k -= 8, b += 8, PackedB_data += step) {
+            float16x8_t v0 = MlasLoadFloat16x8(b);
+            float16x8_t v1 = MlasLoadFloat16x8(b + ldb);
+            float16x8_t v2 = MlasLoadFloat16x8(b + 2 * ldb);
+            float16x8_t v3 = MlasLoadFloat16x8(b + 3 * ldb);
+            float16x8_t v4 = MlasLoadFloat16x8(b + 4 * ldb);
+            float16x8_t v5 = MlasLoadFloat16x8(b + 5 * ldb);
+            float16x8_t v6 = MlasLoadFloat16x8(b + 6 * ldb);
+            float16x8_t v7 = MlasLoadFloat16x8(b + 7 * ldb);
+            float16x8_t v8 = MlasLoadFloat16x8(b + 8 * ldb);
+            float16x8_t v9 = MlasLoadFloat16x8(b + 9 * ldb);
+            float16x8_t vA = MlasLoadFloat16x8(b + 10 * ldb);
+            float16x8_t vB = MlasLoadFloat16x8(b + 11 * ldb);
+            float16x8_t vC = MlasLoadFloat16x8(b + 12 * ldb);
+            float16x8_t vD = MlasLoadFloat16x8(b + 13 * ldb);
+            float16x8_t vE = MlasLoadFloat16x8(b + 14 * ldb);
+            float16x8_t vF = MlasLoadFloat16x8(b + 15 * ldb);
+            Transpose8x8(v0, v1, v2, v3, v4, v5, v6, v7);
+            Transpose8x8(v8, v9, vA, vB, vC, vD, vE, vF);
+            // Each packed row of 16 is the k-th vector of columns 0..7 followed by that of columns 8..15.
+            MlasStoreFloat16x8(PackedB_data, v0);
+            MlasStoreFloat16x8(PackedB_data + 8, v8);
+            MlasStoreFloat16x8(PackedB_data + 16, v1);
+            MlasStoreFloat16x8(PackedB_data + 24, v9);
+            MlasStoreFloat16x8(PackedB_data + 32, v2);
+            MlasStoreFloat16x8(PackedB_data + 40, vA);
+            MlasStoreFloat16x8(PackedB_data + 48, v3);
+            MlasStoreFloat16x8(PackedB_data + 56, vB);
+            MlasStoreFloat16x8(PackedB_data + 64, v4);
+            MlasStoreFloat16x8(PackedB_data + 72, vC);
+            MlasStoreFloat16x8(PackedB_data + 80, v5);
+            MlasStoreFloat16x8(PackedB_data + 88, vD);
+            MlasStoreFloat16x8(PackedB_data + 96, v6);
+            MlasStoreFloat16x8(PackedB_data + 104, vE);
+            MlasStoreFloat16x8(PackedB_data + 112, v7);
+            MlasStoreFloat16x8(PackedB_data + 120, vF);
+        }
+
+        if (k & 4) {  // k < 8 here: pack 4 more k rows of the 16 columns
+            float16x4_t v0 = MlasLoadFloat16x4(b);
+            float16x4_t v1 = MlasLoadFloat16x4(b + ldb);
+            float16x4_t v2 = MlasLoadFloat16x4(b + 2 * ldb);
+            float16x4_t v3 = MlasLoadFloat16x4(b + 3 * ldb);
+            float16x4_t v4 = MlasLoadFloat16x4(b + 4 * ldb);
+            float16x4_t v5 = MlasLoadFloat16x4(b + 5 * ldb);
+            float16x4_t v6 = MlasLoadFloat16x4(b + 6 * ldb);
+            float16x4_t v7 = MlasLoadFloat16x4(b + 7 * ldb);
+            float16x4_t v8 = MlasLoadFloat16x4(b + 8 * ldb);
+            float16x4_t v9 = MlasLoadFloat16x4(b + 9 * ldb);
+            float16x4_t vA = MlasLoadFloat16x4(b + 10 * ldb);
+            float16x4_t vB = MlasLoadFloat16x4(b + 11 * ldb);
+            float16x4_t vC = MlasLoadFloat16x4(b + 12 * ldb);
+            float16x4_t vD = MlasLoadFloat16x4(b + 13 * ldb);
+            float16x4_t vE = MlasLoadFloat16x4(b + 14 * ldb);
+            float16x4_t vF = MlasLoadFloat16x4(b + 15 * ldb);
+            Transpose4x4(v0, v1, v2, v3);
+            Transpose4x4(v4, v5, v6, v7);
+            Transpose4x4(v8, v9, vA, vB);
+            Transpose4x4(vC, vD, vE, vF);
+            MlasStoreFloat16x4(PackedB_data, v0);
+            MlasStoreFloat16x4(PackedB_data + 4, v4);
+            MlasStoreFloat16x4(PackedB_data + 8, v8);
+            MlasStoreFloat16x4(PackedB_data + 12, vC);
+            MlasStoreFloat16x4(PackedB_data + 16, v1);
+            MlasStoreFloat16x4(PackedB_data + 20, v5);
+            MlasStoreFloat16x4(PackedB_data + 24, v9);
+            MlasStoreFloat16x4(PackedB_data + 28, vD);
+            MlasStoreFloat16x4(PackedB_data + 32, v2);
+            MlasStoreFloat16x4(PackedB_data + 36, v6);
+            MlasStoreFloat16x4(PackedB_data + 40, vA);
+            MlasStoreFloat16x4(PackedB_data + 44, vE);
+            MlasStoreFloat16x4(PackedB_data + 48, v3);
+            MlasStoreFloat16x4(PackedB_data + 52, v7);
+            MlasStoreFloat16x4(PackedB_data + 56, vB);
+            MlasStoreFloat16x4(PackedB_data + 60, vF);
+
+            k -= 4, b += 4, PackedB_data += 4 * 16;
+        }
+
+        if (k > 0) {  // remaining 1..3 k rows: only the first k transposed rows are stored
+            float16x4_t v0 = MlasLoadPartialFloat16x4(b, k);
+            float16x4_t v1 = MlasLoadPartialFloat16x4(b + ldb, k);
+            float16x4_t v2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k);
+            float16x4_t v3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k);
+            float16x4_t v4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k);
+            float16x4_t v5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k);
+            float16x4_t v6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k);
+            float16x4_t v7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k);
+            float16x4_t v8 = MlasLoadPartialFloat16x4(b + 8 * ldb, k);
+            float16x4_t v9 = MlasLoadPartialFloat16x4(b + 9 * ldb, k);
+            float16x4_t vA = MlasLoadPartialFloat16x4(b + 10 * ldb, k);
+            float16x4_t vB = MlasLoadPartialFloat16x4(b + 11 * ldb, k);
+            float16x4_t vC = MlasLoadPartialFloat16x4(b + 12 * ldb, k);
+            float16x4_t vD = MlasLoadPartialFloat16x4(b + 13 * ldb, k);
+            float16x4_t vE = MlasLoadPartialFloat16x4(b + 14 * ldb, k);
+            float16x4_t vF = MlasLoadPartialFloat16x4(b + 15 * ldb, k);
+            Transpose4x4(v0, v1, v2, v3);
+            Transpose4x4(v4, v5, v6, v7);
+            Transpose4x4(v8, v9, vA, vB);
+            Transpose4x4(vC, vD, vE, vF);
+            MlasStoreFloat16x4(PackedB_data, v0);
+            MlasStoreFloat16x4(PackedB_data + 4, v4);
+            MlasStoreFloat16x4(PackedB_data + 8, v8);
+            MlasStoreFloat16x4(PackedB_data + 12, vC);
+            if (k > 1) {
+                MlasStoreFloat16x4(PackedB_data + 16, v1);
+                MlasStoreFloat16x4(PackedB_data + 20, v5);
+                MlasStoreFloat16x4(PackedB_data + 24, v9);
+                MlasStoreFloat16x4(PackedB_data + 28, vD);
+            }
+            if (k > 2) {
+                MlasStoreFloat16x4(PackedB_data + 32, v2);
+                MlasStoreFloat16x4(PackedB_data + 36, v6);
+                MlasStoreFloat16x4(PackedB_data + 40, vA);
+                MlasStoreFloat16x4(PackedB_data + 44, vE);
+            }
+
+            PackedB_data += k * 16;
+        }
+    }
+
+    if (CountN & 8) {  // CountN < 16 here: one group of 8 columns
+        const _mlas_fp16_* b = B_data;
+        size_t k = CountK;
+        constexpr size_t step = 8 * 8; // 8 k rows x 8 columns written per iteration
+        for (; k >= 8; k -= 8, b += 8, PackedB_data += step) {
+            float16x8_t v0 = MlasLoadFloat16x8(b);
+            float16x8_t v1 = MlasLoadFloat16x8(b + ldb);
+            float16x8_t v2 = MlasLoadFloat16x8(b + 2 * ldb);
+            float16x8_t v3 = MlasLoadFloat16x8(b + 3 * ldb);
+            float16x8_t v4 = MlasLoadFloat16x8(b + 4 * ldb);
+            float16x8_t v5 = MlasLoadFloat16x8(b + 5 * ldb);
+            float16x8_t v6 = MlasLoadFloat16x8(b + 6 * ldb);
+            float16x8_t v7 = MlasLoadFloat16x8(b + 7 * ldb);
+            Transpose8x8(v0, v1, v2, v3, v4, v5, v6, v7);
+
+            MlasStoreFloat16x8(PackedB_data, v0);
+            MlasStoreFloat16x8(PackedB_data + 8, v1);
+            MlasStoreFloat16x8(PackedB_data + 16, v2);
+            MlasStoreFloat16x8(PackedB_data + 24, v3);
+            MlasStoreFloat16x8(PackedB_data + 32, v4);
+            MlasStoreFloat16x8(PackedB_data + 40, v5);
+            MlasStoreFloat16x8(PackedB_data + 48, v6);
+            MlasStoreFloat16x8(PackedB_data + 56, v7);
+        }
+
+        if (k & 4) {  // k < 8 here: pack 4 more k rows of the 8 columns
+            float16x4_t v0 = MlasLoadFloat16x4(b);
+            float16x4_t v1 = MlasLoadFloat16x4(b + ldb);
+            float16x4_t v2 = MlasLoadFloat16x4(b + 2 * ldb);
+            float16x4_t v3 = MlasLoadFloat16x4(b + 3 * ldb);
+            float16x4_t v4 = MlasLoadFloat16x4(b + 4 * ldb);
+            float16x4_t v5 = MlasLoadFloat16x4(b + 5 * ldb);
+            float16x4_t v6 = MlasLoadFloat16x4(b + 6 * ldb);
+            float16x4_t v7 = MlasLoadFloat16x4(b + 7 * ldb);
+            Transpose4x4(v0, v1, v2, v3);
+            Transpose4x4(v4, v5, v6, v7);
+            MlasStoreFloat16x4(PackedB_data, v0);
+            MlasStoreFloat16x4(PackedB_data + 4, v4);
+            MlasStoreFloat16x4(PackedB_data + 8, v1);
+            MlasStoreFloat16x4(PackedB_data + 12, v5);
+            MlasStoreFloat16x4(PackedB_data + 16, v2);
+            MlasStoreFloat16x4(PackedB_data + 20, v6);
+            MlasStoreFloat16x4(PackedB_data + 24, v3);
+            MlasStoreFloat16x4(PackedB_data + 28, v7);
+            k -= 4, b += 4, PackedB_data += 4 * 8;
+        }
+
+        if (k > 0) {  // remaining 1..3 k rows: only the first k transposed rows are stored
+            float16x4_t v0 = MlasLoadPartialFloat16x4(b, k);
+            float16x4_t v1 = MlasLoadPartialFloat16x4(b + ldb, k);
+            float16x4_t v2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k);
+            float16x4_t v3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k);
+            float16x4_t v4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k);
+            float16x4_t v5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k);
+            float16x4_t v6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k);
+            float16x4_t v7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k);
+            Transpose4x4(v0, v1, v2, v3);
+            Transpose4x4(v4, v5, v6, v7);
+            MlasStoreFloat16x4(PackedB_data, v0);
+            MlasStoreFloat16x4(PackedB_data + 4, v4);
+            if (k > 1) {
+                MlasStoreFloat16x4(PackedB_data + 8, v1);
+                MlasStoreFloat16x4(PackedB_data + 12, v5);
+            }
+            if (k > 2) {
+                MlasStoreFloat16x4(PackedB_data + 16, v2);
+                MlasStoreFloat16x4(PackedB_data + 20, v6);
+            }
+
+            PackedB_data += k * 8;
+        }
+
+        B_data += 8 * ldb;
+        CountN -= 8;
+    }
+
+    if (CountN > 0) {  // 1..7 columns left: missing columns are filled with zero vectors up to 8
+        const _mlas_fp16_* b = B_data;
+        size_t k = CountK;
+        constexpr size_t step = 8 * 8; // 8 k rows x 8 columns (zero-extended) written per iteration
+        for (; k >= 8; k -= 8, b += 8, PackedB_data += step) {
+            float16x8_t v[8];
+            size_t i = 0;
+            for (; i < CountN; ++i) {
+                v[i] = MlasLoadFloat16x8(b + i * ldb);
+            }
+            for (; i < 8; ++i) {
+                v[i] = MlasZeroFloat16x8();
+            }
+            Transpose8x8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+            MlasStoreFloat16x8(PackedB_data, v[0]);
+            MlasStoreFloat16x8(PackedB_data + 8, v[1]);
+            MlasStoreFloat16x8(PackedB_data + 16, v[2]);
+            MlasStoreFloat16x8(PackedB_data + 24, v[3]);
+            MlasStoreFloat16x8(PackedB_data + 32, v[4]);
+            MlasStoreFloat16x8(PackedB_data + 40, v[5]);
+            MlasStoreFloat16x8(PackedB_data + 48, v[6]);
+            MlasStoreFloat16x8(PackedB_data + 56, v[7]);
+        }
+
+        if (k & 4) {  // k < 8 here: pack 4 more k rows
+            float16x4_t v[8];
+            size_t i = 0;
+            for (; i < CountN; ++i) {
+                v[i] = MlasLoadFloat16x4(b + i * ldb);
+            }
+            for (; i < 8; ++i) {
+                v[i] = MlasZeroFloat16x4();
+            }
+            Transpose4x4(v[0], v[1], v[2], v[3]);
+            Transpose4x4(v[4], v[5], v[6], v[7]);
+            MlasStoreFloat16x4(PackedB_data, v[0]);
+            MlasStoreFloat16x4(PackedB_data + 4, v[4]);
+            MlasStoreFloat16x4(PackedB_data + 8, v[1]);
+            MlasStoreFloat16x4(PackedB_data + 12, v[5]);
+            MlasStoreFloat16x4(PackedB_data + 16, v[2]);
+            MlasStoreFloat16x4(PackedB_data + 20, v[6]);
+            MlasStoreFloat16x4(PackedB_data + 24, v[3]);
+            MlasStoreFloat16x4(PackedB_data + 28, v[7]);
+            k -= 4, b += 4, PackedB_data += 4 * 8;
+        }
+
+        if (k > 0) {  // remaining 1..3 k rows; PackedB_data is not advanced because nothing is written after this
+            float16x4_t v[8];
+            size_t i = 0;
+            for (; i < CountN; ++i) {
+                v[i] = MlasLoadPartialFloat16x4(b + i * ldb, k);
+            }
+            for (; i < 8; ++i) {
+                v[i] = MlasZeroFloat16x4();
+            }
+            Transpose4x4(v[0], v[1], v[2], v[3]);
+            Transpose4x4(v[4], v[5], v[6], v[7]);
+            MlasStoreFloat16x4(PackedB_data, v[0]);
+            MlasStoreFloat16x4(PackedB_data + 4, v[4]);
+            if (k > 1) {
+                MlasStoreFloat16x4(PackedB_data + 8, v[1]);
+                MlasStoreFloat16x4(PackedB_data + 12, v[5]);
+            }
+            if (k > 2) {
+                MlasStoreFloat16x4(PackedB_data + 16, v[2]);
+                MlasStoreFloat16x4(PackedB_data + 20, v[6]);
+            }
+        }
+    }
+}
+
+MLAS_FORCEINLINE
+float16x8_t addq_f16x4(float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3) {
+    // Lane-wise sum of four vectors, reduced pairwise: (v0 + v1) + (v2 + v3).
+    const float16x8_t sum01 = vaddq_f16(v0, v1);
+    const float16x8_t sum23 = vaddq_f16(v2, v3);
+    return vaddq_f16(sum01, sum23);
+}
+
+MLAS_FORCEINLINE
+float16x8_t addq_f16x8(float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3,
+                       float16x8_t v4, float16x8_t v5, float16x8_t v6, float16x8_t v7) {
+    return vaddq_f16(addq_f16x4(v0, v1, v2, v3), addq_f16x4(v4, v5, v6, v7));  // lane-wise sum of all eight inputs
+}
+
+MLAS_FORCEINLINE
+float16x8_t maq_lane_f16_accu(float16x8_t accu0, float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3,
+                              float16x4_t a0) {
+    // Returns accu0 + v0 * a0[0] + v1 * a0[1] + v2 * a0[2] + v3 * a0[3], fused in lane order.
+    float16x8_t acc = vfmaq_lane_f16(accu0, v0, a0, 0);
+    acc = vfmaq_lane_f16(acc, v1, a0, 1);
+    acc = vfmaq_lane_f16(acc, v2, a0, 2);
+    return vfmaq_lane_f16(acc, v3, a0, 3);
+}
+
+MLAS_FORCEINLINE
+float16x8_t maq_laneq_f16_accu(float16x8_t accu0, float16x8_t v0, float16x8_t v1, float16x8_t v2, float16x8_t v3,
+                               float16x8_t v4, float16x8_t v5, float16x8_t v6, float16x8_t v7, float16x8_t a0) {
+    // Returns accu0 + sum over i = 0..7 of v<i> * a0[i], fused in lane order.
+    float16x8_t acc = vfmaq_laneq_f16(accu0, v0, a0, 0);
+    acc = vfmaq_laneq_f16(acc, v1, a0, 1);
+    acc = vfmaq_laneq_f16(acc, v2, a0, 2);
+    acc = vfmaq_laneq_f16(acc, v3, a0, 3);
+    acc = vfmaq_laneq_f16(acc, v4, a0, 4);
+    acc = vfmaq_laneq_f16(acc, v5, a0, 5);
+    acc = vfmaq_laneq_f16(acc, v6, a0, 6);
+    return vfmaq_laneq_f16(acc, v7, a0, 7);
+}
+
+MLAS_FORCEINLINE
+float16x4_t ma_lane_f16_accu(float16x4_t accu, float16x4_t v0, float16x4_t v1, float16x4_t v2, float16x4_t v3,
+                             float16x4_t a0) {
+    // Returns accu + v0 * a0[0] + v1 * a0[1] + v2 * a0[2] + v3 * a0[3], fused in lane order.
+    float16x4_t acc = vfma_lane_f16(accu, v0, a0, 0);
+    acc = vfma_lane_f16(acc, v1, a0, 1);
+    acc = vfma_lane_f16(acc, v2, a0, 2);
+    return vfma_lane_f16(acc, v3, a0, 3);
+}
+
+template <int beta_behavior> // 0: beta == 0.0f16, 1: beta == 1.0f16, 2: beta != 0.0f16 && beta != 1.0f16
+void HGemm_TransposedB_Kernel_M1(
+    const _mlas_fp16_* A_data,
+    const _mlas_fp16_* B_data,
+    _mlas_fp16_* C_data,
+    size_t CountN,
+    size_t CountK,
+    size_t ldb,
+    _mlas_fp16_ alpha,
+    _mlas_fp16_ beta
+) {
+    for (; CountN >= 8; CountN -= 8, B_data += 8 * ldb, C_data += 8) {
+        const auto* a = A_data;
+        const auto* b = B_data;
+        size_t k = CountK;
+        float16x8_t accu0 = MlasZeroFloat16x8();
+        float16x8_t accu1 = MlasZeroFloat16x8();
+        float16x8_t accu2 = MlasZeroFloat16x8();
+        float16x8_t accu3 = MlasZeroFloat16x8();
+        float16x8_t accu4 = MlasZeroFloat16x8();
+        float16x8_t accu5 = MlasZeroFloat16x8();
+        float16x8_t accu6 = MlasZeroFloat16x8();
+        float16x8_t accu7 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, b += 8) {
+            float16x8_t b0 = MlasLoadFloat16x8(b);
+            float16x8_t b1 = MlasLoadFloat16x8(b + ldb);
+            float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb);
+            float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb);
+            float16x8_t b4 = MlasLoadFloat16x8(b + 4 * ldb);
+            float16x8_t b5 = MlasLoadFloat16x8(b + 5 * ldb);
+            float16x8_t b6 = MlasLoadFloat16x8(b + 6 * ldb);
+            float16x8_t b7 = MlasLoadFloat16x8(b + 7 * ldb);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            accu0 = vfmaq_f16(accu0, b0, a0);
+            accu1 = vfmaq_f16(accu1, b1, a0);
+            accu2 = vfmaq_f16(accu2, b2, a0);
+            accu3 = vfmaq_f16(accu3, b3, a0);
+            accu4 = vfmaq_f16(accu4, b4, a0);
+            accu5 = vfmaq_f16(accu5, b5, a0);
+            accu6 = vfmaq_f16(accu6, b6, a0);
+            accu7 = vfmaq_f16(accu7, b7, a0);
+        }
+        Transpose8x8(accu0, accu1, accu2, accu3, accu4, accu5, accu6, accu7);
+        accu0 = addq_f16x8(accu0, accu1, accu2, accu3, accu4, accu5, accu6, accu7); // accumulator of 8 columns
+
+        if (k & 4) {
+            float16x4_t b0 = MlasLoadFloat16x4(b);
+            float16x4_t b1 = MlasLoadFloat16x4(b + ldb);
+            float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb);
+            float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb);
+            float16x4_t b4 = MlasLoadFloat16x4(b + 4 * ldb);
+            float16x4_t b5 = MlasLoadFloat16x4(b + 5 * ldb);
+            float16x4_t b6 = MlasLoadFloat16x4(b + 6 * ldb);
+            float16x4_t b7 = MlasLoadFloat16x4(b + 7 * ldb);
+            Transpose4x4(b0, b1, b2, b3);
+            Transpose4x4(b4, b5, b6, b7);
+            float16x8_t v0 = vcombine_f16(b0, b4);
+            float16x8_t v1 = vcombine_f16(b1, b5);
+            float16x8_t v2 = vcombine_f16(b2, b6);
+            float16x8_t v3 = vcombine_f16(b3, b7);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            accu0 = maq_lane_f16_accu(accu0, v0, v1, v2, v3, a0);
+            k -= 4, a += 4, b += 4;
+        }
+
+        if (k > 0) {
+            float16x4_t b0 = MlasLoadPartialFloat16x4(b, k);
+            float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k);
+            float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k);
+            float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k);
+            float16x4_t b4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k);
+            float16x4_t b5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k);
+            float16x4_t b6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k);
+            float16x4_t b7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k);
+            Transpose4x4(b0, b1, b2, b3);
+            Transpose4x4(b4, b5, b6, b7);
+            float16x8_t v0 = vcombine_f16(b0, b4), v1, v2;
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            accu0 = vfmaq_lane_f16(accu0, v0, a0, 0);
+            if (k > 1) {
+                v1 = vcombine_f16(b1, b5);
+                accu0 = vfmaq_lane_f16(accu0, v1, a0, 1);
+            }
+            if (k > 2) {
+                v2 = vcombine_f16(b2, b6);
+                accu0 = vfmaq_lane_f16(accu0, v2, a0, 2);
+            }
+        }
+
+        if constexpr (beta_behavior == 1) {
+            float16x8_t c = MlasLoadFloat16x8(C_data);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu0 = vfmaq_f16(c, accu0, alpha_v);
+            MlasStoreFloat16x8(C_data, accu0);
+        } else if constexpr (beta_behavior == 2) {
+            float16x8_t c = MlasLoadFloat16x8(C_data);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            float16x8_t beta_v = MlasBroadcastFloat16x8(beta);
+            accu0 = vfmaq_f16(vmulq_f16(c, beta_v), accu0, alpha_v);
+            MlasStoreFloat16x8(C_data, accu0);
+        } else {
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu0 = vmulq_f16(accu0, alpha_v);
+            MlasStoreFloat16x8(C_data, accu0);
+        }
+    }
+
+    if (CountN & 4) {
+        const auto* a = A_data;
+        const auto* b = B_data;
+        size_t k = CountK;
+        float16x8_t accu0 = MlasZeroFloat16x8();
+        float16x8_t accu1 = MlasZeroFloat16x8();
+        float16x8_t accu2 = MlasZeroFloat16x8();
+        float16x8_t accu3 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, b += 8) {
+            float16x8_t b0 = MlasLoadFloat16x8(b);
+            float16x8_t b1 = MlasLoadFloat16x8(b + ldb);
+            float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb);
+            float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            accu0 = vfmaq_f16(accu0, b0, a0);
+            accu1 = vfmaq_f16(accu1, b1, a0);
+            accu2 = vfmaq_f16(accu2, b2, a0);
+            accu3 = vfmaq_f16(accu3, b3, a0);
+        }
+        Transpose4x8(accu0, accu1, accu2, accu3);
+        accu0 = addq_f16x4(accu0, accu1, accu2, accu3); // accumulator of 4 columns
+        float16x4_t accu = vadd_f16(vget_low_f16(accu0), vget_high_f16(accu0));
+
+        if (k & 4) {
+            float16x4_t b0 = MlasLoadFloat16x4(b);
+            float16x4_t b1 = MlasLoadFloat16x4(b + ldb);
+            float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb);
+            float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb);
+            Transpose4x4(b0, b1, b2, b3);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            accu = ma_lane_f16_accu(accu, b0, b1, b2, b3, a0);
+            k -= 4, a += 4, b += 4;
+        }
+
+        if (k > 0) {
+            float16x4_t b0 = MlasLoadPartialFloat16x4(b, k);
+            float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k);
+            float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k);
+            float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k);
+            Transpose4x4(b0, b1, b2, b3);
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            accu = vfma_lane_f16(accu, b0, a0, 0);
+            if (k > 1) {
+                accu = vfma_lane_f16(accu, b1, a0, 1);
+            }
+            if (k > 2) {
+                accu = vfma_lane_f16(accu, b2, a0, 2);
+            }
+        }
+
+        if constexpr (beta_behavior == 1) {
+            float16x4_t c = MlasLoadFloat16x4(C_data);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu = vfma_f16(c, accu, alpha_v);
+            MlasStoreFloat16x4(C_data, accu);
+        } else if constexpr (beta_behavior == 2) {
+            float16x4_t c = MlasLoadFloat16x4(C_data);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+            accu = vfma_f16(vmul_f16(c, beta_v), accu, alpha_v);
+            MlasStoreFloat16x4(C_data, accu);
+        } else {
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu = vmul_f16(accu, alpha_v);
+            MlasStoreFloat16x4(C_data, accu);
+        }
+
+        CountN -= 4, B_data += 4 * ldb, C_data += 4;
+    }
+
+    if (CountN > 0) {
+        const auto* a = A_data;
+        const auto* b = B_data;
+        size_t k = CountK;
+        float16x8_t accus[4];
+        size_t i = 0;
+        for (i = 0; i < 4; ++i) {
+            accus[i] = MlasZeroFloat16x8();
+        }
+        for (; k >= 8; k -= 8, a += 8, b += 8) {
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            for (i = 0; i < CountN; ++i) {
+                accus[i] = vfmaq_f16(accus[i], MlasLoadFloat16x8(b + i * ldb), a0);
+            }
+        }
+        Transpose4x8(accus[0], accus[1], accus[2], accus[3]);
+        float16x8_t accu0 = addq_f16x4(accus[0], accus[1], accus[2], accus[3]); // accumulator of 4 columns
+        float16x4_t accu = vadd_f16(vget_low_f16(accu0), vget_high_f16(accu0));
+
+        if (k & 4) {
+            float16x4_t bs[4];
+            for (i = 0; i < CountN; ++i) {
+                bs[i] = MlasLoadFloat16x4(b + i * ldb);
+            }
+            for (; i < 4; ++i) {
+                bs[i] = MlasZeroFloat16x4();
+            }
+            Transpose4x4(bs[0], bs[1], bs[2], bs[3]);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            accu = ma_lane_f16_accu(accu, bs[0], bs[1], bs[2], bs[3], a0);
+            k -= 4, a += 4, b += 4;
+        }
+
+        if (k > 0) {
+            float16x4_t bs[4];
+            for (i = 0; i < CountN; ++i) {
+                bs[i] = MlasLoadPartialFloat16x4(b + i * ldb, k);
+            }
+            for (; i < 4; ++i) {
+                bs[i] = MlasZeroFloat16x4();
+            }
+            Transpose4x4(bs[0], bs[1], bs[2], bs[3]);
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            accu = vfma_lane_f16(accu, bs[0], a0, 0);
+            if (k > 1) {
+                accu = vfma_lane_f16(accu, bs[1], a0, 1);
+            }
+            if (k > 2) {
+                accu = vfma_lane_f16(accu, bs[2], a0, 2);
+            }
+        }
+
+        if constexpr (beta_behavior == 1) {
+            float16x4_t c = MlasLoadPartialFloat16x4(C_data, CountN);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu = vfma_f16(c, accu, alpha_v);
+            MlasStorePartialFloat16x4(C_data, accu, CountN);
+        } else if constexpr (beta_behavior == 2) {
+            float16x4_t c = MlasLoadPartialFloat16x4(C_data, CountN);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+            accu = vfma_f16(vmul_f16(c, beta_v), accu, alpha_v);
+            MlasStorePartialFloat16x4(C_data, accu, CountN);
+        } else {
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu = vmul_f16(accu, alpha_v);
+            MlasStorePartialFloat16x4(C_data, accu, CountN);
+        }
+    }
+}
+
+template <int beta_behavior> // 0: beta == 0.0f16 (C is overwritten), 1: beta == 1.0f16 (C is added to), 2: beta != 0.0f16 && beta != 1.0f16 (C is scaled by beta first)
+void HGemm_TransposedB_Kernel_M2(  // two-row kernel: for rows r = 0, 1 and each column j < CountN, accumulates products of A row r and B row j over CountK elements
+    const _mlas_fp16_* A_data,  // row 0 of A; row 1 is read at A_data + lda
+    const _mlas_fp16_* B_data,  // row j of B is read at B_data + j * ldb; one row of B per output column
+    _mlas_fp16_* C_data,  // output row 0; output row 1 is at C_data + ldc
+    size_t CountN,  // number of output columns, i.e. rows of B consumed
+    size_t CountK,  // number of elements read from each row of A and of B
+    size_t lda,  // distance in elements between the two rows of A
+    size_t ldb,  // distance in elements between consecutive rows of B
+    size_t ldc,  // distance in elements between the two rows of C
+    _mlas_fp16_ alpha,  // scale applied to the accumulated products
+    _mlas_fp16_ beta  // scale applied to the existing C; only read when beta_behavior == 2
+) {
+    for (; CountN >= 8; CountN -= 8, B_data += 8 * ldb, C_data += 8) {  // main path: 8 output columns (8 rows of B) per iteration
+        const auto* a = A_data;
+        const auto* b = B_data;
+        size_t k = CountK;
+        float16x8_t accu00 = MlasZeroFloat16x8();  // accu0j / accu1j: per-lane partial sums for column j of output row 0 / row 1
+        float16x8_t accu01 = MlasZeroFloat16x8();
+        float16x8_t accu02 = MlasZeroFloat16x8();
+        float16x8_t accu03 = MlasZeroFloat16x8();
+        float16x8_t accu04 = MlasZeroFloat16x8();
+        float16x8_t accu05 = MlasZeroFloat16x8();
+        float16x8_t accu06 = MlasZeroFloat16x8();
+        float16x8_t accu07 = MlasZeroFloat16x8();
+        float16x8_t accu10 = MlasZeroFloat16x8();
+        float16x8_t accu11 = MlasZeroFloat16x8();
+        float16x8_t accu12 = MlasZeroFloat16x8();
+        float16x8_t accu13 = MlasZeroFloat16x8();
+        float16x8_t accu14 = MlasZeroFloat16x8();
+        float16x8_t accu15 = MlasZeroFloat16x8();
+        float16x8_t accu16 = MlasZeroFloat16x8();
+        float16x8_t accu17 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, b += 8) {  // walk K in chunks of 8 using full-width loads
+            float16x8_t b0 = MlasLoadFloat16x8(b);
+            float16x8_t b1 = MlasLoadFloat16x8(b + ldb);
+            float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb);
+            float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb);
+            float16x8_t b4 = MlasLoadFloat16x8(b + 4 * ldb);
+            float16x8_t b5 = MlasLoadFloat16x8(b + 5 * ldb);
+            float16x8_t b6 = MlasLoadFloat16x8(b + 6 * ldb);
+            float16x8_t b7 = MlasLoadFloat16x8(b + 7 * ldb);
+            float16x8_t a0 = MlasLoadFloat16x8(a);  // row 0 of A
+            float16x8_t a1 = MlasLoadFloat16x8(a + lda);  // row 1 of A; every B load above is shared by both rows
+            accu00 = vfmaq_f16(accu00, b0, a0);
+            accu01 = vfmaq_f16(accu01, b1, a0);
+            accu02 = vfmaq_f16(accu02, b2, a0);
+            accu03 = vfmaq_f16(accu03, b3, a0);
+            accu04 = vfmaq_f16(accu04, b4, a0);
+            accu05 = vfmaq_f16(accu05, b5, a0);
+            accu06 = vfmaq_f16(accu06, b6, a0);
+            accu07 = vfmaq_f16(accu07, b7, a0);
+            accu10 = vfmaq_f16(accu10, b0, a1);
+            accu11 = vfmaq_f16(accu11, b1, a1);
+            accu12 = vfmaq_f16(accu12, b2, a1);
+            accu13 = vfmaq_f16(accu13, b3, a1);
+            accu14 = vfmaq_f16(accu14, b4, a1);
+            accu15 = vfmaq_f16(accu15, b5, a1);
+            accu16 = vfmaq_f16(accu16, b6, a1);
+            accu17 = vfmaq_f16(accu17, b7, a1);
+        }
+        Transpose8x8(accu00, accu01, accu02, accu03, accu04, accu05, accu06, accu07);  // NOTE(review): Transpose8x8 + addq_f16x8 presumably fold the 8 accumulators so lane j holds the sum for column j - confirm against the helpers
+        Transpose8x8(accu10, accu11, accu12, accu13, accu14, accu15, accu16, accu17);
+        accu00 = addq_f16x8(accu00, accu01, accu02, accu03, accu04, accu05, accu06, accu07);  // from here on accu00 / accu10 carry the result for output row 0 / row 1
+        accu10 = addq_f16x8(accu10, accu11, accu12, accu13, accu14, accu15, accu16, accu17);
+
+        if (k & 4) {  // 4..7 elements of K left: consume 4 of them with half-width loads
+            float16x4_t b0 = MlasLoadFloat16x4(b);
+            float16x4_t b1 = MlasLoadFloat16x4(b + ldb);
+            float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb);
+            float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb);
+            float16x4_t b4 = MlasLoadFloat16x4(b + 4 * ldb);
+            float16x4_t b5 = MlasLoadFloat16x4(b + 5 * ldb);
+            float16x4_t b6 = MlasLoadFloat16x4(b + 6 * ldb);
+            float16x4_t b7 = MlasLoadFloat16x4(b + 7 * ldb);
+            Transpose4x4(b0, b1, b2, b3);  // presumably regroups by K index so that bi holds element i of four B rows - confirm
+            Transpose4x4(b4, b5, b6, b7);
+            float16x8_t v0 = vcombine_f16(b0, b4);  // joins the two 4-column groups into one 8-column vector
+            float16x8_t v1 = vcombine_f16(b1, b5);
+            float16x8_t v2 = vcombine_f16(b2, b6);
+            float16x8_t v3 = vcombine_f16(b3, b7);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            float16x4_t a1 = MlasLoadFloat16x4(a + lda);
+            accu00 = maq_lane_f16_accu(accu00, v0, v1, v2, v3, a0);
+            accu10 = maq_lane_f16_accu(accu10, v0, v1, v2, v3, a1);
+            k -= 4, a += 4, b += 4;
+        }
+
+        if (k > 0) {  // 1..3 elements of K left
+            float16x4_t b0 = MlasLoadPartialFloat16x4(b, k);  // presumably reads only the first k elements - confirm against the helper
+            float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k);
+            float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k);
+            float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k);
+            float16x4_t b4 = MlasLoadPartialFloat16x4(b + 4 * ldb, k);
+            float16x4_t b5 = MlasLoadPartialFloat16x4(b + 5 * ldb, k);
+            float16x4_t b6 = MlasLoadPartialFloat16x4(b + 6 * ldb, k);
+            float16x4_t b7 = MlasLoadPartialFloat16x4(b + 7 * ldb, k);
+            Transpose4x4(b0, b1, b2, b3);
+            Transpose4x4(b4, b5, b6, b7);
+            float16x8_t v0 = vcombine_f16(b0, b4);
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k);
+            accu00 = vfmaq_lane_f16(accu00, v0, a0, 0);  // lane 0 of A is always valid here because k >= 1
+            accu10 = vfmaq_lane_f16(accu10, v0, a1, 0);
+            if (k > 1) {  // lane 1 is only consumed when at least 2 elements remain
+                float16x8_t v1 = vcombine_f16(b1, b5);
+                accu00 = vfmaq_lane_f16(accu00, v1, a0, 1);
+                accu10 = vfmaq_lane_f16(accu10, v1, a1, 1);
+            }
+            if (k > 2) {  // lane 2 is only consumed when 3 elements remain; lane 3 is never used in this tail
+                float16x8_t v2 = vcombine_f16(b2, b6);
+                accu00 = vfmaq_lane_f16(accu00, v2, a0, 2);
+                accu10 = vfmaq_lane_f16(accu10, v2, a1, 2);
+            }
+        }
+
+        if constexpr (beta_behavior == 1) {  // C = C + alpha * accu
+            float16x8_t c0 = MlasLoadFloat16x8(C_data);
+            float16x8_t c1 = MlasLoadFloat16x8(C_data + ldc);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu00 = vfmaq_f16(c0, accu00, alpha_v);
+            accu10 = vfmaq_f16(c1, accu10, alpha_v);
+            MlasStoreFloat16x8(C_data, accu00);
+            MlasStoreFloat16x8(C_data + ldc, accu10);
+        } else if constexpr (beta_behavior == 2) {  // C = beta * C + alpha * accu
+            float16x8_t c0 = MlasLoadFloat16x8(C_data);
+            float16x8_t c1 = MlasLoadFloat16x8(C_data + ldc);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            float16x8_t beta_v = MlasBroadcastFloat16x8(beta);
+            accu00 = vfmaq_f16(vmulq_f16(c0, beta_v), accu00, alpha_v);
+            accu10 = vfmaq_f16(vmulq_f16(c1, beta_v), accu10, alpha_v);
+            MlasStoreFloat16x8(C_data, accu00);
+            MlasStoreFloat16x8(C_data + ldc, accu10);
+        } else {  // C = alpha * accu; the previous contents of C are not read
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu00 = vmulq_f16(accu00, alpha_v);
+            accu10 = vmulq_f16(accu10, alpha_v);
+            MlasStoreFloat16x8(C_data, accu00);
+            MlasStoreFloat16x8(C_data + ldc, accu10);
+        }
+    }
+
+    if (CountN & 4) {  // CountN < 8 here, so this is 4..7 columns left: handle 4 of them
+        const auto* a = A_data;
+        const auto* b = B_data;
+        size_t k = CountK;
+        float16x8_t accu00 = MlasZeroFloat16x8();
+        float16x8_t accu01 = MlasZeroFloat16x8();
+        float16x8_t accu02 = MlasZeroFloat16x8();
+        float16x8_t accu03 = MlasZeroFloat16x8();
+        float16x8_t accu10 = MlasZeroFloat16x8();
+        float16x8_t accu11 = MlasZeroFloat16x8();
+        float16x8_t accu12 = MlasZeroFloat16x8();
+        float16x8_t accu13 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, b += 8) {  // same chunk-of-8 accumulation as the main path, for 4 rows of B
+            float16x8_t b0 = MlasLoadFloat16x8(b);
+            float16x8_t b1 = MlasLoadFloat16x8(b + ldb);
+            float16x8_t b2 = MlasLoadFloat16x8(b + 2 * ldb);
+            float16x8_t b3 = MlasLoadFloat16x8(b + 3 * ldb);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            float16x8_t a1 = MlasLoadFloat16x8(a + lda);
+            accu00 = vfmaq_f16(accu00, b0, a0);
+            accu01 = vfmaq_f16(accu01, b1, a0);
+            accu02 = vfmaq_f16(accu02, b2, a0);
+            accu03 = vfmaq_f16(accu03, b3, a0);
+            accu10 = vfmaq_f16(accu10, b0, a1);
+            accu11 = vfmaq_f16(accu11, b1, a1);
+            accu12 = vfmaq_f16(accu12, b2, a1);
+            accu13 = vfmaq_f16(accu13, b3, a1);
+        }
+        Transpose4x8(accu00, accu01, accu02, accu03);  // NOTE(review): Transpose4x8 + addq_f16x4 presumably leave per-column partial sums split across the low and high halves - confirm
+        Transpose4x8(accu10, accu11, accu12, accu13);
+        accu00 = addq_f16x4(accu00, accu01, accu02, accu03);
+        accu10 = addq_f16x4(accu10, accu11, accu12, accu13);
+        float16x4_t accu0 = vadd_f16(vget_low_f16(accu00), vget_high_f16(accu00));  // add the low and high halves to get one 4-lane result for output row 0
+        float16x4_t accu1 = vadd_f16(vget_low_f16(accu10), vget_high_f16(accu10));  // same for output row 1
+
+        if (k & 4) {  // 4..7 elements of K left: consume 4 of them
+            float16x4_t b0 = MlasLoadFloat16x4(b);
+            float16x4_t b1 = MlasLoadFloat16x4(b + ldb);
+            float16x4_t b2 = MlasLoadFloat16x4(b + 2 * ldb);
+            float16x4_t b3 = MlasLoadFloat16x4(b + 3 * ldb);
+            Transpose4x4(b0, b1, b2, b3);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            float16x4_t a1 = MlasLoadFloat16x4(a + lda);
+            accu0 = ma_lane_f16_accu(accu0, b0, b1, b2, b3, a0);
+            accu1 = ma_lane_f16_accu(accu1, b0, b1, b2, b3, a1);
+            k -= 4, a += 4, b += 4;
+        }
+
+        if (k > 0) {  // 1..3 elements of K left
+            float16x4_t b0 = MlasLoadPartialFloat16x4(b, k);
+            float16x4_t b1 = MlasLoadPartialFloat16x4(b + ldb, k);
+            float16x4_t b2 = MlasLoadPartialFloat16x4(b + 2 * ldb, k);
+            float16x4_t b3 = MlasLoadPartialFloat16x4(b + 3 * ldb, k);
+            Transpose4x4(b0, b1, b2, b3);
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k);
+            accu0 = vfma_lane_f16(accu0, b0, a0, 0);
+            accu1 = vfma_lane_f16(accu1, b0, a1, 0);
+            if (k > 1) {
+                accu0 = vfma_lane_f16(accu0, b1, a0, 1);
+                accu1 = vfma_lane_f16(accu1, b1, a1, 1);
+            }
+            if (k > 2) {
+                accu0 = vfma_lane_f16(accu0, b2, a0, 2);
+                accu1 = vfma_lane_f16(accu1, b2, a1, 2);
+            }
+        }
+
+        if constexpr (beta_behavior == 1) {  // C = C + alpha * accu
+            float16x4_t c0 = MlasLoadFloat16x4(C_data);
+            float16x4_t c1 = MlasLoadFloat16x4(C_data + ldc);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu0 = vfma_f16(c0, accu0, alpha_v);
+            accu1 = vfma_f16(c1, accu1, alpha_v);
+            MlasStoreFloat16x4(C_data, accu0);
+            MlasStoreFloat16x4(C_data + ldc, accu1);
+        } else if constexpr (beta_behavior == 2) {  // C = beta * C + alpha * accu
+            float16x4_t c0 = MlasLoadFloat16x4(C_data);
+            float16x4_t c1 = MlasLoadFloat16x4(C_data + ldc);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+            accu0 = vfma_f16(vmul_f16(c0, beta_v), accu0, alpha_v);
+            accu1 = vfma_f16(vmul_f16(c1, beta_v), accu1, alpha_v);
+            MlasStoreFloat16x4(C_data, accu0);
+            MlasStoreFloat16x4(C_data + ldc, accu1);
+        } else {  // C = alpha * accu; the previous contents of C are not read
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu0 = vmul_f16(accu0, alpha_v);
+            accu1 = vmul_f16(accu1, alpha_v);
+            MlasStoreFloat16x4(C_data, accu0);
+            MlasStoreFloat16x4(C_data + ldc, accu1);
+        }
+
+        CountN -= 4, B_data += 4 * ldb, C_data += 4;  // step past the 4 columns just written
+    }
+
+    if (CountN > 0) {  // 1..3 columns left
+        const auto* a = A_data;
+        const auto* b = B_data;
+        size_t k = CountK;
+        float16x8_t accu0[4];  // 4 slots so the 4-column helpers can be reused; only the first CountN are ever updated
+        float16x8_t accu1[4];
+        size_t i = 0;
+        for (i = 0; i < 4; ++i) {
+            accu0[i] = MlasZeroFloat16x8();
+            accu1[i] = MlasZeroFloat16x8();
+        }
+        for (; k >= 8; k -= 8, a += 8, b += 8) {
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            float16x8_t a1 = MlasLoadFloat16x8(a + lda);
+            for (i = 0; i < CountN; ++i) {  // slots i >= CountN keep their zero value and no row of B is read for them
+                float16x8_t bi = MlasLoadFloat16x8(b + i * ldb);
+                accu0[i] = vfmaq_f16(accu0[i], bi, a0);
+                accu1[i] = vfmaq_f16(accu1[i], bi, a1);
+            }
+        }
+        Transpose4x8(accu0[0], accu0[1], accu0[2], accu0[3]);
+        Transpose4x8(accu1[0], accu1[1], accu1[2], accu1[3]);
+        float16x8_t accu00 = addq_f16x4(accu0[0], accu0[1], accu0[2], accu0[3]);
+        float16x4_t accu_0 = vadd_f16(vget_low_f16(accu00), vget_high_f16(accu00));  // 4-lane result for output row 0
+        float16x8_t accu10 = addq_f16x4(accu1[0], accu1[1], accu1[2], accu1[3]);
+        float16x4_t accu_1 = vadd_f16(vget_low_f16(accu10), vget_high_f16(accu10));  // 4-lane result for output row 1
+
+        if (k & 4) {  // 4..7 elements of K left: consume 4 of them
+            float16x4_t bs[4];
+            for (i = 0; i < CountN; ++i) {
+                bs[i] = MlasLoadFloat16x4(b + i * ldb);
+            }
+            for (; i < 4; ++i) {  // zero-fill the slots for rows of B that do not exist
+                bs[i] = MlasZeroFloat16x4();
+            }
+            Transpose4x4(bs[0], bs[1], bs[2], bs[3]);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            float16x4_t a1 = MlasLoadFloat16x4(a + lda);
+            accu_0 = ma_lane_f16_accu(accu_0, bs[0], bs[1], bs[2], bs[3], a0);
+            accu_1 = ma_lane_f16_accu(accu_1, bs[0], bs[1], bs[2], bs[3], a1);
+            k -= 4, a += 4, b += 4;
+        }
+
+        if (k > 0) {  // 1..3 elements of K left
+            float16x4_t bs[4];
+            for (i = 0; i < CountN; ++i) {
+                bs[i] = MlasLoadPartialFloat16x4(b + i * ldb, k);
+            }
+            for (; i < 4; ++i) {  // zero-fill the slots for rows of B that do not exist
+                bs[i] = MlasZeroFloat16x4();
+            }
+            Transpose4x4(bs[0], bs[1], bs[2], bs[3]);
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k);
+            accu_0 = vfma_lane_f16(accu_0, bs[0], a0, 0);
+            accu_1 = vfma_lane_f16(accu_1, bs[0], a1, 0);
+            if (k > 1) {
+                accu_0 = vfma_lane_f16(accu_0, bs[1], a0, 1);
+                accu_1 = vfma_lane_f16(accu_1, bs[1], a1, 1);
+            }
+            if (k > 2) {
+                accu_0 = vfma_lane_f16(accu_0, bs[2], a0, 2);
+                accu_1 = vfma_lane_f16(accu_1, bs[2], a1, 2);
+            }
+        }
+
+        if constexpr (beta_behavior == 1) {  // C = C + alpha * accu, using partial load/store sized by CountN
+            float16x4_t c0 = MlasLoadPartialFloat16x4(C_data, CountN);
+            float16x4_t c1 = MlasLoadPartialFloat16x4(C_data + ldc, CountN);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu_0 = vfma_f16(c0, accu_0, alpha_v);
+            accu_1 = vfma_f16(c1, accu_1, alpha_v);
+            MlasStorePartialFloat16x4(C_data, accu_0, CountN);
+            MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN);
+        } else if constexpr (beta_behavior == 2) {  // C = beta * C + alpha * accu
+            float16x4_t c0 = MlasLoadPartialFloat16x4(C_data, CountN);
+            float16x4_t c1 = MlasLoadPartialFloat16x4(C_data + ldc, CountN);
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+            accu_0 = vfma_f16(vmul_f16(c0, beta_v), accu_0, alpha_v);
+            accu_1 = vfma_f16(vmul_f16(c1, beta_v), accu_1, alpha_v);
+            MlasStorePartialFloat16x4(C_data, accu_0, CountN);
+            MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN);
+        } else {  // C = alpha * accu; the previous contents of C are not read
+            float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+            accu_0 = vmul_f16(accu_0, alpha_v);
+            accu_1 = vmul_f16(accu_1, alpha_v);
+            MlasStorePartialFloat16x4(C_data, accu_0, CountN);
+            MlasStorePartialFloat16x4(C_data + ldc, accu_1, CountN);
+        }
+    }
+}
+
+// Full K. Directly save to C. Dispatches 1 or 2 rows of A to the matching kernel, specialized on beta; CountM == 0 is a no-op and CountM > 2 throws.
+void HGemm_TransposedB_Kernel(
+    const MLAS_FP16* A,  // input rows; reinterpreted as _mlas_fp16_ below
+    const MLAS_FP16* B,  // one row of B per output column; reinterpreted as _mlas_fp16_ below
+    MLAS_FP16* C,  // output rows, written in place
+    size_t CountM,  // number of rows to compute; must be <= 2
+    size_t CountN,  // number of output columns
+    size_t CountK,  // number of elements accumulated per output value
+    size_t lda,  // row stride of A in elements; only forwarded to the two-row kernel
+    size_t ldb,  // row stride of B in elements
+    size_t ldc,  // row stride of C in elements; only forwarded to the two-row kernel
+    _mlas_fp16_ alpha,  // scale for the accumulated products
+    _mlas_fp16_ beta  // scale for the existing contents of C
+) {
+    if (CountM > 2) {
+        MLAS_THROW_EX(std::runtime_error, "HGemm_TransposedB_Kernel only support <= 2 rows");
+    }
+    const auto* A_data = reinterpret_cast<const _mlas_fp16_*>(A);
+    const auto* B_data = reinterpret_cast<const _mlas_fp16_*>(B);
+    auto* C_data = reinterpret_cast<_mlas_fp16_*>(C);
+    const auto f16_0 = MLAS_FP16(0.0f);  // NOTE(review): beta is compared to .val below; if that is a raw bit pattern, -0.0 takes the general (<2>) path - confirm
+    const auto f16_1 = MLAS_FP16(1.0f);
+    if (CountM == 1) {
+        if (beta == f16_0.val) {  // <0>: C is overwritten without being read
+            HGemm_TransposedB_Kernel_M1<0>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta);
+        } else if (beta == f16_1.val) {  // <1>: result is added to C
+            HGemm_TransposedB_Kernel_M1<1>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta);
+        } else {  // <2>: C is scaled by beta before the result is added
+            HGemm_TransposedB_Kernel_M1<2>(A_data, B_data, C_data, CountN, CountK, ldb, alpha, beta);
+        }
+    } else if (CountM == 2) {  // explicit check: with CountM == 0 the two-row kernel would read A + lda and write C + ldc for rows that do not exist
+        if (beta == f16_0.val) {
+            HGemm_TransposedB_Kernel_M2<0>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta);
+        } else if (beta == f16_1.val) {
+            HGemm_TransposedB_Kernel_M2<1>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta);
+        } else {
+            HGemm_TransposedB_Kernel_M2<2>(A_data, B_data, C_data, CountN, CountK, lda, ldb, ldc, alpha, beta);
+        }
+    }
+}
+
+template <int beta_behavior> // 0: beta == 0, 1: beta == 1, 2: beta != 0 && beta != 1
+void HGemm_TransposedPackedB_Kernel_M1(
+    const _mlas_fp16_* A,
+    const _mlas_fp16_* PackedB,
+    _mlas_fp16_* C,
+    size_t CountN,
+    size_t CountK,
+    _mlas_fp16_ alpha,
+    _mlas_fp16_ beta
+) {
+    for (; CountN >= 16; CountN -= 16, C += 16) {
+        const auto* a = A;
+        size_t k = CountK;
+        float16x8_t accu0 = MlasZeroFloat16x8();
+        float16x8_t accu1 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 16) {
+            float16x8_t b00 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56);
+            float16x8_t b40 = MlasLoadFloat16x8(PackedB + 64);
+            float16x8_t b41 = MlasLoadFloat16x8(PackedB + 72);
+            float16x8_t b50 = MlasLoadFloat16x8(PackedB + 80);
+            float16x8_t b51 = MlasLoadFloat16x8(PackedB + 88);
+            float16x8_t b60 = MlasLoadFloat16x8(PackedB + 96);
+            float16x8_t b61 = MlasLoadFloat16x8(PackedB + 104);
+            float16x8_t b70 = MlasLoadFloat16x8(PackedB + 112);
+            float16x8_t b71 = MlasLoadFloat16x8(PackedB + 120);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            accu0 = maq_laneq_f16_accu(accu0, b00, b10, b20, b30, b40, b50, b60, b70, a0);
+            accu1 = maq_laneq_f16_accu(accu1, b01, b11, b21, b31, b41, b51, b61, b71, a0);
+        }
+
+        if (k & 4) {
+            float16x8_t b00 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            accu0 = maq_lane_f16_accu(accu0, b00, b10, b20, b30, a0);
+            accu1 = maq_lane_f16_accu(accu1, b01, b11, b21, b31, a0);
+            k -= 4, a += 4, PackedB += 4 * 16;
+        }
+
+        if (k > 0) {
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x8_t b00 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8);
+            accu0 = vfmaq_lane_f16(accu0, b00, a0, 0);
+            accu1 = vfmaq_lane_f16(accu1, b01, a0, 0);
+            if (k > 1) {
+                float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16);
+                float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24);
+                accu0 = vfmaq_lane_f16(accu0, b10, a0, 1);
+                accu1 = vfmaq_lane_f16(accu1, b11, a0, 1);
+            }
+            if (k > 2) {
+                float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32);
+                float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40);
+                accu0 = vfmaq_lane_f16(accu0, b20, a0, 2);
+                accu1 = vfmaq_lane_f16(accu1, b21, a0, 2);
+            }
+
+            PackedB += k * 16;
+        }
+
+        if constexpr (beta_behavior == 1) {
+            float16x8_t c0 = MlasLoadFloat16x8(C);
+            float16x8_t c1 = MlasLoadFloat16x8(C + 8);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu0 = vfmaq_f16(c0, accu0, alpha_v);
+            accu1 = vfmaq_f16(c1, accu1, alpha_v);
+            MlasStoreFloat16x8(C, accu0);
+            MlasStoreFloat16x8(C + 8, accu1);
+        } else if constexpr (beta_behavior == 2) {
+            float16x8_t c0 = MlasLoadFloat16x8(C);
+            float16x8_t c1 = MlasLoadFloat16x8(C + 8);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            float16x8_t beta_v = MlasBroadcastFloat16x8(beta);
+            accu0 = vfmaq_f16(vmulq_f16(c0, beta_v), accu0, alpha_v);
+            accu1 = vfmaq_f16(vmulq_f16(c1, beta_v), accu1, alpha_v);
+            MlasStoreFloat16x8(C, accu0);
+            MlasStoreFloat16x8(C + 8, accu1);
+        } else {
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu0 = vmulq_f16(accu0, alpha_v);
+            accu1 = vmulq_f16(accu1, alpha_v);
+            MlasStoreFloat16x8(C, accu0);
+            MlasStoreFloat16x8(C + 8, accu1);
+        }
+    }
+
+    if (CountN & 8) {
+        const auto* a = A;
+        size_t k = CountK;
+        float16x8_t accu0 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0);
+        }
+
+        if (k & 4) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0);
+            k -= 4, a += 4, PackedB += 4 * 8;
+        }
+
+        if (k > 0) {
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            accu0 = vfmaq_lane_f16(accu0, b0, a0, 0);
+            if (k > 1) {
+                float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+                accu0 = vfmaq_lane_f16(accu0, b1, a0, 1);
+            }
+            if (k > 2) {
+                float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+                accu0 = vfmaq_lane_f16(accu0, b2, a0, 2);
+            }
+            PackedB += k * 8;
+        }
+
+        if constexpr (beta_behavior == 1) {
+            float16x8_t c0 = MlasLoadFloat16x8(C);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu0 = vfmaq_f16(c0, accu0, alpha_v);
+            MlasStoreFloat16x8(C, accu0);
+        } else if constexpr (beta_behavior == 2) {
+            float16x8_t c0 = MlasLoadFloat16x8(C);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            float16x8_t beta_v = MlasBroadcastFloat16x8(beta);
+            accu0 = vfmaq_f16(vmulq_f16(c0, beta_v), accu0, alpha_v);
+            MlasStoreFloat16x8(C, accu0);
+        } else {
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu0 = vmulq_f16(accu0, alpha_v);
+            MlasStoreFloat16x8(C, accu0);
+        }
+
+        CountN -= 8, C += 8;
+    }
+
+    if (CountN > 0) {
+        const auto* a = A;
+        size_t k = CountK;
+        float16x8_t accu0 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0);
+        }
+
+        if (k & 4) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0);
+            k -= 4, a += 4, PackedB += 4 * 8;
+        }
+
+        if (k > 0) {
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            accu0 = vfmaq_lane_f16(accu0, b0, a0, 0);
+            if (k > 1) {
+                float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+                accu0 = vfmaq_lane_f16(accu0, b1, a0, 1);
+            }
+            if (k > 2) {
+                float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+                accu0 = vfmaq_lane_f16(accu0, b2, a0, 2);
+            }
+            PackedB += k * 8;
+        }
+
+        float16x4_t accu_low = vget_low_f16(accu0);
+        float16x4_t accu_high = vget_high_f16(accu0);
+
+        if (CountN & 4) {
+            if constexpr (beta_behavior == 1) {
+                float16x4_t c0 = MlasLoadFloat16x4(C);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStoreFloat16x4(C, vfma_f16(c0, accu_low, alpha_v));
+            } else if constexpr (beta_behavior == 2) {
+                float16x4_t c0 = MlasLoadFloat16x4(C);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+                MlasStoreFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu_low, alpha_v));
+            } else {
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStoreFloat16x4(C, vmul_f16(accu_low, alpha_v));
+            }
+
+            CountN -= 4, C += 4;
+            accu_low = accu_high;
+        }
+
+        if (CountN) {
+            if constexpr (beta_behavior == 1) {
+                float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStorePartialFloat16x4(C, vfma_f16(c0, accu_low, alpha_v), CountN);
+            } else if constexpr (beta_behavior == 2) {
+                float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+                MlasStorePartialFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu_low, alpha_v), CountN);
+            } else {
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStorePartialFloat16x4(C, vmul_f16(accu_low, alpha_v), CountN);
+            }
+        }
+    }
+}
+
+template <int beta_behavior> // 0: beta == 0, 1: beta == 1, 2: beta != 0 && beta != 1
+void HGemm_TransposedPackedB_Kernel_M2(  // two-row kernel: C rows 0-1 = alpha * (A rows 0-1 x PackedB), combined with C per beta_behavior
+    const _mlas_fp16_* A,  // row 0 starts at A, row 1 at A + lda; CountK elements are read from each
+    const _mlas_fp16_* PackedB,  // read front to back: 16 values per k-step in the 16-column loop, 8 per k-step afterwards
+    _mlas_fp16_* C,  // row 0 starts at C, row 1 at C + ldc
+    size_t CountN,  // number of output columns
+    size_t CountK,  // reduction length
+    size_t lda,
+    size_t ldc,
+    _mlas_fp16_ alpha,  // scales the accumulated product in every beta_behavior branch
+    _mlas_fp16_ beta  // only read when beta_behavior == 2
+) {
+    for (; CountN >= 16; CountN -= 16, C += 16) {  // 2 rows x 16 columns per iteration
+        const auto* a = A;
+        size_t k = CountK;
+        float16x8_t accu00 = MlasZeroFloat16x8();
+        float16x8_t accu01 = MlasZeroFloat16x8();
+        float16x8_t accu10 = MlasZeroFloat16x8();
+        float16x8_t accu11 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 16) {  // 8 k-steps at a time
+            float16x8_t b00 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56);
+            float16x8_t b40 = MlasLoadFloat16x8(PackedB + 64);
+            float16x8_t b41 = MlasLoadFloat16x8(PackedB + 72);
+            float16x8_t b50 = MlasLoadFloat16x8(PackedB + 80);
+            float16x8_t b51 = MlasLoadFloat16x8(PackedB + 88);
+            float16x8_t b60 = MlasLoadFloat16x8(PackedB + 96);
+            float16x8_t b61 = MlasLoadFloat16x8(PackedB + 104);
+            float16x8_t b70 = MlasLoadFloat16x8(PackedB + 112);
+            float16x8_t b71 = MlasLoadFloat16x8(PackedB + 120);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            float16x8_t a1 = MlasLoadFloat16x8(a + lda);
+            accu00 = maq_laneq_f16_accu(accu00, b00, b10, b20, b30, b40, b50, b60, b70, a0);
+            accu01 = maq_laneq_f16_accu(accu01, b01, b11, b21, b31, b41, b51, b61, b71, a0);
+            accu10 = maq_laneq_f16_accu(accu10, b00, b10, b20, b30, b40, b50, b60, b70, a1);
+            accu11 = maq_laneq_f16_accu(accu11, b01, b11, b21, b31, b41, b51, b61, b71, a1);
+        }
+
+        if (k & 4) {  // 4 more k-steps
+            float16x8_t b00 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b30 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b31 = MlasLoadFloat16x8(PackedB + 56);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            float16x4_t a1 = MlasLoadFloat16x4(a + lda);
+            accu00 = maq_lane_f16_accu(accu00, b00, b10, b20, b30, a0);
+            accu01 = maq_lane_f16_accu(accu01, b01, b11, b21, b31, a0);
+            accu10 = maq_lane_f16_accu(accu10, b00, b10, b20, b30, a1);
+            accu11 = maq_lane_f16_accu(accu11, b01, b11, b21, b31, a1);
+            k -= 4, a += 4, PackedB += 4 * 16;
+        }
+
+        if (k > 0) {  // last 1-3 k-steps; A is read with a partial load
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k);
+            float16x8_t b00 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b01 = MlasLoadFloat16x8(PackedB + 8);
+            accu00 = vfmaq_lane_f16(accu00, b00, a0, 0);
+            accu01 = vfmaq_lane_f16(accu01, b01, a0, 0);
+            accu10 = vfmaq_lane_f16(accu10, b00, a1, 0);
+            accu11 = vfmaq_lane_f16(accu11, b01, a1, 0);
+            if (k > 1) {
+                float16x8_t b10 = MlasLoadFloat16x8(PackedB + 16);
+                float16x8_t b11 = MlasLoadFloat16x8(PackedB + 24);
+                accu00 = vfmaq_lane_f16(accu00, b10, a0, 1);
+                accu01 = vfmaq_lane_f16(accu01, b11, a0, 1);
+                accu10 = vfmaq_lane_f16(accu10, b10, a1, 1);
+                accu11 = vfmaq_lane_f16(accu11, b11, a1, 1);
+            }
+            if (k > 2) {
+                float16x8_t b20 = MlasLoadFloat16x8(PackedB + 32);
+                float16x8_t b21 = MlasLoadFloat16x8(PackedB + 40);
+                accu00 = vfmaq_lane_f16(accu00, b20, a0, 2);
+                accu01 = vfmaq_lane_f16(accu01, b21, a0, 2);
+                accu10 = vfmaq_lane_f16(accu10, b20, a1, 2);
+                accu11 = vfmaq_lane_f16(accu11, b21, a1, 2);
+            }
+            PackedB += k * 16;
+        }
+
+        if constexpr (beta_behavior == 1) {  // C = C + alpha * accu
+            float16x8_t c00 = MlasLoadFloat16x8(C);
+            float16x8_t c01 = MlasLoadFloat16x8(C + 8);
+            float16x8_t c10 = MlasLoadFloat16x8(C + ldc);
+            float16x8_t c11 = MlasLoadFloat16x8(C + ldc + 8);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu00 = vfmaq_f16(c00, accu00, alpha_v);
+            accu01 = vfmaq_f16(c01, accu01, alpha_v);
+            accu10 = vfmaq_f16(c10, accu10, alpha_v);
+            accu11 = vfmaq_f16(c11, accu11, alpha_v);
+            MlasStoreFloat16x8(C, accu00);
+            MlasStoreFloat16x8(C + 8, accu01);
+            MlasStoreFloat16x8(C + ldc, accu10);
+            MlasStoreFloat16x8(C + ldc + 8, accu11);
+        } else if constexpr (beta_behavior == 2) {  // C = beta * C + alpha * accu
+            float16x8_t c00 = MlasLoadFloat16x8(C);
+            float16x8_t c01 = MlasLoadFloat16x8(C + 8);
+            float16x8_t c10 = MlasLoadFloat16x8(C + ldc);
+            float16x8_t c11 = MlasLoadFloat16x8(C + ldc + 8);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            float16x8_t beta_v = MlasBroadcastFloat16x8(beta);
+            accu00 = vfmaq_f16(vmulq_f16(c00, beta_v), accu00, alpha_v);
+            accu01 = vfmaq_f16(vmulq_f16(c01, beta_v), accu01, alpha_v);
+            accu10 = vfmaq_f16(vmulq_f16(c10, beta_v), accu10, alpha_v);
+            accu11 = vfmaq_f16(vmulq_f16(c11, beta_v), accu11, alpha_v);
+            MlasStoreFloat16x8(C, accu00);
+            MlasStoreFloat16x8(C + 8, accu01);
+            MlasStoreFloat16x8(C + ldc, accu10);
+            MlasStoreFloat16x8(C + ldc + 8, accu11);
+        } else {  // C = alpha * accu; the previous contents of C are not read
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu00 = vmulq_f16(accu00, alpha_v);
+            accu01 = vmulq_f16(accu01, alpha_v);
+            accu10 = vmulq_f16(accu10, alpha_v);
+            accu11 = vmulq_f16(accu11, alpha_v);
+            MlasStoreFloat16x8(C, accu00);
+            MlasStoreFloat16x8(C + 8, accu01);
+            MlasStoreFloat16x8(C + ldc, accu10);
+            MlasStoreFloat16x8(C + ldc + 8, accu11);
+        }
+    }
+
+    if (CountN & 8) {  // 2 rows x 8 columns
+        const auto* a = A;
+        size_t k = CountK;
+        float16x8_t accu00 = MlasZeroFloat16x8();
+        float16x8_t accu10 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            float16x8_t a1 = MlasLoadFloat16x8(a + lda);
+            accu00 = maq_laneq_f16_accu(accu00, b0, b1, b2, b3, b4, b5, b6, b7, a0);
+            accu10 = maq_laneq_f16_accu(accu10, b0, b1, b2, b3, b4, b5, b6, b7, a1);
+        }
+
+        if (k & 4) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            float16x4_t a1 = MlasLoadFloat16x4(a + lda);
+            accu00 = maq_lane_f16_accu(accu00, b0, b1, b2, b3, a0);
+            accu10 = maq_lane_f16_accu(accu10, b0, b1, b2, b3, a1);
+            k -= 4, a += 4, PackedB += 4 * 8;
+        }
+
+        if (k > 0) {
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k);
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            accu00 = vfmaq_lane_f16(accu00, b0, a0, 0);
+            accu10 = vfmaq_lane_f16(accu10, b0, a1, 0);
+            if (k > 1) {
+                float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+                accu00 = vfmaq_lane_f16(accu00, b1, a0, 1);
+                accu10 = vfmaq_lane_f16(accu10, b1, a1, 1);
+            }
+            if (k > 2) {
+                float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+                accu00 = vfmaq_lane_f16(accu00, b2, a0, 2);
+                accu10 = vfmaq_lane_f16(accu10, b2, a1, 2);
+            }
+            PackedB += k * 8;
+        }
+
+        if constexpr (beta_behavior == 1) {
+            float16x8_t c0 = MlasLoadFloat16x8(C);
+            float16x8_t c1 = MlasLoadFloat16x8(C + ldc);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu00 = vfmaq_f16(c0, accu00, alpha_v);
+            accu10 = vfmaq_f16(c1, accu10, alpha_v);
+            MlasStoreFloat16x8(C, accu00);
+            MlasStoreFloat16x8(C + ldc, accu10);
+        } else if constexpr (beta_behavior == 2) {
+            float16x8_t c0 = MlasLoadFloat16x8(C);
+            float16x8_t c1 = MlasLoadFloat16x8(C + ldc);
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            float16x8_t beta_v = MlasBroadcastFloat16x8(beta);
+            accu00 = vfmaq_f16(vmulq_f16(c0, beta_v), accu00, alpha_v);
+            accu10 = vfmaq_f16(vmulq_f16(c1, beta_v), accu10, alpha_v);
+            MlasStoreFloat16x8(C, accu00);
+            MlasStoreFloat16x8(C + ldc, accu10);
+        } else {
+            float16x8_t alpha_v = MlasBroadcastFloat16x8(alpha);
+            accu00 = vmulq_f16(accu00, alpha_v);
+            accu10 = vmulq_f16(accu10, alpha_v);
+            MlasStoreFloat16x8(C, accu00);
+            MlasStoreFloat16x8(C + ldc, accu10);
+        }
+
+        CountN -= 8, C += 8;
+    }
+
+    if (CountN > 0) {  // 1-7 trailing columns; B is still loaded 8 lanes per k-step
+        const auto* a = A;
+        size_t k = CountK;
+        float16x8_t accu0 = MlasZeroFloat16x8();
+        float16x8_t accu1 = MlasZeroFloat16x8();
+        for (; k >= 8; k -= 8, a += 8, PackedB += 8 * 8) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x8_t b4 = MlasLoadFloat16x8(PackedB + 32);
+            float16x8_t b5 = MlasLoadFloat16x8(PackedB + 40);
+            float16x8_t b6 = MlasLoadFloat16x8(PackedB + 48);
+            float16x8_t b7 = MlasLoadFloat16x8(PackedB + 56);
+            float16x8_t a0 = MlasLoadFloat16x8(a);
+            float16x8_t a1 = MlasLoadFloat16x8(a + lda);
+            accu0 = maq_laneq_f16_accu(accu0, b0, b1, b2, b3, b4, b5, b6, b7, a0);
+            accu1 = maq_laneq_f16_accu(accu1, b0, b1, b2, b3, b4, b5, b6, b7, a1);
+        }
+
+        if (k & 4) {
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+            float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+            float16x8_t b3 = MlasLoadFloat16x8(PackedB + 24);
+            float16x4_t a0 = MlasLoadFloat16x4(a);
+            float16x4_t a1 = MlasLoadFloat16x4(a + lda);
+            accu0 = maq_lane_f16_accu(accu0, b0, b1, b2, b3, a0);
+            accu1 = maq_lane_f16_accu(accu1, b0, b1, b2, b3, a1);
+            k -= 4, a += 4, PackedB += 4 * 8;
+        }
+
+        if (k > 0) {
+            float16x4_t a0 = MlasLoadPartialFloat16x4(a, k);
+            float16x4_t a1 = MlasLoadPartialFloat16x4(a + lda, k);
+            float16x8_t b0 = MlasLoadFloat16x8(PackedB);
+            accu0 = vfmaq_lane_f16(accu0, b0, a0, 0);
+            accu1 = vfmaq_lane_f16(accu1, b0, a1, 0);
+            if (k > 1) {
+                float16x8_t b1 = MlasLoadFloat16x8(PackedB + 8);
+                accu0 = vfmaq_lane_f16(accu0, b1, a0, 1);
+                accu1 = vfmaq_lane_f16(accu1, b1, a1, 1);
+            }
+            if (k > 2) {
+                float16x8_t b2 = MlasLoadFloat16x8(PackedB + 16);
+                accu0 = vfmaq_lane_f16(accu0, b2, a0, 2);
+                accu1 = vfmaq_lane_f16(accu1, b2, a1, 2);
+            }
+            PackedB += k * 8;
+        }
+
+        float16x4_t accu0_low = vget_low_f16(accu0);
+        float16x4_t accu0_high = vget_high_f16(accu0);
+        float16x4_t accu1_low = vget_low_f16(accu1);
+        float16x4_t accu1_high = vget_high_f16(accu1);
+
+        if (CountN & 4) {  // store the next 4 columns from the low halves
+            if constexpr (beta_behavior == 1) {
+                float16x4_t c0 = MlasLoadFloat16x4(C);
+                float16x4_t c1 = MlasLoadFloat16x4(C + ldc);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStoreFloat16x4(C, vfma_f16(c0, accu0_low, alpha_v));
+                MlasStoreFloat16x4(C + ldc, vfma_f16(c1, accu1_low, alpha_v));
+            } else if constexpr (beta_behavior == 2) {
+                float16x4_t c0 = MlasLoadFloat16x4(C);
+                float16x4_t c1 = MlasLoadFloat16x4(C + ldc);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+                MlasStoreFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu0_low, alpha_v));
+                MlasStoreFloat16x4(C + ldc, vfma_f16(vmul_f16(c1, beta_v), accu1_low, alpha_v));
+            } else {
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStoreFloat16x4(C, vmul_f16(accu0_low, alpha_v));
+                MlasStoreFloat16x4(C + ldc, vmul_f16(accu1_low, alpha_v));
+            }
+            CountN -= 4, C += 4;
+            accu0_low = accu0_high;  // any columns still left come from the high halves
+            accu1_low = accu1_high;
+        }
+
+        if (CountN) {  // final 1-3 columns via partial load/store
+            if constexpr (beta_behavior == 1) {
+                float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN);
+                float16x4_t c1 = MlasLoadPartialFloat16x4(C + ldc, CountN);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStorePartialFloat16x4(C, vfma_f16(c0, accu0_low, alpha_v), CountN);
+                MlasStorePartialFloat16x4(C + ldc, vfma_f16(c1, accu1_low, alpha_v), CountN);
+            } else if constexpr (beta_behavior == 2) {
+                float16x4_t c0 = MlasLoadPartialFloat16x4(C, CountN);
+                float16x4_t c1 = MlasLoadPartialFloat16x4(C + ldc, CountN);
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                float16x4_t beta_v = MlasBroadcastFloat16x4(beta);
+                MlasStorePartialFloat16x4(C, vfma_f16(vmul_f16(c0, beta_v), accu0_low, alpha_v), CountN);
+                MlasStorePartialFloat16x4(C + ldc, vfma_f16(vmul_f16(c1, beta_v), accu1_low, alpha_v), CountN);
+            } else {
+                float16x4_t alpha_v = MlasBroadcastFloat16x4(alpha);
+                MlasStorePartialFloat16x4(C, vmul_f16(accu0_low, alpha_v), CountN);
+                MlasStorePartialFloat16x4(C + ldc, vmul_f16(accu1_low, alpha_v), CountN);
+            }
+        }
+    }
+}
+
+void HGemm_TransposedPackedB_Kernel(  // routes to the 1-row or 2-row kernel specialised on the value of beta
+    const MLAS_FP16* A,
+    const MLAS_FP16* PackedB,
+    MLAS_FP16* C,
+    size_t CountM,  // rows to compute: 0 is a no-op, 1 or 2 are handled, anything larger throws
+    size_t CountN,
+    size_t CountK,
+    size_t lda,  // only forwarded to the 2-row kernel
+    size_t ldc,  // only forwarded to the 2-row kernel
+    _mlas_fp16_ alpha,
+    _mlas_fp16_ beta
+) {
+    if (CountM > 2) {
+        MLAS_THROW_EX(std::runtime_error, "HGemm_TransposedPackedB_Kernel only support <= 2 rows");
+    } else if (CountM == 0) {
+        return;  // no rows requested; falling through would run the 2-row kernel on A and C
+    }
+    const auto* A_data = reinterpret_cast<const _mlas_fp16_*>(A);
+    const auto* PackedB_data = reinterpret_cast<const _mlas_fp16_*>(PackedB);
+    auto* C_data = reinterpret_cast<_mlas_fp16_*>(C);
+    const auto f16_0 = MLAS_FP16(0.0f).val, f16_1 = MLAS_FP16(1.0f).val;  // the values beta is compared against
+    if (CountM == 1) {
+        if (beta == f16_0) {
+            HGemm_TransposedPackedB_Kernel_M1<0>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta);
+        } else if (beta == f16_1) {
+            HGemm_TransposedPackedB_Kernel_M1<1>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta);
+        } else {
+            HGemm_TransposedPackedB_Kernel_M1<2>(A_data, PackedB_data, C_data, CountN, CountK, alpha, beta);
+        }
+    } else {  // CountM == 2
+        if (beta == f16_0) {
+            HGemm_TransposedPackedB_Kernel_M2<0>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta);
+        } else if (beta == f16_1) {
+            HGemm_TransposedPackedB_Kernel_M2<1>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta);
+        } else {
+            HGemm_TransposedPackedB_Kernel_M2<2>(A_data, PackedB_data, C_data, CountN, CountK, lda, ldc, alpha, beta);
+        }
+    }
+}
+
+}  // namespace hgemm_neon
diff --git a/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp
new file mode 100644
index 0000000000..5b131a8e41
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/hgemm_kernel_neon.cpp
@@ -0,0 +1,28 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    hgemm_kernel_neon.cpp
+
+Abstract:
+
+    This module defines the half-precision (fp16) GEMM kernel dispatch table for ARM NEON.
+
+--*/
+
+#include "mlasi.h"
+#include "halfgemm.h"
+
+const MLAS_HGEMM_DISPATCH MlasHGemmDispatchNeon = []() {
+    MLAS_HGEMM_DISPATCH table;  // kernels are assigned only when the fp16 NEON guard below is satisfied
+#if defined(MLAS_TARGET_ARM64) && defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+    table.HGemmKernel_TransposedPackedB = hgemm_neon::HGemm_TransposedPackedB_Kernel;
+    table.HGemmKernel_TransposedB = hgemm_neon::HGemm_TransposedB_Kernel;
+    table.HPackBKernel_TransposedB = hgemm_neon::HPackB_TransposedB_Kernel;
+#endif
+    return table;
+}();
diff --git a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp
index 69e37d2b91..5b1f9d7d4a 100644
--- a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp
+++ b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp
@@ -93,39 +93,6 @@ Transpose8x8(uint8x8_t& v0, uint8x8_t& v1, uint8x8_t& v2, uint8x8_t& v3,
     v7 = vreinterpret_u8_u32(c3.val[1]);
 }
 
-MLAS_FORCEINLINE void
-Transpose4x8(float16x8_t& v0, float16x8_t& v1, float16x8_t& v2, float16x8_t& v3)
-{
-    // |v00|v01|v02|v03|v04|v05|v06|v07|
-    // |v10|v11|v12|v13|v14|v15|v16|v17|
-    // |v20|v21|v22|v23|v24|v25|v26|v27|
-    // |v30|v31|v32|v33|v34|v35|v36|v37|
-    //  =>
-    // |v00|v10|v20|v30|v04|v14|v24|v34|
-    // |v01|v11|v21|v31|v05|v15|v25|v35|
-    // |v02|v12|v22|v32|v06|v16|v26|v36|
-    // |v03|v13|v23|v33|v07|v17|v27|v37|
-    float16x8x2_t t01 = vtrnq_f16(v0, v1);
-    float16x8x2_t t23 = vtrnq_f16(v2, v3);
-
-    v0 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0])));
-    v1 = vreinterpretq_f16_f32(vtrn1q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1])));
-    v2 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[0]), vreinterpretq_f32_f16(t23.val[0])));
-    v3 = vreinterpretq_f16_f32(vtrn2q_f32(vreinterpretq_f32_f16(t01.val[1]), vreinterpretq_f32_f16(t23.val[1])));
-}
-
-MLAS_FORCEINLINE void
-Transpose4x4(float16x4_t& v0, float16x4_t& v1, float16x4_t& v2, float16x4_t& v3)
-{
-    float16x4x2_t t01 = vtrn_f16(v0, v1);
-    float16x4x2_t t23 = vtrn_f16(v2, v3);
-
-    v0 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0])));
-    v1 = vreinterpret_f16_f32(vtrn1_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1])));
-    v2 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[0]), vreinterpret_f32_f16(t23.val[0])));
-    v3 = vreinterpret_f16_f32(vtrn2_f32(vreinterpret_f32_f16(t01.val[1]), vreinterpret_f32_f16(t23.val[1])));
-}
-
 void
 HQ4BitGemmPackQuantBData_CompFp16(
     size_t N,
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 100d7d4775..56fad6bb34 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -301,6 +301,8 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE);
 // Define the default strides to step through slices of the input matrices.
 //
 
+#define MLAS_HGEMM_STRIDEN                          32
+#define MLAS_HGEMM_STRIDEK                          512
 #define MLAS_SGEMM_STRIDEN                          128
 #define MLAS_SGEMM_STRIDEK                          128
 #define MLAS_SGEMM_PACKED_STRIDEN                   128
@@ -317,6 +319,7 @@ static_assert(sizeof(MLAS_FP16) == FP16_SIZE);
 // the effort at this time.
 //
 
+#define MLAS_HGEMM_STRIDEN_THREAD_ALIGN             16
 #define MLAS_SGEMM_STRIDEN_THREAD_ALIGN             16
 #define MLAS_DGEMM_STRIDEN_THREAD_ALIGN             8
 #define MLAS_QGEMM_STRIDEN_THREAD_ALIGN             16
@@ -944,6 +947,7 @@ extern "C" {
 #define MLAS_SGEMM_THREAD_COMPLEXITY                (size_t(64) * size_t(1024))
 #define MLAS_DGEMM_THREAD_COMPLEXITY                (size_t(64) * size_t(1024))
 #define MLAS_QGEMM_THREAD_COMPLEXITY                65536
+#define MLAS_HGEMM_THREAD_COMPLEXITY                65536
 
 #if defined(__aarch64__) && defined(__linux__)
 #define MLAS_SBGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024))
@@ -1055,6 +1059,12 @@ extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni;
 struct MLAS_ROPE_DISPATCH;
 extern const MLAS_ROPE_DISPATCH MlasRopeDispatchNeon;
 
+//
+// half gemm dispatch structure
+//
+struct MLAS_HGEMM_DISPATCH;
+extern const MLAS_HGEMM_DISPATCH MlasHGemmDispatchNeon;
+
 
 //
 // Quantized depthwise convolution kernels.
@@ -1217,6 +1227,7 @@ struct MLAS_PLATFORM {
     MLAS_CAST_F32_TO_F16_KERNEL* CastF32ToF16Kernel;
 
     const MLAS_ROPE_DISPATCH* RopeDispatch{nullptr};
+    const MLAS_HGEMM_DISPATCH* HGemmDispatch{nullptr};
 };
 
 inline
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index ec572a4150..026a954bbc 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -544,6 +544,7 @@ Return Value:
     this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon;
     this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon;
     this->RopeDispatch = &MlasRopeDispatchNeon;
+    this->HGemmDispatch = &MlasHGemmDispatchNeon;
 
     //
     // Check if the processor supports ASIMD dot product instructions.
diff --git a/onnxruntime/test/mlas/bench/bench_hgemm.cpp b/onnxruntime/test/mlas/bench/bench_hgemm.cpp
new file mode 100644
index 0000000000..1e8b0eb7c3
--- /dev/null
+++ b/onnxruntime/test/mlas/bench/bench_hgemm.cpp
@@ -0,0 +1,86 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "mlas.h"
+#include "bench_util.h"
+#include "core/util/thread_utils.h"
+
+#include <stdexcept>
+#include <numeric>
+
+static const std::vector<std::string> hgemm_bench_arg_names = {"M", "N", "K"};  // labels for benchmark arguments 0, 1 and 2
+
+// Benchmark body for MlasGemm on MLAS_FP16 operands: problem sizes come from the
+// benchmark arguments and transA/transB choose the CBLAS transpose flags.
+void HGEMM(benchmark::State& state, bool transA, bool transB) {
+  if (state.range(0) <= 0) throw std::invalid_argument("M must greater than 0!");
+  if (state.range(1) <= 0) throw std::invalid_argument("N must greater than 0!");
+  if (state.range(2) <= 0) throw std::invalid_argument("K must greater than 0!");
+  const auto M = static_cast<size_t>(state.range(0));
+  const auto N = static_cast<size_t>(state.range(1));
+  const auto K = static_cast<size_t>(state.range(2));
+
+  // Operand buffers: M * K elements for A, N * K for B, M * N for C.
+  auto lhs = RandomVectorUniform(M * K, MLAS_FP16(-1.0f), MLAS_FP16(1.0f));
+  auto rhs = RandomVectorUniform(N * K, MLAS_FP16(-1.0f), MLAS_FP16(1.0f));
+  std::vector<MLAS_FP16> out(M * N);
+
+  // Transpose flags and leading dimensions derived from transA/transB.
+  const auto op_a = transA ? CblasTrans : CblasNoTrans;
+  const auto op_b = transB ? CblasTrans : CblasNoTrans;
+  const size_t lda = transA ? M : K;
+  const size_t ldb = transB ? K : N;
+
+  MLAS_FP16 alpha = MLAS_FP16(1.0f);
+  MLAS_FP16 beta = MLAS_FP16(0.0f);
+  OrtThreadPoolParams pool_params;
+  pool_params.thread_pool_size = 8;
+  pool_params.auto_set_affinity = true;
+  std::unique_ptr<onnxruntime::concurrency::ThreadPool> pool(
+      onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(),
+                                                 pool_params, onnxruntime::concurrency::ThreadPoolType::INTRA_OP));
+
+  // One GEMM call, shared by the untimed warm-up and the timed loop.
+  const auto run_gemm = [&]() {
+    MlasGemm(
+        op_a,
+        op_b,
+        M,
+        N,
+        K,
+        lhs.data(),
+        lda,
+        rhs.data(),
+        ldb,
+        out.data(),
+        N,  // ldc
+        alpha.val,
+        beta.val,
+        pool.get());
+  };
+
+  run_gemm();  // warm-up call, made before the measured loop starts
+  for (auto _ : state) {
+    run_gemm();
+  }
+}
+
+static void GemmSizeWithOne(benchmark::internal::Benchmark* b) {
+  // Three sweeps; each pins one of M, N, K to 1 and varies the other two.
+  b->ArgNames(hgemm_bench_arg_names)->ArgsProduct({{1}, {63, 255, 1023}, {63, 255, 1023}})
+      ->ArgsProduct({{63, 255, 1023}, {1}, {63, 255, 1023}})
+      ->ArgsProduct({{63, 255, 1023}, {63, 255, 1023}, {1}});
+}
+BENCHMARK_CAPTURE(HGEMM, GEMV_TransB, false, true)->Apply(GemmSizeWithOne)->UseRealTime();  // transA = false, transB = true
+
+static void GemmSizeProducts(benchmark::internal::Benchmark* b) {
+  b->ArgNames(hgemm_bench_arg_names)  // every combination of the three sizes on M, N and K
+      ->ArgsProduct({{63, 255, 1023}, {63, 255, 1023}, {63, 255, 1023}});
+}
+BENCHMARK_CAPTURE(HGEMM, NORMAL_TransB, false, true)->Apply(GemmSizeProducts)->UseRealTime();  // transA = false, transB = true
+
+static void GemmLLMSizeProducts(benchmark::internal::Benchmark* b) {
+  b->ArgNames(hgemm_bench_arg_names)  // M from {1, 1024, 2048}; N and K from {4096, 11008}
+      ->ArgsProduct({{1, 1024, 2048}, {4096, 11008}, {4096, 11008}});
+}
+BENCHMARK_CAPTURE(HGEMM, LLM, false, true)->Apply(GemmLLMSizeProducts)->UseRealTime();  // transA = false, transB = true
diff --git a/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp
new file mode 100644
index 0000000000..4f3d690b43
--- /dev/null
+++ b/onnxruntime/test/mlas/unittest/test_hgemm_neon.cpp
@@ -0,0 +1,393 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    test_hgemm_neon.cpp
+
+Abstract:
+
+    Tests for MLAS fp16 GEMM on ARM CPU.
+
+--*/
+
+#include <vector>
+#include <random>
+
+#include "test/mlas/unittest/test_util.h"
+#include "core/mlas/lib/mlasi.h"
+#include "core/mlas/lib/halfgemm.h"
+
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+
+class MlasNeonHGemmPackBTest : public MlasTestBase {  // checks HPackB_TransposedB_Kernel against a scalar packer
+ private:
+  std::random_device rd_;
+  unsigned int seed_;  // drawn from rd_ in the constructor; printed when a comparison fails
+  std::mt19937 gen_;  // mersenne_twister_engine seeded with seed_
+  std::uniform_real_distribution<float> distrib_;
+  MatrixGuardBuffer<MLAS_FP16> input_, ref_, packed_;
+
+  template <size_t N, size_t K>
+  MLAS_FORCEINLINE void PackB(const MLAS_FP16* src, MLAS_FP16* dst) {  // reference packer; src is read as [N x K]
+    size_t i = 0;
+    for (; i + 16 <= N; i += 16) {  // blocks of 16 src rows: for each j, emit src[i..i+15][j] contiguously
+      for (size_t j = 0; j < K; ++j) {
+        for (size_t k = 0; k < 16; ++k) {
+          *dst = src[(i + k) * K + j];
+          ++dst;
+        }
+      }
+    }
+    if (i + 8 <= N) {  // then at most one block of 8 rows, same layout
+      for (size_t j = 0; j < K; ++j) {
+        for (size_t k = 0; k < 8; ++k) {
+          *dst = src[(i + k) * K + j];
+          ++dst;
+        }
+      }
+      i += 8;
+    }
+    if (i < N) {  // remaining N - i (< 8) rows
+      for (size_t j = 0; j < K; ++j) {
+        for (size_t k = 0; k < N - i; ++k) {
+          *dst = src[(i + k) * K + j];
+          ++dst;
+        }
+        dst += 8 - (N - i);  // skip the unused slots so every j still occupies 8 entries
+      }
+    }
+  }
+
+  template <size_t N, size_t K>
+  MLAS_FORCEINLINE void Check(const MLAS_FP16* packed, const MLAS_FP16* ref) {  // bitwise compare, padding included
+    size_t n = ((N + 7) & ~7) * K;  // N rounded up to a multiple of 8, times K
+    for (size_t i = 0; i < n; ++i) {
+      ASSERT_EQ(packed[i].val, ref[i].val) << " seed " << seed_ << " i " << i;
+    }
+  }
+
+  template <size_t N, size_t K>
+  void TestPackB() {
+    auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) {
+      for (size_t i = 0; i < count; i++) {
+        buffer[i] = MLAS_FP16(distrib_(gen_));
+      }
+    };
+
+    const auto* input = input_.GetFilledBuffer(N * K, InitializeBuffer);
+    auto* packed = packed_.GetBuffer(K * ((N + 7) & ~7), true);
+    auto* ref = ref_.GetBuffer(K * ((N + 7) & ~7), true);  // NOTE(review): `true` presumably zero-fills — confirm
+    hgemm_neon::HPackB_TransposedB_Kernel(input, packed, N, K, K);
+    PackB<N, K>(input, ref);
+    Check<N, K>(packed, ref);
+  }
+
+ public:
+  MlasNeonHGemmPackBTest()
+      : seed_(rd_()), gen_(seed_), distrib_(-100.f, 100.f) {
+  }
+
+  static const char* GetTestSuiteName() {
+    return "NeonHGemmPackB";
+  }
+
+  void ExecuteShort(void) override {
+    TestPackB<1, 1>();
+    TestPackB<1, 15>();
+    TestPackB<1, 31>();
+    TestPackB<8, 1>();
+    TestPackB<8, 16>();
+    TestPackB<9, 31>();
+    TestPackB<9, 33>();
+    TestPackB<15, 33>();
+    TestPackB<17, 67>();
+    TestPackB<17, 96>();
+    TestPackB<265, 263>();
+  }
+};
+
+class MlasNeonHGemmTransposedBTest : public MlasTestBase {  // checks HGemm_TransposedB_Kernel against a float loop
+ private:
+  std::random_device rd_;  // NOTE(review): unused in this class; the constructor sets a fixed seed
+  unsigned int seed_;
+  std::mt19937 gen_;  // mersenne_twister_engine seeded with the fixed seed_
+  std::uniform_real_distribution<float> distrib_;
+  MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_;
+
+  template <size_t M, size_t K, size_t N>  // reference: C = alpha * A * B^T + beta * C; A is [M x K], B is [N x K]
+  MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) {
+    float alphaf = alpha.ToFloat();
+    float betaf = beta.ToFloat();
+    for (size_t m = 0; m < M; ++m) {
+      for (size_t n = 0; n < N; ++n) {
+        float accu = 0.0f;  // accumulate in float; only the final value is converted back to fp16
+        for (size_t k = 0; k < K; ++k) {
+          accu += (A[m * K + k].ToFloat()) * (B[n * K + k].ToFloat());
+        }
+        C[m * N + n] = MLAS_FP16(accu * alphaf + C[m * N + n].ToFloat() * betaf);
+      }
+    }
+  }
+
+  MLAS_FORCEINLINE
+  bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) {  // tolerance is relative to v1
+    float f0 = v0.ToFloat(), f1 = v1.ToFloat();
+    return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol;
+  }
+
+  template <size_t M, size_t K, size_t N>
+  MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) {  // element-wise compare of M * N outputs
+    size_t n = M * N;
+    for (size_t i = 0; i < n; ++i) {
+      ASSERT_TRUE(FloatEqual(C[i], ref[i], 0.02f, 0.055f))
+          << " seed " << seed_ << " i " << i
+          << " M " << M << " N " << N << " K " << K
+          << " v0 " << C[i] << " v1 " << ref[i];
+    }
+  }
+
+  template <size_t M, size_t K, size_t N>
+  void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) {  // runs the kernel and the reference on the same random A and B
+    auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) {
+      for (size_t i = 0; i < count; i++) {
+        buffer[i] = MLAS_FP16(distrib_(gen_));
+      }
+    };
+
+    const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer);
+    const auto* B = B_.GetFilledBuffer(K * N, InitializeBuffer);
+    auto* C = C_.GetBuffer(M * N, true);
+    auto* ref = ref_.GetBuffer(M * N, true);
+    hgemm_neon::HGemm_TransposedB_Kernel(A, B, C, M, N, K, K, K, N, alpha.val, beta.val);
+    HGemm<M, K, N>(A, B, ref, alpha, beta);
+    Check<M, K, N>(C, ref);
+  }
+
+ public:
+  MlasNeonHGemmTransposedBTest()
+      : seed_(1928375), gen_(seed_), distrib_(-1.f, 1.f) {
+  }
+
+  static const char* GetTestSuiteName() {
+    return "NeonHGemmTransposedB";
+  }
+
+  void ExecuteShort(void) override {
+    TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<1, 1, 1>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<2, 1, 1>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 15, 17>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<2, 17, 15>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<1, 17, 15>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 33, 31>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<2, 31, 32>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<1, 32, 33>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 78, 263>(MLAS_FP16(0.5f), MLAS_FP16(0.0f));
+    TestHGemm<2, 267, 79>(MLAS_FP16(1.5f), MLAS_FP16(1.0f));
+  }
+};
+
+class MlasNeonHGemmTransposedPackedBTest : public MlasTestBase {  // checks HGemm_TransposedPackedB_Kernel
+ private:
+  std::random_device rd_;  // NOTE(review): unused in this class; the constructor sets a fixed seed
+  unsigned int seed_;
+  std::mt19937 gen_;  // mersenne_twister_engine seeded with the fixed seed_
+  std::uniform_real_distribution<float> distrib_;
+  MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_;
+
+  template <size_t M, size_t K, size_t N>  // reference GEMM that reads B in the packed 16 / 8 / padded-tail layout
+  MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) {
+    float alphaf = alpha.ToFloat();
+    float betaf = beta.ToFloat();
+    size_t n = 0;
+    for (; n + 16 <= N; n += 16) {  // 16-column blocks: element (k, i) of the block sits at B[n * K + k * 16 + i]
+      for (size_t i = 0; i < 16; ++i) {
+        for (size_t m = 0; m < M; ++m) {
+          float accu = 0.0f;
+          for (size_t k = 0; k < K; ++k) {
+            accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 16 + i].ToFloat());
+          }
+          C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf);
+        }
+      }
+    }
+    if (n + 8 <= N) {  // at most one 8-column block, stride 8 per k
+      for (size_t i = 0; i < 8; ++i) {
+        for (size_t m = 0; m < M; ++m) {
+          float accu = 0.0f;
+          for (size_t k = 0; k < K; ++k) {
+            accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 8 + i].ToFloat());
+          }
+          C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf);
+        }
+      }
+      n += 8;
+    }
+    if (n < N) {  // tail of N - n (< 8) columns; still read with stride 8 per k
+      for (size_t i = 0; i < N - n; ++i) {
+        for (size_t m = 0; m < M; ++m) {
+          float accu = 0.0f;
+          for (size_t k = 0; k < K; ++k) {
+            accu += (A[m * K + k].ToFloat()) * (B[n * K + k * 8 + i].ToFloat());
+          }
+          C[m * N + n + i] = MLAS_FP16(accu * alphaf + C[m * N + n + i].ToFloat() * betaf);
+        }
+      }
+    }
+  }
+
+  MLAS_FORCEINLINE
+  bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) {  // tolerance is relative to v1
+    float f0 = v0.ToFloat(), f1 = v1.ToFloat();
+    return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol;
+  }
+
+  template <size_t M, size_t K, size_t N>
+  MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) {  // element-wise compare of M * N outputs
+    size_t n = M * N;
+    for (size_t i = 0; i < n; ++i) {
+      ASSERT_TRUE(FloatEqual(C[i], ref[i], 0.02f, 0.055f))
+          << " seed " << seed_ << " i " << i
+          << " M " << M << " K " << K << " N " << N
+          << " v0 " << C[i] << " v1 " << ref[i];
+    }
+  }
+
+  template <size_t M, size_t K, size_t N>
+  void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) {  // B is generated directly in packed form (no packing step here)
+    auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) {
+      for (size_t i = 0; i < count; i++) {
+        buffer[i] = MLAS_FP16(distrib_(gen_));
+      }
+    };
+
+    const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer);
+    const auto* B = B_.GetFilledBuffer(K * ((N + 7) & ~7), InitializeBuffer);  // N rounded up to a multiple of 8
+    auto* C = C_.GetBuffer(M * N, true);
+    auto* ref = ref_.GetBuffer(M * N, true);
+    hgemm_neon::HGemm_TransposedPackedB_Kernel(A, B, C, M, N, K, K, N, alpha.val, beta.val);
+    HGemm<M, K, N>(A, B, ref, alpha, beta);
+    Check<M, K, N>(C, ref);
+  }
+
+ public:
+  MlasNeonHGemmTransposedPackedBTest()
+      : seed_(1928372), gen_(seed_), distrib_(-1.f, 1.f) {
+  }
+
+  static const char* GetTestSuiteName() {
+    return "NeonHGemmTransposedPackedB";
+  }
+
+  void ExecuteShort(void) override {
+    TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<1, 1, 1>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<2, 1, 1>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 15, 17>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<2, 17, 15>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<1, 17, 15>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 33, 31>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<2, 31, 32>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<1, 32, 33>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 78, 263>(MLAS_FP16(0.5f), MLAS_FP16(0.0f));
+    TestHGemm<2, 267, 79>(MLAS_FP16(1.5f), MLAS_FP16(1.0f));
+  }
+};
+
+class MlasNeonHGemmTest : public MlasTestBase {  // checks the MlasGemm fp16 entry point against a float loop
+ private:
+  std::random_device rd_;  // NOTE(review): unused in this class; the constructor sets a fixed seed
+  unsigned int seed_;
+  std::mt19937 gen_;  // mersenne_twister_engine seeded with the fixed seed_
+  std::uniform_real_distribution<float> distrib_;
+  MatrixGuardBuffer<MLAS_FP16> A_, B_, ref_, C_;
+
+  template <size_t M, size_t K, size_t N>  // reference: C = alpha * A * B^T + beta * C; A is [M x K], B is [N x K]
+  MLAS_FORCEINLINE void HGemm(const MLAS_FP16* A, const MLAS_FP16* B, MLAS_FP16* C, MLAS_FP16 alpha, MLAS_FP16 beta) {
+    float alphaf = alpha.ToFloat();
+    float betaf = beta.ToFloat();
+    for (size_t i = 0; i < M; ++i) {
+      for (size_t j = 0; j < N; ++j) {
+        float accu = 0.0f;  // accumulate in float; only the final value is converted back to fp16
+        for (size_t k = 0; k < K; ++k) {
+          accu += (A[i * K + k].ToFloat()) * (B[j * K + k].ToFloat());
+        }
+        C[i * N + j] = MLAS_FP16(accu * alphaf + C[i * N + j].ToFloat() * betaf);
+      }
+    }
+  }
+
+  MLAS_FORCEINLINE
+  bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) {  // tolerance is relative to v1
+    float f0 = v0.ToFloat(), f1 = v1.ToFloat();
+    return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol;
+  }
+
+  template <size_t M, size_t K, size_t N>
+  MLAS_FORCEINLINE void Check(const MLAS_FP16* C, const MLAS_FP16* ref) {  // reports row i and column j on mismatch
+    for (size_t i = 0; i < M; ++i) {
+      for (size_t j = 0; j < N; ++j) {
+        ASSERT_TRUE(FloatEqual(C[i * N + j], ref[i * N + j], 0.02f, 0.055f))
+            << " seed " << seed_ << " i " << i << " j " << j
+            << " M " << M << " K " << K << " N " << N
+            << " v0 " << C[i * N + j] << " v1 " << ref[i * N + j];
+      }
+    }
+  }
+
+  template <size_t M, size_t K, size_t N>
+  void TestHGemm(MLAS_FP16 alpha, MLAS_FP16 beta) {  // calls MlasGemm(NoTrans, Trans) with a null thread pool
+    auto InitializeBuffer = [this](MLAS_FP16* buffer, size_t count) {
+      for (size_t i = 0; i < count; i++) {
+        buffer[i] = MLAS_FP16(distrib_(gen_));
+      }
+    };
+
+    const auto* A = A_.GetFilledBuffer(M * K, InitializeBuffer);
+    const auto* B = B_.GetFilledBuffer(K * N, InitializeBuffer);
+    auto* C = C_.GetBuffer(M * N, true);
+    auto* ref = ref_.GetBuffer(M * N, true);
+    MlasGemm(CblasNoTrans, CblasTrans, M, N, K, A, K, B, K, C, N, alpha.val, beta.val, nullptr);
+    HGemm<M, K, N>(A, B, ref, alpha, beta);
+    Check<M, K, N>(C, ref);
+  }
+
+ public:
+  MlasNeonHGemmTest()
+      : seed_(192837), gen_(seed_), distrib_(-0.25f, 0.25f) {
+  }
+
+  static const char* GetTestSuiteName() {
+    return "NeonHGemm";
+  }
+
+  void ExecuteShort(void) override {
+    TestHGemm<2, 1, 1>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<1, 128, 512>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<2, 128, 513>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 128, 511>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<2, 129, 512>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<1, 127, 512>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<1, 513, 1023>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+    TestHGemm<2, 511, 1025>(MLAS_FP16(1.5f), MLAS_FP16(0.5f));
+    TestHGemm<127, 513, 1023>(MLAS_FP16(1.0f), MLAS_FP16(0.0f));
+    TestHGemm<129, 511, 1025>(MLAS_FP16(0.5f), MLAS_FP16(1.0f));
+  }
+};
+
+static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
+  size_t count = 0;  // sum of the values returned by the RegisterShortExecute() calls below
+  if (is_short_execute) {  // these suites only provide short-execute variants; otherwise nothing is registered
+    count += MlasDirectShortExecuteTests<MlasNeonHGemmPackBTest>::RegisterShortExecute();
+    count += MlasDirectShortExecuteTests<MlasNeonHGemmTransposedBTest>::RegisterShortExecute();
+    count += MlasDirectShortExecuteTests<MlasNeonHGemmTransposedPackedBTest>::RegisterShortExecute();
+    count += MlasDirectShortExecuteTests<MlasNeonHGemmTest>::RegisterShortExecute();
+  }
+  return count;
+});
+
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

From 1fc9c4823d7c2e8f0d07a09315a0755dd7c58ef8 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Fri, 24 Jan 2025 18:18:37 -0800
Subject: [PATCH 19/37] Enable coremltools for Linux build (#23481)

### Description

Enable coremltools for Linux build. In order to do this, I did:

1. Add uuid-devel to the Linux images and regenerate them.
2. Patch the coremltools code a little bit to add some missing header
files.

### Motivation and Context
To make the code simpler. Later on I will create another PR to remove
the COREML_ENABLE_MLPROGRAM C/C++ macro.
Also, after this PR I will bring more changes to
onnxruntime_providers_coreml.cmake to make it work with vcpkg.
---
 cmake/onnxruntime_providers_coreml.cmake      | 83 +++++++++----------
 .../coremltools/crossplatformbuild.patch      | 81 ++++++++++++------
 .../azure-pipelines/bigmodels-ci-pipeline.yml |  2 +-
 .../azure-pipelines/linux-gpu-ci-pipeline.yml |  4 +-
 .../linux-gpu-tensorrt-ci-pipeline.yml        |  4 +-
 ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml |  4 +-
 .../py-cuda-alt-package-test-pipeline.yml     |  2 +-
 .../py-cuda-package-test-pipeline.yml         |  2 +-
 .../stages/java-cuda-packaging-stage.yml      |  4 +-
 .../jobs/py-linux-cuda-package-test-job.yml   |  4 +-
 .../stages/py-gpu-packaging-stage.yml         |  4 +-
 .../linux/docker/Dockerfile.manylinux2_28_cpu |  2 +-
 .../inference/aarch64/default/cpu/Dockerfile  |  2 +-
 .../inference/aarch64/python/cpu/Dockerfile   |  2 +-
 .../inference/x86_64/default/cpu/Dockerfile   |  2 +-
 .../x86_64/default/cuda11/Dockerfile          |  2 +-
 .../x86_64/default/cuda12/Dockerfile          |  2 +-
 .../inference/x86_64/python/cpu/Dockerfile    |  2 +-
 18 files changed, 115 insertions(+), 93 deletions(-)

diff --git a/cmake/onnxruntime_providers_coreml.cmake b/cmake/onnxruntime_providers_coreml.cmake
index ec7bc7a989..18048c8cdc 100644
--- a/cmake/onnxruntime_providers_coreml.cmake
+++ b/cmake/onnxruntime_providers_coreml.cmake
@@ -8,25 +8,18 @@ endif()
 add_compile_definitions(USE_COREML=1)
 
 # Check if we can build the coremltools code for creating an mlpackage with an mlprogram.
-# The coremltools source requires std::filesystem::path which is only available from iOS 13 on.
-set(_enable_ML_PROGRAM ON)
-if (IOS AND CMAKE_OSX_DEPLOYMENT_TARGET VERSION_LESS 13.0)
-  message(WARNING "CoreML ML Program is not supported on iOS < 13.0. Excluding ML Program support from build.")
-  set(_enable_ML_PROGRAM OFF)
-elseif(LINUX)
-  # uuid-dev is required. we don't bother installing on CIs as it's really for manual developer testing.
+if(LINUX)  # uuid/uuid.h is mandatory on Linux: ML Program support is always compiled in, so fail at configure time.
   find_library(LibUUID_LIBRARY NAMES uuid)
   find_path(LibUUID_INCLUDE_DIR NAMES uuid/uuid.h)
   if (NOT LibUUID_INCLUDE_DIR)
-    message(STATUS "uuid/uuid.h was not found as is required for ML Program support. "
+    message(FATAL_ERROR "uuid/uuid.h was not found as is required for ML Program support. "
                     "Run `sudo apt install uuid-dev` if you need to test ML Program related CoreML EP code. ")
-    set(_enable_ML_PROGRAM OFF)
   endif()
 endif()
 
-if (_enable_ML_PROGRAM)
-  add_compile_definitions(COREML_ENABLE_MLPROGRAM=1)
-endif()
+
+add_compile_definitions(COREML_ENABLE_MLPROGRAM=1)
+
 
 # Compile CoreML proto definition to ${CMAKE_CURRENT_BINARY_DIR}/coreml_proto
 set(COREML_PROTO_ROOT ${coremltools_SOURCE_DIR}/mlmodel/format)
@@ -93,10 +86,10 @@ file(GLOB_RECURSE
   "${ONNXRUNTIME_ROOT}/core/providers/coreml/builders/*.cc"
 )
 
-if(_enable_ML_PROGRAM)
+
   # Add helpers to create mlpackage weights. limit to just the files we need to minimize the changes to make them
   # build on Windows and Linux.
-  file(GLOB
+file(GLOB
     onnxruntime_providers_coreml_milblob_cc_srcs CONFIGURE_DEPENDS
     "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.hpp"
     "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/*.cpp"
@@ -105,22 +98,22 @@ if(_enable_ML_PROGRAM)
     "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageFormat.hpp"
     "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/FileWriter.?pp"
     "${coremltools_SOURCE_DIR}/mlmodel/src/MILBlob/Blob/StorageWriter.?pp"
-  )
+)
 
-  # Add helpers to create mlpackage
-  file(GLOB
+# Add helpers to create mlpackage
+file(GLOB
     onnxruntime_providers_coreml_modelpackage_cc_srcs CONFIGURE_DEPENDS
     "${coremltools_SOURCE_DIR}/modelpackage/src/ModelPackage.?pp"
     "${coremltools_SOURCE_DIR}/modelpackage/src/utils/JsonMap.?pp"
-  )
+)
 
-  set(coremltools_srcs
+set(coremltools_srcs
     ${onnxruntime_providers_coreml_milblob_cc_srcs}
     ${onnxruntime_providers_coreml_modelpackage_cc_srcs}
-  )
+)
+
+source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs})
 
-  source_group(TREE ${coremltools_SOURCE_DIR} PREFIX coremltools FILES ${coremltools_srcs})
-endif()
 
 # Add CoreML objective c++ source code
 if (APPLE)
@@ -174,34 +167,34 @@ if (APPLE)
   target_compile_definitions(onnxruntime_providers_coreml PRIVATE __APPLE__)
 endif()
 
-if (_enable_ML_PROGRAM)
-  # Setup coremltools fp16 and json dependencies for creating an mlpackage.
-  #
-  # fp16 depends on psimd
-  FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
-  onnxruntime_fetchcontent_makeavailable(psimd)
-  set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR})
-  FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16})
-  set(FP16_BUILD_TESTS OFF CACHE INTERNAL "")
-  set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
-  onnxruntime_fetchcontent_makeavailable(fp16)
 
-  # need to tweak the include paths to match what the coreml source code expects
-  target_include_directories(onnxruntime_providers_coreml PRIVATE
-                            ${fp16_SOURCE_DIR}/include
-                            ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann
-                            ${coremltools_SOURCE_DIR}
-                            ${coremltools_SOURCE_DIR}/mlmodel/src/
-                            ${coremltools_SOURCE_DIR}/modelpackage/src/
-  )
+# Setup coremltools fp16 and json dependencies for creating an mlpackage.
+#
+# fp16 depends on psimd
+FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
+onnxruntime_fetchcontent_makeavailable(psimd)
+set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR})
+FetchContent_Declare(fp16 URL ${DEP_URL_fp16} URL_HASH SHA1=${DEP_SHA1_fp16})
+set(FP16_BUILD_TESTS OFF CACHE INTERNAL "")
+set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
+onnxruntime_fetchcontent_makeavailable(fp16)
 
-  add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16)
+# need to tweak the include paths to match what the coreml source code expects
+target_include_directories(onnxruntime_providers_coreml PRIVATE
+                          ${fp16_SOURCE_DIR}/include
+                          ${nlohmann_json_SOURCE_DIR}/single_include/nlohmann
+                          ${coremltools_SOURCE_DIR}
+                          ${coremltools_SOURCE_DIR}/mlmodel/src/
+                          ${coremltools_SOURCE_DIR}/modelpackage/src/
+)
 
-  if (LINUX)
-    target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid)
-  endif()
+add_dependencies(onnxruntime_providers_coreml nlohmann_json::nlohmann_json fp16)
+
+if (LINUX)
+  target_link_libraries(onnxruntime_providers_coreml PRIVATE uuid)
 endif()
 
+
 if (APPLE)
   target_link_libraries(onnxruntime_providers_coreml PRIVATE "-framework Foundation" "-framework CoreML")
 endif()
diff --git a/cmake/patches/coremltools/crossplatformbuild.patch b/cmake/patches/coremltools/crossplatformbuild.patch
index 7f2268f50c..832191b366 100644
--- a/cmake/patches/coremltools/crossplatformbuild.patch
+++ b/cmake/patches/coremltools/crossplatformbuild.patch
@@ -3,7 +3,7 @@ index adc7bfcf..7b2bf9cc 100644
 --- a/mlmodel/src/MILBlob/Blob/FileWriter.cpp
 +++ b/mlmodel/src/MILBlob/Blob/FileWriter.cpp
 @@ -8,8 +8,12 @@
-
+ 
  #include <cstdio>
  #include <stdexcept>
 +
@@ -12,17 +12,31 @@ index adc7bfcf..7b2bf9cc 100644
  #include <sys/mman.h>
  #include <sys/stat.h>
 +#endif
-
+ 
  using namespace MILBlob;
  using namespace MILBlob::Blob;
+diff --git a/mlmodel/src/MILBlob/Blob/FileWriter.hpp b/mlmodel/src/MILBlob/Blob/FileWriter.hpp
+index 2bc99403..49239513 100644
+--- a/mlmodel/src/MILBlob/Blob/FileWriter.hpp
++++ b/mlmodel/src/MILBlob/Blob/FileWriter.hpp
+@@ -6,7 +6,8 @@
+ #pragma once
+ 
+ #include "MILBlob/Util/Span.hpp"
+-
++// ORT_EDIT: add missing header
++#include <cstdint>
+ #include <fstream>
+ #include <string>
+ #include <type_traits>
 diff --git a/mlmodel/src/MILBlob/Fp16.cpp b/mlmodel/src/MILBlob/Fp16.cpp
 index ae1e71a1..77a7161f 100644
 --- a/mlmodel/src/MILBlob/Fp16.cpp
 +++ b/mlmodel/src/MILBlob/Fp16.cpp
 @@ -5,6 +5,8 @@
-
+ 
  #include "MILBlob/Fp16.hpp"
-
+ 
 +// ORT_EDIT: Exclude clang specific pragmas from other builds
 +#if defined(__clang__)
  // fp16 lib code has some conversion warnings we don't want to globally ignore
@@ -35,11 +49,11 @@ index ae1e71a1..77a7161f 100644
 +#else
 +#include "fp16/fp16.h"
 +#endif
-
+ 
  using namespace MILBlob;
-
+ 
 diff --git a/modelpackage/src/ModelPackage.cpp b/modelpackage/src/ModelPackage.cpp
-index 8fee56b9..99e0d8d6 100644
+index 8fee56b9..5508e316 100644
 --- a/modelpackage/src/ModelPackage.cpp
 +++ b/modelpackage/src/ModelPackage.cpp
 @@ -26,7 +26,14 @@ namespace std {
@@ -55,22 +69,22 @@ index 8fee56b9..99e0d8d6 100644
  #include <uuid/uuid.h>
 +#endif
  #include <vector>
-
+ 
  #if defined(__cplusplus)
 @@ -187,7 +194,10 @@ public:
      ModelPackageItemInfo createFile(const std::string& name, const std::string& author, const std::string& description);
  };
-
+ 
 +// ORT_EDIT: pragma only available on APPLE platforms
 +#if defined(__APPLE__)
  #pragma mark ModelPackageImpl
 +#endif
-
+ 
  ModelPackageImpl::ModelPackageImpl(const std::filesystem::path& path, bool createIfNecessary, bool readOnly)
  : m_packagePath(path),
 @@ -372,6 +382,20 @@ std::filesystem::path ModelPackageImpl::getItemPath(const std::string& name, con
  }
-
+ 
  std::string ModelPackageImpl::generateIdentifier() const {
 +// ORT_EDIT: Use built-in UUID generation on Windows
 +#if defined(_WIN32)
@@ -87,20 +101,20 @@ index 8fee56b9..99e0d8d6 100644
 +    return uuidStrCpp;
 +#else
      uuid_t uuid;
-
+     
      // uuid_unparse generates a 36-character null-terminated string (37 bytes).
 @@ -383,6 +407,7 @@ std::string ModelPackageImpl::generateIdentifier() const {
      uuid_unparse(uuid, buf);
-
+         
      return std::string(buf);
 +#endif
  }
-
+ 
  ModelPackageItemInfo ModelPackageImpl::createFile(const std::string& name, const std::string& author, const std::string& description) {
-@@ -468,7 +493,13 @@ std::shared_ptr<ModelPackageItemInfo> ModelPackageImpl::findItem(const std::stri
+@@ -468,7 +493,14 @@ std::shared_ptr<ModelPackageItemInfo> ModelPackageImpl::findItem(const std::stri
      auto author = itemInfoEntry->getString(kModelPackageItemInfoAuthorKey);
      auto description = itemInfoEntry->getString(kModelPackageItemInfoDescriptionKey);
-
+     
 +// ORT_EDIT: need to use path.string() on Windows
 +#if defined(_WIN32)
 +    return std::make_shared<ModelPackageItemInfo>(std::make_shared<ModelPackageItemInfoImpl>(identifier, path.string(), name, author, description));
@@ -108,12 +122,13 @@ index 8fee56b9..99e0d8d6 100644
 +#else
      return std::make_shared<ModelPackageItemInfo>(std::make_shared<ModelPackageItemInfoImpl>(identifier, path, name, author, description));
 +#endif
++
  }
-
+ 
  std::shared_ptr<ModelPackageItemInfo> ModelPackageImpl::findItem(const std::string& name, const std::string& author) const
-@@ -514,7 +545,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier)
+@@ -514,7 +546,9 @@ void ModelPackageImpl::removeItem(const std::string& identifier)
      }
-
+     
      auto path = m_packageDataDirPath / itemInfoEntry->getString(kModelPackageItemInfoPathKey);
 -    if (0 != std::remove(path.c_str())) {
 +    // ORT_EDIT: std::remove doesn't work on Windows. Use std::filesystem::remove instead.
@@ -121,8 +136,8 @@ index 8fee56b9..99e0d8d6 100644
 +    if (!std::filesystem::remove(path)) {
          throw std::runtime_error("Failed to remove file at path: " + path.string());
      }
-
-@@ -525,13 +558,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path)
+     
+@@ -525,13 +559,16 @@ bool ModelPackageImpl::isValid(const std::filesystem::path& path)
  {
      try {
          ModelPackageImpl(path, false, true);
@@ -132,16 +147,16 @@ index 8fee56b9..99e0d8d6 100644
      }
      return true;
  }
-
+ 
 +// ORT_EDIT: pragma only available on APPLE platforms
 +#if defined(__APPLE__)
  #pragma mark ModelPackage
 +#endif
-
+ 
  ModelPackage::ModelPackage(const std::string& packagePath, bool createIfNecessary, bool readOnly)
  : m_modelPackageImpl(std::make_shared<ModelPackageImpl>(packagePath, createIfNecessary, readOnly))
-@@ -544,7 +580,12 @@ ModelPackage::~ModelPackage()
-
+@@ -544,7 +581,12 @@ ModelPackage::~ModelPackage()
+ 
  std::string ModelPackage::path() const
  {
 +// ORT_EDIT: Windows doesn't automatically convert to std::string as the native format could be char or wchar.
@@ -151,5 +166,19 @@ index 8fee56b9..99e0d8d6 100644
      return m_modelPackageImpl->path();
 +#endif
  }
-
+ 
  std::string ModelPackage::setRootModel(const std::string& path, const std::string& name, const std::string& author, const std::string& description)
+diff --git a/modelpackage/src/utils/JsonMap.hpp b/modelpackage/src/utils/JsonMap.hpp
+index 0d7dc3f4..b700cfd5 100644
+--- a/modelpackage/src/utils/JsonMap.hpp
++++ b/modelpackage/src/utils/JsonMap.hpp
+@@ -10,7 +10,8 @@
+ #include <iostream>
+ #include <vector>
+ #include <string>
+-
++// ORT_EDIT: add missing header
++#include <memory>
+ class JsonMapImpl;
+ 
+ class JsonMap {
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index 59deb0d497..0eaaea562c 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -41,7 +41,7 @@ parameters:
 
 variables:
   - name: docker_base_image
-    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
   - name: linux_trt_version
     value: 10.3.0.26-1.cuda11.8
   - name: Repository
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 518aec8c2f..71f7ab6e49 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -49,9 +49,9 @@ parameters:
 variables:
   - name: docker_base_image
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
 
   - name: Repository
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
index 9025f084d5..c08eaaaa13 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
@@ -40,9 +40,9 @@ variables:
   - template: templates/common-variables.yml
   - name: docker_base_image
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.linux_trt_version_cuda11 }}
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
index 8d42e72014..4a86da167f 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
@@ -40,9 +40,9 @@ variables:
   - template: templates/common-variables.yml
   - name: docker_base_image
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
     ${{ if eq(parameters.CudaVersion, '12.2') }}:
-      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+      value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
   - name: linux_trt_version
     ${{ if eq(parameters.CudaVersion, '11.8') }}:
       value: ${{ variables.linux_trt_version_cuda11 }}
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
index 4b94ffc7e3..960b59f93b 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml
@@ -18,7 +18,7 @@ stages:
           machine_pool: 'Onnxruntime-Linux-GPU'
           python_wheel_suffix: '_gpu'
           timeout: 480
-          docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+          docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
           cuda_version: '11.8'
 
   - stage: Republish_Wheels
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
index 48d1e6b1ac..021f7c5ece 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
@@ -18,7 +18,7 @@ stages:
           machine_pool: 'Onnxruntime-Linux-GPU'
           python_wheel_suffix: '_gpu'
           timeout: 480
-          docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+          docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
           cuda_version: '12.2'
 
   - stage: Republish_Wheels
diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml
index 0517fec3ba..b081b39ad9 100644
--- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml
@@ -142,9 +142,9 @@ stages:
       value: false
     - name: docker_base_image
       ${{ if eq(parameters.CudaVersion, '11.8') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
       ${{ if eq(parameters.CudaVersion, '12.2') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
     timeoutInMinutes: 60
 
     steps:
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
index 4adf41d3db..85366ffc28 100644
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
@@ -45,9 +45,9 @@ jobs:
       - template: ../../templates/common-variables.yml
       - name: docker_base_image
         ${{ if eq(parameters.CudaVersion, '11.8') }}:
-          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
         ${{ if eq(parameters.CudaVersion, '12.2') }}:
-          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
       - name: linux_trt_version
         ${{ if eq(parameters.CudaVersion, '11.8') }}:
           value: ${{ variables.linux_trt_version_cuda11 }}
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
index a3c804055d..f48573abd3 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml
@@ -68,9 +68,9 @@ stages:
           cmake_build_type: ${{ parameters.cmake_build_type }}
           cuda_version: ${{ parameters.cuda_version }}
           ${{ if eq(parameters.cuda_version, '11.8') }}:
-            docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250109.1
+            docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20250124.1
           ${{ if eq(parameters.cuda_version, '12.2') }}:
-            docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250109.1
+            docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1
 
   - ${{ if eq(parameters.enable_windows_dml, true) }}:
     - ${{ each python_version in parameters.PythonVersions }}:
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
index 72912acce8..02938f015e 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
+++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250124.1
 
 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17
 
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
index 9569aa2fcd..f9d84e3b0e 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14_dotnet:20250124.1
 
 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
index 589bd869ba..20b9a6c224 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc14:20250124.1
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
index 1c1f716d81..d94e7562f1 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14_dotnet:20250124.1
 
 ENV LANG=en_US.UTF-8
 ENV LC_ALL=en_US.UTF-8
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
index 6caf21c475..24287fd34d 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20250124.1
 
 ARG TRT_VERSION
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index a5dda5904d..764a79135d 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20250124.1
 ARG TRT_VERSION
 
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index 04c6398e06..7590d5dd18 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250109.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc14:20250124.1
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts

From 97c2bbe3eb676d443d92e14447d4c60d92757530 Mon Sep 17 00:00:00 2001
From: Yateng Hong <yatengh@microsoft.com>
Date: Sun, 26 Jan 2025 15:41:29 +0800
Subject: [PATCH 20/37] Fix shape infer of onnx GroupNorm (#23477)

### Description
<!-- Describe your changes. -->
Fix symbolic shape inference for the ONNX `GroupNormalization` op by registering it with the existing `GroupNorm` inference handler.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

Symbolic shape inference fails on models that contain the ONNX `GroupNormalization` op, for example:


[model.onnx](https://raw.githubusercontent.com/onnx/onnx/refs/heads/main/onnx/backend/test/data/node/test_group_normalization_example/model.onnx)

> python
D:\source\cognition\onnxruntime\onnxruntime\python\tools\symbolic_shape_infer.py
--input model.onnx
Traceback (most recent call last):
File
"D:\source\cognition\onnxruntime\onnxruntime\python\tools\symbolic_shape_infer.py",
line 2999, in <module>
    out_mp = SymbolicShapeInference.infer_shapes(
File
"D:\source\cognition\onnxruntime\onnxruntime\python\tools\symbolic_shape_infer.py",
line 2935, in infer_shapes
    raise Exception("Incomplete symbolic shape inference")
---
 onnxruntime/python/tools/symbolic_shape_infer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py
index 7a6028dfbe..b9675d4280 100755
--- a/onnxruntime/python/tools/symbolic_shape_infer.py
+++ b/onnxruntime/python/tools/symbolic_shape_infer.py
@@ -205,6 +205,7 @@ class SymbolicShapeInference:
             "GemmFastGelu": self._infer_GemmFastGelu,
             "GemmFloat8": self._infer_GemmFloat8,
             "GroupNorm": self._infer_GroupNorm,
+            "GroupNormalization": self._infer_GroupNorm,
             "GroupQueryAttention": self._infer_GroupQueryAttention,
             "LayerNormalization": self._infer_LayerNormalization,
             "LongformerAttention": self._infer_LongformerAttention,
@@ -474,6 +475,7 @@ class SymbolicShapeInference:
             "PythonOp",
             "MultiHeadAttention",
             "GroupNorm",
+            "GroupNormalization",
             "GroupQueryAttention",
             "SparseAttention",
             "SkipGroupNorm",

From 42f0c00f955d7e1d34dfbb540f2016a823736e36 Mon Sep 17 00:00:00 2001
From: Michael Sharp <51342856+michaelgsharp@users.noreply.github.com>
Date: Mon, 27 Jan 2025 11:58:38 -0700
Subject: [PATCH 21/37] Adds the new System.Numerics.Tensors as an input/output
 type when using dotnet 8.0 and up. (#23261)

### Description
Adds the new System.Numerics.Tensors as an input/output type when using
dotnet 8.0 and up. It does not change/remove any of the existing API,
only adds additional ones.


### Motivation and Context
Now that C#/Dotnet has an official tensor type built into the language,
we want to expand the places that it can be used.
---
 .../Microsoft.ML.OnnxRuntime.csproj           |   6 +-
 .../OrtValue.shared.cs                        | 152 +++++++++++++
 .../InferenceTest.netcore.cs                  | 211 ++++++++++++++++++
 .../linux_pack/LinuxPackNativeNuget.csproj    |   2 +-
 4 files changed, 369 insertions(+), 2 deletions(-)

diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
index b9155e748f..be0e8d2ee5 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="MSBuild.Sdk.Extras/3.0.22">
+<Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
     <!--- packaging properties -->
     <OrtPackageId Condition="'$(OrtPackageId)' == ''">Microsoft.ML.OnnxRuntime</OrtPackageId>
@@ -189,6 +189,10 @@
     <PackageReference Include="Microsoft.SourceLink.GitHub" Version="8.0.0" PrivateAssets="All" />
   </ItemGroup>
 
+  <ItemGroup Condition="$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net8.0'))">
+    <PackageReference Include="System.Numerics.Tensors" Version="9.0.0" /> <!-- only referenced for net8.0-compatible target frameworks -->
+  </ItemGroup>
+
   <!-- debug output - makes finding/fixing any issues with the the conditions easy.  -->
   <Target Name="DumpValues" BeforeTargets="PreBuildEvent">
     <Message Text="SolutionName='$(SolutionName)'" />
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs
index d38748c2f9..7a5c3aaa19 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtValue.shared.cs
@@ -9,6 +9,14 @@ using System.Diagnostics;
 using System.Runtime.InteropServices;
 using System.Text;
 
+#if NET8_0_OR_GREATER
+using System.Diagnostics.CodeAnalysis;
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using SystemNumericsTensors = System.Numerics.Tensors;
+using TensorPrimitives = System.Numerics.Tensors.TensorPrimitives;
+#endif
+
 namespace Microsoft.ML.OnnxRuntime
 {
     /// <summary>
@@ -205,6 +213,33 @@ namespace Microsoft.ML.OnnxRuntime
             return MemoryMarshal.Cast<byte, T>(byteSpan);
         }
 
+#if NET8_0_OR_GREATER
+        /// <summary>
+        /// Returns a ReadOnlyTensorSpan<typeparamref name="T"/> over tensor native buffer that
+        /// provides a read-only view whose lengths are taken from the tensor shape of this OrtValue.
+        ///
+        /// Note, that the memory may be device allocated and, therefore, not accessible from the CPU.
+        /// To get memory descriptor use GetTensorMemoryInfo().
+        ///
+        /// OrtValue must contain a non-string tensor.
+        /// The span is valid as long as the OrtValue instance is alive (not disposed).
+        /// </summary>
+        /// <typeparam name="T">unmanaged element type the native buffer is reinterpreted as</typeparam>
+        /// <returns>ReadOnlyTensorSpan<typeparamref name="T"/></returns>
+        /// <exception cref="OnnxRuntimeException"></exception>
+        [Experimental("SYSLIB5001")]
+        public SystemNumericsTensors.ReadOnlyTensorSpan<T> GetTensorDataAsTensorSpan<T>() where T : unmanaged
+        {
+            var byteSpan = GetTensorBufferRawData(typeof(T));
+
+            var typeSpan = MemoryMarshal.Cast<byte, T>(byteSpan);
+            var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+            nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x)); // shape is long[]; the tensor span lengths are nint
+
+            return new SystemNumericsTensors.ReadOnlyTensorSpan<T>(typeSpan, nArray, []); // strides are passed as an empty collection
+        }
+#endif
+
         /// <summary>
         /// Returns a Span<typeparamref name="T"/> over tensor native buffer.
         /// This enables you to safely and efficiently modify the underlying
@@ -225,6 +260,32 @@ namespace Microsoft.ML.OnnxRuntime
             return MemoryMarshal.Cast<byte, T>(byteSpan);
         }
 
+#if NET8_0_OR_GREATER
+        /// <summary>
+        /// Returns a mutable TensorSpan<typeparamref name="T"/> over tensor native buffer.
+        ///
+        /// Note, that the memory may be device allocated and, therefore, not accessible from the CPU.
+        /// To get memory descriptor use GetTensorMemoryInfo().
+        ///
+        /// OrtValue must contain a non-string tensor.
+        /// The span is valid as long as the OrtValue instance is alive (not disposed).
+        /// </summary>
+        /// <typeparam name="T">unmanaged element type the native buffer is reinterpreted as</typeparam>
+        /// <returns>TensorSpan<typeparamref name="T"/></returns>
+        /// <exception cref="OnnxRuntimeException"></exception>
+        [Experimental("SYSLIB5001")]
+        public SystemNumericsTensors.TensorSpan<T> GetTensorMutableDataAsTensorSpan<T>() where T : unmanaged
+        {
+            var byteSpan = GetTensorBufferRawData(typeof(T));
+
+            var typeSpan = MemoryMarshal.Cast<byte, T>(byteSpan);
+            var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape;
+            nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x)); // shape is long[]; the tensor span lengths are nint
+
+            return new SystemNumericsTensors.TensorSpan<T>(typeSpan, nArray, []); // strides are passed as an empty collection
+        }
+#endif
+
         /// <summary>
         /// Provides mutable raw native buffer access.
         /// </summary>
@@ -234,6 +295,23 @@ namespace Microsoft.ML.OnnxRuntime
             return GetTensorBufferRawData(typeof(byte));
         }
 
+#if NET8_0_OR_GREATER
+        /// <summary>
+        /// Provides mutable raw native buffer access as a TensorSpan of bytes.
+        /// </summary>
+        /// <returns>TensorSpan over the native buffer bytes</returns>
+        [Experimental("SYSLIB5001")]
+        public SystemNumericsTensors.TensorSpan<byte> GetTensorSpanMutableRawData<T>() where T : unmanaged
+        {
+            var byteSpan = GetTensorBufferRawData(typeof(T));
+
+            var shape = GetTypeInfo().TensorTypeAndShapeInfo.Shape; // NOTE(review): the shape counts elements of T, while byteSpan is measured in bytes
+            nint[] nArray = Array.ConvertAll(shape, new Converter<long, nint>(x => (nint)x));
+
+            return new SystemNumericsTensors.TensorSpan<byte>(byteSpan, nArray, []); // NOTE(review): for sizeof(T) > 1 the lengths describe fewer bytes than byteSpan holds - confirm this is intended
+        }
+#endif
+
         /// <summary>
         /// Fetch string tensor element buffer pointer at the specified index,
         /// convert/copy to UTF-16 char[] and return a ReadOnlyMemory{char} instance.
@@ -605,6 +683,80 @@ namespace Microsoft.ML.OnnxRuntime
             return OrtValue.CreateTensorValueFromMemory(OrtMemoryInfo.DefaultInstance, new Memory<T>(data), shape);
         }
 
+#if NET8_0_OR_GREATER
+        /// <summary>
+        /// Factory method that creates a native Onnxruntime OrtValue containing a tensor on top of the managed memory of the given tensor.
+        /// The backing array is pinned so no copying occurs when data is passed down to native code;
+        /// a tensor that is not contiguous and dense is first copied into a newly created tensor.
+        /// </summary>
+        /// <param name="tensor">Tensor object</param>
+        /// <typeparam name="T">unmanaged tensor element type</typeparam>
+        /// <returns>An instance of OrtValue constructed on top of the tensor memory</returns>
+        [Experimental("SYSLIB5001")]
+        public static OrtValue CreateTensorValueFromSystemNumericsTensorObject<T>(SystemNumericsTensors.Tensor<T> tensor) where T : unmanaged
+        {
+            if (!IsContiguousAndDense(tensor))
+            {
+                var newTensor = SystemNumericsTensors.Tensor.Create<T>(tensor.Lengths);
+                tensor.CopyTo(newTensor);
+                tensor = newTensor;
+            }
+            unsafe
+            {
+                var backingData = (T[])tensor.GetType().GetField("_values", BindingFlags.Instance | BindingFlags.NonPublic).GetValue(tensor); // NOTE(review): depends on a private field name; GetField returns null if it is absent
+                GCHandle handle = GCHandle.Alloc(backingData, GCHandleType.Pinned); // pin the backing array so its address stays fixed
+                var memHandle = new MemoryHandle(Unsafe.AsPointer(ref tensor.GetPinnableReference()), handle);
+
+                try
+                {
+                    IntPtr dataBufferPointer = IntPtr.Zero;
+                    unsafe
+                    {
+                        dataBufferPointer = (IntPtr)memHandle.Pointer;
+                    }
+
+                    var bufferLengthInBytes = tensor.FlattenedLength * sizeof(T);
+                    long[] shape = Array.ConvertAll(tensor.Lengths.ToArray(), new Converter<nint, long>(x => (long)x)); // native API takes the shape as long[]
+
+                    var typeInfo = TensorBase.GetTypeInfo(typeof(T)) ??
+                        throw new OnnxRuntimeException(ErrorCode.InvalidArgument, $"Tensor of type: {typeof(T)} is not supported");
+
+                    NativeApiStatus.VerifySuccess(NativeMethods.OrtCreateTensorWithDataAsOrtValue(
+                        OrtMemoryInfo.DefaultInstance.Pointer,
+                        dataBufferPointer,
+                        (UIntPtr)(bufferLengthInBytes),
+                        shape,
+                        (UIntPtr)tensor.Rank,
+                        typeInfo.ElementType,
+                        out IntPtr nativeValue));
+
+                    return new OrtValue(nativeValue, memHandle); // NOTE(review): presumably the OrtValue takes ownership of memHandle - confirm
+                }
+                catch (Exception)
+                {
+                    memHandle.Dispose(); // release the pinned handle when the OrtValue could not be created
+                    throw;
+                }
+            }
+        }
+
+        [Experimental("SYSLIB5001")]
+        private static bool IsContiguousAndDense<T>(SystemNumericsTensors.Tensor<T> tensor) where T : unmanaged 
+        {
+            // The stride of the right-most dimension must be 1 for a dense tensor. NOTE(review): Strides[^1] assumes rank >= 1 - confirm.
+            if (tensor.Strides[^1] != 1)
+                return false;
+
+            // For every other dimension, the stride must equal the product of the lengths of all dimensions to its right.
+            for (int i = tensor.Rank - 2; i >= 0; i--)
+            {
+                if (tensor.Strides[i] != TensorPrimitives.Product(tensor.Lengths.Slice(i + 1, tensor.Lengths.Length - i - 1)))
+                    return false;
+            }
+            return true;
+        }
+#endif
+
         /// <summary>
         /// The factory API creates an OrtValue with memory allocated using the given allocator
         /// according to the specified shape and element type. The memory will be released when OrtValue
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
index ff5fd2de54..816511150a 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
@@ -7,6 +7,10 @@ using System.Runtime.InteropServices;
 using System.Text.RegularExpressions;
 using Xunit;
 
+#if NET8_0_OR_GREATER
+using SystemNumericsTensors = System.Numerics.Tensors;
+#endif
+
 namespace Microsoft.ML.OnnxRuntime.Tests
 {
     /// <summary>
@@ -67,6 +71,194 @@ namespace Microsoft.ML.OnnxRuntime.Tests
             }
         }
 
+#if NET8_0_OR_GREATER
+#pragma warning disable SYSLIB5001 // System.Numerics.Tensors is only in preview so we can continue receiving API feedback
+        // Runs squeezenet with inputs created from System.Numerics.Tensors tensors, for each
+        // combination of graph optimization level and sequential/parallel execution mode.
+        [Theory]
+        [InlineData(GraphOptimizationLevel.ORT_DISABLE_ALL, true)]
+        [InlineData(GraphOptimizationLevel.ORT_DISABLE_ALL, false)]
+        [InlineData(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, true)]
+        [InlineData(GraphOptimizationLevel.ORT_ENABLE_EXTENDED, false)]
+        private void CanRunInferenceOnAModelDotnetTensors(GraphOptimizationLevel graphOptimizationLevel, bool enableParallelExecution)
+        {
+            var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx");
+
+            using (var cleanUp = new DisposableListTest<IDisposable>())
+            {
+                // Configure the session with the optimization level and execution mode under test.
+                SessionOptions options = new SessionOptions();
+                cleanUp.Add(options);
+                options.GraphOptimizationLevel = graphOptimizationLevel;
+                if (enableParallelExecution) options.ExecutionMode = ExecutionMode.ORT_PARALLEL;
+
+                var session = new InferenceSession(model, options);
+                cleanUp.Add(session);
+
+                using var runOptions = new RunOptions();
+                var inputMeta = session.InputMetadata;
+
+                float[] expectedOutput = TestDataLoader.LoadTensorFromEmbeddedResource("bench.expected_out");
+                long[] expectedDimensions = { 1, 1000, 1, 1 };  // hardcoded for now for the test data
+
+                float[] inputData = TestDataLoader.LoadTensorFromEmbeddedResource("bench.in"); // this is the data for only one input tensor for this model
+
+                using var inputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.InputMetadata.Count);
+
+                foreach (var name in inputMeta.Keys)
+                {
+                    Assert.Equal(typeof(float), inputMeta[name].ElementType);
+                    Assert.True(inputMeta[name].IsTensor);
+                    var tensor = SystemNumericsTensors.Tensor.Create<float>(inputData, inputMeta[name].Dimensions.Select(x => (nint)x).ToArray());
+                    inputOrtValues.Add(new DisposableTestPair<OrtValue>(name, OrtValue.CreateTensorValueFromSystemNumericsTensorObject<float>(tensor)));
+                }
+
+                runOptions.LogId = "CsharpTest";
+                runOptions.Terminate = false;  // TODO: Test terminate = true, it currently crashes
+                runOptions.LogSeverityLevel = OrtLoggingLevel.ORT_LOGGING_LEVEL_ERROR;
+                // Run inference with named inputs and outputs created within Run()
+                using (var results = session.Run(runOptions, inputOrtValues.Select(x => x.Key).ToList(), inputOrtValues.Select(x => x.Value).ToList(), new List<string>(["softmaxout_1"])))  // results is an IDisposableReadOnlyCollection<OrtValue> container
+                {
+                    // Exactly one output was requested; assert outside the loop so an empty result also fails.
+                    Assert.Single(results);
+
+                    foreach (var r in results)
+                    {
+                        ValidateRunResult(r, expectedOutput, expectedDimensions);
+                    }
+                }
+            }
+        }
+
+        [Fact]
+        public void InferenceSessionDisposedDotnetTensors()
+        {
+            var model = TestDataLoader.LoadModelFromEmbeddedResource("squeezenet.onnx");
+
+            // Enable profiling for this session and set the prefix of the profile output path.
+            using (SessionOptions options = new SessionOptions())
+            {
+                options.ProfileOutputPathPrefix = "Ort_P_";
+                options.EnableProfiling = true;
+                using (var session = new InferenceSession(model, options))
+                {
+                    var inputMeta = session.InputMetadata;
+                    var container = new List<NamedOnnxValue>(); // NOTE(review): not used anywhere in this test
+
+                    float[] inputData = TestDataLoader.LoadTensorFromEmbeddedResource("bench.in"); // this is the data for only one input tensor for this model
+
+                    using (var runOptions = new RunOptions())
+                    using (var inputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.InputMetadata.Count))
+                    using (var outputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.OutputMetadata.Count))
+                    {
+                        // Wrap the same input data in a System.Numerics.Tensors tensor for every model input.
+                        foreach (var name in inputMeta.Keys)
+                        {
+                            Assert.Equal(typeof(float), inputMeta[name].ElementType);
+                            Assert.True(inputMeta[name].IsTensor);
+                            var tensor = SystemNumericsTensors.Tensor.Create<float>(inputData, inputMeta[name].Dimensions.Select(x => (nint) x).ToArray());
+                            inputOrtValues.Add(new DisposableTestPair<OrtValue>(name, OrtValue.CreateTensorValueFromSystemNumericsTensorObject<float>(tensor)));
+                        }
+
+                        // Run inference with named inputs and outputs created within Run()
+                        using (var results = session.Run(runOptions, inputOrtValues.Select(x => x.Key).ToList(), inputOrtValues.Select(x => x.Value).ToList(), new List<string>(["softmaxout_1"])))  // results is an IDisposableReadOnlyCollection<OrtValue> container
+                        {
+                            // validate the results
+                            foreach (var r in results)
+                            {
+                                Assert.Single(results);
+
+                                float[] expectedOutput = TestDataLoader.LoadTensorFromEmbeddedResource("bench.expected_out");
+                                long[] expectedDimensions = { 1, 1000, 1, 1 };  // hardcoded for now for the test data
+                                ValidateRunResult(r, expectedOutput, expectedDimensions);
+                            }
+                        }
+                    }
+
+                    string profile_file = session.EndProfiling();
+
+                    // Profile file should have the output path prefix in it
+                    Assert.Contains("Ort_P_", profile_file);
+                }
+            }
+        }
+
+        [Fact]
+        private void ThrowWrongOutputNameDotnetTensors()
+        {
+            var tuple = OpenSessionSqueezeNet();
+            var session = tuple.Item1;
+            var inputData = tuple.Item2;
+            var inputTensor = tuple.Item3;
+
+            using (var runOptions = new RunOptions())
+            using (var inputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.InputMetadata.Count))
+            using (var outputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.OutputMetadata.Count))
+            {
+                var tensor = SystemNumericsTensors.Tensor.Create<float>(inputData, Array.ConvertAll<int, nint>(inputTensor.Dimensions.ToArray(), x => (nint)x));
+                // The same tensor backs both the input value and the output value registered under a wrong name.
+                inputOrtValues.Add(new DisposableTestPair<OrtValue>("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject<float>(tensor)));
+                outputOrtValues.Add(new DisposableTestPair<OrtValue>("bad_output_name", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(tensor)));
+
+                var ex = Assert.Throws<OnnxRuntimeException>(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["bad_output_name"], [outputOrtValues[0].Value]));
+                Assert.Contains("Output name: 'bad_output_name' is not in the metadata", ex.Message);
+            }
+
+            session.Dispose(); // NOTE(review): not reached when an assertion above throws
+        }
+
+        [Fact]
+        private void ThrowWrongOutputDimensionDotnetTensors()
+        {
+            var tuple = OpenSessionSqueezeNet();
+            var session = tuple.Item1;
+            var inputData = tuple.Item2;
+            var inputTensor = tuple.Item3;
+            var outputTensor = SystemNumericsTensors.Tensor.Create<float>([1, 1001, 1, 1]); // NOTE(review): presumably a deliberately wrong output shape - confirm against the model output
+
+            using (var runOptions = new RunOptions())
+            using (var inputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.InputMetadata.Count))
+            using (var outputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.OutputMetadata.Count))
+            {
+                var tensor = SystemNumericsTensors.Tensor.Create<float>(inputData, Array.ConvertAll<int, nint>(inputTensor.Dimensions.ToArray(), x => (nint)x));
+
+                inputOrtValues.Add(new DisposableTestPair<OrtValue>("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject<float>(tensor)));
+                outputOrtValues.Add(new DisposableTestPair<OrtValue>("softmaxout_1", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(outputTensor)));
+                // NOTE(review): only the exception type is asserted; ex.Message is never inspected.
+                var ex = Assert.Throws<OnnxRuntimeException>(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["softmaxout_1"], [outputOrtValues[0].Value]));
+            }
+
+            session.Dispose(); // NOTE(review): not reached when the assertion above throws
+        }
+
+        [Fact]
+        private void ThrowInconsistentPinnedOutputsDotnetTensors()
+        {
+            var tuple = OpenSessionSqueezeNet();
+            using var cleanUp = new DisposableListTest<IDisposable>();
+            cleanUp.Add(tuple.Item1); // the session is disposed together with cleanUp
+            var session = tuple.Item1;
+            var inputData = tuple.Item2;
+            var inputTensor = tuple.Item3;
+            var outputTensor = SystemNumericsTensors.Tensor.Create([1, 1001, 1, 1], [4]); // NOTE(review): presumably four values with lengths [4], unlike the float tensors in the sibling tests - confirm
+
+            using (var runOptions = new RunOptions())
+            using (var inputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.InputMetadata.Count))
+            using (var outputOrtValues = new DisposableListTest<DisposableTestPair<OrtValue>>(session.OutputMetadata.Count))
+            {
+                var tensor = SystemNumericsTensors.Tensor.Create<float>(inputData, Array.ConvertAll<int, nint>(inputTensor.Dimensions.ToArray(), x => (nint)x));
+
+                inputOrtValues.Add(new DisposableTestPair<OrtValue>("data_0", OrtValue.CreateTensorValueFromSystemNumericsTensorObject<float>(tensor)));
+                outputOrtValues.Add(new DisposableTestPair<OrtValue>("softmaxout_1", OrtValue.CreateTensorValueFromSystemNumericsTensorObject(outputTensor)));
+                OrtValue[] outputs = []; // deliberately empty so one output name is paired with zero output values
+                var ex = Assert.Throws<ArgumentException>(() => session.Run(runOptions, ["data_0"], [inputOrtValues[0].Value], ["softmaxout_1"], outputs));
+                Assert.StartsWith("Length of outputNames (1) must match that of outputValues (0).", ex.Message);
+            }
+        }
+#pragma warning restore SYSLIB5001 // System.Numerics.Tensors is only in preview so we can continue receiving API feedback
+#endif
+
+
 #if USE_CUDA
         [Fact(DisplayName = "TestCUDAProviderOptions")]
         private void TestCUDAProviderOptions()
@@ -1416,6 +1608,25 @@ namespace Microsoft.ML.OnnxRuntime.Tests
             }
         }
 
+#if NET8_0_OR_GREATER
+#pragma warning disable SYSLIB5001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+        private void ValidateRunResultData(SystemNumericsTensors.Tensor<float> resultTensor, float[] expectedOutput, int[] expectedDimensions)
+        {
+            Assert.Equal(expectedDimensions.Length, resultTensor.Rank);
+
+            var resultDimensions = resultTensor.Lengths;
+            for (int i = 0; i < expectedDimensions.Length; i++)
+            {
+                Assert.Equal(expectedDimensions[i], resultDimensions[i]);
+            }
+
+            var resultArray = resultTensor.ToArray();
+            Assert.Equal(expectedOutput.Length, resultArray.Length);
+            Assert.Equal(expectedOutput, resultArray, new FloatComparer());
+        }
+#pragma warning restore SYSLIB5001 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
+#endif
+
         static string GetTestModelsDir()
         {
             // get build directory, append downloaded models location
diff --git a/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj b/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj
index 098078d2e3..b814f99b05 100644
--- a/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj
+++ b/csharp/tools/linux_pack/LinuxPackNativeNuget.csproj
@@ -7,7 +7,7 @@
      If you need a more sophisticated package for testing, you can run the production packaging pipeline against your
      branch and download the resulting nuget package from the build artifacts.
  -->
-<Project Sdk="MSBuild.Sdk.Extras/3.0.22">
+<Project Sdk="Microsoft.NET.Sdk">
   <PropertyGroup>
     <TargetFrameworks>netstandard2.0</TargetFrameworks>
     <NuspecFile>$(OnnxRuntimeBuildDirectory)/NativeNuget.nuspec</NuspecFile>

From 96ec1dd134391ee8bd2040d305f02e61a09476c2 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 27 Jan 2025 12:13:15 -0800
Subject: [PATCH 22/37] Bump ruff from 0.9.2 to 0.9.3 (#23496)

Bumps [ruff](https://github.com/astral-sh/ruff) from 0.9.2 to 0.9.3.
<details>
<summary>Release notes</summary>
<p><em>Sourced from <a
href="https://github.com/astral-sh/ruff/releases">ruff's
releases</a>.</em></p>
<blockquote>
<h2>0.9.3</h2>
<h2>Release Notes</h2>
<h3>Preview features</h3>
<ul>
<li>[<code>airflow</code>] Argument <code>fail_stop</code> in DAG has
been renamed as <code>fail_fast</code> (<code>AIR302</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15633">#15633</a>)</li>
<li>[<code>airflow</code>] Extend <code>AIR303</code> with more symbols
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15611">#15611</a>)</li>
<li>[<code>flake8-bandit</code>] Report all references to suspicious
functions (<code>S3</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15541">#15541</a>)</li>
<li>[<code>flake8-pytest-style</code>] Do not emit diagnostics for empty
<code>for</code> loops (<code>PT012</code>, <code>PT031</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15542">#15542</a>)</li>
<li>[<code>flake8-simplify</code>] Avoid double negations
(<code>SIM103</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15562">#15562</a>)</li>
<li>[<code>pyflakes</code>] Fix infinite loop with unused local import
in <code>__init__.py</code> (<code>F401</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15517">#15517</a>)</li>
<li>[<code>pylint</code>] Do not report methods with only one
<code>EM101</code>-compatible <code>raise</code> (<code>PLR6301</code>)
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15507">#15507</a>)</li>
<li>[<code>pylint</code>] Implement
<code>redefined-slots-in-subclass</code> (<code>W0244</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/9640">#9640</a>)</li>
<li>[<code>pyupgrade</code>] Add rules to use PEP 695 generics in
classes and functions (<code>UP046</code>, <code>UP047</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15565">#15565</a>,
<a
href="https://redirect.github.com/astral-sh/ruff/pull/15659">#15659</a>)</li>
<li>[<code>refurb</code>] Implement <code>for-loop-writes</code>
(<code>FURB122</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/10630">#10630</a>)</li>
<li>[<code>ruff</code>] Implement <code>needless-else</code> clause
(<code>RUF047</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15051">#15051</a>)</li>
<li>[<code>ruff</code>] Implement <code>starmap-zip</code>
(<code>RUF058</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15483">#15483</a>)</li>
</ul>
<h3>Rule changes</h3>
<ul>
<li>[<code>flake8-bugbear</code>] Do not raise error if keyword argument
is present and target-python version is less or equals than 3.9
(<code>B903</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15549">#15549</a>)</li>
<li>[<code>flake8-comprehensions</code>] strip parentheses around
generators in <code>unnecessary-generator-set</code> (<code>C401</code>)
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15553">#15553</a>)</li>
<li>[<code>flake8-pytest-style</code>] Rewrite references to
<code>.exception</code> (<code>PT027</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15680">#15680</a>)</li>
<li>[<code>flake8-simplify</code>] Mark fixes as unsafe
(<code>SIM201</code>, <code>SIM202</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15626">#15626</a>)</li>
<li>[<code>flake8-type-checking</code>] Fix some safe fixes being
labeled unsafe (<code>TC006</code>,<code>TC008</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15638">#15638</a>)</li>
<li>[<code>isort</code>] Omit trailing whitespace in
<code>unsorted-imports</code> (<code>I001</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15518">#15518</a>)</li>
<li>[<code>pydoclint</code>] Allow ignoring one line docstrings for
<code>DOC</code> rules (<a
href="https://redirect.github.com/astral-sh/ruff/pull/13302">#13302</a>)</li>
<li>[<code>pyflakes</code>] Apply redefinition fixes by source code
order (<code>F811</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15575">#15575</a>)</li>
<li>[<code>pyflakes</code>] Avoid removing too many imports in
<code>redefined-while-unused</code> (<code>F811</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15585">#15585</a>)</li>
<li>[<code>pyflakes</code>] Group redefinition fixes by source statement
(<code>F811</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15574">#15574</a>)</li>
<li>[<code>pylint</code>] Include name of base class in message for
<code>redefined-slots-in-subclass</code> (<code>W0244</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15559">#15559</a>)</li>
<li>[<code>ruff</code>] Update fix for <code>RUF055</code> to use
<code>var == value</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15605">#15605</a>)</li>
</ul>
<h3>Formatter</h3>
<ul>
<li>Fix bracket spacing for single-element tuples in f-string
expressions (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15537">#15537</a>)</li>
<li>Fix unstable f-string formatting for expressions containing a
trailing comma (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15545">#15545</a>)</li>
</ul>
<h3>Performance</h3>
<ul>
<li>Avoid quadratic membership check in import fixes (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15576">#15576</a>)</li>
</ul>
<h3>Server</h3>
<ul>
<li>Allow <code>unsafe-fixes</code> settings for code actions (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15666">#15666</a>)</li>
</ul>
<h3>Bug fixes</h3>
<ul>
<li>[<code>flake8-bandit</code>] Add missing single-line/dotall regex
flag (<code>S608</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15654">#15654</a>)</li>
<li>[<code>flake8-import-conventions</code>] Fix infinite loop between
<code>ICN001</code> and <code>I002</code> (<code>ICN001</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15480">#15480</a>)</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a
href="https://github.com/astral-sh/ruff/blob/main/CHANGELOG.md">ruff's
changelog</a>.</em></p>
<blockquote>
<h2>0.9.3</h2>
<h3>Preview features</h3>
<ul>
<li>[<code>airflow</code>] Argument <code>fail_stop</code> in DAG has
been renamed as <code>fail_fast</code> (<code>AIR302</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15633">#15633</a>)</li>
<li>[<code>airflow</code>] Extend <code>AIR303</code> with more symbols
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15611">#15611</a>)</li>
<li>[<code>flake8-bandit</code>] Report all references to suspicious
functions (<code>S3</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15541">#15541</a>)</li>
<li>[<code>flake8-pytest-style</code>] Do not emit diagnostics for empty
<code>for</code> loops (<code>PT012</code>, <code>PT031</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15542">#15542</a>)</li>
<li>[<code>flake8-simplify</code>] Avoid double negations
(<code>SIM103</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15562">#15562</a>)</li>
<li>[<code>pyflakes</code>] Fix infinite loop with unused local import
in <code>__init__.py</code> (<code>F401</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15517">#15517</a>)</li>
<li>[<code>pylint</code>] Do not report methods with only one
<code>EM101</code>-compatible <code>raise</code> (<code>PLR6301</code>)
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15507">#15507</a>)</li>
<li>[<code>pylint</code>] Implement
<code>redefined-slots-in-subclass</code> (<code>W0244</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/9640">#9640</a>)</li>
<li>[<code>pyupgrade</code>] Add rules to use PEP 695 generics in
classes and functions (<code>UP046</code>, <code>UP047</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15565">#15565</a>,
<a
href="https://redirect.github.com/astral-sh/ruff/pull/15659">#15659</a>)</li>
<li>[<code>refurb</code>] Implement <code>for-loop-writes</code>
(<code>FURB122</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/10630">#10630</a>)</li>
<li>[<code>ruff</code>] Implement <code>needless-else</code> clause
(<code>RUF047</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15051">#15051</a>)</li>
<li>[<code>ruff</code>] Implement <code>starmap-zip</code>
(<code>RUF058</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15483">#15483</a>)</li>
</ul>
<h3>Rule changes</h3>
<ul>
<li>[<code>flake8-bugbear</code>] Do not raise error if keyword argument
is present and target-python version is less or equals than 3.9
(<code>B903</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15549">#15549</a>)</li>
<li>[<code>flake8-comprehensions</code>] strip parentheses around
generators in <code>unnecessary-generator-set</code> (<code>C401</code>)
(<a
href="https://redirect.github.com/astral-sh/ruff/pull/15553">#15553</a>)</li>
<li>[<code>flake8-pytest-style</code>] Rewrite references to
<code>.exception</code> (<code>PT027</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15680">#15680</a>)</li>
<li>[<code>flake8-simplify</code>] Mark fixes as unsafe
(<code>SIM201</code>, <code>SIM202</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15626">#15626</a>)</li>
<li>[<code>flake8-type-checking</code>] Fix some safe fixes being
labeled unsafe (<code>TC006</code>,<code>TC008</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15638">#15638</a>)</li>
<li>[<code>isort</code>] Omit trailing whitespace in
<code>unsorted-imports</code> (<code>I001</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15518">#15518</a>)</li>
<li>[<code>pydoclint</code>] Allow ignoring one line docstrings for
<code>DOC</code> rules (<a
href="https://redirect.github.com/astral-sh/ruff/pull/13302">#13302</a>)</li>
<li>[<code>pyflakes</code>] Apply redefinition fixes by source code
order (<code>F811</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15575">#15575</a>)</li>
<li>[<code>pyflakes</code>] Avoid removing too many imports in
<code>redefined-while-unused</code> (<code>F811</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15585">#15585</a>)</li>
<li>[<code>pyflakes</code>] Group redefinition fixes by source statement
(<code>F811</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15574">#15574</a>)</li>
<li>[<code>pylint</code>] Include name of base class in message for
<code>redefined-slots-in-subclass</code> (<code>W0244</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15559">#15559</a>)</li>
<li>[<code>ruff</code>] Update fix for <code>RUF055</code> to use
<code>var == value</code> (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15605">#15605</a>)</li>
</ul>
<h3>Formatter</h3>
<ul>
<li>Fix bracket spacing for single-element tuples in f-string
expressions (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15537">#15537</a>)</li>
<li>Fix unstable f-string formatting for expressions containing a
trailing comma (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15545">#15545</a>)</li>
</ul>
<h3>Performance</h3>
<ul>
<li>Avoid quadratic membership check in import fixes (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15576">#15576</a>)</li>
</ul>
<h3>Server</h3>
<ul>
<li>Allow <code>unsafe-fixes</code> settings for code actions (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15666">#15666</a>)</li>
</ul>
<h3>Bug fixes</h3>
<ul>
<li>[<code>flake8-bandit</code>] Add missing single-line/dotall regex
flag (<code>S608</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15654">#15654</a>)</li>
<li>[<code>flake8-import-conventions</code>] Fix infinite loop between
<code>ICN001</code> and <code>I002</code> (<code>ICN001</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15480">#15480</a>)</li>
<li>[<code>flake8-simplify</code>] Do not emit diagnostics for
expressions inside string type annotations (<code>SIM222</code>,
<code>SIM223</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/pull/15405">#15405</a>)</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a
href="https://github.com/astral-sh/ruff/commit/90589372daf58ec4d314cbd15db8d2ef572c33cc"><code>9058937</code></a>
Fix grep for version number in docker build (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15699">#15699</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/b5ffb404de8ab05eb7b14d6547f79f4fe3a3e25f"><code>b5ffb40</code></a>
Bump version to 0.9.3 (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15698">#15698</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/cffd1866ce1ac6da4d6a5bc12435316d2d99755b"><code>cffd186</code></a>
Preserve raw string prefix and escapes (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15694">#15694</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/569060f46ca2e036cd54532c97121737884f26c0"><code>569060f</code></a>
[<code>flake8-pytest-style</code>] Rewrite references to
<code>.exception</code> (<code>PT027</code>) (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15680">#15680</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/15394a80282f589526497eefb2507a0afc662ca6"><code>15394a8</code></a>
[red-knot] MDTests: Do not depend on precise public-symbol type
inference (<a
href="https://redirect.github.com/astral-sh/ruff/issues/1">#1</a>...</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/fc2ebea7369b26c864769fce54201a8657d70058"><code>fc2ebea</code></a>
[red-knot] Make <code>infer.rs</code> unit tests independent of public
symbol inference ...</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/43160b4c3edb9cda4c01ed857e94578213e70c6f"><code>43160b4</code></a>
Tidy knot CLI tests (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15685">#15685</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/0173738eef808a9b2f492a0b966e3f70e8584e21"><code>0173738</code></a>
[red-knot] Port comprehension tests to Markdown (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15688">#15688</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/05ea77b1d4d1863e6436101cf877fbf265e966f4"><code>05ea77b</code></a>
Create Unknown rule diagnostics with a source range (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15648">#15648</a>)</li>
<li><a
href="https://github.com/astral-sh/ruff/commit/1e790d3885919826e2cff2fbf6ddb31554714050"><code>1e790d3</code></a>
[red-knot] Port 'deferred annotations' unit tests to Markdown (<a
href="https://redirect.github.com/astral-sh/ruff/issues/15686">#15686</a>)</li>
<li>Additional commits viewable in <a
href="https://github.com/astral-sh/ruff/compare/0.9.2...0.9.3">compare
view</a></li>
</ul>
</details>
<br />


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=ruff&package-manager=pip&previous-version=0.9.2&new-version=0.9.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)


</details>

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 requirements-lintrunner.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt
index 4aef423f4b..2ca562e5f5 100644
--- a/requirements-lintrunner.txt
+++ b/requirements-lintrunner.txt
@@ -3,6 +3,6 @@
 lintrunner==0.12.5
 lintrunner-adapters==0.12.4
 # RUFF
-ruff==0.9.2
+ruff==0.9.3
 # CLANGFORMAT
 clang-format==19.1.7

From fdde2e25e138e00b61faea158b4650f58331433a Mon Sep 17 00:00:00 2001
From: Malik Shahzad Muzaffar <shahzad.malik.muzaffar@cern.ch>
Date: Mon, 27 Jan 2025 22:35:18 +0100
Subject: [PATCH 23/37] Fix for gcc 13.3.1: Avoid creating a copy (#23500)

### Description
This change avoids copying the loop variable on each iteration. GCC 13.3
suggests using a reference type to prevent the copy.


### Motivation and Context
While building onnxruntime 1.20.1 with the latest changes from GCC 13.3, I
get a build error like:
```
onnxruntime-1.20.1/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc: In function 'onnxruntime::common::Status onnxruntime::MatchAndProcess(Graph&, const GraphViewer&, Node&, bool&, const logging::Logger&, const std::string&, const SelectorActionRegistry&, const SatRuntimeOptimizationSaveContext*)':
onnxruntime-1.20.1/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc:150:23: error: loop variable 'op_schema' creates a copy from type 'const gsl::not_null<const onnx::OpSchema*>' [-Werror=range-loop-construct]
  150 |       for (const auto op_schema : action_saved_state.produced_node_op_schemas) {
      |                       ^~~~~~~~~
onnxruntime-1.20.1/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc:150:23: note: use reference type to prevent copying
  150 |       for (const auto op_schema : action_saved_state.produced_node_op_schemas) {
      |                       ^~~~~~~~~
      |                       &
```
---
 .../optimizer/selectors_actions/selector_action_transformer.cc  | 2 +-
 onnxruntime/core/session/inference_session.cc                   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc b/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc
index b68cbaf85b..b1d6c51f69 100644
--- a/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc
+++ b/onnxruntime/core/optimizer/selectors_actions/selector_action_transformer.cc
@@ -147,7 +147,7 @@ static Status MatchAndProcess(
       RuntimeOptimizationRecord::ProducedOpIdVector produced_op_ids{};
       produced_op_ids.reserve(action_saved_state.produced_node_op_schemas.size());
 
-      for (const auto op_schema : action_saved_state.produced_node_op_schemas) {
+      for (const auto& op_schema : action_saved_state.produced_node_op_schemas) {
         produced_op_ids.push_back(utils::MakeOpId(*op_schema));
         if (save_context->record_produced_node_op_schema) {
           status = save_context->record_produced_node_op_schema(*op_schema);
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 223eed2488..26ffeb93ab 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -921,7 +921,7 @@ common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& fi
   ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterGraphNodeOpSchemas(model_->MainGraph()));
   ORT_RETURN_IF_ERROR(standalone::RegisterCustomOpNodeSchemas(kernel_type_str_resolver, model_->MainGraph()));
 
-  for (const auto op_schema : saved_runtime_optimization_produced_node_op_schemas_) {
+  for (const auto& op_schema : saved_runtime_optimization_produced_node_op_schemas_) {
     ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterOpSchema(*op_schema));
   }
 

From 8db97a68f2629aa32a3ab318e741555f72151aca Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Mon, 27 Jan 2025 14:02:06 -0800
Subject: [PATCH 24/37] [webgpu] Bump version of Dawn to b9b4a370 (#23494)

### Description

This PR updates the version of Dawn to
`b9b4a37041dec3dd62ac92014a6cc1aece48d9f3` (ref:
[chromium](https://chromium.googlesource.com/chromium/src.git/+/67f86f01ddb0e5cbdac4a050c17c468deb740c6c/DEPS#399))
in the `deps.txt` file.

The newer version of Dawn already includes the changes from dawn.patch,
so the patch file can be removed.

There are a few small interface changes, and the code is updated accordingly.
---
 cgmanifests/generated/cgmanifest.json         |   2 +-
 cmake/deps.txt                                |   2 +-
 .../external/onnxruntime_external_deps.cmake  |   2 +-
 cmake/patches/dawn/dawn.patch                 | 118 ------------------
 .../core/providers/webgpu/program_manager.cc  |  12 +-
 .../core/providers/webgpu/webgpu_context.cc   |   8 +-
 .../templates/download-deps.yml               |   4 +-
 7 files changed, 15 insertions(+), 133 deletions(-)
 delete mode 100644 cmake/patches/dawn/dawn.patch

diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
index 044588c080..46f8c8891d 100644
--- a/cgmanifests/generated/cgmanifest.json
+++ b/cgmanifests/generated/cgmanifest.json
@@ -346,7 +346,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "12a3b24c456cebd9fd11f23ac0164f78129b00c6",
+          "commitHash": "b9b4a37041dec3dd62ac92014a6cc1aece48d9f3",
           "repositoryUrl": "https://github.com/google/dawn.git"
         },
         "comments": "dawn"
diff --git a/cmake/deps.txt b/cmake/deps.txt
index d1a528bd6b..c73d9a4e35 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -58,5 +58,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/f3f6caa6e
 composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/204da9c522cebec5220bba52cd3542ebcaf99e7a.zip;1827348efd47831c13074245274d41b7cae8a557
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
 cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
-dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43
+dawn;https://github.com/google/dawn/archive/b9b4a37041dec3dd62ac92014a6cc1aece48d9f3.zip;e8b8c2ebabdedb7c57d931fc4a19ae22146d31e1
 kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/d15722976120710080ca098fe8ddabf4556cb40f/kleidiai-d15722976120710080ca098fe8ddabf4556cb40f.zip;d6c840d00c3b05aedf06e957ddaece1013d1f40b
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 761ce47582..e956569698 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -631,7 +631,7 @@ if (onnxruntime_USE_WEBGPU)
       URL_HASH SHA1=${DEP_SHA1_dawn}
       # All previous patches are merged into the upstream dawn project. We don't need to apply any patches right now.
       # if we need to apply patches in the future, we can uncomment the following line.
-      PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
+      # PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
     )
   endif()
 
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
deleted file mode 100644
index 868db0c43e..0000000000
--- a/cmake/patches/dawn/dawn.patch
+++ /dev/null
@@ -1,118 +0,0 @@
-diff --git a/src/emdawnwebgpu/CMakeLists.txt b/src/emdawnwebgpu/CMakeLists.txt
-index de673537d3..c98dc46de7 100644
---- a/src/emdawnwebgpu/CMakeLists.txt
-+++ b/src/emdawnwebgpu/CMakeLists.txt
-@@ -78,6 +78,7 @@ if (${DAWN_ENABLE_EMSCRIPTEN})
-         endif()
- 
-         set(ARGS
-+            ${Python3_EXECUTABLE}
-             "${DAWN_EMSCRIPTEN_TOOLCHAIN}/tools/maint/gen_struct_info.py"
-             -q
-             "${EM_BUILD_GEN_DIR}/struct_info_webgpu.json"
-diff --git a/third_party/emdawnwebgpu/library_webgpu.js b/third_party/emdawnwebgpu/library_webgpu.js
-index d1835cb090..df03ea2f94 100644
---- a/third_party/emdawnwebgpu/library_webgpu.js
-+++ b/third_party/emdawnwebgpu/library_webgpu.js
-@@ -16,10 +16,19 @@
-     throw new Error("To use Dawn's library_webgpu.js, disable -sUSE_WEBGPU and first include Dawn's library_webgpu_struct_info.js and library_webgpu_enum_tables.js (before library_webgpu.js)");
-   }
- 
-+  if (MEMORY64) {
-+    throw new Error("The current implementation of Dawn's library_webgpu.js does not support MEMORY64 yet");
-+  }
-+
-   // Helper functions for code generation
-   globalThis.gpu = {
--    convertSentinelToUndefined: function(name) {
--      return `if (${name} == -1) ${name} = undefined;`;
-+    convertSentinelToUndefined: function(name, isPtr = false) {
-+      // When `CAN_ADDRESS_2GB` is true, value `-1` is normalized to `0xFFFFFFFF` for pointer.
-+      if (CAN_ADDRESS_2GB && isPtr) {
-+        return `if (${name} == 0xFFFFFFFF) ${name} = undefined;`;
-+      } else {
-+        return `if (${name} == -1) ${name} = undefined;`;
-+      }
-     },
- 
-     makeGetBool: function(struct, offset) {
-@@ -700,6 +709,7 @@ var LibraryWebGPU = {
-     {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.adapterType, 'adapterType', 'i32') }}};
-     {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.vendorID, '0', 'i32') }}};
-     {{{ makeSetValue('info', C_STRUCTS.WGPUAdapterInfo.deviceID, '0', 'i32') }}};
-+    return 1;
-   },
- 
-   wgpuAdapterGetLimits: (adapterPtr, limitsOutPtr) => {
-@@ -882,7 +892,7 @@ var LibraryWebGPU = {
- 
-     if (size === 0) warnOnce('getMappedRange size=0 no longer means WGPU_WHOLE_MAP_SIZE');
- 
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
- 
-     var mapped;
-     try {
-@@ -909,7 +919,7 @@ var LibraryWebGPU = {
- 
-     if (size === 0) warnOnce('getMappedRange size=0 no longer means WGPU_WHOLE_MAP_SIZE');
- 
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
- 
-     var mapped;
-     try {
-@@ -950,7 +960,7 @@ var LibraryWebGPU = {
-     var buffer = WebGPU.getJsObject(bufferPtr);
-     WebGPU.Internals.bufferOnUnmaps[bufferPtr] = [];
- 
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
- 
-     {{{ runtimeKeepalivePush() }}}
-     WebGPU.Internals.futureInsert(futureId, buffer.mapAsync(mode, offset, size).then(() => {
-@@ -1145,7 +1155,7 @@ var LibraryWebGPU = {
- 
-   wgpuCommandEncoderClearBuffer: (encoderPtr, bufferPtr, offset, size) => {
-     var commandEncoder = WebGPU.getJsObject(encoderPtr);
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
- 
-     var buffer = WebGPU.getJsObject(bufferPtr);
-     commandEncoder.clearBuffer(buffer, offset, size);
-@@ -2103,7 +2113,7 @@ var LibraryWebGPU = {
-   wgpuRenderBundleEncoderSetIndexBuffer: (passPtr, bufferPtr, format, offset, size) => {
-     var pass = WebGPU.getJsObject(passPtr);
-     var buffer = WebGPU.getJsObject(bufferPtr);
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
-     pass.setIndexBuffer(buffer, WebGPU.IndexFormat[format], offset, size);
-   },
- 
-@@ -2116,7 +2126,7 @@ var LibraryWebGPU = {
-   wgpuRenderBundleEncoderSetVertexBuffer: (passPtr, slot, bufferPtr, offset, size) => {
-     var pass = WebGPU.getJsObject(passPtr);
-     var buffer = WebGPU.getJsObject(bufferPtr);
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
-     pass.setVertexBuffer(slot, buffer, offset, size);
-   },
- 
-@@ -2211,7 +2221,7 @@ var LibraryWebGPU = {
-   wgpuRenderPassEncoderSetIndexBuffer: (passPtr, bufferPtr, format, offset, size) => {
-     var pass = WebGPU.getJsObject(passPtr);
-     var buffer = WebGPU.getJsObject(bufferPtr);
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
-     pass.setIndexBuffer(buffer, WebGPU.IndexFormat[format], offset, size);
-   },
- 
-@@ -2234,7 +2244,7 @@ var LibraryWebGPU = {
-   wgpuRenderPassEncoderSetVertexBuffer: (passPtr, slot, bufferPtr, offset, size) => {
-     var pass = WebGPU.getJsObject(passPtr);
-     var buffer = WebGPU.getJsObject(bufferPtr);
--    {{{ gpu.convertSentinelToUndefined('size') }}}
-+    {{{ gpu.convertSentinelToUndefined('size', true) }}}
-     pass.setVertexBuffer(slot, buffer, offset, size);
-   },
- 
diff --git a/onnxruntime/core/providers/webgpu/program_manager.cc b/onnxruntime/core/providers/webgpu/program_manager.cc
index 109bac34d6..1fdd312d4f 100644
--- a/onnxruntime/core/providers/webgpu/program_manager.cc
+++ b/onnxruntime/core/providers/webgpu/program_manager.cc
@@ -147,16 +147,16 @@ Status ProgramManager::Build(const ProgramBase& program,
     }
   }
 
-  wgpu::ProgrammableStageDescriptor compute_stage{};
-  compute_stage.module = shader_module;
-  compute_stage.entryPoint = "main";
+  wgpu::ComputeState compute_state{};
+  compute_state.module = shader_module;
+  compute_state.entryPoint = "main";
   if (!constant_entries.empty()) {
-    compute_stage.constants = constant_entries.data();
-    compute_stage.constantCount = constant_entries.size();
+    compute_state.constants = constant_entries.data();
+    compute_state.constantCount = constant_entries.size();
   }
 
   wgpu::ComputePipelineDescriptor pipeline_descriptor{};
-  pipeline_descriptor.compute = compute_stage;
+  pipeline_descriptor.compute = compute_state;
 #ifndef NDEBUG  // if debug build
   pipeline_descriptor.label = program.Name().c_str();
 #endif
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index f7d9420701..99a645878c 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -115,12 +115,12 @@ void WebGpuContext::Initialize(const WebGpuBufferCacheConfig& buffer_cache_confi
       device_desc.requiredLimits = &required_limits;
 
       // TODO: revise temporary error handling
-      device_desc.SetUncapturedErrorCallback([](const wgpu::Device& /*device*/, wgpu::ErrorType type, const char* message) {
-        LOGS_DEFAULT(ERROR) << "WebGPU device error(" << int(type) << "): " << message;
+      device_desc.SetUncapturedErrorCallback([](const wgpu::Device& /*device*/, wgpu::ErrorType type, wgpu::StringView message) {
+        LOGS_DEFAULT(ERROR) << "WebGPU device error(" << int(type) << "): " << std::string_view{message};
       });
       // TODO: revise temporary device lost handling
-      device_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& /*device*/, wgpu::DeviceLostReason reason, const char* message) {
-        LOGS_DEFAULT(INFO) << "WebGPU device lost (" << int(reason) << "): " << message;
+      device_desc.SetDeviceLostCallback(wgpu::CallbackMode::AllowSpontaneous, [](const wgpu::Device& /*device*/, wgpu::DeviceLostReason reason, wgpu::StringView message) {
+        LOGS_DEFAULT(INFO) << "WebGPU device lost (" << int(reason) << "): " << std::string_view{message};
       });
 
       ORT_ENFORCE(wgpu::WaitStatus::Success == instance_.WaitAny(adapter_.RequestDevice(
diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
index 1a53ce6a42..fe3bc60c83 100644
--- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml
@@ -11,7 +11,7 @@ steps:
       packageType: upack
       feed: '/7424c8e4-5c62-490e-95c4-79446f31017c'
       definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0'
-      version: 1.0.208
+      version: 1.0.213
       downloadPath: $(Build.BinariesDirectory)/deps
 
 # The private ADO project
@@ -22,7 +22,7 @@ steps:
       packageType: upack
       feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325'
       definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a'
-      version: 1.0.208
+      version: 1.0.213
       downloadPath: $(Build.BinariesDirectory)/deps
 
 # You can add more ADO accounts at here.

From ded8730d6e9bf1c5efeb2fec1ef254651ca5b38f Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Date: Tue, 28 Jan 2025 08:58:18 -0800
Subject: [PATCH 25/37] Remove thrust::unary_function (#23506)

### Description
<!-- Describe your changes. -->
Remove thrust::unary_function which is deprecated in later versions of
CUDA.

### Motivation and Context
Addresses issue: https://github.com/microsoft/onnxruntime/issues/23499
---
 onnxruntime/core/providers/cuda/tensor/compress_impl.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/tensor/compress_impl.cu b/onnxruntime/core/providers/cuda/tensor/compress_impl.cu
index b2c7b60866..0c04e027ca 100644
--- a/onnxruntime/core/providers/cuda/tensor/compress_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/compress_impl.cu
@@ -13,7 +13,6 @@
 
 #include "core/providers/cuda/tensor/compress_impl.h"
 
-#include <thrust/functional.h>
 #include <thrust/iterator/transform_iterator.h>
 
 namespace onnxruntime {
@@ -23,7 +22,7 @@ namespace cuda {
 // in InclusiveSum(). By default, the accumulator type matches the input, but for int8_t
 // the sum overflows quickly, so we want the source type to match the output (int32_t).
 // see https://github.com/NVIDIA/cub/issues/384
-struct CastToInt32 : public thrust::unary_function<int8_t, int32_t> {
+struct CastToInt32 {
   __host__ __device__ int32_t operator()(int8_t v) const {
     return static_cast<int32_t>(v);
   }

From d2c5e2474c799aa4b7c7ab309ec615a326dca783 Mon Sep 17 00:00:00 2001
From: Corentin Maravat <101636442+cocotdf@users.noreply.github.com>
Date: Tue, 28 Jan 2025 18:00:01 +0100
Subject: [PATCH 26/37] Add GlobalMaxPool Gradient (#23502)

### Description
Added gradient computation support for the GlobalMaxPool node.


### Motivation and Context
Improve the training capabilities of ONNX Runtime.
---
 .../core/graph/gradient_builder.cc            | 18 +++++++++++++++
 .../orttraining/core/graph/gradient_builder.h |  1 +
 .../core/graph/gradient_builder_registry.cc   |  1 +
 .../test/gradient/gradient_ops_test.cc        | 23 +++++++++++++++++++
 4 files changed, 43 insertions(+)

diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc
index 43835f07c4..1ba52ca9e5 100755
--- a/orttraining/orttraining/core/graph/gradient_builder.cc
+++ b/orttraining/orttraining/core/graph/gradient_builder.cc
@@ -2239,5 +2239,23 @@ IMPLEMENT_GRADIENT_BUILDER(GetAtanGradient) {
   return result;
 }
 
+IMPLEMENT_GRADIENT_BUILDER(GetGlobalMaxPoolGradient) {
+  // For GlobalMaxPool's gradient, a binary mask flags max elements.
+  // We multiply that mask by the incoming gradient, passing gradients only to maxima.
+  std::vector<NodeDef> result;
+  result.push_back(NodeDef("Shape", {I(0)}, {IA("X_shape")}));
+  result.push_back(NodeDef("Expand", {O(0), IA("X_shape")}, {IA("expanded_Y")}));
+  result.push_back(NodeDef("Equal", {I(0), IA("expanded_Y")}, {IA("mask")}));
+  result.push_back(NodeDef("Cast",
+                           {IA("mask")},
+                           {IA("mask_cast")},
+                           {MakeAttribute("to", static_cast<int64_t>(IElemType(0)))}));
+
+  result.push_back(NodeDef("Expand", {GO(0), IA("X_shape")}, {IA("expanded_dY")}));
+  result.push_back(NodeDef("Mul", {IA("mask_cast"), IA("expanded_dY")}, {GI(0)}));
+
+  return result;
+}
+
 }  // namespace training
 }  // namespace onnxruntime
diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h
index 2b40754b62..2611e742f3 100755
--- a/orttraining/orttraining/core/graph/gradient_builder.h
+++ b/orttraining/orttraining/core/graph/gradient_builder.h
@@ -94,6 +94,7 @@ DECLARE_GRADIENT_BUILDER(GetLeakyReluGradient)
 DECLARE_GRADIENT_BUILDER(GetConvTransposeGradient)
 DECLARE_GRADIENT_BUILDER(GetResizeGradient)
 DECLARE_GRADIENT_BUILDER(GetAtanGradient)
+DECLARE_GRADIENT_BUILDER(GetGlobalMaxPoolGradient)
 
 DECLARE_GRADIENT_BUILDER(GetExternalGradient)
 
diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc
index 9c9884c5d3..a04d909267 100755
--- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc
+++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc
@@ -126,6 +126,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() {
   REGISTER_GRADIENT_BUILDER("ConvTranspose", GetConvTransposeGradient);
   REGISTER_GRADIENT_BUILDER("Resize", GetResizeGradient);
   REGISTER_GRADIENT_BUILDER("Atan", GetAtanGradient);
+  REGISTER_GRADIENT_BUILDER("GlobalMaxPool", GetGlobalMaxPoolGradient);
 
   REGISTER_GRADIENT_BUILDER("ExternalGradient", GetExternalGradient);
 };
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
index b683010a72..f4083d5b8f 100644
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -3356,6 +3356,29 @@ TEST(GradientCheckerTest, ResizeGrad) {
 
 TEST(GradientCheckerTest, AtanGrad) { UnaryOpGradientTest("Atan"); }
 
+TEST(GradientCheckerTest, GlobalMaxPoolGrad) {
+  float max_error;
+  GradientChecker<float, float, float> gradient_checker;
+  OpDef op_def{"GlobalMaxPool", kOnnxDomain, 11};
+  constexpr float error_tolerance = 1e-3f;
+
+  // globalmaxpool
+  {
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 3, 5, 5}}, {{2, 3, 1, 1}}, &max_error, {},
+                                                           /*check_not_have_gradient*/ true,
+                                                           /*check_not_have_shape_inferencing*/ true));
+    EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
+  }
+
+  // globalmaxpool_precomputed
+  {
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {{2, 1, 3, 3}}, {{2, 1, 1, 1}}, &max_error, {},
+                                                           /*check_not_have_gradient*/ true,
+                                                           /*check_not_have_shape_inferencing*/ true));
+    EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
+  }
+}
+
 }  // namespace test
 }  // namespace onnxruntime
 

From 1cf0ebd4cc168c6cbb97c0232e52e6e9a1f63334 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Tue, 28 Jan 2025 09:11:12 -0800
Subject: [PATCH 27/37] Delete Prefast workflow until the build failure is
 fixed (#23510)

### Description
Delete Prefast workflow until the build failure is fixed


### Motivation and Context
Right now the pipelines are failing due to an environment change from
GitHub.
---
 .github/workflows/sca.yml     | 177 ----------------------------------
 .github/workflows/windows.yml |   2 +-
 2 files changed, 1 insertion(+), 178 deletions(-)
 delete mode 100644 .github/workflows/sca.yml

diff --git a/.github/workflows/sca.yml b/.github/workflows/sca.yml
deleted file mode 100644
index 51166293f0..0000000000
--- a/.github/workflows/sca.yml
+++ /dev/null
@@ -1,177 +0,0 @@
-name: Windows_SCA
-on:
-  push:
-    branches:
-      - main
-      - rel-*
-  pull_request:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-env:
-  AZCOPY_AUTO_LOGIN_TYPE: MSI
-  AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
-
-jobs:
-  Onnxruntime-SCA-training-CUDA:
-    permissions:
-      security-events: write
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: false
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11.x'
-          architecture: 'x64'
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Download cuda
-        run: azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/cuda_sdk/v11.8" cuda_sdk
-
-
-      - name: Install ONNX
-        run: |
-          &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug
-
-      # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter.
-      - name: Build code
-        env:
-           CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake'
-        run: python tools\ci_build\build.py --windows_sdk_version 10.0.22621.0 --enable_training --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --enable_pybind --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --use_cuda --cuda_home=${{ github.workspace }}\cuda_sdk\v11.8 --use_binskim_compliant_compile_flags --enable_cuda_profiling  --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75
-        
-      - name: Generate sarif
-        working-directory: D:\b
-        run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output
-
-      - name: Upload SARIF to GitHub
-        uses: github/codeql-action/upload-sarif@v3
-        continue-on-error: true
-        with:
-          sarif_file: ${{ github.workspace }}\output\MergeResult.sarif
-          category: VS_SCA
-
-  # With WebGPU, Without python
-  Onnxruntime-SCA-win32-WebGPU-x64:
-    permissions:
-      security-events: write
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: false
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11.x'
-          architecture: 'x64'
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Delete build folder
-        run: |
-          if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b }
-
-
-      - name: Build code
-        env:
-           CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake'
-        run:  python tools\ci_build\build.py --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --use_webgpu
-        
-      - name: Generate sarif
-        working-directory: D:\b
-        run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output
-
-      - name: Upload SARIF to GitHub
-        uses: github/codeql-action/upload-sarif@v3
-        continue-on-error: true
-        with:
-          sarif_file: ${{ github.workspace }}\output\MergeResult.sarif
-          category: VS_SCA_WIN32_WEBGPU_X64
-
-  # No python
-  Onnxruntime-SCA-win32-WINML-x64:
-    permissions:
-      security-events: write
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: false
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11.x'
-          architecture: 'x64'
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Delete build folder
-        run: |
-          if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b }
-          &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug
-
-      # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter.
-      - name: Build code
-        env:
-           CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake'
-        run:  python tools\ci_build\build.py --build_java --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib
-        
-      - name: Generate sarif
-        working-directory: D:\b
-        run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output
-
-      - name: Upload SARIF to GitHub
-        uses: github/codeql-action/upload-sarif@v3
-        continue-on-error: true
-        with:
-          sarif_file: ${{ github.workspace }}\output\MergeResult.sarif
-          category: VS_SCA_WIN32_WINML_X64
-
-  # No java, No python
-  Onnxruntime-SCA-win32-WINML-x86:
-    permissions:
-      security-events: write
-    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: false
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11.x'
-          architecture: 'x86'
-
-      - uses: actions/setup-node@v4
-        with:
-          node-version: 20
-
-      - name: Delete build folder
-        run: |
-          if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b }
-          &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x86 -install_prefix D:\b\Debug\installed -build_config Debug
-
-      # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter.
-      - name: Build code
-        env:
-           CAExcludePath: 'C:\Program Files;D:\b;${{ github.workspace }}\cmake'
-        run:  python tools\ci_build\build.py --compile_no_warning_as_error --config Debug --build_dir D:\b --skip_submodule_sync --build_csharp --update --build --parallel --cmake_generator "Visual Studio 17 2022" --build_shared_lib --cmake_extra_defines onnxruntime_USE_CUSTOM_STATIC_ANALYSIS_RULES=ON --cmake_extra_defines onnxruntime_ENABLE_STATIC_ANALYSIS=ON --cmake_extra_defines onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE=ON --ms_experimental --use_dml --use_winml --disable_rtti --enable_wcos --build_shared_lib
-        
-      - name: Generate sarif
-        working-directory: D:\b
-        run: npx @microsoft/sarif-multitool merge *.sarif --recurse --output-directory=${{ github.workspace }}\output --output-file=MergeResult.sarif --merge-runs && dir ${{ github.workspace }}\output
-
-      - name: Upload SARIF to GitHub
-        uses: github/codeql-action/upload-sarif@v3
-        continue-on-error: true
-        with:
-          sarif_file: ${{ github.workspace }}\output\MergeResult.sarif
-          category: VS_SCA_WIN32_WINML_X86
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
index 35bacb2b6b..5adfad6309 100644
--- a/.github/workflows/windows.yml
+++ b/.github/workflows/windows.yml
@@ -37,7 +37,7 @@ jobs:
       - name: Delete build folder
         run: |
           if (Test-Path D:\b) { Remove-Item -Recurse -Force D:\b }
-          &tools\ci_build\github\windows\install_third_party_deps.ps1 -cpu_arch x64 -install_prefix D:\b\Debug\installed -build_config Debug
+
 
       # The build machine doesn't have a GPU. So the value of CMAKE_CUDA_ARCHITECTURES doesn't matter.
       - name: Build code

From a770a8dec821a761f28052e8193ff59e1fdaf6b6 Mon Sep 17 00:00:00 2001
From: Jian Chen <cjian@microsoft.com>
Date: Tue, 28 Jan 2025 12:53:13 -0500
Subject: [PATCH 28/37] Update RN to 0.71.19 (#23381)

### Description
<!-- Describe your changes. -->

Upgrading RN to 0.71.19, including Android and iOS changes. This PR
also includes the E2E test changes.

Used React-Native upgrade
[helper](https://react-native-community.github.io/upgrade-helper/?from=0.70.15&to=0.71.19&package=onnxruntime-android&name=onnxruntime)
as the reference.


### Motivation and Context
Need newer RN version to fix S360 work items.
---
 js/react_native/android/build.gradle          |  11 +-
 js/react_native/android/gradle.properties     |   2 +-
 .../android/src/main/AndroidManifestNew.xml   |   2 +
 js/react_native/e2e/android/app/build.gradle  | 175 ++++++++----------
 .../ReactNativeFlipper.java                   |  49 +++--
 .../android/app/src/main/AndroidManifest.xml  |   3 +-
 js/react_native/e2e/android/build.gradle      |   4 +-
 js/react_native/e2e/android/gradle.properties |   6 +-
 js/react_native/e2e/android/settings.gradle   |   1 +
 .../project.pbxproj                           |  88 ++-------
 js/react_native/e2e/ios/Podfile               |  12 +-
 js/react_native/e2e/ios/PrivacyInfo.xcprivacy |  37 ++++
 js/react_native/e2e/package.json              |  28 ++-
 .../project.pbxproj                           | 168 +++++++++--------
 js/react_native/ios/Podfile                   |  31 ++--
 js/react_native/ios/PrivacyInfo.xcprivacy     |  38 ++++
 js/react_native/package.json                  |  10 +-
 17 files changed, 347 insertions(+), 318 deletions(-)
 create mode 100644 js/react_native/android/src/main/AndroidManifestNew.xml
 create mode 100644 js/react_native/e2e/ios/PrivacyInfo.xcprivacy
 create mode 100644 js/react_native/ios/PrivacyInfo.xcprivacy

diff --git a/js/react_native/android/build.gradle b/js/react_native/android/build.gradle
index df5792d274..2f5b5adc7a 100644
--- a/js/react_native/android/build.gradle
+++ b/js/react_native/android/build.gradle
@@ -70,6 +70,8 @@ def REACT_NATIVE_VERSION = ['node', '--print', "JSON.parse(require('fs').readFil
 def REACT_NATIVE_MINOR_VERSION = REACT_NATIVE_VERSION.split("\\.")[1].toInteger()
 
 android {
+// This is needed by the new AndroidManifestNew.xml
+  namespace "ai.onnxruntime.reactnative"
   compileSdkVersion getExtOrIntegerDefault('compileSdkVersion')
   buildToolsVersion getExtOrDefault('buildToolsVersion')
   defaultConfig {
@@ -110,6 +112,8 @@ android {
   }
 
   packagingOptions {
+    pickFirst '**/libc++_shared.so'
+    pickFirst '**/libfbjni.so'
     doNotStrip resolveBuildType() == 'debug' ? "**/**/*.so" : ''
     excludes = [
       "META-INF",
@@ -134,6 +138,8 @@ android {
   sourceSets {
     main {
       java.srcDirs = ['src/main/java/']
+//       A tricky situation where iOS still uses the AndroidManifest.xml file, but the Android use AndroidManifestNew.xml
+      manifest.srcFile "src/main/AndroidManifestNew.xml"
       if (ortExtensionsEnabled) {
         java.exclude '**/OnnxruntimeExtensionsDisabled.java'
       } else {
@@ -218,7 +224,8 @@ repositories {
 }
 
 dependencies {
-  api "com.facebook.react:react-native:" + REACT_NATIVE_VERSION
+  //noinspection GradleDynamicVersion
+  implementation "com.facebook.react:react-android:"+ REACT_NATIVE_VERSION
   api "org.mockito:mockito-core:2.28.2"
 
   androidTestImplementation "androidx.test:runner:1.5.2"
@@ -233,4 +240,4 @@ dependencies {
   if (ortExtensionsEnabled) {
     implementation "com.microsoft.onnxruntime:onnxruntime-extensions-android:latest.integration@aar"
   }
-}
+}
\ No newline at end of file
diff --git a/js/react_native/android/gradle.properties b/js/react_native/android/gradle.properties
index 5f8bcd9931..3461ce4919 100644
--- a/js/react_native/android/gradle.properties
+++ b/js/react_native/android/gradle.properties
@@ -13,7 +13,7 @@ org.gradle.jvmargs=-Xmx4096m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF
 #Tue Jan 26 17:36:02 PST 2021
 android.enableJetifier=true
 android.useAndroidX=true
-OnnxruntimeModule_buildToolsVersion=29.0.2
+OnnxruntimeModule_buildToolsVersion=33.0.0
 OnnxruntimeModule_compileSdkVersion=34
 OnnxruntimeModule_minSdkVersion=24
 OnnxruntimeModule_targetSdkVersion=34
diff --git a/js/react_native/android/src/main/AndroidManifestNew.xml b/js/react_native/android/src/main/AndroidManifestNew.xml
new file mode 100644
index 0000000000..a30cbbdd6f
--- /dev/null
+++ b/js/react_native/android/src/main/AndroidManifestNew.xml
@@ -0,0 +1,2 @@
+<manifest xmlns:android="http://schemas.android.com/apk/res/android" >
+</manifest>
diff --git a/js/react_native/e2e/android/app/build.gradle b/js/react_native/e2e/android/app/build.gradle
index 526259e3f8..68eaacc190 100644
--- a/js/react_native/e2e/android/app/build.gradle
+++ b/js/react_native/e2e/android/app/build.gradle
@@ -1,132 +1,105 @@
 apply plugin: "com.android.application"
+apply plugin: "com.facebook.react"
 
 import com.android.build.OutputFile
 
 /**
- * The react.gradle file registers a task for each build variant (e.g. bundleDebugJsAndAssets
- * and bundleReleaseJsAndAssets).
- * These basically call `react-native bundle` with the correct arguments during the Android build
- * cycle. By default, bundleDebugJsAndAssets is skipped, as in debug/dev mode we prefer to load the
- * bundle directly from the development server. Below you can see all the possible configurations
- * and their defaults. If you decide to add a configuration block, make sure to add it before the
- * `apply from: "../../node_modules/react-native/react.gradle"` line.
- *
- * project.ext.react = [
- *   // the name of the generated asset file containing your JS bundle
- *   bundleAssetName: "index.android.bundle",
- *
- *   // the entry file for bundle generation
- *   entryFile: "index.android.js",
- *
- *   // https://reactnative.dev/docs/performance#enable-the-ram-format
- *   bundleCommand: "ram-bundle",
- *
- *   // whether to bundle JS and assets in debug mode
- *   bundleInDebug: false,
- *
- *   // whether to bundle JS and assets in release mode
- *   bundleInRelease: true,
- *
- *   // whether to bundle JS and assets in another build variant (if configured).
- *   // See http://tools.android.com/tech-docs/new-build-system/user-guide#TOC-Build-Variants
- *   // The configuration property can be in the following formats
- *   //         'bundleIn${productFlavor}${buildType}'
- *   //         'bundleIn${buildType}'
- *   // bundleInFreeDebug: true,
- *   // bundleInPaidRelease: true,
- *   // bundleInBeta: true,
- *
- *   // whether to disable dev mode in custom build variants (by default only disabled in release)
- *   // for OnnxruntimeModuleExample: to disable dev mode in the staging build type (if configured)
- *   devDisabledInStaging: true,
- *   // The configuration property can be in the following formats
- *   //         'devDisabledIn${productFlavor}${buildType}'
- *   //         'devDisabledIn${buildType}'
- *
- *   // the root of your project, i.e. where "package.json" lives
- *   root: "../../",
- *
- *   // where to put the JS bundle asset in debug mode
- *   jsBundleDirDebug: "$buildDir/intermediates/assets/debug",
- *
- *   // where to put the JS bundle asset in release mode
- *   jsBundleDirRelease: "$buildDir/intermediates/assets/release",
- *
- *   // where to put drawable resources / React Native assets, e.g. the ones you use via
- *   // require('./image.png')), in debug mode
- *   resourcesDirDebug: "$buildDir/intermediates/res/merged/debug",
- *
- *   // where to put drawable resources / React Native assets, e.g. the ones you use via
- *   // require('./image.png')), in release mode
- *   resourcesDirRelease: "$buildDir/intermediates/res/merged/release",
- *
- *   // by default the gradle tasks are skipped if none of the JS files or assets change; this means
- *   // that we don't look at files in android/ or ios/ to determine whether the tasks are up to
- *   // date; if you have any other folders that you want to ignore for performance reasons (gradle
- *   // indexes the entire tree), add them here. Alternatively, if you have JS files in android/
- *   // for OnnxruntimeModuleExample, you might want to remove it from here.
- *   inputExcludes: ["android/**", "ios/**"],
- *
- *   // override which node gets called and with what additional arguments
- *   nodeExecutableAndArgs: ["node"],
- *
- *   // supply additional arguments to the packager
- *   extraPackagerArgs: []
- * ]
+ * This is the configuration block to customize your React Native Android app.
+ * By default you don't need to apply any configuration, just uncomment the lines you need.
  */
+react {
+    /* Folders */
+    //   The root of your project, i.e. where "package.json" lives. Default is '..'
+    // root = file("../")
+    //   The folder where the react-native NPM package is. Default is ../node_modules/react-native
+    // reactNativeDir = file("../node_modules/react-native")
+    //   The folder where the react-native Codegen package is. Default is ../node_modules/react-native-codegen
+    // codegenDir = file("../node_modules/react-native-codegen")
+    //   The cli.js file which is the React Native CLI entrypoint. Default is ../node_modules/react-native/cli.js
+    // cliFile = file("../node_modules/react-native/cli.js")
 
-project.ext.react = [
-    enableHermes: false,  // clean and rebuild if changing
-    entryFile: "index.tsx",
-]
+    /* Variants */
+    //   The list of variants to that are debuggable. For those we're going to
+    //   skip the bundling of the JS bundle and the assets. By default is just 'debug'.
+    //   If you add flavors like lite, prod, etc. you'll have to list your debuggableVariants.
+    // debuggableVariants = ["liteDebug", "prodDebug"]
 
-apply from: "../../node_modules/react-native/react.gradle"
+    /* Bundling */
+    //   A list containing the node command and its flags. Default is just 'node'.
+    // nodeExecutableAndArgs = ["node"]
+    //
+    //   The command to run when bundling. By default is 'bundle'
+    // bundleCommand = "ram-bundle"
+    //
+    //   The path to the CLI configuration file. Default is empty.
+    // bundleConfig = file(../rn-cli.config.js)
+    //
+    //   The name of the generated asset file containing your JS bundle
+    // bundleAssetName = "MyApplication.android.bundle"
+    //
+    //   The entry file for bundle generation. Default is 'index.android.js' or 'index.js'
+    entryFile = file("${rootProject.projectDir}/../index.tsx")
+    //
+    //   A list of extra flags to pass to the 'bundle' commands.
+    //   See https://github.com/react-native-community/cli/blob/main/docs/commands.md#bundle
+    // extraPackagerArgs = []
+
+    /* Hermes Commands */
+    //   The hermes compiler command to run. By default it is 'hermesc'
+    // hermesCommand = "$rootDir/my-custom-hermesc/bin/hermesc"
+    //
+    //   The list of flags to pass to the Hermes compiler. By default is "-O", "-output-source-map"
+    // hermesFlags = ["-O", "-output-source-map"]
+}
 
 /**
- * Set this to true to create two separate APKs instead of one:
- *   - An APK that only works on ARM devices
- *   - An APK that only works on x86 devices
- * The advantage is the size of the APK is reduced by about 4MB.
- * Upload all the APKs to the Play Store and people will download
- * the correct one based on the CPU architecture of their device.
+ * Set this to true to create four separate APKs instead of one,
+ * one for each native architecture. This is useful if you don't
+ * use App Bundles (https://developer.android.com/guide/app-bundle/)
+ * and want to have separate APKs to upload to the Play Store.
  */
 def enableSeparateBuildPerCPUArchitecture = false
 
 /**
- * Run Proguard to shrink the Java bytecode in release builds.
+ * Set this to true to Run Proguard on Release builds to minify the Java bytecode.
  */
 def enableProguardInReleaseBuilds = false
 
 /**
- * The preferred build flavor of JavaScriptCore.
+ * The preferred build flavor of JavaScriptCore (JSC)
  *
- * For OnnxruntimeModuleExample, to use the international variant, you can use:
+ * For example, to use the international variant, you can use:
  * `def jscFlavor = 'org.webkit:android-jsc-intl:+'`
  *
  * The international variant includes ICU i18n library and necessary data
  * allowing to use e.g. `Date.toLocaleString` and `String.localeCompare` that
- * give correct results when using with locales other than en-US.  Note that
+ * give correct results when using with locales other than en-US. Note that
  * this variant is about 6MiB larger per architecture than default.
  */
 def jscFlavor = 'org.webkit:android-jsc:+'
 
 /**
- * Whether to enable the Hermes VM.
- *
- * This should be set on project.ext.react and mirrored here.  If it is not set
- * on project.ext.react, JavaScript will not be compiled to Hermes Bytecode
- * and the benefits of using Hermes will therefore be sharply reduced.
+ * Private function to get the list of Native Architectures you want to build.
+ * This reads the value from reactNativeArchitectures in your gradle.properties
+ * file and works together with the --active-arch-only flag of react-native run-android.
  */
-def enableHermes = project.ext.react.get("enableHermes", false);
+def reactNativeArchitectures() {
+    def value = project.getProperties().get("reactNativeArchitectures")
+    return value ? value.split(",") : ["armeabi-v7a", "x86", "x86_64", "arm64-v8a"]
+}
 
 android {
+
     compileSdkVersion rootProject.ext.compileSdkVersion
-
+		namespace "com.example.reactnativeonnxruntimemodule"
     compileOptions {
-        sourceCompatibility JavaVersion.VERSION_1_8
-        targetCompatibility JavaVersion.VERSION_1_8
+        sourceCompatibility JavaVersion.VERSION_17
+        targetCompatibility JavaVersion.VERSION_17
     }
-
+	  packagingOptions {
+	    pickFirst '**/libc++_shared.so'
+	    pickFirst '**/libfbjni.so'
+		}
     defaultConfig {
         applicationId "com.example.reactnativeonnxruntimemodule"
         minSdkVersion rootProject.ext.minSdkVersion
@@ -185,12 +158,12 @@ repositories {
 }
 
 dependencies {
-    androidTestImplementation('com.wix:detox:20.7.0')
+    androidTestImplementation('com.wix:detox:+')
     implementation 'androidx.appcompat:appcompat:1.1.0'
 
     implementation fileTree(dir: "libs", include: ["*.jar"])
-    //noinspection GradleDynamicVersion
-    implementation "com.facebook.react:react-native:+"  // From node_modules
+    // The version of react-native is set by the React Native Gradle Plugin
+    implementation("com.facebook.react:react-android")
 
     implementation "androidx.swiperefreshlayout:swiperefreshlayout:1.0.0"
     implementation 'androidx.test.ext:junit:1.1.5'
@@ -205,10 +178,8 @@ dependencies {
         exclude group:'com.facebook.flipper'
     }
 
-    if (enableHermes) {
-        def hermesPath = "../../node_modules/hermes-engine/android/";
-        debugImplementation files(hermesPath + "hermes-debug.aar")
-        releaseImplementation files(hermesPath + "hermes-release.aar")
+    if (hermesEnabled.toBoolean()) {
+        implementation("com.facebook.react:hermes-android")
     } else {
         implementation jscFlavor
     }
diff --git a/js/react_native/e2e/android/app/src/debug/java/com/example/reactnativeonnxruntimemodule/ReactNativeFlipper.java b/js/react_native/e2e/android/app/src/debug/java/com/example/reactnativeonnxruntimemodule/ReactNativeFlipper.java
index 5624fffa7f..3cacb2d5d4 100644
--- a/js/react_native/e2e/android/app/src/debug/java/com/example/reactnativeonnxruntimemodule/ReactNativeFlipper.java
+++ b/js/react_native/e2e/android/app/src/debug/java/com/example/reactnativeonnxruntimemodule/ReactNativeFlipper.java
@@ -1,10 +1,10 @@
 /**
- * Copyright (c) Facebook, Inc. and its affiliates.
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
  *
  * <p>This source code is licensed under the MIT license found in the LICENSE file in the root
  * directory of this source tree.
  */
-package com.example.reactnativeonnxruntimemodule;
+package com.reactnativeonnxruntimemodule;
 
 import android.content.Context;
 import com.facebook.flipper.android.AndroidFlipperClient;
@@ -17,50 +17,59 @@ import com.facebook.flipper.plugins.inspector.DescriptorMapping;
 import com.facebook.flipper.plugins.inspector.InspectorFlipperPlugin;
 import com.facebook.flipper.plugins.network.FlipperOkhttpInterceptor;
 import com.facebook.flipper.plugins.network.NetworkFlipperPlugin;
-import com.facebook.flipper.plugins.react.ReactFlipperPlugin;
 import com.facebook.flipper.plugins.sharedpreferences.SharedPreferencesFlipperPlugin;
+import com.facebook.react.ReactInstanceEventListener;
 import com.facebook.react.ReactInstanceManager;
 import com.facebook.react.bridge.ReactContext;
 import com.facebook.react.modules.network.NetworkingModule;
 import okhttp3.OkHttpClient;
 
+/**
+ * Class responsible for loading Flipper inside your React Native application. This is the debug
+ * flavor of it. Here you can add your own plugins and customize the Flipper setup.
+ */
 public class ReactNativeFlipper {
   public static void initializeFlipper(Context context, ReactInstanceManager reactInstanceManager) {
     if (FlipperUtils.shouldEnableFlipper(context)) {
       final FlipperClient client = AndroidFlipperClient.getInstance(context);
+
       client.addPlugin(new InspectorFlipperPlugin(context, DescriptorMapping.withDefaults()));
-      client.addPlugin(new ReactFlipperPlugin());
       client.addPlugin(new DatabasesFlipperPlugin(context));
       client.addPlugin(new SharedPreferencesFlipperPlugin(context));
       client.addPlugin(CrashReporterPlugin.getInstance());
+
       NetworkFlipperPlugin networkFlipperPlugin = new NetworkFlipperPlugin();
-      NetworkingModule.setCustomClientBuilder(new NetworkingModule.CustomClientBuilder() {
-        @Override
-        public void apply(OkHttpClient.Builder builder) {
-          builder.addNetworkInterceptor(new FlipperOkhttpInterceptor(networkFlipperPlugin));
-        }
-      });
+      NetworkingModule.setCustomClientBuilder(
+          new NetworkingModule.CustomClientBuilder() {
+            @Override
+            public void apply(OkHttpClient.Builder builder) {
+              builder.addNetworkInterceptor(new FlipperOkhttpInterceptor(networkFlipperPlugin));
+            }
+          });
       client.addPlugin(networkFlipperPlugin);
       client.start();
+
       // Fresco Plugin needs to ensure that ImagePipelineFactory is initialized
       // Hence we run if after all native modules have been initialized
       ReactContext reactContext = reactInstanceManager.getCurrentReactContext();
       if (reactContext == null) {
-        reactInstanceManager.addReactInstanceEventListener(new ReactInstanceManager.ReactInstanceEventListener() {
-          @Override
-          public void onReactContextInitialized(ReactContext reactContext) {
-            reactInstanceManager.removeReactInstanceEventListener(this);
-            reactContext.runOnNativeModulesQueueThread(new Runnable() {
+        reactInstanceManager.addReactInstanceEventListener(
+            new ReactInstanceEventListener() {
               @Override
-              public void run() {
-                client.addPlugin(new FrescoFlipperPlugin());
+              public void onReactContextInitialized(ReactContext reactContext) {
+                reactInstanceManager.removeReactInstanceEventListener(this);
+                reactContext.runOnNativeModulesQueueThread(
+                    new Runnable() {
+                      @Override
+                      public void run() {
+                        client.addPlugin(new FrescoFlipperPlugin());
+                      }
+                    });
               }
             });
-          }
-        });
       } else {
         client.addPlugin(new FrescoFlipperPlugin());
       }
     }
   }
-}
+}
\ No newline at end of file
diff --git a/js/react_native/e2e/android/app/src/main/AndroidManifest.xml b/js/react_native/e2e/android/app/src/main/AndroidManifest.xml
index 24e685b6ca..d219c7c18f 100644
--- a/js/react_native/e2e/android/app/src/main/AndroidManifest.xml
+++ b/js/react_native/e2e/android/app/src/main/AndroidManifest.xml
@@ -1,5 +1,4 @@
-<manifest xmlns:android="http://schemas.android.com/apk/res/android"
-  package="com.example.reactnativeonnxruntimemodule">
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
 
   <uses-permission android:name="android.permission.INTERNET" />
   <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" />
diff --git a/js/react_native/e2e/android/build.gradle b/js/react_native/e2e/android/build.gradle
index 9b953c0849..1178791f48 100644
--- a/js/react_native/e2e/android/build.gradle
+++ b/js/react_native/e2e/android/build.gradle
@@ -2,7 +2,7 @@
 
 buildscript {
     ext {
-        buildToolsVersion = "29.0.2"
+        buildToolsVersion = "33.0.0"
         minSdkVersion = 24
         compileSdkVersion = 34
         targetSdkVersion = 34
@@ -15,6 +15,8 @@ buildscript {
     dependencies {
         classpath('com.android.tools.build:gradle:7.2.1')
         classpath("org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlinVersion")
+        classpath("com.facebook.react:react-native-gradle-plugin")
+
         // NOTE: Do not place your application dependencies here; they belong
         // in the individual module build.gradle files
     }
diff --git a/js/react_native/e2e/android/gradle.properties b/js/react_native/e2e/android/gradle.properties
index 5c4f82a8fc..a2541138f1 100644
--- a/js/react_native/e2e/android/gradle.properties
+++ b/js/react_native/e2e/android/gradle.properties
@@ -19,5 +19,9 @@
 
 android.useAndroidX=true
 android.enableJetifier=true
-FLIPPER_VERSION=0.54.0
+FLIPPER_VERSION=0.125.0
 org.gradle.jvmargs=-Xmx4096M
+
+# Use this property to enable or disable the Hermes JS engine.
+# If set to false, you will be using JSC instead.
+hermesEnabled=false
diff --git a/js/react_native/e2e/android/settings.gradle b/js/react_native/e2e/android/settings.gradle
index fd02678d9b..b12a36f91d 100644
--- a/js/react_native/e2e/android/settings.gradle
+++ b/js/react_native/e2e/android/settings.gradle
@@ -1,3 +1,4 @@
 rootProject.name = 'OnnxruntimeModuleExample'
 apply from: file("../node_modules/@react-native-community/cli-platform-android/native_modules.gradle"); applyNativeModulesSettingsGradle(settings)
 include ':app'
+includeBuild('../node_modules/react-native-gradle-plugin')
\ No newline at end of file
diff --git a/js/react_native/e2e/ios/OnnxruntimeModuleExample.xcodeproj/project.pbxproj b/js/react_native/e2e/ios/OnnxruntimeModuleExample.xcodeproj/project.pbxproj
index a358f51984..b8c9d9ab90 100644
--- a/js/react_native/e2e/ios/OnnxruntimeModuleExample.xcodeproj/project.pbxproj
+++ b/js/react_native/e2e/ios/OnnxruntimeModuleExample.xcodeproj/project.pbxproj
@@ -10,12 +10,13 @@
 		13B07FBC1A68108700A75B9A /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 13B07FB01A68108700A75B9A /* AppDelegate.m */; };
 		13B07FBF1A68108700A75B9A /* Images.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 13B07FB51A68108700A75B9A /* Images.xcassets */; };
 		13B07FC11A68108700A75B9A /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 13B07FB71A68108700A75B9A /* main.m */; };
+		81411D106EB3E14586DBF352 /* libPods-OnnxruntimeModuleExample.a in Frameworks */ = {isa = PBXBuildFile; fileRef = A98DB3380F37BDA06AFF9005 /* libPods-OnnxruntimeModuleExample.a */; };
 		81AB9BB82411601600AC10FF /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 81AB9BB72411601600AC10FF /* LaunchScreen.storyboard */; };
-		863EC78A40302DD5B5E3AD76 /* Pods_OnnxruntimeModuleExample.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 1FBB91B17EC27D0D688A14CD /* Pods_OnnxruntimeModuleExample.framework */; };
 		DB61BA27278684FB0096C971 /* OnnxruntimeModuleExampleUITests.m in Sources */ = {isa = PBXBuildFile; fileRef = DB61BA26278684FB0096C971 /* OnnxruntimeModuleExampleUITests.m */; };
 		DBA8BA87267293C4008CC55A /* mnist.ort in Resources */ = {isa = PBXBuildFile; fileRef = DBA8BA86267293C4008CC55A /* mnist.ort */; };
 		DBBF7412263B8C7100487C77 /* MNISTDataHandler.mm in Sources */ = {isa = PBXBuildFile; fileRef = DBBF7411263B8C7100487C77 /* MNISTDataHandler.mm */; };
 		DBBF7414263B8CCB00487C77 /* 3.jpg in Resources */ = {isa = PBXBuildFile; fileRef = DBBF7413263B8CCB00487C77 /* 3.jpg */; };
+		E329E1162D3728940016B599 /* PrivacyInfo.xcprivacy in Resources */ = {isa = PBXBuildFile; fileRef = E329E1142D3728940016B599 /* PrivacyInfo.xcprivacy */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXContainerItemProxy section */
@@ -49,9 +50,9 @@
 		13B07FB51A68108700A75B9A /* Images.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; name = Images.xcassets; path = OnnxruntimeModuleExample/Images.xcassets; sourceTree = "<group>"; };
 		13B07FB61A68108700A75B9A /* Info.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = Info.plist; path = OnnxruntimeModuleExample/Info.plist; sourceTree = "<group>"; };
 		13B07FB71A68108700A75B9A /* main.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = main.m; path = OnnxruntimeModuleExample/main.m; sourceTree = "<group>"; };
-		1FBB91B17EC27D0D688A14CD /* Pods_OnnxruntimeModuleExample.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_OnnxruntimeModuleExample.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		81AB9BB72411601600AC10FF /* LaunchScreen.storyboard */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = file.storyboard; name = LaunchScreen.storyboard; path = OnnxruntimeModuleExample/LaunchScreen.storyboard; sourceTree = "<group>"; };
 		9D58C0FCCF00905433F4ED74 /* Pods-OnnxruntimeModuleExample.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OnnxruntimeModuleExample.debug.xcconfig"; path = "Target Support Files/Pods-OnnxruntimeModuleExample/Pods-OnnxruntimeModuleExample.debug.xcconfig"; sourceTree = "<group>"; };
+		A98DB3380F37BDA06AFF9005 /* libPods-OnnxruntimeModuleExample.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = "libPods-OnnxruntimeModuleExample.a"; sourceTree = BUILT_PRODUCTS_DIR; };
 		B70FCE6DFAB320E9051DA321 /* Pods-OnnxruntimeModuleExample.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OnnxruntimeModuleExample.release.xcconfig"; path = "Target Support Files/Pods-OnnxruntimeModuleExample/Pods-OnnxruntimeModuleExample.release.xcconfig"; sourceTree = "<group>"; };
 		DB61BA24278684FB0096C971 /* OnnxruntimeModuleExampleUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = OnnxruntimeModuleExampleUITests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
 		DB61BA26278684FB0096C971 /* OnnxruntimeModuleExampleUITests.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = OnnxruntimeModuleExampleUITests.m; sourceTree = "<group>"; };
@@ -59,6 +60,7 @@
 		DBBF7410263B8C5F00487C77 /* MNISTDataHandler.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MNISTDataHandler.h; sourceTree = "<group>"; };
 		DBBF7411263B8C7100487C77 /* MNISTDataHandler.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MNISTDataHandler.mm; sourceTree = "<group>"; };
 		DBBF7413263B8CCB00487C77 /* 3.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; name = 3.jpg; path = ../src/3.jpg; sourceTree = "<group>"; };
+		E329E1142D3728940016B599 /* PrivacyInfo.xcprivacy */ = {isa = PBXFileReference; lastKnownFileType = text.xml; path = PrivacyInfo.xcprivacy; sourceTree = "<group>"; };
 		ED297162215061F000B7C4FE /* JavaScriptCore.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = JavaScriptCore.framework; path = System/Library/Frameworks/JavaScriptCore.framework; sourceTree = SDKROOT; };
 /* End PBXFileReference section */
 
@@ -67,7 +69,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				863EC78A40302DD5B5E3AD76 /* Pods_OnnxruntimeModuleExample.framework in Frameworks */,
+				81411D106EB3E14586DBF352 /* libPods-OnnxruntimeModuleExample.a in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -84,6 +86,7 @@
 		13B07FAE1A68108700A75B9A /* OnnxruntimeModuleExample */ = {
 			isa = PBXGroup;
 			children = (
+				E329E1142D3728940016B599 /* PrivacyInfo.xcprivacy */,
 				DBBF7411263B8C7100487C77 /* MNISTDataHandler.mm */,
 				DBBF7410263B8C5F00487C77 /* MNISTDataHandler.h */,
 				008F07F21AC5B25A0029DE68 /* main.jsbundle */,
@@ -101,7 +104,7 @@
 			isa = PBXGroup;
 			children = (
 				ED297162215061F000B7C4FE /* JavaScriptCore.framework */,
-				1FBB91B17EC27D0D688A14CD /* Pods_OnnxruntimeModuleExample.framework */,
+				A98DB3380F37BDA06AFF9005 /* libPods-OnnxruntimeModuleExample.a */,
 			);
 			name = Frameworks;
 			sourceTree = "<group>";
@@ -170,7 +173,7 @@
 				13B07F8E1A680F5B00A75B9A /* Resources */,
 				00DD1BFF1BD5951E006B06BC /* Bundle React Native code and images */,
 				DB8FCD9C25C3404B00C72F26 /* Embed Libraries */,
-				8BCC39118F893614DE3809E3 /* [CP] Embed Pods Frameworks */,
+				9BBEFBEFBEE7FC814F312449 /* [CP] Copy Pods Resources */,
 			);
 			buildRules = (
 			);
@@ -244,6 +247,7 @@
 				DBA8BA87267293C4008CC55A /* mnist.ort in Resources */,
 				DBBF7414263B8CCB00487C77 /* 3.jpg in Resources */,
 				81AB9BB82411601600AC10FF /* LaunchScreen.storyboard in Resources */,
+				E329E1162D3728940016B599 /* PrivacyInfo.xcprivacy in Resources */,
 				13B07FBF1A68108700A75B9A /* Images.xcassets in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@@ -273,72 +277,22 @@
 			shellPath = /bin/sh;
 			shellScript = "if [ \"$CONFIGURATION\" == \"Release\" ]; then\n  export NODE_BINARY=$(which node)\n  export ENTRY_FILE=\"index.tsx\"\n  export EXTRA_PACKAGER_ARGS=\"--reset-cache\"\n\n  ../node_modules/react-native/scripts/react-native-xcode.sh\n  echo \"copying bundle file from $CONFIGURATION_BUILD_DIR/main.jsbundlecd  to $CONFIGURATION_BUILD_DIR/$UNLOCALIZED_RESOURCES_FOLDER_PATH/\"\n  echo \"This cp might not need it post 0.7.15 because it is a bug from facebook\" \n  cp $CONFIGURATION_BUILD_DIR/main.jsbundle $CONFIGURATION_BUILD_DIR/$UNLOCALIZED_RESOURCES_FOLDER_PATH/\nfi\n";
 		};
-		8BCC39118F893614DE3809E3 /* [CP] Embed Pods Frameworks */ = {
+		9BBEFBEFBEE7FC814F312449 /* [CP] Copy Pods Resources */ = {
 			isa = PBXShellScriptBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
 			);
 			inputPaths = (
-				"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleExample/Pods-OnnxruntimeModuleExample-frameworks.sh",
-				"${BUILT_PRODUCTS_DIR}/DoubleConversion/DoubleConversion.framework",
-				"${BUILT_PRODUCTS_DIR}/RCT-Folly/folly.framework",
-				"${BUILT_PRODUCTS_DIR}/RCTTypeSafety/RCTTypeSafety.framework",
-				"${BUILT_PRODUCTS_DIR}/RNFS/RNFS.framework",
-				"${BUILT_PRODUCTS_DIR}/React-Codegen/React_Codegen.framework",
-				"${BUILT_PRODUCTS_DIR}/React-Core/React.framework",
-				"${BUILT_PRODUCTS_DIR}/React-CoreModules/CoreModules.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTAnimation/RCTAnimation.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTBlob/RCTBlob.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTImage/RCTImage.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTLinking/RCTLinking.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTNetwork/RCTNetwork.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTSettings/RCTSettings.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTText/RCTText.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTVibration/RCTVibration.framework",
-				"${BUILT_PRODUCTS_DIR}/React-bridging/react_bridging.framework",
-				"${BUILT_PRODUCTS_DIR}/React-cxxreact/cxxreact.framework",
-				"${BUILT_PRODUCTS_DIR}/React-jsi/jsi.framework",
-				"${BUILT_PRODUCTS_DIR}/React-jsiexecutor/jsireact.framework",
-				"${BUILT_PRODUCTS_DIR}/React-jsinspector/jsinspector.framework",
-				"${BUILT_PRODUCTS_DIR}/React-logger/logger.framework",
-				"${BUILT_PRODUCTS_DIR}/React-perflogger/reactperflogger.framework",
-				"${BUILT_PRODUCTS_DIR}/ReactCommon/ReactCommon.framework",
-				"${BUILT_PRODUCTS_DIR}/Yoga/yoga.framework",
-				"${BUILT_PRODUCTS_DIR}/fmt/fmt.framework",
-				"${BUILT_PRODUCTS_DIR}/glog/glog.framework",
+				"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleExample/Pods-OnnxruntimeModuleExample-resources.sh",
+				"${PODS_CONFIGURATION_BUILD_DIR}/React-Core/AccessibilityResources.bundle",
 			);
-			name = "[CP] Embed Pods Frameworks";
+			name = "[CP] Copy Pods Resources";
 			outputPaths = (
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/DoubleConversion.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/folly.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTTypeSafety.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RNFS.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React_Codegen.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/CoreModules.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTAnimation.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTBlob.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTImage.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTLinking.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTNetwork.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTSettings.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTText.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTVibration.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/react_bridging.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/cxxreact.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsi.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsireact.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsinspector.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/logger.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/reactperflogger.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/ReactCommon.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/yoga.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/fmt.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/glog.framework",
+				"${TARGET_BUILD_DIR}/${UNLOCALIZED_RESOURCES_FOLDER_PATH}/AccessibilityResources.bundle",
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 			shellPath = /bin/sh;
-			shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleExample/Pods-OnnxruntimeModuleExample-frameworks.sh\"\n";
+			shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleExample/Pods-OnnxruntimeModuleExample-resources.sh\"\n";
 			showEnvVarsInLog = 0;
 		};
 		FD10A7F022414F080027D42C /* Start Packager */ = {
@@ -522,11 +476,7 @@
 				);
 				MTL_ENABLE_DEBUG_INFO = YES;
 				ONLY_ACTIVE_ARCH = YES;
-				OTHER_LDFLAGS = (
-					"$(inherited)",
-					"-Wl",
-					"-ld_classic",
-				);
+				OTHER_LDFLAGS = "$(inherited)";
 				REACT_NATIVE_PATH = "${PODS_ROOT}/../../node_modules/react-native";
 				SDKROOT = iphoneos;
 			};
@@ -586,11 +536,7 @@
 					"\"$(inherited)\"",
 				);
 				MTL_ENABLE_DEBUG_INFO = NO;
-				OTHER_LDFLAGS = (
-					"$(inherited)",
-					"-Wl",
-					"-ld_classic",
-				);
+				OTHER_LDFLAGS = "$(inherited)";
 				REACT_NATIVE_PATH = "${PODS_ROOT}/../../node_modules/react-native";
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
diff --git a/js/react_native/e2e/ios/Podfile b/js/react_native/e2e/ios/Podfile
index bb3b9c28be..78886dd354 100644
--- a/js/react_native/e2e/ios/Podfile
+++ b/js/react_native/e2e/ios/Podfile
@@ -3,6 +3,14 @@ require_relative '../node_modules/@react-native-community/cli-platform-ios/nativ
 
 platform :ios, '15.1'
 
+prepare_react_native_project!
+
+linkage = ENV['USE_FRAMEWORKS']
+if linkage != nil
+  Pod::UI.puts "Configuring Pod with #{linkage}ally linked Frameworks".green
+  use_frameworks! :linkage => linkage.to_sym
+end
+
 pre_install do |installer|
   # Custom pre-install script or commands
   puts "Running pre-install script..."
@@ -27,7 +35,6 @@ target 'OnnxruntimeModuleExample' do
     :app_path => "#{Pod::Config.instance.installation_root}/.."
   )
 
-  use_frameworks!
 
   ort_c_local_pod_path = ENV['ORT_C_LOCAL_POD_PATH']
   if ort_c_local_pod_path != nil
@@ -38,10 +45,11 @@ target 'OnnxruntimeModuleExample' do
 
   inherit! :search_paths
   post_install do |installer|
-    # https://github.com/facebook/react-native/blob/main/packages/react-native/scripts/react_native_pods.rb#L197-L202
     react_native_post_install(
       installer,
       config[:reactNativePath],
+      # Set `mac_catalyst_enabled` to `true` in order to apply patches
+      # necessary for Mac Catalyst builds
       :mac_catalyst_enabled => false,
     )
     __apply_Xcode_12_5_M1_post_install_workaround(installer)
diff --git a/js/react_native/e2e/ios/PrivacyInfo.xcprivacy b/js/react_native/e2e/ios/PrivacyInfo.xcprivacy
new file mode 100644
index 0000000000..549cd5d8b6
--- /dev/null
+++ b/js/react_native/e2e/ios/PrivacyInfo.xcprivacy
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>NSPrivacyCollectedDataTypes</key>
+	<array/>
+	<key>NSPrivacyAccessedAPITypes</key>
+	<array>
+		<dict>
+			<key>NSPrivacyAccessedAPIType</key>
+			<string>NSPrivacyAccessedAPICategoryFileTimestamp</string>
+			<key>NSPrivacyAccessedAPITypeReasons</key>
+			<array>
+				<string>C617.1</string>
+			</array>
+		</dict>
+		<dict>
+			<key>NSPrivacyAccessedAPIType</key>
+			<string>NSPrivacyAccessedAPICategoryUserDefaults</string>
+			<key>NSPrivacyAccessedAPITypeReasons</key>
+			<array>
+				<string>CA92.1</string>
+			</array>
+		</dict>
+		<dict>
+			<key>NSPrivacyAccessedAPIType</key>
+			<string>NSPrivacyAccessedAPICategorySystemBootTime</string>
+			<key>NSPrivacyAccessedAPITypeReasons</key>
+			<array>
+				<string>35F9.1</string>
+			</array>
+		</dict>
+	</array>
+	<key>NSPrivacyTracking</key>
+	<false/>
+</dict>
+</plist>
diff --git a/js/react_native/e2e/package.json b/js/react_native/e2e/package.json
index d3610e7137..d68b4ef2da 100644
--- a/js/react_native/e2e/package.json
+++ b/js/react_native/e2e/package.json
@@ -6,19 +6,31 @@
   "scripts": {
     "android": "react-native run-android",
     "ios": "react-native run-ios",
-    "start": "react-native start"
+    "start": "react-native start",
+    "lint": "eslint .",
+    "test": "jest"
   },
   "dependencies": {
-    "react": "^18.1.0",
-    "react-native": "^0.70.15",
+    "react": "^18.2.0",
+    "react-native": "^0.71.19",
     "react-native-fs": "^2.20.0"
   },
   "devDependencies": {
-    "@babel/core": "^7.17.0",
-    "@babel/runtime": "^7.17.0",
+    "@babel/core": "^7.20.0",
+    "@babel/preset-env": "^7.20.0",
+    "@babel/runtime": "^7.20.0",
+    "@react-native-community/eslint-config": "^3.2.0",
+    "@tsconfig/react-native": "^2.0.2",
+    "@types/jest": "^29.2.1",
+    "@types/react": "^18.0.24",
+    "@types/react-test-renderer": "^18.0.0",
+    "babel-jest": "^29.2.1",
     "babel-plugin-module-resolver": "^4.0.0",
-    "detox": "^20.7.0",
-    "jest": "^29",
-    "metro-react-native-babel-preset": "0.72.4"
+    "detox": "20.10.0",
+    "jest": "^29.2.1",
+    "metro-react-native-babel-preset": "0.73.10",
+    "prettier": "^2.4.1",
+    "react-test-renderer": "18.2.0",
+    "typescript": "4.8.4"
   }
 }
diff --git a/js/react_native/ios/OnnxruntimeModule.xcodeproj/project.pbxproj b/js/react_native/ios/OnnxruntimeModule.xcodeproj/project.pbxproj
index a11c9ca069..b5984872a0 100644
--- a/js/react_native/ios/OnnxruntimeModule.xcodeproj/project.pbxproj
+++ b/js/react_native/ios/OnnxruntimeModule.xcodeproj/project.pbxproj
@@ -7,15 +7,16 @@
 	objects = {
 
 /* Begin PBXBuildFile section */
-		0105483CF04B9471894F3EAA /* Pods_OnnxruntimeModuleTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 38EB61A518C2DF782F7CD433 /* Pods_OnnxruntimeModuleTest.framework */; };
+		2507023E063B593E8767184B /* Pods_OnnxruntimeModuleTest.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 380A4E10493D3005E7695737 /* Pods_OnnxruntimeModuleTest.framework */; };
 		7FD234672A1F221700734B71 /* FakeRCTBlobManager.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FD234662A1F221700734B71 /* FakeRCTBlobManager.m */; };
-		C60033360456900E26D6F96F /* Pods_OnnxruntimeModule.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D0ADD02E7162A5F0DE8BAB /* Pods_OnnxruntimeModule.framework */; };
 		DB8FC9B525C2867800C72F26 /* OnnxruntimeModule.mm in Sources */ = {isa = PBXBuildFile; fileRef = DB8FC9B425C2867800C72F26 /* OnnxruntimeModule.mm */; };
 		DB8FC9B825C2868700C72F26 /* TensorHelper.mm in Sources */ = {isa = PBXBuildFile; fileRef = DB8FC9B725C2868700C72F26 /* TensorHelper.mm */; };
 		DBDB57DA2603211A004F16BE /* TensorHelperTest.mm in Sources */ = {isa = PBXBuildFile; fileRef = DBDB57D92603211A004F16BE /* TensorHelperTest.mm */; };
 		DBDB57DC2603211A004F16BE /* libOnnxruntimeModule.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 134814201AA4EA6300B7C361 /* libOnnxruntimeModule.a */; };
 		DBDB588B2609B18F004F16BE /* Resources in Resources */ = {isa = PBXBuildFile; fileRef = DBDB588A2609B18F004F16BE /* Resources */; };
 		DBDB58B0262A92D7004F16BE /* OnnxruntimeModuleTest.mm in Sources */ = {isa = PBXBuildFile; fileRef = DBDB58AF262A92D6004F16BE /* OnnxruntimeModuleTest.mm */; };
+		E329E1182D372C780016B599 /* PrivacyInfo.xcprivacy in Resources */ = {isa = PBXBuildFile; fileRef = E329E1172D372C780016B599 /* PrivacyInfo.xcprivacy */; };
+		F58B845092748409D2B634B9 /* Pods_OnnxruntimeModule.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 7FFA29EB31D0567D9122F532 /* Pods_OnnxruntimeModule.framework */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXContainerItemProxy section */
@@ -42,13 +43,13 @@
 
 /* Begin PBXFileReference section */
 		134814201AA4EA6300B7C361 /* libOnnxruntimeModule.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libOnnxruntimeModule.a; sourceTree = BUILT_PRODUCTS_DIR; };
-		38EB61A518C2DF782F7CD433 /* Pods_OnnxruntimeModuleTest.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_OnnxruntimeModuleTest.framework; sourceTree = BUILT_PRODUCTS_DIR; };
-		49D0ADD02E7162A5F0DE8BAB /* Pods_OnnxruntimeModule.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_OnnxruntimeModule.framework; sourceTree = BUILT_PRODUCTS_DIR; };
+		380A4E10493D3005E7695737 /* Pods_OnnxruntimeModuleTest.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_OnnxruntimeModuleTest.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		5391B4C0B7C168594AA0DD0B /* Pods-OnnxruntimeModuleTest.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OnnxruntimeModuleTest.debug.xcconfig"; path = "Target Support Files/Pods-OnnxruntimeModuleTest/Pods-OnnxruntimeModuleTest.debug.xcconfig"; sourceTree = "<group>"; };
 		548638FE75FCC69C842C9545 /* Pods-OnnxruntimeModule.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OnnxruntimeModule.release.xcconfig"; path = "Target Support Files/Pods-OnnxruntimeModule/Pods-OnnxruntimeModule.release.xcconfig"; sourceTree = "<group>"; };
 		63B05EB079B0A4D99448F1D3 /* Pods-OnnxruntimeModule.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OnnxruntimeModule.debug.xcconfig"; path = "Target Support Files/Pods-OnnxruntimeModule/Pods-OnnxruntimeModule.debug.xcconfig"; sourceTree = "<group>"; };
 		7FD234662A1F221700734B71 /* FakeRCTBlobManager.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = FakeRCTBlobManager.m; sourceTree = "<group>"; };
 		7FD234682A1F234500734B71 /* FakeRCTBlobManager.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = FakeRCTBlobManager.h; sourceTree = "<group>"; };
+		7FFA29EB31D0567D9122F532 /* Pods_OnnxruntimeModule.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_OnnxruntimeModule.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		8529D8A6F40E462E62B38B52 /* Pods-OnnxruntimeModuleTest.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-OnnxruntimeModuleTest.release.xcconfig"; path = "Target Support Files/Pods-OnnxruntimeModuleTest/Pods-OnnxruntimeModuleTest.release.xcconfig"; sourceTree = "<group>"; };
 		DB8FC9B425C2867800C72F26 /* OnnxruntimeModule.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = OnnxruntimeModule.mm; sourceTree = SOURCE_ROOT; };
 		DB8FC9B725C2868700C72F26 /* TensorHelper.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = TensorHelper.mm; sourceTree = SOURCE_ROOT; };
@@ -57,6 +58,7 @@
 		DBDB57DB2603211A004F16BE /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
 		DBDB588A2609B18F004F16BE /* Resources */ = {isa = PBXFileReference; lastKnownFileType = folder; name = Resources; path = OnnxruntimeModuleTest/Resources; sourceTree = "<group>"; };
 		DBDB58AF262A92D6004F16BE /* OnnxruntimeModuleTest.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = OnnxruntimeModuleTest.mm; sourceTree = "<group>"; };
+		E329E1172D372C780016B599 /* PrivacyInfo.xcprivacy */ = {isa = PBXFileReference; lastKnownFileType = text.xml; path = PrivacyInfo.xcprivacy; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
 /* Begin PBXFrameworksBuildPhase section */
@@ -64,7 +66,7 @@
 			isa = PBXFrameworksBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				C60033360456900E26D6F96F /* Pods_OnnxruntimeModule.framework in Frameworks */,
+				F58B845092748409D2B634B9 /* Pods_OnnxruntimeModule.framework in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -73,7 +75,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				DBDB57DC2603211A004F16BE /* libOnnxruntimeModule.a in Frameworks */,
-				0105483CF04B9471894F3EAA /* Pods_OnnxruntimeModuleTest.framework in Frameworks */,
+				2507023E063B593E8767184B /* Pods_OnnxruntimeModuleTest.framework in Frameworks */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
@@ -91,6 +93,7 @@
 		58B511D21A9E6C8500147676 = {
 			isa = PBXGroup;
 			children = (
+				E329E1172D372C780016B599 /* PrivacyInfo.xcprivacy */,
 				DBDB588A2609B18F004F16BE /* Resources */,
 				DB8FC9B325C2861300C72F26 /* OnnxruntimeModule */,
 				DBDB57D82603211A004F16BE /* OnnxruntimeModuleTest */,
@@ -115,8 +118,8 @@
 		6FFDF1594C99DA125B013E34 /* Frameworks */ = {
 			isa = PBXGroup;
 			children = (
-				49D0ADD02E7162A5F0DE8BAB /* Pods_OnnxruntimeModule.framework */,
-				38EB61A518C2DF782F7CD433 /* Pods_OnnxruntimeModuleTest.framework */,
+				7FFA29EB31D0567D9122F532 /* Pods_OnnxruntimeModule.framework */,
+				380A4E10493D3005E7695737 /* Pods_OnnxruntimeModuleTest.framework */,
 			);
 			name = Frameworks;
 			sourceTree = "<group>";
@@ -171,7 +174,7 @@
 				DBDB57D32603211A004F16BE /* Sources */,
 				DBDB57D42603211A004F16BE /* Frameworks */,
 				DBDB57D52603211A004F16BE /* Resources */,
-				015C75E59BC80D4507FB6E8A /* [CP] Embed Pods Frameworks */,
+				8FE621EF8E674693B253B8F6 /* [CP] Embed Pods Frameworks */,
 			);
 			buildRules = (
 			);
@@ -227,6 +230,7 @@
 			isa = PBXResourcesBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
+				E329E1182D372C780016B599 /* PrivacyInfo.xcprivacy in Resources */,
 				DBDB588B2609B18F004F16BE /* Resources in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
@@ -234,72 +238,6 @@
 /* End PBXResourcesBuildPhase section */
 
 /* Begin PBXShellScriptBuildPhase section */
-		015C75E59BC80D4507FB6E8A /* [CP] Embed Pods Frameworks */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-				"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleTest/Pods-OnnxruntimeModuleTest-frameworks.sh",
-				"${BUILT_PRODUCTS_DIR}/DoubleConversion/DoubleConversion.framework",
-				"${BUILT_PRODUCTS_DIR}/RCT-Folly/folly.framework",
-				"${BUILT_PRODUCTS_DIR}/RCTTypeSafety/RCTTypeSafety.framework",
-				"${BUILT_PRODUCTS_DIR}/React-Codegen/React_Codegen.framework",
-				"${BUILT_PRODUCTS_DIR}/React-Core/React.framework",
-				"${BUILT_PRODUCTS_DIR}/React-CoreModules/CoreModules.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTAnimation/RCTAnimation.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTBlob/RCTBlob.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTImage/RCTImage.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTLinking/RCTLinking.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTNetwork/RCTNetwork.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTSettings/RCTSettings.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTText/RCTText.framework",
-				"${BUILT_PRODUCTS_DIR}/React-RCTVibration/RCTVibration.framework",
-				"${BUILT_PRODUCTS_DIR}/React-bridging/react_bridging.framework",
-				"${BUILT_PRODUCTS_DIR}/React-cxxreact/cxxreact.framework",
-				"${BUILT_PRODUCTS_DIR}/React-jsi/jsi.framework",
-				"${BUILT_PRODUCTS_DIR}/React-jsiexecutor/jsireact.framework",
-				"${BUILT_PRODUCTS_DIR}/React-jsinspector/jsinspector.framework",
-				"${BUILT_PRODUCTS_DIR}/React-logger/logger.framework",
-				"${BUILT_PRODUCTS_DIR}/React-perflogger/reactperflogger.framework",
-				"${BUILT_PRODUCTS_DIR}/ReactCommon/ReactCommon.framework",
-				"${BUILT_PRODUCTS_DIR}/Yoga/yoga.framework",
-				"${BUILT_PRODUCTS_DIR}/fmt/fmt.framework",
-				"${BUILT_PRODUCTS_DIR}/glog/glog.framework",
-			);
-			name = "[CP] Embed Pods Frameworks";
-			outputPaths = (
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/DoubleConversion.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/folly.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTTypeSafety.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React_Codegen.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/CoreModules.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTAnimation.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTBlob.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTImage.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTLinking.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTNetwork.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTSettings.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTText.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTVibration.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/react_bridging.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/cxxreact.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsi.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsireact.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsinspector.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/logger.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/reactperflogger.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/ReactCommon.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/yoga.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/fmt.framework",
-				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/glog.framework",
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleTest/Pods-OnnxruntimeModuleTest-frameworks.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
 		896E89AEC864CBD0CC7E0AF1 /* [CP] Check Pods Manifest.lock */ = {
 			isa = PBXShellScriptBuildPhase;
 			buildActionMask = 2147483647;
@@ -322,6 +260,74 @@
 			shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n    # print error to STDERR\n    echo \"error: The sandbox is not in sync with the Podfile.lock. Run 'pod install' or update your CocoaPods installation.\" >&2\n    exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n";
 			showEnvVarsInLog = 0;
 		};
+		8FE621EF8E674693B253B8F6 /* [CP] Embed Pods Frameworks */ = {
+			isa = PBXShellScriptBuildPhase;
+			buildActionMask = 2147483647;
+			files = (
+			);
+			inputPaths = (
+				"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleTest/Pods-OnnxruntimeModuleTest-frameworks.sh",
+				"${BUILT_PRODUCTS_DIR}/DoubleConversion/DoubleConversion.framework",
+				"${BUILT_PRODUCTS_DIR}/RCT-Folly/folly.framework",
+				"${BUILT_PRODUCTS_DIR}/RCTTypeSafety/RCTTypeSafety.framework",
+				"${BUILT_PRODUCTS_DIR}/React-Codegen/React_Codegen.framework",
+				"${BUILT_PRODUCTS_DIR}/React-Core/React.framework",
+				"${BUILT_PRODUCTS_DIR}/React-CoreModules/CoreModules.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTAnimation/RCTAnimation.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTAppDelegate/React_RCTAppDelegate.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTBlob/RCTBlob.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTImage/RCTImage.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTLinking/RCTLinking.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTNetwork/RCTNetwork.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTSettings/RCTSettings.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTText/RCTText.framework",
+				"${BUILT_PRODUCTS_DIR}/React-RCTVibration/RCTVibration.framework",
+				"${BUILT_PRODUCTS_DIR}/React-cxxreact/cxxreact.framework",
+				"${BUILT_PRODUCTS_DIR}/React-jsc/React_jsc.framework",
+				"${BUILT_PRODUCTS_DIR}/React-jsi/jsi.framework",
+				"${BUILT_PRODUCTS_DIR}/React-jsiexecutor/jsireact.framework",
+				"${BUILT_PRODUCTS_DIR}/React-jsinspector/jsinspector.framework",
+				"${BUILT_PRODUCTS_DIR}/React-logger/logger.framework",
+				"${BUILT_PRODUCTS_DIR}/React-perflogger/reactperflogger.framework",
+				"${BUILT_PRODUCTS_DIR}/ReactCommon/ReactCommon.framework",
+				"${BUILT_PRODUCTS_DIR}/Yoga/yoga.framework",
+				"${BUILT_PRODUCTS_DIR}/fmt/fmt.framework",
+				"${BUILT_PRODUCTS_DIR}/glog/glog.framework",
+			);
+			name = "[CP] Embed Pods Frameworks";
+			outputPaths = (
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/DoubleConversion.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/folly.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTTypeSafety.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React_Codegen.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/CoreModules.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTAnimation.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React_RCTAppDelegate.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTBlob.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTImage.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTLinking.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTNetwork.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTSettings.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTText.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/RCTVibration.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/cxxreact.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/React_jsc.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsi.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsireact.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/jsinspector.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/logger.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/reactperflogger.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/ReactCommon.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/yoga.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/fmt.framework",
+				"${TARGET_BUILD_DIR}/${FRAMEWORKS_FOLDER_PATH}/glog.framework",
+			);
+			runOnlyForDeploymentPostprocessing = 0;
+			shellPath = /bin/sh;
+			shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-OnnxruntimeModuleTest/Pods-OnnxruntimeModuleTest-frameworks.sh\"\n";
+			showEnvVarsInLog = 0;
+		};
 		FA8BD7B76BD8BD02A6DB750A /* [CP] Check Pods Manifest.lock */ = {
 			isa = PBXShellScriptBuildPhase;
 			buildActionMask = 2147483647;
@@ -427,11 +433,7 @@
 				LIBRARY_SEARCH_PATHS = "$(SDKROOT)/usr/lib/swift";
 				MTL_ENABLE_DEBUG_INFO = YES;
 				ONLY_ACTIVE_ARCH = YES;
-				OTHER_LDFLAGS = (
-					"$(inherited)",
-					"-Wl",
-					"-ld_classic",
-				);
+				OTHER_LDFLAGS = "$(inherited)";
 				REACT_NATIVE_PATH = "${PODS_ROOT}/../../node_modules/react-native";
 				SDKROOT = iphoneos;
 			};
@@ -482,11 +484,7 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 15.1;
 				LIBRARY_SEARCH_PATHS = "$(SDKROOT)/usr/lib/swift";
 				MTL_ENABLE_DEBUG_INFO = NO;
-				OTHER_LDFLAGS = (
-					"$(inherited)",
-					"-Wl",
-					"-ld_classic",
-				);
+				OTHER_LDFLAGS = "$(inherited)";
 				REACT_NATIVE_PATH = "${PODS_ROOT}/../../node_modules/react-native";
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
diff --git a/js/react_native/ios/Podfile b/js/react_native/ios/Podfile
index 717164139e..9e94465765 100644
--- a/js/react_native/ios/Podfile
+++ b/js/react_native/ios/Podfile
@@ -3,30 +3,26 @@ require_relative '../node_modules/@react-native-community/cli-platform-ios/nativ
 
 platform :ios, '15.1'
 
-pre_install do |installer|
-  # Custom pre-install script or commands
-  puts "Running pre-install script..."
-
-  # Recommended fix for https://github.com/facebook/react-native/issues/32483
-  # from https://github.com/facebook/react-native/issues/32483#issuecomment-966784501
-  system("sed -i '' 's/typedef uint8_t clockid_t;//' \"./Pods/RCT-Folly/folly/portability/Time.h\"")
-end
+prepare_react_native_project!
+# use_frameworks! needs to be called before use_react_native! for now
+use_frameworks!
 
 def shared
   config = use_native_modules!
 
+  # Flags change depending on the env values.
+  flags = get_default_flags()
+
   use_react_native!(
     :path => config[:reactNativePath],
-
     # Hermes is now enabled by default. Disable by setting this flag to false.
     # Upcoming versions of React Native may rely on get_default_flags(), but
     # we make it explicit here to aid in the React Native upgrade process.
-    :hermes_enabled => false
+    :hermes_enabled => false,
+    :fabric_enabled => false,
+    :app_path => "#{Pod::Config.instance.installation_root}/.."
   )
 
-  # Comment the next line if you don't want to use dynamic frameworks
-  use_frameworks!
-
   ort_c_local_pod_path = ENV['ORT_C_LOCAL_POD_PATH']
   if ort_c_local_pod_path != nil
     print 'Using onnxruntime-c pod at ', ort_c_local_pod_path, "\n"
@@ -34,9 +30,7 @@ def shared
   else
     pod 'onnxruntime-c'
   end
-
-  inherit! :search_paths
-
+  inherit! :complete
 end
 
 target 'OnnxruntimeModule' do
@@ -48,10 +42,11 @@ target 'OnnxruntimeModuleTest' do
 end
 
 post_install do |installer|
-  # https://github.com/facebook/react-native/blob/main/packages/react-native/scripts/react_native_pods.rb#L197-L202
   react_native_post_install(
     installer,
-    :mac_catalyst_enabled => false,
+    # Set `mac_catalyst_enabled` to `true` in order to apply patches
+	  # necessary for Mac Catalyst builds
+	  :mac_catalyst_enabled => false
   )
   __apply_Xcode_12_5_M1_post_install_workaround(installer)
 end
\ No newline at end of file
diff --git a/js/react_native/ios/PrivacyInfo.xcprivacy b/js/react_native/ios/PrivacyInfo.xcprivacy
new file mode 100644
index 0000000000..3639c19774
--- /dev/null
+++ b/js/react_native/ios/PrivacyInfo.xcprivacy
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+  <key>NSPrivacyCollectedDataTypes</key>
+  <array>
+  </array>
+  <key>NSPrivacyAccessedAPITypes</key>
+  <array>
+    <dict>
+      <key>NSPrivacyAccessedAPIType</key>
+      <string>NSPrivacyAccessedAPICategoryFileTimestamp</string>
+      <key>NSPrivacyAccessedAPITypeReasons</key>
+      <array>
+        <string>C617.1</string>
+      </array>
+    </dict>
+    <dict>
+      <key>NSPrivacyAccessedAPIType</key>
+      <string>NSPrivacyAccessedAPICategoryUserDefaults</string>
+      <key>NSPrivacyAccessedAPITypeReasons</key>
+      <array>
+        <string>CA92.1</string>
+      </array>
+    </dict>
+    <dict>
+      <key>NSPrivacyAccessedAPIType</key>
+      <string>NSPrivacyAccessedAPICategorySystemBootTime</string>
+      <key>NSPrivacyAccessedAPITypeReasons</key>
+      <array>
+        <string>35F9.1</string>
+      </array>
+    </dict>
+  </array>
+  <key>NSPrivacyTracking</key>
+  <false/>
+</dict>
+</plist>
\ No newline at end of file
diff --git a/js/react_native/package.json b/js/react_native/package.json
index b0a2ddf66d..29cf3a0a04 100644
--- a/js/react_native/package.json
+++ b/js/react_native/package.json
@@ -17,14 +17,14 @@
     "ONNX Runtime"
   ],
   "devDependencies": {
-    "@types/jest": "^27.4.0",
-    "@types/react": "^18.0.9",
+    "@types/jest": "^29.2.1",
+    "@types/react": "^18.0.24",
     "@types/react-native": "^0.67.7",
-    "jest": "^27.4.7",
+    "jest": "^29.2.1",
     "pod-install": "^0.1.36",
     "prettier": "^2.6.2",
-    "react": "^18.1.0",
-    "react-native": "^0.70.15",
+    "react": "^18.2.0",
+    "react-native": "^0.71.19",
     "react-native-builder-bob": "^0.18.2"
   },
   "peerDependencies": {

From 655a23ff1d19927d048a325be8065185b127ed14 Mon Sep 17 00:00:00 2001
From: Karim Vadsariya <karim.vadsariya@microsoft.com>
Date: Tue, 28 Jan 2025 15:24:09 -0800
Subject: [PATCH 29/37] [onnxruntime/build] Add new flag
 enable_generic_interface to build primary EPs by default (#23342)

### Description
- Add new build flag in build.py to build onnxruntime.dll supporting
interfaces for all primary EPs (QNN, TensorRT, OpenVINO, VitisAI).
- Modify onnxruntime.dll/onnxruntime_shared.dll build settings so that the
IHV SDK toolsets no longer need to be installed on the system.
- Change CMake variables to be explicit when building EP vs ORT. e.g.
onnxruntime_USE_TENSORRT vs onnxruntime_USE_TENSORRT_INTERFACE, to
evolve the build system to build ORT independent of EPs.


### Motivation and Context
Changes in the build system required to evolve the repo to build the
components independently while removing unnecessary dependencies

---------

Co-authored-by: Lei Cao <jslhcl@gmail.com>
Co-authored-by: Karim Vadsariya <kvadsariya@microsoft.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 cmake/CMakeLists.txt                          | 24 ++++++-----
 .../shared_library/provider_interfaces.h      |  5 ---
 .../core/session/provider_bridge_ort.cc       |  7 +---
 tools/ci_build/build.py                       | 40 ++++++++++++++++---
 .../azure-pipelines/win-ci-pipeline.yml       | 19 +++++++++
 5 files changed, 71 insertions(+), 24 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index b332583035..8650cc53d9 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -259,6 +259,12 @@ option(onnxruntime_USE_AZURE "Build with azure inferencing support" OFF)
 option(onnxruntime_USE_LOCK_FREE_QUEUE "Build with lock-free task queue for threadpool." OFF)
 option(onnxruntime_FORCE_GENERIC_ALGORITHMS "Disable optimized arch-specific algorithms. Use only for testing and debugging generic algorithms." OFF)
 
+option(onnxruntime_USE_TENSORRT_INTERFACE "Build ONNXRuntime shared lib which is compatible with TensorRT EP interface" OFF)
+option(onnxruntime_USE_CUDA_INTERFACE "Build ONNXRuntime shared lib which is compatible with Cuda EP interface" OFF)
+option(onnxruntime_USE_OPENVINO_INTERFACE "Build ONNXRuntime shared lib which is compatible with OpenVINO EP interface" OFF)
+option(onnxruntime_USE_VITISAI_INTERFACE "Build ONNXRuntime shared lib which is compatible with Vitis-AI EP interface" OFF)
+option(onnxruntime_USE_QNN_INTERFACE "Build ONNXRuntime shared lib which is compatible with QNN EP interface" OFF)
+
 # ENABLE_TRAINING includes all training functionality
 # The following 2 entry points
 # 1. ORTModule
@@ -703,7 +709,7 @@ if (WIN32)
     # structure was padded due to __declspec(align())
     list(APPEND ORT_WARNING_FLAGS "/wd4324")
     # warning C4800: Implicit conversion from 'X' to bool. Possible information loss
-    if (onnxruntime_USE_OPENVINO)
+    if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
        list(APPEND ORT_WARNING_FLAGS "/wd4800")
     endif()
     # operator 'operator-name': deprecated between enumerations of different types
@@ -864,7 +870,7 @@ else()
   set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
 endif()
 
-if (onnxruntime_USE_CUDA)
+if (onnxruntime_USE_CUDA OR onnxruntime_USE_CUDA_INTERFACE)
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_CUDA=1)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_CUDA=1)
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES cuda)
@@ -888,7 +894,7 @@ if (onnxruntime_USE_CUDA)
     endif()
 endif()
 
-if (onnxruntime_USE_VITISAI)
+if (onnxruntime_USE_VITISAI OR onnxruntime_USE_VITISAI_INTERFACE)
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_VITISAI=1)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_VITISAI=1)
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES vitisai)
@@ -898,12 +904,12 @@ if (onnxruntime_USE_DNNL)
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES dnnl)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_DNNL=1)
 endif()
-if (onnxruntime_USE_OPENVINO)
+if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_OPENVINO=1)
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES openvino)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_OPENVINO=1)
 endif()
-if (onnxruntime_USE_TENSORRT)
+if (onnxruntime_USE_TENSORRT OR onnxruntime_USE_TENSORRT_INTERFACE)
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_TENSORRT=1)
     #TODO: remove the following line and change the test code in onnxruntime_shared_lib_test to use the new EP API.
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES tensorrt)
@@ -929,7 +935,7 @@ if (onnxruntime_USE_JSEP)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_JSEP=1)
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES js)
 endif()
-if (onnxruntime_USE_QNN)
+if (onnxruntime_USE_QNN OR onnxruntime_USE_QNN_INTERFACE)
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_QNN=1)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_QNN=1)
     list(APPEND ONNXRUNTIME_PROVIDER_NAMES qnn)
@@ -957,7 +963,7 @@ if (onnxruntime_USE_QNN)
       endif()
     endif()
 
-    if (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
+    if ((NOT onnxruntime_USE_QNN_INTERFACE) AND (MSVC OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux"))
       file(GLOB QNN_LIB_FILES LIST_DIRECTORIES false "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libQnn*.so"
 	                                     "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/Qnn*.dll"
                                              "${onnxruntime_QNN_HOME}/lib/${QNN_ARCH_ABI}/libHtpPrepare.so"
@@ -1416,7 +1422,7 @@ if (onnxruntime_ENABLE_TRAINING_APIS)
   )
 endif()
 
-if (onnxruntime_USE_OPENVINO)
+if (onnxruntime_USE_OPENVINO OR onnxruntime_USE_OPENVINO_INTERFACE)
 
   add_definitions(-DUSE_OPENVINO=1)
 
@@ -1429,7 +1435,7 @@ if (onnxruntime_USE_OPENVINO)
     add_definitions(-DOPENVINO_CONFIG_GPU=1)
   endif()
 
-  if (onnxruntime_USE_OPENVINO_CPU)
+  if (onnxruntime_USE_OPENVINO_CPU OR onnxruntime_USE_OPENVINO_INTERFACE) # OpenVino CPU interface is default built.
     add_definitions(-DOPENVINO_CONFIG_CPU=1)
   endif()
 
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 962d10d895..a1bb86598e 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -178,7 +178,6 @@ struct ProviderHost {
   virtual std::string demangle(const char* name) = 0;
   virtual std::string demangle(const std::string& name) = 0;
 
-#ifdef USE_CUDA
   virtual std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) = 0;
   virtual std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(const char* name) = 0;
   virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() = 0;
@@ -190,7 +189,6 @@ struct ProviderHost {
 
   virtual Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
   virtual void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0;
-#endif
 
 #ifdef USE_MIGRAPHX
   virtual std::unique_ptr<IAllocator> CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0;
@@ -200,7 +198,6 @@ struct ProviderHost {
 #ifdef USE_ROCM
   virtual std::unique_ptr<IAllocator> CreateROCMAllocator(int16_t device_id, const char* name) = 0;
   virtual std::unique_ptr<IAllocator> CreateROCMPinnedAllocator(const char* name) = 0;
-  virtual std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() = 0;
 
   virtual void rocm__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) = 0;
   virtual void rocm__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) = 0;
@@ -1256,9 +1253,7 @@ struct ProviderHost {
   virtual training::DistributedRunContext& GetDistributedRunContextInstance() = 0;
 #endif
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
   virtual PhiloxGenerator& PhiloxGenerator__Default() = 0;
-#endif
 
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
   virtual void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) = 0;
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index d7c6dab72f..3a694ac6f8 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -258,10 +258,8 @@ struct ProviderHostImpl : ProviderHost {
   void* CPUAllocator__Alloc(CPUAllocator* p, size_t size) override { return p->CPUAllocator::Alloc(size); }
   void CPUAllocator__Free(CPUAllocator* p, void* allocation) override { return p->CPUAllocator::Free(allocation); }
 
-#ifdef USE_CUDA
   std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAAllocator(device_id, name); }
   std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(const char* name) override { return GetProviderInfo_CUDA().CreateCUDAPinnedAllocator(name); }
-  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(); }
 
   void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
   void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
@@ -271,7 +269,6 @@ struct ProviderHostImpl : ProviderHost {
 
   Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return GetProviderInfo_CUDA().CudaCall_false(retCode, exprString, libName, successCode, msg, file, line); }
   void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); }
-#endif
 
 #ifdef USE_MIGRAPHX
   std::unique_ptr<IAllocator> CreateMIGraphXAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXAllocator(device_id, name); }
@@ -291,6 +288,8 @@ struct ProviderHostImpl : ProviderHost {
 
   Status RocmCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return GetProviderInfo_ROCM().RocmCall_false(retCode, exprString, libName, successCode, msg, file, line); }
   void RocmCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_ROCM().RocmCall_true(retCode, exprString, libName, successCode, msg, file, line); }
+#else
+  std::unique_ptr<IDataTransfer> CreateGPUDataTransfer() override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(); }
 #endif
 
   std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); }
@@ -1560,9 +1559,7 @@ struct ProviderHostImpl : ProviderHost {
   training::DistributedRunContext& GetDistributedRunContextInstance() override { return training::DistributedRunContext::GetInstance(); }
 #endif
 
-#if defined(USE_CUDA) || defined(USE_ROCM)
   PhiloxGenerator& PhiloxGenerator__Default() override { return PhiloxGenerator::Default(); }
-#endif
 
 #ifdef ENABLE_TRAINING_TORCH_INTEROP
   void contrib__PythonOpBase__Init(contrib::PythonOpBase* p, const OpKernelInfo& info) override { p->PythonOpBase::Init(info); }
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index bce7552854..cc733f859f 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -782,6 +782,12 @@ def parse_arguments():
     parser.add_argument("--use_triton_kernel", action="store_true", help="Use triton compiled kernels")
     parser.add_argument("--use_lock_free_queue", action="store_true", help="Use lock-free task queue for threadpool.")
 
+    parser.add_argument(
+        "--enable_generic_interface",
+        action="store_true",
+        help="build ORT shared library and compatible bridge with primary EPs(tensorRT, OpenVino, Qnn, vitisai) but not tests",
+    )
+
     if not is_windows():
         parser.add_argument(
             "--allow_running_as_root",
@@ -1042,6 +1048,12 @@ def generate_build_tree(
         "-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"),
         "-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER="
         + ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"),
+        # interface variables are used only for building onnxruntime/onnxruntime_shared.dll but not EPs
+        "-Donnxruntime_USE_TENSORRT_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_CUDA_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_OPENVINO_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_VITISAI_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
+        "-Donnxruntime_USE_QNN_INTERFACE=" + ("ON" if args.enable_generic_interface else "OFF"),
         # set vars for migraphx
         "-Donnxruntime_USE_MIGRAPHX=" + ("ON" if args.use_migraphx else "OFF"),
         "-Donnxruntime_DISABLE_CONTRIB_OPS=" + ("ON" if args.disable_contrib_ops else "OFF"),
@@ -1372,6 +1384,8 @@ def generate_build_tree(
             cmake_args += ["-Donnxruntime_BUILD_QNN_EP_STATIC_LIB=ON"]
         if args.android and args.use_qnn != "static_lib":
             raise BuildError("Only support Android + QNN builds with QNN EP built as a static library.")
+        if args.use_qnn == "static_lib" and args.enable_generic_interface:
+            raise BuildError("Generic ORT interface only supported with QNN EP built as a shared library.")
 
     if args.use_coreml:
         cmake_args += ["-Donnxruntime_USE_COREML=ON"]
@@ -1529,6 +1543,12 @@ def generate_build_tree(
             "-Donnxruntime_USE_FULL_PROTOBUF=ON",
         ]
 
+    # When this flag is enabled, we build only the ONNXRuntime shared library and expect a compatible EP shared
+    # library to be built in a separate process. The tests are skipped for now because an ONNXRuntime shared
+    # library built under this flag is not expected to work alone.
+    if args.enable_generic_interface:
+        cmake_args += ["-Donnxruntime_BUILD_UNIT_TESTS=OFF"]
+
     if args.enable_lazy_tensor:
         import torch
 
@@ -2649,6 +2669,9 @@ def main():
         # Disable ONNX Runtime's builtin memory checker
         args.disable_memleak_checker = True
 
+    if args.enable_generic_interface:
+        args.test = False
+
     # If there was no explicit argument saying what to do, default
     # to update, build and test (for native builds).
     if not (args.update or args.clean or args.build or args.test or args.gen_doc):
@@ -2752,7 +2775,10 @@ def main():
     source_dir = os.path.normpath(os.path.join(script_dir, "..", ".."))
 
     # if using cuda, setup cuda paths and env vars
-    cuda_home, cudnn_home = setup_cuda_vars(args)
+    cuda_home = ""
+    cudnn_home = ""
+    if args.use_cuda:
+        cuda_home, cudnn_home = setup_cuda_vars(args)
 
     mpi_home = args.mpi_home
     nccl_home = args.nccl_home
@@ -2765,10 +2791,14 @@ def main():
     armnn_home = args.armnn_home
     armnn_libs = args.armnn_libs
 
-    qnn_home = args.qnn_home
+    qnn_home = ""
+    if args.use_qnn:
+        qnn_home = args.qnn_home
 
     # if using tensorrt, setup tensorrt paths
-    tensorrt_home = setup_tensorrt_vars(args)
+    tensorrt_home = ""
+    if args.use_tensorrt:
+        tensorrt_home = setup_tensorrt_vars(args)
 
     # if using migraphx, setup migraphx paths
     migraphx_home = setup_migraphx_vars(args)
@@ -2853,9 +2883,9 @@ def main():
                     toolset = "host=" + host_arch + ",version=" + args.msvc_toolset
                 else:
                     toolset = "host=" + host_arch
-                if args.cuda_version:
+                if args.use_cuda and args.cuda_version:
                     toolset += ",cuda=" + args.cuda_version
-                elif args.cuda_home:
+                elif args.use_cuda and args.cuda_home:
                     toolset += ",cuda=" + args.cuda_home
                 if args.windows_sdk_version:
                     target_arch += ",version=" + args.windows_sdk_version
diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
index 94c2d35a56..d96f1cb68c 100644
--- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml
@@ -177,6 +177,25 @@ stages:
         WITH_CACHE: false
         MachinePool: 'onnxruntime-Win-CPU-2022'
 
+- stage: x64_release_ep_generic_interface
+  dependsOn: []
+  jobs:
+    - template: templates/jobs/win-ci-vs-2022-job.yml
+      parameters:
+        BuildConfig: 'RelWithDebInfo'
+        buildArch: x64
+        additionalBuildFlags: --enable_generic_interface
+        msbuildPlatform: x64
+        isX86: false
+        job_name_suffix: x64_release_ep_generic_interface
+        RunOnnxRuntimeTests: false  # --enable_generic_interface does not build tests
+        EnablePython: false
+        isTraining: false
+        ORT_EP_NAME: CPU
+        GenerateDocumentation: false
+        WITH_CACHE: false
+        MachinePool: 'onnxruntime-Win-CPU-2022'
+
 - stage: x86_release
   dependsOn: []
   jobs:

From bf023ab3d565668c13a5334b505df0eb6acf3625 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Tue, 28 Jan 2025 16:24:41 -0800
Subject: [PATCH 30/37] [js/web] allow import .mjs/.wasm file (#23487)

### Description

Allow importing the `.mjs` and `.wasm` files.

When using Vite, this enables web apps to consume ORT-web with a simpler
setup:
   ```js
   import * as ort from 'onnxruntime-web';

   import wasmFileUrl from 'onnxruntime-web/.wasm?url';
   ort.env.wasm.wasmPaths = { wasm: wasmFileUrl };
   ```
---
 js/web/package.json                                         | 6 ++++++
 .../testcases/vite-default/src/components/onnx-helper.js    | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/js/web/package.json b/js/web/package.json
index 181d6127f5..4e6e2c32ae 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -77,16 +77,22 @@
       "require": "./dist/ort.min.js",
       "types": "./types.d.ts"
     },
+    "./.mjs": "./dist/ort-wasm-simd-threaded.jsep.mjs",
+    "./.wasm": "./dist/ort-wasm-simd-threaded.jsep.wasm",
     "./all": {
       "import": "./dist/ort.all.bundle.min.mjs",
       "require": "./dist/ort.all.min.js",
       "types": "./types.d.ts"
     },
+    "./all/.mjs": "./dist/ort-wasm-simd-threaded.jsep.mjs",
+    "./all/.wasm": "./dist/ort-wasm-simd-threaded.jsep.wasm",
     "./wasm": {
       "import": "./dist/ort.wasm.bundle.min.mjs",
       "require": "./dist/ort.wasm.min.js",
       "types": "./types.d.ts"
     },
+    "./wasm/.mjs": "./dist/ort-wasm-simd-threaded.mjs",
+    "./wasm/.wasm": "./dist/ort-wasm-simd-threaded.wasm",
     "./webgl": {
       "import": "./dist/ort.webgl.min.mjs",
       "require": "./dist/ort.webgl.min.js",
diff --git a/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js b/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js
index 7272ee7371..4b8c626157 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js
+++ b/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js
@@ -4,7 +4,7 @@ import * as ort from 'onnxruntime-web';
 //
 // see https://vite.dev/guide/assets.html#explicit-url-imports
 //
-import wasmFileUrl from '/node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.jsep.wasm?url';
+import wasmFileUrl from 'onnxruntime-web/.wasm?url';
 
 // wasmFileUrl is the URL of the wasm file. Vite will make sure it's available in both development and production.
 ort.env.wasm.wasmPaths = { wasm: wasmFileUrl };

From 80bc1d25f0fe5c562b34db201b567f0be64226c7 Mon Sep 17 00:00:00 2001
From: Hector Li <hecli@microsoft.com>
Date: Tue, 28 Jan 2025 20:22:22 -0800
Subject: [PATCH 31/37] Enable Ep context with external data for CPU nodes
 (#23498)

### Description
When a user dumps the EP context model, nodes that are not partitioned to the EP and have external initializers still point to the old external data file in the dumped model. It does not make sense for the newly generated model to still point to the old external data file.
For example, a model has nodes A, B, C and D, all of which have external initializers in ext.bin, so ext.bin contains the data for A, B, C and D.
After dumping the EP context model, node A is on the CPU, while nodes B, C and D are on the EP and are dumped as EPContext nodes. If A's data is still in ext.bin, the newly generated model has to depend on the old ext.bin, which contains all the external data for the old model; that is a big overhead.

Fix:
For the newly generated model, the user should have the option to specify a new external data file, so that the newly generated model either packs all initializers into the ONNX model or has all initializers in the external data file.
Add the option ep.context_model_external_initializers_file_name to specify the new external data file and size threshold. All initializers will be inside the external data file if the option is specified. Otherwise, all initializers will be inside the EP context ONNX model.

### Motivation and Context
Fix the issue https://github.com/microsoft/onnxruntime/issues/23358
---
 .../onnxruntime_session_options_config_keys.h |  5 ++
 .../core/framework/graph_partitioner.cc       | 22 ++++-
 onnxruntime/core/graph/graph.cc               | 15 ++++
 .../test/providers/qnn/qnn_ep_context_test.cc | 82 +++++++++++++++++--
 .../test/python/onnxruntime_test_python.py    |  9 +-
 5 files changed, 117 insertions(+), 16 deletions(-)

diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 64a4dd19c1..89a87a0227 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -283,6 +283,11 @@ static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_
 // Share EP related resources across EPs
 static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";
 
+// Use this config when dumping EP context model with an external initializers file
+// All initializers will be inside the external data file if specified, otherwise all in Onnx file
+static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
+    "ep.context_model_external_initializers_file_name";
+
 // Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul.
 // Option values:
 // - "0": Gemm FastMath mode is not enabled. [DEFAULT]
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index b97cf03e3b..b4f1e9b11c 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -16,6 +16,7 @@
 #include "core/graph/function_utils.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/model.h"
+#include "core/graph/model_saving_options.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 
 // uncomment this line to count non-CUDA ops in ONNX domain
@@ -645,6 +646,7 @@ static Status InlineFunctionsAOTImpl(const ExecutionProviders& execution_provide
 static Status CreateEpContextModel(const ExecutionProviders& execution_providers,
                                    const Graph& graph,
                                    const std::filesystem::path& ep_context_path,
+                                   const std::filesystem::path& ep_context_ext_ini_path,
                                    const logging::Logger& logger) {
   InlinedVector<const Node*> all_ep_context_nodes;
   for (const auto& ep : execution_providers) {
@@ -727,7 +729,20 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
     }
   }
 
-  ORT_RETURN_IF_ERROR(Model::Save(ep_context_model, context_cache_path));
+  size_t ini_size_threshold = 0;
+  std::filesystem::path external_ini_path;
+  if (ep_context_ext_ini_path.empty()) {
+    // Set the threshold to the max so all initializers are forced into the Onnx file
+    ini_size_threshold = SIZE_MAX;
+    external_ini_path = "./model_ext_ini.bin";
+  } else {
+    // Set the threshold to 0 so all initializers are forced into the external file
+    ini_size_threshold = 0;
+    external_ini_path = ep_context_ext_ini_path;
+  }
+  ModelSavingOptions model_saving_options{ini_size_threshold};
+  ORT_RETURN_IF_ERROR(Model::SaveWithExternalInitializers(ep_context_model, context_cache_path,
+                                                          external_ini_path, model_saving_options));
 
   return Status::OK();
 }
@@ -993,9 +1008,10 @@ Status GraphPartitioner::Partition(Graph& graph, FuncManager& func_mgr,
     ORT_RETURN_IF_ERROR(PartitionOnnxFormatModel(partition_params, mode, providers_, kernel_registry_mgr_, logger));
 
     bool ep_context_enabled = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0") == "1";
-    std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
     if (ep_context_enabled) {
-      ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, logger));
+      std::string ep_context_path = config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "");
+      std::string external_ini_file_name = config_options.GetConfigOrDefault(kOrtSessionOptionsEpContextModelExternalInitializersFileName, "");
+      ORT_RETURN_IF_ERROR(CreateEpContextModel(providers_, graph, ep_context_path, external_ini_file_name, logger));
     }
 #else
     ORT_UNUSED_PARAMETER(config_options);
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index 7ee794ccbd..e4915616b7 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -4175,6 +4175,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl(
       size_t tensor_bytes_size = raw_data.size();
       if (tensor_bytes_size < model_saving_options.initializer_size_threshold) {
         *output_proto = initializer;
+        // Data with size above the threshold is written into the new external initializer file
+        // Data with size below the threshold should be kept inside the new model file
+        // instead of leaving it in the old external initializer file for the old Onnx file
+        if (initializer.data_location() == TensorProto_DataLocation_EXTERNAL) {
+          TensorShape shape(initializer.dims());
+          output_proto->set_raw_data(raw_data.data(), raw_data.size());
+          output_proto->clear_data_location();
+        }
         if (process_prepacks) {
           // These pre-packs will reside in memory
           processed_weights.insert(initializer.name());
@@ -4263,6 +4271,7 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
 
   // Create the external file.
   std::ofstream external_stream(modified_external_file_path, std::ofstream::out | std::ofstream::binary);
+  auto const external_empty_pos = external_stream.tellp();
   ORT_ENFORCE(external_stream.is_open(), "Failed to open for writing:", modified_external_file_path);
   int64_t external_offset = 0;
 
@@ -4275,6 +4284,12 @@ ONNX_NAMESPACE::GraphProto Graph::ToGraphProtoWithExternalInitializers(
     ORT_THROW("Failed to flush file with external initializers: ", modified_external_file_path);
   }
 
+  // Delete if the external data file is empty
+  if (external_empty_pos == external_stream.tellp()) {
+    external_stream.close();
+    std::remove(modified_external_file_path.string().c_str());
+  }
+
   return result;
 }
 
diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
index 38fde332ca..416d812326 100644
--- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -7,6 +7,7 @@
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/session/onnxruntime_session_options_config_keys.h"
 #include "core/session/inference_session.h"
+#include "core/graph/model_saving_options.h"
 
 #include "test/providers/qnn/qnn_test_utils.h"
 
@@ -49,19 +50,19 @@ static const std::string& GetNodeAttr(const Node& node, const std::string& attr_
 static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
   return [single_ep_node](ModelTestBuilder& builder) {
     // Creat non-quantized FusedMatMul node1
-    NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({2, 2}, false, {0, 1, 0, 1}));
-    NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
+    std::vector<float> data(200 * 200, 1.0f);
+    NodeArg* input1 = MakeTestInput(builder, TestInputDef<float>({200, 200}, false, data));
+    NodeArg* add1_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
 
     auto* add1_output = builder.MakeIntermediate();
     builder.AddNode("FusedMatMul", {input1, add1_ini_input2}, {add1_output}, kMSDomain);
 
     // Create quantized Add node2
-    std::vector<float> data = {0.0f, 0.0f, 1.0f, 0.0f};
     gsl::span<float> data_range = gsl::make_span(data);
     QuantParams<uint8_t> q_parameter = GetDataQuantParams<uint8_t>(data_range);
     auto* add2_input1_qdq = AddQDQNodePair<uint8_t>(builder, add1_output, q_parameter.scale, q_parameter.zero_point);
 
-    NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
+    NodeArg* add2_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
     auto* add2_input2_qdq = AddQDQNodePair<uint8_t>(builder, add2_input2, q_parameter.scale, q_parameter.zero_point);
 
     auto* add2_output = builder.MakeIntermediate();
@@ -73,7 +74,7 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
       AddQDQNodePairWithOutputAsGraphOutput<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
     } else {
       auto* add3_input1_qdq = AddQDQNodePair<uint8_t>(builder, add2_output, q_parameter.scale, q_parameter.zero_point);
-      NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, {0, 0, 0, 0}));
+      NodeArg* add3_ini_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
 
       auto* add3_output = builder.MakeIntermediate();
       builder.AddNode("FusedMatMul", {add3_input1_qdq, add3_ini_input2}, {add3_output}, kMSDomain);
@@ -81,7 +82,7 @@ static GetTestModelFn BuildGraphWithQAndNonQ(bool single_ep_node = true) {
       // Create quantized Add node4
       auto* add4_input1_qdq = AddQDQNodePair<uint8_t>(builder, add3_output, q_parameter.scale, q_parameter.zero_point);
 
-      NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({2, 2}, true, data));
+      NodeArg* add4_input2 = MakeTestInput(builder, TestInputDef<float>({200, 200}, true, data));
       auto* add4_input2_qdq = AddQDQNodePair<uint8_t>(builder, add4_input2, q_parameter.scale, q_parameter.zero_point);
 
       auto* add4_output = builder.MakeIntermediate();
@@ -179,6 +180,75 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryMultiPartitionSupport2) {
   QnnContextBinaryMultiPartitionTestBody(single_ep_node);
 }
 
+void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) {
+  ProviderOptions provider_options;
+#if defined(_WIN32)
+  provider_options["backend_path"] = "QnnHtp.dll";
+#else
+  provider_options["backend_path"] = "libQnnHtp.so";
+#endif
+
+  const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};
+
+  auto& logging_manager = DefaultLoggingManager();
+  logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR);
+
+  onnxruntime::Model model("QNN_EP_TestModel", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+                           logging_manager.DefaultLogger());
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder helper(graph);
+  BuildGraphWithQAndNonQ(true)(helper);
+  helper.SetGraphOutputs();
+  ASSERT_STATUS_OK(model.MainGraph().Resolve());
+  ModelSavingOptions model_saving_options{10};
+  const std::string model_with_ext = "model_external.onnx";
+  const std::string model_ext_file = "model_external.bin";
+  ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model, model_with_ext,
+                                                       model_ext_file, model_saving_options));
+
+  EXPECT_TRUE(std::filesystem::exists(model_with_ext.c_str()));
+  EXPECT_TRUE(std::filesystem::exists(model_ext_file.c_str()));
+
+  Ort::SessionOptions so;
+  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
+  so.AppendExecutionProvider("QNN", provider_options);
+  const std::string ep_context_model_file = "./qnn_ctx_part_external_ini_ctx.onnx";
+  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, ep_context_model_file.c_str());
+  const std::string external_ini_file = "./qnn_ctx_part_external_ini.bin";
+  if (expect_external_ini_file) {
+    // Setting the external ini file name forces all initializers into the external file
+    so.AddConfigEntry(kOrtSessionOptionsEpContextModelExternalInitializersFileName, external_ini_file.c_str());
+  }  // otherwise all initializers are in Onnx file, no external data file generated
+
+  Ort::Session session(*ort_env, ToPathString(model_with_ext).c_str(), so);
+
+  EXPECT_TRUE(std::filesystem::exists(ep_context_model_file.c_str()));
+  if (expect_external_ini_file) {
+    EXPECT_TRUE(std::filesystem::exists(external_ini_file.c_str()));
+    ASSERT_EQ(std::remove(external_ini_file.c_str()), 0);
+  } else {
+    EXPECT_FALSE(std::filesystem::exists(external_ini_file.c_str()));
+  }
+
+  // clean up
+  ASSERT_EQ(std::remove(model_with_ext.c_str()), 0);
+  ASSERT_EQ(std::remove(model_ext_file.c_str()), 0);
+  ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0);
+}
+
+// Set the external initializers file name so the initializers of FusedMatMul (which falls back to CPU)
+// are dumped to the external file
+TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithExternalWeights) {
+  EpCtxCpuNodeWithExternalIniFileTestBody(true);
+}
+
+// Do not set the external initializers file name so the initializers of FusedMatMul (which falls back to CPU)
+// are kept inside the Onnx file and no external file is generated
+TEST_F(QnnHTPBackendTests, QnnContextBinaryCpuNodeWithoutExternalWeights) {
+  EpCtxCpuNodeWithExternalIniFileTestBody(false);
+}
+
 // Create a model with Case + Add (quantized)
 // cast_input -> Cast -> Q -> DQ \
 //                                Add -> Q -> DQ -> output
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index 8aaa0aa02d..91310cfc2a 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -183,7 +183,7 @@ class TestInferenceSession(unittest.TestCase):
             so.add_session_config_entry(
                 "session.optimized_model_external_initializers_file_name", external_initializers_file
             )
-            so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
+            so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "20")
             onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
             self.assertTrue(os.path.isfile(so.optimized_model_filepath))
             self.assertTrue(os.path.isfile(os.path.join(directory, external_initializers_file)))
@@ -213,14 +213,10 @@ class TestInferenceSession(unittest.TestCase):
             "session.optimized_model_external_initializers_file_name", external_initializers_file
         )
 
-        # TODO(anyone): Set this to 100 will cause test error since some tensor below the threshold
-        # still refers to the original external data file. We shall fix this issue so that the
-        # optimized model only refers to one external data file.
-        so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "10")
+        so.add_session_config_entry("session.optimized_model_external_initializers_min_size_in_bytes", "100")
         session1 = onnxrt.InferenceSession(get_name("model_with_orig_ext_data.onnx"), sess_options=so)
         del session1
         self.assertTrue(os.path.isfile(optimized_model_filepath))
-        self.assertTrue(os.path.isfile(external_initializers_file))
 
         so2 = onnxrt.SessionOptions()
         so2.log_severity_level = 1
@@ -240,7 +236,6 @@ class TestInferenceSession(unittest.TestCase):
 
         # Remove model 1 to make sure optimized model 2 can be loaded independently from model 1
         os.remove(optimized_model_filepath)
-        os.remove(external_initializers_file)
 
         session3 = onnxrt.InferenceSession(optimized_model_filepath_2, sess_options=onnxrt.SessionOptions())
         del session3

From e3e41739a7ca0ce0806805aa7e2814c72748d0e5 Mon Sep 17 00:00:00 2001
From: Ted Themistokleous
 <107195283+TedThemistokleous@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:41:16 -0500
Subject: [PATCH 32/37] [ROCm EP] Fix transpose helper for gfx gridsize
 constraints (#23527)

Remove the inline default transpose-helper check and ensure we use the proper
check via CanUse_hipblasTransposeHelper_MLFloat16

Related to change in ROCm Onnxruntime repo:
https://github.com/ROCm/onnxruntime/pull/82

### Description

Required to correctly limit grid size of transpose helper kernel

### Motivation and Context
The build was defaulting to the inline function (now removed) instead of
using the overloaded version with proper checks.
Removed the inline default "true" case, as it is incorrect for newer
AMD cards/targets

Co-authored-by: Ted Themistokleous <tedthemistokleous@amd.com>
---
 onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h
index 39d5306b15..9d32fcb65d 100644
--- a/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h
+++ b/onnxruntime/core/providers/rocm/shared_inc/fpgeneric.h
@@ -501,7 +501,6 @@ inline hipblasStatus_t hipblasTransposeHelper(hipStream_t /*stream*/, hipblasHan
   return hipblasDgeam(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc);
 }
 
-inline bool CanUse_hipblasTransposeHelper_MLFloat16(int /*m*/, int /*n*/) { return true; }  // CUDA has a limited grid size of 65536, ROCm has higher limits.
 hipblasStatus_t hipblasTransposeHelper(hipStream_t stream, hipblasHandle_t, hipblasOperation_t, hipblasOperation_t, int m, int n, const half*, const half* A, int, const half*, const half*, int, half* C, int);
 
 // copy

From d5338da1f556eddff636293c97e0921443fe0fc4 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:35:25 -0800
Subject: [PATCH 33/37] Fix tensor external data info length parsing issue.
 (#23526)

Fix tensor external data info length parsing issue.

The old implementation was parsing a `size_t` value with `strtol` (via `OrtStrToPtrDiff`) on ARM64 MSVC.

https://github.com/microsoft/onnxruntime/blob/bf023ab3d565668c13a5334b505df0eb6acf3625/onnxruntime/core/platform/path_lib.h#L74

If we have `sizeof(size_t) == 8` and `sizeof(long) == 4` (as is the case for x64 and ARM64 MSVC), `strtol` will return a maximum value of `2^31-1` even for a larger, valid `size_t` value. `strtol` will also set `errno` to `ERANGE`, but we weren't checking that.

Updated to use `ParseStringWithClassicLocale` which will parse directly to the target type.

Added some tests.
---
 .../framework/tensor_external_data_info.cc    | 44 +++++-------
 .../framework/tensor_external_data_info.h     |  2 -
 .../test/framework/tensorutils_test.cc        | 72 +++++++++++++++++++
 3 files changed, 88 insertions(+), 30 deletions(-)

diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc
index ec8b25e9f4..971851db62 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.cc
+++ b/onnxruntime/core/framework/tensor_external_data_info.cc
@@ -4,6 +4,7 @@
 #include "tensor_external_data_info.h"
 #include "core/common/common.h"
 #include "core/common/narrow.h"
+#include "core/common/parse_string.h"
 #include "core/common/safeint.h"
 #include "core/common/string_utils.h"
 #include "core/platform/path_lib.h"
@@ -18,21 +19,8 @@ using ::ONNX_NAMESPACE::StringStringEntryProto;
 
 namespace onnxruntime {
 Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>& input,
-                                std::unique_ptr<ExternalDataInfo>& out) {
-  auto str_to_int = [](const std::string& s, OFFSET_TYPE& result) -> Status {
-    char* end;
-#ifdef _WIN32
-    result = _strtoi64(s.c_str(), &end, 10);
-#else
-    result = OrtStrToPtrDiff(s.c_str(), &end);
-#endif
-    if (end != s.c_str() + s.length()) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", s, " failed");
-    }
-    return Status::OK();
-  };
-
-  out = std::make_unique<ExternalDataInfo>();
+                                std::unique_ptr<ExternalDataInfo>& external_data_info_result) {
+  auto external_data_info = std::make_unique<ExternalDataInfo>();
   PrepackedInfos prepacked_infos;
 
   const int input_size = input.size();
@@ -43,17 +31,15 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! Need a key for the external data info");
     if (!stringmap.has_value())
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! Need a value for the external data info");
+
     if (stringmap.key() == "location" && !stringmap.value().empty()) {
-      out->rel_path_ = ToWideString(stringmap.value());
+      external_data_info->rel_path_ = ToWideString(stringmap.value());
     } else if (stringmap.key() == "offset" && !stringmap.value().empty()) {
-      ORT_RETURN_IF_ERROR(str_to_int(stringmap.value(), out->offset_));
+      ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(stringmap.value(), external_data_info->offset_));
     } else if (stringmap.key() == "length" && !stringmap.value().empty()) {
-      char* end;
-      out->length_ = narrow<size_t>(OrtStrToPtrDiff(stringmap.value().c_str(), &end));
-      if (end != stringmap.value().c_str() + stringmap.value().length())
-        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "parsing ", stringmap.value(), " failed");
+      ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(stringmap.value(), external_data_info->length_));
     } else if (stringmap.key() == "checksum" && !stringmap.value().empty()) {
-      out->checksum_ = stringmap.value();
+      external_data_info->checksum_ = stringmap.value();
     } else if (stringmap.key().find("prepacked", 0) == 0) {
       // Starts with 'prepacked', each has its own key.
       // Each prepacked entry may have multiple blobs with the same key
@@ -72,10 +58,11 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
             const auto& blob = split_fields[f];
             auto blob_fields = utils::SplitString(blob, ";", false);
             if (blob_fields.size() == 3) {
-              OFFSET_TYPE offset, len;
-              ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[0]), offset));
-              ORT_RETURN_IF_ERROR(str_to_int(std::string(blob_fields[1]), len));
-              blob_infos.push_back(std::make_tuple(offset, narrow<size_t>(len), std::string(blob_fields[2])));
+              OFFSET_TYPE offset;
+              size_t len;
+              ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(blob_fields[0], offset));
+              ORT_RETURN_IF_ERROR(ParseStringWithClassicLocale(blob_fields[1], len));
+              blob_infos.push_back(std::make_tuple(offset, len, std::string(blob_fields[2])));
             }
           }
           if (blob_infos.empty()) {
@@ -88,14 +75,15 @@ Status ExternalDataInfo::Create(const RepeatedPtrField<StringStringEntryProto>&
     }
   }
 
-  if (out->rel_path_.empty()) {
+  if (external_data_info->rel_path_.empty()) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "model format error! Missing 'location'");
   }
 
   if (!prepacked_infos.empty()) {
-    out->prepacked_infos_ = std::move(prepacked_infos);
+    external_data_info->prepacked_infos_ = std::move(prepacked_infos);
   }
 
+  external_data_info_result = std::move(external_data_info);
   return Status::OK();
 }
 void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& external_file_path,
diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h
index 1b185b8c5d..2de1e01f38 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.h
+++ b/onnxruntime/core/framework/tensor_external_data_info.h
@@ -32,8 +32,6 @@ class ExternalDataInfo {
 
   const std::string& GetChecksum() const { return checksum_; }
 
-  // If the value of 'offset' or 'length' field is larger the max value of ssize_t, this function will treat it as a
-  // wrong value and return FAIL.
   static common::Status Create(
       const ::google::protobuf::RepeatedPtrField<::ONNX_NAMESPACE::StringStringEntryProto>& input,
       std::unique_ptr<ExternalDataInfo>& out);
diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc
index 229f4f95b8..931a507c53 100644
--- a/onnxruntime/test/framework/tensorutils_test.cc
+++ b/onnxruntime/test/framework/tensorutils_test.cc
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/common/inlined_containers.h"
+#include "core/common/parse_string.h"
 #include "core/framework/prepacked_weights.h"
 #include "core/framework/prepacked_weights_container.h"
 #include "core/framework/tensorprotoutils.h"
@@ -9,6 +10,9 @@
 #include "test/util/include/asserts.h"
 #include "file_util.h"
 
+#include <cstdint>
+#include <limits>
+
 #include "gtest/gtest.h"
 #include "gmock/gmock.h"
 
@@ -22,6 +26,74 @@ using namespace ONNX_NAMESPACE;
 namespace onnxruntime {
 namespace test {
 
+// if `expected_error_message_substring` is nullptr, parsing is expected to be successful
+static void TestExternalDataInfoParsingOffsetAndLengthWithStrings(
+    std::string_view offset_str,
+    std::string_view length_str,
+    const char* expected_error_message_substring = nullptr) {
+  SCOPED_TRACE(MakeString("offset: \"", offset_str, "\", length: \"", length_str, "\""));
+
+  ONNX_NAMESPACE::TensorProto tensor_proto;
+  const std::filesystem::path kExternalDataPath("test.bin");
+
+  tensor_proto.set_data_location(ONNX_NAMESPACE::TensorProto_DataLocation::TensorProto_DataLocation_EXTERNAL);
+
+  auto* location_entry = tensor_proto.add_external_data();
+  location_entry->set_key("location");
+  location_entry->set_value(ToUTF8String(kExternalDataPath.native()));
+
+  auto* offset_entry = tensor_proto.add_external_data();
+  offset_entry->set_key("offset");
+  offset_entry->set_value(offset_str.data(), offset_str.size());
+
+  auto* length_entry = tensor_proto.add_external_data();
+  length_entry->set_key("length");
+  length_entry->set_value(length_str.data(), length_str.size());
+
+  std::unique_ptr<ExternalDataInfo> external_data_info{};
+  const auto create_status = ExternalDataInfo::Create(tensor_proto.external_data(), external_data_info);
+  if (expected_error_message_substring) {
+    ASSERT_STATUS_NOT_OK_AND_HAS_SUBSTR(create_status, expected_error_message_substring);
+    return;
+  }
+  ASSERT_STATUS_OK(create_status);
+
+  // if we got this far, assume that offset_str and length_str are able to be parsed.
+  const auto expected_offset = ParseStringWithClassicLocale<ExternalDataInfo::OFFSET_TYPE>(offset_str);
+  const auto expected_length = ParseStringWithClassicLocale<size_t>(length_str);
+
+  ASSERT_EQ(external_data_info->GetOffset(), expected_offset);
+  ASSERT_EQ(external_data_info->GetLength(), expected_length);
+}
+
+// if `expected_error_message_substring` is nullptr, parsing is expected to be successful
+static void TestExternalDataInfoParsingOffsetAndLength(intmax_t offset,
+                                                       uintmax_t length,
+                                                       const char* expected_error_message_substring = nullptr) {
+  TestExternalDataInfoParsingOffsetAndLengthWithStrings(std::to_string(offset), std::to_string(length),
+                                                        expected_error_message_substring);
+}
+
+TEST(TensorProtoUtilsTest, ParseExternalDataInfoOffsetAndLength) {
+  TestExternalDataInfoParsingOffsetAndLength(0, 0);
+
+  TestExternalDataInfoParsingOffsetAndLength(0, 1024);
+  TestExternalDataInfoParsingOffsetAndLength(0, std::numeric_limits<size_t>::max());
+
+  TestExternalDataInfoParsingOffsetAndLength(1024, 1024);
+  TestExternalDataInfoParsingOffsetAndLength(std::numeric_limits<ExternalDataInfo::OFFSET_TYPE>::max(), 1024);
+
+  {
+    // assuming that this value is too large to fit in either size_t or ExternalDataInfo::OFFSET_TYPE
+    const std::string_view two_to_the_65th_power = "36893488147419103232";
+    const std::string_view zero = "0";
+    TestExternalDataInfoParsingOffsetAndLengthWithStrings(two_to_the_65th_power, zero, "Failed to parse value");
+    TestExternalDataInfoParsingOffsetAndLengthWithStrings(zero, two_to_the_65th_power, "Failed to parse value");
+  }
+
+  // TODO should ExternalDataInfo::Create() also reject negative offset values?
+}
+
 // Test ExternalData functionality
 TEST(TensorProtoUtilsTest, SetExternalDataInformation) {
   ONNX_NAMESPACE::TensorProto tensor_proto;

From fbae88f5ade6db2b9df5d178f912733f60ebc1a9 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Wed, 29 Jan 2025 17:38:22 -0800
Subject: [PATCH 34/37] [js/web] use the recommended workaround for Vite
 (#23531)

### Description

After some investigation and debugging, I decided to follow the recommended
workaround suggested in https://github.com/vitejs/vite/issues/8427.

### Motivation and Context

There is a known issue with Vite 5.x when using a WebAssembly package.
Detailed information is in https://github.com/vitejs/vite/issues/8427.

There were previous attempts to fix this problem (#23487). I tried
various ways to make it work out of the box for Vite users, but none
of them worked: some "fixes" did fix the usage with Vite but broke other
use cases/bundlers, and some introduced other issues. Eventually I figured
out that there is no good way to fix this inside ONNX Runtime.

Considering that the root cause is inside Vite and that it may be fixed in
Vite v6, I think the best way for now is to follow the recommended workaround.
---
 js/web/package.json                                      | 6 ------
 .../testcases/vite-default/src/components/onnx-helper.js | 9 ---------
 .../e2e/exports/testcases/vite-default/vite.config.js    | 6 ++++++
 3 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/js/web/package.json b/js/web/package.json
index 4e6e2c32ae..181d6127f5 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -77,22 +77,16 @@
       "require": "./dist/ort.min.js",
       "types": "./types.d.ts"
     },
-    "./.mjs": "./dist/ort-wasm-simd-threaded.jsep.mjs",
-    "./.wasm": "./dist/ort-wasm-simd-threaded.jsep.wasm",
     "./all": {
       "import": "./dist/ort.all.bundle.min.mjs",
       "require": "./dist/ort.all.min.js",
       "types": "./types.d.ts"
     },
-    "./all/.mjs": "./dist/ort-wasm-simd-threaded.jsep.mjs",
-    "./all/.wasm": "./dist/ort-wasm-simd-threaded.jsep.wasm",
     "./wasm": {
       "import": "./dist/ort.wasm.bundle.min.mjs",
       "require": "./dist/ort.wasm.min.js",
       "types": "./types.d.ts"
     },
-    "./wasm/.mjs": "./dist/ort-wasm-simd-threaded.mjs",
-    "./wasm/.wasm": "./dist/ort-wasm-simd-threaded.wasm",
     "./webgl": {
       "import": "./dist/ort.webgl.min.mjs",
       "require": "./dist/ort.webgl.min.js",
diff --git a/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js b/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js
index 4b8c626157..332745f8e5 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js
+++ b/js/web/test/e2e/exports/testcases/vite-default/src/components/onnx-helper.js
@@ -1,14 +1,5 @@
 import * as ort from 'onnxruntime-web';
 
-// The following line uses Vite's "Explicit URL Imports" feature to load the wasm file as an asset.
-//
-// see https://vite.dev/guide/assets.html#explicit-url-imports
-//
-import wasmFileUrl from 'onnxruntime-web/.wasm?url';
-
-// wasmFileUrl is the URL of the wasm file. Vite will make sure it's available in both development and production.
-ort.env.wasm.wasmPaths = { wasm: wasmFileUrl };
-
 // Model data for "test_abs/model.onnx"
 const testModelData =
   'CAcSDGJhY2tlbmQtdGVzdDpJCgsKAXgSAXkiA0FicxIIdGVzdF9hYnNaFwoBeBISChAIARIMCgIIAwoCCAQKAggFYhcKAXkSEgoQCAESDAoCCAMKAggECgIIBUIECgAQDQ==';
diff --git a/js/web/test/e2e/exports/testcases/vite-default/vite.config.js b/js/web/test/e2e/exports/testcases/vite-default/vite.config.js
index 37d3d6f22b..253ecbb621 100644
--- a/js/web/test/e2e/exports/testcases/vite-default/vite.config.js
+++ b/js/web/test/e2e/exports/testcases/vite-default/vite.config.js
@@ -3,5 +3,11 @@ import vue from '@vitejs/plugin-vue';
 
 // https://vite.dev/config/
 export default defineConfig({
+  // This is a known issue when using WebAssembly with Vite 5.x
+  // Need to specify `optimizeDeps.exclude` to NPM packages that uses WebAssembly
+  // See: https://github.com/vitejs/vite/issues/8427
+  optimizeDeps: {
+    exclude: ['onnxruntime-web'],
+  },
   plugins: [vue()],
 });

From 5407c69028ae6dd4e87521aea147c22153d8e6c7 Mon Sep 17 00:00:00 2001
From: Hector Li <hecli@microsoft.com>
Date: Wed, 29 Jan 2025 22:01:13 -0800
Subject: [PATCH 35/37] Fix the issue that the new generated EP context model
 not able to find external data (#23537)

Fix the issue that the newly generated EP context model is not able to find external data

### Description
The newly generated EP context model was not able to find the external data file because it lost track of the source model path, which is used to locate the external initializers.

Related to issue: https://github.com/microsoft/onnxruntime/issues/23358
---
 onnxruntime/core/framework/graph_partitioner.cc       | 4 +++-
 onnxruntime/test/providers/qnn/qnn_ep_context_test.cc | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index b4f1e9b11c..7c980a1aeb 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -683,7 +683,9 @@ static Status CreateEpContextModel(const ExecutionProviders& execution_providers
                            context_cache_path, "' exist already.");
   }
 
-  Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList{graph.GetSchemaRegistry()},
+  Model ep_context_model(graph.Name(), false, graph.GetModel().MetaData(),
+                         graph.GetModel().ModelPath(),  // use source model path so that external initializers can find the data file path
+                         IOnnxRuntimeOpSchemaRegistryList{graph.GetSchemaRegistry()},
                          graph.DomainToVersionMap(), {}, logger);
   auto& ep_graph = ep_context_model.MainGraph();
   ep_graph.SetDescription(graph.Description());
diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
index 416d812326..a4ecf2a240 100644
--- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc
@@ -202,13 +202,15 @@ void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) {
   helper.SetGraphOutputs();
   ASSERT_STATUS_OK(model.MainGraph().Resolve());
   ModelSavingOptions model_saving_options{10};
-  const std::string model_with_ext = "model_external.onnx";
+  // dump the model in testdata folder in case it hides the bug that not able to find model not in current dir
+  const std::string model_with_ext = "./testdata/model_external.onnx";
   const std::string model_ext_file = "model_external.bin";
   ASSERT_STATUS_OK(Model::SaveWithExternalInitializers(model, model_with_ext,
                                                        model_ext_file, model_saving_options));
 
   EXPECT_TRUE(std::filesystem::exists(model_with_ext.c_str()));
-  EXPECT_TRUE(std::filesystem::exists(model_ext_file.c_str()));
+  std::string model_ext_file_full_path = "./testdata/" + model_ext_file;
+  EXPECT_TRUE(std::filesystem::exists(model_ext_file_full_path.c_str()));
 
   Ort::SessionOptions so;
   so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");
@@ -233,7 +235,7 @@ void EpCtxCpuNodeWithExternalIniFileTestBody(bool expect_external_ini_file) {
 
   // clean up
   ASSERT_EQ(std::remove(model_with_ext.c_str()), 0);
-  ASSERT_EQ(std::remove(model_ext_file.c_str()), 0);
+  ASSERT_EQ(std::remove(model_ext_file_full_path.c_str()), 0);
   ASSERT_EQ(std::remove(ep_context_model_file.c_str()), 0);
 }
 

From dc2f7a9a0c4dc6c193ec817ea0d294254ffb30b0 Mon Sep 17 00:00:00 2001
From: Edward Chen <18449977+edgchen1@users.noreply.github.com>
Date: Thu, 30 Jan 2025 13:55:54 -0800
Subject: [PATCH 36/37] Add overload of `TryParseStringWithClassicLocale()`
 that uses `std::from_chars()` (#23541)

Add overload of `TryParseStringWithClassicLocale()` that uses `std::from_chars()` for certain types.

Reduce binary size. It recently increased after PR #23526.
---
 .../onnxruntime/core/common/parse_string.h    | 40 ++++++++++++++++---
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/include/onnxruntime/core/common/parse_string.h b/include/onnxruntime/core/common/parse_string.h
index 941e3f3377..6345b2a554 100644
--- a/include/onnxruntime/core/common/parse_string.h
+++ b/include/onnxruntime/core/common/parse_string.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <charconv>
 #include <locale>
 #include <sstream>
 #include <string_view>
@@ -12,18 +13,45 @@
 
 namespace onnxruntime {
 
+namespace detail {
+
+// Whether we will use std::from_chars() to parse to `T`.
+#if defined(_LIBCPP_VERSION)
+// Note: Currently (e.g., in LLVM 19), libc++'s std::from_chars() doesn't support floating point types yet.
+template <typename T>
+constexpr bool ParseWithFromChars = !std::is_same_v<bool, T> && std::is_integral_v<T>;
+#else
+template <typename T>
+constexpr bool ParseWithFromChars = !std::is_same_v<bool, T> && (std::is_integral_v<T> || std::is_floating_point_v<T>);
+#endif
+
+}  // namespace detail
+
 /**
  * Tries to parse a value from an entire string.
+ * If successful, sets `value` and returns true. Otherwise, does not modify `value` and returns false.
  */
 template <typename T>
-bool TryParseStringWithClassicLocale(std::string_view str, T& value) {
-  if constexpr (std::is_integral<T>::value && std::is_unsigned<T>::value) {
-    // if T is unsigned integral type, reject negative values which will wrap
-    if (!str.empty() && str[0] == '-') {
-      return false;
-    }
+std::enable_if_t<detail::ParseWithFromChars<T>, bool>
+TryParseStringWithClassicLocale(std::string_view str, T& value) {
+  T parsed_value{};
+  const auto [ptr, ec] = std::from_chars(str.data(), str.data() + str.size(), parsed_value);
+
+  if (ec != std::errc{}) {
+    return false;
   }
 
+  if (ptr != str.data() + str.size()) {
+    return false;
+  }
+
+  value = parsed_value;
+  return true;
+}
+
+template <typename T>
+std::enable_if_t<!detail::ParseWithFromChars<T>, bool>
+TryParseStringWithClassicLocale(std::string_view str, T& value) {
   // don't allow leading whitespace
   if (!str.empty() && std::isspace(str[0], std::locale::classic())) {
     return false;

From 7e2408880e963bcfdd2b898c7b6464506545cec2 Mon Sep 17 00:00:00 2001
From: Takeshi Watanabe <take-cheeze@users.noreply.github.com>
Date: Fri, 31 Jan 2025 07:23:56 +0900
Subject: [PATCH 37/37] Enable dlpack by default (#23110)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Description
<!-- Describe your changes. -->
This PR enables the Python dlpack interface by default.


### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

The dlpack Python interface is useful in inference mode, not only in training
mode: some processing of inference results may be written in torch, and
unnecessary device transfers should be avoided in those cases.
closes https://github.com/microsoft/onnxruntime/issues/15963 closes
https://github.com/microsoft/onnxruntime/issues/22061

TODOs:
- [x] Add tests like
https://github.com/microsoft/onnxruntime/blob/5407c69028ae6dd4e87521aea147c22153d8e6c7/orttraining/orttraining/test/python/orttraining_test_ortvalue.py
that are unrelated to the training feature

---------

Co-authored-by: Xavier Dupré <xadupre@users.noreply.github.com>
Co-authored-by: Justin Chu <justinchuby@users.noreply.github.com>
---
 cmake/CMakeLists.txt                          |  7 +++++
 .../external/onnxruntime_external_deps.cmake  |  4 +--
 cmake/onnxruntime_providers_cpu.cmake         |  6 +++-
 cmake/onnxruntime_python.cmake                |  3 ++
 .../python/onnxruntime_pybind_ortvalue.cc     | 13 ++++----
 .../python/onnxruntime_pybind_state_common.cc |  2 +-
 .../python/onnxruntime_pybind_state_common.h  |  4 +--
 .../test/python/onnxruntime_test_python.py    | 30 ++++++++++++++++---
 8 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 8650cc53d9..962b42c190 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -241,6 +241,9 @@ option(onnxruntime_ENABLE_CPUINFO "Enable cpuinfo" ON)
 # ATen fallback support
 option(onnxruntime_ENABLE_ATEN "Enable ATen fallback" OFF)
 
+# dlpack support
+cmake_dependent_option(onnxruntime_ENABLE_DLPACK "Enable dlpack" ON "onnxruntime_ENABLE_TRAINING OR onnxruntime_ENABLE_ATEN OR onnxruntime_ENABLE_PYTHON" OFF)
+
 # Triton support
 option(onnxruntime_ENABLE_TRITON "Enable Triton" OFF)
 
@@ -1603,6 +1606,10 @@ if (onnxruntime_ENABLE_TRAINING)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES tensorboard)
 endif()
 
+if (onnxruntime_ENABLE_DLPACK)
+  add_compile_definitions(ENABLE_DLPACK)
+endif()
+
 if (UNIX OR onnxruntime_USE_NCCL)
   # MPI is INDEPENDENT of NCCL for now. You can build NCLL without MPI and launch multi-GPU with your own launcher.
   if (onnxruntime_USE_MPI)
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index e956569698..4e5875f969 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -576,8 +576,8 @@ if (onnxruntime_RUN_ONNX_TESTS)
 endif()
 
 
-if(onnxruntime_ENABLE_ATEN)
-  message(STATUS "Aten fallback is enabled.")
+if(onnxruntime_ENABLE_DLPACK)
+  message(STATUS "dlpack is enabled.")
   FetchContent_Declare(
     dlpack
     URL ${DEP_URL_dlpack}
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index 4ae89a3922..3e1a9edbd1 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -103,7 +103,7 @@ if (onnxruntime_ENABLE_TRAINING_OPS AND NOT onnxruntime_ENABLE_TRAINING)
   list(REMOVE_ITEM onnxruntime_providers_src ${onnxruntime_cpu_full_training_only_srcs})
 endif()
 
-if (onnxruntime_ENABLE_ATEN)
+if (onnxruntime_ENABLE_DLPACK)
   file(GLOB_RECURSE onnxruntime_providers_dlpack_srcs CONFIGURE_DEPENDS
     "${ONNXRUNTIME_ROOT}/core/dlpack/dlpack_converter.cc"
     "${ONNXRUNTIME_ROOT}/core/dlpack/dlpack_converter.h"
@@ -191,6 +191,10 @@ endif()
 
 if (onnxruntime_ENABLE_ATEN)
   target_compile_definitions(onnxruntime_providers PRIVATE ENABLE_ATEN)
+endif()
+
+if (onnxruntime_ENABLE_DLPACK)
+  target_compile_definitions(onnxruntime_providers PRIVATE ENABLE_DLPACK)
   # DLPack is a header-only dependency
   set(DLPACK_INCLUDE_DIR ${dlpack_SOURCE_DIR}/include)
   target_include_directories(onnxruntime_providers PRIVATE ${DLPACK_INCLUDE_DIR})
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index 15a2862ced..b4a251dc6e 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -128,6 +128,9 @@ endif()
 
 if (onnxruntime_ENABLE_ATEN)
   target_compile_definitions(onnxruntime_pybind11_state PRIVATE ENABLE_ATEN)
+endif()
+
+if (onnxruntime_ENABLE_DLPACK)
   target_include_directories(onnxruntime_pybind11_state PRIVATE ${dlpack_SOURCE_DIR}/include)
 endif()
 
diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc
index 6a57fc5f90..66ceacda75 100644
--- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc
@@ -13,9 +13,6 @@
 #include "core/framework/tensor.h"
 #include "core/framework/sparse_tensor.h"
 #include "core/framework/TensorSeq.h"
-#ifdef ENABLE_TRAINING
-#include "core/dlpack/dlpack_converter.h"
-#endif
 namespace onnxruntime {
 namespace python {
 
@@ -350,7 +347,7 @@ void addOrtValueMethods(pybind11::module& m) {
         py::object obj = GetPyObjFromTensor(*ml_value, nullptr, nullptr);
 #endif
         return obj; })
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
       .def("to_dlpack", [](OrtValue* ort_value) -> py::object { return py::reinterpret_steal<py::object>(ToDlpack(*ort_value)); },
            "Returns a DLPack representing the tensor. This method does not copy the pointer shape, "
            "instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure "
@@ -373,7 +370,7 @@ void addOrtValueMethods(pybind11::module& m) {
       .def("push_back", [](std::vector<OrtValue>* v, const OrtValue& ortvalue) {
         v->push_back(ortvalue);
       })
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
       .def("push_back", [](std::vector<OrtValue>* v, py::object dlpack_tensor, const bool is_bool_tensor) { v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor)); }, "Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false)
       .def("push_back_batch", [](std::vector<OrtValue>* v, std::vector<py::object>& torch_tensors, std::vector<int64_t>& data_ptrs, std::vector<py::object>& element_types, const std::vector<std::vector<int64_t>>& shapes, const std::vector<OrtDevice>& devices) {
             for (size_t i = 0; i < torch_tensors.size(); ++i) {
@@ -415,7 +412,7 @@ void addOrtValueMethods(pybind11::module& m) {
            "In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. "
            "If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor "
            "to get a boolean tensor.")
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
       .def("dlpack_at", [](std::vector<OrtValue>* v, const size_t idx) { return py::reinterpret_steal<py::object>(ToDlpack(v->at(idx))); })
 #endif
       .def("element_type_at", [](std::vector<OrtValue>* v, const size_t idx) -> int32_t { return GetTensorProtoType(v->at(idx)); },
@@ -424,7 +421,7 @@ void addOrtValueMethods(pybind11::module& m) {
            "(such as onnx.TensorProto.FLOAT)."
            "Raises an exception in any other case.",
            py::arg("idx"))
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
       .def("to_dlpacks", [](const std::vector<OrtValue>& v, py::object to_tensor) -> py::list {
             if (v.size() == 0)
               return py::list();
@@ -494,7 +491,7 @@ for every transferred tensor.
 #endif
       ;
 
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
   m.def(
       "is_dlpack_uint8_tensor", [](py::capsule cap) -> bool {
         // case ONNX_NAMESPACE::TensorProto_DataType_BOOL:
diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.cc b/onnxruntime/python/onnxruntime_pybind_state_common.cc
index cec4dfc141..55ea264571 100644
--- a/onnxruntime/python/onnxruntime_pybind_state_common.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state_common.cc
@@ -43,7 +43,7 @@ onnxruntime::ROCMExecutionProviderExternalAllocatorInfo external_allocator_info{
 onnxruntime::ArenaExtendStrategy arena_extend_strategy = onnxruntime::ArenaExtendStrategy::kNextPowerOfTwo;
 #endif
 
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
 
 void DlpackCapsuleDestructor(PyObject* data) {
   DLManagedTensor* dlmanaged_tensor = reinterpret_cast<DLManagedTensor*>(PyCapsule_GetPointer(data, "dltensor"));
diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h
index 995341b0f8..8d4a882b14 100644
--- a/onnxruntime/python/onnxruntime_pybind_state_common.h
+++ b/onnxruntime/python/onnxruntime_pybind_state_common.h
@@ -12,7 +12,7 @@
 #include "core/session/environment.h"
 #include "core/session/abi_session_options_impl.h"
 #include "core/session/inference_session.h"
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
 #include "core/dlpack/dlpack_converter.h"
 #endif
 
@@ -410,7 +410,7 @@ bool CheckIfTensor(const std::vector<const NodeArg*>& def_list,
                    const std::string& name,
                    /*out*/ ONNX_NAMESPACE::TypeProto& type_proto);
 
-#ifdef ENABLE_TRAINING
+#if defined(ENABLE_DLPACK)
 
 // Allocate a new Capsule object, which takes the ownership of OrtValue.
 // Caller is responsible for releasing.
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index 91310cfc2a..3af6e8ccac 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -17,6 +17,7 @@ import numpy as np
 from helper import get_name
 
 import onnxruntime as onnxrt
+from onnxruntime.capi import _pybind_state as C
 from onnxruntime.capi.onnxruntime_pybind11_state import Fail, OrtValueVector, RunOptions
 
 # handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed.
@@ -325,8 +326,6 @@ class TestInferenceSession(unittest.TestCase):
             self.assertEqual(option["user_compute_stream"], "1")
             self.assertEqual(option["has_user_compute_stream"], "1")
 
-            from onnxruntime.capi import _pybind_state as C
-
             session_options = C.get_default_session_options()
 
             # TRT plugins registered as custom op domain should only be added once in session option regardless of number of session creation
@@ -1421,6 +1420,31 @@ class TestInferenceSession(unittest.TestCase):
                 outs = session.run(output_names=["output"], input_feed=upstreams_onnxrt)[0]
                 self.assertTrue(np.allclose(inps, outs))
 
+    @unittest.skipIf(not hasattr(C.OrtValue, "from_dlpack"), "dlpack not enabled in this build")
+    def test_ort_value_dlpack(self):
+        # Tests originally from orttraining/orttraining/test/python/orttraining_test_ortvalue.py testOrtValueDlPack_float32
+        numpy_arr_input = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
+        ortvalue = onnxrt.OrtValue.ortvalue_from_numpy(numpy_arr_input)
+        self.assertEqual(numpy_arr_input.shape, tuple(ortvalue.shape()))
+        ptr = ortvalue._ortvalue.data_ptr()
+
+        dlp = ortvalue._ortvalue.to_dlpack()
+        self.assertFalse(C.is_dlpack_uint8_tensor(dlp))
+        ortvalue2 = C.OrtValue.from_dlpack(dlp, False)
+        self.assertEqual(ptr, ortvalue2.data_ptr())
+        new_array = ortvalue2.numpy()
+        np.testing.assert_equal(numpy_arr_input, new_array)
+
+        dlp = ortvalue._ortvalue.__dlpack__()
+        self.assertFalse(C.is_dlpack_uint8_tensor(dlp))
+        ortvalue2 = C.OrtValue.from_dlpack(dlp, False)
+        self.assertEqual(ptr, ortvalue2.data_ptr())
+        new_array = ortvalue2.numpy()
+        np.testing.assert_equal(numpy_arr_input, new_array)
+
+        device = ortvalue._ortvalue.__dlpack_device__()
+        self.assertEqual((1, 0), device)
+
     def test_sparse_tensor_coo_format(self):
         cpu_device = onnxrt.OrtDevice.make("cpu", 0)
         shape = [9, 9]
@@ -1694,8 +1718,6 @@ class TestInferenceSession(unittest.TestCase):
         check_failure([("a", {1: 2})], [{3: 4}])
 
     def test_register_custom_e_ps_library(self):
-        from onnxruntime.capi import _pybind_state as C
-
         available_eps = C.get_available_providers()
         # skip amd gpu build
         if "ROCMExecutionProvider" in available_eps: