From d8a7e1d159e22389d640ced85c6cd541acc812ce Mon Sep 17 00:00:00 2001
From: Ryan Lai <Ryan.Lai@microsoft.com>
Date: Tue, 30 Nov 2021 21:29:25 +0000
Subject: [PATCH 1/5] Merged PR 6718335: RI 11/30 from GitHub

Pipeline is green: https://microsoft.visualstudio.com/WindowsAI/_build/results?buildId=42142807&view=results

![image.png](https://microsoft.visualstudio.com/274e76ac-6b29-4f77-a85d-7914c77cabd5/_apis/git/repositories/853d2ddc-663c-4fe8-8036-dbf0d50db2d9/pullRequests/6718335/attachments/image.png)

Related work items: #37220320
---
 cmake/CMakeLists.txt                          |   12 +-
 cmake/external/dml.cmake                      |    2 +-
 cmake/onnxruntime.cmake                       |   28 -
 cmake/onnxruntime_mlas.cmake                  |    5 +-
 cmake/onnxruntime_providers.cmake             |   16 +-
 csharp/OnnxRuntime.CSharp.proj                |    6 +-
 csharp/OnnxRuntime.DesktopOnly.CSharp.sln     |  230 +++
 csharp/readme.txt                             |    6 +
 .../Microsoft.ML.OnnxRuntime.csproj           |   25 +-
 .../NativeMethods.shared.cs                   |   97 +-
 .../OrtIoBinding.shared.cs                    |   20 +-
 .../InferenceTest.cs                          |    5 +-
 .../OrtIoBindingAllocationTest.cs             |    4 +
 dockerfiles/Dockerfile.openvino               |    2 +-
 dockerfiles/Dockerfile.openvino-centos7       |    6 +-
 dockerfiles/Dockerfile.openvino-csharp        |    9 +-
 docs/ContribOperators.md                      |   67 +
 docs/OperatorKernels.md                       |    7 +-
 .../onnxruntime/core/framework/data_types.h   |  386 ++--
 .../onnxruntime/core/framework/tensor_shape.h |    6 +-
 include/onnxruntime/core/graph/graph.h        |   22 +
 .../core/session/onnxruntime_c_api.h          |   22 +-
 .../core/session/onnxruntime_cxx_api.h        |    2 +
 .../core/session/onnxruntime_cxx_inline.h     |    8 +
 js/README.md                                  |   18 +-
 js/common/tsconfig.json                       |    2 +
 js/common/webpack.config.js                   |   27 +-
 js/node/package-lock.json                     |  597 ++----
 js/node/package.json                          |    2 +-
 js/package-lock.json                          |   12 +-
 js/react_native/package.json                  |    2 +-
 js/react_native/yarn.lock                     | 1682 ++++++++---------
 js/tsconfig.json                              |    2 -
 js/web/.npmignore                             |    2 +
 js/web/README.md                              |    2 +-
 js/web/lib/build-def.ts                       |   31 +
 js/web/lib/index.ts                           |   17 +-
 js/web/lib/onnxjs/attribute.ts                |    7 +-
 js/web/lib/onnxjs/session.ts                  |    2 +-
 js/web/lib/onnxjs/tensor.ts                   |    5 +-
 js/web/lib/onnxjs/util.ts                     |    4 +
 js/web/lib/wasm/proxy-wrapper.ts              |   12 +-
 js/web/lib/wasm/wasm-factory.ts               |   12 +-
 js/web/package-lock.json                      |  184 +-
 js/web/package.json                           |    5 +-
 js/web/script/build.ts                        |   12 +
 ... => browser-test-wasm-proxy-no-threads.js} |    2 +-
 js/web/test/e2e/karma.conf.js                 |    3 +-
 js/web/test/e2e/run.js                        |   23 +-
 js/web/test/e2e/simple-http-server.js         |    3 +
 js/web/test/test-runner.ts                    |    6 +-
 js/web/test/test-shared.ts                    |   10 +-
 js/web/tsconfig.json                          |    1 +
 js/web/webpack.config.js                      |  136 +-
 .../contrib_ops/cpu/cpu_contrib_kernels.cc    |    6 +-
 .../cpu/qlinear_global_average_pool.cc        |   45 +-
 .../cpu/qlinear_global_average_pool.h         |    9 +-
 onnxruntime/contrib_ops/cpu/qlinear_pool.cc   |   56 +-
 onnxruntime/contrib_ops/cpu/qlinear_pool.h    |    7 +
 .../cpu/quantization/nhwc_max_pool.cc         |   41 +-
 .../contrib_ops/cuda/bert/attention_concat.cu |  222 +++
 .../contrib_ops/cuda/bert/attention_impl.cu   |  246 ++-
 .../contrib_ops/cuda/bert/attention_impl.h    |   66 +-
 .../contrib_ops/cuda/bert/attention_past.cu   |  169 --
 .../contrib_ops/cuda/bert/attention_softmax.h |   34 +-
 .../cuda/bert/attention_transpose.cu          |  114 +-
 .../cuda/bert/decoder_attention.cc            |  387 ++++
 .../contrib_ops/cuda/bert/decoder_attention.h |   26 +
 .../cuda/bert/longformer_attention.cc         |   22 +-
 .../cuda/bert/longformer_attention_impl.cu    |    6 +-
 .../contrib_ops/cuda/bert/skip_layer_norm.cc  |    8 +
 .../cuda/bert/transformer_common.h            |    2 +
 .../cuda/bert/transformer_cuda_common.h       |   33 +
 .../contrib_ops/cuda/cuda_contrib_kernels.cc  |    4 +
 onnxruntime/contrib_ops/cuda/inverse.cc       |    2 +-
 onnxruntime/contrib_ops/cuda/layer_norm.cc    |    6 +
 onnxruntime/core/framework/data_types.cc      |  102 +-
 onnxruntime/core/framework/execution_frame.cc |    2 +
 .../core/framework/execution_providers.h      |    2 +-
 .../core/framework/graph_partitioner.cc       |    2 -
 onnxruntime/core/framework/session_state.cc   |    2 +
 onnxruntime/core/framework/tensor_shape.cc    |    7 +-
 .../core/framework/tensor_type_and_shape.cc   |    9 +
 .../core/framework/tensorprotoutils.cc        |    2 +
 .../core/graph/contrib_ops/contrib_defs.cc    |   69 +
 onnxruntime/core/graph/graph.cc               |   83 +-
 onnxruntime/core/mlas/inc/mlas.h              |   22 +-
 onnxruntime/core/mlas/lib/mlasi.h             |    3 +-
 onnxruntime/core/mlas/lib/platform.cpp        |    1 +
 onnxruntime/core/mlas/lib/pooling.cpp         |  146 +-
 .../mlas/lib/power/DgemmKernelPOWER10.cpp     |  418 ++++
 .../core/mlas/lib/power/DgemmKernelpower.h    |  311 +--
 .../core/mlas/lib/power/FgemmKernelpower.h    |  333 ++++
 .../mlas/lib/power/SgemmKernelPOWER10.cpp     |    4 +-
 .../core/mlas/lib/power/SgemmKernelpower.h    |  323 +---
 .../core/mlas/lib/qdwconv_kernelsize.cpp      |    5 +-
 onnxruntime/core/mlas/lib/qlgavgpool.cpp      |  587 ++++--
 onnxruntime/core/mlas/lib/quantize.cpp        |  117 +-
 onnxruntime/core/mlas/lib/transpose.cpp       |   15 +
 .../transpose_optimizer.cc                    |   60 +-
 onnxruntime/core/providers/common.h           |    2 +-
 .../einsum_compute_preprocessor.h             |    4 +-
 .../providers/cuda/cuda_execution_provider.cc |    8 +
 .../core/providers/cuda/cuda_profiler.cc      |    2 +-
 .../core/providers/cuda/cuda_profiler.h       |    2 +-
 .../providers/cuda/cuda_provider_factory.cc   |   21 +-
 .../core/providers/cuda/generator/random.cc   |   78 +
 .../core/providers/cuda/generator/random.h    |  130 ++
 .../providers/cuda/generator/random_impl.cu   |  145 ++
 .../providers/cuda/generator/random_impl.h    |   22 +
 .../core/providers/cuda/nn/instance_norm.cc   |    9 +
 .../src/MLOperatorAuthorImpl.cpp              |    4 +-
 .../dml/OperatorAuthorHelper/Common.h         |    2 +
 .../OperatorAuthorHelper/OperatorHelper.cpp   |    2 +-
 .../providers/dml/dml_provider_factory.cc     |    5 +-
 .../providers/dnnl/dnnl_node_capability.cc    |   32 +
 .../providers/dnnl/dnnl_node_capability.h     |   15 +
 .../core/providers/dnnl/dnnl_op_manager.cc    |    2 +
 .../providers/dnnl/subgraph/dnnl_squeeze.cc   |   75 +
 .../providers/dnnl/subgraph/dnnl_squeeze.h    |   30 +
 .../dnnl/subgraph/dnnl_subgraph_primitive.cc  |   11 +
 .../dnnl/subgraph/dnnl_subgraph_primitive.h   |    6 +
 .../providers/dnnl/subgraph/dnnl_unsqueeze.cc |   85 +
 .../providers/dnnl/subgraph/dnnl_unsqueeze.h  |   30 +
 .../nnapi_builtin/builders/op_builder.cc      |    2 +-
 .../providers/nuphar/scripts/rnn_benchmark.py |    4 +-
 .../openvino/ov_versions/data_ops.cc          |   12 +-
 .../providers/rocm/rocm_execution_provider.cc |    8 +
 .../provider_bridge_provider.cc               |    2 +-
 .../shared_library/provider_interfaces.h      |    2 +-
 onnxruntime/core/session/inference_session.cc |    2 +-
 onnxruntime/core/session/onnxruntime_c_api.cc |  103 +-
 onnxruntime/core/session/ort_apis.h           |    3 +-
 .../core/session/provider_bridge_ort.cc       |    2 +-
 onnxruntime/core/util/math_cpu.cc             |    1 +
 .../onnxruntime_inference_collection.py       |    7 +
 .../python/onnxruntime_pybind_iobinding.cc    |   12 +
 .../python/onnxruntime_pybind_state.cc        |   30 +-
 onnxruntime/python/tools/onnxruntime_test.py  |    2 +-
 .../tools/quantization/onnx_quantizer.py      |  108 +-
 .../tools/quantization/operators/direct_q8.py |   51 +-
 .../operators/qdq_base_operator.py            |    8 +-
 .../tools/quantization/qdq_quantizer.py       |   86 +-
 .../python/tools/quantization/quantize.py     |   10 +
 .../python/tools/quantization/registry.py     |    2 +-
 .../python/tools/symbolic_shape_infer.py      |    8 +-
 .../contrib_ops/decoder_attention_op_test.cc  |  412 ++++
 .../test/contrib_ops/nhwc_maxpool_op_test.cc  |   62 +-
 .../qlinear_global_average_pool_test.cc       |  154 +-
 .../test/contrib_ops/qlinear_pool_test.cc     |  302 ++-
 .../test/framework/inference_session_test.cc  |    2 +-
 .../test/fuzzing/include/BetaDistribution.h   |    2 +-
 .../mlas/unittest/test_qlinear_gavgpool.cpp   |   85 +-
 onnxruntime/test/onnx/main.cc                 |    1 +
 .../optimizer/transpose_optimizer_test.cc     |  407 ++--
 .../providers/cpu/generator/random_test.cc    |  211 ++-
 .../providers/cpu/tensor/unsqueeze_op_test.cc |  100 +-
 .../test/python/onnx_backend_test_series.py   |    1 +
 .../test/python/onnxruntime_test_python.py    |   14 +-
 .../onnxruntime_test_python_iobinding.py      |   41 +-
 ...untime_test_python_symbolic_shape_infer.py |   48 +-
 .../test/python/quantization/test_qdq.py      |  173 +-
 .../test_parity_decoder_attention.py          |  423 +++++
 onnxruntime/test/shared_lib/test_inference.cc |   45 +-
 .../test/testdata/crop_and_resize.onnx        |  Bin 0 -> 3107 bytes
 onnxruntime/test/testdata/identity_9799.onnx  |   20 +
 .../kernel_def_hashes/contrib.cpu.json        |    4 +
 .../kernel_def_hashes/training_ops.cpu.json   |    8 +
 .../core/graph/gradient_builder.cc            |    4 +-
 .../orttraining/core/graph/graph_augmenter.h  |    2 +-
 .../core/graph/training_op_defs.cc            |   36 +
 .../orttraining/eager/opgen/onnxgen.py        |   30 +-
 .../orttraining/eager/opgen/opgen/atenops.py  |   28 +-
 .../eager/opgen/opgen/custom_ops.py           |    2 +-
 .../eager/opgen/opgen/generator.py            |  104 +-
 .../orttraining/eager/opgen/opgen/onnxops.py  | 1357 ++++++++-----
 orttraining/orttraining/eager/ort_aten.cpp    |   96 +-
 orttraining/orttraining/eager/ort_aten.h      |    6 +
 orttraining/orttraining/eager/ort_tensor.h    |    1 +
 .../orttraining/eager/test/ort_eps_test.py    |    7 +
 orttraining/orttraining/eager/test/ort_ops.py |   22 +
 .../orttraining/eager/test/ort_tensor.py      |    7 +
 .../test/gradient/gradient_ops_test.cc        |   12 +-
 .../graph/optimizer_graph_builder_test.cc     |    2 +-
 .../python/orttraining_test_ortmodule_api.py  |    8 +-
 .../training_ops/cpu/cpu_training_kernels.cc  |    4 +
 .../training_ops/cpu/op_gradients.cc          |  150 +-
 .../training_ops/cpu/op_gradients.h           |   21 +-
 .../cuda/cuda_training_kernels.cc             |   15 +
 .../training_ops/cuda/math/softmax_grad.cc    |   93 +-
 .../training_ops/cuda/math/softmax_grad.h     |   11 +-
 .../training_ops/cuda/nn/conv_grad.cc         |    4 +-
 .../tools/ci_test/download_azure_blob.py      |   67 -
 .../scripts/performance_investigation.py      |   85 +
 packages.config                               |    2 +-
 server/serializing/tensorprotoutils.cc        |   18 +-
 tools/ci_build/amd_hipify.py                  |    5 +-
 tools/ci_build/build.py                       |   26 +-
 .../c-api-noopenmp-packaging-pipelines.yml    |   11 +-
 .../azure-pipelines/linux-gpu-ci-pipeline.yml |    1 +
 .../azure-pipelines/nodejs/templates/test.yml |    2 +-
 .../azure-pipelines/templates/c-api-cpu.yml   |   35 +-
 .../azure-pipelines/templates/win-ci-2019.yml |   12 +-
 .../azure-pipelines/templates/win-cpu-ci.yml  |   12 +-
 .../azure-pipelines/win-gpu-ci-pipeline.yml   |   13 +-
 .../win-gpu-cuda-10-2-pipeline.yml            |   11 +-
 .../linux/docker/Dockerfile.ubuntu_openvino   |    4 +-
 .../nuget/generate_nuspec_for_native_nuget.py |    2 +-
 winml/lib/Api/ImageFeatureValue.cpp           |   34 +-
 .../cppwinrt/scenariotestscppwinrt.cpp        |    8 +
 210 files changed, 9789 insertions(+), 4235 deletions(-)
 create mode 100644 csharp/OnnxRuntime.DesktopOnly.CSharp.sln
 create mode 100644 csharp/readme.txt
 create mode 100644 js/web/lib/build-def.ts
 rename js/web/test/e2e/{browser-test-wasm-no-threads-proxy.js => browser-test-wasm-proxy-no-threads.js} (80%)
 create mode 100644 onnxruntime/contrib_ops/cuda/bert/attention_concat.cu
 delete mode 100644 onnxruntime/contrib_ops/cuda/bert/attention_past.cu
 create mode 100644 onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc
 create mode 100644 onnxruntime/contrib_ops/cuda/bert/decoder_attention.h
 create mode 100644 onnxruntime/contrib_ops/cuda/bert/transformer_cuda_common.h
 create mode 100644 onnxruntime/core/mlas/lib/power/DgemmKernelPOWER10.cpp
 create mode 100644 onnxruntime/core/mlas/lib/power/FgemmKernelpower.h
 create mode 100644 onnxruntime/core/providers/cuda/generator/random.cc
 create mode 100644 onnxruntime/core/providers/cuda/generator/random.h
 create mode 100644 onnxruntime/core/providers/cuda/generator/random_impl.cu
 create mode 100644 onnxruntime/core/providers/cuda/generator/random_impl.h
 create mode 100644 onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc
 create mode 100644 onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.h
 create mode 100644 onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc
 create mode 100644 onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.h
 create mode 100644 onnxruntime/test/contrib_ops/decoder_attention_op_test.cc
 create mode 100644 onnxruntime/test/python/transformers/test_parity_decoder_attention.py
 create mode 100644 onnxruntime/test/testdata/crop_and_resize.onnx
 create mode 100644 onnxruntime/test/testdata/identity_9799.onnx
 delete mode 100755 orttraining/tools/ci_test/download_azure_blob.py
 create mode 100644 orttraining/tools/scripts/performance_investigation.py

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index fb6bf16644..acbde7f56a 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -175,6 +175,8 @@ option(onnxruntime_PREBUILT_PYTORCH_PATH "Path to pytorch installation dir")
 # external transformer src path
 option(onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH "Path to external transformer src dir")
 
+option(onnxruntime_ENABLE_CUDA_PROFILING "Enable CUDA kernel profiling" OFF)
+
 if (onnxruntime_USE_CUDA)
   set(onnxruntime_DISABLE_RTTI OFF)
 endif()
@@ -960,7 +962,11 @@ if (WIN32)
         # issued by thrust nonstandard extension used: nameless struct/union
         list(APPEND ORT_WARNING_FLAGS "/wd4201")
         # warning C4800: Implicit conversion from 'X' to bool. Possible information loss
-        list(APPEND ORT_WARNING_FLAGS "/w34800")
+        if (onnxruntime_USE_OPENVINO)
+           list(APPEND ORT_WARNING_FLAGS "/wd4800")
+        else()
+           list(APPEND ORT_WARNING_FLAGS "/w34800")
+        endif()
         if (onnxruntime_USE_OPENMP)
             list(APPEND ORT_WARNING_FLAGS "/wd6993") # Code analysis ignores OpenMP constructs
         endif()
@@ -1696,6 +1702,10 @@ if (onnxruntime_ENABLE_TRAINING_OPS)
   add_compile_definitions(ENABLE_TRAINING_OPS)
 endif()
 
+if (onnxruntime_ENABLE_CUDA_PROFILING)
+  add_compile_definitions(ENABLE_CUDA_PROFILING)
+endif()
+
 if (onnxruntime_ENABLE_TRAINING)
   add_compile_definitions(ENABLE_TRAINING)
   add_compile_definitions(ENABLE_TRAINING_OPS)
diff --git a/cmake/external/dml.cmake b/cmake/external/dml.cmake
index df258efd0e..f7da89f544 100644
--- a/cmake/external/dml.cmake
+++ b/cmake/external/dml.cmake
@@ -20,7 +20,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
   set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
   set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
   get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
-  set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.5.1)
+  set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.8.0)
   set(DML_SHARED_LIB DirectML.dll)
 
   # Restore nuget packages, which will pull down the DirectML redist package
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 03544e0691..5b123ed1ff 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -215,34 +215,6 @@ if (WINDOWS_STORE)
   target_link_options(onnxruntime PRIVATE /DELAYLOAD:api-ms-win-core-libraryloader-l1-2-1.dll)
 endif()
 
-if (winml_is_inbox)
-  # Apply linking flags required by inbox static analysis tools
-  target_link_options(onnxruntime PRIVATE ${os_component_link_flags_list})
-  # Link *_x64/*_arm64 DLLs for the ARM64X forwarder
-  function(duplicate_shared_library target new_target)
-    get_target_property(sources ${target} SOURCES)
-    get_target_property(compile_definitions ${target} COMPILE_DEFINITIONS)
-    get_target_property(compile_options ${target} COMPILE_OPTIONS)
-    get_target_property(include_directories ${target} INCLUDE_DIRECTORIES)
-    get_target_property(link_libraries ${target} LINK_LIBRARIES)
-    get_target_property(link_flags ${target} LINK_FLAGS)
-    get_target_property(link_options ${target} LINK_OPTIONS)
-
-    add_library(${new_target} SHARED ${sources})
-    add_dependencies(${target} ${new_target})
-    target_compile_definitions(${new_target} PRIVATE ${compile_definitions})
-    target_compile_options(${new_target} PRIVATE ${compile_options})
-    target_include_directories(${new_target} PRIVATE ${include_directories})
-    target_link_libraries(${new_target} PRIVATE ${link_libraries})
-    set_property(TARGET ${new_target} PROPERTY LINK_FLAGS "${link_flags}")
-    target_link_options(${new_target} PRIVATE ${link_options})
-  endfunction()
-
-  if (WAI_ARCH STREQUAL x64 OR WAI_ARCH STREQUAL arm64)
-    duplicate_shared_library(onnxruntime onnxruntime_${WAI_ARCH})
-  endif()
-endif()
-
 # Assemble the Apple static framework (iOS and macOS)
 if(onnxruntime_BUILD_APPLE_FRAMEWORK)
   set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 42fcd4ad0d..e7ce2cc8b2 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -291,6 +291,7 @@ else()
           ${MLAS_SRC_DIR}/dgemm.cpp
           ${MLAS_SRC_DIR}/power/DgemmKernelPower.cpp
         )
+        set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp PROPERTIES COMPILE_FLAGS "-DSINGLE")
         check_cxx_compiler_flag("-mcpu=power10" HAS_POWER10)
         if(HAS_POWER10)
           set(CMAKE_REQUIRED_FLAGS "-mcpu=power10")
@@ -318,8 +319,10 @@ else()
             endif()
             set(mlas_platform_srcs_power10
               ${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp
+              ${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp
             )
-            set_source_files_properties(${mlas_platform_srcs_power10} PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10")
+            set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10 -DSINGLE")
+            set_source_files_properties(${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10")
             set(mlas_platform_srcs
               ${mlas_platform_srcs}
               ${mlas_platform_srcs_power10}
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 56685921ae..a82629bed4 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -353,13 +353,18 @@ if (onnxruntime_USE_CUDA)
   endif()
 
   add_dependencies(onnxruntime_providers_cuda onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES} ${onnxruntime_tvm_dependencies})
-  target_link_directories(onnxruntime_providers_cuda PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
-  target_link_libraries(onnxruntime_providers_cuda PRIVATE cublas cudnn curand cufft cupti ${ONNXRUNTIME_PROVIDERS_SHARED})
-  target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
+  target_link_libraries(onnxruntime_providers_cuda PRIVATE cublas cudnn curand cufft ${ONNXRUNTIME_PROVIDERS_SHARED})
+  target_include_directories(onnxruntime_providers_cuda PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${onnxruntime_CUDNN_HOME}/include ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
   # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found
   set_target_properties(onnxruntime_providers_cuda PROPERTIES LINKER_LANGUAGE CUDA)
   set_target_properties(onnxruntime_providers_cuda PROPERTIES FOLDER "ONNXRuntime")
 
+  if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling
+    target_include_directories(onnxruntime_providers_cuda PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/include)
+    target_link_directories(onnxruntime_providers_cuda PRIVATE ${onnxruntime_CUDA_HOME}/extras/CUPTI/lib64)
+    target_link_libraries(onnxruntime_providers_cuda PRIVATE cupti)
+  endif()
+
   if (onnxruntime_ENABLE_NVTX_PROFILE)
     target_link_libraries(onnxruntime_providers_cuda PRIVATE nvToolsExt)
   endif()
@@ -887,11 +892,7 @@ if (onnxruntime_USE_DML)
     "${ONNXRUNTIME_ROOT}/core/providers/dml/*.cc"
   )
   source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_dml_cc_srcs})
-  
-  set_msvc_c_cpp_compiler_warning_level(3)
   onnxruntime_add_static_library(onnxruntime_providers_dml ${onnxruntime_providers_dml_cc_srcs})
-  set_msvc_c_cpp_compiler_warning_level(4)
-
   onnxruntime_add_include_to_target(onnxruntime_providers_dml onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers)
   add_dependencies(onnxruntime_providers_dml ${onnxruntime_EXTERNAL_DEPENDENCIES})
   target_include_directories(onnxruntime_providers_dml PRIVATE ${ONNXRUNTIME_ROOT} ${ONNXRUNTIME_ROOT}/../cmake/external/wil/include)
@@ -932,6 +933,7 @@ if (onnxruntime_USE_DML)
   target_compile_definitions(onnxruntime_providers_dml PRIVATE UNICODE _UNICODE NOMINMAX)
   if (MSVC)
     target_compile_definitions(onnxruntime_providers_dml PRIVATE _SILENCE_CXX17_ITERATOR_BASE_CLASS_DEPRECATION_WARNING)
+    target_compile_options(onnxruntime_providers_dml PRIVATE "/W3")
   endif()
 
   install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/dml  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
diff --git a/csharp/OnnxRuntime.CSharp.proj b/csharp/OnnxRuntime.CSharp.proj
index cf225b34ed..2ba184601c 100644
--- a/csharp/OnnxRuntime.CSharp.proj
+++ b/csharp/OnnxRuntime.CSharp.proj
@@ -44,7 +44,7 @@ CMake creates a target to this project
     <Message Importance="High" Text="Restoring NuGet packages for CSharp projects..." />
     <MSBuild Projects="src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj"
              Targets="Restore" 
-             Properties="Platform=AnyCPU" 
+             Properties="Platform=AnyCPU;OrtPackageId=$(OrtPackageId)" 
              />
     <MSBuild Projects="sample\Microsoft.ML.OnnxRuntime.InferenceSample\Microsoft.ML.OnnxRuntime.InferenceSample.csproj"
              Targets="Restore" 
@@ -65,7 +65,7 @@ CMake creates a target to this project
  
     <MSBuild Projects="src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj"
              Targets="ObtainPackageVersion;Build" 
-             Properties="Platform=AnyCPU"/>
+             Properties="Platform=AnyCPU;OrtPackageId=$(OrtPackageId)"/>
     <MSBuild Projects="sample\Microsoft.ML.OnnxRuntime.InferenceSample\Microsoft.ML.OnnxRuntime.InferenceSample.csproj"
              Targets="Build" 
              Properties="Platform=AnyCPU"
@@ -130,7 +130,7 @@ CMake creates a target to this project
     </Exec>
 
     <Message Importance="High" Text="Bundling native shared library artifacts into a NuGet package ..." />
-    <Exec ContinueOnError="False" Command="$(NugetExe) pack -Symbols -SymbolPackageFormat snupkg NativeNuget.nuspec" ConsoleToMSBuild="true" WorkingDirectory="$(NativeBuildOutputDirAbs)" Condition=" '$(OS)' == 'Windows_NT'">
+    <Exec ContinueOnError="False" Command="$(NugetExe) pack NativeNuget.nuspec" ConsoleToMSBuild="true" WorkingDirectory="$(NativeBuildOutputDirAbs)" Condition=" '$(OS)' == 'Windows_NT'">
       <Output TaskParameter="ConsoleOutput" PropertyName="OutputOfExec" />
     </Exec>
 
diff --git a/csharp/OnnxRuntime.DesktopOnly.CSharp.sln b/csharp/OnnxRuntime.DesktopOnly.CSharp.sln
new file mode 100644
index 0000000000..d02a9e604d
--- /dev/null
+++ b/csharp/OnnxRuntime.DesktopOnly.CSharp.sln
@@ -0,0 +1,230 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.31613.86
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "OnnxRuntime", "OnnxRuntime", "{6EFBFAB8-C606-4BA4-9604-BBAF3788520D}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime", "src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj", "{584B53B3-359D-4DC2-BCD8-530B5D4685AD}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Sample", "Sample", "{02AADD56-0FD4-4F03-A56C-30529A36B0C0}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime.InferenceSample", "sample\InferenceSample\Microsoft.ML.OnnxRuntime.InferenceSample\Microsoft.ML.OnnxRuntime.InferenceSample.csproj", "{2E295930-42B1-422D-925D-F07947AD8EFF}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime.InferenceSample.Forms", "sample\InferenceSample\Microsoft.ML.OnnxRuntime.InferenceSample.Forms\Microsoft.ML.OnnxRuntime.InferenceSample.Forms.csproj", "{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime.InferenceSample.NetCoreApp", "sample\InferenceSample\Microsoft.ML.OnnxRuntime.InferenceSample.NetCoreApp\Microsoft.ML.OnnxRuntime.InferenceSample.NetCoreApp.csproj", "{1AA14958-9246-4163-9403-F650E65ADCBC}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Perf", "Perf", "{05C85C92-A377-4F69-9EF4-44A94C9B089D}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime.PerfTool", "tools\Microsoft.ML.OnnxRuntime.PerfTool\Microsoft.ML.OnnxRuntime.PerfTool.csproj", "{310506FD-6E78-4D62-989B-25D69A85E8CF}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{6782763B-8097-457C-AEA3-67678621DBE0}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime.Tests.Common", "test\Microsoft.ML.OnnxRuntime.Tests.Common\Microsoft.ML.OnnxRuntime.Tests.Common.csproj", "{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}"
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.ML.OnnxRuntime.Tests.NetCoreApp", "test\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj", "{50173D13-DF29-42E7-A30B-8B12D36C77B1}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Debug|iPhone = Debug|iPhone
+		Debug|iPhoneSimulator = Debug|iPhoneSimulator
+		Debug|x86 = Debug|x86
+		Release|Any CPU = Release|Any CPU
+		Release|iPhone = Release|iPhone
+		Release|iPhoneSimulator = Release|iPhoneSimulator
+		Release|x86 = Release|x86
+		RelWithDebInfo|Any CPU = RelWithDebInfo|Any CPU
+		RelWithDebInfo|iPhone = RelWithDebInfo|iPhone
+		RelWithDebInfo|iPhoneSimulator = RelWithDebInfo|iPhoneSimulator
+		RelWithDebInfo|x86 = RelWithDebInfo|x86
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|x86.ActiveCfg = Debug|x86
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Debug|x86.Build.0 = Debug|x86
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|Any CPU.Build.0 = Release|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|iPhone.Build.0 = Release|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|x86.ActiveCfg = Release|x86
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.Release|x86.Build.0 = Release|x86
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|Any CPU.Build.0 = RelWithDebInfo|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|iPhone.ActiveCfg = RelWithDebInfo|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|iPhone.Build.0 = RelWithDebInfo|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = RelWithDebInfo|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|iPhoneSimulator.Build.0 = RelWithDebInfo|Any CPU
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x86
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD}.RelWithDebInfo|x86.Build.0 = RelWithDebInfo|x86
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Debug|x86.Build.0 = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|Any CPU.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|iPhone.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|x86.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.Release|x86.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|Any CPU.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|Any CPU.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|iPhone.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|iPhone.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|iPhoneSimulator.Build.0 = Release|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|x86.ActiveCfg = Debug|Any CPU
+		{2E295930-42B1-422D-925D-F07947AD8EFF}.RelWithDebInfo|x86.Build.0 = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|x86.ActiveCfg = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Debug|x86.Build.0 = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|Any CPU.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|iPhone.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|x86.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.Release|x86.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|Any CPU.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|Any CPU.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|iPhone.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|iPhone.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|iPhoneSimulator.Build.0 = Release|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|x86.ActiveCfg = Debug|Any CPU
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36}.RelWithDebInfo|x86.Build.0 = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|x86.ActiveCfg = Debug|x86
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Debug|x86.Build.0 = Debug|x86
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|Any CPU.Build.0 = Release|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|iPhone.Build.0 = Release|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|x86.ActiveCfg = Release|x86
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.Release|x86.Build.0 = Release|x86
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|Any CPU.Build.0 = RelWithDebInfo|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|iPhone.ActiveCfg = RelWithDebInfo|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|iPhone.Build.0 = RelWithDebInfo|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = RelWithDebInfo|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|iPhoneSimulator.Build.0 = RelWithDebInfo|Any CPU
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x86
+		{1AA14958-9246-4163-9403-F650E65ADCBC}.RelWithDebInfo|x86.Build.0 = RelWithDebInfo|x86
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|x86.ActiveCfg = Debug|x86
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Debug|x86.Build.0 = Debug|x86
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|Any CPU.Build.0 = Release|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|iPhone.Build.0 = Release|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|x86.ActiveCfg = Release|x86
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.Release|x86.Build.0 = Release|x86
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|Any CPU.Build.0 = RelWithDebInfo|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|iPhone.ActiveCfg = RelWithDebInfo|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|iPhone.Build.0 = RelWithDebInfo|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = RelWithDebInfo|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|iPhoneSimulator.Build.0 = RelWithDebInfo|Any CPU
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x86
+		{310506FD-6E78-4D62-989B-25D69A85E8CF}.RelWithDebInfo|x86.Build.0 = RelWithDebInfo|x86
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|x86.ActiveCfg = Debug|x86
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Debug|x86.Build.0 = Debug|x86
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|Any CPU.Build.0 = Release|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|iPhone.Build.0 = Release|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|x86.ActiveCfg = Release|x86
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.Release|x86.Build.0 = Release|x86
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|Any CPU.Build.0 = RelWithDebInfo|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|iPhone.ActiveCfg = RelWithDebInfo|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|iPhone.Build.0 = RelWithDebInfo|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = RelWithDebInfo|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|iPhoneSimulator.Build.0 = RelWithDebInfo|Any CPU
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x86
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6}.RelWithDebInfo|x86.Build.0 = RelWithDebInfo|x86
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|iPhone.ActiveCfg = Debug|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|iPhone.Build.0 = Debug|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|iPhoneSimulator.ActiveCfg = Debug|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|iPhoneSimulator.Build.0 = Debug|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|x86.ActiveCfg = Debug|x86
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Debug|x86.Build.0 = Debug|x86
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|Any CPU.Build.0 = Release|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|iPhone.ActiveCfg = Release|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|iPhone.Build.0 = Release|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|iPhoneSimulator.ActiveCfg = Release|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|iPhoneSimulator.Build.0 = Release|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|x86.ActiveCfg = Release|x86
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.Release|x86.Build.0 = Release|x86
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|Any CPU.ActiveCfg = RelWithDebInfo|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|Any CPU.Build.0 = RelWithDebInfo|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|iPhone.ActiveCfg = RelWithDebInfo|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|iPhone.Build.0 = RelWithDebInfo|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|iPhoneSimulator.ActiveCfg = RelWithDebInfo|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|iPhoneSimulator.Build.0 = RelWithDebInfo|Any CPU
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|x86.ActiveCfg = RelWithDebInfo|x86
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1}.RelWithDebInfo|x86.Build.0 = RelWithDebInfo|x86
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(NestedProjects) = preSolution
+		{584B53B3-359D-4DC2-BCD8-530B5D4685AD} = {6EFBFAB8-C606-4BA4-9604-BBAF3788520D}
+		{2E295930-42B1-422D-925D-F07947AD8EFF} = {02AADD56-0FD4-4F03-A56C-30529A36B0C0}
+		{C5BDDD5D-F811-4CDD-A977-2D8581C21F36} = {02AADD56-0FD4-4F03-A56C-30529A36B0C0}
+		{1AA14958-9246-4163-9403-F650E65ADCBC} = {02AADD56-0FD4-4F03-A56C-30529A36B0C0}
+		{310506FD-6E78-4D62-989B-25D69A85E8CF} = {05C85C92-A377-4F69-9EF4-44A94C9B089D}
+		{04FA49F0-AA23-4EE5-B455-6E12FFAD29E6} = {6782763B-8097-457C-AEA3-67678621DBE0}
+		{50173D13-DF29-42E7-A30B-8B12D36C77B1} = {6782763B-8097-457C-AEA3-67678621DBE0}
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {C3DBDA2B-F169-4EDE-9353-858904124B75}
+	EndGlobalSection
+	GlobalSection(Performance) = preSolution
+		HasPerformanceSessions = true
+	EndGlobalSection
+EndGlobal
diff --git a/csharp/readme.txt b/csharp/readme.txt
new file mode 100644
index 0000000000..5b9a18847a
--- /dev/null
+++ b/csharp/readme.txt
@@ -0,0 +1,6 @@
+The main solution file is OnnxRuntime.CSharp.sln. It includes both the desktop and the Xamarin mobile projects.
+OnnxRuntime.DesktopOnly.CSharp.sln is a copy of that file with all the mobile projects removed. It exists
+because there is no way to selectively exclude a csproj from the sln when Xamarin isn't available.
+
+If changes are required, either update the main solution first and copy the relevant changes across,
+or copy the entire file and remove the mobile projects (anything with iOS, Android or Droid in the name).
\ No newline at end of file
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
index 2eb04ab4fd..46b30483a0 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
@@ -1,7 +1,25 @@
 <Project Sdk="MSBuild.Sdk.Extras/3.0.22">
-
   <PropertyGroup>
-    <TargetFrameworks>netstandard1.1;netstandard2.0;xamarinios10;monoandroid11.0;net5.0;netcoreapp3.1</TargetFrameworks>
+    <!--- packaging properties -->
+    <OrtPackageId Condition="'$(OrtPackageId)' == ''">Microsoft.ML.OnnxRuntime</OrtPackageId>
+  </PropertyGroup>
+
+  <!-- only include the Xamarin mobile targets for the main ORT package, 
+       and only if the mobile workloads are installed -->
+  <Choose>
+    <When Condition="'$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime' AND Exists('$(MSBuildExtensionsPath)\Xamarin\Android') AND Exists('$(MSBuildExtensionsPath)\Xamarin\iOS')">
+      <PropertyGroup>
+        <TargetFrameworks>netstandard1.1;netstandard2.0;xamarinios10;monoandroid11.0;net5.0;netcoreapp3.1</TargetFrameworks>
+      </PropertyGroup>      
+    </When>
+    <Otherwise>
+      <PropertyGroup>
+        <TargetFrameworks>netstandard1.1;netstandard2.0;net5.0;netcoreapp3.1</TargetFrameworks>
+      </PropertyGroup>    
+   </Otherwise>
+  </Choose>
+  
+  <PropertyGroup>
     <Platforms>AnyCPU;x86</Platforms>
     <LangVersion>7.2</LangVersion>
     <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
@@ -19,9 +37,6 @@
     <EnableDefaultCompileItems>false</EnableDefaultCompileItems>
     <DebugType>portable</DebugType>
 
-    <!--- packaging properties -->
-    <OrtPackageId Condition=" '$(OrtPackageId)' == '' ">Microsoft.ML.OnnxRuntime</OrtPackageId>
-
     <!--- The package name is always hardcoded as the package created by this project only contains managed assemblies -->
     <!--- The parameter OrtPackageId is only used for some conditional logic below -->    
     <PackageId>Microsoft.ML.OnnxRuntime.Managed</PackageId>
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
index e708950d24..09b3cc48c9 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs
@@ -223,6 +223,18 @@ namespace Microsoft.ML.OnnxRuntime
         public IntPtr GetSparseTensorValues;
         public IntPtr GetSparseTensorIndicesTypeShape;
         public IntPtr GetSparseTensorIndices;
+        public IntPtr HasValue;
+        public IntPtr KernelContext_GetGPUComputeStream;
+        public IntPtr GetTensorMemoryInfo;
+        public IntPtr GetExecutionProviderApi;
+        public IntPtr SessionOptionsSetCustomCreateThreadFn;
+        public IntPtr SessionOptionsSetCustomThreadCreationOptions;
+        public IntPtr SessionOptionsSetCustomJoinThreadFn;
+        public IntPtr SetGlobalCustomCreateThreadFn;
+        public IntPtr SetGlobalCustomThreadCreationOptions;
+        public IntPtr SetGlobalCustomJoinThreadFn;
+        public IntPtr SynchronizeBoundInputs;
+        public IntPtr SynchronizeBoundOutputs;
     }
 
     internal static class NativeMethods
@@ -328,12 +340,15 @@ namespace Microsoft.ML.OnnxRuntime
             OrtCreateIoBinding = (DOrtCreateIoBinding)Marshal.GetDelegateForFunctionPointer(api_.CreateIoBinding, typeof(DOrtCreateIoBinding));
             OrtReleaseIoBinding = (DOrtReleaseIoBinding)Marshal.GetDelegateForFunctionPointer(api_.ReleaseIoBinding, typeof(DOrtReleaseIoBinding));
             OrtBindInput = (DOrtBindInput)Marshal.GetDelegateForFunctionPointer(api_.BindInput, typeof(DOrtBindInput));
+            OrtSynchronizeBoundInputs = (DOrtSynchronizeBoundInputs)Marshal.GetDelegateForFunctionPointer(api_.SynchronizeBoundInputs, typeof(DOrtSynchronizeBoundInputs));
             OrtBindOutput = (DOrtBindOutput)Marshal.GetDelegateForFunctionPointer(api_.BindOutput, typeof(DOrtBindOutput));
             OrtBindOutputToDevice = (DOrtBindOutputToDevice)Marshal.GetDelegateForFunctionPointer(api_.BindOutputToDevice, typeof(DOrtBindOutputToDevice));
+            OrtSynchronizeBoundOutputs = (DOrtSynchronizeBoundOutputs)Marshal.GetDelegateForFunctionPointer(api_.SynchronizeBoundOutputs, typeof(DOrtSynchronizeBoundOutputs));
             OrtGetBoundOutputNames = (DOrtGetBoundOutputNames)Marshal.GetDelegateForFunctionPointer(api_.GetBoundOutputNames, typeof(DOrtGetBoundOutputNames));
             OrtGetBoundOutputValues = (DOrtGetBoundOutputValues)Marshal.GetDelegateForFunctionPointer(api_.GetBoundOutputValues, typeof(DOrtGetBoundOutputValues));
             OrtClearBoundInputs = (DOrtClearBoundInputs)Marshal.GetDelegateForFunctionPointer(api_.ClearBoundInputs, typeof(DOrtClearBoundInputs));
             OrtClearBoundOutputs = (DOrtClearBoundOutputs)Marshal.GetDelegateForFunctionPointer(api_.ClearBoundOutputs, typeof(DOrtClearBoundOutputs));
+
             OrtTensorAt = (DOrtTensorAt)Marshal.GetDelegateForFunctionPointer(api_.TensorAt, typeof(DOrtTensorAt));
             OrtCreateAndRegisterAllocator = (DOrtCreateAndRegisterAllocator)Marshal.GetDelegateForFunctionPointer(api_.CreateAndRegisterAllocator, typeof(DOrtCreateAndRegisterAllocator));
             OrtSetLanguageProjection = (DOrtSetLanguageProjection)Marshal.GetDelegateForFunctionPointer(api_.SetLanguageProjection, typeof(DOrtSetLanguageProjection));
@@ -482,7 +497,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 byte[] modelPath,
                                                 IntPtr /* (OrtSessionOptions*) */sessopnOptions,
                                                 out IntPtr /**/ session);
-        
+
         public static DOrtCreateSession OrtCreateSession;
 
         /// <summary>
@@ -500,7 +515,7 @@ namespace Microsoft.ML.OnnxRuntime
                                         IntPtr /* (OrtSessionOptions*) */sessionOptions,
                                         IntPtr /* (OrtPrepackedWeightsContainer*) */prepackedWeightsContainer,
                                         out IntPtr /* (OrtSession**) */ session);
-        
+
         public static DOrtCreateSessionWithPrepackedWeightsContainer OrtCreateSessionWithPrepackedWeightsContainer;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -542,7 +557,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 UIntPtr outputCount,
                                                 IntPtr[] outputValues /* An array of output value pointers. Array must be allocated by the caller */
                                                 );
-        
+
         public static DOrtRun OrtRun;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -551,28 +566,28 @@ namespace Microsoft.ML.OnnxRuntime
                                                 IntPtr /*(OrtSessionRunOptions*)*/ runOptions, // can not be null
                                                 IntPtr /*(const OrtIoBinding*)*/ io_binding
                                                 );
-        
+
         public static DOrtRunWithBinding OrtRunWithBinding;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /*(OrtStatus*)*/ DOrtSessionGetInputCount(
                                                 IntPtr /*(OrtSession*)*/ session,
                                                 out UIntPtr count);
-        
+
         public static DOrtSessionGetInputCount OrtSessionGetInputCount;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /*(OrtStatus*)*/ DOrtSessionGetOutputCount(
                                                 IntPtr /*(OrtSession*)*/ session,
                                                 out UIntPtr count);
-        
+
         public static DOrtSessionGetOutputCount OrtSessionGetOutputCount;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /*(OrtStatus*)*/ DOrtSessionGetOverridableInitializerCount(
                                                 IntPtr /*(OrtSession*)*/ session,
                                                 out UIntPtr count);
-        
+
         public static DOrtSessionGetOverridableInitializerCount OrtSessionGetOverridableInitializerCount;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -581,7 +596,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 UIntPtr index,
                                                 IntPtr /*(OrtAllocator*)*/ allocator,
                                                 out IntPtr /*(char**)*/name);
-        
+
         public static DOrtSessionGetInputName OrtSessionGetInputName;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -590,7 +605,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 UIntPtr index,
                                                 IntPtr /*(OrtAllocator*)*/ allocator,
                                                 out IntPtr /*(char**)*/name);
-        
+
         public static DOrtSessionGetOutputName OrtSessionGetOutputName;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -598,7 +613,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 IntPtr /*(const OrtSession*)*/ session,
                                                 IntPtr /*(OrtAllocator*)*/ allocator,
                                                 out IntPtr /*(char**)*/profile_file);
-        
+
         public static DOrtSessionEndProfiling OrtSessionEndProfiling;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -607,7 +622,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 UIntPtr index,
                                                 IntPtr /*(OrtAllocator*)*/ allocator,
                                                 out IntPtr /*(char**)*/name);
-        
+
         public static DOrtSessionGetOverridableInitializerName OrtSessionGetOverridableInitializerName;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -615,7 +630,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 IntPtr /*(const OrtSession*)*/ session,
                                                 UIntPtr index,
                                                 out IntPtr /*(struct OrtTypeInfo**)*/ typeInfo);
-        
+
         public static DOrtSessionGetInputTypeInfo OrtSessionGetInputTypeInfo;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -623,7 +638,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 IntPtr /*(const OrtSession*)*/ session,
                                                 UIntPtr index,
                                                 out IntPtr /* (struct OrtTypeInfo**)*/ typeInfo);
-        
+
         public static DOrtSessionGetOutputTypeInfo OrtSessionGetOutputTypeInfo;
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
@@ -631,7 +646,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                 IntPtr /*(const OrtSession*)*/ session,
                                                 UIntPtr index,
                                                 out IntPtr /* (struct OrtTypeInfo**)*/ typeInfo);
-        
+
         public static DOrtSessionGetOverridableInitializerTypeInfo OrtSessionGetOverridableInitializerTypeInfo;
 
         // release the typeinfo using OrtReleaseTypeInfo
@@ -854,9 +869,9 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtAddInitializer OrtAddInitializer;
 
-#endregion
+        #endregion
 
-#region RunOptions API
+        #region RunOptions API
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /*(OrtStatus*)*/ DOrtCreateRunOptions(out IntPtr /* OrtRunOptions** */ runOptions);
@@ -900,9 +915,9 @@ namespace Microsoft.ML.OnnxRuntime
         public delegate IntPtr /*(OrtStatus*)*/ DOrtRunOptionsUnsetTerminate(IntPtr /* OrtRunOptions* */ options);
         public static DOrtRunOptionsUnsetTerminate OrtRunOptionsUnsetTerminate;
 
-#endregion
+        #endregion
 
-#region Allocator/MemoryInfo API
+        #region Allocator/MemoryInfo API
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /* (OrtStatus*)*/ DOrtCreateMemoryInfo(
@@ -1041,9 +1056,9 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtAllocatorFree OrtAllocatorFree;
 
-#endregion Allocator/MemoryInfo API
+        #endregion Allocator/MemoryInfo API
 
-#region IoBinding API
+        #region IoBinding API
 
         /// <summary>
         /// Create OrtIoBinding instance that is used to bind memory that is allocated
@@ -1081,6 +1096,20 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtBindInput OrtBindInput;
 
+        /// <summary>
+        /// Calls Sync() on all execution providers present. This blocks until the device has completed
+        /// all preceding requested tasks, which is necessary when memory synchronization is required.
+        /// For example, the memory bound to an input is likely to be on a different CUDA stream.
+        /// For some scenarios and devices this may be a no-op, so whether to call it
+        /// is left to the caller's judgment.
+        /// </summary>
+        /// <param name="io_binding">instance of OrtIoBinding</param>
+        /// <returns>An instance of OrtStatus or null</returns>
+        [UnmanagedFunctionPointer(CallingConvention.Winapi)]
+        public delegate IntPtr /* OrtStatus* */ DOrtSynchronizeBoundInputs(IntPtr /*(OrtIoBinding*)*/ io_binding);
+
+        public static DOrtSynchronizeBoundInputs OrtSynchronizeBoundInputs;
+
         /// <summary>
         /// Bind OrtValue to the model output with the specified name
         /// If binding with the specified name already exists, it will be replaced
@@ -1109,6 +1138,18 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtBindOutputToDevice OrtBindOutputToDevice;
 
+        /// <summary>
+        /// Calls Sync() on all execution providers present. This blocks until the device has completed
+        /// all preceding requested tasks, which is necessary when memory synchronization is required.
+        /// For some scenarios and devices this may be a no-op; whether to call it is left to the caller.
+        /// </summary>
+        /// <param name="io_binding">instance of OrtIoBinding</param>
+        /// <returns>An instance of OrtStatus or null</returns>
+        [UnmanagedFunctionPointer(CallingConvention.Winapi)]
+        public delegate IntPtr /* OrtStatus* */ DOrtSynchronizeBoundOutputs(IntPtr /*(OrtIoBinding*)*/ io_binding);
+
+        public static DOrtSynchronizeBoundOutputs OrtSynchronizeBoundOutputs;
+
         /// <summary>
         /// The function will return all bound output names in the order they were bound.
         /// It is the same order that the output values will be returned after RunWithBinding() is used.
@@ -1205,9 +1246,9 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtSetLanguageProjection OrtSetLanguageProjection;
 
-#endregion IoBinding API
+        #endregion IoBinding API
 
-#region ModelMetadata API
+        #region ModelMetadata API
 
         /// <summary>
         /// Gets the ModelMetadata associated with an InferenceSession
@@ -1326,9 +1367,9 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtReleaseModelMetadata OrtReleaseModelMetadata;
 
-#endregion ModelMetadata API
+        #endregion ModelMetadata API
 
-#region Tensor/OnnxValue API
+        #region Tensor/OnnxValue API
 
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /*(OrtStatus*)*/ DOrtGetValue(IntPtr /*(OrtValue*)*/ value,
@@ -1337,7 +1378,7 @@ namespace Microsoft.ML.OnnxRuntime
                                                                  out IntPtr /*(OrtValue**)*/ outputValue);
 
         public static DOrtGetValue OrtGetValue;
-      
+
         [UnmanagedFunctionPointer(CallingConvention.Winapi)]
         public delegate IntPtr /*(OrtStatus*)*/ DOrtGetValueType(IntPtr /*(OrtValue*)*/ value, out IntPtr /*(OnnxValueType*)*/ onnxtype);
 
@@ -1485,9 +1526,9 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtReleaseValue OrtReleaseValue;
 
-#endregion
+        #endregion
 
-#region Misc API
+        #region Misc API
 
         /// <summary>
         /// Queries all the execution providers supported in the native onnxruntime shared library
@@ -1527,7 +1568,7 @@ namespace Microsoft.ML.OnnxRuntime
 
         public static DOrtReleasePrepackedWeightsContainer OrtReleasePrepackedWeightsContainer;
 
-#endregion
+        #endregion
 
         public static byte[] GetPlatformSerializedString(string str)
         {
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtIoBinding.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtIoBinding.shared.cs
index 84bda820ee..40549a6848 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtIoBinding.shared.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtIoBinding.shared.cs
@@ -8,7 +8,7 @@ using System.Text;
 namespace Microsoft.ML.OnnxRuntime
 {
     /// <summary>
-    /// This class enable to bind inputs and outputs to pre-allocated
+    /// This class enables binding of inputs and/or outputs to pre-allocated
     /// memory. This enables interesting scenarios. For example, if your input
     /// already resides in some pre-allocated memory like GPU, you can bind
     /// that piece of memory to an input name and shape and onnxruntime will use that as input.
@@ -87,6 +87,15 @@ namespace Microsoft.ML.OnnxRuntime
             BindInputOrOutput(name, fixedValue.Value.Handle, true);
         }
 
+        /// <summary>
+        /// Blocks until device completes all preceding requested tasks. Useful for memory
+        /// synchronization. The native status is verified rather than discarded, as in the other binding calls.
+        /// </summary>
+        public void SynchronizeBoundInputs()
+        {
+            NativeApiStatus.VerifySuccess(NativeMethods.OrtSynchronizeBoundInputs(handle));
+        }
+
         /// <summary>
         /// Bind model output to an OrtValue as Tensor with a given type and shape. An instance of OrtMemoryAllocaiton
         /// owns the memory and should be alive for the time of execution.The size of the allocation can not be less than required
@@ -133,6 +142,15 @@ namespace Microsoft.ML.OnnxRuntime
             NativeApiStatus.VerifySuccess(NativeMethods.OrtBindOutputToDevice(handle, pinnedName.Pointer, memInfo.Pointer));
         }
 
+        /// <summary>
+        /// Blocks until device completes all preceding requested tasks. Useful for memory
+        /// synchronization. The native status is verified rather than discarded, as in the other binding calls.
+        /// </summary>
+        public void SynchronizeBoundOutputs()
+        {
+            NativeApiStatus.VerifySuccess(NativeMethods.OrtSynchronizeBoundOutputs(handle));
+        }
+
         /// <summary>
         /// Internal helper
         /// </summary>
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
index 892bfb2865..b9ed2d4f26 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs
@@ -1670,8 +1670,10 @@ namespace Microsoft.ML.OnnxRuntime.Tests
                 {
                     ioBinding.BindInput(inputName, fixeInputBuffer);
                     ioBinding.BindOutput(outputName, fixedOutputBuffer);
+                    ioBinding.SynchronizeBoundInputs();
                     using (var outputs = session.RunWithBindingAndNames(runOptions, ioBinding))
                     {
+                        ioBinding.SynchronizeBoundOutputs();
                         Assert.Equal(1, outputs.Count);
                         var output = outputs.First();
                         Assert.Equal(outputName, output.Name);
@@ -1687,9 +1689,10 @@ namespace Microsoft.ML.OnnxRuntime.Tests
                 {
                     ioBinding.BindInput(inputName, fixedInputBuffer);
                     ioBinding.BindOutputToDevice(outputName, allocator.Info);
-
+                    ioBinding.SynchronizeBoundInputs();
                     using (var outputs = session.RunWithBindingAndNames(runOptions, ioBinding))
                     {
+                        ioBinding.SynchronizeBoundOutputs();
                         Assert.Equal(1, outputs.Count);
                         var output = outputs.First();
                         Assert.Equal(outputName, output.Name);
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtIoBindingAllocationTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtIoBindingAllocationTest.cs
index 84cea5ea90..7c9fcfe348 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtIoBindingAllocationTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/OrtIoBindingAllocationTest.cs
@@ -73,8 +73,10 @@ namespace Microsoft.ML.OnnxRuntime.Tests
                 {
                     ioBinding.BindInput(inputName, fixedInputBuffer);
                     ioBinding.BindOutput(outputName, Tensors.TensorElementType.Float, outputShape, ortAllocationOutput);
+                    ioBinding.SynchronizeBoundInputs();
                     using (var outputs = session.RunWithBindingAndNames(runOptions, ioBinding))
                     {
+                        ioBinding.SynchronizeBoundOutputs();
                         Assert.Equal(1, outputs.Count);
                         var output = outputs.ElementAt(0);
                         Assert.Equal(outputName, output.Name);
@@ -88,8 +90,10 @@ namespace Microsoft.ML.OnnxRuntime.Tests
                 {
                     ioBinding.BindInput(inputName, Tensors.TensorElementType.Float, inputShape, ortAllocationInput);
                     ioBinding.BindOutput(outputName, Tensors.TensorElementType.Float, outputShape, ortAllocationOutput);
+                    ioBinding.SynchronizeBoundInputs();
                     using (var outputs = session.RunWithBindingAndNames(runOptions, ioBinding))
                     {
+                        ioBinding.SynchronizeBoundOutputs();
                         Assert.Equal(1, outputs.Count);
                         var output = outputs.ElementAt(0);
                         Assert.Equal(outputName, output.Name);
diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino
index b449ea1194..18d40bd78e 100644
--- a/dockerfiles/Dockerfile.openvino
+++ b/dockerfiles/Dockerfile.openvino
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: MIT
 #--------------------------------------------------------------------------
 
-ARG OPENVINO_VERSION=2021.4.1
+ARG OPENVINO_VERSION=2021.4.2
 
 
 # Build stage
diff --git a/dockerfiles/Dockerfile.openvino-centos7 b/dockerfiles/Dockerfile.openvino-centos7
index af14da9e64..e500f3ce5a 100755
--- a/dockerfiles/Dockerfile.openvino-centos7
+++ b/dockerfiles/Dockerfile.openvino-centos7
@@ -8,12 +8,12 @@ FROM centos:7.8.2003
 WORKDIR /code
 
 ARG MY_ROOT=/code
-ARG YUM_OV_PACKAGE=intel-openvino-runtime-centos7-2021.4.689.x86_64
+ARG YUM_OV_PACKAGE=intel-openvino-runtime-centos7-2021.4.752.x86_64
 ARG DEVICE=CPU_FP32
 ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=master
 
-ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.689
+ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.752
 ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share
 ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64
 ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/ngraph/cmake
@@ -58,7 +58,7 @@ RUN yum update -y && \
     yum update -y && yum list intel-openvino* && \
     yum install -y $YUM_OV_PACKAGE && \
     cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && \
-    printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2021.4.689/bin/setupvars.sh && \
+    printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2021.4.752/bin/setupvars.sh && \
     cd /opt/libusb-1.0.22 && \
     /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \
     cp /opt/intel/openvino_2021/deployment_tools/inference_engine/external/97-myriad-usbboot.rules /etc/udev/rules.d/ && \
diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp
index eec0d0934c..961e3b30f6 100644
--- a/dockerfiles/Dockerfile.openvino-csharp
+++ b/dockerfiles/Dockerfile.openvino-csharp
@@ -15,7 +15,7 @@ ARG MY_ROOT=/code
 ENV PATH /opt/miniconda/bin:/code/cmake-3.21.0-linux-x86_64/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/miniconda/lib:/usr/lib:/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
 
-ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.689
+ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.752
 ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share
 ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64
 ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/inference_engine/external/gna/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/omp/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH}
@@ -54,7 +54,7 @@ RUN apt update -y && \
     cd /etc/apt/sources.list.d && \
     echo "deb https://apt.repos.intel.com/openvino/2021 all main">intel-openvino-2021.list && \ 
     apt update -y && \
-    apt -y install intel-openvino-dev-ubuntu18-2021.4.689 && \
+    apt -y install intel-openvino-dev-ubuntu18-2021.4.752 && \
     cd ${INTEL_OPENVINO_DIR}/install_dependencies && ./install_openvino_dependencies.sh -y && \
     cd ${INTEL_OPENVINO_DIR} && rm -rf documentation data_processing && \
     cd deployment_tools/ && rm -rf model_optimizer open_model_zoo demo tools && \
@@ -82,7 +82,7 @@ RUN apt update -y && \
     cd ${MY_ROOT} && \
     apt install -y gnupg ca-certificates && \
     #apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF && \
-    curl http://download.mono-project.com/repo/xamarin.gpg | apt-key add - && \
+    curl https://download.mono-project.com/repo/xamarin.gpg | apt-key add - && \
     echo "deb https://download.mono-project.com/repo/ubuntu stable-bionic main" | sudo tee /etc/apt/sources.list.d/mono-official-stable.list && \
     apt update -y && \
     apt install -y mono-devel && \
@@ -97,13 +97,14 @@ RUN apt update -y && \
     apt-get update -y &&\
     apt-get install -y apt-transport-https && \
     apt-get update -y && \
-    apt-get install -y dotnet-sdk-3.1 && \
+    apt-get install -y dotnet-sdk-5.0 && \
 # Download and build ONNX Runtime
     cd ${MY_ROOT} && \
     git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \
     /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh && \
     pip install onnx==1.9 && \
     cd ${MY_ROOT}/onnxruntime && ./build.sh --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib && \
+    cp ${MY_ROOT}/onnxruntime/build/Linux/Release/Microsoft.ML.OnnxRuntime.Managed* ${MY_ROOT}/onnxruntime/build/Linux/Release/nuget-artifacts && \
     mv ${MY_ROOT}/onnxruntime/build/Linux/Release/nuget-artifacts ${MY_ROOT} && \
 # Clean-up unnecessary files
     rm -rf ${MY_ROOT}/cmake* /opt/cmake ${MY_ROOT}/onnxruntime && \
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index d92b791d30..3a0171a064 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -14,6 +14,7 @@ Do not modify directly.*
   * <a href="#com.microsoft.ComplexMulConj">com.microsoft.ComplexMulConj</a>
   * <a href="#com.microsoft.ConvTransposeWithDynamicPads">com.microsoft.ConvTransposeWithDynamicPads</a>
   * <a href="#com.microsoft.CropAndResize">com.microsoft.CropAndResize</a>
+  * <a href="#com.microsoft.DecoderAttention">com.microsoft.DecoderAttention</a>
   * <a href="#com.microsoft.DequantizeLinear">com.microsoft.DequantizeLinear</a>
   * <a href="#com.microsoft.DynamicQuantizeLSTM">com.microsoft.DynamicQuantizeLSTM</a>
   * <a href="#com.microsoft.DynamicQuantizeMatMul">com.microsoft.DynamicQuantizeMatMul</a>
@@ -717,6 +718,72 @@ This version of the operator has been available since version 1 of the 'com.micr
 </dl>
 
 
+### <a name="com.microsoft.DecoderAttention"></a><a name="com.microsoft.decoderattention">**com.microsoft.DecoderAttention**</a>
+
+  This DecoderAttention supports self attention and cross attention, key and value cache, and key_padding_mask. The attention mask is not supported at the moment.
+  Some boolean parameters are passed as runtime inputs for generality.
+
+#### Version
+
+This version of the operator has been available since version 1 of the 'com.microsoft' operator set.
+
+#### Attributes
+
+<dl>
+<dt><tt>num_heads</tt> : int (required)</dt>
+<dd>Number of attention heads</dd>
+</dl>
+
+#### Inputs
+
+<dl>
+<dt><tt>query</tt> : T</dt>
+<dd>3D input tensor with shape (sequence_length, batch_size, hidden_size), hidden_size = num_heads * head_size</dd>
+<dt><tt>key</tt> : T</dt>
+<dd>3D input tensor with shape (total_sequence_length, batch_size, hidden_size)</dd>
+<dt><tt>q_weight</tt> : T</dt>
+<dd>2D input tensor with shape (hidden_size, hidden_size)</dd>
+<dt><tt>kv_weight</tt> : T</dt>
+<dd>2D input tensor with shape (hidden_size, 2 * hidden_size)</dd>
+<dt><tt>bias</tt> : T</dt>
+<dd>1D input tensor with shape (3 * hidden_size)</dd>
+<dt><tt>key_padding_mask</tt> (optional) : B</dt>
+<dd>2D input tensor with shape (batch_size, total_sequence_length)</dd>
+<dt><tt>key_cache</tt> (optional) : T</dt>
+<dd>input tensor with shape (batch_size, num_heads, sequence_length or total_sequence_length, head_size)</dd>
+<dt><tt>value_cache</tt> (optional) : T</dt>
+<dd>input tensor with shape (batch_size, num_heads, sequence_length or total_sequence_length, head_size)</dd>
+<dt><tt>static_kv</tt> : B</dt>
+<dd>If static_kv = true, cross-attention; else self-attention</dd>
+<dt><tt>use_past</tt> : B</dt>
+<dd>If use_past = true, use cache; else no cache</dd>
+<dt><tt>has_layer_state</tt> : B</dt>
+<dd>If has_layer_state = true, layer_state = {} or [a,b]; else layer_state = None</dd>
+<dt><tt>has_key_padding_mask</tt> : B</dt>
+<dd>has_key_padding_mask or not</dd>
+</dl>
+
+#### Outputs (1 - 3)
+
+<dl>
+<dt><tt>output</tt> : T</dt>
+<dd>3D output tensor with shape (sequence_length, batch_size, hidden_size)</dd>
+<dt><tt>new_key_cache</tt> (optional) : T</dt>
+<dd>output tensor with shape (batch_size, num_heads, new sequence_length, head_size)</dd>
+<dt><tt>new_value_cache</tt> (optional) : T</dt>
+<dd>output tensor with shape (batch_size, num_heads, new sequence_length, head_size)</dd>
+</dl>
+
+#### Type Constraints
+
+<dl>
+<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dd>Constrain input and output types to float and float16 tensors.</dd>
+<dt><tt>B</tt> : tensor(bool)</dt>
+<dd>Constrain key_padding_mask to bool tensors.</dd>
+</dl>
+
+
 ### <a name="com.microsoft.DequantizeLinear"></a><a name="com.microsoft.dequantizelinear">**com.microsoft.DequantizeLinear**</a>
 
   The linear dequantization operator. It consumes a quantized data, a scale, a zero point and computes the full precision data.
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index ec0f394ce1..a7ea71314f 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -400,7 +400,7 @@ Do not modify directly.*
 |MaxpoolWithMask|*in* X:**T**<br> *in* M:**tensor(int32)**<br> *out* Y:**T**|1+|**X** = tensor(float)|
 |MurmurHash3|*in* X:**T1**<br> *out* Y:**T2**|1+|**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(string), tensor(uint32), tensor(uint64)<br/> **T2** = tensor(int32), tensor(uint32)|
 |NGramRepeatBlock|*in* input_ids:**Tid**<br> *in* scores:**T**<br> *out* scores_out:**T**|1+|**T** = tensor(float)<br/> **Tid** = tensor(int64)|
-|NhwcMaxPool|*in* x:**T**<br> *out* y:**T**|1+|**T** = tensor(uint8)|
+|NhwcMaxPool|*in* x:**T**<br> *out* y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |Pad|*in* data:**T**<br> *in* pads:**tensor(int64)**<br> *in* value:**T**<br> *out* output:**T**|1+|**T** = tensor(float)|
 |QAttention|*in* input:**T1**<br> *in* weight:**T2**<br> *in* bias:**T3**<br> *in* input_scale:**T3**<br> *in* weight_scale:**T3**<br> *in* mask_index:**T4**<br> *in* input_zero_point:**T1**<br> *in* weight_zero_point:**T2**<br> *in* past:**T3**<br> *out* output:**T3**<br> *out* present:**T3**|1+|**T1** = tensor(uint8)<br/> **T2** = tensor(int8), tensor(uint8)<br/> **T3** = tensor(float)<br/> **T4** = tensor(int32)|
 |QEmbedLayerNormalization|*in* input_ids:**T1**<br> *in* segment_ids:**T1**<br> *in* word_embedding_quant:**T2**<br> *in* position_embedding_quant:**T2**<br> *in* segment_embedding:**T2**<br> *in* gamma_quant:**T2**<br> *in* beta_quant:**T2**<br> *in* mask:**T1**<br> *in* word_embedding_scale:**T**<br> *in* position_embedding_scale:**T**<br> *in* segment_embedding_scale:**T**<br> *in* gamma_scale:**T**<br> *in* beta_scale:**T**<br> *in* word_embedding_zero_point:**T2**<br> *in* position_embedding_zero_point:**T2**<br> *in* segment_embedding_zero_point:**T2**<br> *in* gamma_zero_point:**T2**<br> *in* beta_zero_point:**T2**<br> *out* layernorm_out:**T**<br> *out* mask_index_out:**T1**|1+|**T** = tensor(float)|
@@ -600,6 +600,10 @@ Do not modify directly.*
 |QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|10+|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
 |RNN|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |||[7, 13]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
+|RandomNormal|*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
+|RandomNormalLike|*in* input:**T1**<br> *out* output:**T2**|1+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
+|RandomUniform|*out* output:**T**|1+|**T** = tensor(double), tensor(float), tensor(float16)|
+|RandomUniformLike|*in* input:**T1**<br> *out* output:**T2**|1+|**T1** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)<br/> **T2** = tensor(double), tensor(float), tensor(float16)|
 |Range|*in* start:**T**<br> *in* limit:**T**<br> *in* delta:**T**<br> *out* output:**T**|11+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)|
 |Reciprocal|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[6, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
@@ -729,6 +733,7 @@ Do not modify directly.*
 |ComplexMul|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(float), tensor(float16)|
 |ComplexMulConj|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|1+|**T** = tensor(float), tensor(float16)|
 |ConvTransposeWithDynamicPads|*in* X:**T**<br> *in* W:**T**<br> *in* Pads:**tensor(int64)**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float)|
+|DecoderAttention|*in* query:**T**<br> *in* key:**T**<br> *in* q_weight:**T**<br> *in* kv_weight:**T**<br> *in* bias:**T**<br> *in* key_padding_mask:**B**<br> *in* key_cache:**T**<br> *in* value_cache:**T**<br> *in* static_kv:**B**<br> *in* use_past:**B**<br> *in* has_layer_state:**B**<br> *in* has_key_padding_mask:**B**<br> *out* output:**T**<br> *out* new_key_cache:**T**<br> *out* new_value_cache:**T**|1+|**T** = tensor(float), tensor(float16)|
 |DequantizeLinear|*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|1+|**T1** = tensor(int8), tensor(uint8)<br/> **T2** = tensor(float16)|
 |EmbedLayerNormalization|*in* input_ids:**T1**<br> *in* segment_ids:**T1**<br> *in* word_embedding:**T**<br> *in* position_embedding:**T**<br> *in* segment_embedding:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* mask:**T1**<br> *in* position_ids:**T1**<br> *out* output:**T**<br> *out* mask_index:**T1**<br> *out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
 |FastGelu|*in* X:**T**<br> *in* bias:**T**<br> *out* Y:**T**|1+|**T** = tensor(bfloat16), tensor(float), tensor(float16)|
diff --git a/include/onnxruntime/core/framework/data_types.h b/include/onnxruntime/core/framework/data_types.h
index f311251385..ccb28173af 100644
--- a/include/onnxruntime/core/framework/data_types.h
+++ b/include/onnxruntime/core/framework/data_types.h
@@ -33,7 +33,7 @@ namespace onnxruntime {
 
 #if !defined(DISABLE_ML_OPS)
 
-//maps (only used by ML ops)
+// maps (only used by ML ops)
 using MapStringToString = std::map<std::string, std::string>;
 using MapStringToInt64 = std::map<std::string, int64_t>;
 using MapStringToFloat = std::map<std::string, float>;
@@ -43,7 +43,7 @@ using MapInt64ToInt64 = std::map<int64_t, int64_t>;
 using MapInt64ToFloat = std::map<int64_t, float>;
 using MapInt64ToDouble = std::map<int64_t, double>;
 
-//vectors/sequences
+// vectors/sequences
 using VectorMapStringToFloat = std::vector<MapStringToFloat>;
 using VectorMapInt64ToFloat = std::vector<MapInt64ToFloat>;
 
@@ -78,6 +78,23 @@ using CreateFunc = void* (*)();
  *
  */
 class DataTypeImpl {
+ public:
+  enum class GeneralType {
+    kInvalid = 0,
+    kNonTensor = 1,
+    kTensor = 2,
+    kTensorSequence = 3,
+    kSparseTensor = 4,
+    kOptional = 5,
+    kPrimitive = 6,
+  };
+
+  const GeneralType type_;
+  const size_t size_;
+
+ protected:
+  DataTypeImpl(GeneralType type, size_t size) : type_{type}, size_{size} {}
+
  public:
   virtual ~DataTypeImpl() = default;
 
@@ -90,7 +107,7 @@ class DataTypeImpl {
    */
   virtual bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const = 0;
 
-  virtual size_t Size() const = 0;
+  size_t Size() const { return size_; }
 
   virtual DeleteFunc GetDeleteFunc() const = 0;
 
@@ -102,53 +119,49 @@ class DataTypeImpl {
    */
   virtual const ONNX_NAMESPACE::TypeProto* GetTypeProto() const = 0;
 
-  virtual bool IsTensorType() const {
-    return false;
+  bool IsTensorType() const {
+    return type_ == GeneralType::kTensor;
   }
 
-  virtual bool IsTensorSequenceType() const {
-    return false;
+  bool IsTensorSequenceType() const {
+    return type_ == GeneralType::kTensorSequence;
   }
 
-  virtual bool IsSparseTensorType() const {
-    return false;
+  bool IsSparseTensorType() const {
+    return type_ == GeneralType::kSparseTensor;
   }
 
-  virtual bool IsOptionalType() const {
-    return false;
+  bool IsOptionalType() const {
+    return type_ == GeneralType::kOptional;
+  }
+
+  bool IsNonTensorType() const {
+    return type_ == GeneralType::kNonTensor;
+  }
+
+  bool IsPrimitiveDataType() const {
+    return type_ == GeneralType::kPrimitive;
   }
 
   // Returns this if this is of tensor-type and null otherwise
-  virtual const TensorTypeBase* AsTensorType() const {
-    return nullptr;
-  }
+  const TensorTypeBase* AsTensorType() const;
 
-  virtual const SequenceTensorTypeBase* AsSequenceTensorType() const {
-    return nullptr;
-  }
+  const SequenceTensorTypeBase* AsSequenceTensorType() const;
 
 #if !defined(DISABLE_SPARSE_TENSORS)
   // Returns this if this is of sparse-tensor-type and null otherwise
-  virtual const SparseTensorTypeBase* AsSparseTensorType() const {
-    return nullptr;
-  }
+  const SparseTensorTypeBase* AsSparseTensorType() const;
 #endif
 
 #if !defined(DISABLE_OPTIONAL_TYPE)
-  virtual const OptionalTypeBase* AsOptionalType() const {
-    return nullptr;
-  }
+  const OptionalTypeBase* AsOptionalType() const;
 #endif
 
-  virtual const NonTensorTypeBase* AsNonTensorType() const {
-    return nullptr;
-  }
+  const NonTensorTypeBase* AsNonTensorType() const;
 
   // Returns this if this is one of the primitive data types (specialization of PrimitiveDataTypeBase)
   // and null otherwise
-  virtual const PrimitiveDataTypeBase* AsPrimitiveDataType() const {
-    return nullptr;
-  }
+  const PrimitiveDataTypeBase* AsPrimitiveDataType() const;
 
   // Return the type meta that we are using in the runtime.
   template <typename T>
@@ -217,8 +230,9 @@ namespace data_types_internal {
 ///
 
 template <typename T>
-constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType();
-
+constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType() {
+  return ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<float>() {
   return ONNX_NAMESPACE::TensorProto_DataType_FLOAT;
@@ -226,65 +240,56 @@ constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<float>() {
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<uint8_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_UINT8;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<int8_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_INT8;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<uint16_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_UINT16;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<int16_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_INT16;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<int32_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_INT32;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<int64_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_INT64;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<std::string>() {
   return ONNX_NAMESPACE::TensorProto_DataType_STRING;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<bool>() {
   return ONNX_NAMESPACE::TensorProto_DataType_BOOL;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<MLFloat16>() {
   return ONNX_NAMESPACE::TensorProto_DataType_FLOAT16;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<double>() {
   return ONNX_NAMESPACE::TensorProto_DataType_DOUBLE;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<uint32_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_UINT32;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<uint64_t>() {
   return ONNX_NAMESPACE::TensorProto_DataType_UINT64;
-};
+}
 template <>
 constexpr ONNX_NAMESPACE::TensorProto_DataType ToTensorDataType<BFloat16>() {
   return ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16;
 }
 
-// There is a specialization only for one
-// type argument.
-template <typename... Types>
-struct TensorElementTypeSetter {
-  static void SetTensorElementType(ONNX_NAMESPACE::TypeProto&);
-  static void SetMapKeyType(ONNX_NAMESPACE::TypeProto&);
-  static int32_t GetElementType();
-};
-
 /// Is a given type on the list of types?
 /// Accepts a list of types and the first argument is the type
 /// We are checking if it is listed among those that follow
@@ -351,49 +356,61 @@ struct GetMLDataType<T, false> {
   }
 };
 
+struct TensorTypeHelper {
+  static void Set(ONNX_NAMESPACE::TensorProto_DataType element_type,
+                  ONNX_NAMESPACE::TypeProto& proto) {
+    proto.mutable_tensor_type()->set_elem_type(element_type);
+  }
+};
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+struct SparseTensorTypeHelper {
+  static void Set(ONNX_NAMESPACE::TensorProto_DataType element_type,
+                  ONNX_NAMESPACE::TypeProto& proto) {
+    proto.mutable_sparse_tensor_type()->set_elem_type(element_type);
+  }
+};
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
+
 #if !defined(DISABLE_ML_OPS)
-/// MapTypes helper API
-/// K should always be one of the primitive data types
-/// V can be either a primitive type (in which case it is a tensor)
-/// or other preregistered types
+/// Map helpers
 
 void CopyMutableMapValue(const ONNX_NAMESPACE::TypeProto&,
                          ONNX_NAMESPACE::TypeProto&);
 
-template <typename K, typename V>
-struct SetMapTypes {
-  static void Set(ONNX_NAMESPACE::TypeProto& proto) {
-    TensorElementTypeSetter<K>::SetMapKeyType(proto);
-    MLDataType dt = GetMLDataType<V, IsTensorContainedType<V>::value>::Get();
-    const auto* value_proto = dt->GetTypeProto();
-#ifdef ORT_NO_RTTI
+struct MapTypeHelper {
+  // V can be either a primitive type (in which case it is a tensor)
+  // or other preregistered types
+  template <typename V>
+  static MLDataType GetValueType() {
+    return GetMLDataType<V, IsTensorContainedType<V>::value>::Get();
+  }
+
+  static void Set(ONNX_NAMESPACE::TensorProto_DataType key_type, const ONNX_NAMESPACE::TypeProto* value_proto,
+                  ONNX_NAMESPACE::TypeProto& proto) {
     ORT_ENFORCE(value_proto != nullptr, "expected a registered ONNX type");
-#else
-    ORT_ENFORCE(value_proto != nullptr, typeid(V).name(),
-                " expected to be a registered ONNX type");
-#endif
+    proto.mutable_map_type()->set_key_type(key_type);
     CopyMutableMapValue(*value_proto, proto);
   }
 };
 #endif
 
 /// Sequence helpers
-///
+
 // Element type is a primitive type so we set it to a tensor<elemT>
 void CopyMutableSeqElement(const ONNX_NAMESPACE::TypeProto&,
                            ONNX_NAMESPACE::TypeProto&);
 
-template <typename T>
-struct SetSequenceType {
-  static void Set(ONNX_NAMESPACE::TypeProto& proto) {
-    MLDataType dt = GetMLDataType<T, IsTensorContainedType<T>::value>::Get();
-    const auto* elem_proto = dt->GetTypeProto();
-#ifdef ORT_NO_RTTI
+// helper to create TypeProto with minimal binary size impact
+struct SequenceTypeHelper {
+  template <typename T>
+  static MLDataType GetElemType() {
+    return GetMLDataType<T, IsTensorContainedType<T>::value>::Get();
+  }
+
+  static void Set(const ONNX_NAMESPACE::TypeProto* elem_proto,
+                  ONNX_NAMESPACE::TypeProto& proto) {
     ORT_ENFORCE(elem_proto != nullptr, "expected a registered ONNX type");
-#else
-    ORT_ENFORCE(elem_proto != nullptr, typeid(T).name(),
-                " expected to be a registered ONNX type");
-#endif
     CopyMutableSeqElement(*elem_proto, proto);
   }
 };
@@ -403,33 +420,26 @@ struct SetSequenceType {
 void CopyMutableOptionalElement(const ONNX_NAMESPACE::TypeProto&,
                                 ONNX_NAMESPACE::TypeProto&);
 
-template <typename T, typename elemT>
-struct SetOptionalType {
-  static void Set(ONNX_NAMESPACE::TypeProto& proto) {
-    const onnx::TypeProto* elem_proto = nullptr;
-    if (std::is_same<T, Tensor>::value) {
-      MLDataType dt = DataTypeImpl::GetTensorType<elemT>();
-      elem_proto = dt->GetTypeProto();
-    } else if (std::is_same<T, TensorSeq>::value) {
-      MLDataType dt = DataTypeImpl::GetSequenceTensorType<elemT>();
-      elem_proto = dt->GetTypeProto();
+// helper to create TypeProto with minimal binary size impact
+struct OptionalTypeHelper {
+  template <typename T, typename elemT>
+  static MLDataType GetElemType() {
+    if constexpr (std::is_same<T, Tensor>::value) {
+      return DataTypeImpl::GetTensorType<elemT>();
     } else {
-      // Will not reach here
-      ORT_ENFORCE(false, "Unsupported type for optional type");
+      static_assert(std::is_same<T, TensorSeq>::value, "Unsupported element type for optional type");
+      return DataTypeImpl::GetSequenceTensorType<elemT>();
     }
+  }
 
-#ifdef ORT_NO_RTTI
-    ORT_ENFORCE(elem_proto != nullptr, "expected a registered ORT type");
-#else
-    ORT_ENFORCE(elem_proto != nullptr, typeid(T).name(),
-                " expected to be a registered ORT type");
-#endif
+  static void Set(const onnx::TypeProto* elem_proto, ONNX_NAMESPACE::TypeProto& proto) {
+    ORT_ENFORCE(elem_proto != nullptr, "expected a registered ONNX type");
     CopyMutableOptionalElement(*elem_proto, proto);
   }
 };
 
 /// OpaqueTypes helpers
-///
+
 void AssignOpaqueDomainName(const char* domain, const char* name,
                             ONNX_NAMESPACE::TypeProto& proto);
 
@@ -445,16 +455,6 @@ class TensorTypeBase : public DataTypeImpl {
   /// where TypeProto was created ad-hoc and not queried from MLDataType
   bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
 
-  bool IsTensorType() const override {
-    return true;
-  }
-
-  const TensorTypeBase* AsTensorType() const override {
-    return this;
-  }
-
-  size_t Size() const override;
-
   DeleteFunc GetDeleteFunc() const override;
 
   const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
@@ -464,8 +464,7 @@ class TensorTypeBase : public DataTypeImpl {
     ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
   }
 
-  TensorTypeBase(const TensorTypeBase&) = delete;
-  TensorTypeBase& operator=(const TensorTypeBase&) = delete;
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorTypeBase);
 
  protected:
   ONNX_NAMESPACE::TypeProto& MutableTypeProto();
@@ -507,7 +506,7 @@ class TensorType : public TensorTypeBase {
  private:
   TensorType() {
     using namespace data_types_internal;
-    TensorElementTypeSetter<elemT>::SetTensorElementType(this->MutableTypeProto());
+    TensorTypeHelper::Set(ToTensorDataType<elemT>(), MutableTypeProto());
   }
 };
 
@@ -527,10 +526,6 @@ class DisabledTypeBase : public DataTypeImpl {
     return false;
   }
 
-  size_t Size() const override {
-    ORT_THROW("Type is disabled in this build.");
-  }
-
   DeleteFunc GetDeleteFunc() const override {
     ORT_THROW("Type is disabled in this build.");
   }
@@ -544,7 +539,7 @@ class DisabledTypeBase : public DataTypeImpl {
   // This must work
   ONNX_NAMESPACE::TypeProto& MutableTypeProto();
 
-  DisabledTypeBase();
+  DisabledTypeBase(DataTypeImpl::GeneralType type, size_t size);
   ~DisabledTypeBase() override;
 
  private:
@@ -560,18 +555,8 @@ class SparseTensorTypeBase : public DataTypeImpl {
  public:
   static MLDataType Type();
 
-  bool IsSparseTensorType() const override {
-    return true;
-  }
-
-  const SparseTensorTypeBase* AsSparseTensorType() const override {
-    return this;
-  }
-
   bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
 
-  size_t Size() const override;
-
   DeleteFunc GetDeleteFunc() const override;
 
   const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
@@ -581,8 +566,7 @@ class SparseTensorTypeBase : public DataTypeImpl {
     ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
   }
 
-  SparseTensorTypeBase(const SparseTensorTypeBase&) = delete;
-  SparseTensorTypeBase& operator=(const SparseTensorTypeBase&) = delete;
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SparseTensorTypeBase);
 
  protected:
   ONNX_NAMESPACE::TypeProto& MutableTypeProto();
@@ -611,7 +595,7 @@ class SparseTensorType : public SparseTensorTypeBase {
  private:
   SparseTensorType() {
     using namespace data_types_internal;
-    TensorElementTypeSetter<elemT>::SetSparseTensorElementType(MutableTypeProto());
+    SparseTensorTypeHelper::Set(ToTensorDataType<elemT>(), MutableTypeProto());
   }
 };
 
@@ -624,21 +608,8 @@ class OptionalTypeBase : public DataTypeImpl {
  public:
   static MLDataType Type();
 
-  bool IsOptionalType() const override {
-    return true;
-  }
-
-  const OptionalTypeBase* AsOptionalType() const override {
-    return this;
-  }
-
   bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
 
-  size_t Size() const override {
-    // should never reach here.
-    ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
-  }
-
   DeleteFunc GetDeleteFunc() const override {
     // should never reach here.
     ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
@@ -687,32 +658,31 @@ class OptionalType :
                 "Requires one of the tensor fundamental types");
 
   MLDataType GetElementType() const override {
-    if (std::is_same<T, Tensor>::value) {
-      return DataTypeImpl::GetTensorType<elemT>();
-    } else if (std::is_same<T, TensorSeq>::value) {
-      return DataTypeImpl::GetSequenceTensorType<elemT>();
-    } else {
-      // Will not reach here
-      ORT_ENFORCE(false, "Unsupported optional type");
-    }
+    return data_types_internal::OptionalTypeHelper::GetElemType<T, elemT>();
   }
 #endif
 
  private:
-  OptionalType() {
-    data_types_internal::SetOptionalType<T, elemT>::Set(MutableTypeProto());
+#if !defined(DISABLE_OPTIONAL_TYPE)
+  OptionalType()
+#else
+  OptionalType() : DisabledTypeBase { DataTypeImpl::GeneralType::kOptional, 0 }
+#endif
+  {
+    using namespace data_types_internal;
+    OptionalTypeHelper::Set(OptionalTypeHelper::GetElemType<T, elemT>()->GetTypeProto(), MutableTypeProto());
   }
-};
+};  // class OptionalType
 
 /**
-  * \brief Provide a specialization for your C++ Non-tensor type
-  *        so your implementation FromDataTypeContainer/ToDataTypeContainer
-  *        functions correctly. Otherwise you get a default implementation
-  *        which may not be what you need/want.
-  *
-  * This class is used to create OrtValue, fetch data from OrtValue via
-  * C/C++ APIs
-  */
+ * \brief Provide a specialization for your C++ Non-tensor type
+ *        so your implementation FromDataTypeContainer/ToDataTypeContainer
+ *        functions correctly. Otherwise you get a default implementation
+ *        which may not be what you need/want.
+ *
+ * This class is used to create OrtValue, fetch data from OrtValue via
+ * C/C++ APIs
+ */
 template <class T>
 struct NonTensorTypeConverter {
   static void FromContainer(MLDataType /*dtype*/, const void* /*data*/, size_t /*data_size*/, OrtValue& /*output*/) {
@@ -728,18 +698,12 @@ struct NonTensorTypeConverter {
  */
 class NonTensorTypeBase : public DataTypeImpl {
  public:
-  size_t Size() const override = 0;
-
   DeleteFunc GetDeleteFunc() const override = 0;
 
   virtual CreateFunc GetCreateFunc() const = 0;
 
   const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
 
-  const NonTensorTypeBase* AsNonTensorType() const override {
-    return this;
-  }
-
   // \brief Override for Non-tensor types to initialize non-tensor CPP
   // data representation from data. The caller of the interface
   // should have a shared definition of the data which is used to initialize
@@ -766,7 +730,7 @@ class NonTensorTypeBase : public DataTypeImpl {
   NonTensorTypeBase& operator=(const NonTensorTypeBase&) = delete;
 
  protected:
-  NonTensorTypeBase();
+  NonTensorTypeBase(size_t size);
   ~NonTensorTypeBase() override;
 
   ONNX_NAMESPACE::TypeProto& MutableTypeProto();
@@ -791,10 +755,6 @@ class NonTensorType : public NonTensorTypeBase {
   }
 
  public:
-  size_t Size() const override {
-    return sizeof(T);
-  }
-
   DeleteFunc GetDeleteFunc() const override {
     return &Delete;
   }
@@ -804,7 +764,7 @@ class NonTensorType : public NonTensorTypeBase {
   }
 
  protected:
-  NonTensorType() = default;
+  NonTensorType() : NonTensorTypeBase(sizeof(T)) {}
 };
 
 #if !defined(DISABLE_ML_OPS)
@@ -833,7 +793,9 @@ class MapType : public NonTensorType<CPPType> {
  private:
   MapType() {
     using namespace data_types_internal;
-    SetMapTypes<typename CPPType::key_type, typename CPPType::mapped_type>::Set(this->MutableTypeProto());
+    MapTypeHelper::Set(ToTensorDataType<typename CPPType::key_type>(),
+                       MapTypeHelper::GetValueType<typename CPPType::mapped_type>()->GetTypeProto(),
+                       this->MutableTypeProto());
   }
 };
 #endif
@@ -858,13 +820,15 @@ class SequenceType : public NonTensorType<CPPType> {
 
  private:
   SequenceType() {
-    data_types_internal::SetSequenceType<typename CPPType::value_type>::Set(this->MutableTypeProto());
+    using namespace data_types_internal;
+    SequenceTypeHelper::Set(SequenceTypeHelper::GetElemType<typename CPPType::value_type>()->GetTypeProto(),
+                            this->MutableTypeProto());
   }
 };
 
 /**
  * \brief SequenceTensorTypeBase serves as a base type class for
- *        Tensor sequences. Akin TensorTypeBase.
+ *        Tensor sequences. Akin to TensorTypeBase.
  *        Runtime representation is always TensorSeq.
  */
 class SequenceTensorTypeBase : public DataTypeImpl {
@@ -873,21 +837,11 @@ class SequenceTensorTypeBase : public DataTypeImpl {
 
   bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
 
-  bool IsTensorSequenceType() const override {
-    return true;
-  }
-
-  const SequenceTensorTypeBase* AsSequenceTensorType() const override {
-    return this;
-  }
-
   virtual MLDataType GetElementType() const {
     // should never reach here.
     ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
   }
 
-  size_t Size() const override;
-
   DeleteFunc GetDeleteFunc() const override;
 
   const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
@@ -931,18 +885,20 @@ class SequenceTensorType : public SequenceTensorTypeBase {
 
  private:
   SequenceTensorType() {
-    data_types_internal::SetSequenceType<TensorElemType>::Set(this->MutableTypeProto());
+    using namespace data_types_internal;
+    SequenceTypeHelper::Set(SequenceTypeHelper::GetElemType<TensorElemType>()->GetTypeProto(),
+                            MutableTypeProto());
   }
 };
 
 /**
  * \brief OpaqueType
  *
- * \param T - cpp runtume that implements the Opaque type
+ * \tparam T - C++ runtime type that implements the Opaque type
  *
- * \param const char D[] - domain must be extern to be unique
+ * \tparam const char D[] - domain must be extern to be unique
  *
- * \param const char N[] - name must be extern to be unique
+ * \tparam const char N[] - name must be extern to be unique
  *
  * \details Only one CPP type can be associated with a particular
  *          OpaqueType registration
@@ -985,10 +941,6 @@ class PrimitiveDataTypeBase : public DataTypeImpl {
     return false;
   }
 
-  const PrimitiveDataTypeBase* AsPrimitiveDataType() const override final {
-    return this;
-  }
-
   const ONNX_NAMESPACE::TypeProto* GetTypeProto() const final {
     return nullptr;
   }
@@ -998,14 +950,11 @@ class PrimitiveDataTypeBase : public DataTypeImpl {
   }
 
  protected:
-  PrimitiveDataTypeBase() = default;
-
-  void SetDataType(int32_t data_type) {
-    data_type_ = data_type;
-  }
+  PrimitiveDataTypeBase(size_t size, int32_t data_type)
+      : DataTypeImpl{GeneralType::kPrimitive, size}, data_type_{data_type} {}
 
  private:
-  int32_t data_type_;
+  const int32_t data_type_;
 };
 
 /**
@@ -1015,7 +964,7 @@ class PrimitiveDataTypeBase : public DataTypeImpl {
  *
  * \param T - primitive data type
  *
-  */
+ */
 template <typename T>
 class PrimitiveDataType : public PrimitiveDataTypeBase {
  private:
@@ -1026,20 +975,45 @@ class PrimitiveDataType : public PrimitiveDataTypeBase {
  public:
   static MLDataType Type();
 
-  size_t Size() const override {
-    return sizeof(T);
-  }
-
   DeleteFunc GetDeleteFunc() const override {
     return &Delete;
   }
 
  private:
-  PrimitiveDataType() {
-    this->SetDataType(data_types_internal::TensorElementTypeSetter<T>::GetElementType());
+  PrimitiveDataType()
+      : PrimitiveDataTypeBase{sizeof(T),
+                              data_types_internal::ToTensorDataType<T>()} {
   }
 };
 
+inline const TensorTypeBase* DataTypeImpl::AsTensorType() const {
+  return IsTensorType() ? static_cast<const TensorTypeBase*>(this) : nullptr;
+}
+
+inline const SequenceTensorTypeBase* DataTypeImpl::AsSequenceTensorType() const {
+  return IsTensorSequenceType() ? static_cast<const SequenceTensorTypeBase*>(this) : nullptr;
+}
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+inline const SparseTensorTypeBase* DataTypeImpl::AsSparseTensorType() const {
+  return IsSparseTensorType() ? static_cast<const SparseTensorTypeBase*>(this) : nullptr;
+}
+#endif
+
+#if !defined(DISABLE_OPTIONAL_TYPE)
+inline const OptionalTypeBase* DataTypeImpl::AsOptionalType() const {
+  return IsOptionalType() ? static_cast<const OptionalTypeBase*>(this) : nullptr;
+}
+#endif
+
+inline const NonTensorTypeBase* DataTypeImpl::AsNonTensorType() const {
+  return IsNonTensorType() ? static_cast<const NonTensorTypeBase*>(this) : nullptr;
+}
+
+inline const PrimitiveDataTypeBase* DataTypeImpl::AsPrimitiveDataType() const {
+  return IsPrimitiveDataType() ? static_cast<const PrimitiveDataTypeBase*>(this) : nullptr;
+}
+
 // Explicit specialization of base class template function
 // is only possible within the enclosing namespace scope,
 // thus a simple way to pre-instantiate a given template
diff --git a/include/onnxruntime/core/framework/tensor_shape.h b/include/onnxruntime/core/framework/tensor_shape.h
index da42b8ce78..408fafe81f 100644
--- a/include/onnxruntime/core/framework/tensor_shape.h
+++ b/include/onnxruntime/core/framework/tensor_shape.h
@@ -26,12 +26,12 @@ class TensorShape {
   TensorShape(const TensorShape& other) : TensorShape(other.GetDims()) {}
   TensorShape& operator=(const TensorShape& other);
 
-  TensorShape(TensorShape&& other) { operator=(std::move(other)); }
-  TensorShape& operator=(TensorShape&& other);
+  TensorShape(TensorShape&& other) noexcept { operator=(std::move(other)); }
+  TensorShape& operator=(TensorShape&& other) noexcept;
 
   TensorShape(gsl::span<const int64_t> dims);
   TensorShape(const std::vector<int64_t>& dims) : TensorShape(gsl::make_span(dims)) {}
-  TensorShape(const std::initializer_list<int64_t>& dims);
+  TensorShape(const std::initializer_list<int64_t>& dims) : TensorShape(gsl::make_span(dims.begin(), dims.end())) {}
   TensorShape(const int64_t* dimension_sizes, size_t dimension_count) : TensorShape(gsl::span<const int64_t>(dimension_sizes, dimension_count)) {}
   TensorShape(const std::vector<int64_t>& dims, size_t start, size_t end) : TensorShape(gsl::span<const int64_t>(&dims[start], end - start)) {}
 
diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h
index 4467b4027f..d54f145d40 100644
--- a/include/onnxruntime/core/graph/graph.h
+++ b/include/onnxruntime/core/graph/graph.h
@@ -1522,6 +1522,28 @@ class Graph {
 };
 
 #if !defined(ORT_MINIMAL_BUILD)
+// Print NodeArg as
+//  name : type
+// For example,
+//  "110": tensor(float)
+std::ostream& operator<<(std::ostream& out, const NodeArg& node_arg);
+// Print Node as,
+//  (operator's name, operator's type, domain, version) : (input0, input1, ...) -> (output0, output1, ...)
+// For example,
+//  ("Add_14", Add, "", 7) : ("110": tensor(float),"109": tensor(float),) -> ("111": tensor(float),) 
+std::ostream& operator<<(std::ostream& out, const Node& node);
+// Print Graph as, for example,
+// Inputs:
+//    "Input": tensor(float)
+// Nodes:
+//    ("add0", Add, "", 7) : ("Input": tensor(float),"Bias": tensor(float),) -> ("add0_out": tensor(float),) 
+//    ("matmul", MatMul, "", 9) : ("add0_out": tensor(float),"matmul_weight": tensor(float),) -> ("matmul_out": tensor(float),) 
+//    ("add1", Add, "", 7) : ("matmul_out": tensor(float),"add_weight": tensor(float),) -> ("add1_out": tensor(float),) 
+//    ("reshape", Reshape, "", 5) : ("add1_out": tensor(float),"concat_out": tensor(int64),) -> ("Result": tensor(float),) 
+// Outputs:
+//    "Result": tensor(float)
+// The format of inputs and outputs is described in the documentation of NodeArg's operator<< above.
+// Node format is described in Node's operator<< above.
 std::ostream& operator<<(std::ostream& out, const Graph& graph);
 #endif
 
diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 8b9c2871ff..12d3c54268 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -30,7 +30,7 @@
 *
 * This value is used by some API functions to behave as this version of the header expects.
 */
-#define ORT_API_VERSION 9
+#define ORT_API_VERSION 10
 
 #ifdef __cplusplus
 extern "C" {
@@ -3162,6 +3162,26 @@ struct OrtApi {
   */
   ORT_API2_STATUS(SetGlobalCustomJoinThreadFn, _Inout_ OrtThreadingOptions* tp_options, _In_ OrtCustomJoinThreadFn ort_custom_join_thread_fn);
   /// @}
+
+  /** \brief Synchronize bound inputs. The call may be necessary for some providers, such as cuda,
+  *   in case the system that allocated bound memory operated on a different stream. However, the
+  *   operation is provider specific and could be a no-op.
+  *
+  * \param[inout] binding_ptr
+  * 
+  * \snippet{doc} snippets.dox OrtStatus Return Value
+  */
+  ORT_API2_STATUS(SynchronizeBoundInputs, _Inout_ OrtIoBinding* binding_ptr);
+
+  /** \brief Synchronize bound outputs. The call may be necessary for some providers, such as cuda,
+  *   in case the system that allocated bound memory operated on a different stream. However, the
+  *   operation is provider specific and could be a no-op.
+  *
+  * \param[inout] binding_ptr
+  * 
+  * \snippet{doc} snippets.dox OrtStatus Return Value
+  */
+  ORT_API2_STATUS(SynchronizeBoundOutputs, _Inout_ OrtIoBinding* binding_ptr);
 };
 
 /*
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
index 0c2b74b39b..048421099b 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -871,6 +871,8 @@ struct IoBinding : public Base<OrtIoBinding> {
   std::vector<Value> GetOutputValues(Allocator&) const;
   void ClearBoundInputs();
   void ClearBoundOutputs();
+  void SynchronizeInputs();
+  void SynchronizeOutputs();
 
  private:
   std::vector<std::string> GetOutputNamesHelper(OrtAllocator*) const;
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
index 982a516cca..1f31dffca8 100644
--- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
+++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h
@@ -284,6 +284,14 @@ inline void IoBinding::ClearBoundOutputs() {
   GetApi().ClearBoundOutputs(p_);
 }
 
+inline void IoBinding::SynchronizeInputs() {
+  ThrowOnError(GetApi().SynchronizeBoundInputs(p_));
+}
+
+inline void IoBinding::SynchronizeOutputs() {
+  ThrowOnError(GetApi().SynchronizeBoundOutputs(p_));
+}
+
 inline ArenaCfg::ArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, int max_dead_bytes_per_chunk) {
   ThrowOnError(GetApi().CreateArenaCfg(max_mem, arena_extend_strategy, initial_chunk_size_bytes, max_dead_bytes_per_chunk, &p_));
 }
diff --git a/js/README.md b/js/README.md
index a5bd7872b6..1055d92725 100644
--- a/js/README.md
+++ b/js/README.md
@@ -298,6 +298,22 @@ In folder `<ORT_ROOT>/js/web`, use command `npm run build:doc` to generate the l
 
 It should be able to consumed by both from projects that uses NPM packages (through a Node.js folder structure of `node_modules` folder that generated by `npm install onnxruntime-web`) and from a CDN service that serves a `ort.min.js` file and one or multiple `.wasm` file(s).
 
+#### Reduced WebAssembly artifacts
+
+By default, the WebAssembly artifacts from the onnxruntime-web package allow use of both standard ONNX models (.onnx) and ORT format models (.ort). There is an option to use a minimal build of ONNX Runtime to reduce the binary size, which only supports ORT format models. See also [ORT format model](https://onnxruntime.ai/docs/tutorials/mobile/overview.html) for more information.
+#### Reduced JavaScript bundle file size
+
+By default, the main bundle file `ort.min.js` of ONNX Runtime Web contains all features. However, its size is over 500kB and for some scenarios we want a smaller sized bundle file, if we don't use all the features. The following table lists all available bundles with their support status of features.
+
+|bundle file name|file size|file size (gzipped)|WebGL|WASM-core|WASM-proxy|WASM-threads|ES5 backward compatibility|
+|-|-|-|-|------|-----|---|-|
+|ort.es5.min.js|594.15KB|134.25KB|O|O|O|O|O|
+|ort.min.js|526.02KB|125.07KB|O|O|O|O|X|
+|ort.webgl.min.js|385.25KB|83.83KB|O|X|X|X|X|
+|ort.wasm.min.js|148.56KB|44KB|X|O|O|O|X|
+|ort.wasm-core.min.js|40.56KB|12.74KB|X|O|X|X|X|
+
+
 ## onnxruntime-react-native
 
 > language: typescript, java, objective-c
@@ -319,7 +335,7 @@ This project provides an ONNX Runtime React Native JavaScript library to run ONN
 
 ### Models with ORT format
 
-By default, ONNX Runtime React Native leverages ONNX Runtime Mobile package with ORT format. Follow the [instruciton](https://www.onnxruntime.ai/docs/how-to/mobile/model-conversion.html#converting-onnx-models-to-ort-format) to covert ONNX model to ORT format.
+By default, ONNX Runtime React Native leverages ONNX Runtime Mobile package with ORT format. Follow the [instructions](https://onnxruntime.ai/docs/tutorials/mobile/model-conversion.html) to convert ONNX model to ORT format.
 
 ### Build
 
diff --git a/js/common/tsconfig.json b/js/common/tsconfig.json
index 0683a8a1d7..9e1ac941c4 100644
--- a/js/common/tsconfig.json
+++ b/js/common/tsconfig.json
@@ -2,6 +2,8 @@
   "extends": "../tsconfig.json",
   "compilerOptions": {
     "outDir": "dist/lib",
+    "esModuleInterop": false,
+    "noUnusedParameters": true,
   },
   "include": ["lib"]
 }
diff --git a/js/common/webpack.config.js b/js/common/webpack.config.js
index d35f90b525..e5c4e417bc 100644
--- a/js/common/webpack.config.js
+++ b/js/common/webpack.config.js
@@ -7,7 +7,21 @@ const path = require('path');
 const webpack = require('webpack');
 const TerserPlugin = require("terser-webpack-plugin");
 
-function addCopyrightBannerPlugin(mode) {
+function terserEcmaVersionFromWebpackTarget(target) {
+  switch (target) {
+    case 'es5':
+      return 5;
+    case 'es6':
+    case 'es2015':
+      return 2015;
+    case 'es2017':
+      return 2017;
+    default:
+      throw new RangeError(`not supported ECMA version: ${target}`);
+  }
+}
+
+function addCopyrightBannerPlugin(mode, target) {
   const VERSION = require(path.join(__dirname, 'package.json')).version;
   const COPYRIGHT_BANNER = `/*!
  * ONNX Runtime Common v${VERSION}
@@ -19,6 +33,7 @@ function addCopyrightBannerPlugin(mode) {
     return new TerserPlugin({
       extractComments: false,
       terserOptions: {
+        ecma: terserEcmaVersionFromWebpackTarget(target),
         format: {
           preamble: COPYRIGHT_BANNER,
           comments: false,
@@ -36,7 +51,7 @@ function addCopyrightBannerPlugin(mode) {
 function buildConfig({
   suffix = '',
   format = 'umd',
-  target = 'es5',
+  target = 'es2017',
   mode = 'production',
   devtool = 'source-map'
 }) {
@@ -54,7 +69,7 @@ function buildConfig({
     resolve: { extensions: ['.ts', '.js'] },
     plugins: [
       new webpack.WatchIgnorePlugin({ paths: [/\.js$/, /\.d\.ts$/] }),
-      addCopyrightBannerPlugin(mode),
+      addCopyrightBannerPlugin(mode, target),
     ],
     module: {
       rules: [{
@@ -63,7 +78,7 @@ function buildConfig({
           {
             loader: 'ts-loader',
             options: {
-              compilerOptions: { target: target }
+              compilerOptions: { target }
             }
           }
         ]
@@ -76,10 +91,10 @@ function buildConfig({
 
 module.exports = (env, argv) => {
   return [
-    buildConfig({ suffix: '.es6', mode: 'development', devtool: 'inline-source-map', target: 'es6' }),
-    buildConfig({ mode: 'development', devtool: 'inline-source-map' }),
+    buildConfig({ suffix: '.es5.min', target: 'es5' }),
     buildConfig({ suffix: '.es6.min', target: 'es6' }),
     buildConfig({ suffix: '.min' }),
+    buildConfig({ mode: 'development', devtool: 'inline-source-map' }),
     buildConfig({ format: 'commonjs', suffix: '.node' }),
   ];
 };
diff --git a/js/node/package-lock.json b/js/node/package-lock.json
index c7b67aac3a..8024c64131 100644
--- a/js/node/package-lock.json
+++ b/js/node/package-lock.json
@@ -107,18 +107,6 @@
       "integrity": "sha512-sL/cEvJWAnClXw0wHk85/2L0G6Sj8UB0Ctc1TEMbKSsmpRosqhwj9gWgFRZSrBr2f9tiXISwNhCPmlfqUqyb9Q==",
       "dev": true
     },
-    "ajv": {
-      "version": "6.12.6",
-      "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz",
-      "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==",
-      "dev": true,
-      "requires": {
-        "fast-deep-equal": "^3.1.1",
-        "fast-json-stable-stringify": "^2.0.0",
-        "json-schema-traverse": "^0.4.1",
-        "uri-js": "^4.2.2"
-      }
-    },
     "ansi": {
       "version": "0.3.1",
       "resolved": "https://registry.npmjs.org/ansi/-/ansi-0.3.1.tgz",
@@ -156,50 +144,68 @@
         "picomatch": "^2.0.4"
       }
     },
+    "are-we-there-yet": {
+      "version": "1.0.6",
+      "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.0.6.tgz",
+      "integrity": "sha1-otKMkxAqpsyWJFomy5VN4G7FPww=",
+      "dev": true,
+      "requires": {
+        "delegates": "^1.0.0",
+        "readable-stream": "^2.0.0 || ^1.1.13"
+      },
+      "dependencies": {
+        "isarray": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
+          "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=",
+          "dev": true
+        },
+        "readable-stream": {
+          "version": "2.3.7",
+          "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz",
+          "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==",
+          "dev": true,
+          "requires": {
+            "core-util-is": "~1.0.0",
+            "inherits": "~2.0.3",
+            "isarray": "~1.0.0",
+            "process-nextick-args": "~2.0.0",
+            "safe-buffer": "~5.1.1",
+            "string_decoder": "~1.1.1",
+            "util-deprecate": "~1.0.1"
+          }
+        },
+        "string_decoder": {
+          "version": "1.1.1",
+          "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
+          "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
+          "dev": true,
+          "requires": {
+            "safe-buffer": "~5.1.0"
+          }
+        }
+      }
+    },
     "argparse": {
       "version": "2.0.1",
       "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
       "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
       "dev": true
     },
-    "asn1": {
-      "version": "0.2.4",
-      "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz",
-      "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==",
-      "dev": true,
-      "requires": {
-        "safer-buffer": "~2.1.0"
-      }
-    },
-    "assert-plus": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/assert-plus/-/assert-plus-1.0.0.tgz",
-      "integrity": "sha1-8S4PPF13sLHN2RRpQuTpbB5N1SU=",
-      "dev": true
-    },
-    "asynckit": {
-      "version": "0.4.0",
-      "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
-      "integrity": "sha1-x57Zf380y48robyXkLzDZkdLS3k=",
-      "dev": true
-    },
     "at-least-node": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/at-least-node/-/at-least-node-1.0.0.tgz",
       "integrity": "sha512-+q/t7Ekv1EDY2l6Gda6LLiX14rU9TV20Wa3ofeQmwPFZbOMo9DXrLbOjFaaclkXKWidIaopwAObQDqwWtGUjqg==",
       "dev": true
     },
-    "aws-sign2": {
-      "version": "0.7.0",
-      "resolved": "https://registry.npmjs.org/aws-sign2/-/aws-sign2-0.7.0.tgz",
-      "integrity": "sha1-tG6JCTSpWR8tL2+G1+ap8bP+dqg=",
-      "dev": true
-    },
-    "aws4": {
-      "version": "1.11.0",
-      "resolved": "https://registry.npmjs.org/aws4/-/aws4-1.11.0.tgz",
-      "integrity": "sha512-xh1Rl34h6Fi1DC2WWKfxUTVqRsNnr6LsKz2+hfwDxQJWmrx8+c7ylaqBMcHfl1U1r2dsifOvKX3LQuLNZ+XSvA==",
-      "dev": true
+    "axios": {
+      "version": "0.21.4",
+      "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.4.tgz",
+      "integrity": "sha512-ut5vewkiu8jjGBdqpM44XxjuCjq9LAKeHVmoVfHVzy8eHgxxq8SbAVQNovDA8mVi05kP0Ea/n/UzcSHcTJQfNg==",
+      "dev": true,
+      "requires": {
+        "follow-redirects": "^1.14.0"
+      }
     },
     "balanced-match": {
       "version": "1.0.2",
@@ -207,19 +213,10 @@
       "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
       "dev": true
     },
-    "bcrypt-pbkdf": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/bcrypt-pbkdf/-/bcrypt-pbkdf-1.0.2.tgz",
-      "integrity": "sha1-pDAdOJtqQ/m2f/PKEaP2Y342Dp4=",
-      "dev": true,
-      "requires": {
-        "tweetnacl": "^0.14.3"
-      }
-    },
     "big-integer": {
-      "version": "1.6.48",
-      "resolved": "https://registry.npmjs.org/big-integer/-/big-integer-1.6.48.tgz",
-      "integrity": "sha512-j51egjPa7/i+RdiRuJbPdJ2FIUYYPhvYLjzoYbcMMm62ooO6F94fETG4MTs46zPAF9Brs04OajboA/qTGuz78w==",
+      "version": "1.6.51",
+      "resolved": "https://registry.npmjs.org/big-integer/-/big-integer-1.6.51.tgz",
+      "integrity": "sha512-GPEid2Y9QU1Exl1rpO9B2IPJGHPSupF5GnVIP0blYvNOMer2bTvSWs1jGOUg04hTmu67nmLsQ9TBo1puaotBHg==",
       "dev": true
     },
     "binary": {
@@ -293,12 +290,6 @@
       "integrity": "sha1-fB0W1nmhu+WcoCys7PsBHiAfWh8=",
       "dev": true
     },
-    "caseless": {
-      "version": "0.12.0",
-      "resolved": "https://registry.npmjs.org/caseless/-/caseless-0.12.0.tgz",
-      "integrity": "sha1-G2gcIf+EAzyCZUMJBolCDRhxUdw=",
-      "dev": true
-    },
     "chainsaw": {
       "version": "0.1.0",
       "resolved": "https://registry.npmjs.org/chainsaw/-/chainsaw-0.1.0.tgz",
@@ -363,11 +354,12 @@
       }
     },
     "cmake-js": {
-      "version": "6.1.0",
-      "resolved": "https://registry.npmjs.org/cmake-js/-/cmake-js-6.1.0.tgz",
-      "integrity": "sha512-utmukLQftpgrCpGRCaHnkv4K27HZNNFqmBl4vnvccy0xp4c1erxjFU/Lq4wn5ngAhFZmpwBPQfoKWKThjSBiwg==",
+      "version": "6.2.1",
+      "resolved": "https://registry.npmjs.org/cmake-js/-/cmake-js-6.2.1.tgz",
+      "integrity": "sha512-wEpg0Z8SY6ihXTe+xosadh4PbASdWSM/locbLacWRYJCZfAjWLyOrd4RoVIeirLkfPxmG8GdNQA9tW/Rz5SfJA==",
       "dev": true,
       "requires": {
+        "axios": "^0.21.1",
         "debug": "^4",
         "fs-extra": "^5.0.0",
         "is-iojs": "^1.0.1",
@@ -375,7 +367,6 @@
         "memory-stream": "0",
         "npmlog": "^1.2.0",
         "rc": "^1.2.7",
-        "request": "^2.54.0",
         "semver": "^5.0.3",
         "splitargs": "0",
         "tar": "^4",
@@ -385,16 +376,6 @@
         "yargs": "^3.6.0"
       },
       "dependencies": {
-        "are-we-there-yet": {
-          "version": "1.0.6",
-          "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.0.6.tgz",
-          "integrity": "sha1-otKMkxAqpsyWJFomy5VN4G7FPww=",
-          "dev": true,
-          "requires": {
-            "delegates": "^1.0.0",
-            "readable-stream": "^2.0.0 || ^1.1.13"
-          }
-        },
         "fs-extra": {
           "version": "5.0.0",
           "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-5.0.0.tgz",
@@ -405,30 +386,6 @@
             "jsonfile": "^4.0.0",
             "universalify": "^0.1.0"
           }
-        },
-        "gauge": {
-          "version": "1.2.7",
-          "resolved": "https://registry.npmjs.org/gauge/-/gauge-1.2.7.tgz",
-          "integrity": "sha1-6c7FSD09TuDvRLYKfZnkk14TbZM=",
-          "dev": true,
-          "requires": {
-            "ansi": "^0.3.0",
-            "has-unicode": "^2.0.0",
-            "lodash.pad": "^4.1.0",
-            "lodash.padend": "^4.1.0",
-            "lodash.padstart": "^4.1.0"
-          }
-        },
-        "npmlog": {
-          "version": "1.2.1",
-          "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-1.2.1.tgz",
-          "integrity": "sha1-KOe+YZYJtT960d0wChDWTXFiaLY=",
-          "dev": true,
-          "requires": {
-            "ansi": "~0.3.0",
-            "are-we-there-yet": "~1.0.0",
-            "gauge": "~1.2.0"
-          }
         }
       }
     },
@@ -453,15 +410,6 @@
       "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
       "dev": true
     },
-    "combined-stream": {
-      "version": "1.0.8",
-      "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
-      "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
-      "dev": true,
-      "requires": {
-        "delayed-stream": "~1.0.0"
-      }
-    },
     "concat-map": {
       "version": "0.0.1",
       "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@@ -469,20 +417,11 @@
       "dev": true
     },
     "core-util-is": {
-      "version": "1.0.2",
-      "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
-      "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=",
+      "version": "1.0.3",
+      "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz",
+      "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==",
       "dev": true
     },
-    "dashdash": {
-      "version": "1.14.1",
-      "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
-      "integrity": "sha1-hTz6D3y+L+1d4gMmuN1YEDX24vA=",
-      "dev": true,
-      "requires": {
-        "assert-plus": "^1.0.0"
-      }
-    },
     "debug": {
       "version": "4.3.1",
       "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.1.tgz",
@@ -504,12 +443,6 @@
       "integrity": "sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==",
       "dev": true
     },
-    "delayed-stream": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
-      "integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk=",
-      "dev": true
-    },
     "delegates": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/delegates/-/delegates-1.0.0.tgz",
@@ -529,16 +462,38 @@
       "dev": true,
       "requires": {
         "readable-stream": "^2.0.2"
-      }
-    },
-    "ecc-jsbn": {
-      "version": "0.1.2",
-      "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz",
-      "integrity": "sha1-OoOpBOVDUyh4dMVkt1SThoSamMk=",
-      "dev": true,
-      "requires": {
-        "jsbn": "~0.1.0",
-        "safer-buffer": "^2.1.0"
+      },
+      "dependencies": {
+        "isarray": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
+          "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=",
+          "dev": true
+        },
+        "readable-stream": {
+          "version": "2.3.7",
+          "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz",
+          "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==",
+          "dev": true,
+          "requires": {
+            "core-util-is": "~1.0.0",
+            "inherits": "~2.0.3",
+            "isarray": "~1.0.0",
+            "process-nextick-args": "~2.0.0",
+            "safe-buffer": "~5.1.1",
+            "string_decoder": "~1.1.1",
+            "util-deprecate": "~1.0.1"
+          }
+        },
+        "string_decoder": {
+          "version": "1.1.1",
+          "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
+          "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
+          "dev": true,
+          "requires": {
+            "safe-buffer": "~5.1.0"
+          }
+        }
       }
     },
     "emoji-regex": {
@@ -568,30 +523,6 @@
       "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==",
       "dev": true
     },
-    "extend": {
-      "version": "3.0.2",
-      "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
-      "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==",
-      "dev": true
-    },
-    "extsprintf": {
-      "version": "1.3.0",
-      "resolved": "https://registry.npmjs.org/extsprintf/-/extsprintf-1.3.0.tgz",
-      "integrity": "sha1-lpGEQOMEGnpBT4xS48V06zw+HgU=",
-      "dev": true
-    },
-    "fast-deep-equal": {
-      "version": "3.1.3",
-      "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
-      "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
-      "dev": true
-    },
-    "fast-json-stable-stringify": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz",
-      "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==",
-      "dev": true
-    },
     "fast-safe-stringify": {
       "version": "2.0.7",
       "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.0.7.tgz",
@@ -623,23 +554,12 @@
       "integrity": "sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==",
       "dev": true
     },
-    "forever-agent": {
-      "version": "0.6.1",
-      "resolved": "https://registry.npmjs.org/forever-agent/-/forever-agent-0.6.1.tgz",
-      "integrity": "sha1-+8cfDEGt6zf5bFd60e1C2P2sypE=",
+    "follow-redirects": {
+      "version": "1.14.5",
+      "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.14.5.tgz",
+      "integrity": "sha512-wtphSXy7d4/OR+MvIFbCVBDzZ5520qV8XfPklSN5QtxuMUJZ+b0Wnst1e1lCDocfzuCkHqj8k0FpZqO+UIaKNA==",
       "dev": true
     },
-    "form-data": {
-      "version": "2.3.3",
-      "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
-      "integrity": "sha512-1lLKB2Mu3aGP1Q/2eCOx0fNbRMe7XdwktwOruhfqqd0rIJWwN4Dh+E3hrPSlDCXnSR7UtZ1N38rVXm+6+MEhJQ==",
-      "dev": true,
-      "requires": {
-        "asynckit": "^0.4.0",
-        "combined-stream": "^1.0.6",
-        "mime-types": "^2.1.12"
-      }
-    },
     "fs-extra": {
       "version": "9.1.0",
       "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.1.0.tgz",
@@ -704,21 +624,25 @@
         "rimraf": "2"
       }
     },
+    "gauge": {
+      "version": "1.2.7",
+      "resolved": "https://registry.npmjs.org/gauge/-/gauge-1.2.7.tgz",
+      "integrity": "sha1-6c7FSD09TuDvRLYKfZnkk14TbZM=",
+      "dev": true,
+      "requires": {
+        "ansi": "^0.3.0",
+        "has-unicode": "^2.0.0",
+        "lodash.pad": "^4.1.0",
+        "lodash.padend": "^4.1.0",
+        "lodash.padstart": "^4.1.0"
+      }
+    },
     "get-caller-file": {
       "version": "2.0.5",
       "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
       "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
       "dev": true
     },
-    "getpass": {
-      "version": "0.1.7",
-      "resolved": "https://registry.npmjs.org/getpass/-/getpass-0.1.7.tgz",
-      "integrity": "sha1-Xv+OPmhNVprkyysSgmBOi6YhSfo=",
-      "dev": true,
-      "requires": {
-        "assert-plus": "^1.0.0"
-      }
-    },
     "glob": {
       "version": "7.1.6",
       "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.6.tgz",
@@ -754,22 +678,6 @@
       "integrity": "sha512-qBr4OuELkhPenW6goKVXiv47US3clb3/IbuWF9KNKEijAy9oeHxU9IgzjvJhHkUzhaj7rOUD7+YGWqUjLp5oSA==",
       "dev": true
     },
-    "har-schema": {
-      "version": "2.0.0",
-      "resolved": "https://registry.npmjs.org/har-schema/-/har-schema-2.0.0.tgz",
-      "integrity": "sha1-qUwiJOvKwEeCoNkDVSHyRzW37JI=",
-      "dev": true
-    },
-    "har-validator": {
-      "version": "5.1.5",
-      "resolved": "https://registry.npmjs.org/har-validator/-/har-validator-5.1.5.tgz",
-      "integrity": "sha512-nmT2T0lljbxdQZfspsno9hgrG3Uir6Ks5afism62poxqBM6sDnMEuPmzTq8XN0OEwqKLLdh1jQI3qyE66Nzb3w==",
-      "dev": true,
-      "requires": {
-        "ajv": "^6.12.3",
-        "har-schema": "^2.0.0"
-      }
-    },
     "has-flag": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
@@ -788,17 +696,6 @@
       "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==",
       "dev": true
     },
-    "http-signature": {
-      "version": "1.2.0",
-      "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.2.0.tgz",
-      "integrity": "sha1-muzZJRFHcvPZW2WmCruPfBj7rOE=",
-      "dev": true,
-      "requires": {
-        "assert-plus": "^1.0.0",
-        "jsprim": "^1.2.2",
-        "sshpk": "^1.7.0"
-      }
-    },
     "inflight": {
       "version": "1.0.6",
       "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
@@ -884,16 +781,10 @@
       "integrity": "sha512-YWnfyRwxL/+SsrWYfOpUtz5b3YD+nyfkHvjbcanzk8zgyO4ASD67uVMRt8k5bM4lLMDnXfriRhOpemw+NfT1eA==",
       "dev": true
     },
-    "is-typedarray": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/is-typedarray/-/is-typedarray-1.0.0.tgz",
-      "integrity": "sha1-5HnICFjfDBsR3dppQPlgEfzaSpo=",
-      "dev": true
-    },
     "isarray": {
-      "version": "1.0.0",
-      "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
-      "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=",
+      "version": "0.0.1",
+      "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz",
+      "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=",
       "dev": true
     },
     "isexe": {
@@ -902,12 +793,6 @@
       "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=",
       "dev": true
     },
-    "isstream": {
-      "version": "0.1.2",
-      "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
-      "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo=",
-      "dev": true
-    },
     "js-yaml": {
       "version": "4.0.0",
       "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.0.0.tgz",
@@ -917,36 +802,12 @@
         "argparse": "^2.0.1"
       }
     },
-    "jsbn": {
-      "version": "0.1.1",
-      "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
-      "integrity": "sha1-peZUwuWi3rXyAdls77yoDA7y9RM=",
-      "dev": true
-    },
     "json-parse-better-errors": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/json-parse-better-errors/-/json-parse-better-errors-1.0.2.tgz",
       "integrity": "sha512-mrqyZKfX5EhL7hvqcV6WG1yYjnjeuYDzDhhcAAUrq8Po85NBQBJP+ZDUT75qZQ98IkUoBqdkExkukOU7Ts2wrw==",
       "dev": true
     },
-    "json-schema": {
-      "version": "0.2.3",
-      "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz",
-      "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=",
-      "dev": true
-    },
-    "json-schema-traverse": {
-      "version": "0.4.1",
-      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz",
-      "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==",
-      "dev": true
-    },
-    "json-stringify-safe": {
-      "version": "5.0.1",
-      "resolved": "https://registry.npmjs.org/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz",
-      "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus=",
-      "dev": true
-    },
     "jsonc": {
       "version": "2.0.0",
       "resolved": "https://registry.npmjs.org/jsonc/-/jsonc-2.0.0.tgz",
@@ -978,18 +839,6 @@
         "graceful-fs": "^4.1.6"
       }
     },
-    "jsprim": {
-      "version": "1.4.1",
-      "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz",
-      "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=",
-      "dev": true,
-      "requires": {
-        "assert-plus": "1.0.0",
-        "extsprintf": "1.3.0",
-        "json-schema": "0.2.3",
-        "verror": "1.10.0"
-      }
-    },
     "lcid": {
       "version": "1.0.0",
       "resolved": "https://registry.npmjs.org/lcid/-/lcid-1.0.0.tgz",
@@ -1060,47 +909,6 @@
       "dev": true,
       "requires": {
         "readable-stream": "~1.0.26-2"
-      },
-      "dependencies": {
-        "isarray": {
-          "version": "0.0.1",
-          "resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz",
-          "integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8=",
-          "dev": true
-        },
-        "readable-stream": {
-          "version": "1.0.34",
-          "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
-          "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
-          "dev": true,
-          "requires": {
-            "core-util-is": "~1.0.0",
-            "inherits": "~2.0.1",
-            "isarray": "0.0.1",
-            "string_decoder": "~0.10.x"
-          }
-        },
-        "string_decoder": {
-          "version": "0.10.31",
-          "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
-          "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
-          "dev": true
-        }
-      }
-    },
-    "mime-db": {
-      "version": "1.47.0",
-      "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.47.0.tgz",
-      "integrity": "sha512-QBmA/G2y+IfeS4oktet3qRZ+P5kPhCKRXxXnQEudYqUaEioAU1/Lq2us3D/t1Jfo4hE9REQPrbB7K5sOczJVIw==",
-      "dev": true
-    },
-    "mime-types": {
-      "version": "2.1.30",
-      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.30.tgz",
-      "integrity": "sha512-crmjA4bLtR8m9qLpHvgxSChT+XoSlZi8J4n/aIdn3z92e/U47Z0V/yl+Wh9W046GgFVAmoNR/fmdbZYcSSIUeg==",
-      "dev": true,
-      "requires": {
-        "mime-db": "1.47.0"
       }
     },
     "minimatch": {
@@ -1300,18 +1108,23 @@
       "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==",
       "dev": true
     },
+    "npmlog": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-1.2.1.tgz",
+      "integrity": "sha1-KOe+YZYJtT960d0wChDWTXFiaLY=",
+      "dev": true,
+      "requires": {
+        "ansi": "~0.3.0",
+        "are-we-there-yet": "~1.0.0",
+        "gauge": "~1.2.0"
+      }
+    },
     "number-is-nan": {
       "version": "1.0.1",
       "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz",
       "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=",
       "dev": true
     },
-    "oauth-sign": {
-      "version": "0.9.0",
-      "resolved": "https://registry.npmjs.org/oauth-sign/-/oauth-sign-0.9.0.tgz",
-      "integrity": "sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==",
-      "dev": true
-    },
     "once": {
       "version": "1.4.0",
       "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
@@ -2504,12 +2317,6 @@
       "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=",
       "dev": true
     },
-    "performance-now": {
-      "version": "2.1.0",
-      "resolved": "https://registry.npmjs.org/performance-now/-/performance-now-2.1.0.tgz",
-      "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=",
-      "dev": true
-    },
     "picomatch": {
       "version": "2.2.3",
       "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.2.3.tgz",
@@ -2551,24 +2358,6 @@
         }
       }
     },
-    "psl": {
-      "version": "1.8.0",
-      "resolved": "https://registry.npmjs.org/psl/-/psl-1.8.0.tgz",
-      "integrity": "sha512-RIdOzyoavK+hA18OGGWDqUTsCLhtA7IcZ/6NCs4fFJaHBDab+pDDmDIByWFRQJq2Cd7r1OoQxBGKOaztq+hjIQ==",
-      "dev": true
-    },
-    "punycode": {
-      "version": "2.1.1",
-      "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.1.1.tgz",
-      "integrity": "sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==",
-      "dev": true
-    },
-    "qs": {
-      "version": "6.5.2",
-      "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
-      "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA==",
-      "dev": true
-    },
     "randombytes": {
       "version": "2.1.0",
       "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz",
@@ -2591,18 +2380,15 @@
       }
     },
     "readable-stream": {
-      "version": "2.3.7",
-      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.7.tgz",
-      "integrity": "sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==",
+      "version": "1.0.34",
+      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
+      "integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
       "dev": true,
       "requires": {
         "core-util-is": "~1.0.0",
-        "inherits": "~2.0.3",
-        "isarray": "~1.0.0",
-        "process-nextick-args": "~2.0.0",
-        "safe-buffer": "~5.1.1",
-        "string_decoder": "~1.1.1",
-        "util-deprecate": "~1.0.1"
+        "inherits": "~2.0.1",
+        "isarray": "0.0.1",
+        "string_decoder": "~0.10.x"
       }
     },
     "readdirp": {
@@ -2614,34 +2400,6 @@
         "picomatch": "^2.2.1"
       }
     },
-    "request": {
-      "version": "2.88.2",
-      "resolved": "https://registry.npmjs.org/request/-/request-2.88.2.tgz",
-      "integrity": "sha512-MsvtOrfG9ZcrOwAW+Qi+F6HbD0CWXEh9ou77uOb7FM2WPhwT7smM833PzanhJLsgXjN89Ir6V2PczXNnMpwKhw==",
-      "dev": true,
-      "requires": {
-        "aws-sign2": "~0.7.0",
-        "aws4": "^1.8.0",
-        "caseless": "~0.12.0",
-        "combined-stream": "~1.0.6",
-        "extend": "~3.0.2",
-        "forever-agent": "~0.6.1",
-        "form-data": "~2.3.2",
-        "har-validator": "~5.1.3",
-        "http-signature": "~1.2.0",
-        "is-typedarray": "~1.0.0",
-        "isstream": "~0.1.2",
-        "json-stringify-safe": "~5.0.1",
-        "mime-types": "~2.1.19",
-        "oauth-sign": "~0.9.0",
-        "performance-now": "^2.1.0",
-        "qs": "~6.5.2",
-        "safe-buffer": "^5.1.2",
-        "tough-cookie": "~2.5.0",
-        "tunnel-agent": "^0.6.0",
-        "uuid": "^3.3.2"
-      }
-    },
     "require-directory": {
       "version": "2.1.1",
       "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
@@ -2663,12 +2421,6 @@
       "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==",
       "dev": true
     },
-    "safer-buffer": {
-      "version": "2.1.2",
-      "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz",
-      "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==",
-      "dev": true
-    },
     "semver": {
       "version": "5.7.1",
       "resolved": "https://registry.npmjs.org/semver/-/semver-5.7.1.tgz",
@@ -2696,23 +2448,6 @@
       "integrity": "sha1-/p965lc3GzOxDLgNoUPPgknPazs=",
       "dev": true
     },
-    "sshpk": {
-      "version": "1.16.1",
-      "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz",
-      "integrity": "sha512-HXXqVUq7+pcKeLqqZj6mHFUMvXtOJt1uoUx09pFW6011inTMxqI8BA8PM95myrIyyKwdnzjdFjLiE6KBPVtJIg==",
-      "dev": true,
-      "requires": {
-        "asn1": "~0.2.3",
-        "assert-plus": "^1.0.0",
-        "bcrypt-pbkdf": "^1.0.0",
-        "dashdash": "^1.12.0",
-        "ecc-jsbn": "~0.1.1",
-        "getpass": "^0.1.1",
-        "jsbn": "~0.1.0",
-        "safer-buffer": "^2.0.2",
-        "tweetnacl": "~0.14.0"
-      }
-    },
     "string-width": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz",
@@ -2725,13 +2460,10 @@
       }
     },
     "string_decoder": {
-      "version": "1.1.1",
-      "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz",
-      "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==",
-      "dev": true,
-      "requires": {
-        "safe-buffer": "~5.1.0"
-      }
+      "version": "0.10.31",
+      "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
+      "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
+      "dev": true
     },
     "strip-ansi": {
       "version": "3.0.1",
@@ -2795,37 +2527,12 @@
         "is-number": "^7.0.0"
       }
     },
-    "tough-cookie": {
-      "version": "2.5.0",
-      "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-2.5.0.tgz",
-      "integrity": "sha512-nlLsUzgm1kfLXSXfRZMc1KLAugd4hqJHDTvc2hDIwS3mZAfMEuMbc03SujMF+GEcpaX/qboeycw6iO8JwVv2+g==",
-      "dev": true,
-      "requires": {
-        "psl": "^1.1.28",
-        "punycode": "^2.1.1"
-      }
-    },
     "traverse": {
       "version": "0.3.9",
       "resolved": "https://registry.npmjs.org/traverse/-/traverse-0.3.9.tgz",
       "integrity": "sha1-cXuPIgzAu3tE5AUUwisui7xw2Lk=",
       "dev": true
     },
-    "tunnel-agent": {
-      "version": "0.6.0",
-      "resolved": "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz",
-      "integrity": "sha1-J6XeoGs2sEoKmWZ3SykIaPD8QP0=",
-      "dev": true,
-      "requires": {
-        "safe-buffer": "^5.0.1"
-      }
-    },
-    "tweetnacl": {
-      "version": "0.14.5",
-      "resolved": "https://registry.npmjs.org/tweetnacl/-/tweetnacl-0.14.5.tgz",
-      "integrity": "sha1-WuaBd/GS1EViadEIr6k/+HQ/T2Q=",
-      "dev": true
-    },
     "typescript": {
       "version": "4.2.4",
       "resolved": "https://registry.npmjs.org/typescript/-/typescript-4.2.4.tgz",
@@ -2855,6 +2562,12 @@
         "setimmediate": "~1.0.4"
       },
       "dependencies": {
+        "isarray": {
+          "version": "1.0.0",
+          "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz",
+          "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=",
+          "dev": true
+        },
         "process-nextick-args": {
           "version": "1.0.7",
           "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-1.0.7.tgz",
@@ -2875,24 +2588,9 @@
             "string_decoder": "~0.10.x",
             "util-deprecate": "~1.0.1"
           }
-        },
-        "string_decoder": {
-          "version": "0.10.31",
-          "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
-          "integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ=",
-          "dev": true
         }
       }
     },
-    "uri-js": {
-      "version": "4.4.1",
-      "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz",
-      "integrity": "sha512-7rKUyy33Q1yc98pQ1DAmLtwX109F7TIfWlW1Ydo8Wl1ii1SeHieeh0HHfPeL2fMXK6z0s8ecKs9frCuLJvndBg==",
-      "dev": true,
-      "requires": {
-        "punycode": "^2.1.0"
-      }
-    },
     "url-join": {
       "version": "0.0.1",
       "resolved": "https://registry.npmjs.org/url-join/-/url-join-0.0.1.tgz",
@@ -2905,23 +2603,6 @@
       "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=",
       "dev": true
     },
-    "uuid": {
-      "version": "3.4.0",
-      "resolved": "https://registry.npmjs.org/uuid/-/uuid-3.4.0.tgz",
-      "integrity": "sha512-HjSDRw6gZE5JMggctHBcjVak08+KEVhSIiDzFnT9S9aegmp85S/bReBVTb4QTFaRNptJ9kuYaNhnbNEOkbKb/A==",
-      "dev": true
-    },
-    "verror": {
-      "version": "1.10.0",
-      "resolved": "https://registry.npmjs.org/verror/-/verror-1.10.0.tgz",
-      "integrity": "sha1-OhBcoXBTr1XW4nDB+CiGguGNpAA=",
-      "dev": true,
-      "requires": {
-        "assert-plus": "^1.0.0",
-        "core-util-is": "1.0.2",
-        "extsprintf": "^1.2.0"
-      }
-    },
     "which": {
       "version": "1.3.1",
       "resolved": "https://registry.npmjs.org/which/-/which-1.3.1.tgz",
diff --git a/js/node/package.json b/js/node/package.json
index 7f85243815..e05abd6abe 100644
--- a/js/node/package.json
+++ b/js/node/package.json
@@ -32,7 +32,7 @@
     "@types/minimist": "1.2.1",
     "@types/mocha": "^8.2.2",
     "@types/node": "^14.14.37",
-    "cmake-js": "^6.1.0",
+    "cmake-js": "^6.2.1",
     "fs-extra": "^9.1.0",
     "jsonc": "^2.0.0",
     "minimist": "^1.2.5",
diff --git a/js/package-lock.json b/js/package-lock.json
index 8274737fa0..9eb4399f55 100644
--- a/js/package-lock.json
+++ b/js/package-lock.json
@@ -552,9 +552,9 @@
       "dev": true
     },
     "ansi-regex": {
-      "version": "5.0.0",
-      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz",
-      "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg==",
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
       "dev": true
     },
     "ansi-styles": {
@@ -2053,9 +2053,9 @@
       "dev": true
     },
     "path-parse": {
-      "version": "1.0.6",
-      "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.6.tgz",
-      "integrity": "sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==",
+      "version": "1.0.7",
+      "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz",
+      "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==",
       "dev": true
     },
     "path-type": {
diff --git a/js/react_native/package.json b/js/react_native/package.json
index c8c2ede152..659359d401 100644
--- a/js/react_native/package.json
+++ b/js/react_native/package.json
@@ -58,7 +58,7 @@
     "pod-install": "^0.1.0",
     "prettier": "^2.0.5",
     "react": "16.13.1",
-    "react-native": "0.63.4",
+    "react-native": "0.64.1",
     "react-native-builder-bob": "^0.17.1",
     "release-it": "^14.2.2",
     "typescript": "^4.1.3"
diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock
index e03ea3b02d..9f1a565593 100644
--- a/js/react_native/yarn.lock
+++ b/js/react_native/yarn.lock
@@ -9,11 +9,23 @@
   dependencies:
     "@babel/highlight" "^7.12.13"
 
+"@babel/code-frame@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.16.0.tgz#0dfc80309beec8411e65e706461c408b0bb9b431"
+  integrity sha512-IF4EOMEV+bfYwOmNxGzSnjR2EmQod7f1UXOpZM3l4i4o4QNwzjtJAu/HxdjHq0aYBvdqMuQEY1eg0nqW9ZPORA==
+  dependencies:
+    "@babel/highlight" "^7.16.0"
+
 "@babel/compat-data@^7.13.11", "@babel/compat-data@^7.13.15", "@babel/compat-data@^7.13.8", "@babel/compat-data@^7.14.0":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/compat-data/-/compat-data-7.14.0.tgz#a901128bce2ad02565df95e6ecbf195cf9465919"
   integrity sha512-vu9V3uMM/1o5Hl5OekMUowo3FqXLJSw+s+66nt0fSWVWTtmosdzn45JHOB3cPtZoe6CTBDzvSw0RdOY85Q37+Q==
 
+"@babel/compat-data@^7.16.0":
+  version "7.16.4"
+  resolved "https://registry.yarnpkg.com/@babel/compat-data/-/compat-data-7.16.4.tgz#081d6bbc336ec5c2435c6346b2ae1fb98b5ac68e"
+  integrity sha512-1o/jo7D+kC9ZjHX5v+EHrdjl3PhxMrLSOTGsOdHJ+KL8HCaEK6ehrVL2RS6oHDZp+L7xLirLrPmQtEng769J/Q==
+
 "@babel/core@^7.0.0", "@babel/core@^7.1.0", "@babel/core@^7.12.10", "@babel/core@^7.7.5":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/core/-/core-7.14.0.tgz#47299ff3ec8d111b493f1a9d04bf88c04e728d88"
@@ -35,6 +47,27 @@
     semver "^6.3.0"
     source-map "^0.5.0"
 
+"@babel/core@^7.1.6":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/core/-/core-7.16.0.tgz#c4ff44046f5fe310525cc9eb4ef5147f0c5374d4"
+  integrity sha512-mYZEvshBRHGsIAiyH5PzCFTCfbWfoYbO/jcSdXQSUQu1/pW0xDZAUP7KEc32heqWTAfAHhV9j1vH8Sav7l+JNQ==
+  dependencies:
+    "@babel/code-frame" "^7.16.0"
+    "@babel/generator" "^7.16.0"
+    "@babel/helper-compilation-targets" "^7.16.0"
+    "@babel/helper-module-transforms" "^7.16.0"
+    "@babel/helpers" "^7.16.0"
+    "@babel/parser" "^7.16.0"
+    "@babel/template" "^7.16.0"
+    "@babel/traverse" "^7.16.0"
+    "@babel/types" "^7.16.0"
+    convert-source-map "^1.7.0"
+    debug "^4.1.0"
+    gensync "^1.0.0-beta.2"
+    json5 "^2.1.2"
+    semver "^6.3.0"
+    source-map "^0.5.0"
+
 "@babel/generator@^7.14.0", "@babel/generator@^7.5.0":
   version "7.14.1"
   resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.14.1.tgz#1f99331babd65700183628da186f36f63d615c93"
@@ -44,6 +77,15 @@
     jsesc "^2.5.1"
     source-map "^0.5.0"
 
+"@babel/generator@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.16.0.tgz#d40f3d1d5075e62d3500bccb67f3daa8a95265b2"
+  integrity sha512-RR8hUCfRQn9j9RPKEVXo9LiwoxLPYn6hNZlvUOR8tSnaxlD0p0+la00ZP9/SnRt6HchKr+X0fO2r8vrETiJGew==
+  dependencies:
+    "@babel/types" "^7.16.0"
+    jsesc "^2.5.1"
+    source-map "^0.5.0"
+
 "@babel/helper-annotate-as-pure@^7.10.4", "@babel/helper-annotate-as-pure@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.12.13.tgz#0f58e86dfc4bb3b1fcd7db806570e177d439b6ab"
@@ -51,6 +93,13 @@
   dependencies:
     "@babel/types" "^7.12.13"
 
+"@babel/helper-annotate-as-pure@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.16.0.tgz#9a1f0ebcda53d9a2d00108c4ceace6a5d5f1f08d"
+  integrity sha512-ItmYF9vR4zA8cByDocY05o0LGUkp1zhbTQOH1NFyl5xXEqlTJQCEJjieriw+aFpxo16swMxUnUiKS7a/r4vtHg==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-builder-binary-assignment-operator-visitor@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/helper-builder-binary-assignment-operator-visitor/-/helper-builder-binary-assignment-operator-visitor-7.12.13.tgz#6bc20361c88b0a74d05137a65cac8d3cbf6f61fc"
@@ -69,6 +118,16 @@
     browserslist "^4.14.5"
     semver "^6.3.0"
 
+"@babel/helper-compilation-targets@^7.16.0":
+  version "7.16.3"
+  resolved "https://registry.yarnpkg.com/@babel/helper-compilation-targets/-/helper-compilation-targets-7.16.3.tgz#5b480cd13f68363df6ec4dc8ac8e2da11363cbf0"
+  integrity sha512-vKsoSQAyBmxS35JUOOt+07cLc6Nk/2ljLIHwmq2/NM6hdioUaqEXq/S+nXvbvXbZkNDlWOymPanJGOc4CBjSJA==
+  dependencies:
+    "@babel/compat-data" "^7.16.0"
+    "@babel/helper-validator-option" "^7.14.5"
+    browserslist "^4.17.5"
+    semver "^6.3.0"
+
 "@babel/helper-create-class-features-plugin@^7.13.0", "@babel/helper-create-class-features-plugin@^7.14.0":
   version "7.14.1"
   resolved "https://registry.yarnpkg.com/@babel/helper-create-class-features-plugin/-/helper-create-class-features-plugin-7.14.1.tgz#1fe11b376f3c41650ad9fedc665b0068722ea76c"
@@ -81,6 +140,18 @@
     "@babel/helper-replace-supers" "^7.13.12"
     "@babel/helper-split-export-declaration" "^7.12.13"
 
+"@babel/helper-create-class-features-plugin@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-create-class-features-plugin/-/helper-create-class-features-plugin-7.16.0.tgz#090d4d166b342a03a9fec37ef4fd5aeb9c7c6a4b"
+  integrity sha512-XLwWvqEaq19zFlF5PTgOod4bUA+XbkR4WLQBct1bkzmxJGB0ZEJaoKF4c8cgH9oBtCDuYJ8BP5NB9uFiEgO5QA==
+  dependencies:
+    "@babel/helper-annotate-as-pure" "^7.16.0"
+    "@babel/helper-function-name" "^7.16.0"
+    "@babel/helper-member-expression-to-functions" "^7.16.0"
+    "@babel/helper-optimise-call-expression" "^7.16.0"
+    "@babel/helper-replace-supers" "^7.16.0"
+    "@babel/helper-split-export-declaration" "^7.16.0"
+
 "@babel/helper-create-regexp-features-plugin@^7.12.13":
   version "7.12.17"
   resolved "https://registry.yarnpkg.com/@babel/helper-create-regexp-features-plugin/-/helper-create-regexp-features-plugin-7.12.17.tgz#a2ac87e9e319269ac655b8d4415e94d38d663cb7"
@@ -119,6 +190,15 @@
     "@babel/template" "^7.12.13"
     "@babel/types" "^7.12.13"
 
+"@babel/helper-function-name@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.16.0.tgz#b7dd0797d00bbfee4f07e9c4ea5b0e30c8bb1481"
+  integrity sha512-BZh4mEk1xi2h4HFjWUXRQX5AEx4rvaZxHgax9gcjdLWdkjsY7MKt5p0otjsg5noXw+pB+clMCjw+aEVYADMjog==
+  dependencies:
+    "@babel/helper-get-function-arity" "^7.16.0"
+    "@babel/template" "^7.16.0"
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-get-function-arity@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/helper-get-function-arity/-/helper-get-function-arity-7.12.13.tgz#bc63451d403a3b3082b97e1d8b3fe5bd4091e583"
@@ -126,6 +206,13 @@
   dependencies:
     "@babel/types" "^7.12.13"
 
+"@babel/helper-get-function-arity@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-get-function-arity/-/helper-get-function-arity-7.16.0.tgz#0088c7486b29a9cb5d948b1a1de46db66e089cfa"
+  integrity sha512-ASCquNcywC1NkYh/z7Cgp3w31YW8aojjYIlNg4VeJiHkqyP4AzIvr4qx7pYDb4/s8YcsZWqqOSxgkvjUz1kpDQ==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-hoist-variables@^7.13.0":
   version "7.13.16"
   resolved "https://registry.yarnpkg.com/@babel/helper-hoist-variables/-/helper-hoist-variables-7.13.16.tgz#1b1651249e94b51f8f0d33439843e33e39775b30"
@@ -134,6 +221,13 @@
     "@babel/traverse" "^7.13.15"
     "@babel/types" "^7.13.16"
 
+"@babel/helper-hoist-variables@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-hoist-variables/-/helper-hoist-variables-7.16.0.tgz#4c9023c2f1def7e28ff46fc1dbcd36a39beaa81a"
+  integrity sha512-1AZlpazjUR0EQZQv3sgRNfM9mEVWPK3M6vlalczA+EECcPz3XPh6VplbErL5UoMpChhSck5wAJHthlj1bYpcmg==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-member-expression-to-functions@^7.13.12":
   version "7.13.12"
   resolved "https://registry.yarnpkg.com/@babel/helper-member-expression-to-functions/-/helper-member-expression-to-functions-7.13.12.tgz#dfe368f26d426a07299d8d6513821768216e6d72"
@@ -141,6 +235,13 @@
   dependencies:
     "@babel/types" "^7.13.12"
 
+"@babel/helper-member-expression-to-functions@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-member-expression-to-functions/-/helper-member-expression-to-functions-7.16.0.tgz#29287040efd197c77636ef75188e81da8bccd5a4"
+  integrity sha512-bsjlBFPuWT6IWhl28EdrQ+gTvSvj5tqVP5Xeftp07SEuz5pLnsXZuDkDD3Rfcxy0IsHmbZ+7B2/9SHzxO0T+sQ==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-module-imports@^7.12.13", "@babel/helper-module-imports@^7.13.12":
   version "7.13.12"
   resolved "https://registry.yarnpkg.com/@babel/helper-module-imports/-/helper-module-imports-7.13.12.tgz#c6a369a6f3621cb25da014078684da9196b61977"
@@ -148,6 +249,13 @@
   dependencies:
     "@babel/types" "^7.13.12"
 
+"@babel/helper-module-imports@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-module-imports/-/helper-module-imports-7.16.0.tgz#90538e60b672ecf1b448f5f4f5433d37e79a3ec3"
+  integrity sha512-kkH7sWzKPq0xt3H1n+ghb4xEMP8k0U7XV3kkB+ZGy69kDk2ySFW1qPi06sjKzFY3t1j6XbJSqr4mF9L7CYVyhg==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-module-transforms@^7.13.0", "@babel/helper-module-transforms@^7.14.0":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/helper-module-transforms/-/helper-module-transforms-7.14.0.tgz#8fcf78be220156f22633ee204ea81f73f826a8ad"
@@ -162,6 +270,20 @@
     "@babel/traverse" "^7.14.0"
     "@babel/types" "^7.14.0"
 
+"@babel/helper-module-transforms@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-module-transforms/-/helper-module-transforms-7.16.0.tgz#1c82a8dd4cb34577502ebd2909699b194c3e9bb5"
+  integrity sha512-My4cr9ATcaBbmaEa8M0dZNA74cfI6gitvUAskgDtAFmAqyFKDSHQo5YstxPbN+lzHl2D9l/YOEFqb2mtUh4gfA==
+  dependencies:
+    "@babel/helper-module-imports" "^7.16.0"
+    "@babel/helper-replace-supers" "^7.16.0"
+    "@babel/helper-simple-access" "^7.16.0"
+    "@babel/helper-split-export-declaration" "^7.16.0"
+    "@babel/helper-validator-identifier" "^7.15.7"
+    "@babel/template" "^7.16.0"
+    "@babel/traverse" "^7.16.0"
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-optimise-call-expression@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/helper-optimise-call-expression/-/helper-optimise-call-expression-7.12.13.tgz#5c02d171b4c8615b1e7163f888c1c81c30a2aaea"
@@ -169,11 +291,23 @@
   dependencies:
     "@babel/types" "^7.12.13"
 
+"@babel/helper-optimise-call-expression@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-optimise-call-expression/-/helper-optimise-call-expression-7.16.0.tgz#cecdb145d70c54096b1564f8e9f10cd7d193b338"
+  integrity sha512-SuI467Gi2V8fkofm2JPnZzB/SUuXoJA5zXe/xzyPP2M04686RzFKFHPK6HDVN6JvWBIEW8tt9hPR7fXdn2Lgpw==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-plugin-utils@^7.0.0", "@babel/helper-plugin-utils@^7.10.4", "@babel/helper-plugin-utils@^7.12.13", "@babel/helper-plugin-utils@^7.13.0", "@babel/helper-plugin-utils@^7.8.0", "@babel/helper-plugin-utils@^7.8.3":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/helper-plugin-utils/-/helper-plugin-utils-7.13.0.tgz#806526ce125aed03373bc416a828321e3a6a33af"
   integrity sha512-ZPafIPSwzUlAoWT8DKs1W2VyF2gOWthGd5NGFMsBcMMol+ZhK+EQY/e6V96poa6PA/Bh+C9plWN0hXO1uB8AfQ==
 
+"@babel/helper-plugin-utils@^7.14.5":
+  version "7.14.5"
+  resolved "https://registry.yarnpkg.com/@babel/helper-plugin-utils/-/helper-plugin-utils-7.14.5.tgz#5ac822ce97eec46741ab70a517971e443a70c5a9"
+  integrity sha512-/37qQCE3K0vvZKwoK4XU/irIJQdIfCJuhU5eKnNxpFDsOkgFaUAwbv+RYw6eYgsC0E4hS7r5KqGULUogqui0fQ==
+
 "@babel/helper-remap-async-to-generator@^7.13.0":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/helper-remap-async-to-generator/-/helper-remap-async-to-generator-7.13.0.tgz#376a760d9f7b4b2077a9dd05aa9c3927cadb2209"
@@ -193,6 +327,16 @@
     "@babel/traverse" "^7.13.0"
     "@babel/types" "^7.13.12"
 
+"@babel/helper-replace-supers@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-replace-supers/-/helper-replace-supers-7.16.0.tgz#73055e8d3cf9bcba8ddb55cad93fedc860f68f17"
+  integrity sha512-TQxuQfSCdoha7cpRNJvfaYxxxzmbxXw/+6cS7V02eeDYyhxderSoMVALvwupA54/pZcOTtVeJ0xccp1nGWladA==
+  dependencies:
+    "@babel/helper-member-expression-to-functions" "^7.16.0"
+    "@babel/helper-optimise-call-expression" "^7.16.0"
+    "@babel/traverse" "^7.16.0"
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-simple-access@^7.13.12":
   version "7.13.12"
   resolved "https://registry.yarnpkg.com/@babel/helper-simple-access/-/helper-simple-access-7.13.12.tgz#dd6c538afb61819d205a012c31792a39c7a5eaf6"
@@ -200,6 +344,13 @@
   dependencies:
     "@babel/types" "^7.13.12"
 
+"@babel/helper-simple-access@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-simple-access/-/helper-simple-access-7.16.0.tgz#21d6a27620e383e37534cf6c10bba019a6f90517"
+  integrity sha512-o1rjBT/gppAqKsYfUdfHq5Rk03lMQrkPHG1OWzHWpLgVXRH4HnMM9Et9CVdIqwkCQlobnGHEJMsgWP/jE1zUiw==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-skip-transparent-expression-wrappers@^7.12.1":
   version "7.12.1"
   resolved "https://registry.yarnpkg.com/@babel/helper-skip-transparent-expression-wrappers/-/helper-skip-transparent-expression-wrappers-7.12.1.tgz#462dc63a7e435ade8468385c63d2b84cce4b3cbf"
@@ -207,6 +358,13 @@
   dependencies:
     "@babel/types" "^7.12.1"
 
+"@babel/helper-skip-transparent-expression-wrappers@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-skip-transparent-expression-wrappers/-/helper-skip-transparent-expression-wrappers-7.16.0.tgz#0ee3388070147c3ae051e487eca3ebb0e2e8bb09"
+  integrity sha512-+il1gTy0oHwUsBQZyJvukbB4vPMdcYBrFHa0Uc4AizLxbq6BOYC51Rv4tWocX9BLBDLZ4kc6qUFpQ6HRgL+3zw==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-split-export-declaration@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.12.13.tgz#e9430be00baf3e88b0e13e6f9d4eaf2136372b05"
@@ -214,16 +372,33 @@
   dependencies:
     "@babel/types" "^7.12.13"
 
+"@babel/helper-split-export-declaration@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.16.0.tgz#29672f43663e936df370aaeb22beddb3baec7438"
+  integrity sha512-0YMMRpuDFNGTHNRiiqJX19GjNXA4H0E8jZ2ibccfSxaCogbm3am5WN/2nQNj0YnQwGWM1J06GOcQ2qnh3+0paw==
+  dependencies:
+    "@babel/types" "^7.16.0"
+
 "@babel/helper-validator-identifier@^7.12.11", "@babel/helper-validator-identifier@^7.14.0":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.14.0.tgz#d26cad8a47c65286b15df1547319a5d0bcf27288"
   integrity sha512-V3ts7zMSu5lfiwWDVWzRDGIN+lnCEUdaXgtVHJgLb1rGaA6jMrtB9EmE7L18foXJIE8Un/A/h6NJfGQp/e1J4A==
 
+"@babel/helper-validator-identifier@^7.15.7":
+  version "7.15.7"
+  resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.15.7.tgz#220df993bfe904a4a6b02ab4f3385a5ebf6e2389"
+  integrity sha512-K4JvCtQqad9OY2+yTU8w+E82ywk/fe+ELNlt1G8z3bVGlZfn/hOcQQsUhGhW/N+tb3fxK800wLtKOE/aM0m72w==
+
 "@babel/helper-validator-option@^7.12.17":
   version "7.12.17"
   resolved "https://registry.yarnpkg.com/@babel/helper-validator-option/-/helper-validator-option-7.12.17.tgz#d1fbf012e1a79b7eebbfdc6d270baaf8d9eb9831"
   integrity sha512-TopkMDmLzq8ngChwRlyjR6raKD6gMSae4JdYDB8bByKreQgG0RBTuKe9LRxW3wFtUnjxOPRKBDwEH6Mg5KeDfw==
 
+"@babel/helper-validator-option@^7.14.5":
+  version "7.14.5"
+  resolved "https://registry.yarnpkg.com/@babel/helper-validator-option/-/helper-validator-option-7.14.5.tgz#6e72a1fff18d5dfcb878e1e62f1a021c4b72d5a3"
+  integrity sha512-OX8D5eeX4XwcroVW45NMvoYaIuFI+GQpA2a8Gi+X/U/cDUIRsV37qQfF905F0htTRCREQIB4KqPeaveRJUl3Ow==
+
 "@babel/helper-wrap-function@^7.13.0":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/helper-wrap-function/-/helper-wrap-function-7.13.0.tgz#bdb5c66fda8526ec235ab894ad53a1235c79fcc4"
@@ -243,6 +418,15 @@
     "@babel/traverse" "^7.14.0"
     "@babel/types" "^7.14.0"
 
+"@babel/helpers@^7.16.0":
+  version "7.16.3"
+  resolved "https://registry.yarnpkg.com/@babel/helpers/-/helpers-7.16.3.tgz#27fc64f40b996e7074dc73128c3e5c3e7f55c43c"
+  integrity sha512-Xn8IhDlBPhvYTvgewPKawhADichOsbkZuzN7qz2BusOM0brChsyXMDJvldWaYMMUNiCQdQzNEioXTp3sC8Nt8w==
+  dependencies:
+    "@babel/template" "^7.16.0"
+    "@babel/traverse" "^7.16.3"
+    "@babel/types" "^7.16.0"
+
 "@babel/highlight@^7.12.13":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.14.0.tgz#3197e375711ef6bf834e67d0daec88e4f46113cf"
@@ -252,11 +436,25 @@
     chalk "^2.0.0"
     js-tokens "^4.0.0"
 
+"@babel/highlight@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.16.0.tgz#6ceb32b2ca4b8f5f361fb7fd821e3fddf4a1725a"
+  integrity sha512-t8MH41kUQylBtu2+4IQA3atqevA2lRgqA2wyVB/YiWmsDSuylZZuXOUy9ric30hfzauEFfdsuk/eXTRrGrfd0g==
+  dependencies:
+    "@babel/helper-validator-identifier" "^7.15.7"
+    chalk "^2.0.0"
+    js-tokens "^4.0.0"
+
 "@babel/parser@^7.0.0", "@babel/parser@^7.1.0", "@babel/parser@^7.12.13", "@babel/parser@^7.14.0":
   version "7.14.1"
   resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.14.1.tgz#1bd644b5db3f5797c4479d89ec1817fe02b84c47"
   integrity sha512-muUGEKu8E/ftMTPlNp+mc6zL3E9zKWmF5sDHZ5MSsoTP9Wyz64AhEf9kD08xYJ7w6Hdcu8H550ircnPyWSIF0Q==
 
+"@babel/parser@^7.1.6", "@babel/parser@^7.16.0", "@babel/parser@^7.16.3":
+  version "7.16.4"
+  resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.16.4.tgz#d5f92f57cf2c74ffe9b37981c0e72fee7311372e"
+  integrity sha512-6V0qdPUaiVHH3RtZeLIsc+6pDhbYzHR8ogA8w+f+Wc77DuXto19g2QUwveINoS34Uw+W8/hQDGJCx+i4n7xcng==
+
 "@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining@^7.13.12":
   version "7.13.12"
   resolved "https://registry.yarnpkg.com/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining/-/plugin-bugfix-v8-spread-parameters-in-optional-chaining-7.13.12.tgz#a3484d84d0b549f3fc916b99ee4783f26fabad2a"
@@ -266,13 +464,6 @@
     "@babel/helper-skip-transparent-expression-wrappers" "^7.12.1"
     "@babel/plugin-proposal-optional-chaining" "^7.13.12"
 
-"@babel/plugin-external-helpers@^7.0.0":
-  version "7.12.13"
-  resolved "https://registry.yarnpkg.com/@babel/plugin-external-helpers/-/plugin-external-helpers-7.12.13.tgz#65ef9f4576297250dc601d2aa334769790d9966d"
-  integrity sha512-ClvAsk4RqpE6iacYUjdU9PtvIwC9yAefZENsPfGeG5FckX3jFZLDlWPuyv5gi9/9C2VgwX6H8q1ukBifC0ha+Q==
-  dependencies:
-    "@babel/helper-plugin-utils" "^7.12.13"
-
 "@babel/plugin-proposal-async-generator-functions@^7.13.15":
   version "7.13.15"
   resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-async-generator-functions/-/plugin-proposal-async-generator-functions-7.13.15.tgz#80e549df273a3b3050431b148c892491df1bcc5b"
@@ -290,6 +481,14 @@
     "@babel/helper-create-class-features-plugin" "^7.13.0"
     "@babel/helper-plugin-utils" "^7.13.0"
 
+"@babel/plugin-proposal-class-properties@^7.1.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-class-properties/-/plugin-proposal-class-properties-7.16.0.tgz#c029618267ddebc7280fa286e0f8ca2a278a2d1a"
+  integrity sha512-mCF3HcuZSY9Fcx56Lbn+CGdT44ioBMMvjNVldpKtj8tpniETdLjnxdHI1+sDWXIM1nNt+EanJOZ3IG9lzVjs7A==
+  dependencies:
+    "@babel/helper-create-class-features-plugin" "^7.16.0"
+    "@babel/helper-plugin-utils" "^7.14.5"
+
 "@babel/plugin-proposal-class-static-block@^7.13.11":
   version "7.13.11"
   resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-class-static-block/-/plugin-proposal-class-static-block-7.13.11.tgz#6fcbba4a962702c17e5371a0c7b39afde186d703"
@@ -346,6 +545,14 @@
     "@babel/helper-plugin-utils" "^7.13.0"
     "@babel/plugin-syntax-nullish-coalescing-operator" "^7.8.3"
 
+"@babel/plugin-proposal-nullish-coalescing-operator@^7.1.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-nullish-coalescing-operator/-/plugin-proposal-nullish-coalescing-operator-7.16.0.tgz#44e1cce08fe2427482cf446a91bb451528ed0596"
+  integrity sha512-3bnHA8CAFm7cG93v8loghDYyQ8r97Qydf63BeYiGgYbjKKB/XP53W15wfRC7dvKfoiJ34f6Rbyyx2btExc8XsQ==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/plugin-syntax-nullish-coalescing-operator" "^7.8.3"
+
 "@babel/plugin-proposal-numeric-separator@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-numeric-separator/-/plugin-proposal-numeric-separator-7.12.13.tgz#bd9da3188e787b5120b4f9d465a8261ce67ed1db"
@@ -382,6 +589,15 @@
     "@babel/helper-skip-transparent-expression-wrappers" "^7.12.1"
     "@babel/plugin-syntax-optional-chaining" "^7.8.3"
 
+"@babel/plugin-proposal-optional-chaining@^7.1.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-optional-chaining/-/plugin-proposal-optional-chaining-7.16.0.tgz#56dbc3970825683608e9efb55ea82c2a2d6c8dc0"
+  integrity sha512-Y4rFpkZODfHrVo70Uaj6cC1JJOt3Pp0MdWSwIKtb8z1/lsjl9AmnB7ErRFV+QNGIfcY1Eruc2UMx5KaRnXjMyg==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/helper-skip-transparent-expression-wrappers" "^7.16.0"
+    "@babel/plugin-syntax-optional-chaining" "^7.8.3"
+
 "@babel/plugin-proposal-private-methods@^7.13.0":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/plugin-proposal-private-methods/-/plugin-proposal-private-methods-7.13.0.tgz#04bd4c6d40f6e6bbfa2f57e2d8094bad900ef787"
@@ -464,6 +680,13 @@
   dependencies:
     "@babel/helper-plugin-utils" "^7.12.13"
 
+"@babel/plugin-syntax-flow@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-syntax-flow/-/plugin-syntax-flow-7.16.0.tgz#07427021d093ed77019408221beaf0272bbcfaec"
+  integrity sha512-dH91yCo0RyqfzWgoM5Ji9ir8fQ+uFbt9KHM3d2x4jZOuHS6wNA+CRmRUP/BWCsHG2bjc7A2Way6AvH1eQk0wig==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+
 "@babel/plugin-syntax-import-meta@^7.8.3":
   version "7.10.4"
   resolved "https://registry.yarnpkg.com/@babel/plugin-syntax-import-meta/-/plugin-syntax-import-meta-7.10.4.tgz#ee601348c370fa334d2207be158777496521fd51"
@@ -548,6 +771,13 @@
   dependencies:
     "@babel/helper-plugin-utils" "^7.12.13"
 
+"@babel/plugin-syntax-typescript@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-syntax-typescript/-/plugin-syntax-typescript-7.16.0.tgz#2feeb13d9334cc582ea9111d3506f773174179bb"
+  integrity sha512-Xv6mEXqVdaqCBfJFyeab0fH2DnUoMsDmhamxsSi4j8nLd4Vtw213WMJr55xxqipC/YVWyPY3K0blJncPYji+dQ==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+
 "@babel/plugin-transform-arrow-functions@^7.0.0", "@babel/plugin-transform-arrow-functions@^7.13.0":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/plugin-transform-arrow-functions/-/plugin-transform-arrow-functions-7.13.0.tgz#10a59bebad52d637a027afa692e8d5ceff5e3dae"
@@ -636,6 +866,14 @@
     "@babel/helper-plugin-utils" "^7.13.0"
     "@babel/plugin-syntax-flow" "^7.12.13"
 
+"@babel/plugin-transform-flow-strip-types@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-transform-flow-strip-types/-/plugin-transform-flow-strip-types-7.16.0.tgz#edd968dc2041c1b69e451a262e948d6654a79dc2"
+  integrity sha512-vs/F5roOaO/+WxKfp9PkvLsAyj0G+Q0zbFimHm9X2KDgabN2XmNFoAafmeGEYspUlIF9+MvVmyek9UyHiqeG/w==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/plugin-syntax-flow" "^7.16.0"
+
 "@babel/plugin-transform-for-of@^7.0.0", "@babel/plugin-transform-for-of@^7.13.0":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/plugin-transform-for-of/-/plugin-transform-for-of-7.13.0.tgz#c799f881a8091ac26b54867a845c3e97d2696062"
@@ -684,6 +922,16 @@
     "@babel/helper-simple-access" "^7.13.12"
     babel-plugin-dynamic-import-node "^2.3.3"
 
+"@babel/plugin-transform-modules-commonjs@^7.1.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-transform-modules-commonjs/-/plugin-transform-modules-commonjs-7.16.0.tgz#add58e638c8ddc4875bd9a9ecb5c594613f6c922"
+  integrity sha512-Dzi+NWqyEotgzk/sb7kgQPJQf7AJkQBWsVp1N6JWc1lBVo0vkElUnGdr1PzUBmfsCCN5OOFya3RtpeHk15oLKQ==
+  dependencies:
+    "@babel/helper-module-transforms" "^7.16.0"
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/helper-simple-access" "^7.16.0"
+    babel-plugin-dynamic-import-node "^2.3.3"
+
 "@babel/plugin-transform-modules-systemjs@^7.13.8":
   version "7.13.8"
   resolved "https://registry.yarnpkg.com/@babel/plugin-transform-modules-systemjs/-/plugin-transform-modules-systemjs-7.13.8.tgz#6d066ee2bff3c7b3d60bf28dec169ad993831ae3"
@@ -864,6 +1112,15 @@
     "@babel/helper-plugin-utils" "^7.13.0"
     "@babel/plugin-syntax-typescript" "^7.12.13"
 
+"@babel/plugin-transform-typescript@^7.16.0":
+  version "7.16.1"
+  resolved "https://registry.yarnpkg.com/@babel/plugin-transform-typescript/-/plugin-transform-typescript-7.16.1.tgz#cc0670b2822b0338355bc1b3d2246a42b8166409"
+  integrity sha512-NO4XoryBng06jjw/qWEU2LhcLJr1tWkhpMam/H4eas/CDKMX/b2/Ylb6EI256Y7+FVPCawwSM1rrJNOpDiz+Lg==
+  dependencies:
+    "@babel/helper-create-class-features-plugin" "^7.16.0"
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/plugin-syntax-typescript" "^7.16.0"
+
 "@babel/plugin-transform-unicode-escapes@^7.12.13":
   version "7.12.13"
   resolved "https://registry.yarnpkg.com/@babel/plugin-transform-unicode-escapes/-/plugin-transform-unicode-escapes-7.12.13.tgz#840ced3b816d3b5127dd1d12dcedc5dead1a5e74"
@@ -958,6 +1215,15 @@
     core-js-compat "^3.9.0"
     semver "^6.3.0"
 
+"@babel/preset-flow@^7.0.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/preset-flow/-/preset-flow-7.16.0.tgz#9f1f6e72714d79460d48058cb5658fc87da7150b"
+  integrity sha512-e5NE1EoPMpoHFkyFkMSj2h9tu7OolARcUHki8mnBv4NiFK9so+UrhbvT9mV99tMJOUEx8BOj67T6dXvGcTeYeQ==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/helper-validator-option" "^7.14.5"
+    "@babel/plugin-transform-flow-strip-types" "^7.16.0"
+
 "@babel/preset-flow@^7.12.1":
   version "7.13.13"
   resolved "https://registry.yarnpkg.com/@babel/preset-flow/-/preset-flow-7.13.13.tgz#a61a1c149b3f77589d795287744393444d5cdd9e"
@@ -990,6 +1256,15 @@
     "@babel/plugin-transform-react-jsx-development" "^7.12.17"
     "@babel/plugin-transform-react-pure-annotations" "^7.12.1"
 
+"@babel/preset-typescript@^7.1.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/preset-typescript/-/preset-typescript-7.16.0.tgz#b0b4f105b855fb3d631ec036cdc9d1ffd1fa5eac"
+  integrity sha512-txegdrZYgO9DlPbv+9QOVpMnKbOtezsLHWsnsRF4AjbSIsVaujrq1qg8HK0mxQpWv0jnejt0yEoW1uWpvbrDTg==
+  dependencies:
+    "@babel/helper-plugin-utils" "^7.14.5"
+    "@babel/helper-validator-option" "^7.14.5"
+    "@babel/plugin-transform-typescript" "^7.16.0"
+
 "@babel/preset-typescript@^7.12.7":
   version "7.13.0"
   resolved "https://registry.yarnpkg.com/@babel/preset-typescript/-/preset-typescript-7.13.0.tgz#ab107e5f050609d806fbb039bec553b33462c60a"
@@ -1010,7 +1285,7 @@
     pirates "^4.0.0"
     source-map-support "^0.5.16"
 
-"@babel/runtime@^7.0.0", "@babel/runtime@^7.8.4":
+"@babel/runtime@^7.8.4":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.14.0.tgz#46794bc20b612c5f75e62dd071e24dfd95f1cbe6"
   integrity sha512-JELkvo/DlpNdJ7dlyw/eY7E0suy5i5GQH+Vlxaq1nsNJ+H7f4Vtv3jMeCEgRhZZQFXTjldYfQgv2qmM6M1v5wA==
@@ -1026,6 +1301,15 @@
     "@babel/parser" "^7.12.13"
     "@babel/types" "^7.12.13"
 
+"@babel/template@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.16.0.tgz#d16a35ebf4cd74e202083356fab21dd89363ddd6"
+  integrity sha512-MnZdpFD/ZdYhXwiunMqqgyZyucaYsbL0IrjoGjaVhGilz+x8YB++kRfygSOIj1yOtWKPlx7NBp+9I1RQSgsd5A==
+  dependencies:
+    "@babel/code-frame" "^7.16.0"
+    "@babel/parser" "^7.16.0"
+    "@babel/types" "^7.16.0"
+
 "@babel/traverse@^7.0.0", "@babel/traverse@^7.1.0", "@babel/traverse@^7.13.0", "@babel/traverse@^7.13.15", "@babel/traverse@^7.14.0":
   version "7.14.0"
   resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.14.0.tgz#cea0dc8ae7e2b1dec65f512f39f3483e8cc95aef"
@@ -1040,6 +1324,21 @@
     debug "^4.1.0"
     globals "^11.1.0"
 
+"@babel/traverse@^7.16.0", "@babel/traverse@^7.16.3":
+  version "7.16.3"
+  resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.16.3.tgz#f63e8a938cc1b780f66d9ed3c54f532ca2d14787"
+  integrity sha512-eolumr1vVMjqevCpwVO99yN/LoGL0EyHiLO5I043aYQvwOJ9eR5UsZSClHVCzfhBduMAsSzgA/6AyqPjNayJag==
+  dependencies:
+    "@babel/code-frame" "^7.16.0"
+    "@babel/generator" "^7.16.0"
+    "@babel/helper-function-name" "^7.16.0"
+    "@babel/helper-hoist-variables" "^7.16.0"
+    "@babel/helper-split-export-declaration" "^7.16.0"
+    "@babel/parser" "^7.16.3"
+    "@babel/types" "^7.16.0"
+    debug "^4.1.0"
+    globals "^11.1.0"
+
 "@babel/types@^7.0.0", "@babel/types@^7.12.1", "@babel/types@^7.12.13", "@babel/types@^7.13.0", "@babel/types@^7.13.12", "@babel/types@^7.13.16", "@babel/types@^7.14.0", "@babel/types@^7.14.1", "@babel/types@^7.3.0", "@babel/types@^7.3.3", "@babel/types@^7.4.4":
   version "7.14.1"
   resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.14.1.tgz#095bd12f1c08ab63eff6e8f7745fa7c9cc15a9db"
@@ -1048,6 +1347,14 @@
     "@babel/helper-validator-identifier" "^7.14.0"
     to-fast-properties "^2.0.0"
 
+"@babel/types@^7.16.0":
+  version "7.16.0"
+  resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.16.0.tgz#db3b313804f96aadd0b776c4823e127ad67289ba"
+  integrity sha512-PJgg/k3SdLsGb3hhisFvtLOw5ts113klrpLuIPtCJIU+BB24fqq6lf8RWqKJEjzqXR9AEH1rIb5XTqwBHB+kQg==
+  dependencies:
+    "@babel/helper-validator-identifier" "^7.15.7"
+    to-fast-properties "^2.0.0"
+
 "@bcoe/v8-coverage@^0.2.3":
   version "0.2.3"
   resolved "https://registry.yarnpkg.com/@bcoe/v8-coverage/-/v8-coverage-0.2.3.tgz#75a2e8b51cb758a7553d6804a5932d7aace75c39"
@@ -1061,37 +1368,17 @@
     exec-sh "^0.3.2"
     minimist "^1.2.0"
 
-"@hapi/address@2.x.x":
-  version "2.1.4"
-  resolved "https://registry.yarnpkg.com/@hapi/address/-/address-2.1.4.tgz#5d67ed43f3fd41a69d4b9ff7b56e7c0d1d0a81e5"
-  integrity sha512-QD1PhQk+s31P1ixsX0H0Suoupp3VMXzIVMSwobR3F3MSUO2YCV0B7xqLcUw/Bh8yuvd3LhpyqLQWTNcRmp6IdQ==
+"@hapi/hoek@^9.0.0":
+  version "9.2.1"
+  resolved "https://registry.yarnpkg.com/@hapi/hoek/-/hoek-9.2.1.tgz#9551142a1980503752536b5050fd99f4a7f13b17"
+  integrity sha512-gfta+H8aziZsm8pZa0vj04KO6biEiisppNgA1kbJvFrrWu9Vm7eaUEy76DIxsuTaWvti5fkJVhllWc6ZTE+Mdw==
 
-"@hapi/bourne@1.x.x":
-  version "1.3.2"
-  resolved "https://registry.yarnpkg.com/@hapi/bourne/-/bourne-1.3.2.tgz#0a7095adea067243ce3283e1b56b8a8f453b242a"
-  integrity sha512-1dVNHT76Uu5N3eJNTYcvxee+jzX4Z9lfciqRRHCU27ihbUcYi+iSc2iml5Ke1LXe1SyJCLA0+14Jh4tXJgOppA==
-
-"@hapi/hoek@8.x.x", "@hapi/hoek@^8.3.0":
-  version "8.5.1"
-  resolved "https://registry.yarnpkg.com/@hapi/hoek/-/hoek-8.5.1.tgz#fde96064ca446dec8c55a8c2f130957b070c6e06"
-  integrity sha512-yN7kbciD87WzLGc5539Tn0sApjyiGHAJgKvG9W8C7O+6c7qmoQMfVs0W4bX17eqz6C78QJqqFrtgdK5EWf6Qow==
-
-"@hapi/joi@^15.0.3":
-  version "15.1.1"
-  resolved "https://registry.yarnpkg.com/@hapi/joi/-/joi-15.1.1.tgz#c675b8a71296f02833f8d6d243b34c57b8ce19d7"
-  integrity sha512-entf8ZMOK8sc+8YfeOlM8pCfg3b5+WZIKBfUaaJT8UsjAAPjartzxIYm3TIbjvA4u+u++KbcXD38k682nVHDAQ==
+"@hapi/topo@^5.0.0":
+  version "5.1.0"
+  resolved "https://registry.yarnpkg.com/@hapi/topo/-/topo-5.1.0.tgz#dc448e332c6c6e37a4dc02fd84ba8d44b9afb012"
+  integrity sha512-foQZKJig7Ob0BMAYBfcJk8d77QtOe7Wo4ox7ff1lQYoNNAb6jwcY1ncdoy2e9wQZzvNy7ODZCYJkK8kzmcAnAg==
   dependencies:
-    "@hapi/address" "2.x.x"
-    "@hapi/bourne" "1.x.x"
-    "@hapi/hoek" "8.x.x"
-    "@hapi/topo" "3.x.x"
-
-"@hapi/topo@3.x.x":
-  version "3.1.6"
-  resolved "https://registry.yarnpkg.com/@hapi/topo/-/topo-3.1.6.tgz#68d935fa3eae7fdd5ab0d7f953f3205d8b2bfc29"
-  integrity sha512-tAag0jEcjwH+P2quUfipd7liWCNX2F8NvYjQp2wtInsZxnMlypdw0FtAOLxtvvkO+GSRRbmNi8m/5y42PQJYCQ==
-  dependencies:
-    "@hapi/hoek" "^8.3.0"
+    "@hapi/hoek" "^9.0.0"
 
 "@iarna/toml@2.2.5":
   version "2.2.5"
@@ -1114,15 +1401,6 @@
   resolved "https://registry.yarnpkg.com/@istanbuljs/schema/-/schema-0.1.3.tgz#e45e384e4b8ec16bce2fd903af78450f6bf7ec98"
   integrity sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==
 
-"@jest/console@^24.9.0":
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/@jest/console/-/console-24.9.0.tgz#79b1bc06fb74a8cfb01cbdedf945584b1b9707f0"
-  integrity sha512-Zuj6b8TnKXi3q4ymac8EQfc3ea/uhLeCGThFqXeC8H9/raaH8ARPUTdId+XyGd03Z4In0/VjD2OYFcBF09fNLQ==
-  dependencies:
-    "@jest/source-map" "^24.9.0"
-    chalk "^2.0.1"
-    slash "^2.0.0"
-
 "@jest/console@^26.6.2":
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/@jest/console/-/console-26.6.2.tgz#4e04bc464014358b03ab4937805ee36a0aeb98f2"
@@ -1169,6 +1447,13 @@
     slash "^3.0.0"
     strip-ansi "^6.0.0"
 
+"@jest/create-cache-key-function@^26.5.0":
+  version "26.6.2"
+  resolved "https://registry.yarnpkg.com/@jest/create-cache-key-function/-/create-cache-key-function-26.6.2.tgz#04cf439207a4fd12418d8aee551cddc86f9ac5f5"
+  integrity sha512-LgEuqU1f/7WEIPYqwLPIvvHuc1sB6gMVbT6zWhin3txYUNYK/kGQrC1F2WR4gR34YlI9bBtViTm5z98RqVZAaw==
+  dependencies:
+    "@jest/types" "^26.6.2"
+
 "@jest/environment@^26.6.2":
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/@jest/environment/-/environment-26.6.2.tgz#ba364cc72e221e79cc8f0a99555bf5d7577cf92c"
@@ -1179,15 +1464,6 @@
     "@types/node" "*"
     jest-mock "^26.6.2"
 
-"@jest/fake-timers@^24.9.0":
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/@jest/fake-timers/-/fake-timers-24.9.0.tgz#ba3e6bf0eecd09a636049896434d306636540c93"
-  integrity sha512-eWQcNa2YSwzXWIMC5KufBh3oWRIijrQFROsIqt6v/NS9Io/gknw1jsAC9c+ih/RQX4A3O7SeWAhQeN0goKhT9A==
-  dependencies:
-    "@jest/types" "^24.9.0"
-    jest-message-util "^24.9.0"
-    jest-mock "^24.9.0"
-
 "@jest/fake-timers@^26.6.2":
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/@jest/fake-timers/-/fake-timers-26.6.2.tgz#459c329bcf70cee4af4d7e3f3e67848123535aad"
@@ -1241,15 +1517,6 @@
   optionalDependencies:
     node-notifier "^8.0.0"
 
-"@jest/source-map@^24.9.0":
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/@jest/source-map/-/source-map-24.9.0.tgz#0e263a94430be4b41da683ccc1e6bffe2a191714"
-  integrity sha512-/Xw7xGlsZb4MJzNDgB7PW5crou5JqWiBQaz6xyPd3ArOg2nfn/PunV8+olXbbEZzNl591o5rWKE9BRDaFAuIBg==
-  dependencies:
-    callsites "^3.0.0"
-    graceful-fs "^4.1.15"
-    source-map "^0.6.0"
-
 "@jest/source-map@^26.6.2":
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/@jest/source-map/-/source-map-26.6.2.tgz#29af5e1e2e324cafccc936f218309f54ab69d535"
@@ -1259,15 +1526,6 @@
     graceful-fs "^4.2.4"
     source-map "^0.6.0"
 
-"@jest/test-result@^24.9.0":
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/@jest/test-result/-/test-result-24.9.0.tgz#11796e8aa9dbf88ea025757b3152595ad06ba0ca"
-  integrity sha512-XEFrHbBonBJ8dGp2JmF8kP/nQI/ImPpygKHwQ/SY+es59Z3L5PI4Qb9TQQMAEeYsThG1xF0k6tmG0tIKATNiiA==
-  dependencies:
-    "@jest/console" "^24.9.0"
-    "@jest/types" "^24.9.0"
-    "@types/istanbul-lib-coverage" "^2.0.0"
-
 "@jest/test-result@^26.6.2":
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/@jest/test-result/-/test-result-26.6.2.tgz#55da58b62df134576cc95476efa5f7949e3f5f18"
@@ -1310,25 +1568,6 @@
     source-map "^0.6.1"
     write-file-atomic "^3.0.0"
 
-"@jest/types@^24.9.0":
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/@jest/types/-/types-24.9.0.tgz#63cb26cb7500d069e5a389441a7c6ab5e909fc59"
-  integrity sha512-XKK7ze1apu5JWQ5eZjHITP66AX+QsLlbaJRBGYr8pNzwcAE2JVkwnf0yqjHTsDRcjR0mujy/NmZMXw5kl+kGBw==
-  dependencies:
-    "@types/istanbul-lib-coverage" "^2.0.0"
-    "@types/istanbul-reports" "^1.1.1"
-    "@types/yargs" "^13.0.0"
-
-"@jest/types@^25.5.0":
-  version "25.5.0"
-  resolved "https://registry.yarnpkg.com/@jest/types/-/types-25.5.0.tgz#4d6a4793f7b9599fc3680877b856a97dbccf2a9d"
-  integrity sha512-OXD0RgQ86Tu3MazKo8bnrkDRaDXXMGUqd+kTtLtK1Zb7CRzQcaSRPPPV37SvYTdevXEBVxe0HXylEjs8ibkmCw==
-  dependencies:
-    "@types/istanbul-lib-coverage" "^2.0.0"
-    "@types/istanbul-reports" "^1.1.1"
-    "@types/yargs" "^15.0.0"
-    chalk "^3.0.0"
-
 "@jest/types@^26.6.2":
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/@jest/types/-/types-26.6.2.tgz#bef5a532030e1d88a2f5a6d933f84e97226ed48e"
@@ -1462,30 +1701,30 @@
   dependencies:
     "@octokit/openapi-types" "^7.0.0"
 
-"@react-native-community/cli-debugger-ui@^4.13.1":
-  version "4.13.1"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-debugger-ui/-/cli-debugger-ui-4.13.1.tgz#07de6d4dab80ec49231de1f1fbf658b4ad39b32c"
-  integrity sha512-UFnkg5RTq3s2X15fSkrWY9+5BKOFjihNSnJjTV2H5PtTUFbd55qnxxPw8CxSfK0bXb1IrSvCESprk2LEpqr5cg==
+"@react-native-community/cli-debugger-ui@^5.0.1":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-debugger-ui/-/cli-debugger-ui-5.0.1.tgz#6b1f3367b8e5211e899983065ea2e72c1901d75f"
+  integrity sha512-5gGKaaXYOVE423BUqxIfvfAVSj5Cg1cU/TpGbeg/iqpy2CfqyWqJB3tTuVUbOOiOvR5wbU8tti6pIi1pchJ+oA==
   dependencies:
     serve-static "^1.13.1"
 
-"@react-native-community/cli-hermes@^4.13.0":
-  version "4.13.0"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-hermes/-/cli-hermes-4.13.0.tgz#6243ed9c709dad5e523f1ccd7d21066b32f2899d"
-  integrity sha512-oG+w0Uby6rSGsUkJGLvMQctZ5eVRLLfhf84lLyz942OEDxFRa9U19YJxOe9FmgCKtotbYiM3P/XhK+SVCuerPQ==
+"@react-native-community/cli-hermes@^5.0.1":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-hermes/-/cli-hermes-5.0.1.tgz#039d064bf2dcd5043beb7dcd6cdf5f5cdd51e7fc"
+  integrity sha512-nD+ZOFvu5MfjLB18eDJ01MNiFrzj8SDtENjGpf0ZRFndOWASDAmU54/UlU/wj8OzTToK1+S1KY7j2P2M1gleww==
   dependencies:
-    "@react-native-community/cli-platform-android" "^4.13.0"
-    "@react-native-community/cli-tools" "^4.13.0"
+    "@react-native-community/cli-platform-android" "^5.0.1"
+    "@react-native-community/cli-tools" "^5.0.1"
     chalk "^3.0.0"
     hermes-profile-transformer "^0.0.6"
     ip "^1.1.5"
 
-"@react-native-community/cli-platform-android@^4.10.0", "@react-native-community/cli-platform-android@^4.13.0":
-  version "4.13.0"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-platform-android/-/cli-platform-android-4.13.0.tgz#922681ec82ee1aadd993598b814df1152118be02"
-  integrity sha512-3i8sX8GklEytUZwPnojuoFbCjIRzMugCdzDIdZ9UNmi/OhD4/8mLGO0dgXfT4sMWjZwu3qjy45sFfk2zOAgHbA==
+"@react-native-community/cli-platform-android@^5.0.1", "@react-native-community/cli-platform-android@^5.0.1-alpha.0":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-platform-android/-/cli-platform-android-5.0.1.tgz#7f761e1818e5a099877ec59a1b739553fd6a6905"
+  integrity sha512-qv9GJX6BJ+Y4qvV34vgxKwwN1cnveXUdP6y2YmTW7XoAYs5YUzKqHajpY58EyucAL2y++6+573t5y4U/9IIoww==
   dependencies:
-    "@react-native-community/cli-tools" "^4.13.0"
+    "@react-native-community/cli-tools" "^5.0.1"
     chalk "^3.0.0"
     execa "^1.0.0"
     fs-extra "^8.1.0"
@@ -1496,12 +1735,12 @@
     slash "^3.0.0"
     xmldoc "^1.1.2"
 
-"@react-native-community/cli-platform-ios@^4.10.0":
-  version "4.13.0"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-platform-ios/-/cli-platform-ios-4.13.0.tgz#a738915c68cac86df54e578b59a1311ea62b1aef"
-  integrity sha512-6THlTu8zp62efkzimfGr3VIuQJ2514o+vScZERJCV1xgEi8XtV7mb/ZKt9o6Y9WGxKKkc0E0b/aVAtgy+L27CA==
+"@react-native-community/cli-platform-ios@^5.0.1-alpha.0":
+  version "5.0.2"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-platform-ios/-/cli-platform-ios-5.0.2.tgz#62485534053c0dad28a67de188248de177f4b0fb"
+  integrity sha512-IAJ2B3j2BTsQUJZ4R6cVvnTbPq0Vza7+dOgP81ISz2BKRtQ0VqNFv+VOALH2jLaDzf4t7NFlskzIXFqWqy2BLg==
   dependencies:
-    "@react-native-community/cli-tools" "^4.13.0"
+    "@react-native-community/cli-tools" "^5.0.1"
     chalk "^3.0.0"
     glob "^7.1.3"
     js-yaml "^3.13.1"
@@ -1509,25 +1748,25 @@
     plist "^3.0.1"
     xcode "^2.0.0"
 
-"@react-native-community/cli-server-api@^4.13.1":
-  version "4.13.1"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-server-api/-/cli-server-api-4.13.1.tgz#bee7ee9702afce848e9d6ca3dcd5669b99b125bd"
-  integrity sha512-vQzsFKD9CjHthA2ehTQX8c7uIzlI9A7ejaIow1I9RlEnLraPH2QqVDmzIdbdh5Od47UPbRzamCgAP8Bnqv3qwQ==
+"@react-native-community/cli-server-api@^5.0.1":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-server-api/-/cli-server-api-5.0.1.tgz#3cf92dac766fab766afedf77df3fe4d5f51e4d2b"
+  integrity sha512-OOxL+y9AOZayQzmSW+h5T54wQe+QBc/f67Y9QlWzzJhkKJdYx+S4VOooHoD5PFJzGbYaxhu2YF17p517pcEIIA==
   dependencies:
-    "@react-native-community/cli-debugger-ui" "^4.13.1"
-    "@react-native-community/cli-tools" "^4.13.0"
+    "@react-native-community/cli-debugger-ui" "^5.0.1"
+    "@react-native-community/cli-tools" "^5.0.1"
     compression "^1.7.1"
     connect "^3.6.5"
     errorhandler "^1.5.0"
     nocache "^2.1.0"
-    pretty-format "^25.1.0"
+    pretty-format "^26.6.2"
     serve-static "^1.13.1"
     ws "^1.1.0"
 
-"@react-native-community/cli-tools@^4.13.0":
-  version "4.13.0"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-tools/-/cli-tools-4.13.0.tgz#b406463d33af16cedc4305a9a9257ed32845cf1b"
-  integrity sha512-s4f489h5+EJksn4CfheLgv5PGOM0CDmK1UEBLw2t/ncWs3cW2VI7vXzndcd/WJHTv3GntJhXDcJMuL+Z2IAOgg==
+"@react-native-community/cli-tools@^5.0.1":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-tools/-/cli-tools-5.0.1.tgz#9ee564dbe20448becd6bce9fbea1b59aa5797919"
+  integrity sha512-XOX5w98oSE8+KnkMZZPMRT7I5TaP8fLbDl0tCu40S7Epz+Zz924n80fmdu6nUDIfPT1nV6yH1hmHmWAWTDOR+Q==
   dependencies:
     chalk "^3.0.0"
     lodash "^4.17.15"
@@ -1536,22 +1775,24 @@
     open "^6.2.0"
     shell-quote "1.6.1"
 
-"@react-native-community/cli-types@^4.10.1":
-  version "4.10.1"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli-types/-/cli-types-4.10.1.tgz#d68a2dcd1649d3b3774823c64e5e9ce55bfbe1c9"
-  integrity sha512-ael2f1onoPF3vF7YqHGWy7NnafzGu+yp88BbFbP0ydoCP2xGSUzmZVw0zakPTC040Id+JQ9WeFczujMkDy6jYQ==
-
-"@react-native-community/cli@^4.10.0":
-  version "4.14.0"
-  resolved "https://registry.yarnpkg.com/@react-native-community/cli/-/cli-4.14.0.tgz#bb106a98341bfa2db36060091ff90bfe82ea4f55"
-  integrity sha512-EYJKBuxFxAu/iwNUfwDq41FjORpvSh1wvQ3qsHjzcR5uaGlWEOJrd3uNJDuKBAS0TVvbEesLF9NEXipjyRVr4Q==
+"@react-native-community/cli-types@^5.0.1":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli-types/-/cli-types-5.0.1.tgz#8c5db4011988b0836d27a5efe230cb34890915dc"
+  integrity sha512-BesXnuFFlU/d1F3+sHhvKt8fUxbQlAbZ3hhMEImp9A6sopl8TEtryUGJ1dbazGjRXcADutxvjwT/i3LJVTIQug==
   dependencies:
-    "@hapi/joi" "^15.0.3"
-    "@react-native-community/cli-debugger-ui" "^4.13.1"
-    "@react-native-community/cli-hermes" "^4.13.0"
-    "@react-native-community/cli-server-api" "^4.13.1"
-    "@react-native-community/cli-tools" "^4.13.0"
-    "@react-native-community/cli-types" "^4.10.1"
+    ora "^3.4.0"
+
+"@react-native-community/cli@^5.0.1-alpha.0":
+  version "5.0.1"
+  resolved "https://registry.yarnpkg.com/@react-native-community/cli/-/cli-5.0.1.tgz#1f7a66d813d5daf102e593f3c550650fa0cc8314"
+  integrity sha512-9VzSYUYSEqxEH5Ib2UNSdn2eyPiYZ4T7Y79o9DKtRBuSaUIwbCUdZtIm+UUjBpLS1XYBkW26FqL8/UdZDmQvXw==
+  dependencies:
+    "@react-native-community/cli-debugger-ui" "^5.0.1"
+    "@react-native-community/cli-hermes" "^5.0.1"
+    "@react-native-community/cli-server-api" "^5.0.1"
+    "@react-native-community/cli-tools" "^5.0.1"
+    "@react-native-community/cli-types" "^5.0.1"
+    appdirsjs "^1.2.4"
     chalk "^3.0.0"
     command-exists "^1.2.8"
     commander "^2.19.0"
@@ -1563,25 +1804,42 @@
     fs-extra "^8.1.0"
     glob "^7.1.3"
     graceful-fs "^4.1.3"
-    inquirer "^3.0.6"
+    joi "^17.2.1"
     leven "^3.1.0"
     lodash "^4.17.15"
-    metro "^0.59.0"
-    metro-config "^0.59.0"
-    metro-core "^0.59.0"
-    metro-react-native-babel-transformer "^0.59.0"
-    metro-resolver "^0.59.0"
+    metro "^0.64.0"
+    metro-config "^0.64.0"
+    metro-core "^0.64.0"
+    metro-react-native-babel-transformer "^0.64.0"
+    metro-resolver "^0.64.0"
+    metro-runtime "^0.64.0"
     minimist "^1.2.0"
     mkdirp "^0.5.1"
     node-stream-zip "^1.9.1"
     ora "^3.4.0"
-    pretty-format "^25.2.0"
+    pretty-format "^26.6.2"
+    prompts "^2.4.0"
     semver "^6.3.0"
     serve-static "^1.13.1"
     strip-ansi "^5.2.0"
     sudo-prompt "^9.0.0"
     wcwidth "^1.0.1"
 
+"@react-native/assets@1.0.0":
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/@react-native/assets/-/assets-1.0.0.tgz#c6f9bf63d274bafc8e970628de24986b30a55c8e"
+  integrity sha512-KrwSpS1tKI70wuKl68DwJZYEvXktDHdZMG0k2AXD/rJVSlB23/X2CB2cutVR0HwNMJIal9HOUOBB2rVfa6UGtQ==
+
+"@react-native/normalize-color@1.0.0":
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/@react-native/normalize-color/-/normalize-color-1.0.0.tgz#c52a99d4fe01049102d47dc45d40cbde4f720ab6"
+  integrity sha512-xUNRvNmCl3UGCPbbHvfyFMnpvLPoOjDCcp5bT9m2k+TF/ZBklEQwhPZlkrxRx2NhgFh1X3a5uL7mJ7ZR+8G7Qg==
+
+"@react-native/polyfills@1.0.0":
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/@react-native/polyfills/-/polyfills-1.0.0.tgz#05bb0031533598f9458cf65a502b8df0eecae780"
+  integrity sha512-0jbp4RxjYopTsIdLl+/Fy2TiwVYHy4mgeu07DG4b/LyM0OS/+lPP5c9sbnt/AMlnF6qz2JRZpPpGw1eMNS6A4w==
+
 "@release-it/conventional-changelog@^2.0.0":
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/@release-it/conventional-changelog/-/conventional-changelog-2.0.1.tgz#bdd52ad3ecc0d6e39d637592d6ea2bd6d28e5ecb"
@@ -1592,6 +1850,23 @@
     conventional-recommended-bump "^6.1.0"
     prepend-file "^2.0.0"
 
+"@sideway/address@^4.1.0":
+  version "4.1.2"
+  resolved "https://registry.yarnpkg.com/@sideway/address/-/address-4.1.2.tgz#811b84333a335739d3969cfc434736268170cad1"
+  integrity sha512-idTz8ibqWFrPU8kMirL0CoPH/A29XOzzAzpyN3zQ4kAWnzmNfFmRaoMNN6VI8ske5M73HZyhIaW4OuSFIdM4oA==
+  dependencies:
+    "@hapi/hoek" "^9.0.0"
+
+"@sideway/formula@^3.0.0":
+  version "3.0.0"
+  resolved "https://registry.yarnpkg.com/@sideway/formula/-/formula-3.0.0.tgz#fe158aee32e6bd5de85044be615bc08478a0a13c"
+  integrity sha512-vHe7wZ4NOXVfkoRb8T5otiENVlT7a3IAiw7H5M2+GO+9CDgcVUUsX1zalAztCmwyOr2RUTGJdgB+ZvSVqmdHmg==
+
+"@sideway/pinpoint@^2.0.0":
+  version "2.0.0"
+  resolved "https://registry.yarnpkg.com/@sideway/pinpoint/-/pinpoint-2.0.0.tgz#cff8ffadc372ad29fd3f78277aeb29e632cc70df"
+  integrity sha512-RNiOoTPkptFtSVzQevY/yWtZwf/RxyVnPy/OcA9HBM3MlGDnBEYL5B41H0MTn0Uec8Hi+2qUtTfG2WWZBmMejQ==
+
 "@sindresorhus/is@^0.14.0":
   version "0.14.0"
   resolved "https://registry.yarnpkg.com/@sindresorhus/is/-/is-0.14.0.tgz#9fb3a3cf3132328151f353de4632e01e52102bea"
@@ -1704,14 +1979,6 @@
   dependencies:
     "@types/istanbul-lib-coverage" "*"
 
-"@types/istanbul-reports@^1.1.1":
-  version "1.1.2"
-  resolved "https://registry.yarnpkg.com/@types/istanbul-reports/-/istanbul-reports-1.1.2.tgz#e875cc689e47bce549ec81f3df5e6f6f11cfaeb2"
-  integrity sha512-P/W9yOX/3oPZSpaYOCQzGqgCQRXn0FFO/V8bWrCQs+wLmvVVxk6CRBXALEvNs9OHIatlnlFokfhuDo2ug01ciw==
-  dependencies:
-    "@types/istanbul-lib-coverage" "*"
-    "@types/istanbul-lib-report" "*"
-
 "@types/istanbul-reports@^3.0.0":
   version "3.0.0"
   resolved "https://registry.yarnpkg.com/@types/istanbul-reports/-/istanbul-reports-3.0.0.tgz#508b13aa344fa4976234e75dddcc34925737d821"
@@ -1801,11 +2068,6 @@
   resolved "https://registry.yarnpkg.com/@types/scheduler/-/scheduler-0.16.1.tgz#18845205e86ff0038517aab7a18a62a6b9f71275"
   integrity sha512-EaCxbanVeyxDRTQBkdLb3Bvl/HK7PBK6UJjsSixB0iHKoWxE5uu2Q/DgtpOhPIojN0Zl1whvOd7PoHs2P0s5eA==
 
-"@types/stack-utils@^1.0.1":
-  version "1.0.1"
-  resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-1.0.1.tgz#0a851d3bd96498fa25c33ab7278ed3bd65f06c3e"
-  integrity sha512-l42BggppR6zLmpfU6fq9HEa2oGPEI8yrSPL3GITjfRInppYFahObbIQOQK3UGxEnyQpltZLaPe75046NOZQikw==
-
 "@types/stack-utils@^2.0.0":
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/@types/stack-utils/-/stack-utils-2.0.0.tgz#7036640b4e21cc2f259ae826ce843d277dad8cff"
@@ -1816,13 +2078,6 @@
   resolved "https://registry.yarnpkg.com/@types/yargs-parser/-/yargs-parser-20.2.0.tgz#dd3e6699ba3237f0348cd085e4698780204842f9"
   integrity sha512-37RSHht+gzzgYeobbG+KWryeAW8J33Nhr69cjTqSYymXVZEN9NbRYWoYlRtDhHKPVT1FyNKwaTPC1NynKZpzRA==
 
-"@types/yargs@^13.0.0":
-  version "13.0.11"
-  resolved "https://registry.yarnpkg.com/@types/yargs/-/yargs-13.0.11.tgz#def2f0c93e4bdf2c61d7e34899b17e34be28d3b1"
-  integrity sha512-NRqD6T4gktUrDi1o1wLH3EKC1o2caCr7/wR87ODcbVITQF106OM3sFN92ysZ++wqelOd1CTzatnOBRDYYG6wGQ==
-  dependencies:
-    "@types/yargs-parser" "*"
-
 "@types/yargs@^15.0.0":
   version "15.0.13"
   resolved "https://registry.yarnpkg.com/@types/yargs/-/yargs-15.0.13.tgz#34f7fec8b389d7f3c1fd08026a5763e072d3c6dc"
@@ -1855,7 +2110,7 @@ absolute-path@^0.0.0:
   resolved "https://registry.yarnpkg.com/absolute-path/-/absolute-path-0.0.0.tgz#a78762fbdadfb5297be99b15d35a785b2f095bf7"
   integrity sha1-p4di+9rftSl76ZsV01p4Wy8JW/c=
 
-accepts@~1.3.5, accepts@~1.3.7:
+accepts@^1.3.7, accepts@~1.3.5, accepts@~1.3.7:
   version "1.3.7"
   resolved "https://registry.yarnpkg.com/accepts/-/accepts-1.3.7.tgz#531bc726517a3b2b41f850021c6cc15eaab507cd"
   integrity sha512-Il80Qs2WjYlJIBNzNkK6KYqlVMTbZLXgHx2oT0pU/fjRHyEp+PEfEPY0R3WCwAGVOtauxh1hOxNgIf5bv7dQpA==
@@ -1921,25 +2176,6 @@ ansi-align@^3.0.0:
   dependencies:
     string-width "^3.0.0"
 
-ansi-colors@^1.0.1:
-  version "1.1.0"
-  resolved "https://registry.yarnpkg.com/ansi-colors/-/ansi-colors-1.1.0.tgz#6374b4dd5d4718ff3ce27a671a3b1cad077132a9"
-  integrity sha512-SFKX67auSNoVR38N3L+nvsPjOE0bybKTYbkf5tRvushrAPQ9V75huw0ZxBkKVeRU9kqH3d6HA4xTckbwZ4ixmA==
-  dependencies:
-    ansi-wrap "^0.1.0"
-
-ansi-cyan@^0.1.1:
-  version "0.1.1"
-  resolved "https://registry.yarnpkg.com/ansi-cyan/-/ansi-cyan-0.1.1.tgz#538ae528af8982f28ae30d86f2f17456d2609873"
-  integrity sha1-U4rlKK+JgvKK4w2G8vF0VtJgmHM=
-  dependencies:
-    ansi-wrap "0.1.0"
-
-ansi-escapes@^3.0.0:
-  version "3.2.0"
-  resolved "https://registry.yarnpkg.com/ansi-escapes/-/ansi-escapes-3.2.0.tgz#8780b98ff9dbf5638152d1f1fe5c1d7b4442976b"
-  integrity sha512-cBhpre4ma+U0T1oM5fXg7Dy1Jw7zzwv7lt/GoCpr+hDQJoYnKVPLL4dCvSEFMmQurOQvSrwT7SL/DAlhBI97RQ==
-
 ansi-escapes@^4.2.1:
   version "4.3.2"
   resolved "https://registry.yarnpkg.com/ansi-escapes/-/ansi-escapes-4.3.2.tgz#6b2291d1db7d98b6521d5f1efa42d0f3a9feb65e"
@@ -1956,26 +2192,7 @@ ansi-fragments@^0.2.1:
     slice-ansi "^2.0.0"
     strip-ansi "^5.0.0"
 
-ansi-gray@^0.1.1:
-  version "0.1.1"
-  resolved "https://registry.yarnpkg.com/ansi-gray/-/ansi-gray-0.1.1.tgz#2962cf54ec9792c48510a3deb524436861ef7251"
-  integrity sha1-KWLPVOyXksSFEKPetSRDaGHvclE=
-  dependencies:
-    ansi-wrap "0.1.0"
-
-ansi-red@^0.1.1:
-  version "0.1.1"
-  resolved "https://registry.yarnpkg.com/ansi-red/-/ansi-red-0.1.1.tgz#8c638f9d1080800a353c9c28c8a81ca4705d946c"
-  integrity sha1-jGOPnRCAgAo1PJwoyKgcpHBdlGw=
-  dependencies:
-    ansi-wrap "0.1.0"
-
-ansi-regex@^3.0.0:
-  version "3.0.0"
-  resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-3.0.0.tgz#ed0317c322064f79466c02966bddb605ab37d998"
-  integrity sha1-7QMXwyIGT3lGbAKWa922Bas32Zg=
-
-ansi-regex@^4.0.0, ansi-regex@^4.1.0:
+ansi-regex@^4.1.0:
   version "4.1.0"
   resolved "https://registry.yarnpkg.com/ansi-regex/-/ansi-regex-4.1.0.tgz#8b9f8f08cf1acb843756a839ca8c7e3168c51997"
   integrity sha512-1apePfXM1UOSqw0o9IiFAovVz9M5S1Dg+4TrDwfMewQ6p/rmMueb7tWZjQ1rx4Loy1ArBggoqGpfqqdI4rondg==
@@ -1999,11 +2216,6 @@ ansi-styles@^4.0.0, ansi-styles@^4.1.0:
   dependencies:
     color-convert "^2.0.1"
 
-ansi-wrap@0.1.0, ansi-wrap@^0.1.0:
-  version "0.1.0"
-  resolved "https://registry.yarnpkg.com/ansi-wrap/-/ansi-wrap-0.1.0.tgz#a82250ddb0015e9a27ca82e82ea603bbfa45efaf"
-  integrity sha1-qCJQ3bABXponyoLoLqYDu/pF768=
-
 anymatch@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/anymatch/-/anymatch-2.0.0.tgz#bcb24b4f37934d9aa7ac17b4adaf89e7c76ef2eb"
@@ -2020,6 +2232,11 @@ anymatch@^3.0.3:
     normalize-path "^3.0.0"
     picomatch "^2.0.4"
 
+appdirsjs@^1.2.4:
+  version "1.2.6"
+  resolved "https://registry.yarnpkg.com/appdirsjs/-/appdirsjs-1.2.6.tgz#fccf9ee543315492867cacfcfd4a2b32257d30ac"
+  integrity sha512-D8wJNkqMCeQs3kLasatELsddox/Xqkhp+J07iXGyL54fVN7oc+nmNfYzGuCs1IEP6uBw+TfpuO3JKwc+lECy4w==
+
 argparse@^1.0.7:
   version "1.0.10"
   resolved "https://registry.yarnpkg.com/argparse/-/argparse-1.0.10.tgz#bcd6791ea5ae09725e17e5ad988134cd40b3d911"
@@ -2027,29 +2244,16 @@ argparse@^1.0.7:
   dependencies:
     sprintf-js "~1.0.2"
 
-arr-diff@^1.0.1:
-  version "1.1.0"
-  resolved "https://registry.yarnpkg.com/arr-diff/-/arr-diff-1.1.0.tgz#687c32758163588fef7de7b36fabe495eb1a399a"
-  integrity sha1-aHwydYFjWI/vfeezb6vklesaOZo=
-  dependencies:
-    arr-flatten "^1.0.1"
-    array-slice "^0.2.3"
-
 arr-diff@^4.0.0:
   version "4.0.0"
   resolved "https://registry.yarnpkg.com/arr-diff/-/arr-diff-4.0.0.tgz#d6461074febfec71e7e15235761a329a5dc7c520"
   integrity sha1-1kYQdP6/7HHn4VI1dhoyml3HxSA=
 
-arr-flatten@^1.0.1, arr-flatten@^1.1.0:
+arr-flatten@^1.1.0:
   version "1.1.0"
   resolved "https://registry.yarnpkg.com/arr-flatten/-/arr-flatten-1.1.0.tgz#36048bbff4e7b47e136644316c99669ea5ae91f1"
   integrity sha512-L3hKV5R/p5o81R7O02IGnwpDmkp6E982XhtbuwSe3O4qOtMMMtodicASA1Cny2U+aCXcNpml+m4dPsvsJ3jatg==
 
-arr-union@^2.0.1:
-  version "2.1.0"
-  resolved "https://registry.yarnpkg.com/arr-union/-/arr-union-2.1.0.tgz#20f9eab5ec70f5c7d215b1077b1c39161d292c7d"
-  integrity sha1-IPnqtexw9cfSFbEHexw5Fh0pLH0=
-
 arr-union@^3.1.0:
   version "3.1.0"
   resolved "https://registry.yarnpkg.com/arr-union/-/arr-union-3.1.0.tgz#e39b09aea9def866a8f206e288af63919bae39c4"
@@ -2080,11 +2284,6 @@ array-reduce@~0.0.0:
   resolved "https://registry.yarnpkg.com/array-reduce/-/array-reduce-0.0.0.tgz#173899d3ffd1c7d9383e4479525dbe278cab5f2b"
   integrity sha1-FziZ0//Rx9k4PkR5Ul2+J4yrXys=
 
-array-slice@^0.2.3:
-  version "0.2.3"
-  resolved "https://registry.yarnpkg.com/array-slice/-/array-slice-0.2.3.tgz#dd3cfb80ed7973a75117cdac69b0b99ec86186f5"
-  integrity sha1-3Tz7gO15c6dRF82sabC5nshhhvU=
-
 array-union@^2.1.0:
   version "2.1.0"
   resolved "https://registry.yarnpkg.com/array-union/-/array-union-2.1.0.tgz#b798420adbeb1de828d84acd8a2e23d3efe85e8d"
@@ -2100,7 +2299,7 @@ arrify@^1.0.1:
   resolved "https://registry.yarnpkg.com/arrify/-/arrify-1.0.1.tgz#898508da2226f380df904728456849c1501a4b0d"
   integrity sha1-iYUI2iIm84DfkEcoRWhJwVAaSw0=
 
-asap@~2.0.3, asap@~2.0.6:
+asap@~2.0.6:
   version "2.0.6"
   resolved "https://registry.yarnpkg.com/asap/-/asap-2.0.6.tgz#e50347611d7e690943208bbdafebcbc2fb866d46"
   integrity sha1-5QNHYR1+aQlDIIu9r+vLwvuGbUY=
@@ -2122,11 +2321,23 @@ assign-symbols@^1.0.0:
   resolved "https://registry.yarnpkg.com/assign-symbols/-/assign-symbols-1.0.0.tgz#59667f41fadd4f20ccbc2bb96b8d4f7f78ec0367"
   integrity sha1-WWZ/QfrdTyDMvCu5a41Pf3jsA2c=
 
+ast-types@0.14.2:
+  version "0.14.2"
+  resolved "https://registry.yarnpkg.com/ast-types/-/ast-types-0.14.2.tgz#600b882df8583e3cd4f2df5fa20fa83759d4bdfd"
+  integrity sha512-O0yuUDnZeQDL+ncNGlJ78BiO4jnYI3bvMsD5prT0/nsgijG/LpNBIr63gTjVTNsiGkgQhiyCShTgxt8oXOrklA==
+  dependencies:
+    tslib "^2.0.1"
+
 astral-regex@^1.0.0:
   version "1.0.0"
   resolved "https://registry.yarnpkg.com/astral-regex/-/astral-regex-1.0.0.tgz#6c8c3fb827dd43ee3918f27b82782ab7658a6fd9"
   integrity sha512-+Ryf6g3BKoRc7jfp7ad8tM4TtMiaWvbF/1/sQcZPkkS7ag3D5nMBCe2UfOTONtAkaG0tO0ij3C5Lwmf1EiyjHg==
 
+async-limiter@~1.0.0:
+  version "1.0.1"
+  resolved "https://registry.yarnpkg.com/async-limiter/-/async-limiter-1.0.1.tgz#dd379e94f0db8310b08291f9d64c3209766617fd"
+  integrity sha512-csOlWGAcRFJaI6m+F2WKdnMKr4HhdhFVBk0H/QbJFMCr+uO2kwohwXQPxw/9OCxp05r5ghVBFSyioixx3gfkNQ==
+
 async-retry@1.3.1:
   version "1.3.1"
   resolved "https://registry.yarnpkg.com/async-retry/-/async-retry-1.3.1.tgz#139f31f8ddce50c0870b0ba558a6079684aaed55"
@@ -2171,6 +2382,11 @@ aws4@^1.8.0:
   resolved "https://registry.yarnpkg.com/aws4/-/aws4-1.11.0.tgz#d61f46d83b2519250e2784daf5b09479a8b41c59"
   integrity sha512-xh1Rl34h6Fi1DC2WWKfxUTVqRsNnr6LsKz2+hfwDxQJWmrx8+c7ylaqBMcHfl1U1r2dsifOvKX3LQuLNZ+XSvA==
 
+babel-core@^7.0.0-bridge.0:
+  version "7.0.0-bridge.0"
+  resolved "https://registry.yarnpkg.com/babel-core/-/babel-core-7.0.0-bridge.0.tgz#95a492ddd90f9b4e9a4a1da14eb335b87b634ece"
+  integrity sha512-poPX9mZH/5CSanm50Q+1toVci6pv5KSRv/5TWCwtzQS5XEwn40BcCrgIeMFWP9CKKIniKXNxoIOnOq4VVlGXhg==
+
 babel-jest@^26.6.3:
   version "26.6.3"
   resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-26.6.3.tgz#d87d25cb0037577a0c89f82e5755c5d293c01056"
@@ -2260,7 +2476,7 @@ babel-preset-current-node-syntax@^1.0.0:
     "@babel/plugin-syntax-optional-chaining" "^7.8.3"
     "@babel/plugin-syntax-top-level-await" "^7.8.3"
 
-babel-preset-fbjs@^3.2.0, babel-preset-fbjs@^3.3.0:
+babel-preset-fbjs@^3.3.0:
   version "3.3.0"
   resolved "https://registry.yarnpkg.com/babel-preset-fbjs/-/babel-preset-fbjs-3.3.0.tgz#a6024764ea86c8e06a22d794ca8b69534d263541"
   integrity sha512-7QTLTCd2gwB2qGoi5epSULMHugSVgpcVt5YAeiFO9ABLrutDQzKfGwzxgZHLpugq8qMdg/DhRZDZ5CLKxBkEbw==
@@ -2341,13 +2557,6 @@ big-integer@^1.6.44:
   resolved "https://registry.yarnpkg.com/big-integer/-/big-integer-1.6.48.tgz#8fd88bd1632cba4a1c8c3e3d7159f08bb95b4b9e"
   integrity sha512-j51egjPa7/i+RdiRuJbPdJ2FIUYYPhvYLjzoYbcMMm62ooO6F94fETG4MTs46zPAF9Brs04OajboA/qTGuz78w==
 
-bindings@^1.5.0:
-  version "1.5.0"
-  resolved "https://registry.yarnpkg.com/bindings/-/bindings-1.5.0.tgz#10353c9e945334bc0511a6d90b38fbc7c9c504df"
-  integrity sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==
-  dependencies:
-    file-uri-to-path "1.0.0"
-
 bl@^4.1.0:
   version "4.1.0"
   resolved "https://registry.yarnpkg.com/bl/-/bl-4.1.0.tgz#451535264182bec2fbbc83a62ab98cf11d9f7b3a"
@@ -2432,6 +2641,17 @@ browserslist@^4.14.5, browserslist@^4.16.0, browserslist@^4.16.6:
     escalade "^3.1.1"
     node-releases "^1.1.71"
 
+browserslist@^4.17.5:
+  version "4.18.1"
+  resolved "https://registry.yarnpkg.com/browserslist/-/browserslist-4.18.1.tgz#60d3920f25b6860eb917c6c7b185576f4d8b017f"
+  integrity sha512-8ScCzdpPwR2wQh8IT82CA2VgDwjHyqMovPBZSNH54+tm4Jk2pCuv90gmAdH6J84OCRWi0b4gMe6O6XPXuJnjgQ==
+  dependencies:
+    caniuse-lite "^1.0.30001280"
+    electron-to-chromium "^1.3.896"
+    escalade "^3.1.1"
+    node-releases "^2.0.1"
+    picocolors "^1.0.0"
+
 bser@2.1.1:
   version "2.1.1"
   resolved "https://registry.yarnpkg.com/bser/-/bser-2.1.1.tgz#e6787da20ece9d07998533cfd9de6f5c38f4bc05"
@@ -2439,11 +2659,6 @@ bser@2.1.1:
   dependencies:
     node-int64 "^0.4.0"
 
-buffer-crc32@^0.2.13:
-  version "0.2.13"
-  resolved "https://registry.yarnpkg.com/buffer-crc32/-/buffer-crc32-0.2.13.tgz#0d333e3f00eac50aa1454abd30ef8c2a5d9a7242"
-  integrity sha1-DTM+PwDqxQqhRUq9MO+MKl2ackI=
-
 buffer-from@^1.0.0:
   version "1.1.1"
   resolved "https://registry.yarnpkg.com/buffer-from/-/buffer-from-1.1.1.tgz#32713bc028f75c02fdb710d7c7bcec1f2c6070ef"
@@ -2590,6 +2805,11 @@ caniuse-lite@^1.0.30001219:
   resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001221.tgz#b916721ddf59066cfbe96c5c9a77cf7ae5c52e65"
   integrity sha512-b9TOZfND3uGSLjMOrLh8XxSQ41x8mX+9MLJYDM4AAHLfaZHttrLNPrScWjVnBITRZbY5sPpCt7X85n7VSLZ+/g==
 
+caniuse-lite@^1.0.30001280:
+  version "1.0.30001282"
+  resolved "https://registry.yarnpkg.com/caniuse-lite/-/caniuse-lite-1.0.30001282.tgz#38c781ee0a90ccfe1fe7fefd00e43f5ffdcb96fd"
+  integrity sha512-YhF/hG6nqBEllymSIjLtR2iWDDnChvhnVJqp+vloyt2tEHFG1yBR+ac2B/rOw0qOK0m0lEXU2dv4E/sMk5P9Kg==
+
 capture-exit@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/capture-exit/-/capture-exit-2.0.0.tgz#fb953bfaebeb781f62898239dabb426d08a509a4"
@@ -2610,7 +2830,7 @@ chalk@4.1.0:
     ansi-styles "^4.1.0"
     supports-color "^7.1.0"
 
-chalk@^2.0.0, chalk@^2.0.1, chalk@^2.4.1, chalk@^2.4.2:
+chalk@^2.0.0, chalk@^2.0.1, chalk@^2.4.2:
   version "2.4.2"
   resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
   integrity sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==
@@ -2640,11 +2860,6 @@ char-regex@^1.0.2:
   resolved "https://registry.yarnpkg.com/char-regex/-/char-regex-1.0.2.tgz#d744358226217f981ed58f479b1d6bcc29545dcf"
   integrity sha512-kWWXztvZ5SBQV+eRgKFeh8q5sLuZY2+8WUIzlxWVTg+oGwY14qylx1KbKzHd8P6ZYkAg0xyIDU9JMHhyJMZ1jw==
 
-chardet@^0.4.0:
-  version "0.4.2"
-  resolved "https://registry.yarnpkg.com/chardet/-/chardet-0.4.2.tgz#b5473b33dc97c424e5d98dc87d55d4d8a29c8bf2"
-  integrity sha1-tUc7M9yXxCTl2Y3IfVXU2KKci/I=
-
 chardet@^0.7.0:
   version "0.7.0"
   resolved "https://registry.yarnpkg.com/chardet/-/chardet-0.7.0.tgz#90094849f0937f2eedc2425d0d28a9e5f0cbad9e"
@@ -2704,25 +2919,11 @@ cli-spinners@^2.0.0, cli-spinners@^2.5.0:
   resolved "https://registry.yarnpkg.com/cli-spinners/-/cli-spinners-2.6.0.tgz#36c7dc98fb6a9a76bd6238ec3f77e2425627e939"
   integrity sha512-t+4/y50K/+4xcCRosKkA7W4gTr1MySvLV0q+PxmG7FJ5g+66ChKurYjxBCjHggHH3HA5Hh9cy+lcUGWDqVH+4Q==
 
-cli-width@^2.0.0:
-  version "2.2.1"
-  resolved "https://registry.yarnpkg.com/cli-width/-/cli-width-2.2.1.tgz#b0433d0b4e9c847ef18868a4ef16fd5fc8271c48"
-  integrity sha512-GRMWDxpOB6Dgk2E5Uo+3eEBvtOOlimMmpbFiKuLFnQzYDavtLFY3K5ona41jgN/WdRZtG7utuVSVTL4HbZHGkw==
-
 cli-width@^3.0.0:
   version "3.0.0"
   resolved "https://registry.yarnpkg.com/cli-width/-/cli-width-3.0.0.tgz#a2f48437a2caa9a22436e794bf071ec9e61cedf6"
   integrity sha512-FxqpkPPwu1HjuN93Omfm4h8uIanXofW0RxVEW3k5RKx+mJJYSthzNhp32Kzxxy3YAEZ/Dc/EWN1vZRY0+kOhbw==
 
-cliui@^5.0.0:
-  version "5.0.0"
-  resolved "https://registry.yarnpkg.com/cliui/-/cliui-5.0.0.tgz#deefcfdb2e800784aa34f46fa08e06851c7bbbc5"
-  integrity sha512-PYeGSEmmHM6zvoef2w8TPzlrnNpXIjTipYK780YswmIP9vjxmd6Y2a3CB2Ks6/AU8NHjZugXvo8w3oWM2qnwXA==
-  dependencies:
-    string-width "^3.1.0"
-    strip-ansi "^5.2.0"
-    wrap-ansi "^5.1.0"
-
 cliui@^6.0.0:
   version "6.0.0"
   resolved "https://registry.yarnpkg.com/cliui/-/cliui-6.0.0.tgz#511d702c0c4e41ca156d7d0e96021f23e13225b1"
@@ -2804,16 +3005,16 @@ color-name@~1.1.4:
   resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2"
   integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
 
-color-support@^1.1.3:
-  version "1.1.3"
-  resolved "https://registry.yarnpkg.com/color-support/-/color-support-1.1.3.tgz#93834379a1cc9a0c61f82f52f0d04322251bd5a2"
-  integrity sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==
-
 colorette@^1.0.7, colorette@^1.2.2:
   version "1.2.2"
   resolved "https://registry.yarnpkg.com/colorette/-/colorette-1.2.2.tgz#cbcc79d5e99caea2dbf10eb3a26fd8b3e6acfa94"
   integrity sha512-MKGMzyfeuutC/ZJ1cba9NqcNpfeqMUcYmyF1ZFY6/Cn7CNSAKx6a+s48sqLqyAiZuaP2TcqMhoo+dlwFnVxT9w==
 
+colors@^1.1.2:
+  version "1.4.0"
+  resolved "https://registry.yarnpkg.com/colors/-/colors-1.4.0.tgz#c50491479d4c1bdaed2c9ced32cf7c7dc2360f78"
+  integrity sha512-a+UqTh4kgZg/SlGvfbzDHpgRu7AAQOmmqRHJnxhRZICKFUT91brVhNNt58CMWU9PsBbv3PDCZUHbVxuDiH2mtA==
+
 combined-stream@^1.0.6, combined-stream@^1.0.8, combined-stream@~1.0.6:
   version "1.0.8"
   resolved "https://registry.yarnpkg.com/combined-stream/-/combined-stream-1.0.8.tgz#c3d45a8b34fd730631a110a8a2520682b31d5a7f"
@@ -2879,16 +3080,6 @@ concat-map@0.0.1:
   resolved "https://registry.yarnpkg.com/concat-map/-/concat-map-0.0.1.tgz#d8a96bd77fd68df7793a73036a3ba0d5405d477b"
   integrity sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=
 
-concat-stream@^1.6.0:
-  version "1.6.2"
-  resolved "https://registry.yarnpkg.com/concat-stream/-/concat-stream-1.6.2.tgz#904bdf194cd3122fc675c77fc4ac3d4ff0fd1a34"
-  integrity sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==
-  dependencies:
-    buffer-from "^1.0.0"
-    inherits "^2.0.3"
-    readable-stream "^2.2.2"
-    typedarray "^0.0.6"
-
 concat-stream@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/concat-stream/-/concat-stream-2.0.0.tgz#414cf5af790a48c60ab9be4527d56d5e41133cb1"
@@ -3102,11 +3293,6 @@ core-js-compat@^3.9.0, core-js-compat@^3.9.1:
     browserslist "^4.16.6"
     semver "7.0.0"
 
-core-js@^2.4.1:
-  version "2.6.12"
-  resolved "https://registry.yarnpkg.com/core-js/-/core-js-2.6.12.tgz#d9333dfa7b065e347cc5682219d6f690859cc2ec"
-  integrity sha512-Kb2wC0fvsWfQrgk8HU5lW6U/Lcs8+9aaYcy4ZFc6DDlo4nZ7n70dEgE5rtR0oG6ufKDUnrwfWL1mXR5ljDatrQ==
-
 core-util-is@1.0.2, core-util-is@~1.0.0:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/core-util-is/-/core-util-is-1.0.2.tgz#b5fd54220aa2bc5ab57aab7140c940754503c1a7"
@@ -3133,15 +3319,6 @@ cosmiconfig@^5.0.5, cosmiconfig@^5.1.0:
     js-yaml "^3.13.1"
     parse-json "^4.0.0"
 
-cross-spawn@^5.1.0:
-  version "5.1.0"
-  resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-5.1.0.tgz#e8bd0efee58fcff6f8f94510a0a554bbfa235449"
-  integrity sha1-6L0O/uWPz/b4+UUQoKVUu/ojVEk=
-  dependencies:
-    lru-cache "^4.0.1"
-    shebang-command "^1.2.0"
-    which "^1.2.9"
-
 cross-spawn@^6.0.0:
   version "6.0.5"
   resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-6.0.5.tgz#4a5ec7c64dfae22c3a14124dbacdee846d80cbc4"
@@ -3457,6 +3634,11 @@ electron-to-chromium@^1.3.723:
   resolved "https://registry.yarnpkg.com/electron-to-chromium/-/electron-to-chromium-1.3.725.tgz#04fc83f9189169aff50f0a00c6b4090b910cba85"
   integrity sha512-2BbeAESz7kc6KBzs7WVrMc1BY5waUphk4D4DX5dSQXJhsc3tP5ZFaiyuL0AB7vUKzDYpIeYwTYlEfxyjsGUrhw==
 
+electron-to-chromium@^1.3.896:
+  version "1.3.902"
+  resolved "https://registry.yarnpkg.com/electron-to-chromium/-/electron-to-chromium-1.3.902.tgz#926726705c17f9531be23bda545b819b35da665d"
+  integrity sha512-zFv5jbtyIr+V9FuT9o439isXbkXQ27mJqZfLXpBKzXugWE8+3RotHbXJlli0/r+Rvdlkut0OOMzeOWLAjH0jCw==
+
 emittery@^0.7.1:
   version "0.7.2"
   resolved "https://registry.yarnpkg.com/emittery/-/emittery-0.7.2.tgz#25595908e13af0f5674ab419396e2fb394cdfa82"
@@ -3477,13 +3659,6 @@ encodeurl@~1.0.2:
   resolved "https://registry.yarnpkg.com/encodeurl/-/encodeurl-1.0.2.tgz#ad3ff4c86ec2d029322f5a02c3a9a606c95b3f59"
   integrity sha1-rT/0yG7C0CkyL1oCw6mmBslbP1k=
 
-encoding@^0.1.11:
-  version "0.1.13"
-  resolved "https://registry.yarnpkg.com/encoding/-/encoding-0.1.13.tgz#56574afdd791f54a8e9b2785c0582a2d26210fa9"
-  integrity sha512-ETBauow1T35Y/WZMkio9jiM0Z5xjHHmJ4XmjZOq1l/dXz3lr2sRn87nJy20RupqSh1F2m3HHPSp8ShIPQJrJ3A==
-  dependencies:
-    iconv-lite "^0.6.2"
-
 end-of-stream@^1.1.0:
   version "1.4.4"
   resolved "https://registry.yarnpkg.com/end-of-stream/-/end-of-stream-1.4.4.tgz#5ae64a5f45057baf3626ec14da0ca5e4b2431eb0"
@@ -3555,7 +3730,7 @@ escodegen@^2.0.0:
   optionalDependencies:
     source-map "~0.6.1"
 
-esprima@^4.0.0, esprima@^4.0.1:
+esprima@^4.0.0, esprima@^4.0.1, esprima@~4.0.0:
   version "4.0.1"
   resolved "https://registry.yarnpkg.com/esprima/-/esprima-4.0.1.tgz#13b04cdb3e6c5d19df91ab6987a8695619b0aa71"
   integrity sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==
@@ -3580,11 +3755,6 @@ event-target-shim@^5.0.0, event-target-shim@^5.0.1:
   resolved "https://registry.yarnpkg.com/event-target-shim/-/event-target-shim-5.0.1.tgz#5d4d3ebdf9583d63a5333ce2deb7480ab2b05789"
   integrity sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==
 
-eventemitter3@^3.0.0:
-  version "3.1.2"
-  resolved "https://registry.yarnpkg.com/eventemitter3/-/eventemitter3-3.1.2.tgz#2d3d48f9c346698fce83a85d7d664e98535df6e7"
-  integrity sha512-tvtQIeLVHjDkJYnzf2dgVMxfuSGJeM/7UCG17TT4EumTfNtF+0nebF/4zWOIkCreAbtNqhGEboB6BWrwqNaw4Q==
-
 exec-sh@^0.3.2:
   version "0.3.6"
   resolved "https://registry.yarnpkg.com/exec-sh/-/exec-sh-0.3.6.tgz#ff264f9e325519a60cb5e273692943483cca63bc"
@@ -3663,13 +3833,6 @@ expect@^26.6.2:
     jest-message-util "^26.6.2"
     jest-regex-util "^26.0.0"
 
-extend-shallow@^1.1.2:
-  version "1.1.4"
-  resolved "https://registry.yarnpkg.com/extend-shallow/-/extend-shallow-1.1.4.tgz#19d6bf94dfc09d76ba711f39b872d21ff4dd9071"
-  integrity sha1-Gda/lN/AnXa6cR85uHLSH/TdkHE=
-  dependencies:
-    kind-of "^1.1.0"
-
 extend-shallow@^2.0.1:
   version "2.0.1"
   resolved "https://registry.yarnpkg.com/extend-shallow/-/extend-shallow-2.0.1.tgz#51af7d614ad9a9f610ea1bafbb989d6b1c56890f"
@@ -3690,15 +3853,6 @@ extend@~3.0.2:
   resolved "https://registry.yarnpkg.com/extend/-/extend-3.0.2.tgz#f8b1136b4071fbd8eb140aff858b1019ec2915fa"
   integrity sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==
 
-external-editor@^2.0.4:
-  version "2.2.0"
-  resolved "https://registry.yarnpkg.com/external-editor/-/external-editor-2.2.0.tgz#045511cfd8d133f3846673d1047c154e214ad3d5"
-  integrity sha512-bSn6gvGxKt+b7+6TKEv1ZycHleA7aHhRHyAqJyp5pbUFuYYNIzpZnQDk7AsYckyWdEnTeAnay0aCy2aV6iTk9A==
-  dependencies:
-    chardet "^0.4.0"
-    iconv-lite "^0.4.17"
-    tmp "^0.0.33"
-
 external-editor@^3.0.3:
   version "3.1.0"
   resolved "https://registry.yarnpkg.com/external-editor/-/external-editor-3.1.0.tgz#cb03f740befae03ea4d283caed2741a83f335495"
@@ -3732,16 +3886,6 @@ extsprintf@^1.2.0:
   resolved "https://registry.yarnpkg.com/extsprintf/-/extsprintf-1.4.0.tgz#e2689f8f356fad62cca65a3a91c5df5f9551692f"
   integrity sha1-4mifjzVvrWLMplo6kcXfX5VRaS8=
 
-fancy-log@^1.3.2:
-  version "1.3.3"
-  resolved "https://registry.yarnpkg.com/fancy-log/-/fancy-log-1.3.3.tgz#dbc19154f558690150a23953a0adbd035be45fc7"
-  integrity sha512-k9oEhlyc0FrVh25qYuSELjr8oxsCoc4/LEZfg2iJJrfEk/tZL9bCoJE47gqAvI2m/AUjluCS4+3I0eTx8n3AEw==
-  dependencies:
-    ansi-gray "^0.1.1"
-    color-support "^1.1.3"
-    parse-node-version "^1.0.0"
-    time-stamp "^1.0.0"
-
 fast-deep-equal@^3.1.1:
   version "3.1.3"
   resolved "https://registry.yarnpkg.com/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz#3a7d56b559d6cbc3eb512325244e619a65c6c525"
@@ -3783,48 +3927,6 @@ fb-watchman@^2.0.0:
   dependencies:
     bser "2.1.1"
 
-fbjs-css-vars@^1.0.0:
-  version "1.0.2"
-  resolved "https://registry.yarnpkg.com/fbjs-css-vars/-/fbjs-css-vars-1.0.2.tgz#216551136ae02fe255932c3ec8775f18e2c078b8"
-  integrity sha512-b2XGFAFdWZWg0phtAWLHCk836A1Xann+I+Dgd3Gk64MHKZO44FfoD1KxyvbSh0qZsIoXQGGlVztIY+oitJPpRQ==
-
-fbjs-scripts@^1.1.0:
-  version "1.2.0"
-  resolved "https://registry.yarnpkg.com/fbjs-scripts/-/fbjs-scripts-1.2.0.tgz#069a0c0634242d10031c6460ef1fccefcdae8b27"
-  integrity sha512-5krZ8T0Bf8uky0abPoCLrfa7Orxd8UH4Qq8hRUF2RZYNMu+FmEOrBc7Ib3YVONmxTXTlLAvyrrdrVmksDb2OqQ==
-  dependencies:
-    "@babel/core" "^7.0.0"
-    ansi-colors "^1.0.1"
-    babel-preset-fbjs "^3.2.0"
-    core-js "^2.4.1"
-    cross-spawn "^5.1.0"
-    fancy-log "^1.3.2"
-    object-assign "^4.0.1"
-    plugin-error "^0.1.2"
-    semver "^5.1.0"
-    through2 "^2.0.0"
-
-fbjs@^1.0.0:
-  version "1.0.0"
-  resolved "https://registry.yarnpkg.com/fbjs/-/fbjs-1.0.0.tgz#52c215e0883a3c86af2a7a776ed51525ae8e0a5a"
-  integrity sha512-MUgcMEJaFhCaF1QtWGnmq9ZDRAzECTCRAF7O6UZIlAlkTs1SasiX9aP0Iw7wfD2mJ7wDTNfg2w7u5fSCwJk1OA==
-  dependencies:
-    core-js "^2.4.1"
-    fbjs-css-vars "^1.0.0"
-    isomorphic-fetch "^2.1.1"
-    loose-envify "^1.0.0"
-    object-assign "^4.1.0"
-    promise "^7.1.1"
-    setimmediate "^1.0.5"
-    ua-parser-js "^0.7.18"
-
-figures@^2.0.0:
-  version "2.0.0"
-  resolved "https://registry.yarnpkg.com/figures/-/figures-2.0.0.tgz#3ab1a2d2a62c8bfb431a0c94cb797a2fce27c962"
-  integrity sha1-OrGi0qYsi/tDGgyUy3l6L84nyWI=
-  dependencies:
-    escape-string-regexp "^1.0.5"
-
 figures@^3.0.0:
   version "3.2.0"
   resolved "https://registry.yarnpkg.com/figures/-/figures-3.2.0.tgz#625c18bd293c604dc4a8ddb2febf0c88341746af"
@@ -3832,11 +3934,6 @@ figures@^3.0.0:
   dependencies:
     escape-string-regexp "^1.0.5"
 
-file-uri-to-path@1.0.0:
-  version "1.0.0"
-  resolved "https://registry.yarnpkg.com/file-uri-to-path/-/file-uri-to-path-1.0.0.tgz#553a7b8446ff6f684359c445f1e37a05dacc33dd"
-  integrity sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==
-
 filelist@^1.0.1:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/filelist/-/filelist-1.0.2.tgz#80202f21462d4d1c2e214119b1807c1bc0380e5b"
@@ -3931,6 +4028,16 @@ flat@^5.0.2:
   resolved "https://registry.yarnpkg.com/flat/-/flat-5.0.2.tgz#8ca6fe332069ffa9d324c327198c598259ceb241"
   integrity sha512-b6suED+5/3rTpUBdG1gupIl8MPFCAMA0QXwmljLhvCUKcUvdE4gWky9zpuGCcXHOsz4J9wPGNWq6OKpmIzz3hQ==
 
+flow-parser@0.*:
+  version "0.165.0"
+  resolved "https://registry.yarnpkg.com/flow-parser/-/flow-parser-0.165.0.tgz#327f4c2c6516face02cb2239bb3bf4b4bf0e8fd8"
+  integrity sha512-s9pW6fqJvYwdc2KfS0R76e2T1tgKpz4TzFyrnaHPdo/nEZbMiQQuB+uqUgg+sSPmHya0UV3Igc3Dy+NxSDN5sg==
+
+flow-parser@^0.121.0:
+  version "0.121.0"
+  resolved "https://registry.yarnpkg.com/flow-parser/-/flow-parser-0.121.0.tgz#9f9898eaec91a9f7c323e9e992d81ab5c58e618f"
+  integrity sha512-1gIBiWJNR0tKUNv8gZuk7l9rVX06OuLzY9AoGio7y/JT4V1IZErEMEq2TJS+PFcw/y0RshZ1J/27VfK1UQzYVg==
+
 for-in@^1.0.2:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/for-in/-/for-in-1.0.2.tgz#81068d295a8142ec0ac726c6e2200c30fb6d5e80"
@@ -4004,14 +4111,6 @@ fs.realpath@^1.0.0:
   resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f"
   integrity sha1-FQStJSMVjKpA20onh8sBQRmU6k8=
 
-fsevents@^1.2.7:
-  version "1.2.13"
-  resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-1.2.13.tgz#f325cb0455592428bcf11b383370ef70e3bfcc38"
-  integrity sha512-oWb1Z6mkHIskLzEJ/XWX0srkpkTQ7vaopMQkyaEIoq0fmtFVxOthb8cCxeT+p3ynTdkk/RZwbgG4brR5BeWECw==
-  dependencies:
-    bindings "^1.5.0"
-    nan "^2.12.1"
-
 fsevents@^2.1.2:
   version "2.3.2"
   resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.3.2.tgz#8a526f78b8fdf4623b709e0b975c52c24c02fd1a"
@@ -4233,6 +4332,11 @@ got@^9.5.0, got@^9.6.0:
     to-readable-stream "^1.0.0"
     url-parse-lax "^3.0.0"
 
+graceful-fs@^4.1.11:
+  version "4.2.8"
+  resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.8.tgz#e412b8d33f5e006593cbd3cee6df9f2cebbe802a"
+  integrity sha512-qkIilPUYcNhJpd33n0GBXTB1MMPp14TxEsEs0pTrsSVucApsYzW5V+Q8Qxhik6KU3evy+qkAAowTByymK0avdg==
+
 graceful-fs@^4.1.15, graceful-fs@^4.1.2, graceful-fs@^4.1.3, graceful-fs@^4.1.6, graceful-fs@^4.1.9, graceful-fs@^4.2.0, graceful-fs@^4.2.4:
   version "4.2.6"
   resolved "https://registry.yarnpkg.com/graceful-fs/-/graceful-fs-4.2.6.tgz#ff040b2b0853b23c3d31027523706f1885d76bee"
@@ -4331,10 +4435,10 @@ has@^1.0.3:
   dependencies:
     function-bind "^1.1.1"
 
-hermes-engine@~0.5.0:
-  version "0.5.1"
-  resolved "https://registry.yarnpkg.com/hermes-engine/-/hermes-engine-0.5.1.tgz#601115e4b1e0a17d9aa91243b96277de4e926e09"
-  integrity sha512-hLwqh8dejHayjlpvZY40e1aDCDvyP98cWx/L5DhAjSJLH8g4z9Tp08D7y4+3vErDsncPOdf1bxm+zUWpx0/Fxg==
+hermes-engine@~0.7.0:
+  version "0.7.2"
+  resolved "https://registry.yarnpkg.com/hermes-engine/-/hermes-engine-0.7.2.tgz#303cd99d23f68e708b223aec2d49d5872985388b"
+  integrity sha512-E2DkRaO97gwL98LPhgfkMqhHiNsrAjIfEk3wWYn2Y31xdkdWn0572H7RnVcGujMJVqZNJvtknxlpsUb8Wzc3KA==
 
 hermes-profile-transformer@^0.0.6:
   version "0.0.6"
@@ -4410,20 +4514,13 @@ human-signals@^2.1.0:
   resolved "https://registry.yarnpkg.com/human-signals/-/human-signals-2.1.0.tgz#dc91fcba42e4d06e4abaed33b3e7a3c02f514ea0"
   integrity sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==
 
-iconv-lite@0.4.24, iconv-lite@^0.4.17, iconv-lite@^0.4.24:
+iconv-lite@0.4.24, iconv-lite@^0.4.24:
   version "0.4.24"
   resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.4.24.tgz#2022b4b25fbddc21d2f524974a474aafe733908b"
   integrity sha512-v3MXnZAcvnywkTUEZomIActle7RXXeedOR31wwl7VlyoXO4Qi9arvSenNQWne1TcRwhCL1HwLI21bEqdpj8/rA==
   dependencies:
     safer-buffer ">= 2.1.2 < 3"
 
-iconv-lite@^0.6.2:
-  version "0.6.2"
-  resolved "https://registry.yarnpkg.com/iconv-lite/-/iconv-lite-0.6.2.tgz#ce13d1875b0c3a674bd6a04b7f76b01b1b6ded01"
-  integrity sha512-2y91h5OpQlolefMPmUlivelittSWy0rP+oYVpn6A7GwVHNE8AWzoYOBNmlwks3LobaJxgHCYZAnyNo2GgpNRNQ==
-  dependencies:
-    safer-buffer ">= 2.1.2 < 3.0.0"
-
 ieee754@^1.1.13, ieee754@^1.2.1:
   version "1.2.1"
   resolved "https://registry.yarnpkg.com/ieee754/-/ieee754-1.2.1.tgz#8eb7a10a63fff25d15a57b001586d177d1b0d352"
@@ -4541,26 +4638,6 @@ inquirer@8.0.0:
     strip-ansi "^6.0.0"
     through "^2.3.6"
 
-inquirer@^3.0.6:
-  version "3.3.0"
-  resolved "https://registry.yarnpkg.com/inquirer/-/inquirer-3.3.0.tgz#9dd2f2ad765dcab1ff0443b491442a20ba227dc9"
-  integrity sha512-h+xtnyk4EwKvFWHrUYsWErEVR+igKtLdchu+o0Z1RL7VU/jVMFbYir2bp6bAj8efFNxWqHX0dIss6fJQ+/+qeQ==
-  dependencies:
-    ansi-escapes "^3.0.0"
-    chalk "^2.0.0"
-    cli-cursor "^2.1.0"
-    cli-width "^2.0.0"
-    external-editor "^2.0.4"
-    figures "^2.0.0"
-    lodash "^4.3.0"
-    mute-stream "0.0.7"
-    run-async "^2.2.0"
-    rx-lite "^4.0.8"
-    rx-lite-aggregates "^4.0.8"
-    string-width "^2.1.0"
-    strip-ansi "^4.0.0"
-    through "^2.3.6"
-
 interpret@^1.0.0:
   version "1.4.0"
   resolved "https://registry.yarnpkg.com/interpret/-/interpret-1.4.0.tgz#665ab8bc4da27a774a40584e812e3e0fa45b1a1e"
@@ -4814,7 +4891,7 @@ is-ssh@^1.3.0:
   dependencies:
     protocols "^1.1.0"
 
-is-stream@^1.0.1, is-stream@^1.1.0:
+is-stream@^1.1.0:
   version "1.1.0"
   resolved "https://registry.yarnpkg.com/is-stream/-/is-stream-1.1.0.tgz#12d4a3dd4e68e0b79ceb8dbc84173ae80d91ca44"
   integrity sha1-EtSj3U5o4Lec6428hBc66A2RykQ=
@@ -4897,14 +4974,6 @@ isobject@^3.0.0, isobject@^3.0.1:
   resolved "https://registry.yarnpkg.com/isobject/-/isobject-3.0.1.tgz#4e431e92b11a9731636aa1f9c8d1ccbcfdab78df"
   integrity sha1-TkMekrEalzFjaqH5yNHMvP2reN8=
 
-isomorphic-fetch@^2.1.1:
-  version "2.2.1"
-  resolved "https://registry.yarnpkg.com/isomorphic-fetch/-/isomorphic-fetch-2.2.1.tgz#611ae1acf14f5e81f729507472819fe9733558a9"
-  integrity sha1-YRrhrPFPXoH3KVB0coGf6XM1WKk=
-  dependencies:
-    node-fetch "^1.0.1"
-    whatwg-fetch ">=0.10.0"
-
 isstream@~0.1.2:
   version "0.1.2"
   resolved "https://registry.yarnpkg.com/isstream/-/isstream-0.1.2.tgz#47e63f7af55afa6f92e1500e690eb8b8529c099a"
@@ -5066,36 +5135,12 @@ jest-environment-node@^26.6.2:
     jest-mock "^26.6.2"
     jest-util "^26.6.2"
 
-jest-get-type@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-get-type/-/jest-get-type-24.9.0.tgz#1684a0c8a50f2e4901b6644ae861f579eed2ef0e"
-  integrity sha512-lUseMzAley4LhIcpSP9Jf+fTrQ4a1yHQwLNeeVa2cEmbCGeoZAtYPOIv8JaxLD/sUpKxetKGP+gsHl8f8TSj8Q==
-
 jest-get-type@^26.3.0:
   version "26.3.0"
   resolved "https://registry.yarnpkg.com/jest-get-type/-/jest-get-type-26.3.0.tgz#e97dc3c3f53c2b406ca7afaed4493b1d099199e0"
   integrity sha512-TpfaviN1R2pQWkIihlfEanwOXK0zcxrKEE4MlU6Tn7keoXdN6/3gK/xl0yEh8DOunn5pOVGKf8hB4R9gVh04ig==
 
-jest-haste-map@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-haste-map/-/jest-haste-map-24.9.0.tgz#b38a5d64274934e21fa417ae9a9fbeb77ceaac7d"
-  integrity sha512-kfVFmsuWui2Sj1Rp1AJ4D9HqJwE4uwTlS/vO+eRUaMmd54BFpli2XhMQnPC2k4cHFVbB2Q2C+jtI1AGLgEnCjQ==
-  dependencies:
-    "@jest/types" "^24.9.0"
-    anymatch "^2.0.0"
-    fb-watchman "^2.0.0"
-    graceful-fs "^4.1.15"
-    invariant "^2.2.4"
-    jest-serializer "^24.9.0"
-    jest-util "^24.9.0"
-    jest-worker "^24.9.0"
-    micromatch "^3.1.10"
-    sane "^4.0.3"
-    walker "^1.0.7"
-  optionalDependencies:
-    fsevents "^1.2.7"
-
-jest-haste-map@^26.6.2:
+jest-haste-map@^26.5.2, jest-haste-map@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-haste-map/-/jest-haste-map-26.6.2.tgz#dd7e60fe7dc0e9f911a23d79c5ff7fb5c2cafeaa"
   integrity sha512-easWIJXIw71B2RdR8kgqpjQrbMRWQBgiBwXYEhtGUTaX+doCjBheluShdDMeR8IMfJiTqH4+zfhtg29apJf/8w==
@@ -5158,20 +5203,6 @@ jest-matcher-utils@^26.6.2:
     jest-get-type "^26.3.0"
     pretty-format "^26.6.2"
 
-jest-message-util@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-message-util/-/jest-message-util-24.9.0.tgz#527f54a1e380f5e202a8d1149b0ec872f43119e3"
-  integrity sha512-oCj8FiZ3U0hTP4aSui87P4L4jC37BtQwUMqk+zk/b11FR19BJDeZsZAvIHutWnmtw7r85UmR3CEWZ0HWU2mAlw==
-  dependencies:
-    "@babel/code-frame" "^7.0.0"
-    "@jest/test-result" "^24.9.0"
-    "@jest/types" "^24.9.0"
-    "@types/stack-utils" "^1.0.1"
-    chalk "^2.0.1"
-    micromatch "^3.1.10"
-    slash "^2.0.0"
-    stack-utils "^1.0.1"
-
 jest-message-util@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-message-util/-/jest-message-util-26.6.2.tgz#58173744ad6fc0506b5d21150b9be56ef001ca07"
@@ -5187,13 +5218,6 @@ jest-message-util@^26.6.2:
     slash "^3.0.0"
     stack-utils "^2.0.2"
 
-jest-mock@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-mock/-/jest-mock-24.9.0.tgz#c22835541ee379b908673ad51087a2185c13f1c6"
-  integrity sha512-3BEYN5WbSq9wd+SyLDES7AHnjH9A/ROBwmz7l2y+ol+NtSFO8DYiEBzoO1CeFc9a8DYy10EO4dDFVv/wN3zl1w==
-  dependencies:
-    "@jest/types" "^24.9.0"
-
 jest-mock@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-mock/-/jest-mock-26.6.2.tgz#d6cb712b041ed47fe0d9b6fc3474bc6543feb302"
@@ -5294,11 +5318,6 @@ jest-runtime@^26.6.3:
     strip-bom "^4.0.0"
     yargs "^15.4.1"
 
-jest-serializer@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-serializer/-/jest-serializer-24.9.0.tgz#e6d7d7ef96d31e8b9079a714754c5d5c58288e73"
-  integrity sha512-DxYipDr8OvfrKH3Kel6NdED3OXxjvxXZ1uIY2I9OFbGg+vUkkg7AGvi65qbhbWNPvDckXmzMPbK3u3HaDO49bQ==
-
 jest-serializer@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-serializer/-/jest-serializer-26.6.2.tgz#d139aafd46957d3a448f3a6cdabe2919ba0742d1"
@@ -5329,24 +5348,6 @@ jest-snapshot@^26.6.2:
     pretty-format "^26.6.2"
     semver "^7.3.2"
 
-jest-util@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-util/-/jest-util-24.9.0.tgz#7396814e48536d2e85a37de3e4c431d7cb140162"
-  integrity sha512-x+cZU8VRmOJxbA1K5oDBdxQmdq0OIdADarLxk0Mq+3XS4jgvhG/oKGWcIDCtPG0HgjxOYvF+ilPJQsAyXfbNOg==
-  dependencies:
-    "@jest/console" "^24.9.0"
-    "@jest/fake-timers" "^24.9.0"
-    "@jest/source-map" "^24.9.0"
-    "@jest/test-result" "^24.9.0"
-    "@jest/types" "^24.9.0"
-    callsites "^3.0.0"
-    chalk "^2.0.1"
-    graceful-fs "^4.1.15"
-    is-ci "^2.0.0"
-    mkdirp "^0.5.1"
-    slash "^2.0.0"
-    source-map "^0.6.0"
-
 jest-util@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-util/-/jest-util-26.6.2.tgz#907535dbe4d5a6cb4c47ac9b926f6af29576cbc1"
@@ -5359,19 +5360,7 @@ jest-util@^26.6.2:
     is-ci "^2.0.0"
     micromatch "^4.0.2"
 
-jest-validate@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-validate/-/jest-validate-24.9.0.tgz#0775c55360d173cd854e40180756d4ff52def8ab"
-  integrity sha512-HPIt6C5ACwiqSiwi+OfSSHbK8sG7akG8eATl+IPKaeIjtPOeBUd/g3J7DghugzxrGjI93qS/+RPKe1H6PqvhRQ==
-  dependencies:
-    "@jest/types" "^24.9.0"
-    camelcase "^5.3.1"
-    chalk "^2.0.1"
-    jest-get-type "^24.9.0"
-    leven "^3.1.0"
-    pretty-format "^24.9.0"
-
-jest-validate@^26.6.2:
+jest-validate@^26.5.2, jest-validate@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-validate/-/jest-validate-26.6.2.tgz#23d380971587150467342911c3d7b4ac57ab20ec"
   integrity sha512-NEYZ9Aeyj0i5rQqbq+tpIOom0YS1u2MVu6+euBsvpgIme+FOfRmoC4R5p0JiAUpaFvFy24xgrpMknarR/93XjQ==
@@ -5396,15 +5385,7 @@ jest-watcher@^26.6.2:
     jest-util "^26.6.2"
     string-length "^4.0.1"
 
-jest-worker@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/jest-worker/-/jest-worker-24.9.0.tgz#5dbfdb5b2d322e98567898238a9697bcce67b3e5"
-  integrity sha512-51PE4haMSXcHohnSMdM42anbvZANYTqMrr52tVKPqqsPJMzoP6FYYDVqahX/HrAoKEKz3uUPzSvKs9A3qR4iVw==
-  dependencies:
-    merge-stream "^2.0.0"
-    supports-color "^6.1.0"
-
-jest-worker@^26.6.2:
+jest-worker@^26.0.0, jest-worker@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/jest-worker/-/jest-worker-26.6.2.tgz#7f72cbc4d643c365e27b9fd775f9d0eaa9c7a8ed"
   integrity sha512-KWYVV1c4i+jbMpaBC+U++4Va0cp8OisU185o73T1vo99hqi7w8tSJfUXYswwqqrjzwxa6KpRK54WhPvwf5w6PQ==
@@ -5427,6 +5408,17 @@ jetifier@^1.6.2, jetifier@^1.6.6:
   resolved "https://registry.yarnpkg.com/jetifier/-/jetifier-1.6.7.tgz#0e3f5ca716db03f1c869c9711ce1b7d0a6e5b24a"
   integrity sha512-IMVhXwdIxrR7vVq624m8H79IsaLysUYDPI2KC18L3LCc1Tg/KYvNMIkHN3X96XF5fnPVdINBFkFfC3rsPX1yYg==
 
+joi@^17.2.1:
+  version "17.4.2"
+  resolved "https://registry.yarnpkg.com/joi/-/joi-17.4.2.tgz#02f4eb5cf88e515e614830239379dcbbe28ce7f7"
+  integrity sha512-Lm56PP+n0+Z2A2rfRvsfWVDXGEWjXxatPopkQ8qQ5mxCEhwHG+Ettgg5o98FFaxilOxozoa14cFhrE/hOzh/Nw==
+  dependencies:
+    "@hapi/hoek" "^9.0.0"
+    "@hapi/topo" "^5.0.0"
+    "@sideway/address" "^4.1.0"
+    "@sideway/formula" "^3.0.0"
+    "@sideway/pinpoint" "^2.0.0"
+
 "js-tokens@^3.0.0 || ^4.0.0", js-tokens@^4.0.0:
   version "4.0.0"
   resolved "https://registry.yarnpkg.com/js-tokens/-/js-tokens-4.0.0.tgz#19203fb59991df98e3a287050d4647cdeaf32499"
@@ -5450,6 +5442,31 @@ jsc-android@^245459.0.0:
   resolved "https://registry.yarnpkg.com/jsc-android/-/jsc-android-245459.0.0.tgz#e584258dd0b04c9159a27fb104cd5d491fd202c9"
   integrity sha512-wkjURqwaB1daNkDi2OYYbsLnIdC/lUM2nPXQKRs5pqEU9chDg435bjvo+LSaHotDENygHQDHe+ntUkkw2gwMtg==
 
+jscodeshift@^0.11.0:
+  version "0.11.0"
+  resolved "https://registry.yarnpkg.com/jscodeshift/-/jscodeshift-0.11.0.tgz#4f95039408f3f06b0e39bb4d53bc3139f5330e2f"
+  integrity sha512-SdRK2C7jjs4k/kT2mwtO07KJN9RnjxtKn03d9JVj6c3j9WwaLcFYsICYDnLAzY0hp+wG2nxl+Cm2jWLiNVYb8g==
+  dependencies:
+    "@babel/core" "^7.1.6"
+    "@babel/parser" "^7.1.6"
+    "@babel/plugin-proposal-class-properties" "^7.1.0"
+    "@babel/plugin-proposal-nullish-coalescing-operator" "^7.1.0"
+    "@babel/plugin-proposal-optional-chaining" "^7.1.0"
+    "@babel/plugin-transform-modules-commonjs" "^7.1.0"
+    "@babel/preset-flow" "^7.0.0"
+    "@babel/preset-typescript" "^7.1.0"
+    "@babel/register" "^7.0.0"
+    babel-core "^7.0.0-bridge.0"
+    colors "^1.1.2"
+    flow-parser "0.*"
+    graceful-fs "^4.2.4"
+    micromatch "^3.1.10"
+    neo-async "^2.5.0"
+    node-dir "^0.1.17"
+    recast "^0.20.3"
+    temp "^0.8.1"
+    write-file-atomic "^2.3.0"
+
 jsdom@^16.4.0:
   version "16.5.3"
   resolved "https://registry.yarnpkg.com/jsdom/-/jsdom-16.5.3.tgz#13a755b3950eb938b4482c407238ddf16f0d2136"
@@ -5522,13 +5539,6 @@ json-schema@0.2.3:
   resolved "https://registry.yarnpkg.com/json-schema/-/json-schema-0.2.3.tgz#b480c892e59a2f05954ce727bd3f2a4e882f9e13"
   integrity sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM=
 
-json-stable-stringify@^1.0.1:
-  version "1.0.1"
-  resolved "https://registry.yarnpkg.com/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz#9a759d39c5f2ff503fd5300646ed445f88c4f9af"
-  integrity sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=
-  dependencies:
-    jsonify "~0.0.0"
-
 json-stringify-safe@^5.0.1, json-stringify-safe@~5.0.1:
   version "5.0.1"
   resolved "https://registry.yarnpkg.com/json-stringify-safe/-/json-stringify-safe-5.0.1.tgz#1296a2d58fd45f19a0f6ce01d65701e2c735b6eb"
@@ -5598,11 +5608,6 @@ keyv@^4.0.0:
   dependencies:
     json-buffer "3.0.1"
 
-kind-of@^1.1.0:
-  version "1.1.0"
-  resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-1.1.0.tgz#140a3d2d41a36d2efcfa9377b62c24f8495a5c44"
-  integrity sha1-FAo9LUGjbS78+pN3tiwk+ElaXEQ=
-
 kind-of@^3.0.2, kind-of@^3.0.3, kind-of@^3.2.0:
   version "3.2.2"
   resolved "https://registry.yarnpkg.com/kind-of/-/kind-of-3.2.2.tgz#31ea21a734bab9bbb0f32466d893aea51e4a3c64"
@@ -5730,7 +5735,7 @@ lodash.throttle@^4.1.1:
   resolved "https://registry.yarnpkg.com/lodash.throttle/-/lodash.throttle-4.1.1.tgz#c23e91b710242ac70c37f1e1cda9274cc39bf2f4"
   integrity sha1-wj6RtxAkKscMN/HhzaknTMOb8vQ=
 
-lodash@4.17.21, lodash@^4.17.14, lodash@^4.17.15, lodash@^4.17.19, lodash@^4.17.20, lodash@^4.17.21, lodash@^4.3.0, lodash@^4.7.0:
+lodash@4.17.21, lodash@^4.17.14, lodash@^4.17.15, lodash@^4.17.19, lodash@^4.17.20, lodash@^4.17.21, lodash@^4.7.0:
   version "4.17.21"
   resolved "https://registry.yarnpkg.com/lodash/-/lodash-4.17.21.tgz#679591c564c3bffaae8454cf0b3df370c3d6911c"
   integrity sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==
@@ -5784,14 +5789,6 @@ lowercase-keys@^2.0.0:
   resolved "https://registry.yarnpkg.com/lowercase-keys/-/lowercase-keys-2.0.0.tgz#2603e78b7b4b0006cbca2fbcc8a3202558ac9479"
   integrity sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==
 
-lru-cache@^4.0.1:
-  version "4.1.5"
-  resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-4.1.5.tgz#8bbe50ea85bed59bc9e33dcab8235ee9bcf443cd"
-  integrity sha512-sWZlbEP2OsHNkXrMl5GYk/jKk70MBng6UU4YI/qGDYbgf6YbP4EvmqISbXCoJiRKs+1bSpFHVgQxvJ17F2li5g==
-  dependencies:
-    pseudomap "^1.0.2"
-    yallist "^2.1.2"
-
 lru-cache@^6.0.0:
   version "6.0.0"
   resolved "https://registry.yarnpkg.com/lru-cache/-/lru-cache-6.0.0.tgz#6d6fe6570ebd96aaf90fcad1dafa3b2566db3a94"
@@ -5881,13 +5878,6 @@ meow@^8.0.0:
     type-fest "^0.18.0"
     yargs-parser "^20.2.3"
 
-merge-stream@^1.0.1:
-  version "1.0.1"
-  resolved "https://registry.yarnpkg.com/merge-stream/-/merge-stream-1.0.1.tgz#4041202d508a342ba00174008df0c251b8c135e1"
-  integrity sha1-QEEgLVCKNCugAXQAjfDCUbjBNeE=
-  dependencies:
-    readable-stream "^2.0.1"
-
 merge-stream@^2.0.0:
   version "2.0.0"
   resolved "https://registry.yarnpkg.com/merge-stream/-/merge-stream-2.0.0.tgz#52823629a14dd00c9770fb6ad47dc6310f2c1f60"
@@ -5898,10 +5888,10 @@ merge2@^1.3.0:
   resolved "https://registry.yarnpkg.com/merge2/-/merge2-1.4.1.tgz#4368892f885e907455a6fd7dc55c0c9d404990ae"
   integrity sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==
 
-metro-babel-register@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-babel-register/-/metro-babel-register-0.59.0.tgz#2bcff65641b36794cf083ba732fbc46cf870fb43"
-  integrity sha512-JtWc29erdsXO/V3loenXKw+aHUXgj7lt0QPaZKPpctLLy8kcEpI/8pfXXgVK9weXICCpCnYtYncIosAyzh0xjg==
+metro-babel-register@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-babel-register/-/metro-babel-register-0.64.0.tgz#1a2d23f68da8b8ee42e78dca37ad21a5f4d3647d"
+  integrity sha512-Kf6YvE3kIRumGnjK0Q9LqGDIdnsX9eFGtNBmBuCVDuB9wGGA/5CgX8We8W7Y44dz1RGTcHJRhfw5iGg+pwC3aQ==
   dependencies:
     "@babel/core" "^7.0.0"
     "@babel/plugin-proposal-class-properties" "^7.0.0"
@@ -5912,67 +5902,78 @@ metro-babel-register@0.59.0:
     "@babel/register" "^7.0.0"
     escape-string-regexp "^1.0.5"
 
-metro-babel-transformer@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-babel-transformer/-/metro-babel-transformer-0.59.0.tgz#dda99c75d831b00142c42c020c51c103b29f199d"
-  integrity sha512-fdZJl8rs54GVFXokxRdD7ZrQ1TJjxWzOi/xSP25VR3E8tbm3nBZqS+/ylu643qSr/IueABR+jrlqAyACwGEf6w==
+metro-babel-transformer@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-babel-transformer/-/metro-babel-transformer-0.64.0.tgz#a21f8a989a5ea60c1109456e21bd4d9374194ea0"
+  integrity sha512-itZaxKTgmKGEZWxNzbSZBc22NngrMZzoUNuU92aHSTGkYi2WH4XlvzEHsstmIKHMsRVKl75cA+mNmgk4gBFJKw==
   dependencies:
     "@babel/core" "^7.0.0"
-    metro-source-map "0.59.0"
+    metro-source-map "0.64.0"
+    nullthrows "^1.1.1"
 
-metro-cache@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-cache/-/metro-cache-0.59.0.tgz#ef3c055f276933979b731455dc8317d7a66f0f2d"
-  integrity sha512-ryWNkSnpyADfRpHGb8BRhQ3+k8bdT/bsxMH2O0ntlZYZ188d8nnYWmxbRvFmEzToJxe/ol4uDw0tJFAaQsN8KA==
+metro-cache-key@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-cache-key/-/metro-cache-key-0.64.0.tgz#98d0a94332453c4c52b74f72c07cc62a5c264c4f"
+  integrity sha512-O9B65G8L/fopck45ZhdRosyVZdMtUQuX5mBWEC1NRj02iWBIUPLmYMjrunqIe8vHipCMp3DtTCm/65IlBmO8jg==
+
+metro-cache@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-cache/-/metro-cache-0.64.0.tgz#a769503e12521d9e9d95ce5840ffb2efdb4e8703"
+  integrity sha512-QvGfxe/1QQYM9XOlR8W1xqE9eHDw/AgJIgYGn/TxZxBu9Zga+Rgs1omeSZju45D8w5VWgMr83ma5kACgzvOecg==
   dependencies:
-    jest-serializer "^24.9.0"
-    metro-core "0.59.0"
+    metro-core "0.64.0"
     mkdirp "^0.5.1"
     rimraf "^2.5.4"
 
-metro-config@0.59.0, metro-config@^0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-config/-/metro-config-0.59.0.tgz#9844e388069321dd7403e49f0d495a81f9aa0fef"
-  integrity sha512-MDsknFG9vZ4Nb5VR6OUDmGHaWz6oZg/FtE3up1zVBKPVRTXE1Z+k7zypnPtMXjMh3WHs/Sy4+wU1xnceE/zdnA==
+metro-config@0.64.0, metro-config@^0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-config/-/metro-config-0.64.0.tgz#b634fa05cffd06b1e50e4339c200f90a42924afb"
+  integrity sha512-QhM4asnX5KhlRWaugwVGNNXhX0Z85u5nK0UQ/A90bBb4xWyXqUe20e788VtdA75rkQiiI6wXTCIHWT0afbnjwQ==
   dependencies:
     cosmiconfig "^5.0.5"
-    jest-validate "^24.9.0"
-    metro "0.59.0"
-    metro-cache "0.59.0"
-    metro-core "0.59.0"
+    jest-validate "^26.5.2"
+    metro "0.64.0"
+    metro-cache "0.64.0"
+    metro-core "0.64.0"
+    metro-runtime "0.64.0"
 
-metro-core@0.59.0, metro-core@^0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-core/-/metro-core-0.59.0.tgz#958cde3fe5c8cd84a78e1899af801ad69e9c83b1"
-  integrity sha512-kb5LKvV5r2pqMEzGyTid8ai2mIjW13NMduQ8oBmfha7/EPTATcTQ//s+bkhAs1toQD8vqVvjAb0cPNjWQEmcmQ==
+metro-core@0.64.0, metro-core@^0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-core/-/metro-core-0.64.0.tgz#7616b27acfe7baa476f6cd6bd9e70ae64fa62541"
+  integrity sha512-v8ZQ5j72EaUwamQ8pLfHlOHTyp7SbdazvHPzFGDpHnwIQqIT0Bw3Syg8R4regTlVG3ngpeSEAi005UITljmMcQ==
   dependencies:
-    jest-haste-map "^24.9.0"
+    jest-haste-map "^26.5.2"
     lodash.throttle "^4.1.1"
-    metro-resolver "0.59.0"
-    wordwrap "^1.0.0"
+    metro-resolver "0.64.0"
 
-metro-inspector-proxy@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-inspector-proxy/-/metro-inspector-proxy-0.59.0.tgz#39d1390772d13767fc595be9a1a7074e2425cf8e"
-  integrity sha512-hPeAuQcofTOH0F+2GEZqWkvkVY1/skezSSlMocDQDaqds+Kw6JgdA7FlZXxnKmQ/jYrWUzff/pl8SUCDwuYthQ==
+metro-hermes-compiler@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-hermes-compiler/-/metro-hermes-compiler-0.64.0.tgz#e6043d7aa924e5b2be99bd3f602e693685d15386"
+  integrity sha512-CLAjVDWGAoGhbi2ZyPHnH5YDdfrDIx6+tzFWfHGIMTZkYBXsYta9IfYXBV8lFb6BIbrXLjlXZAOoosknetMPOA==
+
+metro-inspector-proxy@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-inspector-proxy/-/metro-inspector-proxy-0.64.0.tgz#9a481b3f49773d5418e028178efec68f861bec88"
+  integrity sha512-KywbH3GNSz9Iqw4UH3smgaV2dBHHYMISeN7ORntDL/G+xfgPc6vt13d+zFb907YpUcXj5N0vdoiAHI5V/0y8IA==
   dependencies:
     connect "^3.6.5"
     debug "^2.2.0"
     ws "^1.1.5"
-    yargs "^14.2.0"
+    yargs "^15.3.1"
 
-metro-minify-uglify@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-minify-uglify/-/metro-minify-uglify-0.59.0.tgz#6491876308d878742f7b894d7fca4af356886dd5"
-  integrity sha512-7IzVgCVWZMymgZ/quieg/9v5EQ8QmZWAgDc86Zp9j0Vy6tQTjUn6jlU+YAKW3mfMEjMr6iIUzCD8YklX78tFAw==
+metro-minify-uglify@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-minify-uglify/-/metro-minify-uglify-0.64.0.tgz#da6ab4dda030e3211f5924e7f41ed308d466068f"
+  integrity sha512-DRwRstqXR5qfte9Nuwoov5dRXxL7fJeVlO5fGyOajWeO3+AgPjvjXh/UcLJqftkMWTPGUFuzAD5/7JC5v5FLWw==
   dependencies:
     uglify-es "^3.1.9"
 
-metro-react-native-babel-preset@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-react-native-babel-preset/-/metro-react-native-babel-preset-0.59.0.tgz#20e020bc6ac9849e1477de1333d303ed42aba225"
-  integrity sha512-BoO6ncPfceIDReIH8pQ5tQptcGo5yRWQXJGVXfANbiKLq4tfgdZB1C1e2rMUJ6iypmeJU9dzl+EhPmIFKtgREg==
+metro-react-native-babel-preset@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-react-native-babel-preset/-/metro-react-native-babel-preset-0.64.0.tgz#76861408681dfda3c1d962eb31a8994918c976f8"
+  integrity sha512-HcZ0RWQRuJfpPiaHyFQJzcym+/dDIVUPwUAXWoub/C4GkGu+mPjp8vqK6g0FxokCnnI2TK0gZTza2IDfiNNscQ==
   dependencies:
+    "@babel/core" "^7.0.0"
     "@babel/plugin-proposal-class-properties" "^7.0.0"
     "@babel/plugin-proposal-export-default-from" "^7.0.0"
     "@babel/plugin-proposal-nullish-coalescing-operator" "^7.0.0"
@@ -6012,109 +6013,142 @@ metro-react-native-babel-preset@0.59.0:
     "@babel/template" "^7.0.0"
     react-refresh "^0.4.0"
 
-metro-react-native-babel-transformer@0.59.0, metro-react-native-babel-transformer@^0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-react-native-babel-transformer/-/metro-react-native-babel-transformer-0.59.0.tgz#9b3dfd6ad35c6ef37fc4ce4d20a2eb67fabbb4be"
-  integrity sha512-1O3wrnMq4NcPQ1asEcl9lRDn/t+F1Oef6S9WaYVIKEhg9m/EQRGVrrTVP+R6B5Eeaj3+zNKbzM8Dx/NWy1hUbQ==
+metro-react-native-babel-transformer@0.64.0, metro-react-native-babel-transformer@^0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-react-native-babel-transformer/-/metro-react-native-babel-transformer-0.64.0.tgz#eafef756972f20efdc51bd5361d55f8598355623"
+  integrity sha512-K1sHO3ODBFCr7uEiCQ4RvVr+cQg0EHQF8ChVPnecGh/WDD8udrTq9ECwB0dRfMjAvlsHtRUlJm6ZSI8UPgum2w==
   dependencies:
     "@babel/core" "^7.0.0"
     babel-preset-fbjs "^3.3.0"
-    metro-babel-transformer "0.59.0"
-    metro-react-native-babel-preset "0.59.0"
-    metro-source-map "0.59.0"
+    metro-babel-transformer "0.64.0"
+    metro-react-native-babel-preset "0.64.0"
+    metro-source-map "0.64.0"
+    nullthrows "^1.1.1"
 
-metro-resolver@0.59.0, metro-resolver@^0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-resolver/-/metro-resolver-0.59.0.tgz#fbc9d7c95f094c52807877d0011feffb9e896fad"
-  integrity sha512-lbgiumnwoVosffEI96z0FGuq1ejTorHAj3QYUPmp5dFMfitRxLP7Wm/WP9l4ZZjIptxTExsJwuEff1SLRCPD9w==
+metro-resolver@0.64.0, metro-resolver@^0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-resolver/-/metro-resolver-0.64.0.tgz#21126b44f31346ac2ce0b06b77ef65e8c9e2294a"
+  integrity sha512-cJ26Id8Zf+HmS/1vFwu71K3u7ep/+HeXXAJIeVDYf+niE7AWB9FijyMtAlQgbD8elWqv1leJCnQ/xHRFBfGKYA==
   dependencies:
     absolute-path "^0.0.0"
 
-metro-source-map@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-source-map/-/metro-source-map-0.59.0.tgz#e9beb9fc51bfb4e060f95820cf1508fc122d23f7"
-  integrity sha512-0w5CmCM+ybSqXIjqU4RiK40t4bvANL6lafabQ2GP2XD3vSwkLY+StWzCtsb4mPuyi9R/SgoLBel+ZOXHXAH0eQ==
+metro-runtime@0.64.0, metro-runtime@^0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-runtime/-/metro-runtime-0.64.0.tgz#cdaa1121d91041bf6345f2a69eb7c2fb289eff7b"
+  integrity sha512-m7XbWOaIOeFX7YcxUhmnOi6Pg8EaeL89xyZ+quZyZVF1aNoTr4w8FfbKxvijpjsytKHIZtd+43m2Wt5JrqyQmQ==
+
+metro-source-map@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-source-map/-/metro-source-map-0.64.0.tgz#4310e17c3d4539c6369688022494ad66fa4d39a1"
+  integrity sha512-OCG2rtcp5cLEGYvAbfkl6mEc0J2FPRP4/UCEly+juBk7hawS9bCBMBfhJm/HIsvY1frk6nT2Vsl1O8YBbwyx2g==
   dependencies:
     "@babel/traverse" "^7.0.0"
     "@babel/types" "^7.0.0"
     invariant "^2.2.4"
-    metro-symbolicate "0.59.0"
-    ob1 "0.59.0"
+    metro-symbolicate "0.64.0"
+    nullthrows "^1.1.1"
+    ob1 "0.64.0"
     source-map "^0.5.6"
     vlq "^1.0.0"
 
-metro-symbolicate@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro-symbolicate/-/metro-symbolicate-0.59.0.tgz#fc7f93957a42b02c2bfc57ed1e8f393f5f636a54"
-  integrity sha512-asLaF2A7rndrToGFIknL13aiohwPJ95RKHf0NM3hP/nipiLDoMzXT6ZnQvBqDxkUKyP+51AI75DMtb+Wcyw4Bw==
+metro-symbolicate@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-symbolicate/-/metro-symbolicate-0.64.0.tgz#405c21438ab553c29f6841da52ca76ee87bb06ac"
+  integrity sha512-qIi+YRrDWnLVmydj6gwidYLPaBsakZRibGWSspuXgHAxOI3UuLwlo4dpQ73Et0gyHjI7ZvRMRY8JPiOntf9AQQ==
   dependencies:
     invariant "^2.2.4"
-    metro-source-map "0.59.0"
+    metro-source-map "0.64.0"
+    nullthrows "^1.1.1"
     source-map "^0.5.6"
     through2 "^2.0.1"
     vlq "^1.0.0"
 
-metro@0.59.0, metro@^0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/metro/-/metro-0.59.0.tgz#64a87cd61357814a4f279518e0781b1eab5934b8"
-  integrity sha512-OpVgYXyuTvouusFZQJ/UYKEbwfLmialrSCUUTGTFaBor6UMUHZgXPYtK86LzesgMqRc8aiuTQVO78iKW2Iz3wg==
+metro-transform-plugins@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-transform-plugins/-/metro-transform-plugins-0.64.0.tgz#41d3dce0f2966bbd79fea1ecff61bcc8a00e4665"
+  integrity sha512-iTIRBD/wBI98plfxj8jAoNUUXfXLNlyvcjPtshhpGvdwu9pzQilGfnDnOaaK+vbITcOk9w5oQectXyJwAqTr1A==
+  dependencies:
+    "@babel/core" "^7.0.0"
+    "@babel/generator" "^7.5.0"
+    "@babel/template" "^7.0.0"
+    "@babel/traverse" "^7.0.0"
+    nullthrows "^1.1.1"
+
+metro-transform-worker@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro-transform-worker/-/metro-transform-worker-0.64.0.tgz#f94429b2c42b13cb1c93be4c2e25e97f2d27ca60"
+  integrity sha512-wegRtK8GyLF6IPZRBJp+zsORgA4iX0h1DRpknyAMDCtSbJ4VU2xV/AojteOgAsDvY3ucAGsvfuZLNDJHUdUNHQ==
+  dependencies:
+    "@babel/core" "^7.0.0"
+    "@babel/generator" "^7.5.0"
+    "@babel/parser" "^7.0.0"
+    "@babel/types" "^7.0.0"
+    babel-preset-fbjs "^3.3.0"
+    metro "0.64.0"
+    metro-babel-transformer "0.64.0"
+    metro-cache "0.64.0"
+    metro-cache-key "0.64.0"
+    metro-hermes-compiler "0.64.0"
+    metro-source-map "0.64.0"
+    metro-transform-plugins "0.64.0"
+    nullthrows "^1.1.1"
+
+metro@0.64.0, metro@^0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/metro/-/metro-0.64.0.tgz#0091a856cfbcc94dd576da563eee466e96186195"
+  integrity sha512-G2OC08Rzfs0kqnSEuKo2yZxR+/eNUpA93Ru45c60uN0Dw3HPrDi+ZBipgFftC6iLE0l+6hu8roFFIofotWxybw==
   dependencies:
     "@babel/code-frame" "^7.0.0"
     "@babel/core" "^7.0.0"
     "@babel/generator" "^7.5.0"
     "@babel/parser" "^7.0.0"
-    "@babel/plugin-external-helpers" "^7.0.0"
     "@babel/template" "^7.0.0"
     "@babel/traverse" "^7.0.0"
     "@babel/types" "^7.0.0"
     absolute-path "^0.0.0"
+    accepts "^1.3.7"
     async "^2.4.0"
-    babel-preset-fbjs "^3.3.0"
-    buffer-crc32 "^0.2.13"
-    chalk "^2.4.1"
+    chalk "^4.0.0"
     ci-info "^2.0.0"
-    concat-stream "^1.6.0"
     connect "^3.6.5"
     debug "^2.2.0"
     denodeify "^1.2.1"
     error-stack-parser "^2.0.6"
-    eventemitter3 "^3.0.0"
-    fbjs "^1.0.0"
     fs-extra "^1.0.0"
     graceful-fs "^4.1.3"
     image-size "^0.6.0"
     invariant "^2.2.4"
-    jest-haste-map "^24.9.0"
-    jest-worker "^24.9.0"
-    json-stable-stringify "^1.0.1"
+    jest-haste-map "^26.5.2"
+    jest-worker "^26.0.0"
     lodash.throttle "^4.1.1"
-    merge-stream "^1.0.1"
-    metro-babel-register "0.59.0"
-    metro-babel-transformer "0.59.0"
-    metro-cache "0.59.0"
-    metro-config "0.59.0"
-    metro-core "0.59.0"
-    metro-inspector-proxy "0.59.0"
-    metro-minify-uglify "0.59.0"
-    metro-react-native-babel-preset "0.59.0"
-    metro-resolver "0.59.0"
-    metro-source-map "0.59.0"
-    metro-symbolicate "0.59.0"
-    mime-types "2.1.11"
+    metro-babel-register "0.64.0"
+    metro-babel-transformer "0.64.0"
+    metro-cache "0.64.0"
+    metro-cache-key "0.64.0"
+    metro-config "0.64.0"
+    metro-core "0.64.0"
+    metro-hermes-compiler "0.64.0"
+    metro-inspector-proxy "0.64.0"
+    metro-minify-uglify "0.64.0"
+    metro-react-native-babel-preset "0.64.0"
+    metro-resolver "0.64.0"
+    metro-runtime "0.64.0"
+    metro-source-map "0.64.0"
+    metro-symbolicate "0.64.0"
+    metro-transform-plugins "0.64.0"
+    metro-transform-worker "0.64.0"
+    mime-types "^2.1.27"
     mkdirp "^0.5.1"
     node-fetch "^2.2.0"
     nullthrows "^1.1.1"
-    resolve "^1.5.0"
     rimraf "^2.5.4"
     serialize-error "^2.1.0"
     source-map "^0.5.6"
-    strip-ansi "^4.0.0"
+    strip-ansi "^6.0.0"
     temp "0.8.3"
-    throat "^4.1.0"
-    wordwrap "^1.0.0"
+    throat "^5.0.0"
     ws "^1.1.5"
-    xpipe "^1.0.5"
-    yargs "^14.2.0"
+    yargs "^15.3.1"
 
 micromatch@^3.1.10, micromatch@^3.1.4:
   version "3.1.10"
@@ -6148,17 +6182,10 @@ mime-db@1.47.0, "mime-db@>= 1.43.0 < 2":
   resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.47.0.tgz#8cb313e59965d3c05cfbf898915a267af46a335c"
   integrity sha512-QBmA/G2y+IfeS4oktet3qRZ+P5kPhCKRXxXnQEudYqUaEioAU1/Lq2us3D/t1Jfo4hE9REQPrbB7K5sOczJVIw==
 
-mime-db@~1.23.0:
-  version "1.23.0"
-  resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.23.0.tgz#a31b4070adaea27d732ea333740a64d0ec9a6659"
-  integrity sha1-oxtAcK2uon1zLqMzdApk0OyaZlk=
-
-mime-types@2.1.11:
-  version "2.1.11"
-  resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.11.tgz#c259c471bda808a85d6cd193b430a5fae4473b3c"
-  integrity sha1-wlnEcb2oCKhdbNGTtDCl+uRHOzw=
-  dependencies:
-    mime-db "~1.23.0"
+mime-db@1.51.0:
+  version "1.51.0"
+  resolved "https://registry.yarnpkg.com/mime-db/-/mime-db-1.51.0.tgz#d9ff62451859b18342d960850dc3cfb77e63fb0c"
+  integrity sha512-5y8A56jg7XVQx2mbv1lu49NR4dokRnhZYTtL+KGfaa27uq4pSTXkwQkFJl4pkRMyNFz/EtYDSkiiEHx3F7UN6g==
 
 mime-types@2.1.30, mime-types@^2.1.12, mime-types@~2.1.19, mime-types@~2.1.24:
   version "2.1.30"
@@ -6167,6 +6194,13 @@ mime-types@2.1.30, mime-types@^2.1.12, mime-types@~2.1.19, mime-types@~2.1.24:
   dependencies:
     mime-db "1.47.0"
 
+mime-types@^2.1.27:
+  version "2.1.34"
+  resolved "https://registry.yarnpkg.com/mime-types/-/mime-types-2.1.34.tgz#5a712f9ec1503511a945803640fafe09d3793c24"
+  integrity sha512-6cP692WwGIs9XXdOO4++N+7qjqv0rqxxVvJ3VHPh/Sc9mVZcQP+ZGhkKiTvWMQRr2tbHkJP/Yn7Y0npb3ZBs4A==
+  dependencies:
+    mime-db "1.51.0"
+
 mime@1.6.0:
   version "1.6.0"
   resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1"
@@ -6202,7 +6236,7 @@ min-indent@^1.0.0:
   resolved "https://registry.yarnpkg.com/min-indent/-/min-indent-1.0.1.tgz#a63f681673b30571fbe8bc25686ae746eefa9869"
   integrity sha512-I9jwMn07Sy/IwOj3zVkVik2JTvgpaykDZEigL6Rx6N9LbMywwUSMtxET+7lVoDLLd3O3IXwJwvuuns8UB/HeAg==
 
-minimatch@^3.0.4:
+minimatch@^3.0.2, minimatch@^3.0.4:
   version "3.0.4"
   resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.0.4.tgz#5166e286457f03306064be5497e8dbb0c3d32083"
   integrity sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==
@@ -6258,21 +6292,11 @@ ms@2.1.2:
   resolved "https://registry.yarnpkg.com/ms/-/ms-2.1.2.tgz#d09d1f357b443f493382a8eb3ccd183872ae6009"
   integrity sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==
 
-mute-stream@0.0.7:
-  version "0.0.7"
-  resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.7.tgz#3075ce93bc21b8fab43e1bc4da7e8115ed1e7bab"
-  integrity sha1-MHXOk7whuPq0PhvE2n6BFe0ee6s=
-
 mute-stream@0.0.8:
   version "0.0.8"
   resolved "https://registry.yarnpkg.com/mute-stream/-/mute-stream-0.0.8.tgz#1630c42b2251ff81e2a283de96a5497ea92e5e0d"
   integrity sha512-nnbWWOkoWyUsTjKrhgD0dcz22mdkSnpYqbEjIm2nhwhuxlSkpywJmBo8h0ZqJdkp73mb90SssHkN4rsRaBAfAA==
 
-nan@^2.12.1:
-  version "2.14.2"
-  resolved "https://registry.yarnpkg.com/nan/-/nan-2.14.2.tgz#f5376400695168f4cc694ac9393d0c9585eeea19"
-  integrity sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ==
-
 nanomatch@^1.2.9:
   version "1.2.13"
   resolved "https://registry.yarnpkg.com/nanomatch/-/nanomatch-1.2.13.tgz#b87a8aa4fc0de8fe6be88895b38983ff265bd119"
@@ -6300,7 +6324,7 @@ negotiator@0.6.2:
   resolved "https://registry.yarnpkg.com/negotiator/-/negotiator-0.6.2.tgz#feacf7ccf525a77ae9634436a64883ffeca346fb"
   integrity sha512-hZXc7K2e+PgeI1eDBe/10Ard4ekbfrrqG8Ep+8Jmf4JID2bNg7NvCPOZN+kfF574pFQI7mum2AUqDidoKqcTOw==
 
-neo-async@^2.6.0:
+neo-async@^2.5.0, neo-async@^2.6.0:
   version "2.6.2"
   resolved "https://registry.yarnpkg.com/neo-async/-/neo-async-2.6.2.tgz#b4aafb93e3aeb2d8174ca53cf163ab7d7308305f"
   integrity sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==
@@ -6315,13 +6339,12 @@ nocache@^2.1.0:
   resolved "https://registry.yarnpkg.com/nocache/-/nocache-2.1.0.tgz#120c9ffec43b5729b1d5de88cd71aa75a0ba491f"
   integrity sha512-0L9FvHG3nfnnmaEQPjT9xhfN4ISk0A8/2j4M37Np4mcDesJjHgEUfgPhdCyZuFI954tjokaIj/A3NdpFNdEh4Q==
 
-node-fetch@^1.0.1:
-  version "1.7.3"
-  resolved "https://registry.yarnpkg.com/node-fetch/-/node-fetch-1.7.3.tgz#980f6f72d85211a5347c6b2bc18c5b84c3eb47ef"
-  integrity sha512-NhZ4CsKx7cYm2vSrBAr2PvFOe6sWDf0UYLRqA6svUYg7+/TSfVAu49jYC4BvQ4Sms9SZgdqGBgroqfDhJdTyKQ==
+node-dir@^0.1.17:
+  version "0.1.17"
+  resolved "https://registry.yarnpkg.com/node-dir/-/node-dir-0.1.17.tgz#5f5665d93351335caabef8f1c554516cf5f1e4e5"
+  integrity sha1-X1Zl2TNRM1yqvvjxxVRRbPXx5OU=
   dependencies:
-    encoding "^0.1.11"
-    is-stream "^1.0.1"
+    minimatch "^3.0.2"
 
 node-fetch@^2.2.0, node-fetch@^2.6.0, node-fetch@^2.6.1:
   version "2.6.1"
@@ -6355,6 +6378,11 @@ node-releases@^1.1.71:
   resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-1.1.71.tgz#cb1334b179896b1c89ecfdd4b725fb7bbdfc7dbb"
   integrity sha512-zR6HoT6LrLCRBwukmrVbHv0EpEQjksO6GmFcZQQuCAy139BEsoVKPYnf3jongYW83fAa1torLGYwxxky/p28sg==
 
+node-releases@^2.0.1:
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/node-releases/-/node-releases-2.0.1.tgz#3d1d395f204f1f2f29a54358b9fb678765ad2fc5"
+  integrity sha512-CqyzN6z7Q6aMeF/ktcMVTzhAHCEpf8SOarwpzpf8pNBY2k5/oM34UHldUwp8VKI7uxct2HxSRdJjBaZeESzcxA==
+
 node-stream-zip@^1.9.1:
   version "1.13.3"
   resolved "https://registry.yarnpkg.com/node-stream-zip/-/node-stream-zip-1.13.3.tgz#63235337abebcef408b244b4e28f28961e6e86f0"
@@ -6431,12 +6459,12 @@ oauth-sign@~0.9.0:
   resolved "https://registry.yarnpkg.com/oauth-sign/-/oauth-sign-0.9.0.tgz#47a7b016baa68b5fa0ecf3dee08a85c679ac6455"
   integrity sha512-fexhUFFPTGV8ybAtSIGbV6gOkSv8UtRbDBnAyLQw4QPKkgNlsH2ByPGtMUqdWkos6YCRmAqViwgZrJc/mRDzZQ==
 
-ob1@0.59.0:
-  version "0.59.0"
-  resolved "https://registry.yarnpkg.com/ob1/-/ob1-0.59.0.tgz#ee103619ef5cb697f2866e3577da6f0ecd565a36"
-  integrity sha512-opXMTxyWJ9m68ZglCxwo0OPRESIC/iGmKFPXEXzMZqsVIrgoRXOHmoMDkQzz4y3irVjbyPJRAh5pI9fd0MJTFQ==
+ob1@0.64.0:
+  version "0.64.0"
+  resolved "https://registry.yarnpkg.com/ob1/-/ob1-0.64.0.tgz#f254a55a53ca395c4f9090e28a85483eac5eba19"
+  integrity sha512-CO1N+5dhvy+MoAwxz8+fymEUcwsT4a+wHhrHFb02LppcJdHxgcBWviwEhUwKOD2kLMQ7ijrrzybOqpGcqEtvpQ==
 
-object-assign@^4.0.1, object-assign@^4.1.0, object-assign@^4.1.1:
+object-assign@^4.0.1, object-assign@^4.1.1:
   version "4.1.1"
   resolved "https://registry.yarnpkg.com/object-assign/-/object-assign-4.1.1.tgz#2109adc7965887cfc05cbbd442cac8bfbb360863"
   integrity sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=
@@ -6518,7 +6546,7 @@ onetime@^5.1.0, onetime@^5.1.2:
     mimic-fn "^2.1.0"
 
 "onnxruntime-common@file:../common":
-  version "1.7.0"
+  version "1.10.0"
 
 open@^6.2.0:
   version "6.4.0"
@@ -6717,11 +6745,6 @@ parse-json@^4.0.0:
     error-ex "^1.3.1"
     json-parse-better-errors "^1.0.1"
 
-parse-node-version@^1.0.0:
-  version "1.0.1"
-  resolved "https://registry.yarnpkg.com/parse-node-version/-/parse-node-version-1.0.1.tgz#e2b5dbede00e7fa9bc363607f53327e8b073189b"
-  integrity sha512-3YHlOa/JgH6Mnpr05jP9eDG254US9ek25LyIxZlDItp2iJtwyaXQb57lBYLdT3MowkUFYEV2XXNAYIPlESvJlA==
-
 parse-path@^4.0.0:
   version "4.0.3"
   resolved "https://registry.yarnpkg.com/parse-path/-/parse-path-4.0.3.tgz#82d81ec3e071dcc4ab49aa9f2c9c0b8966bb22bf"
@@ -6790,9 +6813,9 @@ path-key@^3.0.0, path-key@^3.1.0:
   integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==
 
 path-parse@^1.0.6:
-  version "1.0.6"
-  resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.6.tgz#d62dbb5679405d72c4737ec58600e9ddcf06d24c"
-  integrity sha512-GSmOT2EbHrINBf9SR7CDELwlJ8AENk3Qn7OikK4nFYAu3Ote2+JYNVvkpAEQm3/TLNEJFD/xZJjzyxg3KBWOzw==
+  version "1.0.7"
+  resolved "https://registry.yarnpkg.com/path-parse/-/path-parse-1.0.7.tgz#fbc114b60ca42b30d9daf5858e4bd68bbedb6735"
+  integrity sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==
 
 path-type@^1.0.0:
   version "1.1.0"
@@ -6820,6 +6843,11 @@ performance-now@^2.1.0:
   resolved "https://registry.yarnpkg.com/performance-now/-/performance-now-2.1.0.tgz#6309f4e0e5fa913ec1c69307ae364b4b377c9e7b"
   integrity sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=
 
+picocolors@^1.0.0:
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/picocolors/-/picocolors-1.0.0.tgz#cb5bdc74ff3f51892236eaf79d68bc44564ab81c"
+  integrity sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==
+
 picomatch@^2.0.4, picomatch@^2.2.1, picomatch@^2.2.3:
   version "2.2.3"
   resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.2.3.tgz#465547f359ccc206d3c48e46a1bcb89bf7ee619d"
@@ -6882,17 +6910,6 @@ plist@^3.0.1:
     xmlbuilder "^9.0.7"
     xmldom "^0.5.0"
 
-plugin-error@^0.1.2:
-  version "0.1.2"
-  resolved "https://registry.yarnpkg.com/plugin-error/-/plugin-error-0.1.2.tgz#3b9bb3335ccf00f425e07437e19276967da47ace"
-  integrity sha1-O5uzM1zPAPQl4HQ34ZJ2ln2kes4=
-  dependencies:
-    ansi-cyan "^0.1.1"
-    ansi-red "^0.1.1"
-    arr-diff "^1.0.1"
-    arr-union "^2.0.1"
-    extend-shallow "^1.1.2"
-
 pod-install@^0.1.0:
   version "0.1.21"
   resolved "https://registry.yarnpkg.com/pod-install/-/pod-install-0.1.21.tgz#a731d172f691bb875f7a0e09dc28b3701947645c"
@@ -6926,27 +6943,7 @@ prettier@^2.0.5:
   resolved "https://registry.yarnpkg.com/prettier/-/prettier-2.2.1.tgz#795a1a78dd52f073da0cd42b21f9c91381923ff5"
   integrity sha512-PqyhM2yCjg/oKkFPtTGUojv7gnZAoG80ttl45O6x2Ug/rMJw4wcc9k6aaf2hibP7BGVCCM33gZoGjyvt9mm16Q==
 
-pretty-format@^24.9.0:
-  version "24.9.0"
-  resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-24.9.0.tgz#12fac31b37019a4eea3c11aa9a959eb7628aa7c9"
-  integrity sha512-00ZMZUiHaJrNfk33guavqgvfJS30sLYf0f8+Srklv0AMPodGGHcoHgksZ3OThYnIvOd+8yMCn0YiEOogjlgsnA==
-  dependencies:
-    "@jest/types" "^24.9.0"
-    ansi-regex "^4.0.0"
-    ansi-styles "^3.2.0"
-    react-is "^16.8.4"
-
-pretty-format@^25.1.0, pretty-format@^25.2.0:
-  version "25.5.0"
-  resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-25.5.0.tgz#7873c1d774f682c34b8d48b6743a2bf2ac55791a"
-  integrity sha512-kbo/kq2LQ/A/is0PQwsEHM7Ca6//bGPPvU6UnsdDRSKTWxT/ru/xb88v4BJf6a69H+uTytOEsTusT9ksd/1iWQ==
-  dependencies:
-    "@jest/types" "^25.5.0"
-    ansi-regex "^5.0.0"
-    ansi-styles "^4.0.0"
-    react-is "^16.12.0"
-
-pretty-format@^26.0.0, pretty-format@^26.6.2:
+pretty-format@^26.0.0, pretty-format@^26.5.2, pretty-format@^26.6.2:
   version "26.6.2"
   resolved "https://registry.yarnpkg.com/pretty-format/-/pretty-format-26.6.2.tgz#e35c2705f14cb7fe2fe94fa078345b444120fc93"
   integrity sha512-7AeGuCYNGmycyQbCqd/3PWH4eOoX/OiCa0uphp57NVTeAGdJGaAliecxwBDHYQCIvrW7aDBZCYeNTP/WX69mkg==
@@ -6961,13 +6958,6 @@ process-nextick-args@~2.0.0:
   resolved "https://registry.yarnpkg.com/process-nextick-args/-/process-nextick-args-2.0.1.tgz#7820d9b16120cc55ca9ae7792680ae7dba6d7fe2"
   integrity sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==
 
-promise@^7.1.1:
-  version "7.3.1"
-  resolved "https://registry.yarnpkg.com/promise/-/promise-7.3.1.tgz#064b72602b18f90f29192b8b1bc418ffd1ebd3bf"
-  integrity sha512-nolQXZ/4L+bP/UGlkfaIujX9BKxGwmQ9OT4mOt5yvy8iK1h3wqTEJCijzGANTCCl9nWjY41juyAn2K3Q1hLLTg==
-  dependencies:
-    asap "~2.0.3"
-
 promise@^8.0.3:
   version "8.1.0"
   resolved "https://registry.yarnpkg.com/promise/-/promise-8.1.0.tgz#697c25c3dfe7435dd79fcd58c38a135888eaf05e"
@@ -6997,11 +6987,6 @@ protocols@^1.1.0, protocols@^1.4.0:
   resolved "https://registry.yarnpkg.com/protocols/-/protocols-1.4.8.tgz#48eea2d8f58d9644a4a32caae5d5db290a075ce8"
   integrity sha512-IgjKyaUSjsROSO8/D49Ab7hP8mJgTYcqApOqdPhLoPxAplXmkp+zRvsrSQjFn5by0rhm4VH0GAUELIPpx7B1yg==
 
-pseudomap@^1.0.2:
-  version "1.0.2"
-  resolved "https://registry.yarnpkg.com/pseudomap/-/pseudomap-1.0.2.tgz#f052a28da70e618917ef0a8ac34c1ae5a68286b3"
-  integrity sha1-8FKijacOYYkX7wqKw0wa5aaChrM=
-
 psl@^1.1.28, psl@^1.1.33:
   version "1.8.0"
   resolved "https://registry.yarnpkg.com/psl/-/psl-1.8.0.tgz#9326f8bcfb013adcc005fdff056acce020e51c24"
@@ -7092,7 +7077,7 @@ react-devtools-core@^4.6.0:
     shell-quote "^1.6.1"
     ws "^7"
 
-react-is@^16.12.0, react-is@^16.8.1, react-is@^16.8.4:
+react-is@^16.8.1:
   version "16.13.1"
   resolved "https://registry.yarnpkg.com/react-is/-/react-is-16.13.1.tgz#789729a4dc36de2999dc156dd6c1d9c18cea56a4"
   integrity sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==
@@ -7132,38 +7117,52 @@ react-native-builder-bob@^0.17.1:
   optionalDependencies:
     jetifier "^1.6.6"
 
-react-native@0.63.4:
-  version "0.63.4"
-  resolved "https://registry.yarnpkg.com/react-native/-/react-native-0.63.4.tgz#2210fdd404c94a5fa6b423c6de86f8e48810ec36"
-  integrity sha512-I4kM8kYO2mWEYUFITMcpRulcy4/jd+j9T6PbIzR0FuMcz/xwd+JwHoLPa1HmCesvR1RDOw9o4D+OFLwuXXfmGw==
+react-native-codegen@^0.0.6:
+  version "0.0.6"
+  resolved "https://registry.yarnpkg.com/react-native-codegen/-/react-native-codegen-0.0.6.tgz#b3173faa879cf71bfade8d030f9c4698388f6909"
+  integrity sha512-cMvrUelD81wiPitEPiwE/TCNscIVauXxmt4NTGcy18HrUd0WRWXfYzAQGXm0eI87u3NMudNhqFj2NISJenxQHg==
   dependencies:
-    "@babel/runtime" "^7.0.0"
-    "@react-native-community/cli" "^4.10.0"
-    "@react-native-community/cli-platform-android" "^4.10.0"
-    "@react-native-community/cli-platform-ios" "^4.10.0"
+    flow-parser "^0.121.0"
+    jscodeshift "^0.11.0"
+    nullthrows "^1.1.1"
+
+react-native@0.64.1:
+  version "0.64.1"
+  resolved "https://registry.yarnpkg.com/react-native/-/react-native-0.64.1.tgz#cd38f5b47b085549686f34eb0c9dcd466f307635"
+  integrity sha512-jvSj+hNAfwvhaSmxd5KHJ5HidtG0pDXzoH6DaqNpU74g3CmAiA8vuk58B5yx/DYuffGq6PeMniAcwuh3Xp4biQ==
+  dependencies:
+    "@jest/create-cache-key-function" "^26.5.0"
+    "@react-native-community/cli" "^5.0.1-alpha.0"
+    "@react-native-community/cli-platform-android" "^5.0.1-alpha.0"
+    "@react-native-community/cli-platform-ios" "^5.0.1-alpha.0"
+    "@react-native/assets" "1.0.0"
+    "@react-native/normalize-color" "1.0.0"
+    "@react-native/polyfills" "1.0.0"
     abort-controller "^3.0.0"
     anser "^1.4.9"
     base64-js "^1.1.2"
     event-target-shim "^5.0.1"
-    fbjs "^1.0.0"
-    fbjs-scripts "^1.1.0"
-    hermes-engine "~0.5.0"
+    hermes-engine "~0.7.0"
     invariant "^2.2.4"
     jsc-android "^245459.0.0"
-    metro-babel-register "0.59.0"
-    metro-react-native-babel-transformer "0.59.0"
-    metro-source-map "0.59.0"
+    metro-babel-register "0.64.0"
+    metro-react-native-babel-transformer "0.64.0"
+    metro-runtime "0.64.0"
+    metro-source-map "0.64.0"
     nullthrows "^1.1.1"
-    pretty-format "^24.9.0"
+    pretty-format "^26.5.2"
     promise "^8.0.3"
     prop-types "^15.7.2"
     react-devtools-core "^4.6.0"
+    react-native-codegen "^0.0.6"
     react-refresh "^0.4.0"
     regenerator-runtime "^0.13.2"
-    scheduler "0.19.1"
+    scheduler "^0.20.1"
+    shelljs "^0.8.4"
     stacktrace-parser "^0.1.3"
     use-subscription "^1.0.0"
     whatwg-fetch "^3.0.0"
+    ws "^6.1.4"
 
 react-refresh@^0.4.0:
   version "0.4.3"
@@ -7241,7 +7240,7 @@ readable-stream@3, readable-stream@^3.0.0, readable-stream@^3.0.2, readable-stre
     string_decoder "^1.1.1"
     util-deprecate "^1.0.1"
 
-readable-stream@^2.0.1, readable-stream@^2.2.2, readable-stream@~2.3.6:
+readable-stream@~2.3.6:
   version "2.3.7"
   resolved "https://registry.yarnpkg.com/readable-stream/-/readable-stream-2.3.7.tgz#1eca1cf711aef814c04f62252a36a62f6cb23b57"
   integrity sha512-Ebho8K4jIbHAxnuxi7o42OrZgF/ZTNcsZj6nRKyUmkhLFq8CHItp/fy6hQZuZmP/n3yZ9VBUbp4zz/mX8hmYPw==
@@ -7254,6 +7253,16 @@ readable-stream@^2.0.1, readable-stream@^2.2.2, readable-stream@~2.3.6:
     string_decoder "~1.1.1"
     util-deprecate "~1.0.1"
 
+recast@^0.20.3:
+  version "0.20.5"
+  resolved "https://registry.yarnpkg.com/recast/-/recast-0.20.5.tgz#8e2c6c96827a1b339c634dd232957d230553ceae"
+  integrity sha512-E5qICoPoNL4yU0H0NoBDntNB0Q5oMSNh9usFctYniLBluTthi3RsQVBXIJNbApOlvSwW/RGxIuokPcAc59J5fQ==
+  dependencies:
+    ast-types "0.14.2"
+    esprima "~4.0.0"
+    source-map "~0.6.1"
+    tslib "^2.0.1"
+
 rechoir@^0.6.2:
   version "0.6.2"
   resolved "https://registry.yarnpkg.com/rechoir/-/rechoir-0.6.2.tgz#85204b54dba82d5742e28c96756ef43af50e3384"
@@ -7487,7 +7496,7 @@ resolve-url@^0.2.1:
   resolved "https://registry.yarnpkg.com/resolve-url/-/resolve-url-0.2.1.tgz#2c637fe77c893afd2a663fe21aa9080068e2052a"
   integrity sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo=
 
-resolve@^1.1.6, resolve@^1.10.0, resolve@^1.14.2, resolve@^1.18.1, resolve@^1.20.0, resolve@^1.5.0:
+resolve@^1.1.6, resolve@^1.10.0, resolve@^1.14.2, resolve@^1.18.1, resolve@^1.20.0:
   version "1.20.0"
   resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.20.0.tgz#629a013fb3f70755d6f0b7935cc1c2c5378b1975"
   integrity sha512-wENBPt4ySzg4ybFQW2TT1zMQucPK95HSh/nq2CFTZVOGut2+pQvSsgtda4d26YrYcr067wjbmzOG8byDPBX63A==
@@ -7559,12 +7568,19 @@ rimraf@~2.2.6:
   resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-2.2.8.tgz#e439be2aaee327321952730f99a8929e4fc50582"
   integrity sha1-5Dm+Kq7jJzIZUnMPmaiSnk/FBYI=
 
+rimraf@~2.6.2:
+  version "2.6.3"
+  resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-2.6.3.tgz#b2d104fe0d8fb27cf9e0a1cda8262dd3833c6cab"
+  integrity sha512-mwqeW5XsA2qAejG46gYdENaxXjx9onRNCfn7L0duuP4hCuTIi/QO7PDK07KJfp1d+izWPrzEJDcSqBa0OZQriA==
+  dependencies:
+    glob "^7.1.3"
+
 rsvp@^4.8.4:
   version "4.8.5"
   resolved "https://registry.yarnpkg.com/rsvp/-/rsvp-4.8.5.tgz#c8f155311d167f68f21e168df71ec5b083113734"
   integrity sha512-nfMOlASu9OnRJo1mbEk2cz0D56a1MBNrJ7orjRZQG10XDyuvwksKbuXNp6qa+kbn839HwjwhBzhFmdsaEAfauA==
 
-run-async@^2.2.0, run-async@^2.4.0:
+run-async@^2.4.0:
   version "2.4.1"
   resolved "https://registry.yarnpkg.com/run-async/-/run-async-2.4.1.tgz#8440eccf99ea3e70bd409d49aab88e10c189a455"
   integrity sha512-tvVnVv01b8c1RrA6Ep7JkStj85Guv/YrMcwqYQnwjsAS2cTmmPGBBjAjpCW7RrSodNSoE2/qg9O4bceNvUuDgQ==
@@ -7576,18 +7592,6 @@ run-parallel@^1.1.9:
   dependencies:
     queue-microtask "^1.2.2"
 
-rx-lite-aggregates@^4.0.8:
-  version "4.0.8"
-  resolved "https://registry.yarnpkg.com/rx-lite-aggregates/-/rx-lite-aggregates-4.0.8.tgz#753b87a89a11c95467c4ac1626c4efc4e05c67be"
-  integrity sha1-dTuHqJoRyVRnxKwWJsTvxOBcZ74=
-  dependencies:
-    rx-lite "*"
-
-rx-lite@*, rx-lite@^4.0.8:
-  version "4.0.8"
-  resolved "https://registry.yarnpkg.com/rx-lite/-/rx-lite-4.0.8.tgz#0b1e11af8bc44836f04a6407e92da42467b79444"
-  integrity sha1-Cx4Rr4vESDbwSmQH6S2kJGe3lEQ=
-
 rxjs@^6.6.6:
   version "6.6.7"
   resolved "https://registry.yarnpkg.com/rxjs/-/rxjs-6.6.7.tgz#90ac018acabf491bf65044235d5863c4dab804c9"
@@ -7612,7 +7616,7 @@ safe-regex@^1.1.0:
   dependencies:
     ret "~0.1.10"
 
-"safer-buffer@>= 2.1.2 < 3", "safer-buffer@>= 2.1.2 < 3.0.0", safer-buffer@^2.0.2, safer-buffer@^2.1.0, safer-buffer@~2.1.0:
+"safer-buffer@>= 2.1.2 < 3", safer-buffer@^2.0.2, safer-buffer@^2.1.0, safer-buffer@~2.1.0:
   version "2.1.2"
   resolved "https://registry.yarnpkg.com/safer-buffer/-/safer-buffer-2.1.2.tgz#44fa161b0187b9549dd84bb91802f9bd8385cd6a"
   integrity sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==
@@ -7644,10 +7648,10 @@ saxes@^5.0.1:
   dependencies:
     xmlchars "^2.2.0"
 
-scheduler@0.19.1:
-  version "0.19.1"
-  resolved "https://registry.yarnpkg.com/scheduler/-/scheduler-0.19.1.tgz#4f3e2ed2c1a7d65681f4c854fa8c5a1ccb40f196"
-  integrity sha512-n/zwRWRYSUj0/3g/otKDRPMh6qv2SYMWNq85IEa8iZyAv8od9zDYpGSnpBEjNgcMNq6Scbu5KfIPxNF72R/2EA==
+scheduler@^0.20.1:
+  version "0.20.2"
+  resolved "https://registry.yarnpkg.com/scheduler/-/scheduler-0.20.2.tgz#4baee39436e34aa93b4874bddcbf0fe8b8b50e91"
+  integrity sha512-2eWfGgAqqWFGqtdMmcL5zCMK1U8KlXv8SQFGglL3CEtd0aDVDWgeF/YoCmvln55m5zSk3J/20hTaSBeSObsQDQ==
   dependencies:
     loose-envify "^1.1.0"
     object-assign "^4.1.1"
@@ -7659,7 +7663,7 @@ semver-diff@^3.1.1:
   dependencies:
     semver "^6.3.0"
 
-"semver@2 || 3 || 4 || 5", semver@^5.1.0, semver@^5.5.0, semver@^5.6.0:
+"semver@2 || 3 || 4 || 5", semver@^5.5.0, semver@^5.6.0:
   version "5.7.1"
   resolved "https://registry.yarnpkg.com/semver/-/semver-5.7.1.tgz#a954f931aeba508d307bbf069eff0c01c96116f7"
   integrity sha512-sauaDf/PZdVgrLTNYHRtpXa1iRiKcaebiKQ1BJdpQlWH2lCvexQdX55snPFyK7QzpudqbCI0qXFfOasHdyNDGQ==
@@ -7730,11 +7734,6 @@ set-value@^2.0.0, set-value@^2.0.1:
     is-plain-object "^2.0.3"
     split-string "^3.0.1"
 
-setimmediate@^1.0.5:
-  version "1.0.5"
-  resolved "https://registry.yarnpkg.com/setimmediate/-/setimmediate-1.0.5.tgz#290cbb232e306942d7d7ea9b83732ab7856f8285"
-  integrity sha1-KQy7Iy4waULX1+qbg3Mqt4VvgoU=
-
 setprototypeof@1.1.1:
   version "1.1.1"
   resolved "https://registry.yarnpkg.com/setprototypeof/-/setprototypeof-1.1.1.tgz#7e95acb24aa92f5885e0abef5ba131330d4ae683"
@@ -7786,7 +7785,7 @@ shell-quote@^1.6.1:
   resolved "https://registry.yarnpkg.com/shell-quote/-/shell-quote-1.7.2.tgz#67a7d02c76c9da24f99d20808fcaded0e0e04be2"
   integrity sha512-mRz/m/JVscCrkMyPqHc/bczi3OQHkLTqXHEFu0zDhK/qfv3UcOA4SVmRCLmos4bhjr9ekVQubj/R7waKapmiQg==
 
-shelljs@0.8.4, shelljs@^0.8.3:
+shelljs@0.8.4, shelljs@^0.8.3, shelljs@^0.8.4:
   version "0.8.4"
   resolved "https://registry.yarnpkg.com/shelljs/-/shelljs-0.8.4.tgz#de7684feeb767f8716b326078a8a00875890e3c2"
   integrity sha512-7gk3UZ9kOfPLIAbslLzyWeGiEqx9e3rxwZM0KE6EL8GlGwjym9Mrlx5/p33bWTu9YG6vcS4MBxYZDHYr5lr8BQ==
@@ -7828,11 +7827,6 @@ sisteransi@^1.0.5:
   resolved "https://registry.yarnpkg.com/sisteransi/-/sisteransi-1.0.5.tgz#134d681297756437cc05ca01370d3a7a571075ed"
   integrity sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==
 
-slash@^2.0.0:
-  version "2.0.0"
-  resolved "https://registry.yarnpkg.com/slash/-/slash-2.0.0.tgz#de552851a1759df3a8f206535442f5ec4ddeab44"
-  integrity sha512-ZYKh3Wh2z1PpEXWr0MpSBZ0V6mZHAQfYevttO11c51CaWjGTaadiKZ+wVt1PbMlDV5qhMFslpZCemhwOK7C89A==
-
 slash@^3.0.0:
   version "3.0.0"
   resolved "https://registry.yarnpkg.com/slash/-/slash-3.0.0.tgz#6539be870c165adbd5240220dbe361f1bc4d4634"
@@ -7988,13 +7982,6 @@ sshpk@^1.7.0:
     safer-buffer "^2.0.2"
     tweetnacl "~0.14.0"
 
-stack-utils@^1.0.1:
-  version "1.0.5"
-  resolved "https://registry.yarnpkg.com/stack-utils/-/stack-utils-1.0.5.tgz#a19b0b01947e0029c8e451d5d61a498f5bb1471b"
-  integrity sha512-KZiTzuV3CnSnSvgMRrARVCj+Ht7rMbauGDK0LdVFRGyenwdylpajAp4Q0i6SX8rEmbTpMMf6ryq2gb8pPq2WgQ==
-  dependencies:
-    escape-string-regexp "^2.0.0"
-
 stack-utils@^2.0.2:
   version "2.0.3"
   resolved "https://registry.yarnpkg.com/stack-utils/-/stack-utils-2.0.3.tgz#cd5f030126ff116b78ccb3c027fe302713b61277"
@@ -8050,15 +8037,7 @@ string-length@^4.0.1:
     char-regex "^1.0.2"
     strip-ansi "^6.0.0"
 
-string-width@^2.1.0:
-  version "2.1.1"
-  resolved "https://registry.yarnpkg.com/string-width/-/string-width-2.1.1.tgz#ab93f27a8dc13d28cac815c462143a6d9012ae9e"
-  integrity sha512-nOqH59deCq9SRHlxq1Aw85Jnt4w6KvLKqWVik6oA9ZklXLNIOlqg4F2yrT1MVaTjAqvVwdfeZ7w7aCvJD7ugkw==
-  dependencies:
-    is-fullwidth-code-point "^2.0.0"
-    strip-ansi "^4.0.0"
-
-string-width@^3.0.0, string-width@^3.1.0:
+string-width@^3.0.0:
   version "3.1.0"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-3.1.0.tgz#22767be21b62af1081574306f69ac51b62203961"
   integrity sha512-vafcv6KjVZKSgz06oM/H6GDBrAtz8vdhQakGjFIvNrHA6y3HCF1CInLy+QLq8dTJPQ1b+KDUqDFctkdRW44e1w==
@@ -8090,13 +8069,6 @@ string_decoder@~1.1.1:
   dependencies:
     safe-buffer "~5.1.0"
 
-strip-ansi@^4.0.0:
-  version "4.0.0"
-  resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-4.0.0.tgz#a8479022eb1ac368a871389b635262c505ee368f"
-  integrity sha1-qEeQIusaw2iocTibY1JixQXuNo8=
-  dependencies:
-    ansi-regex "^3.0.0"
-
 strip-ansi@^5.0.0, strip-ansi@^5.1.0, strip-ansi@^5.2.0:
   version "5.2.0"
   resolved "https://registry.yarnpkg.com/strip-ansi/-/strip-ansi-5.2.0.tgz#8c9a536feb6afc962bdfa5b104a5091c1ad9c0ae"
@@ -8169,13 +8141,6 @@ supports-color@^5.3.0:
   dependencies:
     has-flag "^3.0.0"
 
-supports-color@^6.1.0:
-  version "6.1.0"
-  resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-6.1.0.tgz#0764abc69c63d5ac842dd4867e8d025e880df8f3"
-  integrity sha512-qe1jfm1Mg7Nq/NSh6XE24gPXROEVsWHxC1LIx//XNlD9iw7YZQGjZNjYN7xGaEG6iKdA8EtNFW6R0gjnVXp+wQ==
-  dependencies:
-    has-flag "^3.0.0"
-
 supports-color@^7.0.0, supports-color@^7.1.0:
   version "7.2.0"
   resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-7.2.0.tgz#1b7dcdcb32b8138801b3e478ba6a51caa89648da"
@@ -8220,6 +8185,13 @@ temp@0.8.3:
     os-tmpdir "^1.0.0"
     rimraf "~2.2.6"
 
+temp@^0.8.1:
+  version "0.8.4"
+  resolved "https://registry.yarnpkg.com/temp/-/temp-0.8.4.tgz#8c97a33a4770072e0a05f919396c7665a7dd59f2"
+  integrity sha512-s0ZZzd0BzYv5tLSptZooSjK8oj6C+c19p7Vqta9+6NPOf7r+fxq0cJe6/oN4LTC79sy5NY8ucOJNgwsKCSbfqg==
+  dependencies:
+    rimraf "~2.6.2"
+
 terminal-link@^2.0.0:
   version "2.1.1"
   resolved "https://registry.yarnpkg.com/terminal-link/-/terminal-link-2.1.1.tgz#14a64a27ab3c0df933ea546fba55f2d078edc994"
@@ -8242,11 +8214,6 @@ text-extensions@^1.0.0:
   resolved "https://registry.yarnpkg.com/text-extensions/-/text-extensions-1.9.0.tgz#1853e45fee39c945ce6f6c36b2d659b5aabc2a26"
   integrity sha512-wiBrwC1EhBelW12Zy26JeOUkQ5mRu+5o8rpsJk5+2t+Y5vE7e842qtZDQ2g1NpX/29HdyFeJ4nSIhI47ENSxlQ==
 
-throat@^4.1.0:
-  version "4.1.0"
-  resolved "https://registry.yarnpkg.com/throat/-/throat-4.1.0.tgz#89037cbc92c56ab18926e6ba4cbb200e15672a6a"
-  integrity sha1-iQN8vJLFarGJJua6TLsgDhVnKmo=
-
 throat@^5.0.0:
   version "5.0.0"
   resolved "https://registry.yarnpkg.com/throat/-/throat-5.0.0.tgz#c5199235803aad18754a667d659b5e72ce16764b"
@@ -8272,11 +8239,6 @@ through@2, "through@>=2.2.7 <3", through@^2.3.6:
   resolved "https://registry.yarnpkg.com/through/-/through-2.3.8.tgz#0dd4c9ffaabc357960b1b724115d7e0e86a2e1f5"
   integrity sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=
 
-time-stamp@^1.0.0:
-  version "1.1.0"
-  resolved "https://registry.yarnpkg.com/time-stamp/-/time-stamp-1.1.0.tgz#764a5a11af50561921b133f3b44e618687e0f5c3"
-  integrity sha1-dkpaEa9QVhkhsTPztE5hhofg9cM=
-
 tmp@^0.0.33:
   version "0.0.33"
   resolved "https://registry.yarnpkg.com/tmp/-/tmp-0.0.33.tgz#6d34335889768d21b2bcda0aa277ced3b1bfadf9"
@@ -8285,9 +8247,9 @@ tmp@^0.0.33:
     os-tmpdir "~1.0.2"
 
 tmpl@1.0.x:
-  version "1.0.4"
-  resolved "https://registry.yarnpkg.com/tmpl/-/tmpl-1.0.4.tgz#23640dd7b42d00433911140820e5cf440e521dd1"
-  integrity sha1-I2QN17QtAEM5ERQIIOXPRA5SHdE=
+  version "1.0.5"
+  resolved "https://registry.yarnpkg.com/tmpl/-/tmpl-1.0.5.tgz#8683e0b902bb9c20c4f726e3c0b69f36518c07cc"
+  integrity sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==
 
 to-fast-properties@^2.0.0:
   version "2.0.0"
@@ -8380,6 +8342,11 @@ tslib@^1.9.0:
   resolved "https://registry.yarnpkg.com/tslib/-/tslib-1.14.1.tgz#cf2d38bdc34a134bcaf1091c41f6619e2f672d00"
   integrity sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==
 
+tslib@^2.0.1:
+  version "2.3.1"
+  resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.3.1.tgz#e8a335add5ceae51aa261d32a490158ef042ef01"
+  integrity sha512-77EbyPPpMz+FRFRuAFlWMtmgUWGe9UOG2Z25NqCwiIjRhOf5iKGuzSe5P2w1laq+FkRy4p+PCuVkJSGkzTEKVw==
+
 tunnel-agent@^0.6.0:
   version "0.6.0"
   resolved "https://registry.yarnpkg.com/tunnel-agent/-/tunnel-agent-0.6.0.tgz#27a5dea06b36b04a0a9966774b290868f0fc40fd"
@@ -8451,11 +8418,6 @@ typescript@^4.1.3:
   resolved "https://registry.yarnpkg.com/typescript/-/typescript-4.2.4.tgz#8610b59747de028fda898a8aef0e103f156d0961"
   integrity sha512-V+evlYHZnQkaz8TRBuxTA92yZBPotr5H+WhQ7bD3hZUndx5tGOa1fuCgeSjxAzM1RiN5IzvadIXTVefuuwZCRg==
 
-ua-parser-js@^0.7.18:
-  version "0.7.28"
-  resolved "https://registry.yarnpkg.com/ua-parser-js/-/ua-parser-js-0.7.28.tgz#8ba04e653f35ce210239c64661685bf9121dec31"
-  integrity sha512-6Gurc1n//gjp9eQNXjD9O3M/sMwVtN5S8Lv9bvOYBfKfDNiIIhqiyi01vMBO45u4zkDE420w/e0se7Vs+sIg+g==
-
 uglify-es@^3.1.9:
   version "3.3.9"
   resolved "https://registry.yarnpkg.com/uglify-es/-/uglify-es-3.3.9.tgz#0c1c4f0700bed8dbc124cdb304d2592ca203e677"
@@ -8711,7 +8673,7 @@ whatwg-encoding@^1.0.5:
   dependencies:
     iconv-lite "0.4.24"
 
-whatwg-fetch@>=0.10.0, whatwg-fetch@^3.0.0:
+whatwg-fetch@^3.0.0:
   version "3.6.2"
   resolved "https://registry.yarnpkg.com/whatwg-fetch/-/whatwg-fetch-3.6.2.tgz#dced24f37f2624ed0281725d51d0e2e3fe677f8c"
   integrity sha512-bJlen0FcuU/0EMLrdbJ7zOnW6ITZLrZMIarMUVmdKtsGvZna8vxKYaexICWPfZ8qwf9fzNq+UEIZrnSaApt6RA==
@@ -8773,15 +8735,6 @@ wordwrap@^1.0.0:
   resolved "https://registry.yarnpkg.com/wordwrap/-/wordwrap-1.0.0.tgz#27584810891456a4171c8d0226441ade90cbcaeb"
   integrity sha1-J1hIEIkUVqQXHI0CJkQa3pDLyus=
 
-wrap-ansi@^5.1.0:
-  version "5.1.0"
-  resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-5.1.0.tgz#1fd1f67235d5b6d0fee781056001bfb694c03b09"
-  integrity sha512-QC1/iN/2/RPVJ5jYK8BGttj5z83LmSKmvbvrXPNCLZSEb32KKVDJDl/MOt2N01qU2H/FkzEa9PKto1BqDjtd7Q==
-  dependencies:
-    ansi-styles "^3.2.0"
-    string-width "^3.0.0"
-    strip-ansi "^5.0.0"
-
 wrap-ansi@^6.2.0:
   version "6.2.0"
   resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-6.2.0.tgz#e9393ba07102e6c91a3b221478f0257cd2856e53"
@@ -8805,6 +8758,15 @@ wrappy@1:
   resolved "https://registry.yarnpkg.com/wrappy/-/wrappy-1.0.2.tgz#b5243d8f3ec1aa35f1364605bc0d1036e30ab69f"
   integrity sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=
 
+write-file-atomic@^2.3.0:
+  version "2.4.3"
+  resolved "https://registry.yarnpkg.com/write-file-atomic/-/write-file-atomic-2.4.3.tgz#1fd2e9ae1df3e75b8d8c367443c692d4ca81f481"
+  integrity sha512-GaETH5wwsX+GcnzhPgKcKjJ6M2Cq3/iZp1WyY/X1CSqrW+jVNM9Y7D8EC2sM4ZG/V8wZlSniJnCKWPmBYAucRQ==
+  dependencies:
+    graceful-fs "^4.1.11"
+    imurmurhash "^0.1.4"
+    signal-exit "^3.0.2"
+
 write-file-atomic@^3.0.0:
   version "3.0.3"
   resolved "https://registry.yarnpkg.com/write-file-atomic/-/write-file-atomic-3.0.3.tgz#56bd5c5a5c70481cd19c571bd39ab965a5de56e8"
@@ -8823,6 +8785,13 @@ ws@^1.1.0, ws@^1.1.5:
     options ">=0.0.5"
     ultron "1.0.x"
 
+ws@^6.1.4:
+  version "6.2.2"
+  resolved "https://registry.yarnpkg.com/ws/-/ws-6.2.2.tgz#dd5cdbd57a9979916097652d78f1cc5faea0c32e"
+  integrity sha512-zmhltoSR8u1cnDsD43TX59mzoMZsLKqUweyYBAIvTngR3shc0W6aOZylZmq/7hqyVxPdi+5Ud2QInblgyE72fw==
+  dependencies:
+    async-limiter "~1.0.0"
+
 ws@^7, ws@^7.4.4:
   version "7.4.5"
   resolved "https://registry.yarnpkg.com/ws/-/ws-7.4.5.tgz#a484dd851e9beb6fdb420027e3885e8ce48986c1"
@@ -8868,11 +8837,6 @@ xmldom@^0.5.0:
   resolved "https://registry.yarnpkg.com/xmldom/-/xmldom-0.5.0.tgz#193cb96b84aa3486127ea6272c4596354cb4962e"
   integrity sha512-Foaj5FXVzgn7xFzsKeNIde9g6aFBxTPi37iwsno8QvApmtg7KYrr+OPyRHcJF7dud2a5nGRBXK3n0dL62Gf7PA==
 
-xpipe@^1.0.5:
-  version "1.0.5"
-  resolved "https://registry.yarnpkg.com/xpipe/-/xpipe-1.0.5.tgz#8dd8bf45fc3f7f55f0e054b878f43a62614dafdf"
-  integrity sha1-jdi/Rfw/f1Xw4FS4ePQ6YmFNr98=
-
 xtend@~4.0.1:
   version "4.0.2"
   resolved "https://registry.yarnpkg.com/xtend/-/xtend-4.0.2.tgz#bb72779f5fa465186b1f438f674fa347fdb5db54"
@@ -8888,11 +8852,6 @@ y18n@^5.0.5:
   resolved "https://registry.yarnpkg.com/y18n/-/y18n-5.0.8.tgz#7f4934d0f7ca8c56f95314939ddcd2dd91ce1d55"
   integrity sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==
 
-yallist@^2.1.2:
-  version "2.1.2"
-  resolved "https://registry.yarnpkg.com/yallist/-/yallist-2.1.2.tgz#1c11f9218f076089a47dd512f93c6699a6a81d52"
-  integrity sha1-HBH5IY8HYImkfdUS+TxmmaaoHVI=
-
 yallist@^4.0.0:
   version "4.0.0"
   resolved "https://registry.yarnpkg.com/yallist/-/yallist-4.0.0.tgz#9bb92790d9c0effec63be73519e11a35019a3a72"
@@ -8908,14 +8867,6 @@ yargs-parser@20.2.7, yargs-parser@^20.2.2, yargs-parser@^20.2.3:
   resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-20.2.7.tgz#61df85c113edfb5a7a4e36eb8aa60ef423cbc90a"
   integrity sha512-FiNkvbeHzB/syOjIUxFDCnhSfzAL8R5vs40MgLFBorXACCOAEaWu0gRZl14vG8MR9AOJIZbmkjhusqBYZ3HTHw==
 
-yargs-parser@^15.0.1:
-  version "15.0.1"
-  resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-15.0.1.tgz#54786af40b820dcb2fb8025b11b4d659d76323b3"
-  integrity sha512-0OAMV2mAZQrs3FkNpDQcBk1x5HXb8X4twADss4S0Iuk+2dGnLOE/fRHrsYm542GduMveyA77OF4wrNJuanRCWw==
-  dependencies:
-    camelcase "^5.0.0"
-    decamelize "^1.2.0"
-
 yargs-parser@^18.1.2:
   version "18.1.3"
   resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.3.tgz#be68c4975c6b2abf469236b0c870362fab09a7b0"
@@ -8924,24 +8875,7 @@ yargs-parser@^18.1.2:
     camelcase "^5.0.0"
     decamelize "^1.2.0"
 
-yargs@^14.2.0:
-  version "14.2.3"
-  resolved "https://registry.yarnpkg.com/yargs/-/yargs-14.2.3.tgz#1a1c3edced1afb2a2fea33604bc6d1d8d688a414"
-  integrity sha512-ZbotRWhF+lkjijC/VhmOT9wSgyBQ7+zr13+YLkhfsSiTriYsMzkTUFP18pFhWwBeMa5gUc1MzbhrO6/VB7c9Xg==
-  dependencies:
-    cliui "^5.0.0"
-    decamelize "^1.2.0"
-    find-up "^3.0.0"
-    get-caller-file "^2.0.1"
-    require-directory "^2.1.1"
-    require-main-filename "^2.0.0"
-    set-blocking "^2.0.0"
-    string-width "^3.0.0"
-    which-module "^2.0.0"
-    y18n "^4.0.0"
-    yargs-parser "^15.0.1"
-
-yargs@^15.1.0, yargs@^15.4.1:
+yargs@^15.1.0, yargs@^15.3.1, yargs@^15.4.1:
   version "15.4.1"
   resolved "https://registry.yarnpkg.com/yargs/-/yargs-15.4.1.tgz#0d87a16de01aee9d8bec2bfbf74f67851730f4f8"
   integrity sha512-aePbxDmcYW++PaqBsJ+HYUFwCdv4LVvdnhBy78E57PIor8/OVvhMrADFFEDh8DHDFRv/O9i3lPhsENjO7QX0+A==
diff --git a/js/tsconfig.json b/js/tsconfig.json
index b685a91da2..0d51436845 100644
--- a/js/tsconfig.json
+++ b/js/tsconfig.json
@@ -16,8 +16,6 @@
     "strictNullChecks": true,
     "pretty": true,
     "allowUnreachableCode": false,
-    "experimentalDecorators": true,
-    "downlevelIteration": true,
     "incremental": true
   }
 }
diff --git a/js/web/.npmignore b/js/web/.npmignore
index 49f3adcad4..e21a906c9a 100644
--- a/js/web/.npmignore
+++ b/js/web/.npmignore
@@ -2,6 +2,8 @@
 /script/
 /test/
 
+/dist/**/*.report.html
+
 /types/**/*.d.ts
 !/types/lib/**/*.d.ts
 
diff --git a/js/web/README.md b/js/web/README.md
index c6e8f5e116..e81cbc234c 100644
--- a/js/web/README.md
+++ b/js/web/README.md
@@ -12,7 +12,7 @@ The [Open Neural Network Exchange](http://onnx.ai/) (ONNX) is an open standard f
 
 With ONNX Runtime Web, web developers can score models directly on browsers with various benefits including reducing server-client communication and protecting user privacy, as well as offering install-free and cross-platform in-browser ML experience.
 
-ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web complies the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](http://www.onnxruntime.ai/docs/how-to/deploy-on-mobile.html). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend.
+ONNX Runtime Web can run on both CPU and GPU. On the CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web compiles the native ONNX Runtime CPU engine into a WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We keep improving op coverage and optimizing performance in the WebGL backend.
 
 See [Compatibility](#Compatibility) and [Operators Supported](#Operators) for a list of platforms and operators ONNX Runtime Web currently supports.
 
diff --git a/js/web/lib/build-def.ts b/js/web/lib/build-def.ts
new file mode 100644
index 0000000000..687b5aefcf
--- /dev/null
+++ b/js/web/lib/build-def.ts
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+/* eslint-disable @typescript-eslint/no-unused-vars, @typescript-eslint/naming-convention */
+
+/**
+ * The interface BuildDefinitions contains a set of flags which are defined at build time.
+ *
+ * Those flags are processed in terser for tree shaking to remove unused code.
+ * No flags in this file should be present in the production build.
+ */
+interface BuildDefinitions {
+  /**
+   * defines whether to disable the whole WebGL backend in the build.
+   */
+  DISABLE_WEBGL: boolean;
+  /**
+   * defines whether to disable the whole WebAssembly backend in the build.
+   */
+  DISABLE_WASM: boolean;
+  /**
+   * defines whether to disable proxy feature in WebAssembly backend in the build.
+   */
+  DISABLE_WASM_PROXY: boolean;
+  /**
+   * defines whether to disable multi-threading feature in WebAssembly backend in the build.
+   */
+  DISABLE_WASM_THREAD: boolean;
+}
+
+declare let BUILD_DEFS: BuildDefinitions;
diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts
index 970bd02fd3..fea2cd17e8 100644
--- a/js/web/lib/index.ts
+++ b/js/web/lib/index.ts
@@ -1,10 +1,19 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+/* eslint-disable @typescript-eslint/no-var-requires, @typescript-eslint/no-require-imports */
+// We use "require" instead of "import" here because import statements must be at the top level. Our current code
+// does not allow terser to tree-shake as expected because some code is treated as having side effects.
+// So we load the code inside the if-clause to allow terser to remove it safely.
+
 export * from 'onnxruntime-common';
 import {registerBackend} from 'onnxruntime-common';
-import {onnxjsBackend} from './backend-onnxjs';
-import {wasmBackend} from './backend-wasm';
 
-registerBackend('webgl', onnxjsBackend, -1);
-registerBackend('wasm', wasmBackend, 0);
+if (!BUILD_DEFS.DISABLE_WEBGL) {
+  const onnxjsBackend = require('./backend-onnxjs').onnxjsBackend;
+  registerBackend('webgl', onnxjsBackend, -1);
+}
+if (!BUILD_DEFS.DISABLE_WASM) {
+  const wasmBackend = require('./backend-wasm').wasmBackend;
+  registerBackend('wasm', wasmBackend, 0);
+}
diff --git a/js/web/lib/onnxjs/attribute.ts b/js/web/lib/onnxjs/attribute.ts
index 5b1b926476..2fda9ee334 100644
--- a/js/web/lib/onnxjs/attribute.ts
+++ b/js/web/lib/onnxjs/attribute.ts
@@ -7,7 +7,7 @@ import {onnxruntime} from './ort-schema/ort-generated';
 import ortFbs = onnxruntime.experimental.fbs;
 
 import {Tensor} from './tensor';
-import {LongUtil} from './util';
+import {decodeUtf8String, LongUtil} from './util';
 
 export declare namespace Attribute {
   export interface DataTypeMap {
@@ -171,7 +171,7 @@ export class Attribute {
       // string attributes are returned as string, so no conversion is needed.
       if (attr instanceof onnx.AttributeProto) {
         const utf8String = value as Uint8Array;
-        return Buffer.from(utf8String.buffer, utf8String.byteOffset, utf8String.byteLength).toString();
+        return decodeUtf8String(utf8String);
       }
     }
 
@@ -181,8 +181,7 @@ export class Attribute {
       // format strings attributes are returned as string[], so no conversion is needed.
       if (attr instanceof onnx.AttributeProto) {
         const utf8Strings = value as Uint8Array[];
-        return utf8Strings.map(
-            utf8String => Buffer.from(utf8String.buffer, utf8String.byteOffset, utf8String.byteLength).toString());
+        return utf8Strings.map(decodeUtf8String);
       }
     }
 
diff --git a/js/web/lib/onnxjs/session.ts b/js/web/lib/onnxjs/session.ts
index 2978aaa4e5..3a27a424e7 100644
--- a/js/web/lib/onnxjs/session.ts
+++ b/js/web/lib/onnxjs/session.ts
@@ -63,7 +63,7 @@ export class Session {
         if (typeof fetch === 'undefined') {
           // node
           const buf = await promisify(readFile)(arg);
-          this.initialize(Buffer.from(buf), isOrtFormat);
+          this.initialize(buf, isOrtFormat);
         } else {
           // browser
           const response = await fetch(arg);
diff --git a/js/web/lib/onnxjs/tensor.ts b/js/web/lib/onnxjs/tensor.ts
index df19d67caa..42757d0ef7 100644
--- a/js/web/lib/onnxjs/tensor.ts
+++ b/js/web/lib/onnxjs/tensor.ts
@@ -9,7 +9,7 @@ import {onnxruntime} from './ort-schema/ort-generated';
 
 import ortFbs = onnxruntime.experimental.fbs;
 
-import {ProtoUtil, ShapeUtil} from './util';
+import {decodeUtf8String, ProtoUtil, ShapeUtil} from './util';
 
 export declare namespace Tensor {
   export interface DataTypeMap {
@@ -217,8 +217,7 @@ export class Tensor {
       // When it's STRING type, the value should always be stored in field
       // 'stringData'
       tensorProto.stringData!.forEach((str, i) => {
-        const buf = Buffer.from(str.buffer, str.byteOffset, str.byteLength);
-        value.data[i] = buf.toString();
+        value.data[i] = decodeUtf8String(str);
       });
 
     } else if (
diff --git a/js/web/lib/onnxjs/util.ts b/js/web/lib/onnxjs/util.ts
index 73d3ca88fc..2e2d49ae4b 100644
--- a/js/web/lib/onnxjs/util.ts
+++ b/js/web/lib/onnxjs/util.ts
@@ -1249,3 +1249,7 @@ export class PoolConvUtil {
 
 export const MIN_CLIP = -3.4028234663852886e+38;
 export const MAX_CLIP = 3.4028234663852886e+38;
+
+export function decodeUtf8String(buffer: Uint8Array): string {
+  return new TextDecoder().decode(buffer);
+}
diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts
index 105f46c43f..3b20f107e8 100644
--- a/js/web/lib/wasm/proxy-wrapper.ts
+++ b/js/web/lib/wasm/proxy-wrapper.ts
@@ -82,7 +82,7 @@ const onProxyWorkerMessage = (ev: MessageEvent<OrtWasmMessage>): void => {
 const scriptSrc = typeof document !== 'undefined' ? (document?.currentScript as HTMLScriptElement)?.src : undefined;
 
 export const initWasm = async(): Promise<void> => {
-  if (isProxy()) {
+  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
     if (initialized) {
       return;
     }
@@ -118,7 +118,7 @@ export const initWasm = async(): Promise<void> => {
 };
 
 export const initOrt = async(numThreads: number, loggingLevel: number): Promise<void> => {
-  if (isProxy()) {
+  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
     ensureWorker();
     return new Promise<void>((resolve, reject) => {
       initOrtCallbacks = [resolve, reject];
@@ -132,7 +132,7 @@ export const initOrt = async(numThreads: number, loggingLevel: number): Promise<
 
 export const createSession =
     async(model: Uint8Array, options?: InferenceSession.SessionOptions): Promise<SerializableSessionMetadata> => {
-  if (isProxy()) {
+  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
     ensureWorker();
     return new Promise<SerializableSessionMetadata>((resolve, reject) => {
       createSessionCallbacks.push([resolve, reject]);
@@ -145,7 +145,7 @@ export const createSession =
 };
 
 export const releaseSession = async(sessionId: number): Promise<void> => {
-  if (isProxy()) {
+  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
     ensureWorker();
     return new Promise<void>((resolve, reject) => {
       releaseSessionCallbacks.push([resolve, reject]);
@@ -160,7 +160,7 @@ export const releaseSession = async(sessionId: number): Promise<void> => {
 export const run = async(
     sessionId: number, inputIndices: number[], inputs: SerializableTensor[], outputIndices: number[],
     options: InferenceSession.RunOptions): Promise<SerializableTensor[]> => {
-  if (isProxy()) {
+  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
     ensureWorker();
     return new Promise<SerializableTensor[]>((resolve, reject) => {
       runCallbacks.push([resolve, reject]);
@@ -173,7 +173,7 @@ export const run = async(
 };
 
 export const endProfiling = async(sessionId: number): Promise<void> => {
-  if (isProxy()) {
+  if (!BUILD_DEFS.DISABLE_WASM_PROXY && isProxy()) {
     ensureWorker();
     return new Promise<void>((resolve, reject) => {
       endProfilingCallbacks.push([resolve, reject]);
diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts
index 2a306c8c1d..056533c415 100644
--- a/js/web/lib/wasm/wasm-factory.ts
+++ b/js/web/lib/wasm/wasm-factory.ts
@@ -6,8 +6,10 @@ import * as path from 'path';
 
 import {OrtWasmModule} from './binding/ort-wasm';
 import {OrtWasmThreadedModule} from './binding/ort-wasm-threaded';
-import ortWasmFactoryThreaded from './binding/ort-wasm-threaded.js';
 import ortWasmFactory from './binding/ort-wasm.js';
+const ortWasmFactoryThreaded: EmscriptenModuleFactory<OrtWasmModule> =
+    // eslint-disable-next-line @typescript-eslint/no-require-imports
+    !BUILD_DEFS.DISABLE_WASM_THREAD ? require('./binding/ort-wasm-threaded.js') : ortWasmFactory;
 
 let wasm: OrtWasmModule|undefined;
 let initialized = false;
@@ -116,7 +118,8 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise
     const factory = useThreads ? ortWasmFactoryThreaded : ortWasmFactory;
     const config: Partial<OrtWasmModule> = {
       locateFile: (fileName: string, scriptDirectory: string) => {
-        if (fileName.endsWith('.worker.js') && typeof Blob !== 'undefined') {
+        if (!BUILD_DEFS.DISABLE_WASM_THREAD && useThreads && fileName.endsWith('.worker.js') &&
+            typeof Blob !== 'undefined') {
           return URL.createObjectURL(new Blob(
               [
                 // This require() function is handled by webpack to load file content of the corresponding .worker.js
@@ -135,12 +138,11 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise
       }
     };
 
-    if (useThreads) {
+    if (!BUILD_DEFS.DISABLE_WASM_THREAD && useThreads) {
       if (typeof Blob === 'undefined') {
         config.mainScriptUrlOrBlob = path.join(__dirname, 'ort-wasm-threaded.js');
       } else {
-        const scriptSourceCode =
-            `var ortWasmThreaded=(function(){var _scriptDir;return ${ortWasmFactoryThreaded.toString()}})();`;
+        const scriptSourceCode = `var ortWasmThreaded=(function(){var _scriptDir;return ${factory.toString()}})();`;
         config.mainScriptUrlOrBlob = new Blob([scriptSourceCode], {type: 'text/javascript'});
       }
     }
diff --git a/js/web/package-lock.json b/js/web/package-lock.json
index a0a3422480..85fdecc33c 100644
--- a/js/web/package-lock.json
+++ b/js/web/package-lock.json
@@ -17,15 +17,15 @@
       "dev": true
     },
     "@electron/get": {
-      "version": "1.12.4",
-      "resolved": "https://registry.npmjs.org/@electron/get/-/get-1.12.4.tgz",
-      "integrity": "sha512-6nr9DbJPUR9Xujw6zD3y+rS95TyItEVM0NVjt1EehY2vUWfIgPiIPVHxCvaTS0xr2B+DRxovYVKbuOWqC35kjg==",
+      "version": "1.13.1",
+      "resolved": "https://registry.npmjs.org/@electron/get/-/get-1.13.1.tgz",
+      "integrity": "sha512-U5vkXDZ9DwXtkPqlB45tfYnnYBN8PePp1z/XDCupnSpdrxT8/ThCv9WCwPLf9oqiSGZTkH6dx2jDUPuoXpjkcA==",
       "dev": true,
       "requires": {
         "debug": "^4.1.1",
         "env-paths": "^2.2.0",
         "fs-extra": "^8.1.0",
-        "global-agent": "^2.0.2",
+        "global-agent": "^3.0.0",
         "global-tunnel-ng": "^2.7.1",
         "got": "^9.6.0",
         "progress": "^2.0.3",
@@ -87,6 +87,12 @@
         "fastq": "^1.6.0"
       }
     },
+    "@polka/url": {
+      "version": "1.0.0-next.21",
+      "resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.21.tgz",
+      "integrity": "sha512-a5Sab1C4/icpTZVzZc5Ghpz88yQtGOyNqYXcZgOssB2uuAr+wF/MvN6bgtW32q7HHrvBki+BsZ0OuNv6EV3K9g==",
+      "dev": true
+    },
     "@protobufjs/aspromise": {
       "version": "1.1.2",
       "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
@@ -485,6 +491,12 @@
       "integrity": "sha512-Ibt84YwBDDA890eDiDCEqcbwvHlBvzzDkU2cGBBDDI1QWT12jTiXIOn2CIw5KK4i6N5Z2HUxwYjzriDyqaqqZg==",
       "dev": true
     },
+    "acorn-walk": {
+      "version": "8.2.0",
+      "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz",
+      "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==",
+      "dev": true
+    },
     "agent-base": {
       "version": "4.3.0",
       "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-4.3.0.tgz",
@@ -714,9 +726,9 @@
       }
     },
     "boolean": {
-      "version": "3.0.3",
-      "resolved": "https://registry.npmjs.org/boolean/-/boolean-3.0.3.tgz",
-      "integrity": "sha512-EqrTKXQX6Z3A2nRmMEIlAIfjQOgFnVO2nqZGpbcsPnYGWBwpFqzlrozU1dy+S2iqfYDLh26ef4KrgTxu9xQrxA==",
+      "version": "3.1.4",
+      "resolved": "https://registry.npmjs.org/boolean/-/boolean-3.1.4.tgz",
+      "integrity": "sha512-3hx0kwU3uzG6ReQ3pnaFQPSktpBw6RHN3/ivDKEuU8g1XSfafowyvDnadjv1xp8IZqhtSukxlwv9bF6FhX8m0w==",
       "dev": true,
       "optional": true
     },
@@ -918,9 +930,9 @@
       "dev": true
     },
     "buffer-from": {
-      "version": "1.1.1",
-      "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.1.tgz",
-      "integrity": "sha512-MQcXEUbCKtEo7bhqEs6560Hyd4XaovZlO/k9V3hjVUF/zwW7KBVdSK4gIt/bzwS9MbR5qob+F5jusZsb0YQK2A==",
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz",
+      "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
       "dev": true
     },
     "buffer-xor": {
@@ -1216,9 +1228,9 @@
       }
     },
     "config-chain": {
-      "version": "1.1.12",
-      "resolved": "https://registry.npmjs.org/config-chain/-/config-chain-1.1.12.tgz",
-      "integrity": "sha512-a1eOIcu8+7lUInge4Rpf/n4Krkf3Dd9lqhljRzII1/Zno/kRtUWnznPO3jOKBmTEktkt3fkxisUcivoj0ebzoA==",
+      "version": "1.1.13",
+      "resolved": "https://registry.npmjs.org/config-chain/-/config-chain-1.1.13.tgz",
+      "integrity": "sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==",
       "dev": true,
       "optional": true,
       "requires": {
@@ -1291,13 +1303,6 @@
       "integrity": "sha512-ZwrFkGJxUR3EIoXtO+yVE69Eb7KlixbaeAWfBQB9vVsNn/o+Yw69gBWSSDK825hQNdN+wF8zELf3dFNl/kxkUA==",
       "dev": true
     },
-    "core-js": {
-      "version": "3.10.1",
-      "resolved": "https://registry.npmjs.org/core-js/-/core-js-3.10.1.tgz",
-      "integrity": "sha512-pwCxEXnj27XG47mu7SXAwhLP3L5CrlvCB91ANUkIz40P27kUcvNfSdvyZJ9CLHiVoKSp+TTChMQMSKQEH/IQxA==",
-      "dev": true,
-      "optional": true
-    },
     "core-util-is": {
       "version": "1.0.2",
       "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz",
@@ -1482,9 +1487,9 @@
       }
     },
     "detect-node": {
-      "version": "2.0.5",
-      "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.0.5.tgz",
-      "integrity": "sha512-qi86tE6hRcFHy8jI1m2VG+LaPUR1LhqDa5G8tVjuUXmOrpuAgqsA1pN0+ldgr3aKUH+QLI9hCY/OcRYisERejw==",
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/detect-node/-/detect-node-2.1.0.tgz",
+      "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==",
       "dev": true,
       "optional": true
     },
@@ -1581,9 +1586,9 @@
       "dev": true
     },
     "electron": {
-      "version": "12.0.2",
-      "resolved": "https://registry.npmjs.org/electron/-/electron-12.0.2.tgz",
-      "integrity": "sha512-14luh9mGzfL4e0sncyy0+kW37IU7Y0Y1tvI97FDRSW0ZBQxi5cmAwSs5dmPmNBFBIGtzkaGaEB01j9RjZuCmow==",
+      "version": "12.2.3",
+      "resolved": "https://registry.npmjs.org/electron/-/electron-12.2.3.tgz",
+      "integrity": "sha512-B27c7eqx1bC5kea6An8oVhk1pShNC4VGqWarHMhD47MDtmg54KepHO5AbAvmKKZK/jWN7NTC7wyCYTDElJNtQA==",
       "dev": true,
       "requires": {
         "@electron/get": "^1.0.1",
@@ -2182,14 +2187,13 @@
       "dev": true
     },
     "global-agent": {
-      "version": "2.2.0",
-      "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-2.2.0.tgz",
-      "integrity": "sha512-+20KpaW6DDLqhG7JDiJpD1JvNvb8ts+TNl7BPOYcURqCrXqnN1Vf+XVOrkKJAFPqfX+oEhsdzOj1hLWkBTdNJg==",
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/global-agent/-/global-agent-3.0.0.tgz",
+      "integrity": "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q==",
       "dev": true,
       "optional": true,
       "requires": {
         "boolean": "^3.0.1",
-        "core-js": "^3.6.5",
         "es6-error": "^4.1.1",
         "matcher": "^3.0.0",
         "roarr": "^2.15.3",
@@ -2288,6 +2292,15 @@
       "resolved": "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz",
       "integrity": "sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ=="
     },
+    "gzip-size": {
+      "version": "6.0.0",
+      "resolved": "https://registry.npmjs.org/gzip-size/-/gzip-size-6.0.0.tgz",
+      "integrity": "sha512-ax7ZYomf6jqPTQ4+XCpUGyXKHk5WweS+e05MBO4/y3WJ5RkmPXNKvX+bx1behVILVwr6JSQvZAku021CHPXG3Q==",
+      "dev": true,
+      "requires": {
+        "duplexer": "^0.1.2"
+      }
+    },
     "has": {
       "version": "1.0.3",
       "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz",
@@ -3623,6 +3636,12 @@
     "onnxruntime-common": {
       "version": "file:../common"
     },
+    "opener": {
+      "version": "1.5.2",
+      "resolved": "https://registry.npmjs.org/opener/-/opener-1.5.2.tgz",
+      "integrity": "sha512-ur5UIdyw5Y7yEj9wLzhqXiy6GZ3Mwx0yGI+5sMn2r0N0v3cKJvUmFH5yPP+WXh9e0xfyzyJX95D8l088DNFj7A==",
+      "dev": true
+    },
     "os-browserify": {
       "version": "0.3.0",
       "resolved": "https://registry.npmjs.org/os-browserify/-/os-browserify-0.3.0.tgz",
@@ -4266,6 +4285,17 @@
       "integrity": "sha512-VUJ49FC8U1OxwZLxIbTTrDvLnf/6TDgxZcK8wxR8zs13xpx7xbG60ndBlhNrFi2EMuFRoeDoJO7wthSLq42EjA==",
       "dev": true
     },
+    "sirv": {
+      "version": "1.0.18",
+      "resolved": "https://registry.npmjs.org/sirv/-/sirv-1.0.18.tgz",
+      "integrity": "sha512-f2AOPogZmXgJ9Ma2M22ZEhc1dNtRIzcEkiflMFeVTRq+OViOZMvH1IPMVOwrKaxpSaHioBJiDR0SluRqGa7atA==",
+      "dev": true,
+      "requires": {
+        "@polka/url": "^1.0.0-next.20",
+        "mime": "^2.3.1",
+        "totalist": "^1.0.0"
+      }
+    },
     "slash": {
       "version": "3.0.0",
       "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz",
@@ -4319,9 +4349,9 @@
       "dev": true
     },
     "source-map-support": {
-      "version": "0.5.19",
-      "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.19.tgz",
-      "integrity": "sha512-Wonm7zOCIJzBGQdB+thsPar0kYuCIzYvxZwlBa87yi/Mdjv7Tip2cyVbLj5o0cFPN4EVkuTwb3GDDyUx2DGnGw==",
+      "version": "0.5.21",
+      "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz",
+      "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==",
       "dev": true,
       "requires": {
         "buffer-from": "^1.0.0",
@@ -4581,14 +4611,14 @@
       }
     },
     "terser": {
-      "version": "5.7.0",
-      "resolved": "https://registry.npmjs.org/terser/-/terser-5.7.0.tgz",
-      "integrity": "sha512-HP5/9hp2UaZt5fYkuhNBR8YyRcT8juw8+uFbAme53iN9hblvKnLUTKkmwJG6ocWpIKf8UK4DoeWG4ty0J6S6/g==",
+      "version": "5.10.0",
+      "resolved": "https://registry.npmjs.org/terser/-/terser-5.10.0.tgz",
+      "integrity": "sha512-AMmF99DMfEDiRJfxfY5jj5wNH/bYO09cniSqhfoyxc8sFoYIgkJy86G04UoZU5VjlpnplVu0K6Tx6E9b5+DlHA==",
       "dev": true,
       "requires": {
         "commander": "^2.20.0",
         "source-map": "~0.7.2",
-        "source-map-support": "~0.5.19"
+        "source-map-support": "~0.5.20"
       },
       "dependencies": {
         "commander": {
@@ -4664,6 +4694,12 @@
       "integrity": "sha512-yaOH/Pk/VEhBWWTlhI+qXxDFXlejDGcQipMlyxda9nthulaxLZUNcUqFxokp0vcYnvteJln5FNQDRrxj3YcbVw==",
       "dev": true
     },
+    "totalist": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/totalist/-/totalist-1.1.0.tgz",
+      "integrity": "sha512-gduQwd1rOdDMGxFG1gEvhV88Oirdo2p+KjoYFU7k2g+i7n6AFFbDQ5kMPUsW0pNbfQsB/cwXvT1i4Bue0s9g5g==",
+      "dev": true
+    },
     "ts-loader": {
       "version": "9.1.2",
       "resolved": "https://registry.npmjs.org/ts-loader/-/ts-loader-9.1.2.tgz",
@@ -4941,6 +4977,80 @@
         "webpack-sources": "^2.1.1"
       }
     },
+    "webpack-bundle-analyzer": {
+      "version": "4.5.0",
+      "resolved": "https://registry.npmjs.org/webpack-bundle-analyzer/-/webpack-bundle-analyzer-4.5.0.tgz",
+      "integrity": "sha512-GUMZlM3SKwS8Z+CKeIFx7CVoHn3dXFcUAjT/dcZQQmfSZGvitPfMob2ipjai7ovFFqPvTqkEZ/leL4O0YOdAYQ==",
+      "dev": true,
+      "requires": {
+        "acorn": "^8.0.4",
+        "acorn-walk": "^8.0.0",
+        "chalk": "^4.1.0",
+        "commander": "^7.2.0",
+        "gzip-size": "^6.0.0",
+        "lodash": "^4.17.20",
+        "opener": "^1.5.2",
+        "sirv": "^1.0.7",
+        "ws": "^7.3.1"
+      },
+      "dependencies": {
+        "ansi-styles": {
+          "version": "4.3.0",
+          "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+          "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+          "dev": true,
+          "requires": {
+            "color-convert": "^2.0.1"
+          }
+        },
+        "chalk": {
+          "version": "4.1.2",
+          "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+          "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+          "dev": true,
+          "requires": {
+            "ansi-styles": "^4.1.0",
+            "supports-color": "^7.1.0"
+          }
+        },
+        "color-convert": {
+          "version": "2.0.1",
+          "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+          "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+          "dev": true,
+          "requires": {
+            "color-name": "~1.1.4"
+          }
+        },
+        "color-name": {
+          "version": "1.1.4",
+          "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+          "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+          "dev": true
+        },
+        "commander": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/commander/-/commander-7.2.0.tgz",
+          "integrity": "sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==",
+          "dev": true
+        },
+        "has-flag": {
+          "version": "4.0.0",
+          "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+          "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+          "dev": true
+        },
+        "supports-color": {
+          "version": "7.2.0",
+          "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+          "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+          "dev": true,
+          "requires": {
+            "has-flag": "^4.0.0"
+          }
+        }
+      }
+    },
     "webpack-cli": {
       "version": "4.7.0",
       "resolved": "https://registry.npmjs.org/webpack-cli/-/webpack-cli-4.7.0.tgz",
diff --git a/js/web/package.json b/js/web/package.json
index da4eecebdb..32f95d118f 100644
--- a/js/web/package.json
+++ b/js/web/package.json
@@ -43,9 +43,10 @@
     "@types/mocha": "^8.2.2",
     "@types/npmlog": "^4.1.2",
     "@types/platform": "^1.3.3",
+    "base64-js": "^1.5.1",
     "chai": "^4.3.4",
     "dir-compare": "^3.3.0",
-    "electron": "^12.0.2",
+    "electron": "^12.2.3",
     "fs-extra": "^9.1.0",
     "globby": "^11.0.3",
     "jszip": "^3.7.1",
@@ -67,9 +68,11 @@
     "npmlog": "^4.1.2",
     "numpy-parser": "^1.2.3",
     "strip-json-comments": "^3.1.1",
+    "terser": "^5.10.0",
     "ts-loader": "^9.1.2",
     "typescript": "^4.2.4",
     "webpack": "^5.36.2",
+    "webpack-bundle-analyzer": "^4.5.0",
     "webpack-cli": "^4.7.0",
     "worker-loader": "^3.0.8"
   },
diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index bfb21473a6..e3f9723f26 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -23,6 +23,12 @@ if (['prod', 'dev', 'perf', 'node'].indexOf(MODE) === -1) {
 // --no-wasm
 const WASM = typeof args.wasm === 'undefined' ? true : !!args.wasm;
 
+// -a; --analyzer
+const ANALYZER = !!args.a || !!args.analyzer;
+
+// -f; --filter=<regex>
+const FILTER = args.f || args.filter;
+
 // Path variables
 const WASM_BINDING_FOLDER = path.join(__dirname, '..', 'lib', 'wasm', 'binding');
 const WASM_BINDING_JS_PATH = path.join(WASM_BINDING_FOLDER, 'ort-wasm.js');
@@ -134,6 +140,12 @@ npmlog.info('Build', 'Building bundle...');
   npmlog.info('Build.Bundle', 'Running webpack to generate bundles...');
   const webpackCommand = path.join(npmBin, 'webpack');
   const webpackArgs = ['--env', `--bundle-mode=${MODE}`];
+  if (ANALYZER) {
+    webpackArgs.push('--env', '-a');
+  }
+  if (FILTER) {
+    webpackArgs.push('--env', `-f=${FILTER}`);
+  }
   npmlog.info('Build.Bundle', `CMD: ${webpackCommand} ${webpackArgs.join(' ')}`);
   const webpack = spawnSync(webpackCommand, webpackArgs, {shell: true, stdio: 'inherit'});
   if (webpack.status !== 0) {
diff --git a/js/web/test/e2e/browser-test-wasm-no-threads-proxy.js b/js/web/test/e2e/browser-test-wasm-proxy-no-threads.js
similarity index 80%
rename from js/web/test/e2e/browser-test-wasm-no-threads-proxy.js
rename to js/web/test/e2e/browser-test-wasm-proxy-no-threads.js
index d9e4a66d1d..83bdfdcde9 100644
--- a/js/web/test/e2e/browser-test-wasm-no-threads-proxy.js
+++ b/js/web/test/e2e/browser-test-wasm-proxy-no-threads.js
@@ -3,7 +3,7 @@
 
 'use strict';
 
-it('Browser E2E testing - WebAssembly backend (no threads, proxy)', async function () {
+it('Browser E2E testing - WebAssembly backend (proxy, no threads)', async function () {
   ort.env.wasm.numThreads = 1;
   ort.env.wasm.proxy = true;
   await testFunction(ort, { executionProviders: ['wasm'] });
diff --git a/js/web/test/e2e/karma.conf.js b/js/web/test/e2e/karma.conf.js
index 42ca6e0313..7607f0d6a9 100644
--- a/js/web/test/e2e/karma.conf.js
+++ b/js/web/test/e2e/karma.conf.js
@@ -5,6 +5,7 @@
 
 const args = require('minimist')(process.argv.slice(2));
 const SELF_HOST = !!args['self-host'];
+const ORT_MAIN = typeof args['ort-main'] === 'string' ? args['ort-main'] : 'ort.min.js';  // bundle file loaded from dist; falls back to 'ort.min.js' when --ort-main is not given
 const TEST_MAIN = args['test-main'];
 if (typeof TEST_MAIN !== 'string') {
   throw new Error('flag --test-main=<TEST_MAIN_JS_FILE> is required');
@@ -19,7 +20,7 @@ module.exports = function (config) {
   config.set({
     frameworks: ['mocha'],
     files: [
-      { pattern: distPrefix + 'ort.min.js' },
+      { pattern: distPrefix + ORT_MAIN },
       { pattern: './common.js' },
       { pattern: TEST_MAIN },
       { pattern: './node_modules/onnxruntime-web/dist/*.wasm', included: false, nocache: true },
diff --git a/js/web/test/e2e/run.js b/js/web/test/e2e/run.js
index 5b2544a1af..9a681ccd19 100644
--- a/js/web/test/e2e/run.js
+++ b/js/web/test/e2e/run.js
@@ -96,19 +96,24 @@ async function testAllNodejsCases() {
 }
 
 async function testAllBrowserCases({ hostInKarma }) {
-  await runKarma({ hostInKarma, main: './browser-test-webgl.js', browser: 'Chrome_default' });
-  await runKarma({ hostInKarma, main: './browser-test-wasm.js', browser: 'Chrome_default' });
-  await runKarma({ hostInKarma, main: './browser-test-wasm-no-threads.js', browser: 'Chrome_default' });
-  await runKarma({ hostInKarma, main: './browser-test-wasm-proxy.js', browser: 'Chrome_default' });
-  await runKarma({ hostInKarma, main: './browser-test-wasm-no-threads-proxy.js', browser: 'Chrome_default' });
-  await runKarma({ hostInKarma, main: './browser-test-wasm-path-override-filename.js', browser: 'Chrome_default' });
-  await runKarma({ hostInKarma, main: './browser-test-wasm-path-override-prefix.js', browser: 'Chrome_default' });
+  await runKarma({ hostInKarma, main: './browser-test-webgl.js'});
+  await runKarma({ hostInKarma, main: './browser-test-webgl.js', ortMain: 'ort.webgl.min.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm.js', ortMain: 'ort.wasm.min.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-no-threads.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-no-threads.js', ortMain: 'ort.wasm-core.min.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-proxy.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-proxy-no-threads.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-path-override-filename.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-path-override-filename.js', ortMain: 'ort.wasm.min.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-path-override-prefix.js'});
+  await runKarma({ hostInKarma, main: './browser-test-wasm-path-override-prefix.js', ortMain: 'ort.wasm.min.js'});
 }
 
-async function runKarma({ hostInKarma, main, browser }) {
+async function runKarma({ hostInKarma, main, browser = 'Chrome_default', ortMain = 'ort.min.js' }) {
   const selfHostFlag = hostInKarma ? '--self-host' : '';
   await runInShell(
-    `npx karma start --single-run --browsers ${browser} ${selfHostFlag} --test-main=${main} --user-data=${getNextUserDataDir()}`);
+    `npx karma start --single-run --browsers ${browser} ${selfHostFlag} --ort-main=${ortMain} --test-main=${main} --user-data=${getNextUserDataDir()}`);
 }
 
 async function runInShell(cmd) {
diff --git a/js/web/test/e2e/simple-http-server.js b/js/web/test/e2e/simple-http-server.js
index 83866dfc31..bf5c8ccf55 100644
--- a/js/web/test/e2e/simple-http-server.js
+++ b/js/web/test/e2e/simple-http-server.js
@@ -24,6 +24,9 @@ const validRequests = {
   // .js files
   '/dist/ort.min.js': ['dist/ort.min.js', 'text/javascript'],
   '/dist/ort.js': ['dist/ort.js', 'text/javascript'],
+  '/dist/ort.webgl.min.js': ['dist/ort.webgl.min.js', 'text/javascript'],
+  '/dist/ort.wasm.min.js': ['dist/ort.wasm.min.js', 'text/javascript'],
+  '/dist/ort.wasm-core.min.js': ['dist/ort.wasm-core.min.js', 'text/javascript'],
 };
 
 module.exports = function (dir) {
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index 68571db37a..fa4f4d0413 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -43,20 +43,20 @@ function fromInternalTensor(tensor: Tensor): ort.Tensor {
   return new ort.Tensor(tensor.type, tensor.data as ort.Tensor.DataType, tensor.dims);
 }
 
-async function loadFile(uri: string): Promise<Uint8Array|ArrayBuffer> {
+async function loadFile(uri: string): Promise<Uint8Array> {
   if (typeof fetch === 'undefined') {
     // node
     return promisify(readFile)(uri);
   } else {
     // browser
     const response = await fetch(uri);
-    return response.arrayBuffer();
+    return new Uint8Array(await response.arrayBuffer());
   }
 }
 
 async function loadTensorProto(uriOrData: string|Uint8Array): Promise<Test.NamedTensor> {
   const buf = (typeof uriOrData === 'string') ? await loadFile(uriOrData) : uriOrData;
-  const tensorProto = onnxProto.TensorProto.decode(Buffer.from(buf));
+  const tensorProto = onnxProto.TensorProto.decode(buf);
   const tensor = Tensor.fromProto(tensorProto);
   // add property 'name' to the tensor object.
   const namedTensor = fromInternalTensor(tensor) as unknown as Test.NamedTensor;
diff --git a/js/web/test/test-shared.ts b/js/web/test/test-shared.ts
index f2ea1892aa..41a485bb3d 100644
--- a/js/web/test/test-shared.ts
+++ b/js/web/test/test-shared.ts
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+import * as base64 from 'base64-js';
 import * as fs from 'fs';
 import {promisify} from 'util';
 
@@ -8,11 +9,11 @@ import {Attribute} from '../lib/onnxjs/attribute';
 import {Graph} from '../lib/onnxjs/graph';
 
 export function base64toBuffer(data: string): Uint8Array {
-  return Buffer.from(data, 'base64');
+  return base64.toByteArray(data);
 }
 
 export function bufferToBase64(buffer: Uint8Array): string {
-  return Buffer.from(buffer).toString('base64');
+  return base64.fromByteArray(buffer);
 }
 
 async function readFile(file: string) {
@@ -22,14 +23,13 @@ async function readFile(file: string) {
   } else {
     // browser
     const response = await fetch(file);
-    const buffer = await response.arrayBuffer();
-    return Buffer.from(buffer);
+    return new Uint8Array(await response.arrayBuffer());
   }
 }
 
 export async function readJsonFile(file: string): Promise<any> {
   const content = await readFile(file);
-  return JSON.parse(content.toString());
+  return JSON.parse(new TextDecoder().decode(content));
 }
 
 /**
diff --git a/js/web/tsconfig.json b/js/web/tsconfig.json
index 5700f76f73..7789133ce5 100644
--- a/js/web/tsconfig.json
+++ b/js/web/tsconfig.json
@@ -2,6 +2,7 @@
   "extends": "../tsconfig.json",
   "compilerOptions": {
     "module": "CommonJS",
+    "downlevelIteration": true,
     "declarationDir": "./types",
     "lib": ["DOM"]
   },
diff --git a/js/web/webpack.config.js b/js/web/webpack.config.js
index 0d495adb9b..79de304ccd 100644
--- a/js/web/webpack.config.js
+++ b/js/web/webpack.config.js
@@ -5,10 +5,17 @@
 
 const path = require('path');
 const webpack = require('webpack');
+const BundleAnalyzerPlugin = require('webpack-bundle-analyzer').BundleAnalyzerPlugin;
 const NodePolyfillPlugin = require('node-polyfill-webpack-plugin');
 const TerserPlugin = require("terser-webpack-plugin");
 const minimist = require('minimist');
 
+// commandline args (parsed once at module load so every config builder below can read them)
+const args = minimist(process.argv);
+const bundleMode = args['bundle-mode'] || 'prod';  // 'prod'|'dev'|'perf'|'node'|undefined;
+const useAnalyzer = !!args.a || !!args['use-analyzer'];  // -a, --use-analyzer
+const filter = args.f || args.filter;  // -f, --filter=<regex>: keep only builds whose output filename matches
+
 const VERSION = require(path.join(__dirname, 'package.json')).version;
 const COPYRIGHT_BANNER = `/*!
 * ONNX Runtime Web v${VERSION}
@@ -16,10 +23,25 @@ const COPYRIGHT_BANNER = `/*!
 * Licensed under the MIT License.
 */`;
 
-function defaultTerserPluginOptions() {
+// Maps a webpack ES target name to the value used for terser's `ecma` option; throws RangeError otherwise.
+function terserEcmaVersionFromWebpackTarget(target) {
+  const ecmaByTarget = new Map([
+    ['es5', 5],
+    ['es6', 2015],
+    ['es2015', 2015],
+    ['es2017', 2017],
+  ]);
+  if (!ecmaByTarget.has(target)) {
+    throw new RangeError(`not supported ECMA version: ${target}`);
+  }
+  return ecmaByTarget.get(target);
+}
+
+function defaultTerserPluginOptions(target) {
   return {
     extractComments: false,
     terserOptions: {
+      ecma: terserEcmaVersionFromWebpackTarget(target),
       format: {
         comments: false,
       },
@@ -33,8 +55,15 @@ function defaultTerserPluginOptions() {
   };
 }
 
+const DEFAULT_BUILD_DEFS = {
+  DISABLE_WEBGL: false,
+  DISABLE_WASM: false,
+  DISABLE_WASM_PROXY: false,
+  DISABLE_WASM_THREAD: false,
+};
+
 // common config for release bundle
-function buildConfig({ filename, format, target, mode, devtool }) {
+function buildConfig({ filename, format, target, mode, devtool, build_defs }) {
   const config = {
     target: [format === 'commonjs' ? 'node' : 'web', target],
     entry: path.resolve(__dirname, 'lib/index.ts'),
@@ -59,7 +88,10 @@ function buildConfig({ filename, format, target, mode, devtool }) {
         "perf_hooks": false,
       }
     },
-    plugins: [new webpack.WatchIgnorePlugin({ paths: [/\.js$/, /\.d\.ts$/] })],
+    plugins: [
+      new webpack.DefinePlugin({ BUILD_DEFS: build_defs }),
+      new webpack.WatchIgnorePlugin({ paths: [/\.js$/, /\.d\.ts$/] })
+    ],
     module: {
       rules: [{
         test: /\.ts$/,
@@ -80,13 +112,47 @@ function buildConfig({ filename, format, target, mode, devtool }) {
     devtool
   };
 
+  if (useAnalyzer) {
+    config.plugins.unshift(new BundleAnalyzerPlugin({
+      analyzerMode: 'static',
+      reportFilename: `${filename}.report.html`
+    }));
+  }
+
   if (mode === 'production') {
     config.resolve.alias['./binding/ort-wasm-threaded.js'] = './binding/ort-wasm-threaded.min.js';
     config.resolve.alias['./binding/ort-wasm-threaded.worker.js'] = './binding/ort-wasm-threaded.min.worker.js';
 
-    const options = defaultTerserPluginOptions();
+    const options = defaultTerserPluginOptions(target);
     options.terserOptions.format.preamble = COPYRIGHT_BANNER;
     config.plugins.push(new TerserPlugin(options));
+
+    // Guard plugin: once compilation finishes, scan every emitted '.js' asset and fail the build
+    // if a BUILD_DEFS flag name is still present in its source. The flag names are taken from
+    // DEFAULT_BUILD_DEFS so that a flag added there is checked without touching this plugin.
+    config.plugins.push({
+      apply: (compiler) => {
+        compiler.hooks.afterCompile.tap(
+          'Check BUILD_DEFS',
+          (compilation) => {
+            const flagNames = Object.keys(DEFAULT_BUILD_DEFS);
+            for (const assetName of compilation.assetsInfo.keys()) {
+              // `assetName` deliberately differs from buildConfig's `filename` parameter (no shadowing)
+              const asset = assetName.endsWith('.js') ? compilation.assets[assetName] : undefined;
+              if (!asset) {
+                continue;
+              }
+              const content = asset.source();
+              if (typeof content !== 'string') {
+                throw new Error(`content for target file '${assetName}' is not string.`);
+              }
+              if (flagNames.some((flag) => content.includes(flag))) {
+                throw new Error(`target file '${assetName}' contains data fields from "BUILD_DEFS".`);
+              }
+            }
+          });
+      }
+    });
   } else {
     config.plugins.push(new webpack.BannerPlugin({ banner: COPYRIGHT_BANNER, raw: true }));
   }
@@ -97,11 +163,12 @@ function buildConfig({ filename, format, target, mode, devtool }) {
 // "ort{.min}.js" config
 function buildOrtConfig({
   suffix = '',
-  target = 'es5',
+  target = 'es2017',
   mode = 'production',
-  devtool = 'source-map'
+  devtool = 'source-map',
+  build_defs = DEFAULT_BUILD_DEFS
 }) {
-  const config = buildConfig({ filename: `ort${suffix}.js`, format: 'umd', target, mode, devtool });
+  const config = buildConfig({ filename: `ort${suffix}.js`, format: 'umd', target, mode, devtool, build_defs });
   // set global name 'ort'
   config.output.library.name = 'ort';
   return config;
@@ -111,11 +178,12 @@ function buildOrtConfig({
 function buildOrtWebConfig({
   suffix = '',
   format = 'umd',
-  target = 'es5',
+  target = 'es2017',
   mode = 'production',
-  devtool = 'source-map'
+  devtool = 'source-map',
+  build_defs = DEFAULT_BUILD_DEFS
 }) {
-  const config = buildConfig({ filename: `ort-web${suffix}.js`, format, target, mode, devtool });
+  const config = buildConfig({ filename: `ort-web${suffix}.js`, format, target, mode, devtool, build_defs });
   // exclude onnxruntime-common from bundle
   config.externals = {
     'onnxruntime-common': {
@@ -139,7 +207,7 @@ function buildOrtWebConfig({
 function buildTestRunnerConfig({
   suffix = '',
   format = 'umd',
-  target = 'es5',
+  target = 'es2017',
   mode = 'production',
   devtool = 'source-map'
 }) {
@@ -174,9 +242,10 @@ function buildTestRunnerConfig({
       }
     },
     plugins: [
+      new webpack.DefinePlugin({ BUILD_DEFS: DEFAULT_BUILD_DEFS }),
       new webpack.WatchIgnorePlugin({ paths: [/\.js$/, /\.d\.ts$/] }),
       new NodePolyfillPlugin({
-        excludeAliases: ["console"]
+        excludeAliases: ["console", "Buffer"]
       }),
     ],
     module: {
@@ -186,7 +255,7 @@ function buildTestRunnerConfig({
           {
             loader: 'ts-loader',
             options: {
-              compilerOptions: { target: target }
+              compilerOptions: { target }
             }
           }
         ]
@@ -200,15 +269,13 @@ function buildTestRunnerConfig({
   };
 
   if (mode === 'production') {
-    config.plugins.push(new TerserPlugin(defaultTerserPluginOptions()));
+    config.plugins.push(new TerserPlugin(defaultTerserPluginOptions(target)));
   }
 
   return config;
 }
 
 module.exports = () => {
-  const args = minimist(process.argv);
-  const bundleMode = args['bundle-mode'] || 'prod';  // 'prod'|'dev'|'perf'|'node'|undefined;
   const builds = [];
 
   switch (bundleMode) {
@@ -220,8 +287,32 @@ module.exports = () => {
         buildOrtConfig({ mode: 'development', devtool: 'inline-source-map' }),
         // ort.es6.min.js
         buildOrtConfig({ suffix: '.es6.min', target: 'es6' }),
-        // ort.es6.js
-        buildOrtConfig({ suffix: '.es6', mode: 'development', devtool: 'inline-source-map', target: 'es6' }),
+        // ort.es5.min.js
+        buildOrtConfig({ suffix: '.es5.min', target: 'es5' }),
+
+        // ort.wasm.min.js
+        buildOrtConfig({
+          suffix: '.wasm.min', build_defs: {
+            ...DEFAULT_BUILD_DEFS,
+            DISABLE_WEBGL: true,
+          }
+        }),
+        // ort.webgl.min.js
+        buildOrtConfig({
+          suffix: '.webgl.min', build_defs: {
+            ...DEFAULT_BUILD_DEFS,
+            DISABLE_WASM: true,
+          }
+        }),
+        // ort.wasm-core.min.js
+        buildOrtConfig({
+          suffix: '.wasm-core.min', build_defs: {
+            ...DEFAULT_BUILD_DEFS,
+            DISABLE_WEBGL: true,
+            DISABLE_WASM_PROXY: true,
+            DISABLE_WASM_THREAD: true,
+          }
+        }),
 
         // ort-web.min.js
         buildOrtWebConfig({ suffix: '.min' }),
@@ -229,8 +320,8 @@ module.exports = () => {
         buildOrtWebConfig({ mode: 'development', devtool: 'inline-source-map' }),
         // ort-web.es6.min.js
         buildOrtWebConfig({ suffix: '.es6.min', target: 'es6' }),
-        // ort-web.es6.js
-        buildOrtWebConfig({ suffix: '.es6', mode: 'development', devtool: 'inline-source-map', target: 'es6' }),
+        // ort-web.es5.min.js
+        buildOrtWebConfig({ suffix: '.es5.min', target: 'es5' }),
       );
 
     case 'node':
@@ -249,5 +340,10 @@ module.exports = () => {
       throw new Error(`unsupported bundle mode: ${bundleMode}`);
   }
 
+  if (filter) {
+    const filterRegex = new RegExp(filter);
+    return builds.filter(b => filterRegex.test(b.output.filename));
+  }
+
   return builds;
 };
diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
index 702b5bcd67..ebf2c6c65d 100644
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -72,7 +72,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, DynamicQuantizeLSTM);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearConv);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, NhwcMaxPool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, NhwcMaxPool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, NhwcMaxPool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QEmbedLayerNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QGemm);
 // ******** End: Quantization ******************* //
@@ -160,7 +161,8 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, DynamicQuantizeLSTM)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, QLinearConv)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, NhwcMaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, NhwcMaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, NhwcMaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, QEmbedLayerNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QGemm)>,
   };
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc
index d8314d4cfd..ee63c23869 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.cc
@@ -14,13 +14,14 @@ using onnxruntime::concurrency::ThreadPool;
 namespace onnxruntime {
 namespace contrib {
 
+template <typename T8Bits>
 Status ComputeQLinearGlobalAvgPool(
-    const uint8_t* x,
+    const T8Bits* x,
     float x_scale,
-    uint8_t x_zero_point,
-    uint8_t* y,
+    T8Bits x_zero_point,
+    T8Bits* y,
     float y_scale,
-    uint8_t y_zero_point,
+    T8Bits y_zero_point,
     int64_t N,
     int64_t C,
     int64_t image_size,
@@ -28,8 +29,8 @@ Status ComputeQLinearGlobalAvgPool(
     concurrency::ThreadPool* tp) {
   if (!channels_last || C == 1) {
     auto worker = [=](std::ptrdiff_t first, std::ptrdiff_t last) {
-      const uint8_t* input = (const uint8_t*)(x + (first * image_size));
-      uint8_t* output = (uint8_t*)(y + first);
+      const T8Bits* input = (const T8Bits*)(x + (first * image_size));
+      T8Bits* output = (T8Bits*)(y + first);
       std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), last - first));
       MlasQLinearGlobalAveragePoolNchw(input, x_scale, x_zero_point, output, y_scale, y_zero_point, last - first, image_size, acc_buffer.data());
     };
@@ -37,17 +38,17 @@ Status ComputeQLinearGlobalAvgPool(
         tp, static_cast<std::ptrdiff_t>(N * C), {1.0 * image_size, 1.0, 8.0 * image_size}, worker);
   } else {
     auto worker = [=](std::ptrdiff_t first, std::ptrdiff_t last) {
-      const uint8_t* input = x + first * C * image_size;
-      uint8_t* output = y + first * C;
+      const T8Bits* input = x + first * C * image_size;
+      T8Bits* output = y + first * C;
       std::vector<int32_t> acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), C));
-      std::vector<uint8_t> zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(uint8_t), C), 0);
+      std::vector<T8Bits> zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), C), 0);
       MlasQLinearGlobalAveragePoolNhwc(
           input, x_scale, x_zero_point, output, y_scale, y_zero_point,
           last - first, image_size, C, C, acc_buffer.data(), zero_buffer.data());
     };
     concurrency::ThreadPool::TryParallelFor(
         tp, static_cast<std::ptrdiff_t>(N),
-        {1.0 * image_size * C, 1.0 * C, 8.0 *image_size * C},
+        {1.0 * image_size * C, 1.0 * C, 8.0 * image_size * C},
         worker);
   }
   return Status::OK();
@@ -88,19 +89,25 @@ Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const {
 
   const float x_scale = *(tensor_x_scale->Data<float>());
   const float y_scale = *(tensor_y_scale->Data<float>());
+
   auto dtype = X.GetElementType();
-  switch (dtype) {
-    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
-      return ComputeQLinearGlobalAvgPool(X.Data<uint8_t>(), x_scale, *(tensor_x_zero_point->Data<uint8_t>()),
-                                Y.MutableData<uint8_t>(), y_scale, *(tensor_y_zero_point->Data<uint8_t>()),
-                                N, C, image_size, channels_last_, tp);
-    default:
-      ORT_THROW("Unsupported 'dtype' value: ", dtype);
+  if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
+    return ComputeQLinearGlobalAvgPool(X.Data<uint8_t>(), x_scale, *(tensor_x_zero_point->Data<uint8_t>()),
+                                       Y.MutableData<uint8_t>(), y_scale, *(tensor_y_zero_point->Data<uint8_t>()),
+                                       N, C, image_size, channels_last_, tp);
+  } else {
+    return ComputeQLinearGlobalAvgPool(X.Data<int8_t>(), x_scale, *(tensor_x_zero_point->Data<int8_t>()),
+                                       Y.MutableData<int8_t>(), y_scale, *(tensor_y_zero_point->Data<int8_t>()),
+                                       N, C, image_size, channels_last_, tp);
   }
 }
 
-ONNX_OPERATOR_KERNEL_EX(QLinearGlobalAveragePool, kMSDomain, 1, kCpuExecutionProvider, KernelDefBuilder(), QLinearGlobalAveragePool);
+ONNX_OPERATOR_KERNEL_EX(QLinearGlobalAveragePool,
+                        kMSDomain,
+                        1,
+                        kCpuExecutionProvider,
+                        KernelDefBuilder(),
+                        QLinearGlobalAveragePool);
 
 }  // namespace contrib
-
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.h
index f46ed1c0e4..bc0fca6063 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.h
+++ b/onnxruntime/contrib_ops/cpu/qlinear_global_average_pool.h
@@ -21,13 +21,14 @@ class QLinearGlobalAveragePool final : public OpKernel {
   bool channels_last_;
 };
 
+template<typename T8Bits>
 Status ComputeQLinearGlobalAvgPool(
-    const uint8_t* x,
+    const T8Bits* x,
     float x_scale,
-    uint8_t x_zero_point,
-    uint8_t* y,
+    T8Bits x_zero_point,
+    T8Bits* y,
     float y_scale,
-    uint8_t y_zero_point,
+    T8Bits y_zero_point,
     int64_t N,
     int64_t C,
     int64_t image_size,
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
index 154c4620b3..6953206a70 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.cc
@@ -23,19 +23,15 @@ using concurrency::ThreadPool;
 namespace contrib {
 
 template <typename T8Bits>
-static inline float dequantize_value(T8Bits x, float x_scale, T8Bits x_zero_point);
-
-template <typename T8Bits>
-static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point);
-
-template <>
-inline float dequantize_value<uint8_t>(uint8_t x, float x_scale, uint8_t x_zero_point) {
+static inline float dequantize_value(T8Bits x, float x_scale, T8Bits x_zero_point) {
   return x_scale * (static_cast<int>(x) - x_zero_point);
 }
 
-template <>
-inline uint8_t quantize_value<uint8_t>(float y, float y_scale, uint8_t y_zero_point) {
-  return static_cast<uint8_t>(std::max(0.0f, std::min(std::nearbyintf(y / y_scale + y_zero_point), 255.0f)));
+// Quantize y to T8Bits, saturating in float before narrowing (an out-of-range or NaN float->int cast is UB).
+template <typename T8Bits>
+static inline T8Bits quantize_value(float y, float y_scale, T8Bits y_zero_point) {
+  using Limits = std::numeric_limits<T8Bits>;
+  return static_cast<T8Bits>(std::max<float>(Limits::lowest(), std::min<float>(std::nearbyintf(y / y_scale + y_zero_point), Limits::max())));
 }
 
 static void SwitchDimsNchwNhwc(std::vector<int64_t>& dims, bool from_nchw_to_nhwc) {
@@ -509,6 +505,15 @@ void dequantize_array(int64_t N, const T8Bits* input, float scale, T8Bits zero_p
 }
 
 Status QLinearAveragePool::Compute(OpKernelContext* context) const {
+  // Dispatch on the signedness flag recorded for this kernel: the int8_t
+  // implementation when is_input_signed_ is set, the uint8_t one otherwise.
+  const bool use_int8 = is_input_signed_;
+  return use_int8 ? ComputeImpl<int8_t>(context)
+                  : ComputeImpl<uint8_t>(context);
+}
+
+template <typename T8Bits>
+Status QLinearAveragePool::ComputeImpl(OpKernelContext* context) const {
   const auto tensor_x_scale = context->Input<Tensor>(1);
   const auto tensor_x_zero_point = context->Input<Tensor>(2);
   const auto tensor_y_scale = context->Input<Tensor>(3);
@@ -524,16 +529,12 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
               "input y_zero_point must be a scalar or 1D tensor of size 1 if given");
 
   const auto* X = context->Input<Tensor>(0);
-  auto dtype = X->GetElementType();
-  if (dtype != ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-    ORT_THROW("Unsupported 'dtype' in QLinear Pooling:", dtype);
-  }
 
   TensorShape x_shape = X->Shape();
   const float x_scale = *(tensor_x_scale->Data<float>());
   const float y_scale = *(tensor_y_scale->Data<float>());
-  uint8_t x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data<uint8_t>()) : (uint8_t)0);
-  uint8_t y_zero_point = (tensor_y_zero_point ? *(tensor_y_zero_point->Data<uint8_t>()) : (uint8_t)0);
+  T8Bits x_zero_point = (tensor_x_zero_point ? *(tensor_x_zero_point->Data<T8Bits>()) : (T8Bits)0);
+  T8Bits y_zero_point = (tensor_y_zero_point ? *(tensor_y_zero_point->Data<T8Bits>()) : (T8Bits)0);
 
   ORT_RETURN_IF_NOT(x_shape.NumDimensions() >= 3, "Input dimension cannot be less than 3.");
   std::vector<int64_t> pads = pool_attrs_.pads;
@@ -564,8 +565,8 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
     SwitchDimsNchwNhwc(output_dims, true);
   }
   Tensor* Y = context->Output(0, output_dims);
-  const auto* X_data = X->Data<uint8_t>();
-  auto* Y_data = Y->MutableData<uint8_t>();
+  const auto* X_data = X->Data<T8Bits>();
+  auto* Y_data = Y->MutableData<T8Bits>();
   ThreadPool* tp = context->GetOperatorThreadPool();
 
   // Check for special case which could fall back to global average pool
@@ -589,12 +590,12 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
   switch (kernel_shape.size()) {
     case 1: {
       if (channels_last_) {
-        QLinearPoolNhwc1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
+        QLinearPoolNhwc1DTask<T8Bits, onnxruntime::AveragePool> avg_pool_task_1d = {
             x_data_fp32, Y_data, y_scale, y_zero_point, channels,
             pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
         ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_1d.Cost(), avg_pool_task_1d);
       } else {
-        QLinearPool1DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_1d = {
+        QLinearPool1DTask<T8Bits, onnxruntime::AveragePool> avg_pool_task_1d = {
             x_data_fp32, Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
             pooled_height, strides[0], height, kernel_shape, pads, pool_context_, pool_attrs_};
         ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_1d.Cost(), avg_pool_task_1d);
@@ -604,13 +605,13 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
 
     case 2: {
       if (channels_last_) {
-        QLinearPoolNhwc2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
+        QLinearPoolNhwc2DTask<T8Bits, onnxruntime::AveragePool> avg_pool_task_2d = {
             x_data_fp32, Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
             pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
         ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_2d.Cost(), avg_pool_task_2d);
 
       } else {
-        QLinearPool2DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_2d = {
+        QLinearPool2DTask<T8Bits, onnxruntime::AveragePool> avg_pool_task_2d = {
             x_data_fp32, Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
             pooled_height, pooled_width, strides[0], strides[1], height, width, kernel_shape, pads, pool_context_, pool_attrs_};
         ThreadPool::TryParallelFor(tp, total_channels, avg_pool_task_2d.Cost(), avg_pool_task_2d);
@@ -620,14 +621,14 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
 
     case 3: {
       if (channels_last_) {
-        QLinearPoolNhwc3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
+        QLinearPoolNhwc3DTask<T8Bits, onnxruntime::AveragePool> avg_pool_task_3d = {
             x_data_fp32, Y_data, y_scale, y_zero_point, x_image_size, y_image_size, kernel_size, channels,
             pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
             kernel_shape, pads, pool_context_, pool_attrs_};
         ThreadPool::TryParallelFor(tp, y_image_size * batch_count, avg_pool_task_3d.Cost(), avg_pool_task_3d);
 
       } else {
-        QLinearPool3DTask<uint8_t, onnxruntime::AveragePool> avg_pool_task_3d = {
+        QLinearPool3DTask<T8Bits, onnxruntime::AveragePool> avg_pool_task_3d = {
             x_data_fp32, Y_data, y_scale, y_zero_point, x_image_size, y_image_size,
             pooled_height, pooled_width, pooled_depth, strides[0], strides[1], strides[2], height, width, depth,
             kernel_shape, pads, pool_context_, pool_attrs_};
@@ -647,7 +648,12 @@ Status QLinearAveragePool::Compute(OpKernelContext* context) const {
   return Status::OK();
 }
 
-ONNX_OPERATOR_KERNEL_EX(QLinearAveragePool, kMSDomain, 1, kCpuExecutionProvider, KernelDefBuilder(), QLinearAveragePool);
+ONNX_OPERATOR_KERNEL_EX(QLinearAveragePool,
+                        kMSDomain,
+                        1,
+                        kCpuExecutionProvider,
+                        KernelDefBuilder(),
+                        QLinearAveragePool);
 
 }  // namespace contrib
 
diff --git a/onnxruntime/contrib_ops/cpu/qlinear_pool.h b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
index 92285e4f78..ebb6be4a00 100644
--- a/onnxruntime/contrib_ops/cpu/qlinear_pool.h
+++ b/onnxruntime/contrib_ops/cpu/qlinear_pool.h
@@ -14,6 +14,9 @@ class QLinearAveragePool final : public OpKernel, public PoolBase {
  public:
   QLinearAveragePool(const OpKernelInfo& info) : OpKernel(info), PoolBase(info) {
     channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);
+
+    int32_t input_type = info.node().InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
+    is_input_signed_ = ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8 == input_type;
   }
 
   ~QLinearAveragePool() override = default;
@@ -21,8 +24,12 @@ class QLinearAveragePool final : public OpKernel, public PoolBase {
   Status Compute(OpKernelContext* context) const override;
 
  private:
+  template <typename T8Bits>
+  Status ComputeImpl(OpKernelContext* context) const;
+
   PoolProcessContext pool_context_;
   bool channels_last_;
+  bool is_input_signed_;
 };
 
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc
index 991f5f19e8..a6f297c7b5 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc
@@ -11,6 +11,7 @@
 namespace onnxruntime {
 namespace contrib {
 
+template <typename T8Bits>
 class NhwcMaxPool : public OpKernel {
  public:
   explicit NhwcMaxPool(const OpKernelInfo& info) : OpKernel(info),
@@ -20,10 +21,11 @@ class NhwcMaxPool : public OpKernel {
   Status Compute(OpKernelContext* context) const override;
 
  private:
-   PoolAttributes pool_attrs_;
+  PoolAttributes pool_attrs_;
 };
 
-Status NhwcMaxPool::Compute(OpKernelContext* context) const {
+template <typename T8Bits>
+Status NhwcMaxPool<T8Bits>::Compute(OpKernelContext* context) const {
   const auto* X = context->Input<Tensor>(0);
   const TensorShape& input_shape = X->Shape();
 
@@ -73,17 +75,17 @@ Status NhwcMaxPool::Compute(OpKernelContext* context) const {
   AllocatorPtr alloc;
   ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
   int64_t col_buffer_batch_count = std::min(output_image_size, output_batch_count);
-  auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(const uint8_t*)) * kernel_size * col_buffer_batch_count);
+  auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(const T8Bits*)) * kernel_size * col_buffer_batch_count);
   BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc));
-  std::vector<uint8_t> padding_data(static_cast<size_t>(C), 0);
+  std::vector<T8Bits> padding_data(static_cast<size_t>(C), std::numeric_limits<T8Bits>::lowest());
 
-  const auto* Xdata = X->template Data<uint8_t>();
-  auto* Ydata = Y->template MutableData<uint8_t>();
+  const auto* Xdata = X->template Data<T8Bits>();
+  auto* Ydata = Y->template MutableData<T8Bits>();
 
   for (int64_t image_id = 0; image_id < N; ++image_id) {
     for (int64_t output_start = 0; output_start < output_image_size;) {
       int64_t output_count = std::min(output_image_size - output_start, output_batch_count);
-      math::Im2col<uint8_t, StorageOrder::NHWC>()(
+      math::Im2col<T8Bits, StorageOrder::NHWC>()(
           Xdata,
           C,
           input_shape.GetDims().data() + 1,
@@ -95,10 +97,10 @@ Status NhwcMaxPool::Compute(OpKernelContext* context) const {
           static_cast<ptrdiff_t>(spatial_dims),
           output_start,
           output_count,
-          static_cast<uint8_t const**>(col_buffer.get()),
+          static_cast<T8Bits const**>(col_buffer.get()),
           padding_data.data());
       MlasMaximumPool(
-          static_cast<uint8_t const**>(col_buffer.get()),
+          static_cast<T8Bits const**>(col_buffer.get()),
           Ydata,
           static_cast<size_t>(C),
           static_cast<size_t>(output_count),
@@ -114,14 +116,19 @@ Status NhwcMaxPool::Compute(OpKernelContext* context) const {
   return Status::OK();
 }
 
-ONNX_OPERATOR_KERNEL_EX(
-    NhwcMaxPool,
-    kMSDomain,
-    1,
-    kCpuExecutionProvider,
-    KernelDefBuilder()
-        .TypeConstraint("T", DataTypeImpl::GetTensorType<uint8_t>()),
-    NhwcMaxPool);
+#define REGISTER_NHWCMAXPOOL_TYPED_KERNEL(T)                      \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
+      NhwcMaxPool,                                                \
+      kMSDomain,                                                  \
+      1,                                                          \
+      T,                                                          \
+      kCpuExecutionProvider,                                      \
+      KernelDefBuilder()                                          \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      NhwcMaxPool<T>);
+// Register NhwcMaxPool<T> once per 8-bit element type; T is both the typed-kernel tag and the "T" type constraint.
+REGISTER_NHWCMAXPOOL_TYPED_KERNEL(int8_t);
+REGISTER_NHWCMAXPOOL_TYPED_KERNEL(uint8_t);
 
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_concat.cu b/onnxruntime/contrib_ops/cuda/bert/attention_concat.cu
new file mode 100644
index 0000000000..88a75f36d8
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_concat.cu
@@ -0,0 +1,222 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/cuda/cuda_common.h"
+#include "attention_impl.h"
+
+using namespace onnxruntime::cuda;
+
+namespace onnxruntime {
+namespace contrib {
+namespace cuda {
+
+template <typename T>
+__global__ void ConcatTensorToTensor(const int tensor_add_sequence_length,
+                                     const T* tensor_in,
+                                     const T* tensor_add,
+                                     T* tensor_out) {
+  const int h = threadIdx.x;        // element index inside one head (in units of T)
+  const int n = threadIdx.y;        // head index
+  const int s = blockIdx.x;         // position in the concatenated sequence
+  const int b = blockIdx.y;         // batch index
+  const int chunk_id = blockIdx.z;  // which of the K stacked tensors
+
+  const int all_sequence_length = gridDim.x;
+  const int batch_size = gridDim.y;
+  const int num_heads = blockDim.y;
+  const int H = blockDim.x;
+
+  // Each thread copies one element of T. With K = gridDim.z stacked tensors:
+  // tensor_in:    K x BxNxS'xH   (S' = all_sequence_length - tensor_add_sequence_length)
+  // tensor_add:   K x BxNxSxH    (S  = tensor_add_sequence_length)
+  // tensor_out:   K x BxNx(S'+S)xH; positions [0, S') come from tensor_in, the rest from tensor_add
+  const int tensor_in_sequence_length = all_sequence_length - tensor_add_sequence_length;
+
+  const int present_SH = all_sequence_length * H;
+  const int present_NSH = num_heads * present_SH;
+  int out_offset = b * present_NSH + n * present_SH + s * H + h + chunk_id * (present_NSH * batch_size);
+  if (s < tensor_in_sequence_length) {  // leading positions: copy from tensor_in
+    const int past_SH = tensor_in_sequence_length * H;
+    const int past_NSH = num_heads * past_SH;
+    const int in_offset = b * past_NSH + n * past_SH + s * H + h + chunk_id * (past_NSH * batch_size);
+    tensor_out[out_offset] = tensor_in[in_offset];
+  } else if (s < all_sequence_length) {  // trailing positions: copy from tensor_add, re-based to its own start
+    const int SH = tensor_add_sequence_length * H;
+    const int NSH = num_heads * SH;
+    const int in_offset = b * NSH + n * SH + (s - tensor_in_sequence_length) * H + h + chunk_id * (NSH * batch_size);
+    tensor_out[out_offset] = tensor_add[in_offset];
+  }
+}
+
+template <typename T>
+__global__ void ConcatTensorToTensorLarge(const int tensor_add_sequence_length,
+                                          const int H,
+                                          const T* tensor_in,
+                                          const T* tensor_add,
+                                          T* tensor_out) {
+  // Use when H * num_heads > max_threads_per_block: each thread then covers h, h + blockDim.x, ... below H.
+  int h = threadIdx.x;              // first element index handled by this thread (in units of T)
+  const int n = threadIdx.y;        // head index
+  const int s = blockIdx.x;         // position in the concatenated sequence
+  const int b = blockIdx.y;         // batch index
+  const int chunk_id = blockIdx.z;  // which of the K stacked tensors
+
+  const int all_sequence_length = gridDim.x;
+  const int batch_size = gridDim.y;
+  const int num_heads = blockDim.y;
+  const int stride = blockDim.x;
+
+  // Same copy as ConcatTensorToTensor, but H is passed in explicitly. With K = gridDim.z stacked tensors:
+  // tensor_in:    K x BxNxS'xH   (S' = all_sequence_length - tensor_add_sequence_length)
+  // tensor_add:   K x BxNxSxH    (S  = tensor_add_sequence_length)
+  // tensor_out:   K x BxNx(S'+S)xH; positions [0, S') come from tensor_in, the rest from tensor_add
+  const int tensor_in_sequence_length = all_sequence_length - tensor_add_sequence_length;
+
+  const int present_SH = all_sequence_length * H;
+  const int present_NSH = num_heads * present_SH;
+  while (h < H) {
+    int out_offset = b * present_NSH + n * present_SH + s * H + h + chunk_id * (present_NSH * batch_size);
+    if (s < tensor_in_sequence_length) {  // leading positions: copy from tensor_in
+      const int past_SH = tensor_in_sequence_length * H;
+      const int past_NSH = num_heads * past_SH;
+      const int in_offset = b * past_NSH + n * past_SH + s * H + h + chunk_id * (past_NSH * batch_size);
+      tensor_out[out_offset] = tensor_in[in_offset];
+    } else if (s < all_sequence_length) {  // trailing positions: copy from tensor_add, re-based to its own start
+      const int SH = tensor_add_sequence_length * H;
+      const int NSH = num_heads * SH;
+      const int in_offset = b * NSH + n * SH + (s - tensor_in_sequence_length) * H + h + chunk_id * (NSH * batch_size);
+      tensor_out[out_offset] = tensor_add[in_offset];
+    }
+
+    h += stride;
+  }
+}
+
+// Float overload: launches the concat kernel that fills tensor_out from tensor_in followed by tensor_add.
+bool LaunchConcatTensorToTensor(cudaStream_t stream,
+                                const int all_sequence_length,
+                                const int sequence_length,
+                                const int batch_size,
+                                const int head_size,
+                                const int num_heads,
+                                const int max_threads_per_block,
+                                const int matrix_num,
+                                const float* tensor_in,
+                                const float* tensor_add,
+                                float* tensor_out) {
+  const dim3 grid(all_sequence_length, batch_size, matrix_num);  // one block per (output position, batch, matrix)
+  if (0 == (head_size & 1)) {  // even head_size: move two floats per element as one float2
+    const int H = head_size / 2;
+    if (H * num_heads <= max_threads_per_block) {
+      const dim3 block(H, num_heads, 1);
+      ConcatTensorToTensor<float2><<<grid, block, 0, stream>>>(sequence_length, reinterpret_cast<const float2*>(tensor_in), reinterpret_cast<const float2*>(tensor_add), reinterpret_cast<float2*>(tensor_out));
+    } else {  // block would exceed the limit: cap blockDim.x and let each thread loop over h
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<float2><<<grid, block, 0, stream>>>(sequence_length, H, reinterpret_cast<const float2*>(tensor_in), reinterpret_cast<const float2*>(tensor_add), reinterpret_cast<float2*>(tensor_out));
+    }
+  } else {  // odd head_size: copy one float per element
+    if (head_size * num_heads <= max_threads_per_block) {
+      const dim3 block(head_size, num_heads, 1);
+      ConcatTensorToTensor<float><<<grid, block, 0, stream>>>(sequence_length, tensor_in, tensor_add, tensor_out);
+    } else {  // block would exceed the limit: cap blockDim.x and let each thread loop over h
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<float><<<grid, block, 0, stream>>>(sequence_length, head_size, tensor_in, tensor_add, tensor_out);
+    }
+
+  }
+  return CUDA_CALL(cudaPeekAtLastError());  // result of checking the error state left by the launch above
+}
+
+bool LaunchConcatTensorToTensor(cudaStream_t stream,
+                                const int all_sequence_length,
+                                const int sequence_length,
+                                const int batch_size,
+                                const int head_size,
+                                const int num_heads,
+                                const int max_threads_per_block,
+                                const int matrix_num,
+                                const half* tensor_in,
+                                const half* tensor_add,
+                                half* tensor_out) {  // half overload of the concat launcher
+  const dim3 grid(all_sequence_length, batch_size, matrix_num);  // one block per (output position, batch, matrix)
+  if (0 == (head_size % 4)) {  // four halves are moved together as one float2 (assumes float2-aligned buffers - TODO confirm)
+    const int H = head_size / 4;
+    if (H * num_heads <= max_threads_per_block) {
+      const dim3 block(H, num_heads, 1);
+      ConcatTensorToTensor<float2><<<grid, block, 0, stream>>>(sequence_length, reinterpret_cast<const float2*>(tensor_in), reinterpret_cast<const float2*>(tensor_add), reinterpret_cast<float2*>(tensor_out));
+    } else {  // block would exceed the limit: cap blockDim.x and let each thread loop over h
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<float2><<<grid, block, 0, stream>>>(sequence_length, H, reinterpret_cast<const float2*>(tensor_in), reinterpret_cast<const float2*>(tensor_add), reinterpret_cast<float2*>(tensor_out));
+    }
+  } else if (0 == (head_size & 1)) {  // two halves are moved together as one half2
+    const int H = head_size / 2;
+    if (H * num_heads <= max_threads_per_block) {
+      const dim3 block(H, num_heads, 1);
+      ConcatTensorToTensor<half2><<<grid, block, 0, stream>>>(sequence_length, reinterpret_cast<const half2*>(tensor_in), reinterpret_cast<const half2*>(tensor_add), reinterpret_cast<half2*>(tensor_out));
+    } else {  // block would exceed the limit: cap blockDim.x and let each thread loop over h
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<half2><<<grid, block, 0, stream>>>(sequence_length, H, reinterpret_cast<const half2*>(tensor_in), reinterpret_cast<const half2*>(tensor_add), reinterpret_cast<half2*>(tensor_out));
+    }
+  } else {  // odd head_size: the author notes this is an unusual case, so it simply copies one half per element.
+    if (head_size * num_heads <= max_threads_per_block) {
+      const dim3 block(head_size, num_heads, 1);
+      ConcatTensorToTensor<half><<<grid, block, 0, stream>>>(sequence_length, tensor_in, tensor_add, tensor_out);
+    } else {  // block would exceed the limit: cap blockDim.x and let each thread loop over h
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<half><<<grid, block, 0, stream>>>(sequence_length, head_size, tensor_in, tensor_add, tensor_out);
+    }
+  }
+  return CUDA_CALL(cudaPeekAtLastError());  // result of checking the error state left by the launch above
+}
+
+// Float overload: concatenates `past` with `k_v` into `present` by forwarding
+// to LaunchConcatTensorToTensor with a fixed matrix count of 2.
+bool LaunchConcatPastToPresent(cudaStream_t stream,
+                               const int all_sequence_length,
+                               const int sequence_length,
+                               const int batch_size,
+                               const int head_size,
+                               const int num_heads,
+                               const int max_threads_per_block,
+                               const float* past,
+                               const float* k_v,
+                               float* present) {
+  // Passed as matrix_num, which becomes the z extent of the launch grid.
+  constexpr int kMatrixNum = 2;
+  const bool launched = LaunchConcatTensorToTensor(stream,
+                                                   all_sequence_length,
+                                                   sequence_length,
+                                                   batch_size, head_size, num_heads,
+                                                   max_threads_per_block,
+                                                   kMatrixNum,
+                                                   past, k_v, present);
+  return launched;
+}
+
+// Half overload: concatenates `past` with `k_v` into `present` by forwarding
+// to LaunchConcatTensorToTensor with a fixed matrix count of 2.
+bool LaunchConcatPastToPresent(cudaStream_t stream,
+                               const int all_sequence_length,
+                               const int sequence_length,
+                               const int batch_size,
+                               const int head_size,
+                               const int num_heads,
+                               const int max_threads_per_block,
+                               const half* past,
+                               const half* k_v,
+                               half* present) {
+  // Passed as matrix_num, which becomes the z extent of the launch grid.
+  constexpr int kMatrixNum = 2;
+  const bool launched = LaunchConcatTensorToTensor(stream,
+                                                   all_sequence_length,
+                                                   sequence_length,
+                                                   batch_size, head_size, num_heads,
+                                                   max_threads_per_block,
+                                                   kMatrixNum,
+                                                   past, k_v, present);
+  return launched;
+}
+
+}  // namespace cuda
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
index 2f48d806e5..368f520515 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
@@ -32,6 +32,11 @@ limitations under the License.
 using namespace onnxruntime::cuda;
 using namespace cub;
 
+#define CHECK_CUDA(expr) /* returns false from the enclosing function when CUDA_CALL(expr) is false */ \
+  do { /* wrapped as one statement so it stays correct under an unbraced if/else */                   \
+    if (!CUDA_CALL(expr)) return false;                                                               \
+  } while (0)
+
 namespace onnxruntime {
 namespace contrib {
 namespace cuda {
@@ -73,10 +78,10 @@ bool QkvToContext(
   T* scratch2 = scratch1 + (bytes / element_size);
   T* scratch3 = scratch2 + (bytes / element_size);
 
-  const int max_threads_per_block(prop.maxThreadsPerBlock);
+  const int max_threads_per_block = prop.maxThreadsPerBlock;
 
   // input should be BxSx3xNxH => scratch3: 3xBxNxSxH
-  if (!LaunchTransQkv(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, input, scratch3)) {
+  if (!LaunchTransQkv(stream, 3, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, false, input, scratch3)) {
     return false;
   }
 
@@ -130,7 +135,7 @@ bool QkvToContext(
     const int64_t max_sequence_length = mask_dimension == 4 ? mask_index_dims.at(3) : 0;
 
     T* persistent_softmax_workspace = scratch1; // replace Q*K' in place with masked score if persistent softmax is selected.
-    if (!ComputeSoftmaxWithRawMask<T>(stream, all_sequence_length, sequence_length, batch_size, num_heads, mask_index, extra_add_qk, scratch1, scratch2, 
+    if (!ComputeSoftmaxWithRawMask<T>(stream, all_sequence_length, sequence_length, batch_size, num_heads, mask_index, nullptr, extra_add_qk, scratch1, scratch2,
                                       is_unidirectional, rsqrt_head_size, mask_dimension, static_cast<int>(max_sequence_length),
                                       use_persistent_softmax, persistent_softmax_workspace)) {
       return false;
@@ -156,7 +161,7 @@ bool QkvToContext(
   }
 
   // scratch3 is BxNxSxH, transpose to output BxSxNxH
-  return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, scratch3, output);
+  return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, false, scratch3, output);
 }
 
 bool LaunchAttentionKernel(
@@ -179,7 +184,6 @@ bool LaunchAttentionKernel(
     const void* extra_add_qk,
     void* present) {
 
-  
   // For testing, environment variable ORT_TRANSFORMER_OPTIONS=1 could enable persistent softmax
   const TransformerOptions* options = TransformerOptions::GetInstance();
   bool use_persistent_softmax = options->IsPrecisionMode() && !options->DisablePersistentSoftmax();
@@ -189,18 +193,246 @@ bool LaunchAttentionKernel(
                         batch_size, sequence_length, num_heads, head_size, element_size,
                         reinterpret_cast<const half*>(input), reinterpret_cast<half*>(output), reinterpret_cast<half*>(workspace),
                         mask_index, mask_index_dims, is_unidirectional,
-                        past_sequence_length, reinterpret_cast<const half*>(past), reinterpret_cast<const half*>(extra_add_qk), 
+                        past_sequence_length, reinterpret_cast<const half*>(past), reinterpret_cast<const half*>(extra_add_qk),
                         reinterpret_cast<half*>(present), use_persistent_softmax);
   } else {
     return QkvToContext(prop, cublas, stream,
                         batch_size, sequence_length, num_heads, head_size, element_size,
                         reinterpret_cast<const float*>(input), reinterpret_cast<float*>(output), reinterpret_cast<float*>(workspace),
                         mask_index, mask_index_dims, is_unidirectional,
-                        past_sequence_length, reinterpret_cast<const float*>(past), reinterpret_cast<const float*>(extra_add_qk), 
+                        past_sequence_length, reinterpret_cast<const float*>(past), reinterpret_cast<const float*>(extra_add_qk),
                         reinterpret_cast<float*>(present), use_persistent_softmax);
   }
 }
 
+
+// Decoder attention core: lays out Q/K/V in qkv_buffer (or reads the caches directly), then computes softmax(Q*K'/sqrt(H)) * V into output; false on any failure.
+template <typename T>
+bool DecoderQkvToContext(
+  const cudaDeviceProp& prop,
+  cudaStream_t stream,
+  cublasHandle_t& cublas,
+  const size_t element_size,
+  const int batch_size,
+  const int sequence_length,
+  const int kv_sequence_length,
+  const int num_heads,
+  const int head_size,
+  const bool static_kv,
+  const bool use_past,
+  const bool has_layer_state,
+  const bool has_key_padding_mask,
+  const T* gemm_query_buffer,
+  const T* gemm_kv_buffer,
+  const bool* key_padding_mask,
+  const T* key_cache,
+  const T* value_cache,
+  T* qkv_buffer,
+  T* workspace_buffer,
+  T* output,
+  T* new_key_cache,
+  T* new_value_cache) {
+  const int max_threads_per_block = prop.maxThreadsPerBlock;
+  const int BN = batch_size * num_heads;
+  const int BHN = BN * head_size;
+  const int BNS = BN * sequence_length;
+  const int k_buffer_offset = sequence_length * BHN;                         // K starts right after Q in qkv_buffer
+  const int v_buffer_offset = (sequence_length + kv_sequence_length) * BHN;  // V starts right after K
+
+  T* temp_qkv_buffer = workspace_buffer;
+
+  const T* q = qkv_buffer;
+  // transpose q (from gemm_query_buffer) and store it at the start of qkv_buffer
+  if (!LaunchTransQkv(stream, 1, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, true, gemm_query_buffer, qkv_buffer)) {
+    return false;
+  }
+
+  const T* k = qkv_buffer + k_buffer_offset;
+  const T* v = qkv_buffer + v_buffer_offset;
+  if (!has_layer_state || !use_past) {
+    if (!static_kv) {
+      // transpose kv (sequence_length positions) and copy them to qkv_buffer
+      if (!LaunchTransQkv(stream, 2, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, true, gemm_kv_buffer, qkv_buffer + k_buffer_offset)) {
+        return false;
+      }
+    } else {
+      // transpose kv (kv_sequence_length positions) and copy them to qkv_buffer
+      if (!LaunchTransQkv(stream, 2, kv_sequence_length, batch_size, head_size, num_heads, max_threads_per_block, true, gemm_kv_buffer, qkv_buffer + k_buffer_offset)) {
+        return false;
+      }
+    }
+  } else {
+    if (!static_kv) {
+      // transpose kv and copy them to temp_buffer (k first, v at offset k_buffer_offset)
+      if (!LaunchTransQkv(stream, 2, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, true, gemm_kv_buffer, temp_qkv_buffer)) {
+        return false;
+      }
+      // concat cache-k with k and copy to qkv_buffer (skipped when key_cache is null; NOTE(review): k is then not written in this branch)
+      if (nullptr != key_cache && !LaunchConcatTensorToTensor(stream, kv_sequence_length, sequence_length, batch_size, head_size, num_heads,
+          max_threads_per_block, 1, key_cache, temp_qkv_buffer, qkv_buffer + k_buffer_offset)) {
+        return false;
+      }
+      // concat cache-v with v and copy to qkv_buffer (skipped when value_cache is null; NOTE(review): v is then not written in this branch)
+      if (nullptr != value_cache && !LaunchConcatTensorToTensor(stream, kv_sequence_length, sequence_length, batch_size, head_size, num_heads,
+          max_threads_per_block, 1, value_cache, temp_qkv_buffer + k_buffer_offset, qkv_buffer + v_buffer_offset)) {
+        return false;
+      }
+    }
+  }
+
+  if (has_layer_state) {  // byte counts start with sizeof(T) so the products are evaluated in size_t, not int
+    if (use_past && static_kv) {
+      CHECK_CUDA(cudaMemcpyAsync(new_key_cache, key_cache, sizeof(T) * kv_sequence_length * BHN, cudaMemcpyDeviceToDevice, stream));
+      CHECK_CUDA(cudaMemcpyAsync(new_value_cache, value_cache, sizeof(T) * kv_sequence_length * BHN, cudaMemcpyDeviceToDevice, stream));
+    } else {
+      CHECK_CUDA(cudaMemcpyAsync(new_key_cache, k, sizeof(T) * kv_sequence_length * BHN, cudaMemcpyDeviceToDevice, stream));
+      CHECK_CUDA(cudaMemcpyAsync(new_value_cache, v, sizeof(T) * kv_sequence_length * BHN, cudaMemcpyDeviceToDevice, stream));
+    }
+  }
+
+  // scratch1: BxNxSxS* buffer
+  // scratch2: BxNxSxS* buffer
+  // scratch3: BxNxSxH  buffer
+  T* scratch1 = temp_qkv_buffer + 3 * BHN * sequence_length;
+  T* scratch2 = scratch1 + BNS * kv_sequence_length;
+  T* scratch3 = scratch2 + BNS * kv_sequence_length;
+
+  // compute Q*K' (as K'*Q), scaled by 1/sqrt(H) and store in scratch1: BxNxSxS*
+  // Q: BxNxSxH, K (present_k): BxNxS*xH, Q*K': BxNxSxS*
+  const float rsqrt_head_size = 1.f / sqrt(static_cast<float>(head_size));
+  const int temp_matrix_size = sequence_length * kv_sequence_length;
+  float one = 1.0f;
+  float zero = 0.f;
+
+  float alpha = rsqrt_head_size;
+  const int strideA = kv_sequence_length * head_size;
+  const int strideB = sequence_length * head_size;
+  if (use_past && static_kv) {  // K is read straight from key_cache
+    if (!CUBLAS_CALL(cublasGemmStridedBatchedHelper(
+      cublas, CUBLAS_OP_T, CUBLAS_OP_N, kv_sequence_length, sequence_length, head_size, &alpha, key_cache, head_size, strideA,
+      q, head_size, strideB, &zero, scratch1, kv_sequence_length, temp_matrix_size, BN, prop))) {
+      return false;
+    }
+  } else {  // K is read from qkv_buffer
+    if (!CUBLAS_CALL(cublasGemmStridedBatchedHelper(
+      cublas, CUBLAS_OP_T, CUBLAS_OP_N, kv_sequence_length, sequence_length, head_size, &alpha, k, head_size, strideA,
+      q, head_size, strideB, &zero, scratch1, kv_sequence_length, temp_matrix_size, BN, prop))) {
+      return false;
+    }
+  }
+
+  if (has_key_padding_mask) {  // softmax of scratch1 into scratch2, with key_padding_mask applied
+    if (!ComputeSoftmaxWithRawMask<T>(stream, kv_sequence_length, sequence_length, batch_size, num_heads, nullptr, key_padding_mask, nullptr, scratch1, scratch2,
+        false, 1, 2, static_cast<int>(0), false, nullptr)) {  // scale argument is 1: alpha already applied 1/sqrt(H) in the GEMM above
+      return false;
+    }
+  } else {  // softmax of scratch1 into scratch2 without a mask
+    if (!ComputeSoftmax<T>(stream, kv_sequence_length, sequence_length, batch_size, num_heads, nullptr, scratch1, scratch2, false)) {
+      return false;
+    }
+  }
+
+  // compute P*V (as V*P), and store in scratch3: BxNxSxH
+  if (use_past && static_kv) {  // V is read straight from value_cache
+    if (!CUBLAS_CALL(cublasGemmStridedBatchedHelper(
+      cublas, CUBLAS_OP_N, CUBLAS_OP_N, head_size, sequence_length, kv_sequence_length, &one, value_cache, head_size, strideA,
+      scratch2, kv_sequence_length, temp_matrix_size, &zero, scratch3, head_size, strideB, BN, prop))) {
+      return false;
+    }
+  } else {  // V is read from qkv_buffer
+    if (!CUBLAS_CALL(cublasGemmStridedBatchedHelper(
+      cublas, CUBLAS_OP_N, CUBLAS_OP_N, head_size, sequence_length, kv_sequence_length, &one, v, head_size, strideA,
+      scratch2, kv_sequence_length, temp_matrix_size, &zero, scratch3, head_size, strideB, BN, prop))) {
+      return false;
+    }
+  }
+
+  // scratch3 is BxNxSxH, transpose to output SxBxNxH
+  return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, true, scratch3, output);
+}
+
+
+// Type-erased entry point: views the void* buffers as half (element_size == 2) or float (any other size) and runs DecoderQkvToContext.
+bool LaunchDecoderAttentionKernel(
+  const cudaDeviceProp& prop,
+  cudaStream_t stream,
+  cublasHandle_t& cublas,
+  const size_t element_size,
+  const int batch_size,
+  const int sequence_length,
+  const int kv_sequence_length,
+  const int num_heads,
+  const int head_size,
+  const bool static_kv,
+  const bool use_past,
+  const bool has_layer_state,
+  const bool has_key_padding_mask,
+  const void* gemm_query_buffer,
+  const void* gemm_kv_buffer,
+  const bool* key_padding_mask,
+  const void* key_cache,
+  const void* value_cache,
+  void* qkv_buffer,
+  void* workspace_buffer,
+  void* output,
+  void* new_key_cache,
+  void* new_value_cache) {
+  // Only the element size decides the precision; sizes other than 2 are not
+  // validated here and are handled by the float instantiation.
+  const bool is_half = (element_size == 2);
+  if (!is_half) {
+    return DecoderQkvToContext(
+      prop,
+      stream,
+      cublas,
+      element_size,
+      batch_size,
+      sequence_length,
+      kv_sequence_length,
+      num_heads,
+      head_size,
+      static_kv,
+      use_past,
+      has_layer_state,
+      has_key_padding_mask,
+      static_cast<const float*>(gemm_query_buffer),
+      static_cast<const float*>(gemm_kv_buffer),
+      key_padding_mask,
+      static_cast<const float*>(key_cache),
+      static_cast<const float*>(value_cache),
+      static_cast<float*>(qkv_buffer),
+      static_cast<float*>(workspace_buffer),
+      static_cast<float*>(output),
+      static_cast<float*>(new_key_cache),
+      static_cast<float*>(new_value_cache));
+  }
+
+  return DecoderQkvToContext(
+    prop,
+    stream,
+    cublas,
+    element_size,
+    batch_size,
+    sequence_length,
+    kv_sequence_length,
+    num_heads,
+    head_size,
+    static_kv,
+    use_past,
+    has_layer_state,
+    has_key_padding_mask,
+    static_cast<const half*>(gemm_query_buffer),
+    static_cast<const half*>(gemm_kv_buffer),
+    key_padding_mask,
+    static_cast<const half*>(key_cache),
+    static_cast<const half*>(value_cache),
+    static_cast<half*>(qkv_buffer),
+    static_cast<half*>(workspace_buffer),
+    static_cast<half*>(output),
+    static_cast<half*>(new_key_cache),
+    static_cast<half*>(new_value_cache));
+}
+
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h
index e07b45e897..9affcf2f49 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention_impl.h
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_impl.h
@@ -39,21 +39,71 @@ bool LaunchAttentionKernel(
     void* present                                 // Present state output
 );
 
-bool LaunchTransCtx(cudaStream_t stream,
-                    const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const float* input, float* output);
+bool LaunchDecoderAttentionKernel(
+    const cudaDeviceProp& prop,                   // Device properties
+    cudaStream_t stream,                          // CUDA stream
+    cublasHandle_t& cublas,                       // cuBLAS handle
+    const size_t element_size,                    // Element size of input tensor
+    const int batch_size,                         // Batch size (B)
+    const int sequence_length,                    // Sequence length (S)
+    const int kv_sequence_length,                 // Key/value/cache sequence length
+    const int num_heads,                          // Number of attention heads (N)
+    const int head_size,                          // Hidden layer size per head (H)
+    const bool static_kv,                         // True for cross attention, false otherwise
+    const bool use_past,                          // Whether to use the cache
+    const bool has_layer_state,                   // Whether to output the cache
+    const bool has_key_padding_mask,              // Whether key_padding_mask is used
+    const void* gemm_query_buffer,                // Query buffer
+    const void* gemm_kv_buffer,                   // Key and value buffer
+    const bool* key_padding_mask,                 // Key padding mask
+    const void* key_cache,                        // Input key cache
+    const void* value_cache,                      // Input value cache
+    void* qkv_buffer,                             // Temporary buffer
+    void* workspace_buffer,                       // Temporary buffer
+    void* output,                                 // Output tensor
+    void* new_key_cache,                          // Output: new key cache tensor
+    void* new_value_cache                         // Output: new value cache tensor
+);
 
 bool LaunchTransCtx(cudaStream_t stream,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const half* input, half* output);
+                    const int max_threads_per_block, const bool reversed_bs, const float* input, float* output);
 
-bool LaunchTransQkv(cudaStream_t stream,
+bool LaunchTransCtx(cudaStream_t stream,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const float* input, float* output);
+                    const int max_threads_per_block, const bool reversed_bs, const half* input, half* output);
 
-bool LaunchTransQkv(cudaStream_t stream,
+bool LaunchTransQkv(cudaStream_t stream, const int matrix_num,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const half* input, half* output);
+                    const int max_threads_per_block, const bool reversed_bs, const float* input, float* output);
+
+bool LaunchTransQkv(cudaStream_t stream, const int matrix_num,
+                    const int sequence_length, const int batch_size, const int head_size, const int num_heads,
+                    const int max_threads_per_block, const bool reversed_bs, const half* input, half* output);
+
+bool LaunchConcatTensorToTensor(cudaStream_t stream,
+                                const int all_sequence_length,
+                                const int sequence_length,
+                                const int batch_size,
+                                const int head_size,
+                                const int num_heads,
+                                const int max_threads_per_block,
+                                const int matrix_num,
+                                const float* tensor_in,
+                                const float* tensor_add,
+                                float* tensor_out);
+
+bool LaunchConcatTensorToTensor(cudaStream_t stream,
+                                const int all_sequence_length,
+                                const int sequence_length,
+                                const int batch_size,
+                                const int head_size,
+                                const int num_heads,
+                                const int max_threads_per_block,
+                                const int matrix_num,
+                                const half* tensor_in,
+                                const half* tensor_add,
+                                half* tensor_out);
 
 bool LaunchConcatPastToPresent(cudaStream_t stream,
                                const int all_sequence_length,
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_past.cu b/onnxruntime/contrib_ops/cuda/bert/attention_past.cu
deleted file mode 100644
index 3e6ba378d8..0000000000
--- a/onnxruntime/contrib_ops/cuda/bert/attention_past.cu
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/providers/cuda/cuda_common.h"
-#include "attention_impl.h"
-
-using namespace onnxruntime::cuda;
-
-namespace onnxruntime {
-namespace contrib {
-namespace cuda {
-
-template <typename T>
-__global__ void ConcatPastToPresent(const int sequence_length,
-                                    const T* past,
-                                    const T* k_v,
-                                    T* present) {
-  const int h = threadIdx.x;
-  const int n = threadIdx.y;
-  const int s = blockIdx.x;
-  const int b = blockIdx.y;
-  const int is_v = blockIdx.z;  // 0 for k, 1 for v
-
-  const int all_sequence_length = gridDim.x;
-  const int batch_size = gridDim.y;
-  const int num_heads = blockDim.y;
-  const int H = blockDim.x;
-
-  // past:    2 x BxNxS'xH   (past_k and past_v)
-  // k_v:     2 x BxNxSxH    (k and v)
-  // present: 2 x BxNxS*xH   (present_k and present_v)
-  const int past_sequence_length = all_sequence_length - sequence_length;
-
-  const int present_SH = all_sequence_length * H;
-  const int present_NSH = num_heads * present_SH;
-  int out_offset = b * present_NSH + n * present_SH + s * H + h + is_v * (present_NSH * batch_size);
-  if (s < past_sequence_length) {
-    const int past_SH = past_sequence_length * H;
-    const int past_NSH = num_heads * past_SH;
-    const int in_offset = b * past_NSH + n * past_SH + s * H + h + is_v * (past_NSH * batch_size);
-    present[out_offset] = past[in_offset];
-  } else if (s < all_sequence_length) {
-    const int SH = sequence_length * H;
-    const int NSH = num_heads * SH;
-    const int in_offset = b * NSH + n * SH + (s - past_sequence_length) * H + h + is_v * (NSH * batch_size);
-    present[out_offset] = k_v[in_offset];
-  }
-}
-
-template <typename T>
-__global__ void ConcatPastToPresentLarge(const int sequence_length,
-                                         const int H,
-                                         const T* past,
-                                         const T* k_v,
-                                         T* present) {
-  // Use when (H*)*num_heads > 1024
-  int h = threadIdx.x;
-  const int n = threadIdx.y;
-  const int s = blockIdx.x;
-  const int b = blockIdx.y;
-  const int is_v = blockIdx.z;  // 0 for k, 1 for v
-
-  const int all_sequence_length = gridDim.x;
-  const int batch_size = gridDim.y;
-  const int num_heads = blockDim.y;
-  const int stride = blockDim.x;
-
-  // past:    2 x BxNxS'xH   (past_k and past_v)
-  // k_v:     2 x BxNxSxH    (k and v)
-  // present: 2 x BxNxS*xH   (present_k and present_v)
-  const int past_sequence_length = all_sequence_length - sequence_length;
-
-  const int present_SH = all_sequence_length * H;
-  const int present_NSH = num_heads * present_SH;
-  while (h < H) {
-    int out_offset = b * present_NSH + n * present_SH + s * H + h + is_v * (present_NSH * batch_size);
-    if (s < past_sequence_length) {
-      const int past_SH = past_sequence_length * H;
-      const int past_NSH = num_heads * past_SH;
-      const int in_offset = b * past_NSH + n * past_SH + s * H + h + is_v * (past_NSH * batch_size);
-      present[out_offset] = past[in_offset];
-    } else if (s < all_sequence_length) {
-      const int SH = sequence_length * H;
-      const int NSH = num_heads * SH;
-      const int in_offset = b * NSH + n * SH + (s - past_sequence_length) * H + h + is_v * (NSH * batch_size);
-      present[out_offset] = k_v[in_offset];
-    }
-
-    h += stride;
-  }
-}
-
-bool LaunchConcatPastToPresent(cudaStream_t stream,
-                               const int all_sequence_length,
-                               const int sequence_length,
-                               const int batch_size,
-                               const int head_size,
-                               const int num_heads,
-                               const int max_threads_per_block,
-                               const float* past,
-                               const float* k_v,
-                               float* present) {
-  const dim3 grid(all_sequence_length, batch_size, 2);
-  if (0 == (head_size & 1)) {
-    const int H = head_size / 2;
-    if (H * num_heads <= max_threads_per_block) {
-      const dim3 block(H, num_heads, 1);
-      ConcatPastToPresent<float2><<<grid, block, 0, stream>>>(sequence_length, reinterpret_cast<const float2*>(past), reinterpret_cast<const float2*>(k_v), reinterpret_cast<float2*>(present));
-    } else {
-      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      ConcatPastToPresentLarge<float2><<<grid, block, 0, stream>>>(sequence_length, H, reinterpret_cast<const float2*>(past), reinterpret_cast<const float2*>(k_v), reinterpret_cast<float2*>(present));
-    }
-  } else {
-    if (head_size * num_heads <= max_threads_per_block) {
-      const dim3 block(head_size, num_heads, 1);
-      ConcatPastToPresent<float><<<grid, block, 0, stream>>>(sequence_length, past, k_v, present);
-    } else {
-      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      ConcatPastToPresentLarge<float><<<grid, block, 0, stream>>>(sequence_length, head_size, past, k_v, present);
-    }
-
-  }
-  return CUDA_CALL(cudaPeekAtLastError());
-}
-
-bool LaunchConcatPastToPresent(cudaStream_t stream,
-                               const int all_sequence_length,
-                               const int sequence_length,
-                               const int batch_size,
-                               const int head_size,
-                               const int num_heads,
-                               const int max_threads_per_block,
-                               const half* past,
-                               const half* k_v,
-                               half* present) {
-  const dim3 grid(all_sequence_length, batch_size, 2);
-  if (0 == (head_size % 4)) {
-    const int H = head_size / 4;
-    if (H * num_heads <= max_threads_per_block) {
-      const dim3 block(H, num_heads, 1);
-      ConcatPastToPresent<float2><<<grid, block, 0, stream>>>(sequence_length, reinterpret_cast<const float2*>(past), reinterpret_cast<const float2*>(k_v), reinterpret_cast<float2*>(present));
-    } else {
-      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      ConcatPastToPresentLarge<float2><<<grid, block, 0, stream>>>(sequence_length, H, reinterpret_cast<const float2*>(past), reinterpret_cast<const float2*>(k_v), reinterpret_cast<float2*>(present));
-    }
-  } else if (0 == (head_size & 1)) {
-    const int H = head_size / 2;
-    if (H * num_heads <= max_threads_per_block) {
-      const dim3 block(H, num_heads, 1);
-      ConcatPastToPresent<half2><<<grid, block, 0, stream>>>(sequence_length, reinterpret_cast<const half2*>(past), reinterpret_cast<const half2*>(k_v), reinterpret_cast<half2*>(present));
-    } else {
-      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      ConcatPastToPresentLarge<half2><<<grid, block, 0, stream>>>(sequence_length, H, reinterpret_cast<const half2*>(past), reinterpret_cast<const half2*>(k_v), reinterpret_cast<half2*>(present));
-    }
-  } else {  // this should be an "odd" case. probably not worth catching it in the half2 kernel.
-    if (head_size * num_heads <= max_threads_per_block) {
-      const dim3 block(head_size, num_heads, 1);
-      ConcatPastToPresent<half><<<grid, block, 0, stream>>>(sequence_length, past, k_v, present);
-    } else {
-      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      ConcatPastToPresentLarge<half><<<grid, block, 0, stream>>>(sequence_length, head_size, past, k_v, present);
-    }
-  }
-  return CUDA_CALL(cudaPeekAtLastError());
-}
-
-}  // namespace cuda
-}  // namespace contrib
-}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h
index 7f5eb3b00c..d5f908b922 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_softmax.h
@@ -169,6 +169,7 @@ template <typename T, unsigned TPB>
 __device__ inline void SoftmaxWithRawMaskSmall(const int all_sequence_length,
                                                const int sequence_length,
                                                const int* attention_mask,  // 2D, 3D or 4D attention mask
+                                               const bool* key_padding_mask,
                                                const T* add_before_softmax,
                                                const T* input,
                                                T* output,
@@ -212,9 +213,16 @@ __device__ inline void SoftmaxWithRawMaskSmall(const int all_sequence_length,
       mask_offset = (batch_index * max_sequence_length + all_sequence_length - sequence_length + sequence_index) * max_sequence_length + threadIdx.x;
     }
 
-    const int& mask = attention_mask[mask_offset];
-    if (mask == 0)
-      thread_data += -10000.0f;
+    if (nullptr == key_padding_mask) {
+      const int& mask = attention_mask[mask_offset];
+      if (mask == 0)
+        thread_data += -10000.0f;
+    } else {
+      const bool mask = key_padding_mask[mask_offset];
+      if (mask) {
+        thread_data = -CUDART_INF_F;
+      }
+    }
   }
 
   if (skip_softmax) {
@@ -332,10 +340,10 @@ __global__ void MaskedSoftmaxKernel(const int all_sequence_length, const int seq
 }
 
 template <typename T, unsigned TPB>
-__global__ void SoftmaxWithRawMaskSmallKernel(const int all_sequence_length, const int sequence_length, const int* attention_mask, const T* add_before_softmax, const T* input, 
+__global__ void SoftmaxWithRawMaskSmallKernel(const int all_sequence_length, const int sequence_length, const int* attention_mask, const bool* key_padding_mask, const T* add_before_softmax, const T* input,
                                               T* output, const bool is_unidirectional, const float rsqrt_head_size, const int mask_dimension, const int max_sequence_length,
                                               const bool skip_softmax) {
-  SoftmaxWithRawMaskSmall<T, TPB>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, output, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, skip_softmax);
+  SoftmaxWithRawMaskSmall<T, TPB>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, output, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, skip_softmax);
 }
 
 template <typename T>
@@ -380,35 +388,35 @@ bool ComputeSoftmaxWithMask1D(cudaStream_t stream, const int all_sequence_length
 
 template <typename T>
 bool ComputeSoftmaxWithRawMask(cudaStream_t stream, const int all_sequence_length, const int sequence_length, const int batch_size, const int num_heads,
-                               const int* attention_mask, const T* add_before_softmax, const T* input, T* output, const bool is_unidirectional, const float rsqrt_head_size,
-                               const int mask_dimension, const int max_sequence_length, const bool use_persistent_softmax, T* persistent_softmax_workspace) {
+                               const int* attention_mask, const bool* key_padding_mask, const T* add_before_softmax, const T* input, T* output, const bool is_unidirectional,
+                               const float rsqrt_head_size, const int mask_dimension, const int max_sequence_length, const bool use_persistent_softmax, T* persistent_softmax_workspace) {
   const dim3 grid(sequence_length * num_heads, batch_size, 1);
 
   T* out = use_persistent_softmax ? persistent_softmax_workspace : output;
   if (all_sequence_length <= 32) {
     const int blockSize = 32;
     SoftmaxWithRawMaskSmallKernel<T, blockSize>
-        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
+        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
   } else if (all_sequence_length <= 64) {
     const int blockSize = 64;
     SoftmaxWithRawMaskSmallKernel<T, blockSize>
-        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
+        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
   } else if (all_sequence_length <= 128) {
     const int blockSize = 128;
     SoftmaxWithRawMaskSmallKernel<T, blockSize>
-        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
+        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
   } else if (all_sequence_length <= 256) {
     const int blockSize = 256;
     SoftmaxWithRawMaskSmallKernel<T, blockSize>
-        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
+        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
   } else if (all_sequence_length <= 512) {
     const int blockSize = 512;
     SoftmaxWithRawMaskSmallKernel<T, blockSize>
-        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
+        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
   } else if (all_sequence_length <= 1024) {
     const int blockSize = 1024;
     SoftmaxWithRawMaskSmallKernel<T, blockSize>
-        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
+        <<<grid, blockSize, 0, stream>>>(all_sequence_length, sequence_length, attention_mask, key_padding_mask, add_before_softmax, input, out, is_unidirectional, rsqrt_head_size, mask_dimension, max_sequence_length, use_persistent_softmax);
   } else {
     ORT_THROW("Attention CUDA operator does not support total sequence length > 1024.");
   }
diff --git a/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu b/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu
index 456c39ab32..1c9c7a8532 100644
--- a/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/attention_transpose.cu
@@ -27,7 +27,7 @@ namespace contrib {
 namespace cuda {
 
 template <typename T>
-__global__ void TransposeCtx(const int H, const T* input, T* output) {
+__global__ void TransposeCtx(const int H, const bool reversed_bs, const T* input, T* output) {
   // Input:  BxNxSxH
   // Output: BxSxNxH
 
@@ -41,7 +41,15 @@ __global__ void TransposeCtx(const int H, const T* input, T* output) {
   const int NH = num_heads * H;
   const int NHS = NH * sequence_length;
   const int in_offset = s * H + n * sequence_length * H + b * NHS;
-  const int out_offset = n * H + s * NH + b * NHS;
+
+  int out_offset = 0;
+  if (reversed_bs) {
+    const int batch_size = gridDim.y;
+    const int BNH = NH * batch_size;
+    out_offset = n * H + b * NH + s * BNH;
+  } else {
+    out_offset = n * H + s * NH + b * NHS;
+  }
 
   const int i = threadIdx.x;
   if (i < H) {
@@ -50,7 +58,7 @@ __global__ void TransposeCtx(const int H, const T* input, T* output) {
 }
 
 template <typename T>
-__global__ void TransposeCtxLarge(const int H, const T* input, T* output) {
+__global__ void TransposeCtxLarge(const int H, const bool reversed_bs, const T* input, T* output) {
   // Use when (H*)*num_heads > 1024
 
   // Input:  BxNxSxH
@@ -67,7 +75,15 @@ __global__ void TransposeCtxLarge(const int H, const T* input, T* output) {
   const int NH = num_heads * H;
   const int NHS = NH * sequence_length;
   const int in_offset = s * H + n * sequence_length * H + b * NHS;
-  const int out_offset = n * H + s * NH + b * NHS;
+
+  int out_offset = 0;
+  if (reversed_bs) {
+    const int batch_size = gridDim.y;
+    const int BNH = NH * batch_size;
+    out_offset = n * H + b * NH + s * BNH;
+  } else {
+    out_offset = n * H + s * NH + b * NHS;
+  }
 
   int i = threadIdx.x;
   while (i < H) {
@@ -78,7 +94,7 @@ __global__ void TransposeCtxLarge(const int H, const T* input, T* output) {
 
 bool LaunchTransCtx(cudaStream_t stream,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const float* input, float* output) {
+                    const int max_threads_per_block, const bool reversed_bs, const float* input, float* output) {
   const dim3 grid(sequence_length, batch_size, 1);
   if (0 == (head_size & 1)) {
     const int H = head_size / 2;
@@ -86,18 +102,18 @@ bool LaunchTransCtx(cudaStream_t stream,
     float2* output2 = reinterpret_cast<float2*>(output);
     if (H * num_heads <= max_threads_per_block) {
       const dim3 block(H, num_heads, 1);
-      TransposeCtx<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeCtx<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeCtxLarge<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeCtxLarge<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     }
   } else {
     if (head_size * num_heads <= max_threads_per_block) {
       const dim3 block(head_size, num_heads, 1);
-      TransposeCtx<float><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeCtx<float><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeCtxLarge<float><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeCtxLarge<float><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     }
   }
   return CUDA_CALL(cudaPeekAtLastError());
@@ -105,7 +121,7 @@ bool LaunchTransCtx(cudaStream_t stream,
 
 bool LaunchTransCtx(cudaStream_t stream,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const half* input, half* output) {
+                    const int max_threads_per_block, const bool reversed_bs, const half* input, half* output) {
   const dim3 grid(sequence_length, batch_size, 1);
   if (0 == (head_size % 4)) {
     const int H = head_size / 4;
@@ -113,10 +129,10 @@ bool LaunchTransCtx(cudaStream_t stream,
     float2* output2 = reinterpret_cast<float2*>(output);
     if (H * num_heads <= max_threads_per_block) {
       const dim3 block(H, num_heads, 1);
-      TransposeCtx<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeCtx<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeCtxLarge<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeCtxLarge<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     }
   } else if (0 == (head_size & 1)) {
     const int H = head_size / 2;
@@ -124,18 +140,18 @@ bool LaunchTransCtx(cudaStream_t stream,
     half2* output2 = reinterpret_cast<half2*>(output);
     if (H * num_heads <= max_threads_per_block) {
       const dim3 block(H, num_heads, 1);
-      TransposeCtx<half2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeCtx<half2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeCtxLarge<half2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeCtxLarge<half2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     }
   } else {  // this should be an "odd" case. probably not worth catching it in the half2 kernel.
     if (head_size * num_heads <= max_threads_per_block) {
       const dim3 block(head_size, num_heads, 1);
-      TransposeCtx<half><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeCtx<half><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeCtxLarge<half><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeCtxLarge<half><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     }
   }
 
@@ -143,9 +159,10 @@ bool LaunchTransCtx(cudaStream_t stream,
 }
 
 template <typename T>
-__global__ void TransposeQKV(const int H, const T* input, T* output) {
-  // Input:  BxSx3xNxH
-  // Output: 3xBxNxSxH
+__global__ void TransposeQKV(const int H, const bool reversed_bs, const T* input, T* output) {
+  // Input:  BxSxKxNxH or SxBxKxNxH
+  // Output: KxBxNxSxH
+  // K is the number of identically shaped matrices (chunk_num = gridDim.z)
 
   int n = threadIdx.y;
   int s = blockIdx.x;
@@ -156,9 +173,17 @@ __global__ void TransposeQKV(const int H, const T* input, T* output) {
 
   const int sequence_length = gridDim.x;
   const int batch_size = gridDim.y;
+  const int chunk_num = gridDim.z;
   const int NH = num_heads * H;
   const int NHS = NH * sequence_length;
-  const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3;
+
+  int in_offset = 0;
+  if (reversed_bs) {
+    const int BNH = NH * batch_size;
+    in_offset = n * H + (m + b * chunk_num) * NH + s * BNH * chunk_num;
+  } else {
+    in_offset = n * H + (m + s * chunk_num) * NH + b * NHS * chunk_num;
+  }
   const int out_offset = s * H + n * sequence_length * H + b * NHS + m * NHS * batch_size;
 
   const int i = threadIdx.x;
@@ -168,11 +193,12 @@ __global__ void TransposeQKV(const int H, const T* input, T* output) {
 }
 
 template <typename T>
-__global__ void TransposeQKVLarge(const int H, const T* input, T* output) {
+__global__ void TransposeQKVLarge(const int H, const bool reversed_bs, const T* input, T* output) {
   // Use when (H*)*num_heads > 1024
 
-  // Input:  BxSx3xNxH
-  // Output: 3xBxNxSxH
+  // Input:  BxSxKxNxH or SxBxKxNxH
+  // Output: KxBxNxSxH
+  // K is the number of identically shaped matrices (chunk_num = gridDim.z)
 
   int n = threadIdx.y;
   int s = blockIdx.x;
@@ -184,9 +210,16 @@ __global__ void TransposeQKVLarge(const int H, const T* input, T* output) {
 
   const int sequence_length = gridDim.x;
   const int batch_size = gridDim.y;
+  const int chunk_num = gridDim.z;
   const int NH = num_heads * H;
   const int NHS = NH * sequence_length;
-  const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3;
+  int in_offset = 0;
+  if (reversed_bs) {
+    const int BNH = NH * batch_size;
+    in_offset = n * H + (m + b * chunk_num) * NH + s * BNH * chunk_num;
+  } else {
+    in_offset = n * H + (m + s * chunk_num) * NH + b * NHS * chunk_num;
+  }
   const int out_offset = s * H + n * sequence_length * H + b * NHS + m * NHS * batch_size;
 
   int i = threadIdx.x;
@@ -196,48 +229,48 @@ __global__ void TransposeQKVLarge(const int H, const T* input, T* output) {
   }
 }
 
-bool LaunchTransQkv(cudaStream_t stream,
+bool LaunchTransQkv(cudaStream_t stream, const int matrix_num,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const float* input, float* output) {
-  const dim3 grid(sequence_length, batch_size, 3);
+                    const int max_threads_per_block, const bool reversed_bs, const float* input, float* output) {
+  const dim3 grid(sequence_length, batch_size, matrix_num);
   if (0 == (head_size & 1)) {
     const int H = head_size / 2;
     const float2* input2 = reinterpret_cast<const float2*>(input);
     float2* output2 = reinterpret_cast<float2*>(output);
     if (H * num_heads <= max_threads_per_block) {
       const dim3 block(H, num_heads, 1);
-      TransposeQKV<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeQKV<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeQKVLarge<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeQKVLarge<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     }
   } else {
     if (head_size * num_heads <= max_threads_per_block) {
       const dim3 block(head_size, num_heads, 1);
-      TransposeQKV<float><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeQKV<float><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeQKVLarge<float><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeQKVLarge<float><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     }
 
   }
   return CUDA_CALL(cudaPeekAtLastError());
 }
 
-bool LaunchTransQkv(cudaStream_t stream,
+bool LaunchTransQkv(cudaStream_t stream, const int matrix_num,
                     const int sequence_length, const int batch_size, const int head_size, const int num_heads,
-                    const int max_threads_per_block, const half* input, half* output) {
-  const dim3 grid(sequence_length, batch_size, 3);
+                    const int max_threads_per_block, const bool reversed_bs, const half* input, half* output) {
+  const dim3 grid(sequence_length, batch_size, matrix_num);
   if (0 == (head_size % 4)) {
     const int H = head_size / 4;
     const float2* input2 = reinterpret_cast<const float2*>(input);
     float2* output2 = reinterpret_cast<float2*>(output);
     if (H * num_heads <= max_threads_per_block) {
       const dim3 block(H, num_heads, 1);
-      TransposeQKV<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeQKV<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeQKVLarge<float2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeQKVLarge<float2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     }
   } else if (0 == (head_size & 1)) {
     const int H = head_size / 2;
@@ -245,23 +278,24 @@ bool LaunchTransQkv(cudaStream_t stream,
     half2* output2 = reinterpret_cast<half2*>(output);
     if (H * num_heads <= max_threads_per_block) {
       const dim3 block(H, num_heads, 1);
-      TransposeQKV<half2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeQKV<half2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeQKVLarge<half2><<<grid, block, 0, stream>>>(H, input2, output2);
+      TransposeQKVLarge<half2><<<grid, block, 0, stream>>>(H, reversed_bs, input2, output2);
     }
   } else {  // this should be an "odd" case. probably not worth catching it in the half2 kernel..
     if (head_size * num_heads <= max_threads_per_block) {
       const dim3 block(head_size, num_heads, 1);
-      TransposeQKV<half><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeQKV<half><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     } else {
       const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
-      TransposeQKVLarge<half><<<grid, block, 0, stream>>>(head_size, input, output);
+      TransposeQKVLarge<half><<<grid, block, 0, stream>>>(head_size, reversed_bs, input, output);
     }
   }
   return CUDA_CALL(cudaPeekAtLastError());
 }
 
+
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc
new file mode 100644
index 0000000000..d069cc3663
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.cc
@@ -0,0 +1,387 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "attention_impl.h"
+#include "decoder_attention.h"
+#include "transformer_cuda_common.h"
+#include "core/framework/op_kernel.h"
+#include "core/providers/cuda/shared_inc/fpgeneric.h"
+
+using namespace onnxruntime::cuda;
+using namespace ::onnxruntime::common;
+using namespace ONNX_NAMESPACE;
+
+namespace onnxruntime {
+namespace contrib {
+namespace cuda {
+
+#define REGISTER_KERNEL_TYPED(T)                                  \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
+      DecoderAttention,                                           \
+      kMSDomain,                                                  \
+      1,                                                          \
+      T,                                                          \
+      kCudaExecutionProvider,                                     \
+      (*KernelDefBuilder::Create())                               \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      DecoderAttention<T>);
+
+REGISTER_KERNEL_TYPED(float)
+REGISTER_KERNEL_TYPED(MLFloat16)
+
+namespace {
+
+Status CheckInputs(const TensorShape& query_shape,
+                   const TensorShape& key_shape,
+                   const TensorShape& q_weights_shape,
+                   const TensorShape& kv_weights_shape,
+                   const TensorShape& bias_shape,
+                   const Tensor* key_padding_mask,
+                   const Tensor* key_cache,
+                   const Tensor* value_cache,
+                   const bool static_kv,
+                   const bool use_past,
+                   const bool has_layer_state,
+                   const bool has_key_padding_mask) {
+
+  const auto& query_shape_dims = query_shape.GetDims();
+  if (query_shape_dims.size() != 3) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'query' is expected to have 3 dimensions, got ",
+                           query_shape_dims.size());
+  }
+
+  int sequence_length = static_cast<int>(query_shape_dims[0]);
+  int batch_size = static_cast<int>(query_shape_dims[1]);
+  int hidden_size = static_cast<int>(query_shape_dims[2]);
+
+  const auto& key_shape_dims = key_shape.GetDims();
+  if (key_shape_dims.size() != 3) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key' is expected to have 3 dimensions, got ",
+                           key_shape_dims.size());
+  }
+  int kv_sequence_length = static_cast<int>(key_shape_dims[0]);
+
+  if (query_shape_dims[1] != key_shape_dims[1]) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "query and key shall have the same batch size");
+  }
+
+  if (query_shape_dims[2] != key_shape_dims[2]) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "query and key shall have the same hidden size");
+  }
+
+  const auto& q_weights_dims = q_weights_shape.GetDims();
+  if (q_weights_dims.size() != 2) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'q_weights' is expected to have 2 dimensions, got ",
+                           q_weights_dims.size());
+  }
+
+  const auto& kv_weights_dims = kv_weights_shape.GetDims();
+  if (kv_weights_dims.size() != 2) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'kv_weights' is expected to have 2 dimensions, got ",
+                           kv_weights_dims.size());
+  }
+
+  if (q_weights_dims[0] != hidden_size || q_weights_dims[1] != hidden_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "q_weights shall have shape (hidden size, hidden size)");
+  }
+
+  if (kv_weights_dims[0] != hidden_size || kv_weights_dims[1] != 2 * hidden_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "kv_weights shall have shape (hidden size, 2 * hidden size)");
+  }
+
+  const auto& bias_dims = bias_shape.GetDims();
+  if (bias_dims.size() != 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'bias' is expected to have 1 dimension, got ",
+                           bias_dims.size());
+  }
+
+  if (bias_dims[0] != 3 * hidden_size) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "bias shall have shape (3 * hidden size)");
+  }
+
+  int key_length = kv_sequence_length;
+  if (key_padding_mask != nullptr && has_key_padding_mask == true) {
+    const auto& kp_mask_dims = key_padding_mask->Shape().GetDims();
+
+    if (kp_mask_dims.size() != 2) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key_padding_mask' is expected to have 2 dimension, got ",
+                             kp_mask_dims.size());
+    }
+
+    if (kp_mask_dims[0] != batch_size) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "key_padding_mask shall have same batch size with query");
+    }
+
+    if (!has_layer_state || !use_past) {
+      if (!static_kv) {
+        key_length = sequence_length;
+      }
+    } else {
+      if (!static_kv) {
+        key_length = sequence_length + kv_sequence_length;
+      }
+    }
+
+    if (kp_mask_dims[1] != key_length) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "key_padding_mask shall have same sequence length as generated key");
+    }
+  }
+
+  if (key_cache != nullptr && value_cache != nullptr && has_layer_state && use_past) {
+    const auto& key_cache_dims = key_cache->Shape().GetDims();
+    if (key_cache_dims.size() != 4) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'key_cache' is expected to have 4 dimension, got ",
+                             key_cache_dims.size());
+    }
+
+    const auto& value_cache_dims = value_cache->Shape().GetDims();
+    if (value_cache_dims.size() != 4) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Input 'value_cache' is expected to have 4 dimension, got ",
+                             value_cache_dims.size());
+    }
+
+    if (key_cache_dims[0] != batch_size) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "key_cache shall have same batch size as query");
+    }
+
+    if (value_cache_dims[0] != batch_size) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "value_cache shall have same batch size as query");
+    }
+
+    if (key_cache_dims[1] * key_cache_dims[3] != hidden_size) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "key_cache shall have correct hidden size");
+    }
+
+    if (value_cache_dims[1] * value_cache_dims[3] != hidden_size) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "value_cache shall have correct hidden size");
+    }
+  }
+
+  return Status::OK();
+}
+} // anonymous namespace
+
+template <typename T>
+DecoderAttention<T>::DecoderAttention(const OpKernelInfo& info) : CudaKernel(info) {
+  int64_t num_heads = 0;
+  ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0);
+  num_heads_ = static_cast<int>(num_heads);
+}
+
+template <typename T>
+Status DecoderAttention<T>::ComputeInternal(OpKernelContext* context) const {
+  const Tensor* query(context->Input<Tensor>(0));
+  const Tensor* key(context->Input<Tensor>(1));
+  const Tensor* q_weights(context->Input<Tensor>(2));
+  const Tensor* kv_weights(context->Input<Tensor>(3));
+  const Tensor* bias(context->Input<Tensor>(4));
+  const Tensor* key_padding_mask(context->Input<Tensor>(5));
+  const Tensor* key_cache(context->Input<Tensor>(6));
+  const Tensor* value_cache(context->Input<Tensor>(7));
+  const Tensor* static_kv(context->Input<Tensor>(8));
+  const Tensor* use_past(context->Input<Tensor>(9));
+  const Tensor* has_layer_state(context->Input<Tensor>(10));
+  const Tensor* has_key_padding_mask(context->Input<Tensor>(11));
+
+  cudaStream_t stream = Stream();
+
+  // Copy static_kv, use_past, has_layer_state and has_key_padding_mask to CPU
+  auto pinned_buffer = AllocateBufferOnCPUPinned<void>(4 * sizeof(bool));
+  bool* kernel_state_pinned = reinterpret_cast<bool*>(pinned_buffer.get());
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(kernel_state_pinned, static_kv->template Data<bool>(), sizeof(bool), cudaMemcpyDeviceToHost, stream));
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(kernel_state_pinned + 1, use_past->template Data<bool>(), sizeof(bool), cudaMemcpyDeviceToHost, stream));
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(kernel_state_pinned + 2, has_layer_state->template Data<bool>(), sizeof(bool), cudaMemcpyDeviceToHost, stream));
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(kernel_state_pinned + 3, has_key_padding_mask->template Data<bool>(), sizeof(bool), cudaMemcpyDeviceToHost, stream));
+
+  // Create an event to make sure the async copy is finished before reading the data.
+  AutoDestoryCudaEvent new_event;
+  cudaEvent_t& isCopyDone = new_event.Get();
+
+  CUDA_RETURN_IF_ERROR(cudaEventCreate(&isCopyDone));
+  CUDA_RETURN_IF_ERROR(cudaEventRecord(isCopyDone, stream));
+
+  auto& device_prop = GetDeviceProp();
+
+  // query shape (sequence_length, batch_size, hidden_size)
+  const auto& query_shape = query->Shape();
+  int sequence_length = static_cast<int>(query_shape[0]);
+  int batch_size = static_cast<int>(query_shape[1]);
+  int hidden_size = static_cast<int>(query_shape[2]);
+
+  const auto& key_shape = key->Shape();
+  int key_sequence_length = static_cast<int>(key_shape[0]);
+  int head_size = hidden_size / num_heads_;
+
+  // k, v sequence length after gemm
+  int kv_sequence_length = 0;
+
+  // Generate q, k, v w/o cache
+  // query input: (S, B, h1)
+  // key input: (S', B, h1)
+  // weight: (h1, h2)
+  // h = N*H
+  cublasHandle_t cublas = CublasHandle();
+  CUBLAS_RETURN_IF_ERROR(cublasSetStream(cublas, stream));
+  constexpr size_t element_size = sizeof(T);
+
+  typedef typename ToCudaType<T>::MappedType CudaT;
+  CudaT one = ToCudaType<T>::FromFloat(1.0f);
+  CudaT zero = ToCudaType<T>::FromFloat(0.0f);
+
+  int m = 0, n = 0, k = 0;
+  IAllocatorUniquePtr<T> gemm_query_buffer_p(nullptr);
+  IAllocatorUniquePtr<T> gemm_kv_buffer_p(nullptr);
+
+  CUDA_RETURN_IF_ERROR(cudaEventSynchronize(isCopyDone));
+  bool static_kv_ = *kernel_state_pinned;
+  bool use_past_ = *(kernel_state_pinned + 1);
+  bool has_layer_state_ = *(kernel_state_pinned + 2);
+  bool has_key_padding_mask_ = *(kernel_state_pinned + 3);
+
+  ORT_RETURN_IF_ERROR(
+    CheckInputs(query->Shape(),
+                key->Shape(),
+                q_weights->Shape(),
+                kv_weights->Shape(),
+                bias->Shape(),
+                key_padding_mask,
+                key_cache,
+                value_cache,
+                static_kv_,
+                use_past_,
+                has_layer_state_,
+                has_key_padding_mask_)
+  );
+
+  // calculate q
+  gemm_query_buffer_p = GetScratchBuffer<T>(batch_size * sequence_length * hidden_size * element_size);
+  m = sequence_length * batch_size;
+  n = hidden_size;
+  k = hidden_size;
+  // broadcast bias for query: (h2, S*B)
+  CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+      cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one,
+      reinterpret_cast<const CudaT*>(bias->template Data<T>()), n,
+      GetConstOnes<CudaT>(m), 1,
+      &zero, reinterpret_cast<CudaT*>(gemm_query_buffer_p.get()), n, device_prop));
+  // matmul: (h2, h1)*(h1, S*B)
+  CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+      cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
+      reinterpret_cast<const CudaT*>(q_weights->template Data<T>()), n,
+      reinterpret_cast<const CudaT*>(query->template Data<T>()), k,
+      &one, reinterpret_cast<CudaT*>(gemm_query_buffer_p.get()), n, device_prop));
+  // gemm_query_buffer in col-base: (h2, S*B)
+
+  // calculate k, v
+  n = 2 * hidden_size;
+  k = hidden_size;
+  if (!has_layer_state_ || !use_past_) {
+    if (!static_kv_) {
+      gemm_kv_buffer_p = GetScratchBuffer<T>(batch_size * 2 * sequence_length * hidden_size * element_size);
+      m = sequence_length * batch_size;
+      n = 2 * hidden_size;
+      k = hidden_size;
+      kv_sequence_length = sequence_length;
+      // broadcast bias for key and value: (2*h2, T_S*B)
+      CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one,
+          reinterpret_cast<const CudaT*>(bias->template Data<T>() + hidden_size), n,
+          GetConstOnes<CudaT>(m), 1,
+          &zero, reinterpret_cast<CudaT*>(gemm_kv_buffer_p.get()), n, device_prop));
+      // matmul: (2*h2, h1)*(h1, T_S*B)
+      CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
+          reinterpret_cast<const CudaT*>(kv_weights->template Data<T>()), n,
+          reinterpret_cast<const CudaT*>(query->template Data<T>()), k,
+          &one, reinterpret_cast<CudaT*>(gemm_kv_buffer_p.get()), n, device_prop));
+      // gemm_kv_buffer in col-base: (2*h2, T_S*B)
+    } else {
+      gemm_kv_buffer_p = GetScratchBuffer<T>(batch_size * 2 * key_sequence_length * hidden_size * element_size);
+      m = key_sequence_length * batch_size;
+      n = 2 * hidden_size;
+      k = hidden_size;
+      kv_sequence_length = key_sequence_length;
+      // broadcast bias for key and value: (2*h2, T_S*B)
+      CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one,
+          reinterpret_cast<const CudaT*>(bias->template Data<T>() + hidden_size), n,
+          GetConstOnes<CudaT>(m), 1,
+          &zero, reinterpret_cast<CudaT*>(gemm_kv_buffer_p.get()), n, device_prop));
+      // matmul: (2*h2, h1)*(h1, T_S*B)
+      CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
+          reinterpret_cast<const CudaT*>(kv_weights->template Data<T>()), n,
+          reinterpret_cast<const CudaT*>(key->template Data<T>()), k,
+          &one, reinterpret_cast<CudaT*>(gemm_kv_buffer_p.get()), n, device_prop));
+      // gemm_kv_buffer in col-base: (2*h2, T_S*B)
+    }
+  } else {
+    ORT_ENFORCE(nullptr != key_cache && nullptr != value_cache); // (B, N, S, H)
+    const auto& cache_shape = key_cache->Shape();
+    // key and value cache have identical shape
+    int cache_sequence_length = static_cast<int>(cache_shape[2]);
+    if (!static_kv_) {
+      gemm_kv_buffer_p = GetScratchBuffer<T>(batch_size * 2 * sequence_length * hidden_size * element_size);
+      m = sequence_length * batch_size;
+      kv_sequence_length = cache_sequence_length + sequence_length;
+      // broadcast bias for key and value: (2*h2, T_S*B)
+      CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, 1, &one,
+          reinterpret_cast<const CudaT*>(bias->template Data<T>() + hidden_size), n,
+          GetConstOnes<CudaT>(m), 1,
+          &zero, reinterpret_cast<CudaT*>(gemm_kv_buffer_p.get()), n, device_prop));
+      // matmul: (2*h2, h1)*(h1, T_S*B)
+      CUBLAS_RETURN_IF_ERROR(cublasGemmHelper(
+          cublas, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &one,
+          reinterpret_cast<const CudaT*>(kv_weights->template Data<T>()), n,
+          reinterpret_cast<const CudaT*>(query->template Data<T>()), k,
+          &one, reinterpret_cast<CudaT*>(gemm_kv_buffer_p.get()), n, device_prop));
+      // gemm_kv_buffer in col-base: (2*h2, T_S*B)
+    } else {
+      kv_sequence_length = cache_sequence_length;
+    }
+  }
+
+  auto qkv_buffer_p = GetScratchBuffer<void>(batch_size * (sequence_length + 2 * kv_sequence_length) * hidden_size * element_size);
+  auto workspace_p = GetScratchBuffer<void>(2 * batch_size * sequence_length * num_heads_ * element_size * (2 * head_size + kv_sequence_length));
+
+  Tensor* output(context->Output(0, query_shape));
+  TensorShape new_cache_shape({batch_size, num_heads_, kv_sequence_length, head_size});
+  Tensor* new_key_cache(context->Output(1, new_cache_shape));
+  Tensor* new_value_cache(context->Output(2, new_cache_shape));
+
+  if (!LaunchDecoderAttentionKernel(
+          device_prop,
+          stream,
+          cublas,
+          element_size,
+          batch_size,
+          sequence_length,
+          kv_sequence_length,
+          num_heads_,
+          head_size,
+          static_kv_,
+          use_past_,
+          has_layer_state_,
+          has_key_padding_mask_,
+          nullptr == gemm_query_buffer_p ? nullptr : reinterpret_cast<const CudaT*>(gemm_query_buffer_p.get()),
+          nullptr == gemm_kv_buffer_p ? nullptr : reinterpret_cast<const CudaT*>(gemm_kv_buffer_p.get()),
+          nullptr == key_padding_mask ? nullptr : key_padding_mask->template Data<bool>(),
+          nullptr == key_cache ? nullptr : key_cache->template Data<T>(),
+          nullptr == value_cache ? nullptr : value_cache->template Data<T>(),
+          qkv_buffer_p.get(),
+          workspace_p.get(),
+          output->template MutableData<T>(),
+          nullptr == new_key_cache ? nullptr : new_key_cache->template MutableData<T>(),
+          nullptr == new_value_cache ? nullptr : new_value_cache->template MutableData<T>())) {
+    // Get last error to reset it to cudaSuccess.
+    CUDA_CALL(cudaGetLastError());
+    return Status(common::ONNXRUNTIME, common::FAIL);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace cuda
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/decoder_attention.h b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.h
new file mode 100644
index 0000000000..6f887f43db
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/bert/decoder_attention.h
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/common/common.h"
+#include "core/providers/cuda/cuda_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace cuda {
+
+using namespace onnxruntime::cuda;
+
+template <typename T>
+class DecoderAttention final : public CudaKernel {
+ public:
+  DecoderAttention(const OpKernelInfo& info);
+  Status ComputeInternal(OpKernelContext* context) const override;
+
+ private:
+  int num_heads_;
+};
+
+}  // namespace cuda
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc
index c21adcfc72..a10df58ba2 100644
--- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention.cc
@@ -1,12 +1,13 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/shared_inc/fpgeneric.h"
 #include "core/platform/env_var_utils.h"
 #include "longformer_attention.h"
 #include "longformer_global_impl.h"
 #include "longformer_attention_impl.h"
+#include "transformer_cuda_common.h"
+#include "transformer_common.h"
 
 using namespace onnxruntime::cuda;
 using namespace ::onnxruntime::common;
@@ -30,25 +31,6 @@ namespace cuda {
 REGISTER_KERNEL_TYPED(float)
 REGISTER_KERNEL_TYPED(MLFloat16)
 
-// A wrapper class of cudaEvent_t to destroy the event automatically for avoiding memory leak.
-class AutoDestoryCudaEvent {
- public:
-  AutoDestoryCudaEvent() : cuda_event_(nullptr) {
-  }
-
-  ~AutoDestoryCudaEvent() {
-    if (cuda_event_ != nullptr)
-      cudaEventDestroy(cuda_event_);
-  }
-
-  cudaEvent_t& Get() {
-    return cuda_event_;
-  }
-
- private:
-  cudaEvent_t cuda_event_;
-};
-
 template <typename T>
 LongformerAttention<T>::LongformerAttention(const OpKernelInfo& info) : CudaKernel(info), LongformerAttentionBase(info) {
   use_compact_memory_ = ParseEnvironmentVariableWithDefault<bool>(longformer::kUseCompactMemory, false);
diff --git a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu
index ba71c8b6f3..ecc6f917ba 100644
--- a/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/bert/longformer_attention_impl.cu
@@ -843,7 +843,7 @@ bool LongformerQkvToContext(
   const int max_threads_per_block(device_prop.maxThreadsPerBlock);
 
   // Input should be BxSx3xNxH => qkv: 3xBxNxSxH
-  if (!LaunchTransQkv(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, input, qkv)) {
+  if (!LaunchTransQkv(stream, 3, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, false, input, qkv)) {
     return false;
   }
 
@@ -852,7 +852,7 @@ bool LongformerQkvToContext(
 
   // When there is no global token, no need to process global Q, K and V
   if (max_num_global > 0 && nullptr != global_input) {
-    if (!LaunchTransQkv(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, global_input, global_qkv)) {
+    if (!LaunchTransQkv(stream, 3, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, false, global_input, global_qkv)) {
       return false;
     }
   }
@@ -928,7 +928,7 @@ bool LongformerQkvToContext(
 
 
   // The temp_output is BxNxSxH, transpose it to final output BxSxNxH
-  return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, temp_output, output);
+  return LaunchTransCtx(stream, sequence_length, batch_size, head_size, num_heads, max_threads_per_block, false, temp_output, output);
 }
 
 bool LaunchLongformerAttentionKernel(
diff --git a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc
index dd975ca90f..d6cac35261 100644
--- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc
@@ -41,6 +41,14 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   Tensor* output = ctx->Output(0, input->Shape());
 
+  if (input->SizeInBytes() == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'input' has no data from upstream nodes");
+  }
+
+  if (skip->SizeInBytes() == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'skip' has no data from upstream nodes");
+  }
+
   const auto& input_dims = input->Shape().GetDims();
   if (input_dims.size() != 3) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
diff --git a/onnxruntime/contrib_ops/cuda/bert/transformer_common.h b/onnxruntime/contrib_ops/cuda/bert/transformer_common.h
index 74c1b2522d..c2372137f6 100644
--- a/onnxruntime/contrib_ops/cuda/bert/transformer_common.h
+++ b/onnxruntime/contrib_ops/cuda/bert/transformer_common.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include "core/providers/cuda/cuda_common.h"
+
 namespace onnxruntime {
 namespace contrib {
 namespace cuda {
diff --git a/onnxruntime/contrib_ops/cuda/bert/transformer_cuda_common.h b/onnxruntime/contrib_ops/cuda/bert/transformer_cuda_common.h
new file mode 100644
index 0000000000..f5828c9970
--- /dev/null
+++ b/onnxruntime/contrib_ops/cuda/bert/transformer_cuda_common.h
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/cuda/cuda_common.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace cuda {
+
+// A wrapper around cudaEvent_t that destroys the event automatically to avoid leaking it.
+class AutoDestoryCudaEvent {
+ public:
+  AutoDestoryCudaEvent() : cuda_event_(nullptr) {
+  }
+
+  ~AutoDestoryCudaEvent() {
+    if (cuda_event_ != nullptr)
+      cudaEventDestroy(cuda_event_);
+  }
+
+  cudaEvent_t& Get() {
+    return cuda_event_;
+  }
+
+ private:
+  cudaEvent_t cuda_event_;
+};
+
+}  // namespace cuda
+}  // namespace contrib
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
index d29ef3e043..19b226858a 100644
--- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@@ -49,6 +49,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DecoderAttention);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DecoderAttention);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization);
@@ -139,6 +141,8 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Crop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Crop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Crop)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, DecoderAttention)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DecoderAttention)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, int32_t, DynamicSlice)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, int64_t, DynamicSlice)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
diff --git a/onnxruntime/contrib_ops/cuda/inverse.cc b/onnxruntime/contrib_ops/cuda/inverse.cc
index b0560c874a..4f4da3228e 100644
--- a/onnxruntime/contrib_ops/cuda/inverse.cc
+++ b/onnxruntime/contrib_ops/cuda/inverse.cc
@@ -150,7 +150,7 @@ Status Inverse::ComputeInternal(OpKernelContext* ctx) const {
   }
 
   IAllocatorUniquePtr<int> info = GetScratchBuffer<int>(num_batches);
-  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches, Stream()));
+  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(info.get(), 0, num_batches * sizeof(int), Stream()));
   IAllocatorUniquePtr<int> pivots = GetScratchBuffer<int>(rows * num_batches);
 
   utils::MLTypeCallDispatcher<float, double, MLFloat16> t_disp(input->GetElementType());
diff --git a/onnxruntime/contrib_ops/cuda/layer_norm.cc b/onnxruntime/contrib_ops/cuda/layer_norm.cc
index 50cb1c735f..3095ebf437 100644
--- a/onnxruntime/contrib_ops/cuda/layer_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/layer_norm.cc
@@ -61,6 +61,12 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
   auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaT*>(bias->template Data<T>());
 
   const TensorShape& x_shape = X->Shape();
+  // Sometimes, due to a conversion issue, the input 'X' has no data, which is a case the CUDA kernel cannot handle.
+  // Provide a more informative error here instead of a CUDA error.
+  if (X->SizeInBytes() == 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'X' has no data from upstream nodes");
+  }
+
   const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
 
   int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
diff --git a/onnxruntime/core/framework/data_types.cc b/onnxruntime/core/framework/data_types.cc
index 8038d819fe..dc66fba500 100644
--- a/onnxruntime/core/framework/data_types.cc
+++ b/onnxruntime/core/framework/data_types.cc
@@ -59,66 +59,13 @@ MLDataType DataTypeImpl::GetType<TensorSeq>() {
   return SequenceTensorTypeBase::Type();
 }
 
-//static bool IsTensorTypeScalar(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_type_proto) {
-//  int sz = tensor_type_proto.shape().dim_size();
-//  return sz == 0 || sz == 1;
-//}
+// static bool IsTensorTypeScalar(const ONNX_NAMESPACE::TypeProto_Tensor& tensor_type_proto) {
+//   int sz = tensor_type_proto.shape().dim_size();
+//   return sz == 0 || sz == 1;
+// }
 
 namespace data_types_internal {
 
-template <typename T>
-struct TensorElementTypeSetter<T> {
-  static void SetTensorElementType(ONNX_NAMESPACE::TypeProto& proto) {
-    proto.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType<T>());
-  }
-
-#if !defined(DISABLE_SPARSE_TENSORS)
-  static void SetSparseTensorElementType(ONNX_NAMESPACE::TypeProto& proto) {
-    proto.mutable_sparse_tensor_type()->set_elem_type(utils::ToTensorProtoElementType<T>());
-  }
-#endif
-
-#if !defined(DISABLE_ML_OPS)
-  static void SetMapKeyType(ONNX_NAMESPACE::TypeProto& proto) {
-    proto.mutable_map_type()->set_key_type(utils::ToTensorProtoElementType<T>());
-  }
-#endif
-
-  constexpr static int32_t GetElementType() {
-    return utils::ToTensorProtoElementType<T>();
-  }
-};
-
-// Pre-instantiate
-template struct
-    TensorElementTypeSetter<float>;
-template struct
-    TensorElementTypeSetter<uint8_t>;
-template struct
-    TensorElementTypeSetter<int8_t>;
-template struct
-    TensorElementTypeSetter<uint16_t>;
-template struct
-    TensorElementTypeSetter<int16_t>;
-template struct
-    TensorElementTypeSetter<int32_t>;
-template struct
-    TensorElementTypeSetter<int64_t>;
-template struct
-    TensorElementTypeSetter<std::string>;
-template struct
-    TensorElementTypeSetter<bool>;
-template struct
-    TensorElementTypeSetter<MLFloat16>;
-template struct
-    TensorElementTypeSetter<double>;
-template struct
-    TensorElementTypeSetter<uint32_t>;
-template struct
-    TensorElementTypeSetter<uint64_t>;
-template struct
-    TensorElementTypeSetter<BFloat16>;
-
 #if !defined(DISABLE_ML_OPS)
 void CopyMutableMapValue(const ONNX_NAMESPACE::TypeProto& value_proto,
                          ONNX_NAMESPACE::TypeProto& map_proto) {
@@ -368,15 +315,13 @@ const ONNX_NAMESPACE::TypeProto* TensorTypeBase::GetTypeProto() const {
   return impl_->GetProto();
 }
 
-TensorTypeBase::TensorTypeBase() : impl_(new Impl()) {}
+TensorTypeBase::TensorTypeBase()
+    : DataTypeImpl{DataTypeImpl::GeneralType::kTensor, sizeof(Tensor)},
+      impl_(new Impl()) {}
 TensorTypeBase::~TensorTypeBase() {
   delete impl_;
 }
 
-size_t TensorTypeBase::Size() const {
-  return sizeof(Tensor);
-}
-
 template <typename T>
 static void Delete(void* p) {
   delete static_cast<T*>(p);
@@ -417,7 +362,10 @@ MLDataType TensorTypeBase::Type() {
 struct SparseTensorTypeBase::Impl : public data_types_internal::TypeProtoImpl {
 };
 
-SparseTensorTypeBase::SparseTensorTypeBase() : impl_(new Impl()) {}
+SparseTensorTypeBase::SparseTensorTypeBase()
+    : DataTypeImpl{DataTypeImpl::GeneralType::kSparseTensor, sizeof(SparseTensor)},
+      impl_(new Impl()) {}
+
 SparseTensorTypeBase::~SparseTensorTypeBase() {
   delete impl_;
 }
@@ -437,10 +385,6 @@ bool SparseTensorTypeBase::IsCompatible(const ONNX_NAMESPACE::TypeProto& type_pr
   return data_types_internal::IsCompatible(thisProto->sparse_tensor_type(), type_proto.sparse_tensor_type());
 }
 
-size_t SparseTensorTypeBase::Size() const {
-  return sizeof(SparseTensor);
-}
-
 DeleteFunc SparseTensorTypeBase::GetDeleteFunc() const {
   return &Delete<SparseTensor>;
 }
@@ -464,7 +408,9 @@ MLDataType SparseTensorTypeBase::Type() {
 struct SequenceTensorTypeBase::Impl : public data_types_internal::TypeProtoImpl {
 };
 
-SequenceTensorTypeBase::SequenceTensorTypeBase() : impl_(new Impl()) {}
+SequenceTensorTypeBase::SequenceTensorTypeBase()
+    : DataTypeImpl{DataTypeImpl::GeneralType::kTensorSequence, sizeof(TensorSeq)},
+      impl_(new Impl()) {}
 
 SequenceTensorTypeBase::~SequenceTensorTypeBase() {
   delete impl_;
@@ -489,10 +435,6 @@ bool SequenceTensorTypeBase::IsCompatible(const ONNX_NAMESPACE::TypeProto& type_
   return data_types_internal::IsCompatible(thisProto->sequence_type(), type_proto.sequence_type());
 }
 
-size_t SequenceTensorTypeBase::Size() const {
-  return sizeof(TensorSeq);
-}
-
 DeleteFunc SequenceTensorTypeBase::GetDeleteFunc() const {
   return &Delete<TensorSeq>;
 }
@@ -516,7 +458,8 @@ MLDataType SequenceTensorTypeBase::Type() {
 struct OptionalTypeBase::Impl : public data_types_internal::TypeProtoImpl {
 };
 
-OptionalTypeBase::OptionalTypeBase() : impl_(new Impl()) {}
+OptionalTypeBase::OptionalTypeBase() : DataTypeImpl{DataTypeImpl::GeneralType::kOptional, 0},
+                                       impl_(new Impl()) {}
 
 OptionalTypeBase::~OptionalTypeBase() {
   delete impl_;
@@ -557,7 +500,8 @@ MLDataType OptionalTypeBase::Type() {
 struct DisabledTypeBase::Impl : public data_types_internal::TypeProtoImpl {
 };
 
-DisabledTypeBase::DisabledTypeBase() : impl_(new Impl()) {}
+DisabledTypeBase::DisabledTypeBase(DataTypeImpl::GeneralType type, size_t size)
+    : DataTypeImpl{type, size}, impl_(new Impl()) {}
 
 DisabledTypeBase::~DisabledTypeBase() {
   delete impl_;
@@ -572,7 +516,7 @@ ONNX_NAMESPACE::TypeProto& DisabledTypeBase::MutableTypeProto() {
 }
 
 MLDataType DisabledTypeBase::Type() {
-  static DisabledTypeBase disabled_base;
+  static DisabledTypeBase disabled_base{GeneralType::kInvalid, 0};
   return &disabled_base;
 }
 #endif
@@ -580,7 +524,9 @@ MLDataType DisabledTypeBase::Type() {
 /// NoTensorTypeBase
 struct NonTensorTypeBase::Impl : public data_types_internal::TypeProtoImpl {};
 
-NonTensorTypeBase::NonTensorTypeBase() : impl_(new Impl()) {
+NonTensorTypeBase::NonTensorTypeBase(size_t size)
+    : DataTypeImpl{DataTypeImpl::GeneralType::kNonTensor, size},
+      impl_(new Impl()) {
 }
 
 NonTensorTypeBase::~NonTensorTypeBase() {
@@ -1037,8 +983,8 @@ MLDataType DataTypeImpl::TypeFromProto(const ONNX_NAMESPACE::TypeProto& proto) {
   return type;
 }
 
-//Below are the types the we need to execute the runtime
-//They are not compatible with TypeProto in ONNX.
+// Below are the types that we need to execute the runtime.
+// They are not compatible with TypeProto in ONNX.
 ORT_REGISTER_PRIM_TYPE(int32_t);
 ORT_REGISTER_PRIM_TYPE(float);
 ORT_REGISTER_PRIM_TYPE(bool);
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index a70e8c0e39..2ab88c0314 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -274,6 +274,8 @@ void IExecutionFrame::Init(const std::vector<int>& feed_mlvalue_idxs, const std:
                                                                 cpu_allocator, allocator, has_linear_coo_index,
                                                                 *dest.GetMutable<SparseTensor>()));
       } else {
+#else
+        ORT_UNUSED_PARAMETER(is_initializer_sparse_func);
 #endif  //  !defined(DISABLE_SPARSE_TENSORS)
         if (!dest.IsAllocated()) {
           // NOTE: This doesn't need to support ExecutionFrame custom allocators as they only come into play
diff --git a/onnxruntime/core/framework/execution_providers.h b/onnxruntime/core/framework/execution_providers.h
index 0fc59f9ab6..7ef12137b7 100644
--- a/onnxruntime/core/framework/execution_providers.h
+++ b/onnxruntime/core/framework/execution_providers.h
@@ -39,7 +39,7 @@ class ExecutionProviders {
     exec_provider_options_[provider_id] = p_exec_provider->GetProviderOptions();
 
     exec_provider_ids_.push_back(provider_id);
-    exec_providers_.push_back(std::move(p_exec_provider));
+    exec_providers_.push_back(p_exec_provider);
     return Status::OK();
   }
 
diff --git a/onnxruntime/core/framework/graph_partitioner.cc b/onnxruntime/core/framework/graph_partitioner.cc
index bd06510482..93ccc4b435 100644
--- a/onnxruntime/core/framework/graph_partitioner.cc
+++ b/onnxruntime/core/framework/graph_partitioner.cc
@@ -359,8 +359,6 @@ static Status InlineNodes(Graph& graph, bool& modified_graph) {
     modified_graph = true;
   }
 
-  ORT_RETURN_IF_ERROR(graph.Resolve());
-
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 510af3097a..374c044209 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -204,6 +204,8 @@ Status SessionState::AddInitializedTensor(int ort_value_index, const OrtValue& o
   if (sparse) {
     sparse_initialized_tensors_.insert(ort_value_index);
   }
+#else
+  ORT_UNUSED_PARAMETER(sparse);
 #endif
 
   return Status::OK();
diff --git a/onnxruntime/core/framework/tensor_shape.cc b/onnxruntime/core/framework/tensor_shape.cc
index 929262735c..152ca6e1e1 100644
--- a/onnxruntime/core/framework/tensor_shape.cc
+++ b/onnxruntime/core/framework/tensor_shape.cc
@@ -14,11 +14,6 @@ TensorShape::TensorShape(gsl::span<const int64_t> dims) {
   gsl::copy(dims, values_);
 }
 
-TensorShape::TensorShape(const std::initializer_list<int64_t>& dims) {
-  Allocate(dims.size());
-  std::copy(dims.begin(), dims.end(), values_.begin());
-}
-
 TensorShape& TensorShape::operator=(const TensorShape& other) {
   if (&other==this)
     return *this;
@@ -28,7 +23,7 @@ TensorShape& TensorShape::operator=(const TensorShape& other) {
   return *this;
 }
 
-TensorShape& TensorShape::operator=(TensorShape&& other) {
+TensorShape& TensorShape::operator=(TensorShape&& other) noexcept {
   if (&other==this)
     return *this;
 
diff --git a/onnxruntime/core/framework/tensor_type_and_shape.cc b/onnxruntime/core/framework/tensor_type_and_shape.cc
index a0098b6a17..b4ec9cea18 100644
--- a/onnxruntime/core/framework/tensor_type_and_shape.cc
+++ b/onnxruntime/core/framework/tensor_type_and_shape.cc
@@ -237,6 +237,8 @@ ORT_API_STATUS_IMPL(OrtApis::GetSparseTensorValuesTypeAndShape, _In_ const OrtVa
   const auto& values = sparse_tensor.Values();
   return GetTensorShapeAndType(values.Shape(), *values.DataType(), out);
 #else
+  ORT_UNUSED_PARAMETER(v);
+  ORT_UNUSED_PARAMETER(out);
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -275,6 +277,9 @@ ORT_API_STATUS_IMPL(OrtApis::GetSparseTensorIndicesTypeShape, _In_ const OrtValu
   const Tensor& indices_tensor = GetIndicesTensor(*v, indices_format);
   return GetTensorShapeAndType(indices_tensor.Shape(), *indices_tensor.DataType(), out);
 #else
+  ORT_UNUSED_PARAMETER(v);
+  ORT_UNUSED_PARAMETER(indices_format);
+  ORT_UNUSED_PARAMETER(out);
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -289,6 +294,10 @@ ORT_API_STATUS_IMPL(OrtApis::GetSparseTensorIndices, _In_ const OrtValue* v,
   *indices = indices_tensor.DataRaw();
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(v);
+  ORT_UNUSED_PARAMETER(indices_format);
+  ORT_UNUSED_PARAMETER(num_indices);
+  ORT_UNUSED_PARAMETER(indices);
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index eddaa41b15..ca7416fce0 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -829,6 +829,8 @@ common::Status ConstantNodeProtoToTensorProto(const ONNX_NAMESPACE::NodeProto& n
       ORT_RETURN_IF_ERROR(SparseTensorProtoToDenseTensorProto(s, model_path, tensor));
       break;
     }
+#else
+  ORT_UNUSED_PARAMETER(model_path);
 #endif
     default:
       ORT_THROW("Unsupported attribute value type of ", constant_attribute.type(),
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index c9f6d315a1..74d3b1d74d 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -531,6 +531,43 @@ void AttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx, int p
   }
 }
 
+// Type/shape inference for DecoderAttention: declared outputs take input 0's element type, output 0
+// takes its shape, and cache outputs take input 6's shape with dim 2 (sequence length) unknown.
+void DecoderAttentionTypeAndShapeInference(ONNX_NAMESPACE::InferenceContext& ctx) {
+  // Outputs 1 and 2 are optional: clamp to what the node declares so that a node with only two
+  // outputs is never asked for the type or shape of a non-existent output 2.
+  const size_t declared_outputs = ctx.getNumOutputs();
+  const size_t num_outputs = declared_outputs < 3 ? declared_outputs : 3;
+  // Type inference
+  ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
+  for (size_t i = 1; i < num_outputs; ++i) {
+    ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, i);
+  }
+  // Shape inference
+  if (hasInputShape(ctx, 0)) {
+    updateOutputShape(ctx, 0, getInputShape(ctx, 0));
+  }
+  if (num_outputs > 1 && hasInputShape(ctx, 6) && hasInputShape(ctx, 7)) {
+    const auto& cache_shape = getInputShape(ctx, 6);
+    const auto& cache_dims = cache_shape.dim();
+    if (cache_dims.size() != 4) {
+      fail_shape_inference("key and value cache shall be 4 dimensions");
+    }
+    if (!cache_dims[0].has_dim_value() || !cache_dims[1].has_dim_value() ||
+        !cache_dims[2].has_dim_value() || !cache_dims[3].has_dim_value()) {
+      fail_shape_inference("key and value cache dimensions value shall not be null");
+    }
+    ONNX_NAMESPACE::TensorShapeProto new_cache_shape;
+    *new_cache_shape.add_dim() = cache_shape.dim(0);
+    *new_cache_shape.add_dim() = cache_shape.dim(1);
+    new_cache_shape.add_dim();  // no value set: the new sequence length is not known here
+    *new_cache_shape.add_dim() = cache_shape.dim(3);
+    for (size_t i = 1; i < num_outputs; ++i) {
+      updateOutputShape(ctx, i, new_cache_shape);
+    }
+  }
+}
+
 void RegisterBertSchemas() {
   static const char* Attention_ver1_doc = R"DOC(
 Multi-Head Self Attention that can be either unidirectional (like GPT-2) or bidirectional (like BERT).
@@ -684,6 +721,38 @@ Global attention flags have value 1 for the tokens attend globally and 0 otherwi
       .TypeConstraint("G", {"tensor(int32)"}, "Constrain to integer types")
       .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
 
+  static const char* Decoder_Attention_doc = R"DOC(
+This DecoderAttention supports self attention and cross attention, key and value cache, and key_padding_mask. The attention mask is not support at the moment.
+Some boolean parameters are passed by runtime input for generic purpose
+)DOC";
+
+  ONNX_CONTRIB_OPERATOR_SCHEMA(DecoderAttention)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .SetDoc(Decoder_Attention_doc)
+      .Attr("num_heads", "Number of attention heads", AttributeProto::INT)
+      .Input(0, "query", "3D input tensor with shape (sequence_length, batch_size, hidden_size), hidden_size = num_heads * head_size", "T")
+      .Input(1, "key", "3D input tensor with shape (total_sequence_length, batch_size, hidden_size)", "T")
+      .Input(2, "q_weight", "2D input tensor with shape (hidden_size, hidden_size)", "T")
+      .Input(3, "kv_weight", "2D input tensor with shape (hidden_size, 2 * hidden_size)", "T")
+      .Input(4, "bias", "1D input tensor with shape (3 * hidden_size)", "T")
+      .Input(5, "key_padding_mask", "2D input tensor with shape (batch_size, total_sequence_length)", "B", OpSchema::Optional)
+      .Input(6, "key_cache", "input tensor with shape (batch_size, num_heads, sequence_length or total_sequence_length, head_size)", "T", OpSchema::Optional)   // self & cross
+      .Input(7, "value_cache", "input tensor with shape (batch_size, num_heads, sequence_length or total_sequence_length, head_size)", "T", OpSchema::Optional)   // self & cross
+      .Input(8, "static_kv", "If static_kv = true, cross-attention; else self-attention", "B")
+      .Input(9, "use_past", "If use_past = true, use cache; else no cache", "B")
+      .Input(10, "has_layer_state", "If has_layer_state = true, layer_state = {} or [a,b]; else layer_state = None", "B")
+      .Input(11, "has_key_padding_mask", "has_key_padding_mask or not", "B")
+      .Output(0, "output", "3D output tensor with shape (sequence_length, batch_size, hidden_size)", "T")
+      .Output(1, "new_key_cache", "output tensor with shape (batch_size, num_heads, new sequence_length, head_size)", "T", OpSchema::Optional) // self & cross
+      .Output(2, "new_value_cache", "output tensor with shape (batch_size, num_heads, new sequence_length, head_size)", "T", OpSchema::Optional) // self & cross
+      .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float and float16 tensors.")
+      .TypeConstraint("B", {"tensor(bool)"}, "Constrain key_padding_mask to bool tensors.")
+      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
+        DecoderAttentionTypeAndShapeInference(ctx);
+      });
+
+
   static const char* EmbedLayerNormalization_ver1_doc = R"DOC(
 EmbedLayerNormalization is the fusion of embedding layer in BERT model, with optional mask processing.
 The embedding layer takes input_ids (word IDs) and segment_ids (sentence IDs) to look up word_embedding, position_embedding,
diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc
index 011ec52e7e..03acfec47f 100644
--- a/onnxruntime/core/graph/graph.cc
+++ b/onnxruntime/core/graph/graph.cc
@@ -2517,7 +2517,8 @@ Status Graph::VerifyNodeAndOpMatch(const ResolveOptions& options) {
         }
         ORT_CATCH(const std::exception& ex) {
           ORT_HANDLE_EXCEPTION([&]() {
-            status = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH, "This is an invalid model. Error in Node:", node_name, " : ", ex.what());
+            status = ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_GRAPH,
+                                     "This is an invalid model. In Node, ", node, ", Error ", ex.what());
           });
         }
         ORT_RETURN_IF_ERROR(status);
@@ -4128,32 +4129,72 @@ Graph::~Graph() {
 }
 
 #if !defined(ORT_MINIMAL_BUILD)
+std::ostream& operator<<(std::ostream& out, const NodeArg& node_arg) {
+  out << "\"" << node_arg.Name() << "\"";  // the name is always written, in quotes
+  if (const auto* arg_type = node_arg.Type()) {
+    out << ": " << *arg_type;  // the type is appended only when it is known
+  }
+  return out;
+}
+
+// Writes a one-line description of `node` to `out` in the form
+//   ("name", OpType, "domain", since_version) : (inputs) -> (outputs)
+// where every input/output is streamed as a NodeArg and followed by a comma
+// (including the last one).
+std::ostream& operator<<(std::ostream& out, const Node& node) {
+  // Writes every arg of `defs` followed by a comma. An arg that does not
+  // exist (a missing or optional input/output) is written as "" instead of
+  // being skipped, because operator schemas in ONNX use positional
+  // arguments.
+  const auto write_defs = [&out](const auto& defs) {
+    for (const auto* def : defs) {
+      if (!def->Exists()) {
+        out << "\"\""
+            << ",";
+        continue;
+      }
+      out << *def << ",";
+    }
+  };
+
+  out << "(\"" << node.Name() << "\""
+      << ", "
+      << node.OpType()
+      << ", ";
+  // Quote the domain so that the default ONNX domain is shown as ""
+  // rather than as a misleading empty string.
+  out << "\"" << node.Domain() << "\""
+      << ", "
+      << node.SinceVersion()
+      << ") : (";
+  write_defs(node.InputDefs());
+  out << ") -> (";
+  write_defs(node.OutputDefs());
+  out << ") ";
+
+  return out;
+}
+
 std::ostream& operator<<(std::ostream& out, const Graph& graph) {
   out << "Inputs:\n";
-  for (auto* x : graph.GetInputs()) {
-    out << "   " << x->Name() << " : " << *x->Type() << "\n";
+  for (const auto* x : graph.GetInputs()) {
+    // Unlike missing operator inputs and outputs, which are printed as "",
+    // missing graph inputs are skipped because printing them is not helpful
+    // (a graph has no fixed schema to match arguments against).
+    if (x) {
+      out << "   " << *x << "\n";
+    }
   }
   out << "Nodes:\n";
-  for (auto& node : graph.Nodes()) {
-    out << "   " << node.Name() << ": " << node.OpType() << " (";
-    for (auto* x : node.InputDefs()) {
-      if (x->Exists()) {
-        out << x->Name() << ": " << *x->Type();
-      }
-      out << ", ";
-    }
-    out << ") -> ";
-    for (auto* x : node.OutputDefs()) {
-      if (x->Exists()) {
-        out << x->Name() << ": " << *x->Type();
-      }
-      out << ", ";
-    }
-    out << "\n";
+  for (const auto& node : graph.Nodes()) {
+    out << "   " << node << "\n";
   }
   out << "Outputs:\n";
-  for (auto* x : graph.GetOutputs()) {
-    out << "   " << x->Name() << " : " << *x->Type() << "\n";
+  for (const auto* x : graph.GetOutputs()) {
+    // Similar to graph input, missing graph output is not printed.
+    if (x) {
+      out << "   " << *x << "\n";
+    }
   }
   return out;
 }
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 01f22c25b4..d3a5619c5c 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -786,11 +786,12 @@ MlasPool(
     MLAS_THREADPOOL* ThreadPool
     );
 
+template<typename T8Bits>
 void
 MLASCALL
 MlasMaximumPool(
-    const uint8_t* const* Input,
-    uint8_t* Output,
+    const T8Bits* const* Input,
+    T8Bits* Output,
     size_t Channels,
     size_t OutputCount,
     size_t KernelSize
@@ -1041,17 +1042,18 @@ MlasQuantizeLinear(
  * @param CountN
  * @return
 */
+template<typename OutputType>
 void
 MLASCALL
 MlasRequantizeOutput(
     const int32_t* Input,
     size_t InputLeadingDimension,
-    uint8_t* Output,
+    OutputType* Output,
     size_t OutputLeadingDimension,
     const int32_t* Bias,
     const float* Scale,
     bool PerColumnScale,
-    uint8_t ZeroPoint,
+    OutputType ZeroPoint,
     size_t StartM,
     size_t StartN,
     size_t CountM,
@@ -1115,13 +1117,14 @@ MlasQLinearSafePaddingElementCount(
     size_t ElementCount
     );
 
+template<typename T8Bits>
 void
 MLASCALL
 MlasQLinearGlobalAveragePoolNchw(
-    const uint8_t* Input,
+    const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
-    uint8_t* Output,
+    T8Bits* Output,
     float ScaleOutput,
     int32_t ZeroPointOutput,
     size_t Channels,
@@ -1129,13 +1132,14 @@ MlasQLinearGlobalAveragePoolNchw(
     int32_t* AccumulateBuffer
     );
 
+template <typename T8Bits>
 void
 MLASCALL
 MlasQLinearGlobalAveragePoolNhwc(
-    const uint8_t* Input,
+    const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
-    uint8_t* Output,
+    T8Bits* Output,
     float ScaleOutput,
     int32_t ZeroPointOutput,
     size_t Batch,
@@ -1143,7 +1147,7 @@ MlasQLinearGlobalAveragePoolNhwc(
     size_t Stride,
     size_t Channels,
     int32_t* AccumulateBuffer,
-    const uint8_t* ZeroBuffer
+    const T8Bits* ZeroBuffer
     );
 
 //
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 82b1f5c978..e5ec448618 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -499,6 +499,7 @@ extern "C" {
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernel;
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelPOWER10;
     MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernel;
+    MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelPOWER10;
 #else
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelZero;
     MLAS_GEMM_FLOAT_KERNEL MlasSgemmKernelAdd;
@@ -1886,7 +1887,7 @@ MlasStoreAlignedFloat64x2(double* Buffer, MLAS_FLOAT64X2 Vector)
 #if defined(MLAS_SSE2_INTRINSICS)
     _mm_store_pd(Buffer, Vector);
 #elif defined(MLAS_VSX_INTRINSICS)
-    vec_st(Vector, 0, Buffer);
+    *((MLAS_FLOAT64X2*)Buffer) = Vector;
 #endif
 }
 
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 5c92ce915f..de7fee8c07 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -379,6 +379,7 @@ Return Value:
     bool HasP10Instructions = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
     if (HasP10Instructions) {
         this->GemmFloatKernel = MlasSgemmKernelPOWER10;
+        this->GemmDoubleKernel = MlasDgemmKernelPOWER10;
     }
 #endif
 #endif
diff --git a/onnxruntime/core/mlas/lib/pooling.cpp b/onnxruntime/core/mlas/lib/pooling.cpp
index 0223da1ee6..649e137182 100644
--- a/onnxruntime/core/mlas/lib/pooling.cpp
+++ b/onnxruntime/core/mlas/lib/pooling.cpp
@@ -1304,11 +1304,12 @@ Return Value:
 #endif
 }
 
+template<typename T8Bits>
 void
 MLASCALL
 MlasMaximumPool(
-    const uint8_t* const* Input,
-    uint8_t* Output,
+    const T8Bits* const* Input,
+    T8Bits* Output,
     size_t Channels,
     size_t OutputCount,
     size_t KernelSize
@@ -1352,6 +1353,10 @@ Return Value:
         size_t c = Channels;
 
 #if defined(MLAS_SSE2_INTRINSICS)
+        const __m128i BitFlipVector = _mm_set1_epi32(0x80808080);
+        if constexpr (std::is_unsigned<T8Bits>::value) {
+            MLAS_UNREFERENCED_PARAMETER(BitFlipVector);
+        }
 
         while (c >= 32) {
 
@@ -1363,10 +1368,20 @@ Return Value:
                 __m128i InputVector0 = _mm_loadu_si128((const __m128i*)&Input[k][ChannelOffset]);
                 __m128i InputVector1 = _mm_loadu_si128((const __m128i*)&Input[k][ChannelOffset + 16]);
 
+                if constexpr (std::is_signed<T8Bits>::value) {
+                    InputVector0 = _mm_xor_si128(InputVector0, BitFlipVector);
+                    InputVector1 = _mm_xor_si128(InputVector1, BitFlipVector);
+                }
+
                 MaximumVector0 = _mm_max_epu8(MaximumVector0, InputVector0);
                 MaximumVector1 = _mm_max_epu8(MaximumVector1, InputVector1);
             }
 
+            if constexpr (std::is_signed<T8Bits>::value) {
+                MaximumVector0 = _mm_xor_si128(MaximumVector0, BitFlipVector);
+                MaximumVector1 = _mm_xor_si128(MaximumVector1, BitFlipVector);
+            }
+
             _mm_storeu_si128((__m128i*)&Output[0], MaximumVector0);
             _mm_storeu_si128((__m128i*)&Output[16], MaximumVector1);
             Output += 32;
@@ -1383,9 +1398,17 @@ Return Value:
 
                 __m128i InputVector0 = _mm_loadu_si128((const __m128i*)&Input[k][ChannelOffset]);
 
+                if constexpr (std::is_signed<T8Bits>::value){
+                    InputVector0 = _mm_xor_si128(InputVector0, BitFlipVector);
+                }
+
                 MaximumVector0 = _mm_max_epu8(MaximumVector0, InputVector0);
             }
 
+            if constexpr (std::is_signed<T8Bits>::value) {
+                MaximumVector0 = _mm_xor_si128(MaximumVector0, BitFlipVector);
+            }
+
             _mm_storeu_si128((__m128i*)&Output[0], MaximumVector0);
             Output += 16;
 
@@ -1401,9 +1424,17 @@ Return Value:
 
                 __m128i InputVector0 = _mm_loadl_epi64((const __m128i*)&Input[k][ChannelOffset]);
 
+                if constexpr (std::is_signed<T8Bits>::value){
+                    InputVector0 = _mm_xor_si128(InputVector0, BitFlipVector);
+                }
+
                 MaximumVector0 = _mm_max_epu8(MaximumVector0, InputVector0);
             }
 
+            if constexpr (std::is_signed<T8Bits>::value) {
+                MaximumVector0 = _mm_xor_si128(MaximumVector0, BitFlipVector);
+            }
+
             _mm_storel_epi64((__m128i*)&Output[0], MaximumVector0);
             Output += 8;
 
@@ -1415,20 +1446,40 @@ Return Value:
 
         while (c >= 32) {
 
-            uint8x16_t MaximumVector0 = vdupq_n_u8(0);
-            uint8x16_t MaximumVector1 = vdupq_n_u8(0);
+            if constexpr (std::is_signed<T8Bits>::value){
 
-            for (size_t k = 0; k < KernelSize; k++) {
+                int8x16_t MaximumVector0 = vdupq_n_s8(-128);
+                int8x16_t MaximumVector1 = vdupq_n_s8(-128);
 
-                uint8x16_t InputVector0 = vld1q_u8(&Input[k][ChannelOffset]);
-                uint8x16_t InputVector1 = vld1q_u8(&Input[k][ChannelOffset + 16]);
+                for (size_t k = 0; k < KernelSize; k++) {
 
-                MaximumVector0 = vmaxq_u8(MaximumVector0, InputVector0);
-                MaximumVector1 = vmaxq_u8(MaximumVector1, InputVector1);
+                    int8x16_t InputVector0 = vld1q_s8(&Input[k][ChannelOffset]);
+                    int8x16_t InputVector1 = vld1q_s8(&Input[k][ChannelOffset + 16]);
+
+                    MaximumVector0 = vmaxq_s8(MaximumVector0, InputVector0);
+                    MaximumVector1 = vmaxq_s8(MaximumVector1, InputVector1);
+                }
+
+                vst1q_s8(&Output[0], MaximumVector0);
+                vst1q_s8(&Output[16], MaximumVector1);
+            } else {
+
+                uint8x16_t MaximumVector0 = vdupq_n_u8(0);
+                uint8x16_t MaximumVector1 = vdupq_n_u8(0);
+
+                for (size_t k = 0; k < KernelSize; k++) {
+
+                    uint8x16_t InputVector0 = vld1q_u8(&Input[k][ChannelOffset]);
+                    uint8x16_t InputVector1 = vld1q_u8(&Input[k][ChannelOffset + 16]);
+
+                    MaximumVector0 = vmaxq_u8(MaximumVector0, InputVector0);
+                    MaximumVector1 = vmaxq_u8(MaximumVector1, InputVector1);
+                }
+
+                vst1q_u8(&Output[0], MaximumVector0);
+                vst1q_u8(&Output[16], MaximumVector1);
             }
 
-            vst1q_u8(&Output[0], MaximumVector0);
-            vst1q_u8(&Output[16], MaximumVector1);
             Output += 32;
 
             ChannelOffset += 32;
@@ -1437,16 +1488,30 @@ Return Value:
 
         while (c >= 16) {
 
-            uint8x16_t MaximumVector0 = vdupq_n_u8(0);
+            if constexpr (std::is_signed<T8Bits>::value){
 
-            for (size_t k = 0; k < KernelSize; k++) {
+                int8x16_t MaximumVector0 = vdupq_n_s8(-128);
 
-                uint8x16_t InputVector0 = vld1q_u8(&Input[k][ChannelOffset]);
+                for (size_t k = 0; k < KernelSize; k++) {
 
-                MaximumVector0 = vmaxq_u8(MaximumVector0, InputVector0);
+                    int8x16_t InputVector0 = vld1q_s8(&Input[k][ChannelOffset]);
+                    MaximumVector0 = vmaxq_s8(MaximumVector0, InputVector0);
+                }
+
+                vst1q_s8(&Output[0], MaximumVector0);
+            } else {
+
+                uint8x16_t MaximumVector0 = vdupq_n_u8(0);
+
+                for (size_t k = 0; k < KernelSize; k++) {
+
+                    uint8x16_t InputVector0 = vld1q_u8(&Input[k][ChannelOffset]);
+                    MaximumVector0 = vmaxq_u8(MaximumVector0, InputVector0);
+                }
+
+                vst1q_u8(&Output[0], MaximumVector0);
             }
 
-            vst1q_u8(&Output[0], MaximumVector0);
             Output += 16;
 
             ChannelOffset += 16;
@@ -1455,16 +1520,29 @@ Return Value:
 
         if (c >= 8) {
 
-            uint8x8_t MaximumVector0 = vdup_n_u8(0);
+            if constexpr (std::is_signed<T8Bits>::value){
 
-            for (size_t k = 0; k < KernelSize; k++) {
+                int8x8_t MaximumVector0 = vdup_n_s8(-128);
 
-                uint8x8_t InputVector0 = vld1_u8(&Input[k][ChannelOffset]);
+                for (size_t k = 0; k < KernelSize; k++) {
 
-                MaximumVector0 = vmax_u8(MaximumVector0, InputVector0);
+                    int8x8_t InputVector0 = vld1_s8(&Input[k][ChannelOffset]);
+                    MaximumVector0 = vmax_s8(MaximumVector0, InputVector0);
+                }
+
+                vst1_s8(&Output[0], MaximumVector0);
+            } else {
+
+                uint8x8_t MaximumVector0 = vdup_n_u8(0);
+
+                for (size_t k = 0; k < KernelSize; k++) {
+
+                    uint8x8_t InputVector0 = vld1_u8(&Input[k][ChannelOffset]);
+                    MaximumVector0 = vmax_u8(MaximumVector0, InputVector0);
+                }
+                vst1_u8(&Output[0], MaximumVector0);
             }
 
-            vst1_u8(&Output[0], MaximumVector0);
             Output += 8;
 
             ChannelOffset += 8;
@@ -1475,13 +1553,13 @@ Return Value:
 
         while (c > 0) {
 
-            int32_t MaximumValue = 0;
+            int32_t MaximumValue = std::numeric_limits<T8Bits>::lowest();
 
             for (size_t k = 0; k < KernelSize; k++) {
                 MaximumValue = std::max(MaximumValue, int32_t(Input[k][ChannelOffset]));
             }
 
-            *Output++ = uint8_t(MaximumValue);
+            *Output++ = T8Bits(MaximumValue);
 
             ChannelOffset += 1;
             c -= 1;
@@ -1491,3 +1569,25 @@ Return Value:
         OutputCount -= 1;
     }
 }
+
+template
+void
+MLASCALL
+MlasMaximumPool<int8_t>(
+    const int8_t* const* Input,
+    int8_t* Output,
+    size_t Channels,
+    size_t OutputCount,
+    size_t KernelSize
+    );
+
+template
+void
+MLASCALL
+MlasMaximumPool<uint8_t>(
+    const uint8_t* const* Input,
+    uint8_t* Output,
+    size_t Channels,
+    size_t OutputCount,
+    size_t KernelSize
+    );
diff --git a/onnxruntime/core/mlas/lib/power/DgemmKernelPOWER10.cpp b/onnxruntime/core/mlas/lib/power/DgemmKernelPOWER10.cpp
new file mode 100644
index 0000000000..11638bc33f
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/power/DgemmKernelPOWER10.cpp
@@ -0,0 +1,418 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    DgemmKernelPOWER10.cpp
+
+Abstract:
+
+    This module implements the POWER10 MMA kernels for the double precision
+    matrix/matrix multiply operation (DGEMM).
+
+--*/
+
+#include "DgemmKernelpower.h"
+struct MlasDgemmBroadcastAElementsMMA  // MlasLoopUnroll step: copies one scalar of A per row into ARow.
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(
+        double ARow[RowCount],  // out: ARow[Row] receives the element for this row
+        const double* A,        // in: address of the element for row 0
+        size_t lda              // in: distance, in doubles, between consecutive rows of A
+        )
+    {
+        ARow[Row] = A [Row * lda];  // same column of A, `Row` rows further down
+    }
+};
+
+template<size_t RowCount>
+MLAS_FORCEINLINE
+void
+MlasDgemmComputeAElements(  // Regroups four per-row vectors of A into per-lane pairs of rows.
+    MLAS_FLOAT64X2 AElements[RowCount],  // in: one two-element vector per row; indices 0..3 are read
+    MLAS_FLOAT64X2 ABroadcast[RowCount]  // out: indices 0..3 are written
+    )
+{  // NOTE: indices 0..3 are hard-coded below, so RowCount must be at least 4 (the callers in this file use 4).
+    ABroadcast[0] = vec_mergee (AElements[0], AElements[1]);  // even lanes of rows 0 and 1
+    ABroadcast[1] = vec_mergee (AElements[2], AElements[3]);  // even lanes of rows 2 and 3
+    ABroadcast[2] = vec_mergeo (AElements[0], AElements[1]);  // odd lanes of rows 0 and 1
+    ABroadcast[3] = vec_mergeo (AElements[2], AElements[3]);  // odd lanes of rows 2 and 3
+}
+
+template<size_t RowCount>
+MLAS_FORCEINLINE
+void
+MlasDgemmComputeBlockMMA(  // Accumulates one K step of A times eight doubles of B into the MMA accumulators.
+    __vector_quad acc[8],                  // acc[0..3] are always updated; acc[4..7] only when CountM == 8
+    MLAS_FLOAT64X2 ABroadcast[RowCount],   // elements [0] and [1] form the A operand for acc[0..3]
+    MLAS_FLOAT64X2 A2Broadcast[RowCount],  // elements [0] and [1] form the A operand for acc[4..7]; read only when CountM == 8
+    const double* B,                       // eight consecutive doubles are loaded from this address
+    size_t CountM                          // 8 enables the second group of accumulators
+    )
+{
+    MLAS_FLOAT64X2 BElements[4];
+    typedef __vector unsigned char vec_t;
+    __vector_pair A2pair, Apair;  // A2pair is only assigned, and only used, when CountM == 8
+#if (defined(__GNUC__) && (__GNUC__ == 10 && __GNUC_MINOR__ <= 2))  // GCC 10.0 - 10.2: use the __builtin_mma_ spelling
+    __builtin_mma_assemble_pair (&Apair, (vec_t)ABroadcast[1], (vec_t)ABroadcast[0]);
+    if (CountM == 8)  {
+      __builtin_mma_assemble_pair (&A2pair, (vec_t)A2Broadcast[1], (vec_t)A2Broadcast[0]);
+    }
+#elif (defined(__GNUC__) && (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))  // GCC 11.0 - 11.2: reinterpret the two vectors in place as a pair
+    Apair = *((__vector_pair *)((void *)&ABroadcast[0]));
+    if (CountM == 8)  {
+      A2pair = *((__vector_pair *)((void *)&A2Broadcast[0]));
+    }
+#else  // any other compiler/version: use the __builtin_vsx_ spelling
+    __builtin_vsx_assemble_pair (&Apair, (vec_t)ABroadcast[1], (vec_t)ABroadcast[0]);
+    if (CountM == 8)  {
+      __builtin_vsx_assemble_pair (&A2pair, (vec_t)A2Broadcast[1], (vec_t)A2Broadcast[0]);
+    }
+#endif
+    BElements[0] = MlasLoadFloat64x2(B);      // doubles 0-1
+    BElements[1] = MlasLoadFloat64x2(B + 2);  // doubles 2-3
+    BElements[2] = MlasLoadFloat64x2(B + 4);  // doubles 4-5
+    BElements[3] = MlasLoadFloat64x2(B + 6);  // doubles 6-7
+   __builtin_mma_xvf64gerpp (&acc[0], Apair, (vec_t)BElements[0]);  // one accumulator per vector of B
+   __builtin_mma_xvf64gerpp (&acc[1], Apair, (vec_t)BElements[1]);
+   __builtin_mma_xvf64gerpp (&acc[2], Apair, (vec_t)BElements[2]);
+   __builtin_mma_xvf64gerpp (&acc[3], Apair, (vec_t)BElements[3]);
+   if (CountM == 8) {  // second group of accumulators, fed from A2pair
+       __builtin_mma_xvf64gerpp (&acc[4], A2pair, (vec_t)BElements[0]);
+       __builtin_mma_xvf64gerpp (&acc[5], A2pair, (vec_t)BElements[1]);
+       __builtin_mma_xvf64gerpp (&acc[6], A2pair, (vec_t)BElements[2]);
+       __builtin_mma_xvf64gerpp (&acc[7], A2pair, (vec_t)BElements[3]);
+   }
+}
+template<size_t VectorCount>  // NOTE: despite its name, VectorCount is used below as a column offset (in doubles) within the row.
+struct MlasDgemmStoreVectorMMA
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(  // MlasLoopUnroll step: scales Result[Row] by alpha and stores or accumulates it into row `Row` of C.
+        MLAS_FLOAT64X2 Result[4],
+        double* C,
+        size_t ldc,
+        MLAS_FLOAT64X2 AlphaBroadcast,
+        bool ZeroMode
+        )
+    {
+        MLAS_FLOAT64X2 *rowC;
+        if (ZeroMode) {
+            rowC = (MLAS_FLOAT64X2 *) &C[Row * ldc + VectorCount];
+            rowC[0] = Result[Row] * AlphaBroadcast;  // overwrite C with the scaled result
+        } else {
+            rowC = (MLAS_FLOAT64X2 *) &C[Row * ldc + VectorCount];  // same address as in the ZeroMode branch
+            rowC[0] += Result[Row] * AlphaBroadcast;  // add the scaled result to the existing contents of C
+        }
+    }
+};
+
+struct MlasDgemmMultiplyAlphaTrailingMMA  // MlasLoopUnroll step: scales one accumulator vector by alpha in place.
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(
+        MLAS_FLOAT64X2 Accumulators[RowCount],  // in/out: only element [Row] is touched
+        MLAS_FLOAT64X2 AlphaBroadcast
+        )
+    {
+        Accumulators[Row] = MlasMultiplyFloat64x2(Accumulators[Row], AlphaBroadcast);
+    }
+};
+template<unsigned Lane>  // Lane selects both the vector element that is read and the column offset that is written.
+struct MlasDgemmStoreScalarMMA
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(  // MlasLoopUnroll step: writes a single double for row `Row` of C.
+        MLAS_FLOAT64X2 Accumulators[RowCount],
+        double* C,
+        size_t ldc,
+        bool ZeroMode
+        )
+    {
+        double* c = C + Row * ldc + Lane;
+        double Value = Accumulators[Row][Lane];  // no alpha scaling is applied here
+        if (!ZeroMode) {
+            Value += *c;  // accumulate into the existing contents of C
+        }
+
+        *c = Value;
+    }
+};
+
+template<size_t RowCount>
+MLAS_FORCEINLINE
+size_t
+MlasDgemmMMAProcessCount(  // Computes a strip of C with the MMA accumulators, up to eight columns per pass; returns CountM.
+    const double* A,
+    const double* B,
+    double* C,
+    size_t CountM,  // 8 enables the second group of four rows (acc[4..7]); the value is returned unchanged
+    size_t CountK,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    MLAS_FLOAT64X2 AlphaBroadcast,
+    bool ZeroMode
+    )
+{
+    do {  // one pass per panel of up to eight output columns
+
+        const double* a = A;  // A is re-read from its start for every column panel
+        size_t k = CountK;
+
+        MLAS_FLOAT64X2 Accumulators[2][RowCount] = {{ 0 }};  // staging for a trailing odd column: [0] first row group, [1] second row group
+        MLAS_FLOAT64X2 Result[RowCount];  // receives one disassembled accumulator at a time
+        MLAS_FLOAT64X2 AElements[RowCount];
+        MLAS_FLOAT64X2 ABroadcast[RowCount] = { 0 };
+        MLAS_FLOAT64X2 A2Broadcast[RowCount] = { 0 };
+        MLAS_FLOAT64X2 A3Broadcast[RowCount] = { 0 };
+        MLAS_FLOAT64X2 A4Broadcast[RowCount] = { 0 };
+        double ARow[RowCount] = { 0 };
+        double A2Row[RowCount] = { 0 };  // stays zero (and is not consumed) unless CountM == 8
+        __vector_quad acc[8];  // acc[0..3] are stored at C, acc[4..7] at C + ldc * 4
+
+        //
+        // Clear the block accumulators.
+        //
+        __builtin_mma_xxsetaccz(&acc[0]);
+        __builtin_mma_xxsetaccz(&acc[1]);
+        __builtin_mma_xxsetaccz(&acc[2]);
+        __builtin_mma_xxsetaccz(&acc[3]);
+        __builtin_mma_xxsetaccz(&acc[4]);
+        __builtin_mma_xxsetaccz(&acc[5]);
+        __builtin_mma_xxsetaccz(&acc[6]);
+        __builtin_mma_xxsetaccz(&acc[7]);
+
+        //
+        // Compute the output block.
+        //
+        while (k >= 4) {  // main loop: four K steps per iteration
+
+            MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a, lda);  // K steps 0-1, first row group
+            MlasDgemmComputeAElements<RowCount>(AElements, ABroadcast);
+            MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a+2, lda);  // K steps 2-3, first row group
+            MlasDgemmComputeAElements<RowCount>(AElements, A3Broadcast);
+            if (CountM == 8) {  // same loads for the row group that starts four rows (lda * 4) further down
+                MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a + ( lda * 4), lda);
+                MlasDgemmComputeAElements<RowCount>(AElements, A2Broadcast);
+                MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, (a+2) + ( lda * 4), lda);
+                MlasDgemmComputeAElements<RowCount>(AElements, A4Broadcast);
+            }
+            MlasDgemmComputeBlockMMA<RowCount>(&acc[0], &ABroadcast[0], &A2Broadcast[0], B, CountM);  // K step 0
+            MlasDgemmComputeBlockMMA<RowCount>(&acc[0], &ABroadcast[2], &A2Broadcast[2], B+8, CountM);  // K step 1
+            MlasDgemmComputeBlockMMA<RowCount>(&acc[0], &A3Broadcast[0], &A4Broadcast[0], B+16, CountM);  // K step 2
+            MlasDgemmComputeBlockMMA<RowCount>(&acc[0], &A3Broadcast[2], &A4Broadcast[2], B+24, CountM);  // K step 3
+            B += 8 * 4;  // four K steps of eight doubles each
+            a += 4;
+            k -= 4;
+        }
+        while (k > 0) {  // remaining K steps, one at a time
+            MlasLoopUnroll<RowCount, MlasDgemmBroadcastAElementsMMA>()(ARow, a, lda);
+            if (CountM == 8)  {
+                MlasLoopUnroll<RowCount, MlasDgemmBroadcastAElementsMMA>()(A2Row, a + (lda * 4), lda);
+            }
+
+            MlasDgemmComputeBlockMMA<RowCount>(&acc[0], (MLAS_FLOAT64X2 *)ARow, (MLAS_FLOAT64X2 *)A2Row, B, CountM);  // the scalar arrays are reinterpreted as vectors
+            a += 1;
+            B += 8;
+            k -= 1;
+        }
+        if (CountN >= 8) {
+
+            //
+            // Store the entire output block.
+            //
+            __builtin_mma_disassemble_acc ((void *)Result, &acc[0]);
+            MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);  // columns 0-1
+            __builtin_mma_disassemble_acc ((void *)Result, &acc[1]);
+            MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<2>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);  // columns 2-3
+            __builtin_mma_disassemble_acc ((void *)Result, &acc[2]);
+            MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<4>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);  // columns 4-5
+            __builtin_mma_disassemble_acc ((void *)Result, &acc[3]);
+            MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<6>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);  // columns 6-7
+            if (CountM == 8) {  // same four stores for the second row group
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[4]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[5]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<2>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[6]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<4>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[7]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<6>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+            }
+        } else {
+
+            //
+            // Store the partial output block.
+            //
+
+            if (CountN >= 6) {  // CountN is 6 or 7: store three column pairs
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[0]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[1]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<2>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[2]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<4>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);
+                if (CountM == 8) {
+                    __builtin_mma_disassemble_acc ((void *)Result, &acc[4]);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                    __builtin_mma_disassemble_acc ((void *)Result, &acc[5]);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<2>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                    __builtin_mma_disassemble_acc ((void *)Result, &acc[6]);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<4>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                    if (CountN - 6 > 0) {  // true only for CountN == 7: keep the accumulator holding the odd column
+                        __builtin_mma_disassemble_acc ((void *)Accumulators[1], &acc[7]);
+                    }
+                }
+                if (CountN - 6 > 0) {  // true only for CountN == 7
+                    __builtin_mma_disassemble_acc ((void *)Accumulators[0], &acc[3]);
+                }
+            } else if (CountN >= 4) {  // CountN is 4 or 5: store two column pairs
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[0]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[1]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<2>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);
+                if (CountM == 8) {
+                    __builtin_mma_disassemble_acc ((void *)Result, &acc[4]);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                    __builtin_mma_disassemble_acc ((void *)Result, &acc[5]);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<2>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                    if (CountN - 4 > 0) {  // true only for CountN == 5
+                        __builtin_mma_disassemble_acc ((void *)Accumulators[1], &acc[6]);
+                    }
+                }
+                if (CountN - 4 > 0) {  // true only for CountN == 5
+                    __builtin_mma_disassemble_acc ((void *)Accumulators[0], &acc[2]);
+                }
+            } else if (CountN >= 2) {  // CountN is 2 or 3: store one column pair
+                __builtin_mma_disassemble_acc ((void *)Result, &acc[0]);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C, ldc, AlphaBroadcast, ZeroMode);
+                if (CountM == 8) {
+                    __builtin_mma_disassemble_acc ((void *)Result, &acc[4]);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreVectorMMA<0>>()(Result, C + (ldc*4), ldc, AlphaBroadcast, ZeroMode);
+                    if (CountN - 2 > 0) {  // true only for CountN == 3
+                        __builtin_mma_disassemble_acc ((void *)Accumulators[1], &acc[5]);
+                    }
+                }
+                if (CountN - 2 > 0) {  // true only for CountN == 3
+                    __builtin_mma_disassemble_acc ((void *)Accumulators[0], &acc[1]);
+                }
+            } else {  // CountN == 1: only the single trailing column remains
+                __builtin_mma_disassemble_acc ((void *)Accumulators[0], &acc[0]);
+                if (CountM == 8) {
+                    __builtin_mma_disassemble_acc ((void *)Accumulators[1], &acc[4]);
+                }
+           }
+
+            //
+            // Store the remaining unaligned columns.
+            //
+            C += (CountN & ~1);  // skip the even number of columns stored above
+            CountN &= 1;  // 1 when a single trailing column is left, else 0
+
+            if (CountN > 0) {
+
+                MlasLoopUnroll<RowCount, MlasDgemmMultiplyAlphaTrailingMMA>()(Accumulators[0], AlphaBroadcast);
+                MlasLoopUnroll<RowCount, MlasDgemmStoreScalarMMA<0>>()(Accumulators[0], C, ldc, ZeroMode);  // lane 0 of each row is written
+                if (CountM == 8) {
+                    MlasLoopUnroll<RowCount, MlasDgemmMultiplyAlphaTrailingMMA>()(Accumulators[1], AlphaBroadcast);
+                    MlasLoopUnroll<RowCount, MlasDgemmStoreScalarMMA<0>>()(Accumulators[1], C + (ldc*4), ldc, ZeroMode);
+                }
+            }
+
+            break;  // a partial panel is always the last one
+        }
+
+        C += 8;  // advance to the next eight-column panel
+        CountN -= 8;
+
+    } while (CountN > 0);
+
+    return CountM;
+}
+
+size_t
+MLASCALL
+MlasDgemmKernelPOWER10(
+    const double* A,
+    const double* B,
+    double* C,
+    size_t CountK,
+    size_t CountM,
+    size_t CountN,
+    size_t lda,
+    size_t ldc,
+    double alpha,
+    bool ZeroMode
+    )
+/*++
+
+Routine Description:
+
+    This routine is an inner kernel to compute matrix multiplication for a
+    set of rows.
+
+Arguments:
+
+    A - Supplies the address of matrix A.
+
+    B - Supplies the address of matrix B. The matrix data has been packed using
+        MlasDgemmCopyPackB or MlasDgemmTransposePackB.
+
+    C - Supplies the address of matrix C.
+
+    CountK - Supplies the number of columns from matrix A and the number of rows
+        from matrix B to iterate over.
+
+    CountM - Supplies the maximum number of rows that can be processed for
+        matrix A and matrix C. The actual number of rows handled for this
+        invocation depends on the kernel implementation.
+
+    CountN - Supplies the number of columns from matrix B and matrix C to
+        iterate over.
+
+    lda - Supplies the first dimension of matrix A.
+
+    ldc - Supplies the first dimension of matrix C.
+
+    alpha - Supplies the scalar multiplier (see DGEMM definition).
+
+    ZeroMode - Supplies true if the output matrix must be zero initialized,
+        else false if the output matrix is accumulated into.
+
+Return Value:
+
+    Returns the number of rows handled: 8 or 4 on the MMA paths, otherwise the value returned by MlasDgemmProcessCount.
+
+--*/
+{
+    size_t RowsHandled;
+    MLAS_FLOAT64X2 AlphaBroadcast = MlasBroadcastFloat64x2(alpha);
+    if (CountM >= 8) {  // MMA path, two groups of four rows
+        RowsHandled = MlasDgemmMMAProcessCount<4>(A, B, C, 8 ,CountK, CountN, lda, ldc, AlphaBroadcast, ZeroMode);
+    } else if (CountM >= 4) {  // MMA path, one group of four rows
+        RowsHandled = MlasDgemmMMAProcessCount<4>(A, B, C, 4, CountK, CountN, lda, ldc, AlphaBroadcast, ZeroMode);
+    } else if (CountM >= 2) {  // fewer than four rows: fall back to the non-MMA helper
+        RowsHandled = MlasDgemmProcessCount<2>(A, B, C, CountK, CountN, lda, ldc, AlphaBroadcast, ZeroMode);
+    } else {
+        RowsHandled = MlasDgemmProcessCount<1>(A, B, C, CountK, CountN, lda, ldc, AlphaBroadcast, ZeroMode);
+    }
+
+    return RowsHandled;
+}
diff --git a/onnxruntime/core/mlas/lib/power/DgemmKernelpower.h b/onnxruntime/core/mlas/lib/power/DgemmKernelpower.h
index a7f780a22d..0dca7e4e43 100644
--- a/onnxruntime/core/mlas/lib/power/DgemmKernelpower.h
+++ b/onnxruntime/core/mlas/lib/power/DgemmKernelpower.h
@@ -6,293 +6,16 @@ Licensed under the MIT License.
 
 Module Name:
 
-    DgemmKernelPower.cpp
+    DgemmKernelpower.h
 
 Abstract:
 
-    This module implements the kernels for the single precision matrix/matrix
+    This module implements the kernels for the double precision matrix/matrix
     multiply operation (DGEMM).
 
 --*/
 
-#include "mlasi.h"
-
-//
-// Templates to ensure that a loop is unrolled.
-//
-
-template<size_t Count, size_t Index>
-struct MlasLoopUnrollStep
-{
-    template<typename IterationType, typename... IterationArgs>
-    MLAS_FORCEINLINE
-    static
-    void
-    Step(
-        IterationArgs&&... Arguments
-        )
-    {
-        IterationType::template Iteration<Count, Index>(Arguments...);
-        MlasLoopUnrollStep<Count, Index + 1>::template Step<IterationType>(Arguments...);
-    }
-};
-
-template<size_t Count>
-struct MlasLoopUnrollStep<Count, Count>
-{
-    template<typename IterationType, typename... IterationArgs>
-    MLAS_FORCEINLINE
-    static
-    void
-    Step(
-        IterationArgs&&...
-        )
-    {
-        // Terminate the loop.
-    }
-};
-
-template<size_t Count, typename IteratorType>
-struct MlasLoopUnroll
-{
-    template<typename... IterationArgs>
-    MLAS_FORCEINLINE
-    void
-    operator()(
-        IterationArgs&&... Arguments
-        )
-    {
-        MlasLoopUnrollStep<Count, 0>::template Step<IteratorType>(Arguments...);
-    }
-};
-
-//
-// Templates used with loop unrolling to perform an action on one row of the
-// output.
-//
-
-struct MlasDgemmZeroAccumulators
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[RowCount][4]
-        )
-    {
-        Accumulators[Row][0] = MlasZeroFloat64x2();
-        Accumulators[Row][1] = MlasZeroFloat64x2();
-        Accumulators[Row][2] = MlasZeroFloat64x2();
-        Accumulators[Row][3] = MlasZeroFloat64x2();
-    }
-};
-
-struct MlasDgemmLoadAElements
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 AElements[RowCount],
-        const double* A,
-        size_t lda
-        )
-    {
-        AElements[Row] = MlasLoadFloat64x2(A + Row * lda);
-    }
-};
-
-struct MlasDgemmBroadcastAElements
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 ABroadcast[RowCount],
-        const double* A,
-        size_t lda
-        )
-    {
-        ABroadcast[Row] = MlasBroadcastFloat64x2(A + Row * lda);
-    }
-};
-
-template<unsigned Lane>
-struct MlasDgemmSplatAElements
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 AElements[RowCount],
-        MLAS_FLOAT64X2 ABroadcast[RowCount]
-        )
-    {
-        ABroadcast[Row] = vec_splat(AElements[Row], Lane);
-    }
-};
-
-struct MlasDgemmMultiplyAddRow
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[RowCount][4],
-        MLAS_FLOAT64X2 ABroadcast[RowCount],
-        MLAS_FLOAT64X2 BElements[4]
-        )
-    {
-        Accumulators[Row][0] = MlasMultiplyAddFloat64x2(ABroadcast[Row], BElements[0], Accumulators[Row][0]);
-        Accumulators[Row][1] = MlasMultiplyAddFloat64x2(ABroadcast[Row], BElements[1], Accumulators[Row][1]);
-        Accumulators[Row][2] = MlasMultiplyAddFloat64x2(ABroadcast[Row], BElements[2], Accumulators[Row][2]);
-        Accumulators[Row][3] = MlasMultiplyAddFloat64x2(ABroadcast[Row], BElements[3], Accumulators[Row][3]);
-    }
-};
-
-template<size_t RowCount>
-MLAS_FORCEINLINE
-void
-MlasDgemmComputeBlock(
-    MLAS_FLOAT64X2 Accumulators[RowCount][4],
-    MLAS_FLOAT64X2 ABroadcast[RowCount],
-    const double* B
-    )
-{
-    MLAS_FLOAT64X2 BElements[4];
-
-    BElements[0] = MlasLoadFloat64x2(B);
-    BElements[1] = MlasLoadFloat64x2(B + 2);
-    BElements[2] = MlasLoadFloat64x2(B + 4);
-    BElements[3] = MlasLoadFloat64x2(B + 6);
-
-    MlasLoopUnroll<RowCount, MlasDgemmMultiplyAddRow>()(Accumulators, ABroadcast, BElements);
-}
-
-struct MlasDgemmMultiplyAlphaRow
-{
-    template<size_t Count, size_t Index>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[4],
-        MLAS_FLOAT64X2 AlphaBroadcast
-        )
-    {
-        Accumulators[Index] = MlasMultiplyFloat64x2(Accumulators[Index], AlphaBroadcast);
-    }
-};
-
-struct MlasDgemmMultiplyAlphaAddRow
-{
-    template<size_t Count, size_t Index>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[4],
-        MLAS_FLOAT64X2 AlphaBroadcast,
-        const double* C
-        )
-    {
-        Accumulators[Index] = MlasMultiplyAddFloat64x2(Accumulators[Index],
-            AlphaBroadcast, MlasLoadFloat64x2(C + Index * 2));
-    }
-};
-
-struct MlasDgemmStoreRow
-{
-    template<size_t Count, size_t Index>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[4],
-        double* C
-        )
-    {
-        MlasStoreFloat64x2(C + Index * 2, Accumulators[Index]);
-    }
-};
-
-template<size_t VectorCount>
-struct MlasDgemmStoreVector
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[RowCount][4],
-        double* C,
-        size_t ldc,
-        MLAS_FLOAT64X2 AlphaBroadcast,
-        bool ZeroMode
-        )
-    {
-        double* c = C + Row * ldc;
-        if (ZeroMode) {
-            MlasLoopUnroll<VectorCount, MlasDgemmMultiplyAlphaRow>()(Accumulators[Row], AlphaBroadcast);
-        } else {
-            MlasLoopUnroll<VectorCount, MlasDgemmMultiplyAlphaAddRow>()(Accumulators[Row], AlphaBroadcast, c);
-        }
-        MlasLoopUnroll<VectorCount, MlasDgemmStoreRow>()(Accumulators[Row], c);
-
-        //
-        // Shift down any unaligned elements to the bottom for further processing.
-        //
-
-        if (VectorCount < 4) {
-            Accumulators[Row][0] = Accumulators[Row][VectorCount];
-        }
-    }
-};
-
-struct MlasDgemmMultiplyAlphaTrailing
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[RowCount][4],
-        MLAS_FLOAT64X2 AlphaBroadcast
-        )
-    {
-        Accumulators[Row][0] = MlasMultiplyFloat64x2(Accumulators[Row][0], AlphaBroadcast);
-    }
-};
-
-template<unsigned Lane>
-struct MlasDgemmStoreScalar
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT64X2 Accumulators[RowCount][4],
-        double* C,
-        size_t ldc,
-        bool ZeroMode
-        )
-    {
-        double* c = C + Row * ldc + Lane;
-        double Value = MlasExtractLaneFloat64x2<Lane>(Accumulators[Row][0]);
-
-        if (!ZeroMode) {
-            Value += *c;
-        }
-
-        *c = Value;
-    }
-};
+#include "FgemmKernelpower.h"
 
 template<size_t RowCount>
 MLAS_FORCEINLINE
@@ -322,20 +45,20 @@ MlasDgemmProcessCount(
         // Clear the block accumulators.
         //
 
-        MlasLoopUnroll<RowCount, MlasDgemmZeroAccumulators>()(Accumulators);
+        MlasLoopUnroll<RowCount, MlasFgemmZeroAccumulators>()(Accumulators);
 
         //
         // Compute the output block.
         //
         while (k >= 2) {
 
-            MlasLoopUnroll<RowCount, MlasDgemmLoadAElements>()(AElements, a, lda);
+            MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a, lda);
 
-            MlasLoopUnroll<RowCount, MlasDgemmSplatAElements<0>>()(AElements, ABroadcast);
-            MlasDgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
+            MlasLoopUnroll<RowCount, MlasFgemmSplatAElements<0>>()(AElements, ABroadcast);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
 
-            MlasLoopUnroll<RowCount, MlasDgemmSplatAElements<1>>()(AElements, ABroadcast);
-            MlasDgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 8);
+            MlasLoopUnroll<RowCount, MlasFgemmSplatAElements<1>>()(AElements, ABroadcast);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 8);
 
             a += 2;
             B += 8 * 2;
@@ -343,8 +66,8 @@ MlasDgemmProcessCount(
         }
         if (k > 0) {
 
-            MlasLoopUnroll<RowCount, MlasDgemmBroadcastAElements>()(ABroadcast, a, lda);
-            MlasDgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
+            MlasLoopUnroll<RowCount, MlasFgemmBroadcastAElements>()(ABroadcast, a, lda);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
 
             a += 1;
             B += 8;
@@ -357,7 +80,7 @@ MlasDgemmProcessCount(
             // Store the entire output block.
             //
 
-            MlasLoopUnroll<RowCount, MlasDgemmStoreVector<4>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+            MlasLoopUnroll<RowCount, MlasFgemmStoreVector<4>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
 
         } else {
 
@@ -367,11 +90,11 @@ MlasDgemmProcessCount(
 
             //
             if (CountN >= 6) {
-                MlasLoopUnroll<RowCount, MlasDgemmStoreVector<3>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreVector<3>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
             } else if (CountN >= 4) {
-                MlasLoopUnroll<RowCount, MlasDgemmStoreVector<2>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreVector<2>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
             } else if (CountN >= 2) {
-                MlasLoopUnroll<RowCount, MlasDgemmStoreVector<1>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreVector<1>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
             }
             //
             // Store the remaining unaligned columns.
@@ -381,9 +104,9 @@ MlasDgemmProcessCount(
 
             if (CountN > 0) {
 
-                MlasLoopUnroll<RowCount, MlasDgemmMultiplyAlphaTrailing>()(Accumulators, AlphaBroadcast);
+                MlasLoopUnroll<RowCount, MlasFgemmMultiplyAlphaTrailing>()(Accumulators, AlphaBroadcast);
 
-                MlasLoopUnroll<RowCount, MlasDgemmStoreScalar<0>>()(Accumulators, C, ldc, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreScalar<0>>()(Accumulators, C, ldc, ZeroMode);
             }
 
             break;
diff --git a/onnxruntime/core/mlas/lib/power/FgemmKernelpower.h b/onnxruntime/core/mlas/lib/power/FgemmKernelpower.h
new file mode 100644
index 0000000000..3746dbc82b
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/power/FgemmKernelpower.h
@@ -0,0 +1,333 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    FgemmKernelpower.h
+
+Abstract:
+
+    This module implements the kernel templates shared by the single and double
+    precision matrix/matrix multiply operations (SGEMM/DGEMM).
+
+--*/
+
+#include "mlasi.h"
+#if defined(SINGLE)  // SINGLE defined: the shared templates operate on float / MLAS_FLOAT32X4
+#define MLAS_FLOATTYPE MLAS_FLOAT32X4
+#define MLAS_GEMMTYPE float 
+#define MLAS_LOAD_FLOAT MlasLoadFloat32x4
+#define MLAS_ZERO_FLOAT MlasZeroFloat32x4
+#define MLAS_STORE_FLOAT MlasStoreFloat32x4
+#define MLAS_EXTRACT_FLOAT MlasExtractLaneFloat32x4
+#define MLAS_MUL_FLOAT MlasMultiplyFloat32x4
+#define MLAS_MULADD_FLOAT MlasMultiplyAddFloat32x4
+#define MLAS_BROADCAST_FLOAT MlasBroadcastFloat32x4
+#else  // SINGLE not defined: the shared templates operate on double / MLAS_FLOAT64X2
+#define MLAS_FLOATTYPE MLAS_FLOAT64X2
+#define MLAS_GEMMTYPE double
+#define MLAS_LOAD_FLOAT MlasLoadFloat64x2
+#define MLAS_ZERO_FLOAT MlasZeroFloat64x2
+#define MLAS_STORE_FLOAT MlasStoreFloat64x2
+#define MLAS_EXTRACT_FLOAT MlasExtractLaneFloat64x2
+#define MLAS_MUL_FLOAT MlasMultiplyFloat64x2
+#define MLAS_MULADD_FLOAT MlasMultiplyAddFloat64x2
+#define MLAS_BROADCAST_FLOAT MlasBroadcastFloat64x2
+#endif  // defined(SINGLE)
+//
+// Templates to ensure that a loop is unrolled.
+//
+
+template<size_t Count, size_t Index>
+struct MlasLoopUnrollStep  // Runs iteration `Index` of `Count`, then recurses into Index + 1.
+{
+    template<typename IterationType, typename... IterationArgs>
+    MLAS_FORCEINLINE
+    static
+    void
+    Step(
+        IterationArgs&&... Arguments
+        )
+    {
+        IterationType::template Iteration<Count, Index>(Arguments...);  // arguments are passed on by name, without std::forward
+        MlasLoopUnrollStep<Count, Index + 1>::template Step<IterationType>(Arguments...);  // next iteration
+    }
+};
+
+template<size_t Count>
+struct MlasLoopUnrollStep<Count, Count>  // Partial specialization for Index == Count: ends the recursion.
+{
+    template<typename IterationType, typename... IterationArgs>
+    MLAS_FORCEINLINE
+    static
+    void
+    Step(
+        IterationArgs&&...
+        )
+    {
+        // Terminate the loop.
+    }
+};
+
+template<size_t Count, typename IteratorType>
+struct MlasLoopUnroll  // Function object that invokes IteratorType::Iteration<Count, I> for I = 0 .. Count - 1.
+{
+    template<typename... IterationArgs>
+    MLAS_FORCEINLINE
+    void
+    operator()(
+        IterationArgs&&... Arguments
+        )
+    {
+        MlasLoopUnrollStep<Count, 0>::template Step<IteratorType>(Arguments...);  // start the recursion at index 0
+    }
+};
+
+//
+// Templates used with loop unrolling to perform an action on one row of the
+// output.
+//
+
+struct MlasFgemmZeroAccumulators  // MlasLoopUnroll step: clears the four accumulator vectors of one row.
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(
+        MLAS_FLOATTYPE Accumulators[RowCount][4]  // in/out: only row `Row` is written
+        )
+    {
+        Accumulators[Row][0] = MLAS_ZERO_FLOAT();
+        Accumulators[Row][1] = MLAS_ZERO_FLOAT();
+        Accumulators[Row][2] = MLAS_ZERO_FLOAT();
+        Accumulators[Row][3] = MLAS_ZERO_FLOAT();
+    }
+};
+
+struct MlasFgemmLoadAElements  // MlasLoopUnroll step: loads one vector of A for row `Row`.
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(
+        MLAS_FLOATTYPE AElements[RowCount],  // out: AElements[Row] receives the loaded vector
+        const MLAS_GEMMTYPE* A,              // in: address of the elements for row 0
+        size_t lda                           // in: distance, in elements, between consecutive rows of A
+        )
+    {
+        AElements[Row] = MLAS_LOAD_FLOAT(A + Row * lda);
+    }
+};
+
+struct MlasFgemmBroadcastAElements  // MlasLoopUnroll step: broadcasts the element of A at A + Row * lda into a vector.
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(
+        MLAS_FLOATTYPE ABroadcast[RowCount],  // out: ABroadcast[Row] receives the broadcast vector
+        const MLAS_GEMMTYPE* A,
+        size_t lda
+        )
+    {
+        ABroadcast[Row] = MLAS_BROADCAST_FLOAT(A + Row * lda);  // the broadcast helper is given a pointer, not a value
+    }
+};
+
+template<unsigned Lane>  // Lane: index of the vector element to replicate
+struct MlasFgemmSplatAElements
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(  // MlasLoopUnroll step: fills ABroadcast[Row] with element `Lane` of AElements[Row].
+        MLAS_FLOATTYPE AElements[RowCount],
+        MLAS_FLOATTYPE ABroadcast[RowCount]
+        )
+    {
+        ABroadcast[Row] = vec_splat(AElements[Row], Lane);
+    }
+};
+
+struct MlasFgemmMultiplyAddRow  // MlasLoopUnroll step: updates the four accumulator vectors of one row.
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE
+    static
+    void
+    Iteration(
+        MLAS_FLOATTYPE Accumulators[RowCount][4],  // in/out: only row `Row` is updated
+        MLAS_FLOATTYPE ABroadcast[RowCount],       // in: ABroadcast[Row] is combined with every vector of B
+        MLAS_FLOATTYPE BElements[4]                // in: four vectors of B
+        )
+    {
+        Accumulators[Row][0] = MLAS_MULADD_FLOAT(ABroadcast[Row], BElements[0], Accumulators[Row][0]);  // presumably a * b + acc; confirm against the mlasi.h helper
+        Accumulators[Row][1] = MLAS_MULADD_FLOAT(ABroadcast[Row], BElements[1], Accumulators[Row][1]);
+        Accumulators[Row][2] = MLAS_MULADD_FLOAT(ABroadcast[Row], BElements[2], Accumulators[Row][2]);
+        Accumulators[Row][3] = MLAS_MULADD_FLOAT(ABroadcast[Row], BElements[3], Accumulators[Row][3]);
+    }
+};
+
+template<size_t RowCount>
+MLAS_FORCEINLINE
+void
+MlasFgemmComputeBlock(  // Loads four vectors of B and applies MlasFgemmMultiplyAddRow to every row.
+    MLAS_FLOATTYPE Accumulators[RowCount][4],
+    MLAS_FLOATTYPE ABroadcast[RowCount],
+    const MLAS_GEMMTYPE* B
+    )
+{
+    MLAS_FLOATTYPE BElements[4];
+#if defined(SINGLE)  // vectors are loaded 4 elements apart: 16 elements of B in total
+    BElements[0] = MLAS_LOAD_FLOAT(B);
+    BElements[1] = MLAS_LOAD_FLOAT(B + 4);
+    BElements[2] = MLAS_LOAD_FLOAT(B + 8);
+    BElements[3] = MLAS_LOAD_FLOAT(B + 12);
+#else  // vectors are loaded 2 elements apart: 8 elements of B in total
+    BElements[0] = MLAS_LOAD_FLOAT(B);
+    BElements[1] = MLAS_LOAD_FLOAT(B + 2);
+    BElements[2] = MLAS_LOAD_FLOAT(B + 4);
+    BElements[3] = MLAS_LOAD_FLOAT(B + 6);
+#endif
+
+    MlasLoopUnroll<RowCount, MlasFgemmMultiplyAddRow>()(Accumulators, ABroadcast, BElements);
+}
+
+//
+// Loop-unroll step: scales Accumulators[Index] by AlphaBroadcast using MLAS_MUL_FLOAT.
+//
+struct MlasFgemmMultiplyAlphaRow
+{
+    template<size_t Count, size_t Index>
+    MLAS_FORCEINLINE static void
+    Iteration(MLAS_FLOATTYPE Accumulators[4], MLAS_FLOATTYPE AlphaBroadcast)
+    {
+        MLAS_FLOATTYPE& Acc = Accumulators[Index];
+
+        Acc = MLAS_MUL_FLOAT(Acc, AlphaBroadcast);
+    }
+};
+
+//
+// Loop-unroll step: replaces Accumulators[Index] with
+// MLAS_MULADD_FLOAT(Accumulators[Index], AlphaBroadcast, c), where c is loaded
+// from C at element offset Index * 4 (SINGLE defined) or Index * 2 (otherwise).
+//
+struct MlasFgemmMultiplyAlphaAddRow
+{
+    template<size_t Count, size_t Index>
+    MLAS_FORCEINLINE static void
+    Iteration(MLAS_FLOATTYPE Accumulators[4], MLAS_FLOATTYPE AlphaBroadcast, const MLAS_GEMMTYPE* C)
+    {
+#if defined(SINGLE)
+        constexpr size_t VectorStride = 4;
+#else
+        constexpr size_t VectorStride = 2;
+#endif
+        const MLAS_GEMMTYPE* Source = C + Index * VectorStride;
+
+        Accumulators[Index] = MLAS_MULADD_FLOAT(Accumulators[Index], AlphaBroadcast, MLAS_LOAD_FLOAT(Source));
+    }
+};
+
+//
+// Loop-unroll step: writes Accumulators[Index] to C at element offset
+// Index * 4 (SINGLE defined) or Index * 2 (otherwise) with MLAS_STORE_FLOAT.
+//
+struct MlasFgemmStoreRow
+{
+    template<size_t Count, size_t Index>
+    MLAS_FORCEINLINE static void
+    Iteration(MLAS_FLOATTYPE Accumulators[4], MLAS_GEMMTYPE* C)
+    {
+#if defined(SINGLE)
+        constexpr size_t VectorStride = 4;
+#else
+        constexpr size_t VectorStride = 2;
+#endif
+        MLAS_STORE_FLOAT(C + Index * VectorStride, Accumulators[Index]);
+    }
+};
+
+//
+// Loop-unroll step: finishes VectorCount vectors of output row Row. The
+// accumulators are scaled by AlphaBroadcast (ZeroMode) or combined with the
+// existing C values via MlasFgemmMultiplyAlphaAddRow, then stored to C + Row * ldc.
+//
+template<size_t VectorCount>
+struct MlasFgemmStoreVector
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE static void
+    Iteration(MLAS_FLOATTYPE Accumulators[RowCount][4], MLAS_GEMMTYPE* C, size_t ldc,
+        MLAS_FLOATTYPE AlphaBroadcast, bool ZeroMode)
+    {
+        MLAS_GEMMTYPE* c = C + Row * ldc;
+
+        if (ZeroMode) {
+            MlasLoopUnroll<VectorCount, MlasFgemmMultiplyAlphaRow>()(Accumulators[Row], AlphaBroadcast);
+        } else {
+            MlasLoopUnroll<VectorCount, MlasFgemmMultiplyAlphaAddRow>()(Accumulators[Row], AlphaBroadcast, c);
+        }
+
+        MlasLoopUnroll<VectorCount, MlasFgemmStoreRow>()(Accumulators[Row], c);
+
+        //
+        // Shift down any unaligned elements to the bottom for further processing.
+        // The branch is discarded at compile time when VectorCount == 4, so the
+        // out-of-range index Accumulators[Row][4] is never instantiated.
+        //
+
+        if constexpr (VectorCount < 4) {
+            Accumulators[Row][0] = Accumulators[Row][VectorCount];
+        }
+    }
+};
+
+//
+// Loop-unroll step: scales Accumulators[Row][0] by AlphaBroadcast using MLAS_MUL_FLOAT.
+//
+struct MlasFgemmMultiplyAlphaTrailing
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE static void
+    Iteration(MLAS_FLOATTYPE Accumulators[RowCount][4], MLAS_FLOATTYPE AlphaBroadcast)
+    {
+        MLAS_FLOATTYPE& Trailing = Accumulators[Row][0];
+
+        Trailing = MLAS_MUL_FLOAT(Trailing, AlphaBroadcast);
+    }
+};
+
+//
+// Loop-unroll step: extracts lane Lane of Accumulators[Row][0] and writes it to
+// C[Row * ldc + Lane], adding the value already stored there unless ZeroMode
+// is set.
+//
+template<unsigned Lane>
+struct MlasFgemmStoreScalar
+{
+    template<size_t RowCount, size_t Row>
+    MLAS_FORCEINLINE static void
+    Iteration(
+        MLAS_FLOATTYPE Accumulators[RowCount][4], MLAS_GEMMTYPE* C, size_t ldc, bool ZeroMode
+        )
+    {
+        MLAS_GEMMTYPE* Destination = C + Row * ldc + Lane;
+        const MLAS_GEMMTYPE Extracted = MLAS_EXTRACT_FLOAT<Lane>(Accumulators[Row][0]);
+
+        if (ZeroMode) {
+            *Destination = Extracted;
+        } else {
+            *Destination = Extracted + *Destination;
+        }
+    }
+};
+
diff --git a/onnxruntime/core/mlas/lib/power/SgemmKernelPOWER10.cpp b/onnxruntime/core/mlas/lib/power/SgemmKernelPOWER10.cpp
index 9ba4f8062a..bc08af0cd7 100644
--- a/onnxruntime/core/mlas/lib/power/SgemmKernelPOWER10.cpp
+++ b/onnxruntime/core/mlas/lib/power/SgemmKernelPOWER10.cpp
@@ -188,10 +188,10 @@ MlasSgemmMMAProcessCount(
         //
         while (k >= 4) {
 
-            MlasLoopUnroll<RowCount, MlasSgemmLoadAElements>()(AElements, a, lda);
+            MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a, lda);
             MlasSgemmComputeAElements<RowCount>(AElements, ABroadcast);
             if (CountM == 8) {
-                MlasLoopUnroll<RowCount, MlasSgemmLoadAElements>()(AElements, a + ( lda * 4), lda);
+                MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a + ( lda * 4), lda);
                 MlasSgemmComputeAElements<RowCount>(AElements, A2Broadcast);
             }
             MlasSgemmComputeBlockMMA<RowCount>(&acc[0], ABroadcast[0], A2Broadcast[0], B, CountM);
diff --git a/onnxruntime/core/mlas/lib/power/SgemmKernelpower.h b/onnxruntime/core/mlas/lib/power/SgemmKernelpower.h
index 1cd8d7dd16..53be544bdb 100644
--- a/onnxruntime/core/mlas/lib/power/SgemmKernelpower.h
+++ b/onnxruntime/core/mlas/lib/power/SgemmKernelpower.h
@@ -6,7 +6,7 @@ Licensed under the MIT License.
 
 Module Name:
 
-    SgemmKernelPower.cpp
+    SgemmKernelpower.h
 
 Abstract:
 
@@ -15,286 +15,7 @@ Abstract:
 
 --*/
 
-#include "mlasi.h"
-
-//
-// Templates to ensure that a loop is unrolled.
-//
-
-template<size_t Count, size_t Index>
-struct MlasLoopUnrollStep
-{
-    template<typename IterationType, typename... IterationArgs>
-    MLAS_FORCEINLINE
-    static
-    void
-    Step(
-        IterationArgs&&... Arguments
-        )
-    {
-        IterationType::template Iteration<Count, Index>(Arguments...);
-        MlasLoopUnrollStep<Count, Index + 1>::template Step<IterationType>(Arguments...);
-    }
-};
-
-template<size_t Count>
-struct MlasLoopUnrollStep<Count, Count>
-{
-    template<typename IterationType, typename... IterationArgs>
-    MLAS_FORCEINLINE
-    static
-    void
-    Step(
-        IterationArgs&&...
-        )
-    {
-        // Terminate the loop.
-    }
-};
-
-template<size_t Count, typename IteratorType>
-struct MlasLoopUnroll
-{
-    template<typename... IterationArgs>
-    MLAS_FORCEINLINE
-    void
-    operator()(
-        IterationArgs&&... Arguments
-        )
-    {
-        MlasLoopUnrollStep<Count, 0>::template Step<IteratorType>(Arguments...);
-    }
-};
-
-//
-// Templates used with loop unrolling to perform an action on one row of the
-// output.
-//
-
-struct MlasSgemmZeroAccumulators
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[RowCount][4]
-        )
-    {
-        Accumulators[Row][0] = MlasZeroFloat32x4();
-        Accumulators[Row][1] = MlasZeroFloat32x4();
-        Accumulators[Row][2] = MlasZeroFloat32x4();
-        Accumulators[Row][3] = MlasZeroFloat32x4();
-    }
-};
-
-struct MlasSgemmLoadAElements
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 AElements[RowCount],
-        const float* A,
-        size_t lda
-        )
-    {
-        AElements[Row] = MlasLoadFloat32x4(A + Row * lda);
-    }
-};
-
-struct MlasSgemmBroadcastAElements
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 ABroadcast[RowCount],
-        const float* A,
-        size_t lda
-        )
-    {
-        ABroadcast[Row] = MlasBroadcastFloat32x4(A + Row * lda);
-    }
-};
-
-template<unsigned Lane>
-struct MlasSgemmSplatAElements
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 AElements[RowCount],
-        MLAS_FLOAT32X4 ABroadcast[RowCount]
-        )
-    {
-        ABroadcast[Row] = vec_splat(AElements[Row], Lane);
-    }
-};
-
-struct MlasSgemmMultiplyAddRow
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[RowCount][4],
-        MLAS_FLOAT32X4 ABroadcast[RowCount],
-        MLAS_FLOAT32X4 BElements[4]
-        )
-    {
-        Accumulators[Row][0] = MlasMultiplyAddFloat32x4(ABroadcast[Row], BElements[0], Accumulators[Row][0]);
-        Accumulators[Row][1] = MlasMultiplyAddFloat32x4(ABroadcast[Row], BElements[1], Accumulators[Row][1]);
-        Accumulators[Row][2] = MlasMultiplyAddFloat32x4(ABroadcast[Row], BElements[2], Accumulators[Row][2]);
-        Accumulators[Row][3] = MlasMultiplyAddFloat32x4(ABroadcast[Row], BElements[3], Accumulators[Row][3]);
-    }
-};
-
-template<size_t RowCount>
-MLAS_FORCEINLINE
-void
-MlasSgemmComputeBlock(
-    MLAS_FLOAT32X4 Accumulators[RowCount][4],
-    MLAS_FLOAT32X4 ABroadcast[RowCount],
-    const float* B
-    )
-{
-    MLAS_FLOAT32X4 BElements[4];
-
-    BElements[0] = MlasLoadFloat32x4(B);
-    BElements[1] = MlasLoadFloat32x4(B + 4);
-    BElements[2] = MlasLoadFloat32x4(B + 8);
-    BElements[3] = MlasLoadFloat32x4(B + 12);
-
-    MlasLoopUnroll<RowCount, MlasSgemmMultiplyAddRow>()(Accumulators, ABroadcast, BElements);
-}
-
-struct MlasSgemmMultiplyAlphaRow
-{
-    template<size_t Count, size_t Index>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[4],
-        MLAS_FLOAT32X4 AlphaBroadcast
-        )
-    {
-        Accumulators[Index] = MlasMultiplyFloat32x4(Accumulators[Index], AlphaBroadcast);
-    }
-};
-
-struct MlasSgemmMultiplyAlphaAddRow
-{
-    template<size_t Count, size_t Index>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[4],
-        MLAS_FLOAT32X4 AlphaBroadcast,
-        const float* C
-        )
-    {
-        Accumulators[Index] = MlasMultiplyAddFloat32x4(Accumulators[Index],
-            AlphaBroadcast, MlasLoadFloat32x4(C + Index * 4));
-    }
-};
-
-struct MlasSgemmStoreRow
-{
-    template<size_t Count, size_t Index>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[4],
-        float* C
-        )
-    {
-        MlasStoreFloat32x4(C + Index * 4, Accumulators[Index]);
-    }
-};
-
-template<size_t VectorCount>
-struct MlasSgemmStoreVector
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[RowCount][4],
-        float* C,
-        size_t ldc,
-        MLAS_FLOAT32X4 AlphaBroadcast,
-        bool ZeroMode
-        )
-    {
-        float* c = C + Row * ldc;
-
-        if (ZeroMode) {
-            MlasLoopUnroll<VectorCount, MlasSgemmMultiplyAlphaRow>()(Accumulators[Row], AlphaBroadcast);
-        } else {
-            MlasLoopUnroll<VectorCount, MlasSgemmMultiplyAlphaAddRow>()(Accumulators[Row], AlphaBroadcast, c);
-        }
-
-        MlasLoopUnroll<VectorCount, MlasSgemmStoreRow>()(Accumulators[Row], c);
-
-        //
-        // Shift down any unaligned elements to the bottom for further processing.
-        //
-
-        if (VectorCount < 4) {
-            Accumulators[Row][0] = Accumulators[Row][VectorCount];
-        }
-    }
-};
-
-struct MlasSgemmMultiplyAlphaTrailing
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[RowCount][4],
-        MLAS_FLOAT32X4 AlphaBroadcast
-        )
-    {
-        Accumulators[Row][0] = MlasMultiplyFloat32x4(Accumulators[Row][0], AlphaBroadcast);
-    }
-};
-
-template<unsigned Lane>
-struct MlasSgemmStoreScalar
-{
-    template<size_t RowCount, size_t Row>
-    MLAS_FORCEINLINE
-    static
-    void
-    Iteration(
-        MLAS_FLOAT32X4 Accumulators[RowCount][4],
-        float* C,
-        size_t ldc,
-        bool ZeroMode
-        )
-    {
-        float* c = C + Row * ldc + Lane;
-        float Value = MlasExtractLaneFloat32x4<Lane>(Accumulators[Row][0]);
-
-        if (!ZeroMode) {
-            Value += *c;
-        }
-
-        *c = Value;
-    }
-};
+#include "FgemmKernelpower.h"
 
 template<size_t RowCount>
 MLAS_FORCEINLINE
@@ -324,7 +45,7 @@ MlasSgemmProcessCount(
         // Clear the block accumulators.
         //
 
-        MlasLoopUnroll<RowCount, MlasSgemmZeroAccumulators>()(Accumulators);
+        MlasLoopUnroll<RowCount, MlasFgemmZeroAccumulators>()(Accumulators);
 
         //
         // Compute the output block.
@@ -332,19 +53,19 @@ MlasSgemmProcessCount(
 
         while (k >= 4) {
 
-            MlasLoopUnroll<RowCount, MlasSgemmLoadAElements>()(AElements, a, lda);
+            MlasLoopUnroll<RowCount, MlasFgemmLoadAElements>()(AElements, a, lda);
 
-            MlasLoopUnroll<RowCount, MlasSgemmSplatAElements<0>>()(AElements, ABroadcast);
-            MlasSgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
+            MlasLoopUnroll<RowCount, MlasFgemmSplatAElements<0>>()(AElements, ABroadcast);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
 
-            MlasLoopUnroll<RowCount, MlasSgemmSplatAElements<1>>()(AElements, ABroadcast);
-            MlasSgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 16);
+            MlasLoopUnroll<RowCount, MlasFgemmSplatAElements<1>>()(AElements, ABroadcast);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 16);
 
-            MlasLoopUnroll<RowCount, MlasSgemmSplatAElements<2>>()(AElements, ABroadcast);
-            MlasSgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 32);
+            MlasLoopUnroll<RowCount, MlasFgemmSplatAElements<2>>()(AElements, ABroadcast);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 32);
 
-            MlasLoopUnroll<RowCount, MlasSgemmSplatAElements<3>>()(AElements, ABroadcast);
-            MlasSgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 48);
+            MlasLoopUnroll<RowCount, MlasFgemmSplatAElements<3>>()(AElements, ABroadcast);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B + 48);
 
             a += 4;
             B += 16 * 4;
@@ -353,8 +74,8 @@ MlasSgemmProcessCount(
 
         while (k > 0) {
 
-            MlasLoopUnroll<RowCount, MlasSgemmBroadcastAElements>()(ABroadcast, a, lda);
-            MlasSgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
+            MlasLoopUnroll<RowCount, MlasFgemmBroadcastAElements>()(ABroadcast, a, lda);
+            MlasFgemmComputeBlock<RowCount>(Accumulators, ABroadcast, B);
 
             a += 1;
             B += 16;
@@ -367,7 +88,7 @@ MlasSgemmProcessCount(
             // Store the entire output block.
             //
 
-            MlasLoopUnroll<RowCount, MlasSgemmStoreVector<4>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+            MlasLoopUnroll<RowCount, MlasFgemmStoreVector<4>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
 
         } else {
 
@@ -376,11 +97,11 @@ MlasSgemmProcessCount(
             //
 
             if (CountN >= 12) {
-                MlasLoopUnroll<RowCount, MlasSgemmStoreVector<3>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreVector<3>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
             } else if (CountN >= 8) {
-                MlasLoopUnroll<RowCount, MlasSgemmStoreVector<2>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreVector<2>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
             } else if (CountN >= 4) {
-                MlasLoopUnroll<RowCount, MlasSgemmStoreVector<1>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreVector<1>>()(Accumulators, C, ldc, AlphaBroadcast, ZeroMode);
             }
 
             //
@@ -392,16 +113,16 @@ MlasSgemmProcessCount(
 
             if (CountN > 0) {
 
-                MlasLoopUnroll<RowCount, MlasSgemmMultiplyAlphaTrailing>()(Accumulators, AlphaBroadcast);
+                MlasLoopUnroll<RowCount, MlasFgemmMultiplyAlphaTrailing>()(Accumulators, AlphaBroadcast);
 
-                MlasLoopUnroll<RowCount, MlasSgemmStoreScalar<0>>()(Accumulators, C, ldc, ZeroMode);
+                MlasLoopUnroll<RowCount, MlasFgemmStoreScalar<0>>()(Accumulators, C, ldc, ZeroMode);
 
                 if (CountN >= 2) {
-                    MlasLoopUnroll<RowCount, MlasSgemmStoreScalar<1>>()(Accumulators, C, ldc, ZeroMode);
+                    MlasLoopUnroll<RowCount, MlasFgemmStoreScalar<1>>()(Accumulators, C, ldc, ZeroMode);
                 }
 
                 if (CountN >= 3) {
-                    MlasLoopUnroll<RowCount, MlasSgemmStoreScalar<2>>()(Accumulators, C, ldc, ZeroMode);
+                    MlasLoopUnroll<RowCount, MlasFgemmStoreScalar<2>>()(Accumulators, C, ldc, ZeroMode);
                 }
             }
 
diff --git a/onnxruntime/core/mlas/lib/qdwconv_kernelsize.cpp b/onnxruntime/core/mlas/lib/qdwconv_kernelsize.cpp
index 097ce8fcea..32a7fcec91 100644
--- a/onnxruntime/core/mlas/lib/qdwconv_kernelsize.cpp
+++ b/onnxruntime/core/mlas/lib/qdwconv_kernelsize.cpp
@@ -20,9 +20,8 @@ MlasConvSymDepthwiseKernelSize25Arm(
     const int16x8_t voutput_zero_point = vld1q_dup_s16((int16_t const*)&PostProcessParams->OutputZeroPoint);
     float32x4_t vscale_0123, vscale_4567, vscale_89AB, vscale_CDEF;
     const bool is_per_channel = ((KernelFlags & MLAS_CONV_SYM_FLAG_PER_CHANNEL_SCALE) != 0);
-    if (!is_per_channel) {
-        vscale_0123 = vscale_4567 = vscale_89AB = vscale_CDEF = vld1q_dup_f32(PostProcessParams->Scale);
-    }
+    // Initialize these unconditionally because some compilers otherwise report uninitialized-variable warnings.
+    vscale_0123 = vscale_4567 = vscale_89AB = vscale_CDEF = vld1q_dup_f32(PostProcessParams->Scale);
     while (OutputCount-- > 0) {
         const uint8_t* i00 = InputIndirection[0];
         const uint8_t* i01 = InputIndirection[1];
diff --git a/onnxruntime/core/mlas/lib/qlgavgpool.cpp b/onnxruntime/core/mlas/lib/qlgavgpool.cpp
index d8972eecbf..38c777d34b 100644
--- a/onnxruntime/core/mlas/lib/qlgavgpool.cpp
+++ b/onnxruntime/core/mlas/lib/qlgavgpool.cpp
@@ -24,6 +24,7 @@ MlasQLinearSafePaddingElementCount(
     )
 {
     if (!(ElementSize == 1 || ElementSize == 2 || ElementSize == 4 || ElementSize == 8 || ElementSize == 16)) {
+
 #ifdef MLAS_NO_EXCEPTION
         abort();
 #else
@@ -42,6 +43,7 @@ CheckQLinearGlobalAveragePoolScaleAndSize(
     )
 {
     if (ImageSize >= 0x1000000) {
+
 #ifdef MLAS_NO_EXCEPTION
         abort();
 #else
@@ -51,6 +53,7 @@ CheckQLinearGlobalAveragePoolScaleAndSize(
 
     float scale = ScaleInput / (ScaleOutput * static_cast<float>(ImageSize));
     if (scale < 0x1.0p-32f || scale >= 256.0f) {
+
         // In first case, the scale is too small, ScaleInput/ScaleOutput < 1/256 no matter what ImageSize
         // In second case, the scale is too large, ScaleInput/ScaleOutput >= 256 no matter what Image Size
         // both case make output value constant, and hence not meaningful.
@@ -65,13 +68,14 @@ CheckQLinearGlobalAveragePoolScaleAndSize(
 
 #if defined(MLAS_NEON_INTRINSICS)
 
+template <typename T8Bits>
 void
 MLASCALL
 MlasQLinearGlobalAveragePoolNchw(
-    const uint8_t* Input,
+    const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
-    uint8_t* Output,
+    T8Bits* Output,
     float ScaleOutput,
     int32_t ZeroPointOutput,
     size_t Channels,
@@ -83,38 +87,65 @@ MlasQLinearGlobalAveragePoolNchw(
     int32_t bias[] = {-ZeroPointInput * static_cast<int32_t>(ImageSize), 0, 0, 0};
     const int32x4_t vbias = vld1q_s32(bias);
     const int32x4_t vzero = vmovq_n_s32(0);
+    const uint8_t* InputU8 = (const uint8_t*)(Input);
 
     int32_t* sum_buffer = AccumulateBuffer;
     uint8_t tail_buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0};
     for (size_t c = Channels; c > 0; c--) {
+
         int32x4_t vacc_lo = vbias;
         int32x4_t vacc_hi = vzero;
         auto Len = ImageSize;
         for (; Len >= 32; Len -= 32) {
-            const uint8x8_t vi0 = vld1_u8(Input);
-            const uint8x8_t vi1 = vld1_u8(Input + 8);
-            const uint8x8_t vi2 = vld1_u8(Input + 16);
-            const uint8x8_t vi3 = vld1_u8(Input + 24);
 
-            const uint16x8_t vs01 = vaddl_u8(vi0, vi1);
-            const uint16x8_t vs23 = vaddl_u8(vi2, vi3);
-            const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vs01, vs23));
+            const uint8x8_t vi0 = vld1_u8(InputU8);
+            const uint8x8_t vi1 = vld1_u8(InputU8 + 8);
+            const uint8x8_t vi2 = vld1_u8(InputU8 + 16);
+            const uint8x8_t vi3 = vld1_u8(InputU8 + 24);
+
+            int16x8_t vsum;
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const int16x8_t vs01 = vaddl_s8(vreinterpret_s8_u8(vi0), vreinterpret_s8_u8(vi1));
+                const int16x8_t vs23 = vaddl_s8(vreinterpret_s8_u8(vi2), vreinterpret_s8_u8(vi3));
+                vsum = vaddq_s16(vs01, vs23);
+            } else {
+
+                const uint16x8_t vs01 = vaddl_u8(vi0, vi1);
+                const uint16x8_t vs23 = vaddl_u8(vi2, vi3);
+                vsum = vreinterpretq_s16_u16(vaddq_u16(vs01, vs23));
+            }
+
             vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
             vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
-            Input += 32;
+            InputU8 += 32;
         }
         for (; Len >= 8; Len -= 8) {
-            const int16x8_t vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(Input)));
+
+            int16x8_t vsum;
+            if constexpr (std::is_signed<T8Bits>::value) {
+                vsum = vmovl_s8(vreinterpret_s8_u8(vld1_u8(InputU8)));
+            } else {
+                vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(InputU8)));
+            }
             vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
             vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
-            Input += 8;
+            InputU8 += 8;
         }
+
         if (Len > 0) {
-            memcpy(tail_buffer, Input, Len);
-            const int16x8_t vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(tail_buffer)));
+
+            memcpy(tail_buffer, InputU8, Len);
+            int16x8_t vsum;
+            if constexpr (std::is_signed<T8Bits>::value) {
+                vsum = vmovl_s8(vreinterpret_s8_u8(vld1_u8(tail_buffer)));
+            } else {
+                vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(tail_buffer)));
+            }
+
             vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
             vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
-            Input += Len;
+            InputU8 += Len;
         }
 
         vacc_lo = vaddq_s32(vacc_lo, vacc_hi);
@@ -123,23 +154,24 @@ MlasQLinearGlobalAveragePoolNchw(
     }
 
     MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false,
-                         static_cast<uint8_t>(ZeroPointOutput), 0, 0, 1, Channels);
+                         static_cast<T8Bits>(ZeroPointOutput), 0, 0, 1, Channels);
 }
 
+template <typename T8Bits>
 MLAS_FORCEINLINE
 void
 MlasQLinearGlobalAveragePoolNhwcSingleBatch(
-    const uint8_t* Input,
-    uint8_t* Output,
-    const uint8_t* LastOf8,
+    const T8Bits* Input,
+    T8Bits* Output,
+    const T8Bits* LastOf8,
     size_t ImageSize,
     size_t Channels,
     size_t Stride,
     int32_t Bias,
     float Scale,
-    uint8_t Output_zero_point,
+    T8Bits Output_zero_point,
     int32_t* AccumulateBuffer,
-    const uint8_t* ZeroBuffer
+    const T8Bits* ZeroBuffer
     )
 {
 #define LOAD_FULL_CHANNELS()           \
@@ -158,24 +190,35 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
     const uint8x8_t vi6 = vld1_u8(i6); \
     i6 += 8
 
-#define CALCULATE_ACCUMULATE_VECTORS()                                               \
-    int32x4_t vacc_lo = finish_one_pass ? vld1q_s32(acc) : vbias;                    \
-    int32x4_t vacc_hi = finish_one_pass ? vld1q_s32(acc + 4) : vbias;                \
-    const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);                                    \
-    const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);                                    \
-    const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);                                    \
-    const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);                                \
-    const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);                           \
-    const int16x8_t vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));      \
-    vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));                                \
+#define CALCULATE_ACCUMULATE_VECTORS()                                                         \
+    int32x4_t vacc_lo = finish_one_pass ? vld1q_s32(acc) : vbias;                              \
+    int32x4_t vacc_hi = finish_one_pass ? vld1q_s32(acc + 4) : vbias;                          \
+    int16x8_t vsum;                                                                            \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        const int16x8_t vsum01 = vaddl_s8(vreinterpret_s8_u8(vi0), vreinterpret_s8_u8(vi1));   \
+        const int16x8_t vsum23 = vaddl_s8(vreinterpret_s8_u8(vi2), vreinterpret_s8_u8(vi3));   \
+        const int16x8_t vsum45 = vaddl_s8(vreinterpret_s8_u8(vi4), vreinterpret_s8_u8(vi5));   \
+        const int16x8_t vsum016 = vaddw_s8(vsum01, vreinterpret_s8_u8(vi6));                   \
+        const int16x8_t vsum2345 = vaddq_s16(vsum23, vsum45);                                  \
+        vsum = vaddq_s16(vsum016, vsum2345);                                                   \
+    } else {                                                                                   \
+        const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);                                          \
+        const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);                                          \
+        const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);                                          \
+        const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);                                      \
+        const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);                                 \
+        vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));                            \
+    }                                                                                          \
+    vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));                                          \
     vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum))
 
-    uint8_t tail[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+    uint8_t tail[8] = {0, 0, 0, 0, 0, 0, 0, 0};
     const int32x4_t vbias = vld1q_dup_s32(&Bias);
     bool finish_one_pass = false;
     const size_t step_next_group = 7 * Stride - (Channels & ~size_t{7});
 
-    const uint8_t* i0 = Input;
+    const uint8_t* LastOf8U8 = (const uint8_t*)LastOf8;
+    const uint8_t* i0 = (const uint8_t*)Input;
     const uint8_t* i1 = i0 + Stride;
     const uint8_t* i4 = i0 + Stride * 4;
     const uint8_t* i2 = i1 + Stride;
@@ -184,9 +227,11 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
     const uint8_t* i6 = i5 + Stride;
 
     for (; ImageSize > 7; ImageSize -= 7) {
+
         int32_t* acc = AccumulateBuffer;
         size_t c = Channels;
         for (; c >= 8; c -= 8) {
+
             LOAD_FULL_CHANNELS();
 
             CALCULATE_ACCUMULATE_VECTORS();
@@ -196,13 +241,14 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
             acc += 8;
         }
         if (c > 0) {
-            const uint8x8_t vi0 = vld1_u8(((i0 >= LastOf8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
-            const uint8x8_t vi1 = vld1_u8(((i1 >= LastOf8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
-            const uint8x8_t vi2 = vld1_u8(((i2 >= LastOf8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
-            const uint8x8_t vi3 = vld1_u8(((i3 >= LastOf8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
-            const uint8x8_t vi4 = vld1_u8(((i4 >= LastOf8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
-            const uint8x8_t vi5 = vld1_u8(((i5 >= LastOf8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
-            const uint8x8_t vi6 = vld1_u8(((i6 >= LastOf8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
+
+            const uint8x8_t vi0 = vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
+            const uint8x8_t vi1 = vld1_u8(((i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
+            const uint8x8_t vi2 = vld1_u8(((i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
+            const uint8x8_t vi3 = vld1_u8(((i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
+            const uint8x8_t vi4 = vld1_u8(((i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
+            const uint8x8_t vi5 = vld1_u8(((i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
+            const uint8x8_t vi6 = vld1_u8(((i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
 
             CALCULATE_ACCUMULATE_VECTORS();
 
@@ -221,19 +267,28 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
     }
 
     if (ImageSize > 0) {
+
         switch (ImageSize) {
-        case 1: i1 = ZeroBuffer; /* fall through */
-        case 2: i2 = ZeroBuffer; /* fall through */
-        case 3: i3 = ZeroBuffer; /* fall through */
-        case 4: i4 = ZeroBuffer; /* fall through */
-        case 5: i5 = ZeroBuffer; /* fall through */
-        case 6: i6 = ZeroBuffer; /* fall through */
-        default: break;
+            case 1:
+                i1 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 2:
+                i2 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 3:
+                i3 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 4:
+                i4 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 5:
+                i5 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 6:
+                i6 = (const uint8_t*)ZeroBuffer; /* fall through */
+            default:
+                break;
         }
 
         int32_t* acc = AccumulateBuffer;
         size_t c = Channels;
         for (; c >= 8; c -= 8) {
+
             LOAD_FULL_CHANNELS();
 
             CALCULATE_ACCUMULATE_VECTORS();
@@ -244,13 +299,21 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
         }
 
         if (c > 0) {
-            const uint8x8_t vi0 = vld1_u8(((i0 >= LastOf8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
-            const uint8x8_t vi1 = vld1_u8(((1 < ImageSize && i1 >= LastOf8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
-            const uint8x8_t vi2 = vld1_u8(((2 < ImageSize && i2 >= LastOf8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
-            const uint8x8_t vi3 = vld1_u8(((3 < ImageSize && i3 >= LastOf8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
-            const uint8x8_t vi4 = vld1_u8(((4 < ImageSize && i4 >= LastOf8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
-            const uint8x8_t vi5 = vld1_u8(((5 < ImageSize && i5 >= LastOf8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
-            const uint8x8_t vi6 = vld1_u8(((6 < ImageSize && i6 >= LastOf8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
+
+            const uint8x8_t vi0 =
+                vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
+            const uint8x8_t vi1 = vld1_u8(
+                ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
+            const uint8x8_t vi2 = vld1_u8(
+                ((2 < ImageSize && i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
+            const uint8x8_t vi3 = vld1_u8(
+                ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
+            const uint8x8_t vi4 = vld1_u8(
+                ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
+            const uint8x8_t vi5 = vld1_u8(
+                ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
+            const uint8x8_t vi6 = vld1_u8(
+                ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
 
             CALCULATE_ACCUMULATE_VECTORS();
 
@@ -264,13 +327,13 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
 
 #elif defined(MLAS_SSE2_INTRINSICS)
 
-void
-MLASCALL
+template <typename T8Bits>
+void MLASCALL
 MlasQLinearGlobalAveragePoolNchw(
-    const uint8_t* Input,
+    const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
-    uint8_t* Output,
+    T8Bits* Output,
     float ScaleOutput,
     int32_t ZeroPointOutput,
     size_t Channels,
@@ -286,36 +349,73 @@ MlasQLinearGlobalAveragePoolNchw(
 
     int32_t* sum_buffer = AccumulateBuffer;
     for (size_t c = Channels; c > 0; c--) {
+
         __m128i vacc_lo = vbias;
         __m128i vacc_hi = vzero;
         auto Len = ImageSize;
         for (; Len >= 32; Len -= 32) {
+
             const __m128i vi0 = _mm_loadl_epi64((const __m128i*)Input);
             const __m128i vi1 = _mm_loadl_epi64((const __m128i*)(Input + 8));
             const __m128i vi2 = _mm_loadl_epi64((const __m128i*)(Input + 16));
             const __m128i vi3 = _mm_loadl_epi64((const __m128i*)(Input + 24));
 
-            const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
-            const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
-            const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
-            const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const __m128i vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8);
+                const __m128i vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8);
+                const __m128i vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8);
+                const __m128i vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8);
+                const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1),
+                                                   _mm_add_epi16(vxi2, vxi3));
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16));
+            } else {
+
+                const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
+                const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
+                const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
+                const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
+                const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1),
+                                                   _mm_add_epi16(vxi2, vxi3));
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+            }
 
-            const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), _mm_add_epi16(vxi2, vxi3));
-            vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
-            vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
             Input += 32;
         }
         for (; Len >= 8; Len -= 8) {
-            const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)Input), vzero);
-            vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
-            vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const __m128i vsum = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, _mm_loadl_epi64((const __m128i*)Input)), 8);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16));
+            } else {
+
+                const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)Input), vzero);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+            }
+
             Input += 8;
         }
         if (Len > 0) {
+
             memcpy(buffer, Input, Len);
-            const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)buffer), vzero);
-            vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
-            vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const __m128i vsum = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, _mm_loadl_epi64((const __m128i*)buffer)), 8);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16));
+            } else {
+
+                const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)buffer), vzero);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+            }
+
             Input += Len;
         }
 
@@ -326,24 +426,27 @@ MlasQLinearGlobalAveragePoolNchw(
         vsums = _mm_add_epi32(vsums, vshuf);
         *sum_buffer++ = _mm_cvtsi128_si32(vsums);
     }
+
     MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false,
-                         static_cast<uint8_t>(ZeroPointOutput), 0, 0, 1, Channels);
+                         static_cast<T8Bits>(ZeroPointOutput), 0, 0, 1, Channels);
 }
 
+template <typename T8Bits>
 MLAS_FORCEINLINE
 void
 MlasQLinearGlobalAveragePoolNhwcSingleBatch(
-    const uint8_t* Input,
-    uint8_t* Output,
-    const uint8_t* LastOf8,
+    const T8Bits* Input,
+    T8Bits* Output,
+    const T8Bits* LastOf8,
     size_t ImageSize,
     size_t Channels,
     size_t Stride,
     int32_t Bias,
     float Scale,
-    uint8_t Output_zero_point,
+    T8Bits Output_zero_point,
     int32_t* AccumulateBuffer,
-    const uint8_t* ZeroBuffer)
+    const T8Bits* ZeroBuffer
+    )
 {
 #if defined(MLAS_TARGET_IX86)
 
@@ -359,23 +462,39 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
     const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3); \
     i3 += 8;
 
-#define CALCULATE_ACCUMULATE_VECTORS()                                                                 \
-    __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias;                        \
-    __m128i vacc_hi = finish_one_pass ? _mm_loadu_si128(((__m128i*)acc) + 1) : vbias;                  \
-    const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);                                                \
-    const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);                                                \
-    const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);                                                \
-    const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);                                                \
-    const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);                                                  \
-    const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);                                                  \
-    const __m128i vsum = _mm_add_epi16(vsum01, vsum23);                                                \
-    vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));                                 \
-    vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero))
+#define CALCULATE_ACCUMULATE_VECTORS()                                                         \
+    __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias;                \
+    __m128i vacc_hi = finish_one_pass ? _mm_loadu_si128(((__m128i*)acc) + 1) : vbias;          \
+    __m128i vxi0;                                                                              \
+    __m128i vxi1;                                                                              \
+    __m128i vxi2;                                                                              \
+    __m128i vxi3;                                                                              \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8);                               \
+        vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8);                               \
+        vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8);                               \
+        vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8);                               \
+    } else {                                                                                   \
+        vxi0 = _mm_unpacklo_epi8(vi0, vzero);                                                  \
+        vxi1 = _mm_unpacklo_epi8(vi1, vzero);                                                  \
+        vxi2 = _mm_unpacklo_epi8(vi2, vzero);                                                  \
+        vxi3 = _mm_unpacklo_epi8(vi3, vzero);                                                  \
+    }                                                                                          \
+    __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);                                                \
+    __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);                                                \
+    __m128i vsum = _mm_add_epi16(vsum01, vsum23);                                              \
+                                                                                               \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); \
+    } else {                                                                                   \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));                     \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));                     \
+    }
 
 #else
 
     constexpr size_t PixelsPerIteration = 7;
-
 #define LOAD_FULL_CHANNELS()                                 \
     const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0); \
     i0 += 8;                                                 \
@@ -392,47 +511,71 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
     const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6); \
     i6 += 8
 
-#define CALCULATE_ACCUMULATE_VECTORS()                                                                 \
-    __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias;                        \
-    __m128i vacc_hi = finish_one_pass ? _mm_loadu_si128(((__m128i*)acc) + 1) : vbias;                  \
-    const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);                                                \
-    const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);                                                \
-    const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);                                                \
-    const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);                                                \
-    const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);                                                \
-    const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);                                                \
-    const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);                                                \
-    const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);                                                  \
-    const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);                                                  \
-    const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);                                                  \
-    const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);                                               \
-    const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);                                            \
-    const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);                                             \
-    vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));                                 \
-    vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero))
+#define CALCULATE_ACCUMULATE_VECTORS()                                                         \
+    __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias;                \
+    __m128i vacc_hi = finish_one_pass ? _mm_loadu_si128(((__m128i*)acc) + 1) : vbias;          \
+    __m128i vxi0;                                                                              \
+    __m128i vxi1;                                                                              \
+    __m128i vxi2;                                                                              \
+    __m128i vxi3;                                                                              \
+    __m128i vxi4;                                                                              \
+    __m128i vxi5;                                                                              \
+    __m128i vxi6;                                                                              \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8);                               \
+        vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8);                               \
+        vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8);                               \
+        vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8);                               \
+        vxi4 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi4), 8);                               \
+        vxi5 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi5), 8);                               \
+        vxi6 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi6), 8);                               \
+    } else {                                                                                   \
+        vxi0 = _mm_unpacklo_epi8(vi0, vzero);                                                  \
+        vxi1 = _mm_unpacklo_epi8(vi1, vzero);                                                  \
+        vxi2 = _mm_unpacklo_epi8(vi2, vzero);                                                  \
+        vxi3 = _mm_unpacklo_epi8(vi3, vzero);                                                  \
+        vxi4 = _mm_unpacklo_epi8(vi4, vzero);                                                  \
+        vxi5 = _mm_unpacklo_epi8(vi5, vzero);                                                  \
+        vxi6 = _mm_unpacklo_epi8(vi6, vzero);                                                  \
+    }                                                                                          \
+    const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);                                          \
+    const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);                                          \
+    const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);                                          \
+    const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);                                       \
+    const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);                                    \
+    const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);                                     \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); \
+    } else {                                                                                   \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));                     \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));                     \
+    }
 
 #endif
 
-    uint8_t tail[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+    T8Bits tail[8] = {0, 0, 0, 0, 0, 0, 0, 0};
     bool finish_one_pass = false;
     const __m128i vbias = _mm_set1_epi32(Bias);
     const __m128i vzero = _mm_setzero_si128();
     size_t step_next_group = PixelsPerIteration * Stride - (Channels & ~size_t{7});
 
-    const uint8_t* i0 = Input;
-    const uint8_t* i1 = i0 + Stride;
-    const uint8_t* i2 = i1 + Stride;
-    const uint8_t* i3 = i2 + Stride;
+    const T8Bits* i0 = Input;
+    const T8Bits* i1 = i0 + Stride;
+    const T8Bits* i2 = i1 + Stride;
+    const T8Bits* i3 = i2 + Stride;
 #if !defined(MLAS_TARGET_IX86)
-    const uint8_t* i4 = i0 + Stride * 4;
-    const uint8_t* i5 = i4 + Stride;
-    const uint8_t* i6 = i5 + Stride;
+    const T8Bits* i4 = i0 + Stride * 4;
+    const T8Bits* i5 = i4 + Stride;
+    const T8Bits* i6 = i5 + Stride;
 #endif
 
     for (; ImageSize > PixelsPerIteration; ImageSize -= PixelsPerIteration) {
+
         int32_t* acc = AccumulateBuffer;
         size_t c = Channels;
         for (; c >= 8; c -= 8) {
+
             LOAD_FULL_CHANNELS();
 
             CALCULATE_ACCUMULATE_VECTORS();
@@ -442,14 +585,21 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
             acc += 8;
         }
         if (c > 0) {
-            const __m128i vi0 = _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0));
-            const __m128i vi1 = _mm_loadl_epi64((const __m128i*)(i1 >= LastOf8 ? memcpy(tail, i1, c) : i1));
-            const __m128i vi2 = _mm_loadl_epi64((const __m128i*)(i2 >= LastOf8 ? memcpy(tail, i2, c) : i2));
-            const __m128i vi3 = _mm_loadl_epi64((const __m128i*)(i3 >= LastOf8 ? memcpy(tail, i3, c) : i3));
+            const __m128i vi0 =
+                _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0));
+            const __m128i vi1 =
+                _mm_loadl_epi64((const __m128i*)(i1 >= LastOf8 ? memcpy(tail, i1, c) : i1));
+            const __m128i vi2 =
+                _mm_loadl_epi64((const __m128i*)(i2 >= LastOf8 ? memcpy(tail, i2, c) : i2));
+            const __m128i vi3 =
+                _mm_loadl_epi64((const __m128i*)(i3 >= LastOf8 ? memcpy(tail, i3, c) : i3));
 #if !defined(MLAS_TARGET_IX86)
-            const __m128i vi4 = _mm_loadl_epi64((const __m128i*)(i4 >= LastOf8 ? memcpy(tail, i4, c) : i4));
-            const __m128i vi5 = _mm_loadl_epi64((const __m128i*)(i5 >= LastOf8 ? memcpy(tail, i5, c) : i5));
-            const __m128i vi6 = _mm_loadl_epi64((const __m128i*)(i6 >= LastOf8 ? memcpy(tail, i6, c) : i6));
+            const __m128i vi4 =
+                _mm_loadl_epi64((const __m128i*)(i4 >= LastOf8 ? memcpy(tail, i4, c) : i4));
+            const __m128i vi5 =
+                _mm_loadl_epi64((const __m128i*)(i5 >= LastOf8 ? memcpy(tail, i5, c) : i5));
+            const __m128i vi6 =
+                _mm_loadl_epi64((const __m128i*)(i6 >= LastOf8 ? memcpy(tail, i6, c) : i6));
 #endif
 
             CALCULATE_ACCUMULATE_VECTORS();
@@ -473,26 +623,38 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
     if (ImageSize > 0) {
 #if defined(MLAS_TARGET_IX86)
         switch (ImageSize) {
-        case 1: i1 = ZeroBuffer; /* fall through */
-        case 2: i2 = ZeroBuffer; /* fall through */
-        case 3: i3 = ZeroBuffer; /* fall through */
-        default: break;
+            case 1:
+                i1 = ZeroBuffer; /* fall through */
+            case 2:
+                i2 = ZeroBuffer; /* fall through */
+            case 3:
+                i3 = ZeroBuffer; /* fall through */
+            default:
+                break;
         }
 #else
         switch (ImageSize) {
-        case 1: i1 = ZeroBuffer; /* fall through */
-        case 2: i2 = ZeroBuffer; /* fall through */
-        case 3: i3 = ZeroBuffer; /* fall through */
-        case 4: i4 = ZeroBuffer; /* fall through */
-        case 5: i5 = ZeroBuffer; /* fall through */
-        case 6: i6 = ZeroBuffer; /* fall through */
-        default: break;
+            case 1:
+                i1 = ZeroBuffer; /* fall through */
+            case 2:
+                i2 = ZeroBuffer; /* fall through */
+            case 3:
+                i3 = ZeroBuffer; /* fall through */
+            case 4:
+                i4 = ZeroBuffer; /* fall through */
+            case 5:
+                i5 = ZeroBuffer; /* fall through */
+            case 6:
+                i6 = ZeroBuffer; /* fall through */
+            default:
+                break;
         }
 #endif
 
         int32_t* acc = AccumulateBuffer;
         size_t c = Channels;
         for (; c >= 8; c -= 8) {
+
             LOAD_FULL_CHANNELS();
 
             CALCULATE_ACCUMULATE_VECTORS();
@@ -503,14 +665,21 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
         }
 
         if (c > 0) {
-            const __m128i vi0 = _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0));
-            const __m128i vi1 = _mm_loadl_epi64((const __m128i*)(1 < ImageSize && i1 >= LastOf8 ? memcpy(tail, i1, c) : i1));
-            const __m128i vi2 = _mm_loadl_epi64((const __m128i*)(2 < ImageSize && i2 >= LastOf8 ? memcpy(tail, i2, c) : i2));
-            const __m128i vi3 = _mm_loadl_epi64((const __m128i*)(3 < ImageSize && i3 >= LastOf8 ? memcpy(tail, i3, c) : i3));
+            const __m128i vi0 =
+                _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0));
+            const __m128i vi1 = _mm_loadl_epi64(
+                (const __m128i*)(1 < ImageSize && i1 >= LastOf8 ? memcpy(tail, i1, c) : i1));
+            const __m128i vi2 = _mm_loadl_epi64(
+                (const __m128i*)(2 < ImageSize && i2 >= LastOf8 ? memcpy(tail, i2, c) : i2));
+            const __m128i vi3 = _mm_loadl_epi64(
+                (const __m128i*)(3 < ImageSize && i3 >= LastOf8 ? memcpy(tail, i3, c) : i3));
 #if !defined(MLAS_TARGET_IX86)
-            const __m128i vi4 = _mm_loadl_epi64((const __m128i*)(4 < ImageSize && i4 >= LastOf8 ? memcpy(tail, i4, c) : i4));
-            const __m128i vi5 = _mm_loadl_epi64((const __m128i*)(5 < ImageSize && i5 >= LastOf8 ? memcpy(tail, i5, c) : i5));
-            const __m128i vi6 = _mm_loadl_epi64((const __m128i*)(6 < ImageSize && i6 >= LastOf8 ? memcpy(tail, i6, c) : i6));
+            const __m128i vi4 = _mm_loadl_epi64(
+                (const __m128i*)(4 < ImageSize && i4 >= LastOf8 ? memcpy(tail, i4, c) : i4));
+            const __m128i vi5 = _mm_loadl_epi64(
+                (const __m128i*)(5 < ImageSize && i5 >= LastOf8 ? memcpy(tail, i5, c) : i5));
+            const __m128i vi6 = _mm_loadl_epi64(
+                (const __m128i*)(6 < ImageSize && i6 >= LastOf8 ? memcpy(tail, i6, c) : i6));
 #endif
 
             CALCULATE_ACCUMULATE_VECTORS();
@@ -527,13 +696,14 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
 
 // Pure C++ Implementation
 
+template <typename T8Bits>
 void
 MLASCALL
 MlasQLinearGlobalAveragePoolNchw(
-    const uint8_t* Input,
+    const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
-    uint8_t* Output,
+    T8Bits* Output,
     float ScaleOutput,
     int32_t ZeroPointOutput,
     size_t Channels,
@@ -544,22 +714,26 @@ MlasQLinearGlobalAveragePoolNchw(
     float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
     int32_t bias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
     for (; Channels > 0; Channels--) {
+
         int32_t acc = bias;
         for (size_t i = 0; i < ImageSize; ++i) {
-            acc += static_cast<int>(*Input++);
+            acc += static_cast<int32_t>(*Input++);
         }
-        int32_t v = static_cast<int>(std::nearbyintf(acc * scale)) + ZeroPointOutput;
-        *Output++ = std::max(std::min(255, v), 0);
+        int32_t v = static_cast<int32_t>(std::nearbyintf(acc * scale)) + ZeroPointOutput;
+        v = std::min(static_cast<int32_t>(std::numeric_limits<T8Bits>::max()), v);
+        v = std::max(static_cast<int32_t>(std::numeric_limits<T8Bits>::lowest()), v);
+        *Output++ = static_cast<T8Bits>(v);
     }
 }
 
+template <typename T8Bits>
 void
 MLASCALL
 MlasQLinearGlobalAveragePoolNhwc(
-    const uint8_t* Input,
+    const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
-    uint8_t* Output,
+    T8Bits* Output,
     float ScaleOutput,
     int32_t ZeroPointOutput,
     size_t Batch,
@@ -567,26 +741,33 @@ MlasQLinearGlobalAveragePoolNhwc(
     size_t Stride,
     size_t Channels,
     int32_t* AccumulateBuffer,
-    const uint8_t* /* ZeroBuffer */
+    const T8Bits* ZeroBuffer
     )
 {
     float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
     int32_t bias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
     for (; Batch > 0; Batch--) {
-        const uint8_t* batch_input = Input;
-        uint8_t* batch_output = Output;
+
+        const T8Bits* batch_input = Input;
+        T8Bits* batch_output = Output;
         Input += Stride * ImageSize;
         Output += Stride;
         std::fill_n(AccumulateBuffer, Channels, bias);
         for (size_t i = 0; i < ImageSize; ++i) {
+
             for (size_t c = 0; c < Channels; ++c) {
                 AccumulateBuffer[c] += static_cast<int>(batch_input[c]);
             }
+
             batch_input += Stride;
         }
+
         for (size_t c = 0; c < Channels; ++c) {
-            int32_t v = static_cast<int>(std::nearbyintf(AccumulateBuffer[c] * scale)) + ZeroPointOutput;
-            *batch_output++ = std::max(std::min(255, v), 0);
+
+            int32_t v = static_cast<int32_t>(std::nearbyintf(AccumulateBuffer[c] * scale)) + ZeroPointOutput;
+            v = std::min(static_cast<int32_t>(std::numeric_limits<T8Bits>::max()), v);
+            v = std::max(static_cast<int32_t>(std::numeric_limits<T8Bits>::lowest()), v);
+            *batch_output++ = static_cast<T8Bits>(v);
         }
     }
 }
@@ -595,9 +776,91 @@ MlasQLinearGlobalAveragePoolNhwc(
 
 #if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS)
 
+template <typename T8Bits>
 void
 MLASCALL
 MlasQLinearGlobalAveragePoolNhwc(
+    const T8Bits* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,
+    size_t ImageSize,
+    size_t Stride,
+    size_t Channels,
+    int32_t* AccumulateBuffer,
+    const T8Bits* ZeroBuffer
+    )
+{
+    float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
+    const int32_t bias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
+    const T8Bits* inputLastOf8 = Input + (Batch * ImageSize * Stride - Stride + Channels) - 8;
+
+    for (; Batch > 0; Batch--) {
+        MlasQLinearGlobalAveragePoolNhwcSingleBatch(
+            Input, Output, inputLastOf8, ImageSize, Channels, Stride, bias, scale,
+            static_cast<T8Bits>(ZeroPointOutput), AccumulateBuffer, ZeroBuffer);
+        Input += ImageSize * Stride;
+        Output += Stride;
+    }
+}
+
+#endif
+
+template
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNchw<int8_t>(
+    const int8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    int8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* AccumulateBuffer
+    );
+
+template
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNchw<uint8_t>(
+    const uint8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    uint8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* AccumulateBuffer
+    );
+
+template
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNhwc<int8_t>(
+    const int8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    int8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,
+    size_t ImageSize,
+    size_t Stride,
+    size_t Channels,
+    int32_t* AccumulateBuffer,
+    const int8_t* ZeroBuffer
+    );
+
+template
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNhwc<uint8_t>(
     const uint8_t* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
@@ -610,20 +873,4 @@ MlasQLinearGlobalAveragePoolNhwc(
     size_t Channels,
     int32_t* AccumulateBuffer,
     const uint8_t* ZeroBuffer
-    )
-{
-    float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
-    const int32_t bias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
-    const uint8_t* inputLastOf8 = Input + (Batch * ImageSize * Stride - Stride + Channels) - 8;
-
-    for (; Batch > 0; Batch--) {
-        MlasQLinearGlobalAveragePoolNhwcSingleBatch(
-            Input, Output, inputLastOf8, ImageSize, Channels, Stride,
-            bias, scale, static_cast<uint8_t>(ZeroPointOutput),
-            AccumulateBuffer, ZeroBuffer);
-        Input += ImageSize * Stride;
-        Output += Stride;
-    }
-}
-
-#endif
+    );
diff --git a/onnxruntime/core/mlas/lib/quantize.cpp b/onnxruntime/core/mlas/lib/quantize.cpp
index 01a5529fb6..632800e8fd 100644
--- a/onnxruntime/core/mlas/lib/quantize.cpp
+++ b/onnxruntime/core/mlas/lib/quantize.cpp
@@ -165,7 +165,7 @@ Return Value:
 
 --*/
 {
-    constexpr int32_t MinimumValue = std::numeric_limits<OutputType>::min();
+    constexpr int32_t MinimumValue = std::numeric_limits<OutputType>::lowest();
     constexpr int32_t MaximumValue = std::numeric_limits<OutputType>::max();
 
     auto ScaleVector = MlasBroadcastFloat32x4(Scale);
@@ -315,7 +315,7 @@ Return Value:
 
 --*/
 {
-    constexpr int32_t MinimumValue = std::numeric_limits<OutputType>::min();
+    constexpr int32_t MinimumValue = std::numeric_limits<OutputType>::lowest();
     constexpr int32_t MaximumValue = std::numeric_limits<OutputType>::max();
 
     for (size_t n = 0; n < N; n++) {
@@ -352,17 +352,18 @@ MlasQuantizeLinear<uint8_t>(
 
 #if defined(MLAS_SSE2_INTRINSICS)
 
+template <typename OutputType>
 void
 MLASCALL
 MlasRequantizeOutput(
     const int32_t* Input,
     size_t InputLeadingDimension,
-    uint8_t* Output,
+    OutputType* Output,
     size_t OutputLeadingDimension,
     const int32_t* Bias,
     const float* Scale,
     bool PerColumnScale,
-    uint8_t ZeroPoint,
+    OutputType ZeroPoint,
     size_t StartM,
     size_t StartN,
     size_t CountM,
@@ -370,8 +371,8 @@ MlasRequantizeOutput(
     )
 {
     const __m128 PerMatrixScaleVector = PerColumnScale ? _mm_setzero_ps() : _mm_load1_ps(Scale);
-    const __m128 MinimumValueVector = _mm_set1_ps(float(0 - ZeroPoint));
-    const __m128 MaximumValueVector = _mm_set1_ps(float(255 - ZeroPoint));
+    const __m128 MinimumValueVector = _mm_set1_ps(float(std::numeric_limits<OutputType>::lowest() - ZeroPoint));
+    const __m128 MaximumValueVector = _mm_set1_ps(float(std::numeric_limits<OutputType>::max() - ZeroPoint));
     const __m128i ZeroPointVector = _mm_set1_epi32(ZeroPoint);
 
     if (nullptr != Bias) {
@@ -467,10 +468,23 @@ MlasRequantizeOutput(
             IntegerVector2 = _mm_add_epi32(IntegerVector2, ZeroPointVector);
             IntegerVector3 = _mm_add_epi32(IntegerVector3, ZeroPointVector);
 
-            __m128i WordVector0 = _mm_packus_epi16(IntegerVector0, IntegerVector1);
-            __m128i WordVector1 = _mm_packus_epi16(IntegerVector2, IntegerVector3);
+            __m128i WordVector0;
+            __m128i WordVector1;
+            __m128i ByteVector;
 
-            __m128i ByteVector = _mm_packus_epi16(WordVector0, WordVector1);
+            if (std::is_signed<OutputType>::value) {
+
+                WordVector0 = _mm_packs_epi32(IntegerVector0, IntegerVector1);
+                WordVector1 = _mm_packs_epi32(IntegerVector2, IntegerVector3);
+                ByteVector = _mm_packs_epi16(WordVector0, WordVector1);
+
+            } else {
+
+                WordVector0 = _mm_packus_epi16(IntegerVector0, IntegerVector1);
+                WordVector1 = _mm_packus_epi16(IntegerVector2, IntegerVector3);
+                ByteVector = _mm_packus_epi16(WordVector0, WordVector1);
+
+            }
 
             _mm_storeu_si128((__m128i*)RowOutput, ByteVector);
             RowOutput += 16;
@@ -541,8 +555,17 @@ MlasRequantizeOutput(
             IntegerVector = _mm_cvtps_epi32(FloatVector);
             IntegerVector = _mm_add_epi32(IntegerVector, ZeroPointVector);
 
-            IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
-            IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
+            if (std::is_signed<OutputType>::value) {
+
+                IntegerVector = _mm_packs_epi32(IntegerVector, IntegerVector);
+                IntegerVector = _mm_packs_epi16(IntegerVector, IntegerVector);
+
+            } else {
+
+                IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
+                IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
+
+            }
 
             uint32_t OutputValue = uint32_t(_mm_cvtsi128_si32(IntegerVector));
 
@@ -570,17 +593,18 @@ MlasRequantizeOutput(
 
 #elif defined(MLAS_NEON64_INTRINSICS)
 
+template<typename OutputType>
 void
 MLASCALL
 MlasRequantizeOutput(
     const int32_t* Input,
     size_t InputLeadingDimension,
-    uint8_t* Output,
+    OutputType* Output,
     size_t OutputLeadingDimension,
     const int32_t* Bias,
     const float* Scale,
     bool PerColumnScale,
-    uint8_t ZeroPoint,
+    OutputType ZeroPoint,
     size_t StartM,
     size_t StartN,
     size_t CountM,
@@ -686,7 +710,7 @@ MlasRequantizeOutput(
 
             //
             // Pack the integers with saturation to 16-bit values and shift by
-            // the zero point, then pack the integers again to unsigned bytes.
+            // the zero point, then pack the integers again to bytes.
             //
 
             int16x8x2_t WordVector;
@@ -697,7 +721,13 @@ MlasRequantizeOutput(
             WordVector.val[0] = vqaddq_s16(WordVector.val[0], ZeroPointVector);
             WordVector.val[1] = vqaddq_s16(WordVector.val[1], ZeroPointVector);
 
-            vst1q_u8(RowOutput, vqmovun_high_s16(vqmovun_s16(WordVector.val[0]), WordVector.val[1]));
+            if (std::is_signed<OutputType>::value) {
+                vst1q_s8(reinterpret_cast<int8_t*>(RowOutput),
+                         vqmovn_high_s16(vqmovn_s16(WordVector.val[0]), WordVector.val[1]));
+            } else {
+                vst1q_u8(reinterpret_cast<uint8_t*>(RowOutput),
+                         vqmovun_high_s16(vqmovun_s16(WordVector.val[0]), WordVector.val[1]));
+            }
             RowOutput += 16;
 
             n -= 16;
@@ -775,7 +805,13 @@ MlasRequantizeOutput(
             int16x8_t WordVector = vcombine_s16(vqmovn_s32(IntegerVector), vdup_n_s16(0));
             WordVector = vqaddq_s16(WordVector, ZeroPointVector);
 
-            uint8x16_t ByteVector = vcombine_u8(vqmovun_s16(WordVector), vdup_n_u8(0));
+            uint8x16_t ByteVector;
+
+            if (std::is_signed<OutputType>::value) {
+                ByteVector = vcombine_u8(vreinterpret_u8_s8(vqmovn_s16(WordVector)), vdup_n_u8(0));
+            } else {
+                ByteVector = vcombine_u8(vqmovun_s16(WordVector), vdup_n_u8(0));
+            }
 
             if (n >= 4) {
 
@@ -787,7 +823,7 @@ MlasRequantizeOutput(
 
             } else {
 
-                vst1q_lane_u8(RowOutput, ByteVector, 0);
+                vst1q_lane_u8(reinterpret_cast<uint8_t*>(RowOutput), ByteVector, 0);
                 RowOutput += 1;
 
                 n -= 1;
@@ -802,17 +838,18 @@ MlasRequantizeOutput(
 
 #else
 
+template <typename OutputType>
 void
 MLASCALL
 MlasRequantizeOutput(
     const int32_t* Input,
     size_t InputLeadingDimension,
-    uint8_t* Output,
+    OutputType* Output,
     size_t OutputLeadingDimension,
     const int32_t* Bias,
     const float* Scale,
     bool PerColumnScale,
-    uint8_t ZeroPoint,
+    OutputType ZeroPoint,
     size_t StartM,
     size_t StartN,
     size_t CountM,
@@ -820,8 +857,8 @@ MlasRequantizeOutput(
     )
 {
     const float PerMatrixScaleValue = PerColumnScale ? 0.0f : *Scale;
-    const float MinimumValue = float(0 - ZeroPoint);
-    const float MaximumValue = float(255 - ZeroPoint);
+    const float MinimumValue = float(std::numeric_limits<OutputType>::lowest() - ZeroPoint);
+    const float MaximumValue = float(std::numeric_limits<OutputType>::max() - ZeroPoint);
 
     if (nullptr != Bias) {
         Bias += StartN;
@@ -872,7 +909,7 @@ MlasRequantizeOutput(
             IntegerValue = int32_t(MlasBitsOfFp32(FloatValue + MLAS_ROUNDING_BIAS_MAGIC)) -
                 MLAS_ROUNDING_BIAS_MAGIC_BITS;
 
-            *RowOutput++ = uint8_t(IntegerValue + ZeroPoint);
+            *RowOutput++ = OutputType(IntegerValue + ZeroPoint);
 
             n -= 1;
         }
@@ -885,6 +922,42 @@ MlasRequantizeOutput(
 
 #endif
 
+template  // explicit instantiation of the MlasRequantizeOutput template for int8_t (signed) output
+void
+MLASCALL
+MlasRequantizeOutput<int8_t>(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    int8_t* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    int8_t ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    );
+
+template  // explicit instantiation of the MlasRequantizeOutput template for uint8_t (unsigned) output
+void
+MLASCALL
+MlasRequantizeOutput<uint8_t>(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    uint8_t* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    uint8_t ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    );
+
 void
 MLASCALL
 MlasFindMinMaxElement(
diff --git a/onnxruntime/core/mlas/lib/transpose.cpp b/onnxruntime/core/mlas/lib/transpose.cpp
index 3f12773050..37181ec2f3 100644
--- a/onnxruntime/core/mlas/lib/transpose.cpp
+++ b/onnxruntime/core/mlas/lib/transpose.cpp
@@ -436,3 +436,18 @@ Return Value:
         n -= 1;
     }
 }
+
+void
+MLASCALL
+MlasTranspose(
+    const int8_t* Input,
+    int8_t* Output,
+    size_t M,
+    size_t N)
+{
+    MlasTranspose(  // int8_t overload: forwards to the uint8_t overload with both buffers reinterpreted as bytes
+        reinterpret_cast<const uint8_t*>(Input),
+        reinterpret_cast<uint8_t*>(Output),
+        M,
+        N);
+}
\ No newline at end of file
diff --git a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
index 419394e9cf..eacc0e3fd3 100644
--- a/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
+++ b/onnxruntime/core/optimizer/transpose_optimizer/transpose_optimizer.cc
@@ -942,35 +942,35 @@ void PermuteInput(api::GraphRef& graph, api::NodeRef& node, size_t i, const std:
   node.SetInput(i, gather_output);
 }
 
-static bool HandleResize(HandlerArgs& args) {
-  auto inputs = args.node.Inputs();
-  int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());
+//static bool HandleResize(HandlerArgs& args) {
+//  auto inputs = args.node.Inputs();
+//  int64_t rank_int = gsl::narrow_cast<int64_t>(args.perm.size());
+//
+//  if (args.ctx.opset < 11) {
+//    PermuteInput(args.ctx.graph, args.node, 1, args.perm_inv);
+//  } else {
+//    if (inputs[1] != "") {
+//      std::vector<int64_t> double_perm_inv = args.perm_inv;
+//      double_perm_inv.reserve(2 * args.perm_inv.size());
+//      for (int64_t p : args.perm_inv) {
+//        double_perm_inv.push_back(p + rank_int);
+//      }
+//      PermuteInput(args.ctx.graph, args.node, 1, double_perm_inv);
+//    }
+//    for (size_t i = 2; i < inputs.size(); ++i) {
+//      if (inputs[i] != "") {
+//        PermuteInput(args.ctx.graph, args.node, i, args.perm_inv);
+//      }
+//    }
+//  }
+//
+//  TransposeFirstInput(args.ctx, args.node, args.perm_inv);
+//  TransposeOutputs(args.ctx, args.node, args.perm);
+//
+//  return true;
+//}
 
-  if (args.ctx.opset < 11) {
-    PermuteInput(args.ctx.graph, args.node, 1, args.perm_inv);
-  } else {
-    if (inputs[1] != "") {
-      std::vector<int64_t> double_perm_inv = args.perm_inv;
-      double_perm_inv.reserve(2 * args.perm_inv.size());
-      for (int64_t p : args.perm_inv) {
-        double_perm_inv.push_back(p + rank_int);
-      }
-      PermuteInput(args.ctx.graph, args.node, 1, double_perm_inv);
-    }
-    for (size_t i = 2; i < inputs.size(); ++i) {
-      if (inputs[i] != "") {
-        PermuteInput(args.ctx.graph, args.node, i, args.perm_inv);
-      }
-    }
-  }
-
-  TransposeFirstInput(args.ctx, args.node, args.perm_inv);
-  TransposeOutputs(args.ctx, args.node, args.perm);
-
-  return true;
-}
-
-constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
+// constexpr HandlerInfo resize_handler = {&FirstInput, &HandleResize};
 
 static bool HandlePad(HandlerArgs& args) {
   size_t rank = args.perm.size();
@@ -1563,7 +1563,9 @@ static const std::unordered_map<std::string_view, const HandlerInfo&> handler_ma
   {"Split", split_handler},
   {"Shape", shape_handler},
   {"Pad", pad_handler},
-  {"Resize", resize_handler},
+  // TODO: re-enable resize handler after adding NHWC support in upsample op on cpu
+  // https://github.com/microsoft/onnxruntime/issues/9857
+  //{"Resize", resize_handler},
   {"ReduceSum", reduce_sum_handler},
 
   {"ReduceLogSum", reduce_op_handler}, {"ReduceLogSumExp", reduce_op_handler}, {"ReduceMax", reduce_op_handler},
diff --git a/onnxruntime/core/providers/common.h b/onnxruntime/core/providers/common.h
index 39321f563a..0b9179c0b4 100644
--- a/onnxruntime/core/providers/common.h
+++ b/onnxruntime/core/providers/common.h
@@ -122,7 +122,7 @@ inline int64_t ComputeOutputShape(const int64_t in_dim,
                                   const int64_t stride, const int64_t kernel, const int64_t dilation,
                                   const int64_t pad_head, const int64_t pad_tail) {
   const int64_t dkernel = dilation * (kernel - 1) + 1;
-  return static_cast<int64_t>(static_cast<float>(in_dim + pad_head + pad_tail - dkernel) / stride + 1);
+  return static_cast<int64_t>(static_cast<double>(in_dim + pad_head + pad_tail - dkernel) / stride + 1);
 }
 
 inline Status ComputePadAndOutputShape(const int64_t in_dim,
diff --git a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.h b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.h
index 955ff4a920..1ab64b3e8d 100644
--- a/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.h
+++ b/onnxruntime/core/providers/cpu/math/einsum_utils/einsum_compute_preprocessor.h
@@ -30,11 +30,11 @@ constexpr size_t num_of_letters = 52;
  */
 inline int64_t LetterToIndex(char ch) {
   if (ch >= 'a' && ch <= 'z') {
-    return static_cast<int64_t>(ch - 'a');
+    return static_cast<int64_t>(ch) - 'a';
   }
 
   if (ch >= 'A' && ch <= 'Z') {
-    return 26 + static_cast<int64_t>(ch - 'A');
+    return 26 + static_cast<int64_t>(ch) - 'A';
   }
 
   // invalid character - return error value
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index d2e4fc68dc..cd5e7ad6b9 100755
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -716,6 +716,10 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDom
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, Loop);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, DepthToSpace);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 12, SpaceToDepth);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomNormal);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomNormalLike);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomUniform);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomUniformLike);
 
 // opset 10
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool);
@@ -1561,6 +1565,10 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, Loop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, DepthToSpace)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 12, SpaceToDepth)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomNormal)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomNormalLike)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomUniform)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, RandomUniformLike)>,
 
     // opset 10
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>,
diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.cc b/onnxruntime/core/providers/cuda/cuda_profiler.cc
index de9cbcc09f..adf771fb23 100644
--- a/onnxruntime/core/providers/cuda/cuda_profiler.cc
+++ b/onnxruntime/core/providers/cuda/cuda_profiler.cc
@@ -1,6 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-#if !(defined(USE_ROCM) || defined(ENABLE_TRAINING))
+#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
 
 #include "cuda_profiler.h"
 #include <map>
diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h
index 2ae6715009..bd625a7c6a 100644
--- a/onnxruntime/core/providers/cuda/cuda_profiler.h
+++ b/onnxruntime/core/providers/cuda/cuda_profiler.h
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 #include "core/common/profiler_common.h"
 
-#if !(defined(USE_ROCM) || defined(ENABLE_TRAINING))
+#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
 
 #include "core/platform/ort_mutex.h"
 #include <cupti.h>
diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
index c390bc8954..45e10fa14c 100644
--- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
+++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
@@ -130,9 +130,26 @@ struct ProviderInfo_CUDA_Impl : ProviderInfo_CUDA {
   }
 
   // Used by slice_concatenate_test.cc and onnxruntime_pybind_state.cc
-  void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) override { CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); }
+
+  void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) override {
+    // Copies `count` bytes from host `src` to device `dst`; cudaMemcpy() operates on the default stream.
+    CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice));
+
+    // To ensure that the copy has completed, invoke a stream sync for the default stream.
+    // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html#api-sync-behavior__memcpy-sync
+    // For transfers from pageable host memory to device memory, a stream sync is performed before the copy is initiated.
+    // The function will return once the pageable buffer has been copied to the staging memory for DMA transfer
+    // to device memory, but the DMA to final destination may not have completed.
+
+    CUDA_CALL_THROW(cudaStreamSynchronize(0));  // stream 0 is the default stream
+  }
+
   // Used by onnxruntime_pybind_state.cc
-  void cudaMemcpy_DeviceToHost(void* dst, const void* src, size_t count) override { CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); }
+  void cudaMemcpy_DeviceToHost(void* dst, const void* src, size_t count) override {
+    // https://docs.nvidia.com/cuda/cuda-runtime-api/api-sync-behavior.html#api-sync-behavior__memcpy-sync
+    // For transfers from device to either pageable or pinned host memory, the function returns only once the copy has completed.
+    CUDA_CALL_THROW(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost));  // hence no extra stream sync is issued here
+  }
 
   int cudaGetDeviceCount() override {
     int num_devices = 0;
diff --git a/onnxruntime/core/providers/cuda/generator/random.cc b/onnxruntime/core/providers/cuda/generator/random.cc
new file mode 100644
index 0000000000..643da1579d
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/generator/random.cc
@@ -0,0 +1,78 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/cuda/generator/random.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+using namespace ONNX_NAMESPACE;
+
+ONNX_OPERATOR_KERNEL_EX(RandomNormal, kOnnxDomain, 1, kCudaExecutionProvider,  // CUDA registrations of the Random* ops
+                        (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()),
+                        RandomNormal);
+
+ONNX_OPERATOR_KERNEL_EX(RandomNormalLike, kOnnxDomain, 1, kCudaExecutionProvider,
+                        (*KernelDefBuilder::Create())
+                            .TypeConstraint("T1", DataTypeImpl::AllTensorTypes())  // presumably the input type - confirm
+                            .TypeConstraint("T2", DataTypeImpl::AllIEEEFloatTensorTypes()),  // presumably the output type
+                        RandomNormalLike);
+
+ONNX_OPERATOR_KERNEL_EX(RandomUniform, kOnnxDomain, 1, kCudaExecutionProvider,
+                        (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::AllIEEEFloatTensorTypes()),
+                        RandomUniform);
+
+ONNX_OPERATOR_KERNEL_EX(RandomUniformLike, kOnnxDomain, 1, kCudaExecutionProvider,
+                        (*KernelDefBuilder::Create())
+                            .TypeConstraint("T1", DataTypeImpl::AllTensorTypes())  // presumably the input type - confirm
+                            .TypeConstraint("T2", DataTypeImpl::AllIEEEFloatTensorTypes()),  // presumably the output type
+                        RandomUniformLike);
+
+Status RandomNormalBase::Compute(OpKernelContext* p_ctx, const TensorShape& shape, int dtype) const {
+  // Fills output 0 (allocated with `shape`) with normal samples scaled by scale_ and shifted by mean_.
+  Tensor* output = p_ctx->Output(0, shape);
+  PhiloxGenerator& rng = (generator_ != nullptr) ? *generator_ : PhiloxGenerator::Default();  // own generator if seeded
+  utils::MLTypeCallDispatcher<float, MLFloat16, double> dispatcher(dtype);
+  dispatcher.Invoke<RandomNormalComputeImpl>(GetDeviceProp(), Stream(), shape.Size(), scale_, mean_, rng, *output);
+  return Status::OK();
+}
+
+Status RandomNormal::ComputeInternal(OpKernelContext* p_ctx) const { return Compute(p_ctx, shape_, dtype_); }  // attrs only
+
+Status RandomNormalLike::ComputeInternal(OpKernelContext* p_ctx) const {  // output takes the shape of input 0
+  const Tensor* input = p_ctx->Input<Tensor>(0);
+  if (input == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "X Input is not available.");
+  if (dtype_ != TensorProto_DataType_UNDEFINED) return Compute(p_ctx, input->Shape(), dtype_);  // explicit dtype wins
+  if (!(input->IsDataType<float>() || input->IsDataType<double>() || input->IsDataType<MLFloat16>())) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Output data type is required to be one of float types, but got incompatible data type ",
+                           input->DataType(), " from input tensor.");
+  }
+  return Compute(p_ctx, input->Shape(), input->GetElementType());  // no dtype attribute: inherit the input's type
+}
+
+Status RandomUniformBase::Compute(OpKernelContext* p_ctx, const TensorShape& shape, int dtype) const {
+  // Fills output 0 (allocated with `shape`) with uniform samples scaled by range_ and shifted by from_.
+  Tensor* output = p_ctx->Output(0, shape);
+  PhiloxGenerator& rng = (generator_ != nullptr) ? *generator_ : PhiloxGenerator::Default();  // own generator if seeded
+  utils::MLTypeCallDispatcher<float, MLFloat16, double> dispatcher(dtype);
+  dispatcher.Invoke<RandomUniformComputeImpl>(GetDeviceProp(), Stream(), shape.Size(), range_, from_, rng, *output);
+  return Status::OK();
+}
+
+Status RandomUniform::ComputeInternal(OpKernelContext* p_ctx) const { return Compute(p_ctx, shape_, dtype_); }  // attrs only
+
+Status RandomUniformLike::ComputeInternal(OpKernelContext* p_ctx) const {  // output takes the shape of input 0
+  const Tensor* input = p_ctx->Input<Tensor>(0);
+  if (input == nullptr) return Status(common::ONNXRUNTIME, common::FAIL, "X Input is not available.");
+  if (dtype_ != TensorProto_DataType_UNDEFINED) return Compute(p_ctx, input->Shape(), dtype_);  // explicit dtype wins
+  if (!(input->IsDataType<float>() || input->IsDataType<double>() || input->IsDataType<MLFloat16>())) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                           "Output data type is required to be one of float types, but got incompatible data type ",
+                           input->DataType(), " from input tensor.");
+  }
+  return Compute(p_ctx, input->Shape(), input->GetElementType());  // no dtype attribute: inherit the input's type
+}
+
+}  // namespace cuda
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/cuda/generator/random.h b/onnxruntime/core/providers/cuda/generator/random.h
new file mode 100644
index 0000000000..f1b58b1636
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/generator/random.h
@@ -0,0 +1,130 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/cuda/cuda_kernel.h"
+
+#include "core/providers/cuda/generator/random_impl.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+#define RANDOM_COMPUTE_IMPL(name)                                                                        \
+  template <typename T>                                                                                  \
+  struct name##ComputeImpl {                                                                             \
+    void operator()(const cudaDeviceProp& prop, cudaStream_t stream, const int64_t N, const float alpha, \
+                    const float beta, PhiloxGenerator& generator, Tensor& Y) const {                     \
+      typedef typename ToCudaType<T>::MappedType CudaT;                                                  \
+      CudaT* Y_data = reinterpret_cast<CudaT*>(Y.template MutableData<T>());                             \
+      name##KernelImpl<CudaT>(prop, stream, N, alpha, beta, generator, Y_data);                          \
+    }                                                                                                    \
+  };
+
+RANDOM_COMPUTE_IMPL(RandomNormal)   // defines RandomNormalComputeImpl<T>, forwarding to RandomNormalKernelImpl<CudaT>
+RANDOM_COMPUTE_IMPL(RandomUniform)  // defines RandomUniformComputeImpl<T>, forwarding to RandomUniformKernelImpl<CudaT>
+
+#undef RANDOM_COMPUTE_IMPL
+
+class RandomBase : public CudaKernel {  // common "seed"/"dtype" attribute handling for the CUDA Random* kernels
+ protected:
+  RandomBase(const OpKernelInfo& info) : CudaKernel(info) {
+    float seed = 0.f;
+    if (info.GetAttr<float>("seed", &seed).IsOK()) {  // a private generator is created only when "seed" is present
+      generator_ = std::make_unique<PhiloxGenerator>(static_cast<uint64_t>(seed));  // NOTE(review): seed < 0 is UB here
+    }
+
+    int64_t dtype;
+    if (info.GetAttr<int64_t>("dtype", &dtype).IsOK()) {  // without the attribute dtype_ stays UNDEFINED
+      dtype_ = static_cast<ONNX_NAMESPACE::TensorProto::DataType>(dtype);
+      ORT_ENFORCE(ONNX_NAMESPACE::TensorProto::DataType_IsValid(dtype_) &&
+                      dtype_ != ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED,
+                  "Invalid dtype of ", dtype_);
+    }
+  }
+
+ protected:
+  std::unique_ptr<PhiloxGenerator> generator_;  // null unless a "seed" attribute was given
+  ONNX_NAMESPACE::TensorProto::DataType dtype_ =
+      ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED;  // optional and may be inferred
+};
+
+class RandomNormalBase : public RandomBase {  // adds the mandatory "scale" and "mean" attributes
+ protected:
+  RandomNormalBase(const OpKernelInfo& info) : RandomBase(info) {
+    ORT_ENFORCE(info.GetAttr<float>("scale", &scale_).IsOK());  // construction fails if the attribute is missing
+    ORT_ENFORCE(info.GetAttr<float>("mean", &mean_).IsOK());
+  }
+
+  Status Compute(OpKernelContext* p_ctx, const TensorShape& shape, int dtype) const;  // writes output 0 of `shape`
+
+ protected:
+  float scale_;  // passed to the kernel as the multiplier of each sample
+  float mean_;   // passed to the kernel as the offset added to each sample
+};
+
+class RandomNormal final : public RandomNormalBase {  // output shape is taken from the "shape" attribute
+ public:
+  explicit RandomNormal(const OpKernelInfo& info) : RandomNormalBase(info) {
+    if (dtype_ == ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
+      dtype_ = ONNX_NAMESPACE::TensorProto_DataType_FLOAT;  // no "dtype" attribute: fall back to float
+    }
+    std::vector<int64_t> shape;
+    ORT_ENFORCE(info.GetAttrs<int64_t>("shape", shape).IsOK());  // "shape" is mandatory
+    shape_ = TensorShape(shape);
+  }
+
+  Status ComputeInternal(OpKernelContext* p_ctx) const override;
+
+ private:
+  TensorShape shape_;  // fixed at construction
+};
+
+class RandomNormalLike final : public RandomNormalBase {  // output shape is taken from input 0 at run time
+ public:
+  explicit RandomNormalLike(const OpKernelInfo& info) : RandomNormalBase(info) {}
+  Status ComputeInternal(OpKernelContext* p_ctx) const override;  // dtype falls back to the input's type if unset
+};
+
+class RandomUniformBase : public RandomBase {  // adds the mandatory "low" and "high" attributes
+ protected:
+  RandomUniformBase(const OpKernelInfo& info) : RandomBase(info) {
+    float low, high;
+    ORT_ENFORCE(info.GetAttr<float>("low", &low).IsOK());  // construction fails if the attribute is missing
+    ORT_ENFORCE(info.GetAttr<float>("high", &high).IsOK());
+    from_ = low;
+    range_ = high - low;  // stored as offset + width so the kernel can compute value * range_ + from_
+  }
+
+  Status Compute(OpKernelContext* p_ctx, const TensorShape& shape, int dtype) const;  // writes output 0 of `shape`
+
+ protected:
+  float range_;  // high - low
+  float from_;   // low
+};
+
+class RandomUniform final : public RandomUniformBase {  // output shape is taken from the "shape" attribute
+ public:
+  explicit RandomUniform(const OpKernelInfo& info) : RandomUniformBase(info) {
+    if (dtype_ == ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED) {
+      dtype_ = ONNX_NAMESPACE::TensorProto_DataType_FLOAT;  // no "dtype" attribute: fall back to float
+    }
+    std::vector<int64_t> shape;
+    ORT_ENFORCE(info.GetAttrs<int64_t>("shape", shape).IsOK());  // "shape" is mandatory
+    shape_ = TensorShape(shape);
+  }
+
+  Status ComputeInternal(OpKernelContext* p_ctx) const override;
+
+ private:
+  TensorShape shape_;  // fixed at construction
+};
+
+class RandomUniformLike final : public RandomUniformBase {  // output shape is taken from input 0 at run time
+ public:
+  explicit RandomUniformLike(const OpKernelInfo& info) : RandomUniformBase(info) {}
+  Status ComputeInternal(OpKernelContext* p_ctx) const override;  // dtype falls back to the input's type if unset
+};
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/generator/random_impl.cu b/onnxruntime/core/providers/cuda/generator/random_impl.cu
new file mode 100644
index 0000000000..7b256f3def
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/generator/random_impl.cu
@@ -0,0 +1,145 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/cuda/generator/random_impl.h"
+
+#include <curand_kernel.h>
+#include <algorithm>
+#include "core/providers/cuda/cu_inc/common.cuh"
+
+namespace onnxruntime {
+namespace cuda {
+
+constexpr int UNROLL = 4;  // outputs written per thread per loop iteration (one float4 of random values)
+
+struct DistFunc_RandomNormal {  // device functor: draws four values from `state` via curand_normal4
+  __device__ __inline__ float4 operator()(curandStatePhilox4_32_10_t* state) const { return curand_normal4(state); }
+};
+
+struct DistFunc_RandomUniform {  // device functor: draws four values from `state` via curand_uniform4
+  __device__ __inline__ float4 operator()(curandStatePhilox4_32_10_t* state) const { return curand_uniform4(state); }
+};
+
+struct TransformFunc_RandomNormal {  // device functor: maps a raw sample to value * scale + mean
+  __device__ __inline__ float operator()(const float value, const float scale, const float mean) const {
+    return value * scale + mean;
+  }
+};
+
+struct TransformFunc_RandomUniform {  // device functor: maps a raw sample to value * range + from
+  __device__ __inline__ float operator()(const float value, const float range, const float from) const {
+    // reverse the bounds of curand4 from (0, 1] to [0, 1).
+    // ref: https://github.com/pytorch/pytorch/blob/e795315c638228d4170f3797356c09a70b2ed4cd/aten/src/ATen/native/cuda/DistributionTemplates.h#L464
+    float reverse_bound_value = value == 1.0f ? 0.0f : value;  // 1.0 is folded onto 0.0 so the result excludes the top
+    return reverse_bound_value * range + from;
+  }
+};
+
+template <typename T, typename DistFuncT, typename TransformFuncT>
+__global__ void RandomKernel(const int64_t N, const std::pair<uint64_t, uint64_t> seeds, const DistFuncT dist_func,
+                             const TransformFuncT transform_func, const float alpha, const float beta, T* Y_data) {
+  // The functors are taken by value: a reference kernel parameter would hold a host address on the device.
+  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
+  curandStatePhilox4_32_10_t state;
+  curand_init(seeds.first, idx, seeds.second, &state);
+  float4 rand;
+
+  // Scalar-store variant: each iteration draws one float4 from the per-thread Philox state and writes
+  // up to UNROLL consecutive outputs starting at id; the tail of Y_data is protected by the li < N check.
+  // From CUDA curand documentation:
+  //   The Philox_4x32_10 algorithm is closely tied to the thread and block count.
+  //   Each thread computes 4 random numbers in the same time thus the most efficient
+  //   use of Philox_4x32_10 is to generate a multiple of 4 times number of threads.
+  for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
+    rand = dist_func(&state);
+
+// actual computation
+#pragma unroll
+    for (int i = 0; i < UNROLL; i++) {
+      CUDA_LONG li = id + i;
+      if (li < N) {
+        Y_data[li] = static_cast<T>(transform_func((&rand.x)[i], alpha, beta));
+      }
+    }
+
+    __syncthreads();
+  }
+}
+
+template <typename T, typename DistFuncT, typename TransformFuncT>
+__global__ void RandomVectorizedKernel(const int64_t N, const std::pair<uint64_t, uint64_t> seeds,
+                                       const DistFuncT dist_func, const TransformFuncT transform_func,
+                                       const float alpha, const float beta, T* Y_data) {
+  CUDA_LONG idx = blockDim.x * blockIdx.x + threadIdx.x;
+  CUDA_LONG step_size = gridDim.x * blockDim.x * UNROLL;
+  // The functors are taken by value: a reference kernel parameter would hold a host address on the device.
+  curandStatePhilox4_32_10_t state;
+  curand_init(seeds.first, idx, seeds.second, &state);
+  float4 rand;
+
+  // Using vectorized data load/store approach when N % 4 == 0 since this is typical case for input shape size.
+  using LoadT = aligned_vector<T, UNROLL>;
+  for (CUDA_LONG id = idx * UNROLL; id < N; id += step_size) {
+    rand = dist_func(&state);
+    T r[UNROLL];
+
+// actual computation
+#pragma unroll
+    for (int ii = 0; ii < UNROLL; ii++) {
+      r[ii] = static_cast<T>(transform_func((&rand.x)[ii], alpha, beta));
+    }
+
+    // Vectorized writes for Y_data
+    *(reinterpret_cast<LoadT*>(&Y_data[id])) = *reinterpret_cast<LoadT*>(&r[0]);
+
+    __syncthreads();
+  }
+}
+
+template <typename T, typename DistFuncT, typename TransformFuncT>
+void RandomKernelImpl(const cudaDeviceProp& prop, cudaStream_t stream, const int64_t N, const DistFuncT& dist_func,
+                      const TransformFuncT& transform_func, float alpha, float beta, PhiloxGenerator& generator,
+                      T* Y_data) {
+  // Host-side launcher: picks the launch configuration, reserves Philox offsets, then starts the matching kernel.
+  if (N <= 0) return;  // empty output: avoids grid_size == 0 (division by zero below and an invalid launch)
+  const int block_size = 256;
+  const int blocks_per_sm = prop.maxThreadsPerMultiProcessor / block_size;
+  const int grid_size =
+      std::min(prop.multiProcessorCount * blocks_per_sm, static_cast<int>(CeilDiv(N, block_size * UNROLL)));
+
+  // Number of random values each thread generates; the philox generator offset is advanced by that amount.
+  const uint64_t counter_offset = static_cast<uint64_t>(((N - 1) / (block_size * grid_size * UNROLL) + 1) * UNROLL);
+  auto seeds = generator.NextPhiloxSeeds(counter_offset);
+
+  if (N % UNROLL != 0) {
+    RandomKernel<T><<<grid_size, block_size, 0, stream>>>(N, seeds, dist_func, transform_func, alpha, beta, Y_data);
+  } else {
+    RandomVectorizedKernel<T>
+        <<<grid_size, block_size, 0, stream>>>(N, seeds, dist_func, transform_func, alpha, beta, Y_data);
+  }
+}
+
+#define RANDOM_KERNEL_IMPL(name)                                                                                  \
+  template <typename T>                                                                                           \
+  void name##KernelImpl(const cudaDeviceProp& prop, cudaStream_t stream, const int64_t N, const float alpha,      \
+                        const float beta, PhiloxGenerator& generator, T* Y_data) {                                \
+    RandomKernelImpl(prop, stream, N, DistFunc_##name(), TransformFunc_##name(), alpha, beta, generator, Y_data); \
+  }
+
+RANDOM_KERNEL_IMPL(RandomNormal)   // RandomNormalKernelImpl<T>: DistFunc_RandomNormal + TransformFunc_RandomNormal
+RANDOM_KERNEL_IMPL(RandomUniform)  // RandomUniformKernelImpl<T>: DistFunc_RandomUniform + TransformFunc_RandomUniform
+
+#define SPECIALIZED_RANDOM_KERNEL(name, T)                                                                            \
+  template void name##KernelImpl(const cudaDeviceProp& prop, cudaStream_t stream, const int64_t N, const float alpha, \
+                                 const float beta, PhiloxGenerator& generator, T* Y_data);
+
+#define SPECIALIZED_RANDOM_KERNELS(T)        \
+  SPECIALIZED_RANDOM_KERNEL(RandomNormal, T) \
+  SPECIALIZED_RANDOM_KERNEL(RandomUniform, T)
+
+SPECIALIZED_RANDOM_KERNELS(float)   // explicit instantiations of both *KernelImpl templates for float
+SPECIALIZED_RANDOM_KERNELS(double)  // ... for double
+SPECIALIZED_RANDOM_KERNELS(half)    // ... for half
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/generator/random_impl.h b/onnxruntime/core/providers/cuda/generator/random_impl.h
new file mode 100644
index 0000000000..0fa981e4f4
--- /dev/null
+++ b/onnxruntime/core/providers/cuda/generator/random_impl.h
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/random_generator.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+#define RANDOM_KERNEL_DECLARE(name)                                                                          \
+  template <typename T>                                                                                      \
+  void name##KernelImpl(const cudaDeviceProp& prop, cudaStream_t stream, const int64_t N, const float alpha, \
+                        const float beta, PhiloxGenerator& generator, T* Y_data);
+
+RANDOM_KERNEL_DECLARE(RandomNormal)
+RANDOM_KERNEL_DECLARE(RandomUniform)
+
+#undef RANDOM_KERNEL_DECLARE
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/nn/instance_norm.cc b/onnxruntime/core/providers/cuda/nn/instance_norm.cc
index cd0a13418d..c40c27cdf1 100644
--- a/onnxruntime/core/providers/cuda/nn/instance_norm.cc
+++ b/onnxruntime/core/providers/cuda/nn/instance_norm.cc
@@ -100,10 +100,19 @@ Status InstanceNorm<T>::ComputeInternal(OpKernelContext* p_op_kernel_context) co
     CudnnTensor stats_desc;
     ORT_RETURN_IF_ERROR(stats_desc.Set(std::array<int64_t, 4>{1, stats_count, 1, 1}, CudnnTensor::GetDataType<CudaT>()));
 
+    const size_t stats_byte_count = stats_count * sizeof(CudaT);
+
+    // Mean & Variance are inputs & outputs and must be initialized to zero to work properly
     auto mean = GetScratchBuffer<CudaT>(stats_count);
+    CUDA_RETURN_IF_ERROR(cudaMemsetAsync(mean.get(), 0, stats_byte_count, Stream()));
     auto variance = GetScratchBuffer<CudaT>(stats_count);
+    CUDA_RETURN_IF_ERROR(cudaMemsetAsync(variance.get(), 0, stats_byte_count, Stream()));
+
+    // We must set the scale & bias inputs to zero as they are inputs to the calculation
     auto unused_scale = GetScratchBuffer<CudaT>(stats_count);
+    CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_scale.get(), 0, stats_byte_count, Stream()));
     auto unused_bias = GetScratchBuffer<CudaT>(stats_count);
+    CUDA_RETURN_IF_ERROR(cudaMemsetAsync(unused_bias.get(), 0, stats_byte_count, Stream()));
 
     // first, compute mean and variance per-instance per-channel using cudnnBatchNorm training
     CUDNN_RETURN_IF_ERROR(cudnnBatchNormalizationForwardTraining(
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
index 85017f0a05..7a61f56eb6 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/MLOperatorAuthorImpl.cpp
@@ -1507,7 +1507,7 @@ std::vector<IMLOperatorTensor*> OpKernelContextWrapper::GetOutputTensors(const E
   std::vector<IMLOperatorTensor*> ret;
   ret.reserve(m_outputTensors.size());
 
-  ORT_THROW_HR_IF(E_INVALIDARG, m_impl->OutputCount() != outputShapes.EdgeCount());
+  ORT_THROW_HR_IF(E_INVALIDARG, static_cast<size_t>(m_impl->OutputCount()) != outputShapes.EdgeCount());
 
   for (int i = 0; i < m_impl->OutputCount(); ++i) {
     ComPtr<IMLOperatorTensor> tensor;
@@ -1847,7 +1847,7 @@ void InferAndVerifyOutputSizes(
 
     if (tensorType.has_shape()) {
       const auto& shape = tensorType.shape();
-      ML_CHECK_BOOL(shape.dim_size() == outputShapes.GetShape(outputIndex).size());
+      ML_CHECK_BOOL(static_cast<size_t>(shape.dim_size()) == outputShapes.GetShape(outputIndex).size());
 
       for (uint32_t output_dim = 0; output_dim < outputShapes.GetShape(outputIndex).size(); ++output_dim) {
         if (shape.dim(output_dim).has_dim_value()) {
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/Common.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/Common.h
index 0e4ec35da2..e4f05461a7 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/Common.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/Common.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <algorithm>
+
 #define ML_CHECK_VALID_ARGUMENT(x, ...)\
     {\
         if ((x) == false)\
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
index f3b9ddceb2..1e7f1be00b 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
@@ -1335,7 +1335,7 @@ namespace OperatorHelper
             auto inputShape = shapeInfo.GetInputTensorShape(i);
             for (size_t j = 0; j < outputShape.size(); ++j)
             {
-                if (m_axis == j)
+                if (static_cast<size_t>(m_axis) == j)
                 {
                     outputShape[j] += inputShape[j];
                 }
diff --git a/onnxruntime/core/providers/dml/dml_provider_factory.cc b/onnxruntime/core/providers/dml/dml_provider_factory.cc
index 9b3df2bf7f..704fb99c1b 100644
--- a/onnxruntime/core/providers/dml/dml_provider_factory.cc
+++ b/onnxruntime/core/providers/dml/dml_provider_factory.cc
@@ -212,6 +212,7 @@ static constexpr OrtDmlApi ort_dml_api_10_to_x = {
 const OrtDmlApi* GetOrtDmlApi(_In_ uint32_t /*version*/) NO_EXCEPTION {
 #ifdef USE_DML
   return &ort_dml_api_10_to_x;
-#endif  // USE_DML
-  return nullptr;
+#else
+    return nullptr;
+#endif
 }
\ No newline at end of file
diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
index b809756e93..105af2d86f 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.cc
@@ -519,4 +519,36 @@ bool DnnlDynamicQuantizeLinearNodeCapability::Supported(const Node* node, const
   return true;
 }
 
+
+// DnnlSqueezeNodeCapability class
+//-------------------------------------
+bool DnnlSqueezeNodeCapability::Supported(const Node* node, const GraphViewer& graph_viewer) const {
+  // A node is claimed only when both its element type and its output shape / axes input are supported.
+  if (!IsTypeSupported(node)) return false;
+  if (!IsDimensionSupported(node, graph_viewer)) return false;
+  return true;
+}
+bool DnnlSqueezeNodeCapability::IsDimensionSupported(const Node* node, const GraphViewer& graph_viewer) const {
+  // Scalar (rank-0) outputs are not supported; reject the node when the output shape is known to be empty.
+  auto node_out = node->OutputDefs()[0];
+  if (node_out->Exists() &&
+      node_out->Shape() != nullptr &&
+      node_out->Shape()->dim_size() == 0) {
+    return false;
+  }
+
+  // Before opset version 13 the axes come from an attribute. From opset version 13 onward
+  // we must check that the optional axes input (input[1]) is a ConstantInitializer because
+  // we only handle the axes at compile time. If they can change at runtime we can not support
+  // the operator. NOTE(review): the check is skipped when input[1] has no shape info - confirm intended.
+  auto opset = node->SinceVersion();
+  auto node_inputs = node->InputDefs();
+  if (opset >= 13 && node_inputs.size() > 1 && node_inputs[1]->Shape() != nullptr) {
+    if (!graph_viewer.IsConstantInitializer(node_inputs[1]->Name(), true)) {
+      return false;
+    }
+  }
+  return true;
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h
index 8c7ac46e3c..ea3e2bf6c1 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h
+++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h
@@ -301,4 +301,19 @@ class DnnlDynamicQuantizeLinearNodeCapability : public DnnlDefaultNodeCapability
  private:
 };
 
+class DnnlSqueezeNodeCapability : public DnnlDefaultNodeCapability {
+ public:
+  DnnlSqueezeNodeCapability() : DnnlDefaultNodeCapability({type_float32,
+                                                           type_float16,
+                                                           type_bfloat16,
+                                                           type_int32,
+                                                           type_int8,
+                                                           type_uint8}) {}
+
+  bool Supported(const Node* node, const GraphViewer& graph_viewer) const override;
+
+ private:
+  bool IsDimensionSupported(const Node* node, const GraphViewer& graph_viewer) const;
+};
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
index b1d50a70a1..023111264a 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_op_manager.cc
@@ -33,11 +33,13 @@ DnnlOpManager::DnnlOpManager() {
   dnnl_ops_map_.emplace(std::make_pair("Sigmoid", std::unique_ptr<DnnlNodeCapability>(new DnnlElementwiseCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Softmax", std::unique_ptr<DnnlNodeCapability>(new DnnlSoftmaxNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Softplus", std::unique_ptr<DnnlNodeCapability>(new DnnlElementwiseCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("Squeeze", std::unique_ptr<DnnlNodeCapability>(new DnnlSqueezeNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Sqrt", std::unique_ptr<DnnlNodeCapability>(new DnnlElementwiseCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Sub", std::unique_ptr<DnnlNodeCapability>(new DnnlBinaryNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Sum", std::unique_ptr<DnnlNodeCapability>(new DnnlSumNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Tanh", std::unique_ptr<DnnlNodeCapability>(new DnnlElementwiseCapability())));
   dnnl_ops_map_.emplace(std::make_pair("Transpose", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
+  dnnl_ops_map_.emplace(std::make_pair("Unsqueeze", std::unique_ptr<DnnlNodeCapability>(new DnnlSqueezeNodeCapability())));
 #if defined(ENABLE_TRAINING)
   dnnl_ops_map_.emplace(std::make_pair("AveragePoolGrad", std::unique_ptr<DnnlNodeCapability>(new DnnlPoolNodeCapability())));
   dnnl_ops_map_.emplace(std::make_pair("ConvGrad", std::unique_ptr<DnnlNodeCapability>(new DnnlDefaultNodeCapability())));
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc
new file mode 100644
index 0000000000..db66cb3a39
--- /dev/null
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.cc
@@ -0,0 +1,75 @@
+// Copyright(C) 2021 Intel Corporation
+// Licensed under the MIT License
+
+#include "dnnl_squeeze.h"
+#include "dnnl_subgraph.h"
+#include "dnnl_subgraph_primitive.h"
+#include "core/providers/common.h"
+
+namespace onnxruntime {
+namespace ort_dnnl {
+DnnlSqueeze::DnnlSqueeze() {}
+
+void DnnlSqueeze::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+  // Squeeze is expressed as a reshape: compute the squeezed shape and register a reshape to it.
+  auto dnnl_engine = sp.GetEngine();
+  // the input shape assumes OrtFormat so we get the memory in OrtFormat.
+  auto data_mem = sp.GetMemoryInOrtFormat(node.Input(IN_DATA), dnnl_engine);
+  dnnl::memory::dims data_dims = data_mem.get_desc().dims();
+
+  std::vector<int64_t> axes_data;
+  // ONNX Squeeze version 13+ the axes is an input tensor
+  // ONNX Squeeze before version 13 axes comes from an Attribute.
+  if (node.Input(IN_AXES).Exists()) {
+    auto axes_mem = sp.GetMemory(node.Input(IN_AXES));
+    dnnl::memory::dims axes_dims = axes_mem.get_desc().dims();
+    const auto* p_axes_data = static_cast<const int64_t*>(axes_mem.get_data_handle());
+    axes_data.assign(p_axes_data, p_axes_data + axes_dims[0]);
+  } else {
+    axes_data = GetAxes(node);
+  }
+
+  // convert negative axis to the positive axis
+  for (int64_t& axis : axes_data) {
+    axis = HandleNegativeAxis(axis, data_dims.size());
+  }
+
+  // Handle out of order and repeating dims.
+  std::sort(axes_data.begin(), axes_data.end());
+  axes_data.erase(std::unique(axes_data.begin(), axes_data.end()), axes_data.end());
+
+  std::vector<int64_t> output_shape;
+  size_t j = 0;
+  for (size_t i = 0; i < data_dims.size(); ++i) {
+    if ((j < axes_data.size() && axes_data[j] == static_cast<int64_t>(i)) ||
+        (axes_data.empty() && data_dims[i] == 1)) {
+      ORT_ENFORCE(data_dims[i] == 1, "Dimension of input ", i, " must be 1 instead of ", data_dims[i],
+                  ". shape=", data_dims);
+      ++j;
+      continue;
+    }
+    output_shape.push_back(data_dims[i]);
+  }
+
+  dnnl::memory::desc squeeze_md(output_shape, node.Input(IN_DATA).Type(), sp.GetDnnlFormat(output_shape.size()));
+
+  dnnl::memory squeeze_mem = dnnl::memory(squeeze_md, dnnl_engine, nullptr);
+  sp.AddReshape(data_mem, squeeze_mem);
+
+  sp.SetMemory(node.Output(OUT_SQUEEZED), squeeze_mem, true);
+}
+
+std::vector<int64_t> DnnlSqueeze::GetAxes(DnnlNode& node) {
+  // Collects the 'axes' INTS attribute; the result is empty when the attribute is absent or not INTS.
+  std::vector<int64_t> axes;
+  auto attr = node.Attributes().find("axes");
+  if (attr != node.Attributes().end() &&
+      attr->second().type() == ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INTS) {
+    const auto axes_count = attr->second().ints_size();
+    axes.reserve(axes_count);
+    for (int i = 0; i < axes_count; ++i) axes.push_back(attr->second().ints(i));
+  }
+  return axes;
+}
+}  // namespace ort_dnnl
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.h
new file mode 100644
index 0000000000..7ea47a3975
--- /dev/null
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_squeeze.h
@@ -0,0 +1,30 @@
+// Copyright(C) 2021 Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+#include "dnnl_subgraph.h"
+#include "dnnl_subgraph_primitive.h"
+
+namespace onnxruntime {
+namespace ort_dnnl {
+
+class DnnlSqueeze {
+ public:
+  enum InputTensors : int {
+    IN_DATA = 0,  // tensor to be squeezed
+    IN_AXES = 1,  // axes tensor (opset 13+); older opsets use the 'axes' attribute instead
+  };
+
+  enum OutputTensors : int {
+    OUT_SQUEEZED = 0  // input reshaped with the selected size-1 dimensions removed
+  };
+
+  DnnlSqueeze();
+  void CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node);  // registers the squeeze as a reshape on sp
+
+  private:
+  std::vector<int64_t> GetAxes(DnnlNode& node);  // reads the 'axes' attribute; empty when absent
+};
+
+}  // namespace ort_dnnl
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
index 34d6dfc760..15ad240c5a 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.cc
@@ -18,8 +18,10 @@
 #include "dnnl_reshape.h"
 #include "dnnl_softmax.h"
 #include "dnnl_softmaxgrad.h"
+#include "dnnl_squeeze.h"
 #include "dnnl_sum.h"
 #include "dnnl_transpose.h"
+#include "dnnl_unsqueeze.h"
 
 #if defined(ENABLE_TRAINING)
 #include "dnnl_convgrad.h"
@@ -75,10 +77,14 @@ void DnnlSubgraphPrimitive::AddKernels() {
       DnnlReshape().CreatePrimitive(*this, node);
     } else if (node.OpType() == "Softmax") {
       DnnlSoftmax().CreatePrimitive(*this, node);
+    } else if (node.OpType() == "Squeeze") {
+      DnnlSqueeze().CreatePrimitive(*this, node);
     } else if (node.OpType() == "Sum") {
       DnnlSum().CreatePrimitive(*this, node);
     } else if (node.OpType() == "Transpose") {
       DnnlTranspose().CreatePrimitive(*this, node);
+    } else if (node.OpType() == "Unsqueeze") {
+      DnnlUnsqueeze().CreatePrimitive(*this, node);
 #if defined(ENABLE_TRAINING)
     } else if (node.OpType() == "AveragePoolGrad" || node.OpType() == "MaxPoolGrad") {
       DnnlPoolGrad().CreatePrimitive(*this, node);
@@ -110,6 +116,10 @@ bool DnnlSubgraphPrimitive::IsDynamic() {
   return subgraph_->IsDynamic();
 }
 
+bool DnnlSubgraphPrimitive::IsScalar(const DnnlTensor& tensor) {
+  return Contains(input_is_scalar_, tensor.Name());
+}
+
 void DnnlSubgraphPrimitive::Compile(const std::unordered_map<std::string, OnnxTensorData>& inputs) {
   //if already compiled once and is not dynamic, then don't compile again
   if (!shape_key_.empty() && !IsDynamic()) {
@@ -157,6 +167,7 @@ void DnnlSubgraphPrimitive::Compile(const std::unordered_map<std::string, OnnxTe
     dnnl::memory::dims dnnl_dims = inputs.at(dnnl_tensor_name).tensor_info.shape;
     if (dnnl_dims.size() == 0) {
       dnnl_dims.push_back(1);
+      input_is_scalar_.insert(dnnl_tensor_name);
     }
     auto dnnl_format = GetDnnlFormat(dnnl_dims.size());
     auto input_md = dnnl::memory::desc(dnnl_dims, dnnl_data_type, dnnl_format);
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
index b82d82ea63..4e77be231a 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
@@ -63,6 +63,10 @@ class DnnlSubgraphPrimitive {
   dnnl::memory::desc GetOutputInfo(std::string name);
   bool IsScalarOutput(const std::string& name);
   bool IsDynamic();
+  // All scalar inputs are automatically converted to a one-dimensional tensor when used in OneDNN.
+  // If the input being a scalar affects the operator, this function can be used to determine whether
+  // the original input from ORT was a scalar.
+  bool IsScalar(const DnnlTensor& tensor);
   OrtMutex& GetMutex() { return mutex_; }
 
   //GetMemory in OrtFormat if the memory is not in the OrtFormat this will reorder the memory.
@@ -77,6 +81,8 @@ class DnnlSubgraphPrimitive {
 
   std::unordered_map<std::string, dnnl::memory> inputs_;
   std::unordered_map<std::string, dnnl::memory::desc> inputs_md_;
+  std::unordered_set<std::string> input_is_scalar_;
+
 
   std::unordered_map<std::string, dnnl::memory> outputs_;
   std::unordered_map<std::string, dnnl::memory::desc> outputs_md_;
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc b/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc
new file mode 100644
index 0000000000..9532686028
--- /dev/null
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.cc
@@ -0,0 +1,85 @@
+// Copyright(C) 2021 Intel Corporation
+// Licensed under the MIT License
+
+#include "dnnl_unsqueeze.h"
+#include "dnnl_subgraph.h"
+#include "dnnl_subgraph_primitive.h"
+#include "core/providers/common.h"
+
+namespace onnxruntime {
+namespace ort_dnnl {
+DnnlUnsqueeze::DnnlUnsqueeze() {}
+
+void DnnlUnsqueeze::CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node) {
+  // Unsqueeze is expressed as a reshape: compute the expanded shape and register a reshape to it.
+  auto dnnl_engine = sp.GetEngine();
+  // the input shape assumes OrtFormat so we get the memory in OrtFormat.
+  auto data_mem = sp.GetMemoryInOrtFormat(node.Input(IN_DATA), dnnl_engine);
+  bool data_is_scalar = sp.IsScalar(node.Input(IN_DATA));
+
+  // The OneDNN execution provider automatically expands all scalar inputs to dim {1} tensors.
+  // this will result in the data_dims.size() being 1 too large if the input is from a scalar.
+  // To counter this data_dims is left empty if the input is from a scalar.
+  dnnl::memory::dims data_dims;
+  if (!data_is_scalar) {
+    data_dims = data_mem.get_desc().dims();
+  }
+
+  std::vector<int64_t> axes_data;
+  // ONNX Unsqueeze version 13+ the axes is an input tensor
+  // ONNX Unsqueeze before version 13 axes comes from an Attribute.
+  if (node.Input(IN_AXES).Exists()) {
+    auto axes_mem = sp.GetMemory(node.Input(IN_AXES));
+    dnnl::memory::dims axes_dims = axes_mem.get_desc().dims();
+    const auto* p_axes_data = static_cast<const int64_t*>(axes_mem.get_data_handle());
+    axes_data.assign(p_axes_data, p_axes_data + axes_dims[0]);
+  } else {
+    axes_data = GetAxes(node);
+  }
+
+  std::vector<int64_t> output_shape(axes_data.size() + data_dims.size(), 0);
+  // Set all axes indices to 1 in output_dims and check for duplicates
+  for (int64_t axis : axes_data) {
+    // Valid axis range is [0, output_rank - 1]
+    axis = HandleNegativeAxis(axis, output_shape.size());
+    // Reject out-of-range axes before they are used to index output_shape.
+    ORT_ENFORCE(axis >= 0 && axis < static_cast<int64_t>(output_shape.size()), "'axes' has an out of range axis");
+    // A duplicate would leave too many zero slots and make the fill loop below read past data_dims.
+    ORT_ENFORCE(output_shape[axis] == 0, "'axes' has a duplicate axis");
+    output_shape[axis] = 1;
+  }
+
+  // Now fill in the zero entries with the existing shape
+  {
+    auto begin = data_dims.cbegin();
+    for (auto& axisSize : output_shape) {
+      if (axisSize == 0)
+        axisSize = *begin++;
+    }
+    assert(begin == data_dims.cend());
+  }
+
+  dnnl::memory::desc unsqueeze_md(output_shape, node.Input(IN_DATA).Type(), sp.GetDnnlFormat(output_shape.size()));
+
+  dnnl::memory expanded_mem = dnnl::memory(unsqueeze_md, dnnl_engine, nullptr);
+  sp.AddReshape(data_mem, expanded_mem);
+
+  sp.SetMemory(node.Output(OUT_EXPANDED), expanded_mem, true);
+}
+
+std::vector<int64_t> DnnlUnsqueeze::GetAxes(DnnlNode& node) {
+  // Unsqueeze before opset 13 carries 'axes' as an INTS attribute; fail when it is missing or mistyped.
+  // The check is the first ORT_ENFORCE argument: a bare message string there is always true and never throws.
+  auto attr = node.Attributes().find("axes");
+  ORT_ENFORCE(attr != node.Attributes().end() &&
+                  attr->second().type() == ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INTS,
+              "Missing/Invalid 'axes' attribute value");
+  std::vector<int64_t> axes;
+  axes.reserve(attr->second().ints_size());
+  for (int i = 0; i < attr->second().ints_size(); ++i) {
+    axes.push_back(attr->second().ints(i));
+  }
+  return axes;
+}
+}  // namespace ort_dnnl
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.h
new file mode 100644
index 0000000000..aaf821da59
--- /dev/null
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_unsqueeze.h
@@ -0,0 +1,30 @@
+// Copyright(C) 2021 Intel Corporation
+// Licensed under the MIT License
+
+#pragma once
+#include "dnnl_subgraph.h"
+#include "dnnl_subgraph_primitive.h"
+
+namespace onnxruntime {
+namespace ort_dnnl {
+
+class DnnlUnsqueeze {
+ public:
+  enum InputTensors : int {
+    IN_DATA = 0,  // tensor to be expanded
+    IN_AXES = 1,  // axes tensor (opset 13+); older opsets use the 'axes' attribute instead
+  };
+
+  enum OutputTensors : int {
+    OUT_EXPANDED = 0  // input reshaped with size-1 dimensions inserted at the given axes
+  };
+
+  DnnlUnsqueeze();
+  void CreatePrimitive(DnnlSubgraphPrimitive& sp, DnnlNode& node);  // registers the unsqueeze as a reshape on sp
+
+  private:
+  std::vector<int64_t> GetAxes(DnnlNode& node);  // reads the 'axes' attribute
+};
+
+}  // namespace ort_dnnl
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
index 179cd2c723..a351a5c483 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc
@@ -2852,4 +2852,4 @@ const std::unordered_map<std::string, const IOpBuilder*>& GetOpBuilders() {
 #pragma endregion
 
 }  // namespace nnapi
-}  // namespace onnxruntime
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/providers/nuphar/scripts/rnn_benchmark.py b/onnxruntime/core/providers/nuphar/scripts/rnn_benchmark.py
index baa16888c6..821a02cbb3 100644
--- a/onnxruntime/core/providers/nuphar/scripts/rnn_benchmark.py
+++ b/onnxruntime/core/providers/nuphar/scripts/rnn_benchmark.py
@@ -121,7 +121,7 @@ def perf_test(rnn_type, num_threads, input_dim, hidden_dim, bidirectional, layer
         convert_to_scan_model(model_name, scan_model_name)
         # note that symbolic shape inference is needed because model has symbolic batch dim, thus init_state is ConstantOfShape
         onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(scan_model_name)), scan_model_name)
-        sess = onnxruntime.InferenceSession(scan_model_name)
+        sess = onnxruntime.InferenceSession(scan_model_name, providers=onnxruntime.get_available_providers())
         count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
         avg_scan = top_n_avg(per_iter_cost, top_n)
         print('perf_scan (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, scan_model_name, count, top_n, avg_scan))
@@ -131,7 +131,7 @@ def perf_test(rnn_type, num_threads, input_dim, hidden_dim, bidirectional, layer
         int8_model_name = os.path.splitext(model_name)[0] + '_int8.onnx'
         convert_matmul_model(scan_model_name, int8_model_name)
         onnx.save(SymbolicShapeInference.infer_shapes(onnx.load(int8_model_name)), int8_model_name)
-        sess = onnxruntime.InferenceSession(int8_model_name)
+        sess = onnxruntime.InferenceSession(int8_model_name, providers=onnxruntime.get_available_providers())
         count, duration, per_iter_cost = perf_run(sess, feeds, min_counts=top_n, min_duration_seconds=min_duration_seconds)
         avg_int8 = top_n_avg(per_iter_cost, top_n)
         print('perf_int8 (with {} threads) {}: run for {} iterations, top {} avg {:.3f} ms'.format(num_threads, int8_model_name, count, top_n, avg_int8))
diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
index d8669b5582..0549f71239 100644
--- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
+++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc
@@ -1022,8 +1022,16 @@ bool DataOps::op_is_supported(std::string name, std::vector<SupportedOp>& op_lis
             return true;
           }
 
-         //The operator to be marked true, it should be supported by all the devices specified with HETERO/MULTI/AUTO
-          if (device_id_.find("HETERO") == 0 || device_id_.find("MULTI") == 0 || device_id_.find("AUTO") == 0) {
+          // With HETERO, the operator is marked true if it is supported by any one of the specified devices
+          if (device_id_.find("HETERO") == 0) {
+              status = true;
+              if (device_id_.find(*it) != std::string::npos) {
+                return true;
+              }
+          }
+
+          // With MULTI/AUTO, the operator is marked true only if it is supported by all of the specified devices
+          if (device_id_.find("MULTI") == 0 || device_id_.find("AUTO") == 0) {
               status = true;
               if (device_id_.find(*it) == std::string::npos) {
                 return false;
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index 38760f4da9..28128bfcf5 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -719,6 +719,10 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDom
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 10, Loop);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 10, DepthToSpace);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 12, SpaceToDepth);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomNormal);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomNormalLike);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomUniform);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomUniformLike);
 
 // opset 10
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool);
@@ -1559,6 +1563,10 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
     // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 10, Loop)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 10, DepthToSpace)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, 12, SpaceToDepth)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomNormal)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomNormalLike)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomUniform)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, RandomUniformLike)>,
 
     // opset 10
     // BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>,
diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
index e3dbfd1cc2..7752d657f4 100644
--- a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
+++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc
@@ -222,7 +222,7 @@ TensorShape& TensorShape::operator=(const TensorShape& other) {
   return *this;
 }
 
-TensorShape& TensorShape::operator=(TensorShape&& other) {
+TensorShape& TensorShape::operator=(TensorShape&& other) noexcept {
   g_host->TensorShape__operator_move_assign(this, std::move(other));
   return *this;
 }
diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h
index 4f6bcf232a..9735f83408 100644
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@@ -221,7 +221,7 @@ struct ProviderHost {
 
   // TensorShape
   virtual void TensorShape__operator_assign(TensorShape* p, const TensorShape& other) = 0;
-  virtual void TensorShape__operator_move_assign(TensorShape* p, TensorShape&& other) = 0;
+  virtual void TensorShape__operator_move_assign(TensorShape* p, TensorShape&& other) noexcept = 0;
   virtual void TensorShape__Allocate(TensorShape* p, size_t size) = 0;
   virtual int64_t TensorShape__SizeHelper(const TensorShape* p, size_t start, size_t end) = 0;
   virtual std::string TensorShape__ToString(const TensorShape* p) = 0;
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 124a2b57aa..99d99ae0e4 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -526,7 +526,7 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
 
   p_exec_provider->SetLogger(session_logger_);
   session_profiler_.AddEpProfilers(p_exec_provider->GetProfiler());
-  return execution_providers_.Add(provider_type, std::move(p_exec_provider));
+  return execution_providers_.Add(provider_type, p_exec_provider);
 }
 
 // Custom Op support
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 3a34947dfa..7894eb602b 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -236,6 +236,12 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSparseTensorAsOrtValue, _Inout_ OrtAllocator*
   *out = value.release();
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(allocator);
+  ORT_UNUSED_PARAMETER(dense_shape);
+  ORT_UNUSED_PARAMETER(dense_shape_len);
+  ORT_UNUSED_PARAMETER(type);
+  ORT_UNUSED_PARAMETER(out);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -302,6 +308,14 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCoo, _Inout_ OrtValue* ort_value, _
   }
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(data_mem_info);
+  ORT_UNUSED_PARAMETER(values_shape);
+  ORT_UNUSED_PARAMETER(values_shape_len);
+  ORT_UNUSED_PARAMETER(values);
+  ORT_UNUSED_PARAMETER(indices_data);
+  ORT_UNUSED_PARAMETER(indices_num);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -329,6 +343,15 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorCsr, _Inout_ OrtValue* ort_value, _
   }
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(data_mem_info);
+  ORT_UNUSED_PARAMETER(values_shape);
+  ORT_UNUSED_PARAMETER(values_shape_len);
+  ORT_UNUSED_PARAMETER(values);
+  ORT_UNUSED_PARAMETER(inner_indices_data);
+  ORT_UNUSED_PARAMETER(inner_indices_num);
+  ORT_UNUSED_PARAMETER(outer_indices_data);
+  ORT_UNUSED_PARAMETER(outer_indices_num);
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -359,6 +382,15 @@ ORT_API_STATUS_IMPL(OrtApis::FillSparseTensorBlockSparse, _Inout_ OrtValue* ort_
   }
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(data_mem_info);
+  ORT_UNUSED_PARAMETER(values_shape);
+  ORT_UNUSED_PARAMETER(values_shape_len);
+  ORT_UNUSED_PARAMETER(values);
+  ORT_UNUSED_PARAMETER(indices_shape_data);
+  ORT_UNUSED_PARAMETER(indices_shape_len);
+  ORT_UNUSED_PARAMETER(indices_data);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -389,6 +421,15 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSparseTensorWithValuesAsOrtValue, _In_ const
   *out = value.release();
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(info);
+  ORT_UNUSED_PARAMETER(p_data);
+  ORT_UNUSED_PARAMETER(dense_shape);
+  ORT_UNUSED_PARAMETER(dense_shape_len);
+  ORT_UNUSED_PARAMETER(values_shape);
+  ORT_UNUSED_PARAMETER(values_shape_len);
+  ORT_UNUSED_PARAMETER(type);
+  ORT_UNUSED_PARAMETER(out);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -406,6 +447,10 @@ ORT_API_STATUS_IMPL(OrtApis::UseCooIndices, _Inout_ OrtValue* ort_value, _Inout_
   ORT_THROW_IF_ERROR(sparse_tensor.UseCooIndices(indices_span));
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(indices_data);
+  ORT_UNUSED_PARAMETER(indices_num);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -426,13 +471,19 @@ ORT_API_STATUS_IMPL(OrtApis::UseCsrIndices, _Inout_ OrtValue* ort_value,
   ORT_THROW_IF_ERROR(sparse_tensor.UseCsrIndices(inner_span, outer_span));
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(inner_data);
+  ORT_UNUSED_PARAMETER(inner_num);
+  ORT_UNUSED_PARAMETER(outer_data);
+  ORT_UNUSED_PARAMETER(outer_num);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
 }
 
-ORT_API_STATUS_IMPL(OrtApis::UseBlockSparseIndices, _Inout_ OrtValue* ort_value, const int64_t* indices_shape, size_t indices_shape_len,
-                    _Inout_ int32_t* indices_data) {
+ORT_API_STATUS_IMPL(OrtApis::UseBlockSparseIndices, _Inout_ OrtValue* ort_value, const int64_t* indices_shape,
+                    size_t indices_shape_len, _Inout_ int32_t* indices_data) {
   API_IMPL_BEGIN
 #if !defined(DISABLE_SPARSE_TENSORS)
   auto& sparse_tensor = SparseTensor::GetSparseTensorFromOrtValue(*ort_value);
@@ -440,6 +491,11 @@ ORT_API_STATUS_IMPL(OrtApis::UseBlockSparseIndices, _Inout_ OrtValue* ort_value,
   ORT_THROW_IF_ERROR(sparse_tensor.UseBlockSparseIndices(ind_shape, indices_data));
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(indices_shape);
+  ORT_UNUSED_PARAMETER(indices_shape_len);
+  ORT_UNUSED_PARAMETER(indices_data);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -456,6 +512,9 @@ ORT_API_STATUS_IMPL(OrtApis::GetSparseTensorFormat, _In_ const OrtValue* ort_val
   *out = static_cast<OrtSparseFormat>(sparse_tensor.Format());
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(out);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -472,6 +531,9 @@ ORT_API_STATUS_IMPL(OrtApis::GetSparseTensorValues, _In_ const OrtValue* ort_val
   *out = values.DataRaw();
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(ort_value);
+  ORT_UNUSED_PARAMETER(out);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
   API_IMPL_END
@@ -908,6 +970,26 @@ ORT_API(void, OrtApis::ClearBoundOutputs, _Inout_ OrtIoBinding* binding_ptr) {
   binding_ptr->binding_->ClearOutputs();
 }
 
+ORT_API_STATUS_IMPL(OrtApis::SynchronizeBoundInputs, _Inout_ OrtIoBinding* binding_ptr) {
+  API_IMPL_BEGIN
+  auto sync_status = binding_ptr->binding_->SynchronizeInputs();
+  if (sync_status.IsOK()) {
+    return nullptr;  // a null status tells the caller the call succeeded
+  }
+  return ToOrtStatus(sync_status);  // otherwise hand the failure back as an OrtStatus
+  API_IMPL_END
+}
+
+ORT_API_STATUS_IMPL(OrtApis::SynchronizeBoundOutputs, _Inout_ OrtIoBinding* binding_ptr) {
+  API_IMPL_BEGIN
+  auto sync_status = binding_ptr->binding_->SynchronizeOutputs();
+  if (sync_status.IsOK()) {
+    return nullptr;  // a null status tells the caller the call succeeded
+  }
+  return ToOrtStatus(sync_status);  // otherwise hand the failure back as an OrtStatus
+  API_IMPL_END
+}
+
 ORT_API_STATUS_IMPL(OrtApis::IsTensor, _In_ const OrtValue* value, _Out_ int* out) {
   auto v = reinterpret_cast<const ::OrtValue*>(value);
   *out = v->IsTensor() ? 1 : 0;
@@ -926,6 +1008,9 @@ ORT_API_STATUS_IMPL(OrtApis::IsSparseTensor, _In_ const OrtValue* value, _Out_ i
   *out = v->IsSparseTensor() ? 1 : 0;
   return nullptr;
 #else
+  ORT_UNUSED_PARAMETER(value);
+  ORT_UNUSED_PARAMETER(out);
+
   return OrtApis::CreateStatus(ORT_FAIL, "SparseTensor is not supported in this build.");
 #endif
 }
@@ -933,7 +1018,7 @@ ORT_API_STATUS_IMPL(OrtApis::IsSparseTensor, _In_ const OrtValue* value, _Out_ i
 ORT_API_STATUS_IMPL(OrtApis::GetTensorMutableData, _Inout_ OrtValue* value, _Outptr_ void** output) {
   TENSOR_READWRITE_API_BEGIN
   // Uncomment when WinML fixed their code
-  //if (tensor->IsDataTypeString()) {
+  // if (tensor->IsDataTypeString()) {
   //  return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "this API does not support strings");
   //}
   *output = tensor->MutableDataRaw();
@@ -949,7 +1034,7 @@ ORT_API_STATUS_IMPL(OrtApis::FillStringTensor, _Inout_ OrtValue* value, _In_ con
     return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "input array doesn't equal tensor size");
   }
   for (size_t i = 0; i != len; ++i) {
-    //allocate and copy
+    // allocate and copy
     dst[i] = s[i];
   }
   return nullptr;
@@ -1875,9 +1960,9 @@ ORT_API_STATUS_IMPL(OrtApis::GetOpaqueValue, _In_ const char* domain_name, _In_
 ORT_API_STATUS_IMPL(OrtApis::GetAvailableProviders, _Outptr_ char*** out_ptr,
                     _In_ int* providers_length) {
   API_IMPL_BEGIN
-  //TODO: there is no need to manually malloc/free these memory, it is insecure
-  //and inefficient. Instead, the implementation could scan the array twice,
-  //and use a single string object to hold all the names.
+  // TODO: there is no need to manually malloc/free these memory, it is insecure
+  // and inefficient. Instead, the implementation could scan the array twice,
+  // and use a single string object to hold all the names.
   const size_t MAX_LEN = 30;
   const auto& available_providers = GetAvailableExecutionProviderNames();
   const int available_count = gsl::narrow<int>(available_providers.size());
@@ -1902,7 +1987,7 @@ ORT_API_STATUS_IMPL(OrtApis::GetAvailableProviders, _Outptr_ char*** out_ptr,
   return nullptr;
 }
 
-//TODO: we don't really need the second parameter
+// TODO: we don't really need the second parameter
 ORT_API_STATUS_IMPL(OrtApis::ReleaseAvailableProviders, _In_ char** ptr,
                     _In_ int providers_length) {
   API_IMPL_BEGIN
@@ -2420,6 +2505,8 @@ static constexpr OrtApi ort_api_1_to_10 = {
     &OrtApis::SetGlobalCustomCreateThreadFn,
     &OrtApis::SetGlobalCustomThreadCreationOptions,
     &OrtApis::SetGlobalCustomJoinThreadFn,
+    &OrtApis::SynchronizeBoundInputs,
+    &OrtApis::SynchronizeBoundOutputs
 };
 
 // Asserts to do a some checks to ensure older Versions of the OrtApi never change (will detect an addition or deletion but not if they cancel out each other)
diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h
index 5053b8ac5f..2345e00ed3 100644
--- a/onnxruntime/core/session/ort_apis.h
+++ b/onnxruntime/core/session/ort_apis.h
@@ -325,5 +325,6 @@ ORT_API_STATUS_IMPL(SessionOptionsSetCustomJoinThreadFn, _Inout_ OrtSessionOptio
 ORT_API_STATUS_IMPL(SetGlobalCustomCreateThreadFn, _Inout_ OrtThreadingOptions* tp_options, _In_ OrtCustomCreateThreadFn ort_custom_create_thread_fn);
 ORT_API_STATUS_IMPL(SetGlobalCustomThreadCreationOptions, _Inout_ OrtThreadingOptions* tp_options, _In_ void* ort_custom_thread_creation_options);
 ORT_API_STATUS_IMPL(SetGlobalCustomJoinThreadFn, _Inout_ OrtThreadingOptions* tp_options, _In_ OrtCustomJoinThreadFn ort_custom_join_thread_fn);
-
+ORT_API_STATUS_IMPL(SynchronizeBoundInputs, _Inout_ OrtIoBinding* binding_ptr);
+ORT_API_STATUS_IMPL(SynchronizeBoundOutputs, _Inout_ OrtIoBinding* binding_ptr);
 }  // namespace OrtApis
diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc
index 36525126df..d5435bdf44 100644
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@@ -282,7 +282,7 @@ struct ProviderHostImpl : ProviderHost {
 
   // TensorShape (direct)
   void TensorShape__operator_assign(TensorShape* p, const TensorShape& other) override { p->TensorShape::operator=(other); }
-  void TensorShape__operator_move_assign(TensorShape* p, TensorShape&& other) override { p->TensorShape::operator=(std::move(other)); }
+  void TensorShape__operator_move_assign(TensorShape* p, TensorShape&& other) noexcept override { p->TensorShape::operator=(std::move(other)); }
   void TensorShape__Allocate(TensorShape* p, size_t size) override { p->TensorShape::Allocate(size); }
   int64_t TensorShape__SizeHelper(const TensorShape* p, size_t start, size_t end) override { return p->TensorShape::SizeHelper(start, end); }
   std::string TensorShape__ToString(const TensorShape* p) override { return p->TensorShape::ToString(); }
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index eb4515f1f7..c09d885a23 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -650,6 +650,7 @@ void Im2col<T, StorageOrder::NHWC>::operator()(
   }
 }
 
+template struct Im2col<int8_t, StorageOrder::NHWC>;
 template struct Im2col<uint8_t, StorageOrder::NHWC>;
 
 template <>
diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index 0e92984e01..f753ccff31 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -357,6 +357,7 @@ class InferenceSession(Session):
                                                                         provider_options,
                                                                         available_providers)
         if providers == [] and len(available_providers) > 1:
+            self.disable_fallback()
             raise ValueError("This ORT build has {} enabled. ".format(available_providers) +
                              "Since ORT 1.9, you are required to explicitly set " +
                              "the providers parameter when instantiating InferenceSession. For example, "
@@ -447,6 +448,9 @@ class IOBinding:
         '''
         self._iobinding.bind_ortvalue_input(name, ortvalue._ortvalue)
 
+    def synchronize_inputs(self):
+        self._iobinding.synchronize_inputs()
+
     def bind_output(self, name, device_type='cpu', device_id=0, element_type=None, shape=None, buffer_ptr=None):
         '''
         :param name: output name
@@ -482,6 +486,9 @@ class IOBinding:
         '''
         self._iobinding.bind_ortvalue_output(name, ortvalue._ortvalue)
 
+    def synchronize_outputs(self):
+        self._iobinding.synchronize_outputs()
+
     def get_outputs(self):
         '''
         Returns the output OrtValues from the Run() that preceded the call.
diff --git a/onnxruntime/python/onnxruntime_pybind_iobinding.cc b/onnxruntime/python/onnxruntime_pybind_iobinding.cc
index 852dd2a136..fb88844f4f 100644
--- a/onnxruntime/python/onnxruntime_pybind_iobinding.cc
+++ b/onnxruntime/python/onnxruntime_pybind_iobinding.cc
@@ -87,6 +87,12 @@ void addIoBindingMethods(pybind11::module& m) {
           throw std::runtime_error("Error when binding input: " + status.ErrorMessage());
         }
       })
+      .def("synchronize_inputs", [](SessionIOBinding* io_binding) -> void {
+        auto status = io_binding->Get()->SynchronizeInputs();
+        if (!status.IsOK()) {
+          throw std::runtime_error("Error when synchronizing bound inputs: " + status.ErrorMessage());
+        }
+      })
       // This binds output to a pre-allocated memory as a Tensor
       .def("bind_output", [](SessionIOBinding* io_binding, const std::string& name, const OrtDevice& device, py::object& element_type, std::vector<int64_t>& shape, int64_t data_ptr) -> void {
         ORT_ENFORCE(data_ptr != 0, "Pointer to data memory is not valid");
@@ -140,6 +146,12 @@ void addIoBindingMethods(pybind11::module& m) {
           throw std::runtime_error("Error when binding output: " + status.ErrorMessage());
         }
       })
+      .def("synchronize_outputs", [](SessionIOBinding* io_binding) -> void {
+        auto status = io_binding->Get()->SynchronizeOutputs();
+        if (!status.IsOK()) {
+          throw std::runtime_error("Error when synchronizing bound outputs: " + status.ErrorMessage());
+        }
+      })
       .def("clear_binding_inputs", [](SessionIOBinding* io_binding) -> void {
         io_binding->Get()->ClearInputs();
       })
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index ea198fb45d..f7cc9d37c0 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -497,15 +497,16 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
               ORT_THROW("Invalid TensorRT EP option: ", option.first);
             }
           }
-          return onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params)->CreateProvider();
+          if (std::shared_ptr<IExecutionProviderFactory> tensorrt_provider_factory = onnxruntime::CreateExecutionProviderFactory_Tensorrt(&params)) {
+            return tensorrt_provider_factory->CreateProvider();
+          }
         } else {
-          return onnxruntime::CreateExecutionProviderFactory_Tensorrt(cuda_device_id)->CreateProvider();
+          if (std::shared_ptr<IExecutionProviderFactory> tensorrt_provider_factory = onnxruntime::CreateExecutionProviderFactory_Tensorrt(cuda_device_id)) {
+            return tensorrt_provider_factory->CreateProvider();
+          }
         }
-    } else {
-      if (!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
-        ORT_THROW("CUDA_PATH is set but CUDA wasn't able to be loaded. Please install the correct version of CUDA and cuDNN as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements) as well as TensorRT as mentioned in the TensorRT requirements page (https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported.");
-      }
     }
+    LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please reference https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#requirements to ensure all dependencies are met.";
 #endif
   } else if (type == kMIGraphXExecutionProvider) {
 #ifdef USE_MIGRAPHX
@@ -531,6 +532,7 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
         }
       }
     }
+    LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please reference https://onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements to ensure all dependencies are met.";
 #endif
   } else if (type == kRocmExecutionProvider) {
 #ifdef USE_ROCM
@@ -604,10 +606,18 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
 
       
     }
-    auto p = onnxruntime::CreateExecutionProviderFactory_OpenVINO(&params)->CreateProvider();
-    // Reset global variables config to avoid it being accidentally passed on to the next session
-    openvino_device_type.clear();
-    return p;
+    if (std::shared_ptr<IExecutionProviderFactory> openvino_provider_factory = onnxruntime::CreateExecutionProviderFactory_OpenVINO(&params)) {
+      auto p = openvino_provider_factory->CreateProvider();
+      // Reset global variables config to avoid it being accidentally passed on to the next session
+      openvino_device_type.clear();
+      return p;
+    } else {
+      if (!Env::Default().GetEnvironmentVar("INTEL_OPENVINO_DIR").empty()) {
+        ORT_THROW("INTEL_OPENVINO_DIR is set but OpenVINO library wasn't able to be loaded. Please install a supported version of OpenVINO as mentioned in the requirements page (https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements), ensure dependency libraries are in the PATH and your hardware is supported.");
+      } else {
+        LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please reference https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met.";
+      }
+    }
 #endif
   } else if (type == kNupharExecutionProvider) {
 #if USE_NUPHAR
diff --git a/onnxruntime/python/tools/onnxruntime_test.py b/onnxruntime/python/tools/onnxruntime_test.py
index c1f809074c..0d4cc22be3 100644
--- a/onnxruntime/python/tools/onnxruntime_test.py
+++ b/onnxruntime/python/tools/onnxruntime_test.py
@@ -71,7 +71,7 @@ def run_model(model_path,
         sess_options.enable_profiling = True
         sess_options.profile_file_prefix = os.path.basename(model_path)
 
-    sess = onnxrt.InferenceSession(model_path, sess_options)
+    sess = onnxrt.InferenceSession(model_path, sess_options=sess_options, providers=onnxrt.get_available_providers())
     meta = sess.get_modelmeta()
 
     if not feeds:
diff --git a/onnxruntime/python/tools/quantization/onnx_quantizer.py b/onnxruntime/python/tools/quantization/onnx_quantizer.py
index 38b64a171c..aa0ee6156f 100644
--- a/onnxruntime/python/tools/quantization/onnx_quantizer.py
+++ b/onnxruntime/python/tools/quantization/onnx_quantizer.py
@@ -42,10 +42,13 @@ class ONNXQuantizer:
         self.static = static  # use static quantization for inputs.
         self.fuse_dynamic_quant = False
         self.enable_subgraph_quantization = 'EnableSubgraph' in self.extra_options and self.extra_options['EnableSubgraph']
+        self.force_quantize_no_input_check = 'ForceQuantizeNoInputCheck' in self.extra_options and self.extra_options['ForceQuantizeNoInputCheck']
         self.q_matmul_const_b_only = 'MatMulConstBOnly' in self.extra_options and self.extra_options['MatMulConstBOnly']
         is_weight_int8 = weight_qType == QuantType.QInt8
         self.is_weight_symmetric = is_weight_int8 if 'WeightSymmetric' not in self.extra_options else self.extra_options['WeightSymmetric']
         self.is_activation_symmetric = False if 'ActivationSymmetric' not in self.extra_options else self.extra_options['ActivationSymmetric']
+        self.op_types_support_per_channel_quantization = [] if 'OpTypesSupportPerChannelQuantization' not in extra_options \
+                                                        else extra_options['OpTypesSupportPerChannelQuantization']
 
         self.input_qType = onnx_proto.TensorProto.INT8 if input_qType == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
         self.weight_qType = onnx_proto.TensorProto.INT8 if weight_qType == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
@@ -171,7 +174,7 @@ class ONNXQuantizer:
 
     def remove_fake_quantized_nodes(self):
         '''
-            Detect and remove the quantize/dequantizelinear node pairs(fake quantized nodes in Quantization-Aware training) 
+            Detect and remove the quantize/dequantizelinear node pairs(fake quantized nodes in Quantization-Aware training)
             and reconnect and update the nodes.
         '''
         nodes_to_remove = []
@@ -294,8 +297,11 @@ class ONNXQuantizer:
         self.model.graph().ClearField('node')
         self.model.graph().node.extend(self.new_nodes)
 
-        # Remove ununsed weights from graph.
-        self.remove_quantized_weights()
+        # Remove unused initializers from graph, starting from the top level graph.
+        if self.parent is None:
+            _, initializers_not_found = ONNXQuantizer.CleanGraphInitializers(self.model.graph(), self.model.model)
+            if len(initializers_not_found) > 0:
+                raise RuntimeError("Invalid model with unknown initializers/tensors." + str(initializers_not_found))
 
         self.model.model.producer_name = __producer__
         self.model.model.producer_version = __version__
@@ -542,6 +548,13 @@ class ONNXQuantizer:
         self.quantized_value_map[input_name] = QuantizedValue(input_name, output_name, scale_name, zp_name, qType)
         return nodes + [qlinear_node]
 
+    def find_quantized_value(self, input_name):
+        '''Look up input_name in this quantizer's map, falling back to ancestor quantizers; None if absent.'''
+        if input_name not in self.quantized_value_map:
+            parent = self.parent
+            return None if parent is None else parent.find_quantized_value(input_name)
+        return self.quantized_value_map[input_name]
+
     def quantize_bias_static(self, bias_name, input_name, weight_name):
         '''
         Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale
@@ -699,7 +712,7 @@ class ONNXQuantizer:
             :param weight: TensorProto initializer
             :param qType: type to quantize to
             :param keep_float_weight: Whether to quantize the weight. In some cases, we only want to qunatize scale and zero point.
-                                      If keep_float_weight is False, quantize the weight, or don't quantize the weight. 
+                                      If keep_float_weight is False, quantize the weight, or don't quantize the weight.
             :return: quantized weight name, zero point name, scale name
         '''
         # Find if this input is already quantized
@@ -733,7 +746,7 @@ class ONNXQuantizer:
 
         return q_weight_name, zp_name, scale_name
 
-    def quantize_weight_per_channel(self, weight_name, weight_qType, channel_axis, reduce_range=True, 
+    def quantize_weight_per_channel(self, weight_name, weight_qType, channel_axis, reduce_range=True,
                                     keep_float_weight=False):
         # Find if this input is already quantized
         if weight_name in self.quantized_value_map:
@@ -857,23 +870,74 @@ class ONNXQuantizer:
 
         return quantization_params
 
-    def remove_quantized_weights(self):
-        ''' Remove the weights which are already quantized from graph initializer list.
-            This function assumes that after quantization, all nodes that previously use a weight:
-                - use output from DequantizeLinear as input if they do not support quantization.
-                - use quantized weight if they support quantization.
+
+    @staticmethod
+    def CleanGraphInitializers(graph, model):
         '''
-        for tensor_name, quant_value in self.quantized_value_map.items():
-            if quant_value.value_type == QuantizedValueType.Initializer:
-                weight = self.model.get_initializer(tensor_name)
+        Remove unused initializers (including those left over from quantization) from graph and its subgraphs.
+            Returns the cleaned graph and a dict keyed by the tensor names consumed by this graph or its
+            subgraphs that are neither produced, initialized, nor declared as inputs in them.
+        '''
+        requesting_tensor_names = {}
+        requesting_tensor_names.update({input_name: 1 for node in graph.node for input_name in node.input if input_name})
+        requesting_tensor_names.update({g_out.name: 1 for g_out in graph.output if g_out.name})
 
-                if weight is not None:
-                    self.model.initializer().remove(weight)
+        new_nodes = []
+        for node in graph.node:
+            node_2_add = node
+            graph_attrs = [attr for attr in node.attribute if attr.type == onnx.AttributeProto.GRAPH or attr.type == onnx.AttributeProto.GRAPHS]
+            if len(graph_attrs) > 0:
+                kwargs = {}
+                for attr in node.attribute:
+                    kv = {}
+                    if attr.type == onnx.AttributeProto.GRAPH:
+                        cleaned_sub_graph, sub_requesting_tensor_names = ONNXQuantizer.CleanGraphInitializers(attr.g, model)
+                        kv = {attr.name: cleaned_sub_graph}
+                        requesting_tensor_names.update({gn: 1 for gn in sub_requesting_tensor_names})
+                    elif attr.type == onnx.AttributeProto.GRAPHS:
+                        cleaned_graphs = []
+                        for subgraph in attr.graphs:
+                            cleaned_sub_graph, sub_requesting_tensor_names = ONNXQuantizer.CleanGraphInitializers(subgraph, model)
+                            cleaned_graphs.append(cleaned_sub_graph)
+                            requesting_tensor_names.update({gn: 1 for gn in sub_requesting_tensor_names})
+                        kv = {attr.name: cleaned_graphs}
+                    else:
+                        kv = attribute_to_kwarg(attr)
+                    kwargs.update(kv)
+                node_2_add = onnx.helper.make_node(node.op_type, node.input, node.output, name=node.name, **kwargs)
+            new_nodes.append(node_2_add)
 
-                    # Remove from graph.input
-                    try:
-                        weight_input = next(val for val in self.model.graph().input if val.name == tensor_name)
-                        self.model.graph().input.remove(weight_input)
-                    except StopIteration:
-                        if self.model.ir_version() < 4:
-                            print("Warning: invalid weight name {} found in the graph (not a graph input)".format(tensor_name))
+        graph.ClearField('node')
+        graph.node.extend(new_nodes)
+
+        generated_names = {}
+        generated_names.update({output_name: 1 for node in graph.node for output_name in node.output if output_name})
+        for gn in generated_names:
+            requesting_tensor_names.pop(gn, None)
+
+        name_to_input = {}
+        for graph_input in graph.input:
+            name_to_input[graph_input.name] = graph_input
+
+        unused_ini_tensors = []
+        for ini_tensor in graph.initializer:
+            if ini_tensor.name in requesting_tensor_names:
+                requesting_tensor_names.pop(ini_tensor.name, None)
+            else:
+                # only mark it here; removing it while iterating graph.initializer would cause misbehavior
+                unused_ini_tensors.append(ini_tensor)
+
+        for ini_tensor in unused_ini_tensors:
+            graph.initializer.remove(ini_tensor)
+            if ini_tensor.name in name_to_input:
+                graph.input.remove(name_to_input[ini_tensor.name])
+            elif model.ir_version < 4:
+                # Before IR version 4 an initializer is expected to be listed as a graph input too,
+                # so an initializer without a matching input entry is reported as suspicious.
+                print("Warning: invalid weight name {} found in the graph (not a graph input)".format(ini_tensor.name))
+
+        # Names supplied by this graph's own inputs are not outstanding either.
+        for graph_input in graph.input:
+            requesting_tensor_names.pop(graph_input.name, None)
+
+        return graph, requesting_tensor_names
diff --git a/onnxruntime/python/tools/quantization/operators/direct_q8.py b/onnxruntime/python/tools/quantization/operators/direct_q8.py
index 255aba738c..9ec70436c7 100644
--- a/onnxruntime/python/tools/quantization/operators/direct_q8.py
+++ b/onnxruntime/python/tools/quantization/operators/direct_q8.py
@@ -1,6 +1,6 @@
 from .base_operator import QuantOperatorBase
 from .qdq_base_operator import QDQOperatorBase
-from ..quant_utils import QuantizedValue
+from ..quant_utils import QuantizedValue, QuantizedValueType
 
 # For operators that support 8bits operations directly, and output could
 # reuse input[0]'s type, zeropoint, scale; For example,Transpose, Reshape, etc.
@@ -11,21 +11,46 @@ class Direct8BitOp(QuantOperatorBase):
     def quantize(self):
         node = self.node
 
-        # Quantize when input[0] is quantized already. Otherwise keep it.
-        if node.input[0] not in self.quantizer.quantized_value_map:
+        if not self.quantizer.force_quantize_no_input_check:
+            # Keep backward compatibility
+            # Quantize when input[0] is quantized already. Otherwise keep it.
+            quantized_input_value = self.quantizer.find_quantized_value(node.input[0])
+            if quantized_input_value is None:
+                self.quantizer.new_nodes += [node]
+                return
+
+            quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
+                                                    quantized_input_value.scale_name, quantized_input_value.zp_name,
+                                                    quantized_input_value.value_type)
+            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
+
+            node.input[0] = quantized_input_value.q_name
+            node.output[0] = quantized_output_value.q_name
             self.quantizer.new_nodes += [node]
-            return
 
-        # Create an entry for output quantized value
-        quantized_input_value = self.quantizer.quantized_value_map[node.input[0]]
-        quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
-                                                quantized_input_value.scale_name, quantized_input_value.zp_name,
-                                                quantized_input_value.value_type)
-        self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
+        else:
+            # Force quantize those ops if possible; exclude the node explicitly if this is not what you want
+            if (not self.quantizer.is_valid_quantize_weight(node.input[0])):
+                super().quantize()
+                return
+
+            (quantized_input_names, zero_point_names, scale_names, nodes) = \
+                self.quantizer.quantize_inputs(node, [0])
+            if quantized_input_names is None:
+                return super().quantize()
+
+            # Create an entry for output quantized value
+            quantized_output_value = QuantizedValue(node.output[0], node.output[0] + "_quantized",
+                                                    scale_names[0], zero_point_names[0],
+                                                    QuantizedValueType.Input)
+            self.quantizer.quantized_value_map[node.output[0]] = quantized_output_value
+
+            node.input[0] = quantized_input_names[0]
+            node.output[0] = quantized_output_value.q_name
+            nodes.append(node)
+
+            self.quantizer.new_nodes += nodes
 
-        node.input[0] = quantized_input_value.q_name
-        node.output[0] = quantized_output_value.q_name
-        self.quantizer.new_nodes += [node]
 
 
 class QDQDirect8BitOp(QDQOperatorBase):
diff --git a/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py b/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py
index f8f5546b15..ebe3b7c71a 100644
--- a/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py
+++ b/onnxruntime/python/tools/quantization/operators/qdq_base_operator.py
@@ -19,4 +19,10 @@ class QDQOperatorBase:
             nodes_to_iterate = itertools.chain(node.input, node.output)
 
         for tensor_name in nodes_to_iterate:
-            self.quantizer.quantize_tensor(tensor_name)
+            if self.quantizer.is_per_channel():
+                if node.op_type in self.quantizer.op_types_support_per_channel_quantization :
+                    self.quantizer.quantize_tensor_per_channel(tensor_name, self.quantizer.qdq_channel_axis)
+                else:
+                    self.quantizer.quantize_tensor(tensor_name)
+            else:
+                self.quantizer.quantize_tensor(tensor_name)
diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py
index 839ee60b09..423e8d5c8d 100644
--- a/onnxruntime/python/tools/quantization/qdq_quantizer.py
+++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -51,6 +51,15 @@ class QDQQuantizer(ONNXQuantizer):
         self.add_qdq_pair_to_weight = False if 'AddQDQPairToWeight' not in extra_options \
                                         else extra_options['AddQDQPairToWeight'] 
 
+        # The default behavior is that multiple nodes can share a QDQ pair as their inputs. 
+        # In TRT, a QDQ pair can't be shared between nodes, so enabling DedicatedQDQPair creates a dedicated QDQ pair for each node.
+        self.dedicated_qdq_pair = False if 'DedicatedQDQPair' not in extra_options else extra_options['DedicatedQDQPair'] 
+        if self.dedicated_qdq_pair:
+            self.tensor_to_its_receiving_nodes = {}
+
+        # Channel axis when per_channel is True
+        self.qdq_channel_axis = 0 if 'QDQChannelAxis' not in extra_options else extra_options['QDQChannelAxis']
+
     def quantize_tensor(self, tensor_name):
         weight = find_by_name(tensor_name, self.model.initializer())
         if weight is not None:
@@ -91,6 +100,14 @@ class QDQQuantizer(ONNXQuantizer):
         self.model.remove_nodes(self.nodes_to_remove)
 
     def quantize_model(self):
+        if self.dedicated_qdq_pair:
+            for node in self.model.nodes():
+                if self.should_quantize(node):
+                    for tensor_name in node.input:
+                        if tensor_name not in self.tensor_to_its_receiving_nodes:
+                            self.tensor_to_its_receiving_nodes[tensor_name] = []
+                        self.tensor_to_its_receiving_nodes[tensor_name].append(node)
+
         for node in self.model.nodes():
             if self.should_quantize(node):
                 op_quantizer = CreateQDQQuantizer(self, node)
@@ -101,7 +118,7 @@ class QDQQuantizer(ONNXQuantizer):
         self.quantize_bias_tensors()
         self.remove_nodes()
         if not self.add_qdq_pair_to_weight:
-            self.remove_quantized_weights()
+            ONNXQuantizer.CleanGraphInitializers(self.model.graph(), self.model.model)
 
         self.model.model.producer_name = __producer__
         self.model.model.producer_version = __version__
@@ -156,30 +173,55 @@ class QDQQuantizer(ONNXQuantizer):
                         "In static mode quantization params for inputs and outputs of nodes to be quantized are required."
                         .format(tensor_name))
 
-                q_input = tensor_name
-                q_output = tensor_name + "_QuantizeLinear"
-                dq_input = q_output
-                dq_output = tensor_name + "_DequantizeLinear"
-                if self.model.is_graph_output(tensor_name):
-                    q_input = tensor_name + "_QuantizeLinearInput"
-                    dq_output = tensor_name
-                    self.model.replace_output_of_all_nodes(tensor_name, q_input)
+                if self.dedicated_qdq_pair and tensor_name in self.tensor_to_its_receiving_nodes and len(self.tensor_to_its_receiving_nodes[tensor_name]) > 1:
+                    num_dedicated_qdq_pair = len(self.tensor_to_its_receiving_nodes[tensor_name])
+                    for i in range(num_dedicated_qdq_pair):
+                        postfix = str(i+1)
+                        q_input = tensor_name
+                        q_output = tensor_name + "_QuantizeLinear_" + postfix 
+                        dq_input = q_output
+                        dq_output = tensor_name + "_DequantizeLinear_" + postfix
+                        quant_node_name = tensor_name + "_QuantizeLinear_" + postfix
+                        dequant_node_name = tensor_name + "_DequantizeLinear_" + postfix
+                        qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
+                                                             [q_output], quant_node_name)
+                        dequant_node = onnx.helper.make_node("DequantizeLinear",
+                                                             [dq_input, scale_name, zp_name],
+                                                             [dq_output],
+                                                             dequant_node_name)
+                        self.model.add_nodes([qlinear_node, dequant_node])
+
+                        node = self.tensor_to_its_receiving_nodes[tensor_name][i]
+                        self.model.replace_node_input(node, tensor_name, dq_output)
+
+                    quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
+                                                     QuantizedValueType.Input)
+                    self.quantized_value_map[tensor_name] = quantized_value
                 else:
-                    self.model.replace_input_of_all_nodes(tensor_name, dq_output)
+                    q_input = tensor_name
+                    q_output = tensor_name + "_QuantizeLinear"
+                    dq_input = q_output
+                    dq_output = tensor_name + "_DequantizeLinear"
+                    if self.model.is_graph_output(tensor_name):
+                        q_input = tensor_name + "_QuantizeLinearInput"
+                        dq_output = tensor_name
+                        self.model.replace_output_of_all_nodes(tensor_name, q_input)
+                    else:
+                        self.model.replace_input_of_all_nodes(tensor_name, dq_output)
 
-                quant_node_name = tensor_name + "_QuantizeLinear"
-                dequant_node_name = tensor_name + "_DequantizeLinear"
-                qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
-                                                     [q_output], quant_node_name)
-                dequant_node = onnx.helper.make_node("DequantizeLinear",
-                                                     [dq_input, scale_name, zp_name],
-                                                     [dq_output],
-                                                     dequant_node_name)
-                self.model.add_nodes([qlinear_node, dequant_node])
+                    quant_node_name = tensor_name + "_QuantizeLinear"
+                    dequant_node_name = tensor_name + "_DequantizeLinear"
+                    qlinear_node = onnx.helper.make_node("QuantizeLinear", [q_input, scale_name, zp_name],
+                                                         [q_output], quant_node_name)
+                    dequant_node = onnx.helper.make_node("DequantizeLinear",
+                                                         [dq_input, scale_name, zp_name],
+                                                         [dq_output],
+                                                         dequant_node_name)
+                    self.model.add_nodes([qlinear_node, dequant_node])
 
-                quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
-                                                 QuantizedValueType.Input)
-                self.quantized_value_map[tensor_name] = quantized_value
+                    quantized_value = QuantizedValue(tensor_name, dq_output, scale_name, zp_name,
+                                                     QuantizedValueType.Input)
+                    self.quantized_value_map[tensor_name] = quantized_value
 
     def quantize_bias_tensors(self):
         for bias_name, input_name, weight_name in self.bias_to_quantize:
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index e70a84c23b..bc0a57a425 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -188,12 +188,18 @@ def quantize_static(model_input,
                                           Dyanmic mode currently is supported. Will support more in future.
             DisableShapeInference = True/False : in dynamic quantize mode, shape inference is not must have
                                                  and if it cause some issue, you could disable it.
+            ForceQuantizeNoInputCheck = True/False : By default, some latent operators such as MaxPool and Transpose are not quantized
+                                                     if their input is not already quantized. Set to True to force such operators
+                                                     to always quantize their input and thus produce quantized output. This forced
+                                                     behavior can still be disabled per node via nodes_to_exclude.
             MatMulConstBOnly = True/False: Default is False. If enabled, only MatMul with const B will be quantized.
             AddQDQPairToWeight = True/False : Default is False which quantizes floating-point weight and feeds it to 
                                               soley inserted DeQuantizeLinear node. If True, it remains floating-point weight and 
                                               inserts both QuantizeLinear/DeQuantizeLinear nodes to weight.
             OpTypesToExcludeOutputQuantizatioin = list of op type : Default is []. If any op type is specified, it won't quantize  
                                                                     the output of ops with this specific op types.
+            DedicatedQDQPair = True/False : Default is False, in which case multiple nodes can share a single inserted QDQ pair as their input.
+                                            If True, an identical, dedicated QDQ pair is created for each node.
     '''
 
     mode = QuantizationMode.QLinearOps
@@ -283,6 +289,10 @@ def quantize_dynamic(model_input: Path,
                                           Dyanmic mode currently is supported. Will support more in future.
             DisableShapeInference = True/False : in dynamic quantize mode, shape inference is not must have
                                                  and if it cause some issue, you could disable it.
+            ForceQuantizeNoInputCheck = True/False : By default, some latent operators such as MaxPool and Transpose are not quantized
+                                                     if their input is not already quantized. Set to True to force such operators
+                                                     to always quantize their input and thus produce quantized output. This forced
+                                                     behavior can still be disabled per node via nodes_to_exclude.
             MatMulConstBOnly = True/False: Default is False. If enabled, only MatMul with const B will be quantized.
     '''
 
diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py
index c51cd65151..3628bd2ec9 100644
--- a/onnxruntime/python/tools/quantization/registry.py
+++ b/onnxruntime/python/tools/quantization/registry.py
@@ -20,6 +20,7 @@ from .operators.concat import QLinearConcat, QDQConcat
 
 CommonOpsRegistry = {
     "Gather": GatherQuant,
+    "Transpose" : Direct8BitOp,
     "EmbedLayerNormalization": EmbedLayerNormalizationQuant,
 }
 
@@ -45,7 +46,6 @@ QLinearOpsRegistry = {
     "Split": QSplit,
     "Pad": QPad,
     "Reshape": Direct8BitOp,
-    "Transpose" : Direct8BitOp,
     "Squeeze" : Direct8BitOp,
     "Unsqueeze" : Direct8BitOp,
     "Resize": QResize,
diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py
index 0d80efbc64..cd5d30638e 100755
--- a/onnxruntime/python/tools/symbolic_shape_infer.py
+++ b/onnxruntime/python/tools/symbolic_shape_infer.py
@@ -642,14 +642,18 @@ class SymbolicShapeInference:
         vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_dtype, new_shape))
 
     def _fuse_tensor_type(self, node, out_idx, dst_type, src_type):
-        ''' 
+        '''
         update dst_tensor_type to be compatible with src_tensor_type when dimension mismatches
         '''
         dst_tensor_type = dst_type.sequence_type.elem_type.tensor_type if is_sequence(
             dst_type) else dst_type.tensor_type
         src_tensor_type = src_type.sequence_type.elem_type.tensor_type if is_sequence(
             src_type) else src_type.tensor_type
-        assert dst_tensor_type.elem_type == src_tensor_type.elem_type
+        if dst_tensor_type.elem_type != src_tensor_type.elem_type:
+            node_id = node.name if node.name else node.op_type
+            raise ValueError(f"For node {node_id}, dst_tensor_type.elem_type != src_tensor_type.elem_type: "
+                             f"{onnx.onnx_pb.TensorProto.DataType.Name(dst_tensor_type.elem_type)} vs "
+                             f"{onnx.onnx_pb.TensorProto.DataType.Name(src_tensor_type.elem_type)}")
         if dst_tensor_type.HasField('shape'):
             for di, ds in enumerate(zip(dst_tensor_type.shape.dim, src_tensor_type.shape.dim)):
                 if ds[0] != ds[1]:
diff --git a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc
new file mode 100644
index 0000000000..cdab3d0c82
--- /dev/null
+++ b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc
@@ -0,0 +1,412 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "gtest/gtest.h"
+#include "test/common/tensor_op_test_utils.h"
+#include "test/common/cuda_op_test_utils.h"
+#include "test/providers/provider_test_utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+static void RunAttentionTest(
+    const std::vector<float>& query_data,
+    const std::vector<float>& key_data,
+    const std::vector<float>& q_weights_data,
+    const std::vector<float>& kv_weights_data,
+    const std::vector<float>& bias_data,
+    const std::vector<float>& output_data,
+    int batch_size,
+    int sequence_length,
+    int kv_sequence_length,
+    int input_cache_sen_len,
+    int hidden_size,
+    int num_heads,
+    bool static_kv,
+    bool use_past,
+    bool has_layer_state,
+    bool has_key_padding_mask,
+    const std::vector<float>* new_key_cache = nullptr,
+    const std::vector<float>* new_value_cache = nullptr,
+    const std::vector<float>* key_cache = nullptr,
+    const std::vector<float>* value_cache = nullptr,
+    const std::initializer_list<bool>* key_padding_mask_data = nullptr,
+    bool use_float16 = false
+) {
+  int min_cuda_architecture = use_float16 ? 530 : 0;
+  bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
+  bool enable_cpu = false;
+
+  if (enable_cpu || enable_cuda) {
+    OpTester tester("DecoderAttention", 1, onnxruntime::kMSDomain);
+    tester.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(num_heads));
+
+    int head_size = hidden_size / num_heads;
+    std::vector<int64_t> query_dims = {sequence_length, batch_size, hidden_size};
+    std::vector<int64_t> key_dims = {kv_sequence_length, batch_size, hidden_size};
+    std::vector<int64_t> q_weights_dims = {hidden_size, hidden_size};
+    std::vector<int64_t> kv_weights_dims = {hidden_size, 2 * hidden_size};
+    std::vector<int64_t> bias_dims = {3 * hidden_size};
+    std::vector<int64_t> input_cache_dims = {batch_size, num_heads, input_cache_sen_len, head_size};
+
+    std::vector<int64_t> output_dims = {sequence_length, batch_size, hidden_size};
+
+    tester.AddInput<float>("query", query_dims, query_data);
+    tester.AddInput<float>("key", key_dims, key_data);
+    tester.AddInput<float>("q_weight", q_weights_dims, q_weights_data);
+    tester.AddInput<float>("kv_weight", kv_weights_dims, kv_weights_data);
+    tester.AddInput<float>("bias", bias_dims, bias_data);
+
+    int src_len = 0;
+    if (!has_layer_state || !use_past) {
+      if (!static_kv) {
+        src_len = sequence_length;
+      } else {
+        src_len = kv_sequence_length;
+      }
+    } else {
+      if (!static_kv) {
+        src_len = input_cache_sen_len + sequence_length;
+      } else {
+        src_len = input_cache_sen_len;
+      }
+    }
+
+    if (nullptr == key_padding_mask_data || !has_key_padding_mask) {
+      tester.AddOptionalInputEdge<bool>();
+    } else {
+      std::vector<int64_t> key_padding_mask_dims = {batch_size, src_len};
+      tester.AddInput<bool>("key_padding_mask", key_padding_mask_dims, *key_padding_mask_data);
+    }
+
+    if (!has_layer_state || !use_past) {
+      tester.AddOptionalInputEdge<float>();
+      tester.AddOptionalInputEdge<float>();
+    } else {
+      tester.AddInput<float>("key_cache", input_cache_dims, *key_cache);
+      tester.AddInput<float>("value_cache", input_cache_dims, *value_cache);
+    }
+    tester.AddInput<bool>("static_kv", {1}, {static_kv});
+    tester.AddInput<bool>("use_past", {1}, {use_past});
+    tester.AddInput<bool>("has_layer_state", {1}, {has_layer_state});
+    tester.AddInput<bool>("has_key_padding_mask", {1}, {has_key_padding_mask});
+
+    tester.AddOutput<float>("output", output_dims, output_data);
+    if (has_layer_state) {
+      std::vector<int64_t> output_cache_dims = {batch_size, num_heads, src_len, head_size};
+      tester.AddOutput<float>("new_key_cache", output_cache_dims, *new_key_cache);
+      tester.AddOutput<float>("new_value_cache", output_cache_dims, *new_value_cache);
+    }
+
+    if (enable_cuda) {
+      std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+      execution_providers.push_back(DefaultCudaExecutionProvider());
+      tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+    }
+  }
+}
+
+
+TEST(DecoderAttentionTest, SelfAttentionNoStateNoCache) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      3.1495983600616455f, 0.10843668878078461f, 4.25f, 5.6499996185302734f,
+      3.9696791172027588f, 0.073143675923347473f, 4.2499995231628418f, 5.6499991416931152f};
+
+  //self-attn without cache
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, 0, hidden_size, number_of_heads,
+                   /*static_kv*/false, /*use_past*/false, /*has_layer_state*/false, /*has_key_padding_mask*/false);
+}
+
+TEST(DecoderAttentionTest, CrossAttentionNoStateNoCache) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      3.1495983600616455f, 0.10843668878078461f, 4.25f, 5.6499996185302734f,
+      3.9696791172027588f, 0.073143675923347473f, 4.2499995231628418f, 5.6499991416931152f};
+
+  //cross-attn without cache
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, 0, hidden_size, number_of_heads,
+                   /*static_kv*/true, /*use_past*/false, /*has_layer_state*/false, /*has_key_padding_mask*/false);
+}
+
+TEST(DecoderAttentionTest, SelfAttentionNoStateOutputCache) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      3.1495983600616455f, 0.10843668878078461f, 4.25f, 5.6499996185302734f,
+      3.9696791172027588f, 0.073143675923347473f, 4.2499995231628418f, 5.6499991416931152f};
+
+  std::vector<float> new_key_cache = {
+      3.2800f, 3.2400f, 0.2900f, -0.4000f, 2.5000f, 5.1600f, -0.5200f, -1.0000f};
+
+  std::vector<float> new_value_cache = {
+      8.6900f, -0.1300f, -4.0900f, 0.4200f, 4.2500f, 5.6500f, -0.1100f, 0.5700f};
+
+  //self-attn without input cache, emitting new key/value cache
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, 0, hidden_size, number_of_heads,
+                   /*static_kv*/false, /*use_past*/false, /*has_layer_state*/true, /*has_key_padding_mask*/false,
+                   &new_key_cache, &new_value_cache);
+}
+
+TEST(DecoderAttentionTest, CrossAttentionNoStateOutputCache) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      3.1495983600616455f, 0.10843668878078461f, 4.25f, 5.6499996185302734f,
+      3.9696791172027588f, 0.073143675923347473f, 4.2499995231628418f, 5.6499991416931152f};
+
+  std::vector<float> new_key_cache = {
+      3.2800f, 3.2400f, 0.2900f, -0.4000f, 2.5000f, 5.1600f, -0.5200f, -1.0000f};
+
+  std::vector<float> new_value_cache = {
+      8.6900f, -0.1300f, -4.0900f, 0.4200f, 4.2500f, 5.6500f, -0.1100f, 0.5700f};
+
+  //cross-attn without input cache, emitting new key/value cache
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, 0, hidden_size, number_of_heads,
+                   /*static_kv*/true, /*use_past*/false, /*has_layer_state*/true, /*has_key_padding_mask*/false,
+                   &new_key_cache, &new_value_cache);
+}
+
+TEST(DecoderAttentionTest, SelfAttentionWithCache) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int input_cache_sen_len = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      1.502f, 0.05172f, 4.25f, 5.6499996185302734f,
+      2.0621f, 0.037995f, 4.2499995231628418f, 5.6499991416931152f};
+
+  std::vector<float> key_cache = {
+      0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+
+  std::vector<float> value_cache = {
+      0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
+
+  std::vector<float> new_key_cache = {
+      0.0f, 0.0f, 0.0f, 0.0f, 3.2800f, 3.2400f, 0.2900f, -0.4000f,
+      0.0f, 0.0f, 0.0f, 0.0f, 2.5000f, 5.1600f, -0.5200f, -1.0000f};
+
+  std::vector<float> new_value_cache = {
+      0.0f, 0.0f, 0.0f, 0.0f, 8.6900f, -0.1300f, -4.0900f, 0.4200f,
+      0.0f, 0.0f, 0.0f, 0.0f, 4.2500f, 5.6500f, -0.1100f, 0.5700f};
+
+  //self-attn with cache
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, input_cache_sen_len, hidden_size, number_of_heads,
+                   /*static_kv*/false, /*use_past*/true, /*has_layer_state*/true, /*has_key_padding_mask*/false,
+                   &new_key_cache, &new_value_cache, &key_cache, &value_cache);
+}
+
+TEST(DecoderAttentionTest, CrossAttentionWithCache) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int input_cache_sen_len = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      3.1495983600616455f, 0.10843668878078461f, 4.25f, 5.6499996185302734f,
+      3.9696791172027588f, 0.073143675923347473f, 4.2499995231628418f, 5.6499991416931152f};
+
+  std::vector<float> key_cache = {
+      3.2800f, 3.2400f, 0.2900f, -0.4000f, 2.5000f, 5.1600f, -0.5200f, -1.0000f};
+
+  std::vector<float> value_cache = {
+      8.6900f, -0.1300f, -4.0900f, 0.4200f, 4.2500f, 5.6500f, -0.1100f, 0.5700f};
+
+  std::vector<float> new_key_cache = {
+      3.2800f, 3.2400f, 0.2900f, -0.4000f, 2.5000f, 5.1600f, -0.5200f, -1.0000f};
+
+  std::vector<float> new_value_cache = {
+      8.6900f, -0.1300f, -4.0900f, 0.4200f, 4.2500f, 5.6500f, -0.1100f, 0.5700f};
+
+  //cross-attn with cache
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, input_cache_sen_len, hidden_size, number_of_heads,
+                   /*static_kv*/true, /*use_past*/true, /*has_layer_state*/true, /*has_key_padding_mask*/false,
+                   &new_key_cache, &new_value_cache, &key_cache, &value_cache);
+}
+
+TEST(DecoderAttentionTest, SelfAttentionNoStateNoCachePaddingMask) {
+  int batch_size = 1;
+  int sequence_length = 2;
+  int kv_sequence_length = 2;
+  int hidden_size = 4;
+  int number_of_heads = 2;
+
+  std::vector<float> input_data = {
+      0.8f, -0.5f, 0.0f, 1.f,
+      0.5f, 0.2f, 0.3f, -0.6f};
+
+  std::vector<float> q_weight_data = {
+      0.1f, -0.2f, 0.3f, 1.0f,
+      0.5f, 0.1f, 0.4f, 1.6f,
+      0.3f, 0.2f, 4.0f, 2.2f,
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> kv_weight_data = {
+      1.1f, 0.3f, 0.5f, 0.2f, 0.3f, -0.6f, 1.5f, 2.0f,
+      1.0f, 2.0f, 0.4f, 0.8f, 0.9f, 0.1f, -1.3f, 0.7f,
+      1.6f, 1.1f, 0.7f, 0.2f, 0.4f, 1.0f, 1.2f, 0.5f,
+      2.4f, 3.3f, 2.1f, 4.2f, 8.4f, 0.0f, 2.1f, 3.2f};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f, 0.5f, 0.7f, 0.2f, 1.2f, 0.5f, 0.4f, 0.3f, 1.2f};
+
+  std::vector<float> output_data = {
+      3.1495983600616455f, 0.10843668878078461f, 4.25f, 5.6499996185302734f,
+      3.9696791172027588f, 0.073143675923347473f, 4.2499995231628418f, 5.6499991416931152f};
+
+  std::initializer_list<bool> key_padding_mask_data = {false, false};
+
+  //self-attn without cache, with key padding mask
+  RunAttentionTest(input_data, input_data, q_weight_data, kv_weight_data, bias_data, output_data,
+                   batch_size, sequence_length, kv_sequence_length, 0, hidden_size, number_of_heads,
+                   /*static_kv*/false, /*use_past*/false, /*has_layer_state*/false, /*has_key_padding_mask*/true,
+                   nullptr, nullptr, nullptr, nullptr, &key_padding_mask_data);
+}
+
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/contrib_ops/nhwc_maxpool_op_test.cc b/onnxruntime/test/contrib_ops/nhwc_maxpool_op_test.cc
index 8c52de8373..0a74a68a80 100644
--- a/onnxruntime/test/contrib_ops/nhwc_maxpool_op_test.cc
+++ b/onnxruntime/test/contrib_ops/nhwc_maxpool_op_test.cc
@@ -2,10 +2,12 @@
 // Licensed under the MIT License.
 
 #include <algorithm>
+#include <random>
+
 #include "core/util/math.h"
+#include "core/mlas/inc/mlas.h"
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
-#include <random>
 
 namespace onnxruntime {
 namespace test {
@@ -70,7 +72,9 @@ class NhwcMaxPoolOpTester {
     Y_shape.push_back(batch_count);
     for (size_t n = 0; n < kernel_rank; n++) {
       Y_shape.push_back(((input_shape[n] + pads[n] + pads[kernel_rank + n]) -
-                         (dilations[n] * (kernel_shape_[n] - 1) + 1)) / strides[n] + 1);
+                         (dilations[n] * (kernel_shape_[n] - 1) + 1)) /
+                            strides[n] +
+                        1);
     }
     Y_shape.push_back(channels);
     Y_data.resize(ShapeSize(Y_shape));
@@ -87,7 +91,7 @@ class NhwcMaxPoolOpTester {
       std::vector<int64_t> d_output(kernel_rank, 0);
       std::vector<int64_t> d_kernel(kernel_rank, 0);
       do {
-        std::fill_n(Ydata, channels, static_cast<T>(0));
+        std::fill_n(Ydata, channels, std::numeric_limits<T>::lowest());
         do {
           int64_t input_offset = 0;
           bool is_padding = false;
@@ -163,7 +167,7 @@ class NhwcMaxPoolOpTester {
 };
 
 TEST(NhwcMaxPoolContribOpTest, MaxPool1D) {
-  for (int64_t channels = 1; channels < 64; channels++) {
+  for (int64_t channels = 1; channels < 94; channels++) {
     NhwcMaxPoolOpTester<uint8_t> test;
     test.GenerateRandomInput({1, 23, channels});
     test.SetKernelShape({5});
@@ -173,7 +177,7 @@ TEST(NhwcMaxPoolContribOpTest, MaxPool1D) {
 }
 
 TEST(NhwcMaxPoolContribOpTest, MaxPool2D) {
-  for (int64_t channels = 1; channels < 64; channels++) {
+  for (int64_t channels = 1; channels < 94; channels++) {
     NhwcMaxPoolOpTester<uint8_t> test;
     test.GenerateRandomInput({1, 15, 19, channels});
     test.SetKernelShape({3, 5});
@@ -183,7 +187,7 @@ TEST(NhwcMaxPoolContribOpTest, MaxPool2D) {
 }
 
 TEST(NhwcMaxPoolContribOpTest, MaxPool3D) {
-  for (int64_t channels = 1; channels < 64; channels++) {
+  for (int64_t channels = 1; channels < 94; channels++) {
     NhwcMaxPoolOpTester<uint8_t> test;
     test.GenerateRandomInput({1, 9, 13, 15, channels});
     test.SetKernelShape({2, 4, 6});
@@ -208,5 +212,51 @@ TEST(NhwcMaxPoolContribOpTest, MaxPoolDilations) {
   test.Run();
 }
 
+TEST(NhwcMaxPoolContribOpTest, MaxPool1D_S8) {  // int8_t counterpart of MaxPool1D
+  for (int64_t channel_count = 1; channel_count <= 93; ++channel_count) {
+    NhwcMaxPoolOpTester<int8_t> tester;
+    tester.GenerateRandomInput({1, 23, channel_count});  // N, W, C
+    tester.SetKernelShape({5});
+    tester.SetPads({2, 2});
+    tester.Run();
+  }
+}
+
+TEST(NhwcMaxPoolContribOpTest, MaxPool2D_S8) {  // int8_t counterpart of MaxPool2D
+  for (int64_t channel_count = 1; channel_count <= 93; ++channel_count) {
+    NhwcMaxPoolOpTester<int8_t> tester;
+    tester.GenerateRandomInput({1, 15, 19, channel_count});  // N, H, W, C
+    tester.SetKernelShape({3, 5});
+    tester.SetPads({1, 1, 1, 1});
+    tester.Run();
+  }
+}
+
+TEST(NhwcMaxPoolContribOpTest, MaxPool3D_S8) {  // int8_t counterpart of MaxPool3D
+  for (int64_t channel_count = 1; channel_count <= 93; ++channel_count) {
+    NhwcMaxPoolOpTester<int8_t> tester;
+    tester.GenerateRandomInput({1, 9, 13, 15, channel_count});  // N, D, H, W, C
+    tester.SetKernelShape({2, 4, 6});
+    tester.SetPads({0, 0, 0, 1, 1, 1});
+    tester.Run();
+  }
+}
+
+TEST(NhwcMaxPoolContribOpTest, MaxPoolStrides_S8) {  // int8_t pooling with stride 2 on both spatial axes
+  NhwcMaxPoolOpTester<int8_t> tester;
+  tester.GenerateRandomInput({4, 23, 19, 32});  // N, H, W, C
+  tester.SetKernelShape({3, 3});
+  tester.SetStrides({2, 2});
+  tester.Run();
+}
+
+TEST(NhwcMaxPoolContribOpTest, MaxPoolDilations_S8) {  // int8_t pooling with dilation 2 on both spatial axes
+  NhwcMaxPoolOpTester<int8_t> tester;
+  tester.GenerateRandomInput({4, 23, 19, 32});  // N, H, W, C
+  tester.SetKernelShape({3, 3});
+  tester.SetDilations({2, 2});
+  tester.Run();
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc
index 94fdef7b78..7fc057aa79 100644
--- a/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_global_average_pool_test.cc
@@ -5,22 +5,24 @@
 #include "test/common/tensor_op_test_utils.h"
 #include "test/providers/provider_test_utils.h"
 #include "core/providers/common.h"
+#include "core/mlas/inc/mlas.h"
 
 namespace onnxruntime {
 namespace test {
 
-static void CalculateGlobalAvgPoolU8(
-    const uint8_t* x, int64_t batch, int64_t hw, int64_t channel, bool channels_last, uint8_t* y,
+template <typename T8Bits>  // element type of x and y; its numeric_limits bound the output clamp
+static void CalculateGlobalAvgPool(
+    const T8Bits* x, int64_t batch, int64_t hw, int64_t channel, bool channels_last, T8Bits* y,
     int32_t x_zero_point, float x_scale, int32_t y_zero_point, float y_scale) {
   int32_t bias = -x_zero_point * gsl::narrow_cast<int32_t>(hw);
   int64_t stride_image = channels_last ? channel : 1;
   int64_t stride_channel = channels_last ? 1 : hw;
 
   for (int64_t b = 0; b < batch; ++b) {
-    const uint8_t* bx = x + b * hw * channel;
-    uint8_t* by = y + b * channel;
+    const T8Bits* bx = x + b * hw * channel;
+    T8Bits* by = y + b * channel;
     for (int64_t c = 0; c < channel; ++c) {
-      const uint8_t* ix = bx + c * stride_channel;
+      const T8Bits* ix = bx + c * stride_channel;
       int32_t sum = 0;
       for (int64_t i = 0; i < hw; ++i) {
         sum += static_cast<int32_t>(*ix);
@@ -29,40 +31,41 @@ static void CalculateGlobalAvgPoolU8(
       sum += bias;
       int32_t r = static_cast<int32_t>(std::nearbyintf(x_scale * sum / static_cast<float>(hw) / y_scale));
       r += y_zero_point;
-      r = std::min(255, r);
-      r = std::max(0, r);
-      by[c] = static_cast<uint8_t>(r);
+      r = std::min(static_cast<int32_t>(std::numeric_limits<T8Bits>::max()), r);  // saturate at the top of T8Bits
+      r = std::max(static_cast<int32_t>(std::numeric_limits<T8Bits>::lowest()), r);  // saturate at the bottom of T8Bits
+      by[c] = static_cast<T8Bits>(r);  // r is inside T8Bits' range after the two clamps
     }
   }
 }
 
-void RunQLinearGlobalAveragePoolU8(
+template <typename T8Bits = uint8_t>
+void RunQLinearGlobalAveragePool(
     bool channels_last, int64_t batch, int64_t channel, int64_t h, int64_t w,
-    uint8_t x_zero_point, float x_scale, uint8_t y_zero_point, float y_scale, int32_t seed = 0) {
+    T8Bits x_zero_point, float x_scale, T8Bits y_zero_point, float y_scale, int32_t seed = 0) {
   std::vector<int64_t> x_dims = channels_last ? std::vector<int64_t>{batch, h, w, channel} : std::vector<int64_t>{batch, channel, h, w};
   std::vector<int64_t> y_dims = channels_last ? std::vector<int64_t>{batch, 1, 1, channel} : std::vector<int64_t>{batch, channel, 1, 1};
   int64_t x_size = batch * channel * h * w;
   int64_t y_size = batch * channel;
-  std::vector<uint8_t> x_data((size_t)x_size);
-  std::vector<uint8_t> y_data((size_t)y_size);
+  std::vector<T8Bits> x_data((size_t)x_size);
+  std::vector<T8Bits> y_data((size_t)y_size);
 
   RandomValueGenerator random{seed ? optional<RandomValueGenerator::RandomSeedType>{seed} : optional<RandomValueGenerator::RandomSeedType>{}};
-  std::vector<int> tmp_x_data = random.Uniform<int32_t>(x_dims, 0, 255);
-  std::transform(tmp_x_data.begin(), tmp_x_data.end(), x_data.data(), [](int32_t v) -> uint8_t {
-    return static_cast<uint8_t>(v);
+  std::vector<int> tmp_x_data = random.Uniform<int32_t>(x_dims, std::numeric_limits<T8Bits>::lowest(), std::numeric_limits<T8Bits>::max());
+  std::transform(tmp_x_data.begin(), tmp_x_data.end(), x_data.data(), [](int32_t v) -> T8Bits {
+    return static_cast<T8Bits>(v);
   });
 
-  CalculateGlobalAvgPoolU8(x_data.data(), batch, h * w, channel, channels_last, y_data.data(),
+  CalculateGlobalAvgPool(x_data.data(), batch, h * w, channel, channels_last, y_data.data(),
                            x_zero_point, x_scale, y_zero_point, y_scale);
 
   OpTester test("QLinearGlobalAveragePool", 1, onnxruntime::kMSDomain);
   test.AddAttribute<int64_t>("channels_last", channels_last ? 1LL : 0LL);
-  test.AddInput<uint8_t>("X", x_dims, x_data);
+  test.AddInput<T8Bits>("X", x_dims, x_data);
   test.AddInput<float>("x_scale", {}, {x_scale});
-  test.AddInput<uint8_t>("x_zero_point", {}, {x_zero_point});
+  test.AddInput<T8Bits>("x_zero_point", {}, {x_zero_point});
   test.AddInput<float>("y_scale", {}, {y_scale});
-  test.AddInput<uint8_t>("y_zero_point", {}, {y_zero_point});
-  test.AddOutput<uint8_t>("Y", y_dims, y_data);
+  test.AddInput<T8Bits>("y_zero_point", {}, {y_zero_point});
+  test.AddOutput<T8Bits>("Y", y_dims, y_data);
 
   auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
     const OrtValue& ort_value = fetches[0];
@@ -75,7 +78,7 @@ void RunQLinearGlobalAveragePoolU8(
     ORT_ENFORCE(y_shape == output_tensor.Shape(),
                 "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
                     output_tensor.Shape().ToString() + "] for Y @" + provider_type);
-    auto* output = output_tensor.Data<uint8_t>();
+    auto* output = output_tensor.Data<T8Bits>();
     auto size = static_cast<int>(output_tensor.Shape().Size());
     for (int i = 0; i < size; ++i) {
       int diff = abs(y_data[i] - output[i]);
@@ -89,76 +92,149 @@ void RunQLinearGlobalAveragePoolU8(
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_1x1x32x32) {
-  RunQLinearGlobalAveragePoolU8(true, 1, 1, 32, 32, 128, 1.0, 64, 2.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 1, 1, 32, 32, 128, 1.0, 64, 2.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_1x32x32x1) {
-  RunQLinearGlobalAveragePoolU8(false, 1, 1, 32, 32, 128, 1.0, 64, 2.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 1, 1, 32, 32, 128, 1.0, 64, 2.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_1x256x8x8) {
-  RunQLinearGlobalAveragePoolU8(true, 1, 256, 8, 8, 128, 1.0, 64, 3.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 1, 256, 8, 8, 128, 1.0, 64, 3.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_1x8x8x256) {
-  RunQLinearGlobalAveragePoolU8(false, 1, 256, 8, 8, 128, 1.0, 64, 3.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 1, 256, 8, 8, 128, 1.0, 64, 3.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_1x255x7x7) {
-  RunQLinearGlobalAveragePoolU8(true, 1, 255, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 1, 255, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_1x7x7x255) {
-  RunQLinearGlobalAveragePoolU8(false, 1, 255, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 1, 255, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_1x255x8x8) {
-  RunQLinearGlobalAveragePoolU8(true, 1, 255, 8, 8, 128, 1.0, 128, 2.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 1, 255, 8, 8, 128, 1.0, 128, 2.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_1x8x8x255) {
-  RunQLinearGlobalAveragePoolU8(false, 1, 255, 8, 8, 128, 1.0, 128, 2.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 1, 255, 8, 8, 128, 1.0, 128, 2.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_1x256x7x7) {
-  RunQLinearGlobalAveragePoolU8(true, 1, 256, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 1, 256, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_1x7x7x256) {
-  RunQLinearGlobalAveragePoolU8(false, 1, 256, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 1, 256, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 // tests for BatchSize > 1
 TEST(QLinearGlobalAveragePool, Nhwc_3x256x8x8) {
-  RunQLinearGlobalAveragePoolU8(true, 3, 256, 8, 8, 128, 1.0, 64, 3.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 3, 256, 8, 8, 128, 1.0, 64, 3.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_3x8x8x256) {
-  RunQLinearGlobalAveragePoolU8(false, 3, 256, 8, 8, 128, 1.0, 64, 3.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 3, 256, 8, 8, 128, 1.0, 64, 3.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_3x255x7x7) {
-  RunQLinearGlobalAveragePoolU8(true, 3, 255, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 3, 255, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_3x7x7x255) {
-  RunQLinearGlobalAveragePoolU8(false, 3, 255, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 3, 255, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_3x255x8x8) {
-  RunQLinearGlobalAveragePoolU8(true, 3, 255, 8, 8, 128, 1.0, 128, 2.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 3, 255, 8, 8, 128, 1.0, 128, 2.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_3x8x8x255) {
-  RunQLinearGlobalAveragePoolU8(false, 3, 255, 8, 8, 128, 1.0, 128, 2.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 3, 255, 8, 8, 128, 1.0, 128, 2.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nhwc_3x256x7x7) {
-  RunQLinearGlobalAveragePoolU8(true, 3, 256, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(true, 3, 256, 7, 7, 128, 7.0, 128, 21.0);
 }
 
 TEST(QLinearGlobalAveragePool, Nchw_3x7x7x256) {
-  RunQLinearGlobalAveragePoolU8(false, 3, 256, 7, 7, 128, 7.0, 128, 21.0);
+  RunQLinearGlobalAveragePool<uint8_t>(false, 3, 256, 7, 7, 128, 7.0, 128, 21.0);
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_1x1x32x32_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 1, 1, 32, 32, 1, 1.0, -64, 2.0);  // batch=1 channel=1 h=32 w=32; x zp=1 scale=1.0; y zp=-64 scale=2.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_1x32x32x1_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 1, 1, 32, 32, 1, 1.0, 64, 2.0);  // batch=1 channel=1 h=32 w=32; x zp=1 scale=1.0; y zp=64 scale=2.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_1x256x8x8_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 1, 256, 8, 8, -1, 1.0, -64, 3.0);  // batch=1 channel=256 h=8 w=8; x zp=-1 scale=1.0; y zp=-64 scale=3.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_1x8x8x256_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 1, 256, 8, 8, -1, 1.0, 64, 3.0);  // batch=1 channel=256 h=8 w=8; x zp=-1 scale=1.0; y zp=64 scale=3.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_1x255x7x7_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 1, 255, 7, 7, 64, 7.0, 1, 21.0);  // batch=1 channel=255 h=7 w=7; x zp=64 scale=7.0; y zp=1 scale=21.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_1x7x7x255_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 1, 255, 7, 7, 64, 7.0, -1, 21.0);  // batch=1 channel=255 h=7 w=7; x zp=64 scale=7.0; y zp=-1 scale=21.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_1x255x8x8_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 1, 255, 8, 8, -64, 1.0, 1, 2.0);  // batch=1 channel=255 h=8 w=8; x zp=-64 scale=1.0; y zp=1 scale=2.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_1x8x8x255_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 1, 255, 8, 8, -64, 1.0, -1, 2.0);  // batch=1 channel=255 h=8 w=8; x zp=-64 scale=1.0; y zp=-1 scale=2.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_1x256x7x7_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 1, 256, 7, 7, -64, 7.0, 64, 21.0);  // batch=1 channel=256 h=7 w=7; x zp=-64 scale=7.0; y zp=64 scale=21.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_1x7x7x256_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 1, 256, 7, 7, 64, 7.0, -64, 21.0);  // batch=1 channel=256 h=7 w=7; x zp=64 scale=7.0; y zp=-64 scale=21.0
+}
+
+// tests for BatchSize > 1
+TEST(QLinearGlobalAveragePool, Nhwc_3x256x8x8_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 3, 256, 8, 8, 1, 1.0, 64, 3.0);  // batch=3 channel=256 h=8 w=8; x zp=1 scale=1.0; y zp=64 scale=3.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_3x8x8x256_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 3, 256, 8, 8, 1, 1.0, 64, 3.0);  // batch=3 channel=256 h=8 w=8; x zp=1 scale=1.0; y zp=64 scale=3.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_3x255x7x7_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 3, 255, 7, 7, 1, 7.0, -1, 21.0);  // batch=3 channel=255 h=7 w=7; x zp=1 scale=7.0; y zp=-1 scale=21.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_3x7x7x255_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 3, 255, 7, 7, 1, 7.0, -1, 21.0);  // batch=3 channel=255 h=7 w=7; x zp=1 scale=7.0; y zp=-1 scale=21.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_3x255x8x8_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 3, 255, 8, 8, 1, 1.0, -1, 2.0);  // batch=3 channel=255 h=8 w=8; x zp=1 scale=1.0; y zp=-1 scale=2.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_3x8x8x255_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 3, 255, 8, 8, -1, 1.0, 1, 2.0);  // batch=3 channel=255 h=8 w=8; x zp=-1 scale=1.0; y zp=1 scale=2.0
+}
+
+TEST(QLinearGlobalAveragePool, Nhwc_3x256x7x7_S8) {  // int8_t data, channels_last layout
+  RunQLinearGlobalAveragePool<int8_t>(true, 3, 256, 7, 7, -1, 7.0, 1, 21.0);  // batch=3 channel=256 h=7 w=7; x zp=-1 scale=7.0; y zp=1 scale=21.0
+}
+
+TEST(QLinearGlobalAveragePool, Nchw_3x7x7x256_S8) {  // int8_t data, channels-first layout
+  RunQLinearGlobalAveragePool<int8_t>(false, 3, 256, 7, 7, -1, 7.0, 1, 21.0);  // batch=3 channel=256 h=7 w=7; x zp=-1 scale=7.0; y zp=1 scale=21.0
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
index 85b8c998e7..335c5eaeaf 100644
--- a/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
+++ b/onnxruntime/test/contrib_ops/qlinear_pool_test.cc
@@ -6,6 +6,7 @@
 #include "test/common/tensor_op_test_utils.h"
 #include "test/providers/provider_test_utils.h"
 #include "core/providers/common.h"
+#include "core/mlas/inc/mlas.h"
 
 namespace onnxruntime {
 namespace test {
@@ -46,14 +47,15 @@ struct DimIterator {
   int64_t index_;
 };
 
+template <typename T8Bits>
 static void
-CalculateAvgPoolNchwU8(
-    uint8_t* x,
+CalculateAvgPoolNchw(
+    T8Bits* x,
     const std::vector<int64_t> x_dims,
-    const quantization::Params<uint8_t>& x_params,
-    uint8_t* y,
+    const quantization::Params<T8Bits>& x_params,
+    T8Bits* y,
     const std::vector<int64_t> y_dims,
-    const quantization::Params<uint8_t>& y_params,
+    const quantization::Params<T8Bits>& y_params,
     const std::vector<int64_t> kernel_shape,
     const std::vector<int64_t> strides,
     const std::vector<int64_t> pads,
@@ -73,8 +75,8 @@ CalculateAvgPoolNchwU8(
   int64_t x_step = std::accumulate(x_img_dims.begin(), x_img_dims.end(), 1LL, std::multiplies<int64_t>());
   for (int64_t b = 0; b < batch; ++b) {
     for (int64_t c = 0; c < channel; ++c) {
-      uint8_t* ybc = y + (b * channel + c) * y_step;
-      uint8_t* xbc = x + (b * channel + c) * x_step;
+      T8Bits* ybc = y + (b * channel + c) * y_step;
+      T8Bits* xbc = x + (b * channel + c) * x_step;
 
       DimIterator yit(y_img_dims);
       while (yit.has_next()) {
@@ -103,14 +105,15 @@ CalculateAvgPoolNchwU8(
           }
         }
         auto y_offset = yit.next();
-        auto y_u8 = QuantizeTestValue<uint8_t>(y_value_sum / count, y_params);
+        auto y_u8 = QuantizeTestValue<T8Bits>(y_value_sum / count, y_params);
         ybc[y_offset] = y_u8;
       }
     }
   }
 }
 
-void RunQLinearAveragePoolNchwU8(
+template <typename T8Bits = uint8_t>
+void RunQLinearAveragePoolNchw(
     const std::vector<int64_t> x_dims,
     const std::vector<int64_t> y_dims,
     const std::vector<int64_t> kernel_shape,
@@ -119,17 +122,19 @@ void RunQLinearAveragePoolNchwU8(
     const int64_t count_include_pad = 0) {
   auto run_test = [&](bool only_x_not_initializer, bool x_y_same_zero_point) {
     float x_scale = 1.0f / 255.0f;
-    quantization::Params<uint8_t> x_params(x_scale, /*zero_point=*/128);
+    T8Bits x_zero_point = (std::numeric_limits<T8Bits>::lowest() + std::numeric_limits<T8Bits>::max() - 5) / 2;
+    quantization::Params<T8Bits> x_params(x_scale, x_zero_point);
     RandomValueGenerator random{};
     std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
-    std::vector<uint8_t> x_data = QuantizeTestVector<uint8_t>(x_data_fp32, x_params);
+    std::vector<T8Bits> x_data = QuantizeTestVector<T8Bits>(x_data_fp32, x_params);
 
     float y_scale = 1.0f / 255.0f;
-    uint8_t y_zero_point = x_y_same_zero_point ? x_params.zero_point : 100;
-    const quantization::Params<uint8_t> y_params(y_scale, y_zero_point);
+    T8Bits y_zero_point_not_same = (std::numeric_limits<T8Bits>::lowest() + std::numeric_limits<T8Bits>::max() + 10) / 2;
+    T8Bits y_zero_point = x_y_same_zero_point ? x_params.zero_point : y_zero_point_not_same;
+    const quantization::Params<T8Bits> y_params(y_scale, y_zero_point);
     int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
-    std::vector<uint8_t> y_data(y_size);
-    CalculateAvgPoolNchwU8(
+    std::vector<T8Bits> y_data(y_size);
+    CalculateAvgPoolNchw(
         x_data.data(), x_dims, x_params,
         y_data.data(), y_dims, y_params,
         kernel_shape, strides, pads, count_include_pad);
@@ -142,12 +147,12 @@ void RunQLinearAveragePoolNchwU8(
     test.AddAttribute("kernel_shape", kernel_shape);
     test.AddAttribute("count_include_pad", count_include_pad);
 
-    test.AddInput<uint8_t>("X", x_dims, x_data);
+    test.AddInput<T8Bits>("X", x_dims, x_data);
     test.AddInput<float>("x_scale", {}, {x_scale}, only_x_not_initializer);
-    test.AddInput<uint8_t>("x_zero_point", {}, {x_params.zero_point}, only_x_not_initializer);
+    test.AddInput<T8Bits>("x_zero_point", {}, {x_params.zero_point}, only_x_not_initializer);
     test.AddInput<float>("y_scale", {}, {y_scale}, only_x_not_initializer);
-    test.AddInput<uint8_t>("y_zero_point", {}, {y_params.zero_point}, only_x_not_initializer);
-    test.AddOutput<uint8_t>("Y", y_dims, y_data);
+    test.AddInput<T8Bits>("y_zero_point", {}, {y_params.zero_point}, only_x_not_initializer);
+    test.AddOutput<T8Bits>("Y", y_dims, y_data);
 
     auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
       const OrtValue& ort_value = fetches[0];
@@ -160,7 +165,7 @@ void RunQLinearAveragePoolNchwU8(
       ORT_ENFORCE(y_shape == output_tensor.Shape(),
                   "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
                       output_tensor.Shape().ToString() + "] for Y @" + provider_type);
-      auto* output = output_tensor.Data<uint8_t>();
+      auto* output = output_tensor.Data<T8Bits>();
       auto size = static_cast<int>(output_tensor.Shape().Size());
       for (int i = 0; i < size; ++i) {
         int diff = abs(y_data[i] - output[i]);
@@ -188,15 +193,16 @@ static std::vector<int64_t> dims_to_nhwc(const std::vector<int64_t>& nchw) {
   return nhwc;
 }
 
-static std::vector<uint8_t> transpose_to_nhwc(const std::vector<uint8_t>& nchw_data, const std::vector<int64_t>& nchw_dims) {
-  std::vector<uint8_t> nhwc_data(nchw_data.size());
+template <typename T8Bits>
+static std::vector<T8Bits> transpose_to_nhwc(const std::vector<T8Bits>& nchw_data, const std::vector<int64_t>& nchw_dims) {
+  std::vector<T8Bits> nhwc_data(nchw_data.size());
 
   auto batch_count = nchw_dims[0];
   auto channels = nchw_dims[1];
   int64_t image_size = std::accumulate(nchw_dims.begin() + 2, nchw_dims.end(), 1LL, std::multiplies<int64_t>());
   for (int64_t b = 0; b < batch_count; b++) {
-    const uint8_t* nchw_image = nchw_data.data() + (b * channels * image_size);
-    uint8_t* nhwc_image = nhwc_data.data() + (b * channels * image_size);
+    const T8Bits* nchw_image = nchw_data.data() + (b * channels * image_size);
+    T8Bits* nhwc_image = nhwc_data.data() + (b * channels * image_size);
     for (int64_t img_index = 0; img_index < image_size; ++img_index) {
       for (int64_t c = 0; c < channels; c++) {
         *nhwc_image++ = nchw_image[c * image_size + img_index];
@@ -207,7 +213,8 @@ static std::vector<uint8_t> transpose_to_nhwc(const std::vector<uint8_t>& nchw_d
   return nhwc_data;
 }
 
-void RunQLinearAveragePoolNhwcU8(
+template <typename T8Bits = uint8_t>
+void RunQLinearAveragePoolNhwc(
     const std::vector<int64_t> x_dims,
     const std::vector<int64_t> y_dims,
     const std::vector<int64_t> kernel_shape,
@@ -215,23 +222,25 @@ void RunQLinearAveragePoolNhwcU8(
     const std::vector<int64_t> pads,
     const int64_t count_include_pad = 0) {
   float x_scale = 1.0f / 255.0f;
-  const quantization::Params<uint8_t> x_params(x_scale, /*zero_point=*/128);
+  T8Bits x_zero_point = (std::numeric_limits<T8Bits>::lowest() + std::numeric_limits<T8Bits>::max() - 5) / 2;
+  const quantization::Params<T8Bits> x_params(x_scale, x_zero_point);
   RandomValueGenerator random{};
   std::vector<float> x_data_fp32 = random.Uniform<float>(x_dims, -0.5f, 0.5f);
-  std::vector<uint8_t> x_data = QuantizeTestVector<uint8_t>(x_data_fp32, x_params);
+  std::vector<T8Bits> x_data = QuantizeTestVector<T8Bits>(x_data_fp32, x_params);
 
   float y_scale = 1.0f / 255.0f;
-  const quantization::Params<uint8_t> y_params(y_scale, /*zero_point=*/100);
+  T8Bits y_zero_point = (std::numeric_limits<T8Bits>::lowest() + std::numeric_limits<T8Bits>::max() + 10) / 2;
+  const quantization::Params<T8Bits> y_params(y_scale, y_zero_point);
   int64_t y_size = std::accumulate(y_dims.begin(), y_dims.end(), 1LL, std::multiplies<int64_t>());
-  std::vector<uint8_t> y_data(y_size);
-  CalculateAvgPoolNchwU8(
+  std::vector<T8Bits> y_data(y_size);
+  CalculateAvgPoolNchw(
       x_data.data(), x_dims, x_params,
       y_data.data(), y_dims, y_params,
       kernel_shape, strides, pads, count_include_pad);
 
   // transpose the result
-  std::vector<uint8_t> y_data_nhwc = transpose_to_nhwc(y_data, y_dims);
-  std::vector<uint8_t> x_data_nhwc = transpose_to_nhwc(x_data, x_dims);
+  std::vector<T8Bits> y_data_nhwc = transpose_to_nhwc(y_data, y_dims);
+  std::vector<T8Bits> x_data_nhwc = transpose_to_nhwc(x_data, x_dims);
   auto x_dims_nhwc = dims_to_nhwc(x_dims);
   auto y_dims_nhwc = dims_to_nhwc(y_dims);
 
@@ -244,12 +253,12 @@ void RunQLinearAveragePoolNhwcU8(
   test.AddAttribute("count_include_pad", count_include_pad);
   test.AddAttribute("channels_last", (int64_t)1LL);
 
-  test.AddInput<uint8_t>("X", x_dims_nhwc, x_data_nhwc);
+  test.AddInput<T8Bits>("X", x_dims_nhwc, x_data_nhwc);
   test.AddInput<float>("x_scale", {}, {x_scale});
-  test.AddInput<uint8_t>("x_zero_point", {}, {x_params.zero_point});
+  test.AddInput<T8Bits>("x_zero_point", {}, {x_params.zero_point});
   test.AddInput<float>("y_scale", {}, {y_scale});
-  test.AddInput<uint8_t>("y_zero_point", {}, {y_params.zero_point});
-  test.AddOutput<uint8_t>("Y", y_dims_nhwc, y_data_nhwc);
+  test.AddInput<T8Bits>("y_zero_point", {}, {y_params.zero_point});
+  test.AddOutput<T8Bits>("Y", y_dims_nhwc, y_data_nhwc);
 
   auto q8checker = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
     const OrtValue& ort_value = fetches[0];
@@ -262,7 +271,7 @@ void RunQLinearAveragePoolNhwcU8(
     ORT_ENFORCE(y_shape == output_tensor.Shape(),
                 "Expected output shape [" + y_shape.ToString() + "] did not match run output shape [" +
                     output_tensor.Shape().ToString() + "] for Y @" + provider_type);
-    auto* output = output_tensor.Data<uint8_t>();
+    auto* output = output_tensor.Data<T8Bits>();
     auto size = static_cast<int>(output_tensor.Shape().Size());
     for (int i = 0; i < size; ++i) {
       int diff = abs(y_data_nhwc[i] - output[i]);
@@ -278,7 +287,7 @@ void RunQLinearAveragePoolNhwcU8(
 }
 
 TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 5},  // x shape
       {1, 1, 6},  // expected y shape
       {3},        // kernel shape
@@ -288,7 +297,7 @@ TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel) {
 }
 
 TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 5},  // x shape
       {1, 1, 6},  // expected y shape
       {3},        // kernel shape
@@ -298,7 +307,7 @@ TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 5, 7},  // x shape
       {1, 1, 6, 4},  // expected y shape
       {3, 4},        // kernel shape
@@ -308,7 +317,7 @@ TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 5, 7},  // x shape
       {1, 1, 6, 4},  // expected y shape
       {3, 4},        // kernel shape
@@ -318,7 +327,7 @@ TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_MultiChannel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 3, 5, 7},  // x shape
       {1, 3, 6, 4},  // expected y shape
       {3, 4},        // kernel shape
@@ -328,7 +337,7 @@ TEST(QLinearPoolTest, AveragePool2D_MultiChannel) {
 }
 
 TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 5, 7, 9},     // x shape
       {1, 1, 6, 4, 3},     // expected y shape
       {3, 4, 5},           // kernel shape
@@ -338,7 +347,7 @@ TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel) {
 }
 
 TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 5, 7, 9},     // x shape
       {1, 1, 6, 4, 3},     // expected y shape
       {3, 4, 5},           // kernel shape
@@ -351,7 +360,7 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel) {
 * Channels last test
 **************************************************/
 TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 5},  // x shape
       {1, 1, 6},  // expected y shape
       {3},        // kernel shape
@@ -361,7 +370,7 @@ TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 5},  // x shape
       {1, 1, 6},  // expected y shape
       {3},        // kernel shape
@@ -371,7 +380,7 @@ TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 5, 7},  // x shape
       {1, 1, 6, 4},  // expected y shape
       {3, 4},        // kernel shape
@@ -381,7 +390,7 @@ TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 5, 7},  // x shape
       {1, 1, 6, 4},  // expected y shape
       {3, 4},        // kernel shape
@@ -391,7 +400,7 @@ TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_MultiChannel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 3, 5, 7},  // x shape
       {1, 3, 6, 4},  // expected y shape
       {3, 4},        // kernel shape
@@ -401,7 +410,7 @@ TEST(QLinearPoolTest, AveragePool2D_MultiChannel_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 5, 7, 9},     // x shape
       {1, 1, 6, 4, 3},     // expected y shape
       {3, 4, 5},           // kernel shape
@@ -411,7 +420,7 @@ TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 5, 7, 9},     // x shape
       {1, 1, 6, 4, 3},     // expected y shape
       {3, 4, 5},           // kernel shape
@@ -420,9 +429,8 @@ TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc) {
       1);                  // count_include_pad
 }
 
-
 TEST(QLinearPoolTest, AveragePool2D_BigImage) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 1, 32, 64},  // x shape
       {1, 1, 32, 64},  // expected y shape
       {3, 3},          // kernel shape
@@ -432,7 +440,7 @@ TEST(QLinearPoolTest, AveragePool2D_BigImage) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_BigImage_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 1, 32, 64},  // x shape
       {1, 1, 32, 64},  // expected y shape
       {3, 3},          // kernel shape
@@ -442,7 +450,7 @@ TEST(QLinearPoolTest, AveragePool2D_BigImage_nhwc) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_Global) {
-  RunQLinearAveragePoolNchwU8(
+  RunQLinearAveragePoolNchw(
       {1, 2, 32, 16},  // x shape
       {1, 2, 1, 1},    // expected y shape
       {32, 16},        // kernel shape
@@ -452,7 +460,7 @@ TEST(QLinearPoolTest, AveragePool2D_Global) {
 }
 
 TEST(QLinearPoolTest, AveragePool2D_Global_nhwc) {
-  RunQLinearAveragePoolNhwcU8(
+  RunQLinearAveragePoolNhwc(
       {1, 2, 32, 16},  // x shape
       {1, 2, 1, 1},    // expected y shape
       {32, 16},        // kernel shape
@@ -461,6 +469,188 @@ TEST(QLinearPoolTest, AveragePool2D_Global_nhwc) {
       1);              // count_include_pad
 }
 
+TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 5},  // x shape
+      {1, 1, 6},  // expected y shape
+      {3},        // kernel shape
+      {1},        // strides
+      {1, 2},     // pads
+      0);         // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 5},  // x shape
+      {1, 1, 6},  // expected y shape
+      {3},        // kernel shape
+      {1},        // strides
+      {1, 2},     // pads
+      1);         // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 5, 7},  // x shape
+      {1, 1, 6, 4},  // expected y shape
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads
+      0);            // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 5, 7},  // x shape
+      {1, 1, 6, 4},  // expected y shape
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads
+      1);            // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_MultiChannel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 3, 5, 7},  // x shape
+      {1, 3, 6, 4},  // expected y shape
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads
+      1);            // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 5, 7, 9},     // x shape
+      {1, 1, 6, 4, 3},     // expected y shape
+      {3, 4, 5},           // kernel shape
+      {1, 2, 3},           // strides
+      {1, 3, 2, 2, 1, 2},  // pads
+      0);                  // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 5, 7, 9},     // x shape
+      {1, 1, 6, 4, 3},     // expected y shape
+      {3, 4, 5},           // kernel shape
+      {1, 2, 3},           // strides
+      {1, 3, 2, 2, 1, 2},  // pads
+      1);                  // count_include_pad
+}
+
+/*************************************************
+* Channels last test
+**************************************************/
+TEST(QLinearPoolTest, AveragePool1D_ExcludePadPixel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 5},  // x shape
+      {1, 1, 6},  // expected y shape
+      {3},        // kernel shape
+      {1},        // strides
+      {1, 2},     // pads
+      0);         // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool1D_IncludePadPixel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 5},  // x shape
+      {1, 1, 6},  // expected y shape
+      {3},        // kernel shape
+      {1},        // strides
+      {1, 2},     // pads
+      1);         // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_ExcludePadPixel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 5, 7},  // x shape
+      {1, 1, 6, 4},  // expected y shape
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads
+      0);            // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_IncludePadPixel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 5, 7},  // x shape
+      {1, 1, 6, 4},  // expected y shape
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads
+      1);            // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_MultiChannel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 3, 5, 7},  // x shape
+      {1, 3, 6, 4},  // expected y shape
+      {3, 4},        // kernel shape
+      {1, 2},        // strides
+      {1, 3, 2, 1},  // pads
+      1);            // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool3D_ExcludePadPixel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 5, 7, 9},     // x shape
+      {1, 1, 6, 4, 3},     // expected y shape
+      {3, 4, 5},           // kernel shape
+      {1, 2, 3},           // strides
+      {1, 3, 2, 2, 1, 2},  // pads
+      0);                  // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool3D_IncludePadPixel_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 5, 7, 9},     // x shape
+      {1, 1, 6, 4, 3},     // expected y shape
+      {3, 4, 5},           // kernel shape
+      {1, 2, 3},           // strides
+      {1, 3, 2, 2, 1, 2},  // pads
+      1);                  // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_BigImage_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 1, 32, 64},  // x shape
+      {1, 1, 32, 64},  // expected y shape
+      {3, 3},          // kernel shape
+      {1, 1},          // strides
+      {1, 1, 1, 1},    // pads
+      1);              // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_BigImage_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 1, 32, 64},  // x shape
+      {1, 1, 32, 64},  // expected y shape
+      {3, 3},          // kernel shape
+      {1, 1},          // strides
+      {1, 1, 1, 1},    // pads
+      1);              // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_Global_S8) {
+  RunQLinearAveragePoolNchw<int8_t>(
+      {1, 2, 32, 16},  // x shape
+      {1, 2, 1, 1},    // expected y shape
+      {32, 16},        // kernel shape
+      {1, 1},          // strides
+      {0, 0, 0, 0},    // pads
+      1);              // count_include_pad
+}
+
+TEST(QLinearPoolTest, AveragePool2D_Global_nhwc_S8) {
+  RunQLinearAveragePoolNhwc<int8_t>(
+      {1, 2, 32, 16},  // x shape
+      {1, 2, 1, 1},    // expected y shape
+      {32, 16},        // kernel shape
+      {1, 1},          // strides
+      {0, 0, 0, 0},    // pads
+      1);              // count_include_pad
+}
 
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 9d5fa3ec74..23e240a134 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -661,7 +661,7 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
     }
   }
 
-#if defined(USE_CUDA) && !defined(ENABLE_TRAINING) && defined(CUDA_VERSION) && CUDA_VERSION >= 11000
+#if defined(USE_CUDA) && defined(ENABLE_CUDA_PROFILING)
   ASSERT_TRUE(has_kernel_info);
 #endif
 }
diff --git a/onnxruntime/test/fuzzing/include/BetaDistribution.h b/onnxruntime/test/fuzzing/include/BetaDistribution.h
index 221ac3e176..5818d01d45 100644
--- a/onnxruntime/test/fuzzing/include/BetaDistribution.h
+++ b/onnxruntime/test/fuzzing/include/BetaDistribution.h
@@ -118,7 +118,7 @@ private:
 
     // A constant value used for internal computation.
     //
-    constexpr inline double sqrtpi()
+    constexpr double sqrtpi()
     { 
         return std::sqrt( std::atan(1)*4 ); 
     }
diff --git a/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp b/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp
index 936bac86c6..aeb13af5b9 100644
--- a/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp
+++ b/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp
@@ -3,24 +3,28 @@
 
 #include "test_util.h"
 
-class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
- private:
-  MatrixGuardBuffer<uint8_t> BufferInput;
-  MatrixGuardBuffer<uint8_t> BufferOutput;
-  MatrixGuardBuffer<uint8_t> BufferOutputReference;
+#include <vector>
 
-  static void CalculateGlobalAvgPoolU8(
-      const uint8_t* x, int64_t batch, int64_t channel, int64_t hw, bool channel_last,
-      uint8_t* y, int32_t x_zero_point, float x_scale, int32_t y_zero_point, float y_scale) {
+template <typename T8Bits>
+class MlasQLinearGlobalAveragePoolTest : public MlasTestBase {
+ private:
+  MatrixGuardBuffer<T8Bits> BufferInput;
+  MatrixGuardBuffer<T8Bits> BufferOutput;
+  MatrixGuardBuffer<T8Bits> BufferOutputReference;
+  static const std::vector<T8Bits> ZeroPoints;
+
+  static void CalculateGlobalAvgPool(
+      const T8Bits* x, int64_t batch, int64_t channel, int64_t hw, bool channel_last,
+      T8Bits* y, int32_t x_zero_point, float x_scale, int32_t y_zero_point, float y_scale) {
     int32_t bias = -x_zero_point * static_cast<int32_t>(hw);
     int64_t stride_image = channel_last ? channel : 1;
     int64_t stride_channel = channel_last ? 1 : hw;
 
     for (int64_t b = 0; b < batch; ++b) {
-      const uint8_t* bx = x + b * hw * channel;
-      uint8_t* by = y + b * channel;
+      const T8Bits* bx = x + b * hw * channel;
+      T8Bits* by = y + b * channel;
       for (int64_t c = 0; c < channel; ++c) {
-        const uint8_t* ix = bx + c * stride_channel;
+        const T8Bits* ix = bx + c * stride_channel;
         int32_t sum = 0;
         for (int64_t i = 0; i < hw; ++i) {
           sum += static_cast<int32_t>(*ix);
@@ -29,15 +33,15 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
         sum += bias;
         int32_t r = static_cast<int32_t>(std::nearbyintf(x_scale * sum / static_cast<float>(hw) / y_scale));
         r += y_zero_point;
-        r = std::min(255, r);
-        r = std::max(0, r);
-        by[c] = static_cast<uint8_t>(r);
+        r = std::min((int32_t)(std::numeric_limits<T8Bits>::max()), r);
+        r = std::max((int32_t)(std::numeric_limits<T8Bits>::lowest()), r);
+        by[c] = static_cast<T8Bits>(r);
       }
     }
   }
 
   static void CompareResultWithGold(size_t Batch, size_t Channel,
-                                    uint8_t* Output, uint8_t* OutputReference, std::string& info) {
+                                    T8Bits* Output, T8Bits* OutputReference, std::string& info) {
     size_t n = 0;
     for (size_t b = 0; b < Batch; ++b) {
       for (size_t c = 0; c < Channel; c++) {
@@ -53,9 +57,9 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
                                  size_t Channel,
                                  size_t ImageSize,
                                  float InputScale,
-                                 uint8_t InputZeroPoint,
+                                 T8Bits InputZeroPoint,
                                  float OutputScale,
-                                 uint8_t OutputZeroPoint) {
+                                 T8Bits OutputZeroPoint) {
     std::stringstream ss;
     ss << (channel_last ? "Nhwc_" : "Nchw_");
     ss << Batch << "x [C=" << Stride << "-" << Channel << "] x" << ImageSize << "-";
@@ -69,25 +73,25 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
             size_t Channel,
             size_t ImageSize,
             float InputScale,
-            uint8_t InputZeroPoint,
+            T8Bits InputZeroPoint,
             float OutputScale,
-            uint8_t OutputZeroPoint,
+            T8Bits OutputZeroPoint,
             int32_t UnalignedOffset = 0) {
     size_t N = Batch * Stride * ImageSize;
     size_t ResultLen = Batch * Stride;
-    uint8_t* Input = BufferInput.GetBuffer(N);
-    uint8_t* Output = BufferOutput.GetBuffer(ResultLen);
-    uint8_t* Gold = BufferOutputReference.GetBuffer(ResultLen);
+    T8Bits* Input = BufferInput.GetBuffer(N);
+    T8Bits* Output = BufferOutput.GetBuffer(ResultLen);
+    T8Bits* Gold = BufferOutputReference.GetBuffer(ResultLen);
     std::string test_info = GetTestInfo(
         channel_last, Batch, Stride, Channel, ImageSize,
         InputScale, InputZeroPoint, OutputScale, OutputZeroPoint);
 
     std::default_random_engine generator(static_cast<unsigned>(N));
-    std::uniform_int_distribution<int> distribution(0, 255);
+    std::uniform_int_distribution<int> distribution(std::numeric_limits<T8Bits>::lowest(), std::numeric_limits<T8Bits>::max());
     for (size_t n = 0; n < N; n++) {
-      Input[n] = static_cast<uint8_t>(distribution(generator));
+      Input[n] = static_cast<T8Bits>(distribution(generator));
     }
-    CalculateGlobalAvgPoolU8(
+    CalculateGlobalAvgPool(
         Input, Batch, Stride, ImageSize, channel_last,
         Gold, InputZeroPoint, InputScale, OutputZeroPoint, OutputScale);
 
@@ -98,7 +102,7 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
           OutputScale, OutputZeroPoint, ResultLen, ImageSize, acc.data() + UnalignedOffset);
     } else {
       std::vector<int32_t> acc(MlasQLinearSafePaddingElementCount(sizeof(int32_t), Channel + UnalignedOffset));
-      std::vector<uint8_t> zero(MlasQLinearSafePaddingElementCount(sizeof(uint8_t), Channel + UnalignedOffset));
+      std::vector<T8Bits> zero(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), Channel + UnalignedOffset));
       if (Stride == Channel) {
         MlasQLinearGlobalAveragePoolNhwc(
             Input, InputScale, InputZeroPoint, Output,
@@ -120,12 +124,12 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
 
  public:
   static const char* GetTestSuiteName() {
-    static const std::string suite_name("QLinearGlobalAvgPool");
+    constexpr bool is_signed = std::is_signed<T8Bits>::value;
+    static const std::string suite_name(is_signed ? "QLinearGlobalAvgPoolS8" : "QLinearGlobalAvgPoolU8");
     return suite_name.c_str();
   }
 
   void ExecuteShort(void) override {
-    static const uint8_t zero_points[] = {0, 18, 128, 231, 255};
     static const float scales[] = {18.0f, 90.0f};
     static const size_t Batch[] = {1, 3};
     static const size_t Stride[] = {7, 8, 63, 256};
@@ -134,17 +138,17 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
 
     for (int channel_last = 0; channel_last <= 1; ++channel_last) {
       for (size_t b = 0; b < _countof(Batch); b++) {
-        for (size_t xzp = 0; xzp < _countof(zero_points); xzp++) {
-          for (size_t yzp = 0; yzp < _countof(zero_points); yzp++) {
+        for (size_t xzp = 0; xzp < ZeroPoints.size(); xzp++) {
+          for (size_t yzp = 0; yzp < ZeroPoints.size(); yzp++) {
             for (size_t xs = 0; xs < _countof(scales); ++xs) {
               for (size_t ys = 0; ys < _countof(scales); ++ys) {
                 for (size_t i = 0; i < _countof(ImageSize); i++) {
                   for (size_t s = 0; s < _countof(Stride); s++) {
                     Test(channel_last != 0, Batch[b], Stride[s], Stride[s], ImageSize[i],
-                         scales[xs], zero_points[xzp], scales[ys], zero_points[yzp], unalign_offset);
+                         scales[xs], ZeroPoints[xzp], scales[ys], ZeroPoints[yzp], unalign_offset);
                     if (channel_last == 1 && Stride[s] > 32) {
                       Test(channel_last != 0, Batch[b], Stride[s], 32, ImageSize[i],
-                           scales[xs], zero_points[xzp], scales[ys], zero_points[yzp], unalign_offset);
+                           scales[xs], ZeroPoints[xzp], scales[ys], ZeroPoints[yzp], unalign_offset);
                     }
                     unalign_offset = (unalign_offset + 1) & 3;
                   }
@@ -158,8 +162,21 @@ class MlasQLinearGlobalAveragePoolU8Test : public MlasTestBase {
   }
 };
 
-template <> MlasQLinearGlobalAveragePoolU8Test* MlasTestFixture<MlasQLinearGlobalAveragePoolU8Test>::mlas_tester(nullptr);
+template <>
+MlasQLinearGlobalAveragePoolTest<int8_t>* MlasTestFixture<MlasQLinearGlobalAveragePoolTest<int8_t>>::mlas_tester(nullptr);
+template <>
+MlasQLinearGlobalAveragePoolTest<uint8_t>* MlasTestFixture<MlasQLinearGlobalAveragePoolTest<uint8_t>>::mlas_tester(nullptr);
+
+template <>
+const std::vector<int8_t> MlasQLinearGlobalAveragePoolTest<int8_t>::ZeroPoints = {-128, -110, 1, 103, 127};
+
+template <>
+const std::vector<uint8_t> MlasQLinearGlobalAveragePoolTest<uint8_t>::ZeroPoints = {0, 18, 128, 231, 255};
 
 static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
-  return is_short_execute ? MlasDirectShortExecuteTests<MlasQLinearGlobalAveragePoolU8Test>::RegisterShortExecute() : 0;
+  if (is_short_execute) {
+    return MlasDirectShortExecuteTests<MlasQLinearGlobalAveragePoolTest<int8_t>>::RegisterShortExecute() +
+           MlasDirectShortExecuteTests<MlasQLinearGlobalAveragePoolTest<uint8_t>>::RegisterShortExecute();
+  }
+  return (size_t)0;
 });
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index 7c8d4a96e6..2903e51055 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -835,6 +835,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
     broken_tests.insert({"resize_upsample_linear", "ORT 0.4 uses asymmetric but will conform to half_pixel in the next ONNX version."});
 
     // These tests are temporarily disabled pending investigation
+    broken_tests.insert({"dynamicquantizelinear", "Temporarily disabled pending investigation"});
     broken_tests.insert({"dynamicquantizelinear_expanded", "Temporarily disabled pending investigation"});
     broken_tests.insert({"dynamicquantizelinear_max_adjusted_expanded", "Temporarily disabled pending investigation"});
     broken_tests.insert({"dynamicquantizelinear_min_adjusted_expanded", "Temporarily disabled pending investigation"});
diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
index c4afac1b02..a76843413e 100644
--- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
@@ -291,209 +291,212 @@ TEST(TransposeOptimizerTests, TestPadNonconst) {
                     /*opset_version*/ 11);
 }
 
-TEST(TransposeOptimizerTests, TestResize) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
-    auto* const_1 = builder.MakeInitializer<float>({4}, {0.3f, 2.5f, 1.0f, 0.7f});
-    auto* transpose_1_out_0 = builder.MakeIntermediate();
-    auto* resize_1_out_0 = builder.MakeIntermediate();
-    auto* transpose_2_out_0 = builder.MakeOutput();
+// TODO: re-enable the resize transformer tests after adding NHWC support to the Upsample op on CPU.
+// https://github.com/microsoft/onnxruntime/issues/9857
 
-    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    builder.AddNode("Resize", {transpose_1_out_0, const_1}, {resize_1_out_0});
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1,
-                    /*opset_version*/ 10);
-}
-
-TEST(TransposeOptimizerTests, TestResizeOpset11) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
-    auto* const_1 = builder.MakeInitializer<float>({8}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
-    auto* const_2 = builder.MakeInitializer<float>({4}, {0.3f, 2.5f, 1.0f, 0.7f});
-    auto* transpose_1_out_0 = builder.MakeIntermediate();
-    auto* resize_1_out_0 = builder.MakeIntermediate();
-    auto* transpose_2_out_0 = builder.MakeOutput();
-
-    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    builder.AddNode("Resize", {transpose_1_out_0, const_1, const_2}, {resize_1_out_0});
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1,
-                    /*opset_version*/ 11);
-}
-
-TEST(TransposeOptimizerTests, TestResizeOpset15) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
-    auto* const_1 = builder.MakeInitializer<float>({4}, {0.3f, 2.5f, 1.0f, 0.7f});
-    auto* transpose_1_out_0 = builder.MakeIntermediate();
-    auto* resize_1_out_0 = builder.MakeIntermediate();
-    auto* transpose_2_out_0 = builder.MakeOutput();
-    auto empty_arg = NodeArg("", nullptr);
-
-    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    builder.AddNode("Resize", {transpose_1_out_0, &empty_arg, const_1}, {resize_1_out_0});
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1,
-                    /*opset_version*/ 15);
-}
-
-TEST(TransposeOptimizerTests, TestResizeSizeRoi) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
-    auto* const_1 = builder.MakeInitializer<float>({8}, {0.1f, 0.2f, 0.3f, 0.4f, 0.9f, 0.8f, 0.7f, 0.6f});
-    auto* const_2 = builder.MakeInitializer<int64_t>({4}, {10, 9, 8, 7});
-    auto* transpose_1_out_0 = builder.MakeIntermediate();
-    auto* resize_1_out_0 = builder.MakeIntermediate();
-    auto* transpose_2_out_0 = builder.MakeOutput();
-    auto empty_arg = NodeArg("", nullptr);
-
-    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    auto& resize_1 = builder.AddNode("Resize", {transpose_1_out_0, const_1, &empty_arg, const_2}, {resize_1_out_0});
-    resize_1.AddAttribute("coordinate_transformation_mode", "tf_crop_and_resize");
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1,
-                    /*opset_version*/ 15);
-}
-
-TEST(TransposeOptimizerTests, TestResizeRoiScalesZeroRank0) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input = builder.MakeInput<uint8_t>({1, 512, 512, 3},
-                                             std::numeric_limits<uint8_t>::min(),
-                                             std::numeric_limits<uint8_t>::max());
-    auto* resize_in_roi = builder.MakeInitializer<float>({0}, {});
-    auto* resize_in_scales = builder.MakeInitializer<float>({0}, {});
-    auto* resize_in_sizes = builder.MakeInitializer<int64_t>({4}, {1, 256, 32, 32});
-
-    auto* transpose1_out_transposed = builder.MakeIntermediate();
-    auto* resize_out_Y = builder.MakeIntermediate();
-    auto* output = builder.MakeOutput();
-
-    auto& transpose_1 = builder.AddNode("Transpose", {input}, {transpose1_out_transposed});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    builder.AddNode("Resize",
-                    {transpose1_out_transposed, resize_in_roi, resize_in_scales, resize_in_sizes},
-                    {resize_out_Y});
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_out_Y}, {output});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1);
-}
-
-TEST(TransposeOptimizerTests, TestResizeNonconst) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
-    auto* input1_arg = MakeInput<float>(builder, {{8}}, {8}, {0.1f, 0.2f, 0.3f, 0.4f, 0.9f, 0.8f, 0.7f, 0.6f});
-    auto* input2_arg = MakeInput<float>(builder, {{4}}, {4}, {0.3f, 2.5f, 1.0f, 0.7f});
-    auto* transpose_1_out_0 = builder.MakeIntermediate();
-    auto* resize_1_out_0 = builder.MakeIntermediate();
-    auto* transpose_2_out_0 = builder.MakeOutput();
-
-    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    auto& resize_1 = builder.AddNode("Resize", {transpose_1_out_0, input1_arg, input2_arg}, {resize_1_out_0});
-    resize_1.AddAttribute("coordinate_transformation_mode", "tf_crop_and_resize");
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1,
-                    /*opset_version*/ 11);
-}
-
-TEST(TransposeOptimizerTests, TestResizeNonconstOpset13) {
-  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
-    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
-    auto* input1_arg = MakeInput<float>(builder, {{8}}, {8}, {0.1f, 0.2f, 0.3f, 0.4f, 0.9f, 0.8f, 0.7f, 0.6f});
-    auto* input2_arg = MakeInput<float>(builder, {{4}}, {4}, {0.3f, 2.5f, 1.0f, 0.7f});
-    auto* transpose_1_out_0 = builder.MakeIntermediate();
-    auto* resize_1_out_0 = builder.MakeIntermediate();
-    auto* transpose_2_out_0 = builder.MakeOutput();
-
-    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
-    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
-    auto& resize_1 = builder.AddNode("Resize", {transpose_1_out_0, input1_arg, input2_arg}, {resize_1_out_0});
-    resize_1.AddAttribute("coordinate_transformation_mode", "tf_crop_and_resize");
-    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
-    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
-  };
-
-  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
-    int transpose_cost = EstimateTransposeCost(session.GetGraph());
-    EXPECT_EQ(transpose_cost, 0);
-  };
-
-  TransformerTester(build_test_case_1,
-                    check_optimized_graph_1,
-                    TransformerLevel::Default,
-                    TransformerLevel::Level1,
-                    /*opset_version*/ 13);
-}
+//TEST(TransposeOptimizerTests, TestResize) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
+//    auto* const_1 = builder.MakeInitializer<float>({4}, {0.3f, 2.5f, 1.0f, 0.7f});
+//    auto* transpose_1_out_0 = builder.MakeIntermediate();
+//    auto* resize_1_out_0 = builder.MakeIntermediate();
+//    auto* transpose_2_out_0 = builder.MakeOutput();
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    builder.AddNode("Resize", {transpose_1_out_0, const_1}, {resize_1_out_0});
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1,
+//                    /*opset_version*/ 10);
+//}
+//
+//TEST(TransposeOptimizerTests, TestResizeOpset11) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
+//    auto* const_1 = builder.MakeInitializer<float>({8}, {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
+//    auto* const_2 = builder.MakeInitializer<float>({4}, {0.3f, 2.5f, 1.0f, 0.7f});
+//    auto* transpose_1_out_0 = builder.MakeIntermediate();
+//    auto* resize_1_out_0 = builder.MakeIntermediate();
+//    auto* transpose_2_out_0 = builder.MakeOutput();
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    builder.AddNode("Resize", {transpose_1_out_0, const_1, const_2}, {resize_1_out_0});
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1,
+//                    /*opset_version*/ 11);
+//}
+//
+//TEST(TransposeOptimizerTests, TestResizeOpset15) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
+//    auto* const_1 = builder.MakeInitializer<float>({4}, {0.3f, 2.5f, 1.0f, 0.7f});
+//    auto* transpose_1_out_0 = builder.MakeIntermediate();
+//    auto* resize_1_out_0 = builder.MakeIntermediate();
+//    auto* transpose_2_out_0 = builder.MakeOutput();
+//    auto empty_arg = NodeArg("", nullptr);
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    builder.AddNode("Resize", {transpose_1_out_0, &empty_arg, const_1}, {resize_1_out_0});
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1,
+//                    /*opset_version*/ 15);
+//}
+//
+//TEST(TransposeOptimizerTests, TestResizeSizeRoi) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
+//    auto* const_1 = builder.MakeInitializer<float>({8}, {0.1f, 0.2f, 0.3f, 0.4f, 0.9f, 0.8f, 0.7f, 0.6f});
+//    auto* const_2 = builder.MakeInitializer<int64_t>({4}, {10, 9, 8, 7});
+//    auto* transpose_1_out_0 = builder.MakeIntermediate();
+//    auto* resize_1_out_0 = builder.MakeIntermediate();
+//    auto* transpose_2_out_0 = builder.MakeOutput();
+//    auto empty_arg = NodeArg("", nullptr);
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    auto& resize_1 = builder.AddNode("Resize", {transpose_1_out_0, const_1, &empty_arg, const_2}, {resize_1_out_0});
+//    resize_1.AddAttribute("coordinate_transformation_mode", "tf_crop_and_resize");
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1,
+//                    /*opset_version*/ 15);
+//}
+//
+//TEST(TransposeOptimizerTests, TestResizeRoiScalesZeroRank0) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input = builder.MakeInput<uint8_t>({1, 512, 512, 3},
+//                                             std::numeric_limits<uint8_t>::min(),
+//                                             std::numeric_limits<uint8_t>::max());
+//    auto* resize_in_roi = builder.MakeInitializer<float>({0}, {});
+//    auto* resize_in_scales = builder.MakeInitializer<float>({0}, {});
+//    auto* resize_in_sizes = builder.MakeInitializer<int64_t>({4}, {1, 256, 32, 32});
+//
+//    auto* transpose1_out_transposed = builder.MakeIntermediate();
+//    auto* resize_out_Y = builder.MakeIntermediate();
+//    auto* output = builder.MakeOutput();
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input}, {transpose1_out_transposed});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    builder.AddNode("Resize",
+//                    {transpose1_out_transposed, resize_in_roi, resize_in_scales, resize_in_sizes},
+//                    {resize_out_Y});
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_out_Y}, {output});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1);
+//}
+//
+//TEST(TransposeOptimizerTests, TestResizeNonconst) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
+//    auto* input1_arg = MakeInput<float>(builder, {{8}}, {8}, {0.1f, 0.2f, 0.3f, 0.4f, 0.9f, 0.8f, 0.7f, 0.6f});
+//    auto* input2_arg = MakeInput<float>(builder, {{4}}, {4}, {0.3f, 2.5f, 1.0f, 0.7f});
+//    auto* transpose_1_out_0 = builder.MakeIntermediate();
+//    auto* resize_1_out_0 = builder.MakeIntermediate();
+//    auto* transpose_2_out_0 = builder.MakeOutput();
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    auto& resize_1 = builder.AddNode("Resize", {transpose_1_out_0, input1_arg, input2_arg}, {resize_1_out_0});
+//    resize_1.AddAttribute("coordinate_transformation_mode", "tf_crop_and_resize");
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1,
+//                    /*opset_version*/ 11);
+//}
+//
+//TEST(TransposeOptimizerTests, TestResizeNonconstOpset13) {
+//  auto build_test_case_1 = [&](ModelTestBuilder& builder) {
+//    auto* input0_arg = MakeInput<float>(builder, {{4, -1, 2, -1}}, {4, 6, 2, 10}, 0.0, 1.0);
+//    auto* input1_arg = MakeInput<float>(builder, {{8}}, {8}, {0.1f, 0.2f, 0.3f, 0.4f, 0.9f, 0.8f, 0.7f, 0.6f});
+//    auto* input2_arg = MakeInput<float>(builder, {{4}}, {4}, {0.3f, 2.5f, 1.0f, 0.7f});
+//    auto* transpose_1_out_0 = builder.MakeIntermediate();
+//    auto* resize_1_out_0 = builder.MakeIntermediate();
+//    auto* transpose_2_out_0 = builder.MakeOutput();
+//
+//    auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0});
+//    transpose_1.AddAttribute("perm", std::vector<int64_t>{0, 3, 1, 2});
+//    auto& resize_1 = builder.AddNode("Resize", {transpose_1_out_0, input1_arg, input2_arg}, {resize_1_out_0});
+//    resize_1.AddAttribute("coordinate_transformation_mode", "tf_crop_and_resize");
+//    auto& transpose_2 = builder.AddNode("Transpose", {resize_1_out_0}, {transpose_2_out_0});
+//    transpose_2.AddAttribute("perm", std::vector<int64_t>{0, 2, 3, 1});
+//  };
+//
+//  auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) {
+//    int transpose_cost = EstimateTransposeCost(session.GetGraph());
+//    EXPECT_EQ(transpose_cost, 0);
+//  };
+//
+//  TransformerTester(build_test_case_1,
+//                    check_optimized_graph_1,
+//                    TransformerLevel::Default,
+//                    TransformerLevel::Level1,
+//                    /*opset_version*/ 13);
+//}
 
 TEST(TransposeOptimizerTests, TestAdd) {
   auto build_test_case_1 = [&](ModelTestBuilder& builder) {
diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc
index 59d4a3737d..e1be1e28f9 100644
--- a/onnxruntime/test/providers/cpu/generator/random_test.cc
+++ b/onnxruntime/test/providers/cpu/generator/random_test.cc
@@ -33,7 +33,10 @@ TEST(Random, RandomNormal2DDouble) {
                 [&generator, &distribution](double& value) { value = distribution(generator); });
 
   test.AddOutput<double>("Y", dims, expected_output);
-  test.Run();
+
+  // expected_output is generated with the C++ standard library, which only the CPU kernel uses,
+  // so the CUDA and ROCm EPs are excluded here. The same applies to the other Run() calls that exclude them.
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 void RunRandomNormalLike3DFloat(bool infer_dtype = false) {
@@ -68,7 +71,7 @@ void RunRandomNormalLike3DFloat(bool infer_dtype = false) {
 
   test.AddOutput<float>("Y", dims, expected_output);
 
-  test.Run();
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider});
 }
 
 TEST(Random, RandomNormalLike3DDouble) {
@@ -105,7 +108,7 @@ TEST(Random, RandomUniform1DFloat) {
   test.AddOutput<float>("Y", dims, expected_output);
 
   // TensorRT does not support manual seed overrides and there will be result mismatch
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider});
 }
 
 void RunRandomUniformLikeTest(bool infer_dtype = false) {
@@ -138,7 +141,7 @@ void RunRandomUniformLikeTest(bool infer_dtype = false) {
   test.AddOutput<double>("Y", dims, expected_output);
 
   // TensorRT does not support seed parameter and there will be result mismatch
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kRocmExecutionProvider, kTensorrtExecutionProvider});
 }
 
 TEST(Random, RandomUniformLike2DDouble) {
@@ -324,5 +327,205 @@ TEST(Random, MultinomialInvalidDtype) {
 
   test.Run(OpTester::ExpectResult::kExpectFailure, "Output type must be int32 or int64");
 }
+
+#if defined(USE_CUDA) || defined(USE_ROCM)
+// Runs RandomNormal (or RandomNormalLike when is_random_like is true) with the CPU and TensorRT EPs excluded.
+// We cannot call CUDA lib from UT to get expected values, so just verify that the output mean is near `mean`.
+void RunRandomNormalGpuTest(const std::vector<int64_t>& dims, const float mean, const float scale, const float seed,
+                            TensorProto_DataType dtype, bool is_random_like, bool infer_dtype) {
+  OpTester test(is_random_like ? "RandomNormalLike" : "RandomNormal");
+  test.AddAttribute("mean", mean);
+  test.AddAttribute("scale", scale);
+  test.AddAttribute("seed", seed);
+  if (!is_random_like) {
+    test.AddAttribute<int64_t>("dtype", dtype);
+  } else if (!infer_dtype) {
+    // For RandomNormalLike, if not infer dtype, use float as target.
+    test.AddAttribute<int64_t>("dtype", TensorProto_DataType::TensorProto_DataType_FLOAT);
+  }
+  size_t size = 1;
+  for (size_t i = 0; i < dims.size(); ++i) {
+    size *= static_cast<size_t>(dims[i]);
+  }
+  if (!is_random_like) {
+    test.AddAttribute("shape", dims);
+  } else {
+    if (dtype == TensorProto_DataType::TensorProto_DataType_FLOAT) {
+      std::vector<float> float_data(size, 0.f);
+      test.AddInput("X", dims, float_data);
+    } else if (dtype == TensorProto_DataType::TensorProto_DataType_DOUBLE) {
+      std::vector<double> double_data(size, 0.);
+      test.AddInput("X", dims, double_data);
+    } else if (dtype == TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+      std::vector<float> float_data(size, 0.f);
+      std::vector<MLFloat16> fp16_data(size);
+      ConvertFloatToMLFloat16(float_data.data(), fp16_data.data(), static_cast<int>(size));
+      test.AddInput("X", dims, fp16_data);
+    }
+  }
+
+  // The zero-filled outputs added here are placeholders; the custom verifier set below does the checking.
+  const auto output_dtype = is_random_like && !infer_dtype ? TensorProto_DataType::TensorProto_DataType_FLOAT : dtype;
+  if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT) {
+    std::vector<float> float_data(size, 0.f);
+    test.AddOutput("Y", dims, float_data);
+  } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_DOUBLE) {
+    std::vector<double> double_data(size, 0.);
+    test.AddOutput("Y", dims, double_data);
+  } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+    std::vector<float> float_data(size, 0.f);
+    std::vector<MLFloat16> fp16_data(size);
+    ConvertFloatToMLFloat16(float_data.data(), fp16_data.data(), static_cast<int>(size));
+    test.AddOutput("Y", dims, fp16_data);
+  }
+
+  auto output_verifier = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
+    // Expect exactly one output whose mean is near the `mean` attribute.
+    ASSERT_EQ(fetches.size(), 1u) << "provider_type: " << provider_type;
+    const auto& output_tensor = FetchTensor(fetches[0]);
+    if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT) {
+      auto output_span = output_tensor.DataAsSpan<float>();
+      float sum = std::accumulate(output_span.begin(), output_span.end(), 0.f);
+      ASSERT_NEAR(sum / static_cast<float>(size), mean, 0.1f);
+    } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_DOUBLE) {
+      auto output_span = output_tensor.DataAsSpan<double>();
+      double sum = std::accumulate(output_span.begin(), output_span.end(), 0.);
+      ASSERT_NEAR(sum / static_cast<double>(size), static_cast<double>(mean), 0.1);
+    } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+      auto output_span = output_tensor.DataAsSpan<MLFloat16>();
+      float sum = 0.f;
+      for (auto value : output_span) {
+        sum += value.ToFloat();
+      }
+      ASSERT_NEAR(sum / static_cast<float>(size), mean, 0.1f);
+    }
+  };
+
+  test.SetCustomOutputVerifier(output_verifier);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider, kTensorrtExecutionProvider});
+}
+
+TEST(Random, RandomNormalGpu) {
+  // RandomVectorizedKernel is called if total_size % 4 == 0, so test one size divisible by 4 and one that is not.
+  std::vector<int64_t> dims1{256, 256};
+  RunRandomNormalGpuTest(dims1, 1.f, 10.f, 123.f, TensorProto_DataType::TensorProto_DataType_FLOAT, false, false);
+  RunRandomNormalGpuTest(dims1, -1.f, 8.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, false, false);
+  RunRandomNormalGpuTest(dims1, 0.f, 16.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, false, false);
+  RunRandomNormalGpuTest(dims1, 1.f, 10.f, 123.f, TensorProto_DataType::TensorProto_DataType_FLOAT, true, true);
+  RunRandomNormalGpuTest(dims1, -1.f, 8.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, true, true);
+  RunRandomNormalGpuTest(dims1, 0.f, 16.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, true, true);
+  RunRandomNormalGpuTest(dims1, -1.f, 8.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, true, false);
+  RunRandomNormalGpuTest(dims1, 0.f, 16.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, true, false);
+  std::vector<int64_t> dims2{255, 255};
+  RunRandomNormalGpuTest(dims2, 1.f, 10.f, 123.f, TensorProto_DataType::TensorProto_DataType_FLOAT, false, false);
+  RunRandomNormalGpuTest(dims2, -1.f, 8.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, true, true);
+  RunRandomNormalGpuTest(dims2, 0.f, 16.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, true, false);
+}
+
+// Runs RandomUniform (or RandomUniformLike when is_random_like is true) with the CPU and TensorRT EPs excluded.
+void RunRandomUniformGpuTest(const std::vector<int64_t>& dims, const float low, const float high, const float seed,
+                             TensorProto_DataType dtype, bool is_random_like, bool infer_dtype) {
+  OpTester test(is_random_like ? "RandomUniformLike" : "RandomUniform");
+  test.AddAttribute("low", low);
+  test.AddAttribute("high", high);
+  test.AddAttribute("seed", seed);
+  if (!is_random_like) {
+    test.AddAttribute<int64_t>("dtype", dtype);
+  } else if (!infer_dtype) {
+    // For RandomUniformLike, if not infer dtype, use float as target.
+    test.AddAttribute<int64_t>("dtype", TensorProto_DataType::TensorProto_DataType_FLOAT);
+  }
+  size_t size = 1;
+  for (size_t i = 0; i < dims.size(); ++i) {
+    size *= static_cast<size_t>(dims[i]);
+  }
+  if (!is_random_like) {
+    test.AddAttribute("shape", dims);
+  } else {
+    if (dtype == TensorProto_DataType::TensorProto_DataType_FLOAT) {
+      std::vector<float> float_data(size, 0.f);
+      test.AddInput("X", dims, float_data);
+    } else if (dtype == TensorProto_DataType::TensorProto_DataType_DOUBLE) {
+      std::vector<double> double_data(size, 0.);
+      test.AddInput("X", dims, double_data);
+    } else if (dtype == TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+      std::vector<float> float_data(size, 0.f);
+      std::vector<MLFloat16> fp16_data(size);
+      ConvertFloatToMLFloat16(float_data.data(), fp16_data.data(), static_cast<int>(size));
+      test.AddInput("X", dims, fp16_data);
+    }
+  }
+
+  // The zero-filled outputs added here are placeholders; the custom verifier set below does the checking.
+  const auto output_dtype = is_random_like && !infer_dtype ? TensorProto_DataType::TensorProto_DataType_FLOAT : dtype;
+  if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT) {
+    std::vector<float> float_data(size, 0.f);
+    test.AddOutput("Y", dims, float_data);
+  } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_DOUBLE) {
+    std::vector<double> double_data(size, 0.);
+    test.AddOutput("Y", dims, double_data);
+  } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+    std::vector<float> float_data(size, 0.f);
+    std::vector<MLFloat16> fp16_data(size);
+    ConvertFloatToMLFloat16(float_data.data(), fp16_data.data(), static_cast<int>(size));
+    test.AddOutput("Y", dims, fp16_data);
+  }
+
+  auto output_verifier = [&](const std::vector<OrtValue>& fetches, const std::string& provider_type) {
+    // Expect exactly one output. Each value in the output tensor must lie between low and high,
+    // and the mean of the output values must be near the midpoint of low and high.
+    ASSERT_EQ(fetches.size(), 1u) << "provider_type: " << provider_type;
+    const auto& output_tensor = FetchTensor(fetches[0]);
+    if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT) {
+      auto output_span = output_tensor.DataAsSpan<float>();
+      for (auto value : output_span) {
+        ASSERT_GE(value, low);
+        ASSERT_LE(value, high);
+      }
+      float sum = std::accumulate(output_span.begin(), output_span.end(), 0.f);
+      ASSERT_NEAR(sum / static_cast<float>(size), (high + low) / 2.f, 0.1f);
+    } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_DOUBLE) {
+      auto output_span = output_tensor.DataAsSpan<double>();
+      for (auto value : output_span) {
+        ASSERT_GE(value, static_cast<double>(low));
+        ASSERT_LE(value, static_cast<double>(high));
+      }
+      double sum = std::accumulate(output_span.begin(), output_span.end(), 0.);
+      ASSERT_NEAR(sum / static_cast<double>(size), static_cast<double>((high + low) / 2.f), 0.1);
+    } else if (output_dtype == TensorProto_DataType::TensorProto_DataType_FLOAT16) {
+      auto output_span = output_tensor.DataAsSpan<MLFloat16>();
+      float sum = 0.f;
+      for (auto value : output_span) {
+        float f = value.ToFloat();
+        ASSERT_GE(f, low);
+        ASSERT_LE(f, high);
+        sum += f;
+      }
+      ASSERT_NEAR(sum / static_cast<float>(size), (high + low) / 2.f, 0.1f);
+    }
+  };
+
+  test.SetCustomOutputVerifier(output_verifier);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider, kTensorrtExecutionProvider});
+}
+
+TEST(Random, RandomUniformGpu) {
+  // RandomVectorizedKernel is called if total_size % 4 == 0, so test one size divisible by 4 and one that is not.
+  std::vector<int64_t> dims1{256, 256};
+  RunRandomUniformGpuTest(dims1, 0.f, 10.f, 123.f, TensorProto_DataType::TensorProto_DataType_FLOAT, false, false);
+  RunRandomUniformGpuTest(dims1, -10.f, 0.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, false, false);
+  RunRandomUniformGpuTest(dims1, -5.f, 5.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, false, false);
+  RunRandomUniformGpuTest(dims1, 0.f, 10.f, 123.f, TensorProto_DataType::TensorProto_DataType_FLOAT, true, true);
+  RunRandomUniformGpuTest(dims1, -10.f, 0.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, true, true);
+  RunRandomUniformGpuTest(dims1, -5.f, 5.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, true, true);
+  RunRandomUniformGpuTest(dims1, -10.f, 0.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, true, false);
+  RunRandomUniformGpuTest(dims1, -5.f, 5.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, true, false);
+  std::vector<int64_t> dims2{255, 255};
+  RunRandomUniformGpuTest(dims2, 0.f, 10.f, 123.f, TensorProto_DataType::TensorProto_DataType_FLOAT, false, false);
+  RunRandomUniformGpuTest(dims2, -10.f, 0.f, 231.f, TensorProto_DataType::TensorProto_DataType_DOUBLE, true, true);
+  RunRandomUniformGpuTest(dims2, -5.f, 5.f, 312.f, TensorProto_DataType::TensorProto_DataType_FLOAT16, true, false);
+}
+#endif
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc b/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc
index b4adb5e0da..6a5db60646 100644
--- a/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/unsqueeze_op_test.cc
@@ -45,6 +45,64 @@ TEST(TensorOpTest, Unsqueeze_3) {
   test.Run();
 }
 
+TEST(TensorOpTest, Unsqueeze_scalar) {
+  {
+    // Rank-0 input with the axes attribute set to {0}: the expected output has shape {1}.
+    OpTester tester("Unsqueeze");
+    tester.AddAttribute("axes", std::vector<int64_t>{0});
+    tester.AddInput<float>("input", {}, std::vector<float>{1.0f});
+    tester.AddOutput<float>("output", {1}, std::vector<float>{1.0f});
+    tester.Run();
+  }
+  {
+    // Same rank-0 input with the axes attribute set to {-1}: the expected output shape is again {1}.
+    OpTester tester("Unsqueeze");
+    tester.AddAttribute("axes", std::vector<int64_t>{-1});
+    tester.AddInput<float>("input", {}, std::vector<float>{1.0f});
+    tester.AddOutput<float>("output", {1}, std::vector<float>{1.0f});
+    tester.Run();
+  }
+
+  auto run_with_axes_input = [](bool axes_is_initializer) {  // opset 13: axes is passed as an input
+    {
+      OpTester tester("Unsqueeze", 13);
+      tester.AddInput<float>("input", {}, std::vector<float>{1.0f});
+      tester.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{0}, axes_is_initializer);
+      tester.AddOutput<float>("output", {1}, std::vector<float>{1.0f});
+      tester.Run();
+    }
+    {
+      OpTester tester("Unsqueeze", 13);
+      tester.AddInput<float>("input", {}, std::vector<float>{1.0f});
+      tester.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{-1}, axes_is_initializer);
+      tester.AddOutput<float>("output", {1}, std::vector<float>{1.0f});
+      tester.Run();
+    }
+  };
+  run_with_axes_input(false);
+  run_with_axes_input(true);
+}
+
+TEST(TensorOpTest, Unsqueeze_scalar_2) {
+  {
+    // Rank-0 input with two axes given as an attribute: the expected output has shape {1, 1}.
+    OpTester test("Unsqueeze");
+    test.AddAttribute("axes", std::vector<int64_t>{0, 1});
+    test.AddInput<float>("input", {}, std::vector<float>{1.0f});
+    test.AddOutput<float>("output", {1, 1}, std::vector<float>{1.0f});
+    test.Run();
+  }
+  auto run_test = [](bool axes_is_initializer) {  // opset 13: axes {0, -1} is passed as an input
+    OpTester test("Unsqueeze", 13);
+    test.AddInput<float>("input", {}, std::vector<float>{1.0f});
+    test.AddInput<int64_t>("axes", {2}, std::vector<int64_t>{0, -1}, axes_is_initializer);
+    test.AddOutput<float>("output", {1, 1}, std::vector<float>{1.0f});
+    test.Run();
+  };
+  run_test(false);
+  run_test(true);
+}
+
 TEST(TensorOpTest, Unsqueeze_Duplicate) {
   {
     OpTester test("Unsqueeze", 12); // opset 1-12 has axes attribute
@@ -98,35 +156,45 @@ TEST(TensorOpTest, UnsqueezeNegAxis_3) {
     // TensorRT does not support negative axis.
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
   }
-  {
+  auto run_test = [](bool axes_is_initializer) {
     OpTester test("Unsqueeze", 13);  // use latest opset with axis input
     test.AddInput<float>("input", {2, 3, 4}, std::vector<float>(2 * 3 * 4, 1.0f));
-    test.AddInput<int64_t>("axes", {3}, std::vector<int64_t>{-4, 1, -6});
+    test.AddInput<int64_t>("axes", {3}, std::vector<int64_t>{-4, 1, -6}, axes_is_initializer);
     test.AddOutput<float>("output", {1, 1, 1, 2, 3, 4}, std::vector<float>(2 * 3 * 4, 1.0f));
     // TensorRT does not support negative axis.
     // TODO: TensorRT, OpenVINO dont support "axes" input in opset 13, re-enable after
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", { kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
-  }
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
+  };
+  run_test(false);
+  run_test(true);
 }
 
 TEST(TensorOpTest, Unsqueeze_1_int32_axes_input) {
-  OpTester test("Unsqueeze", 13);
+  auto run_test = [](bool axes_is_initializer) {
+    OpTester test("Unsqueeze", 13);
 
-  test.AddInput<int32_t>("input", {2, 3, 4}, std::vector<int32_t>(2 * 3 * 4, 1));
-  test.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{1});
-  test.AddOutput<int32_t>("output", {2, 1, 3, 4}, std::vector<int32_t>(2 * 3 * 4, 1));
-  // TODO: TensorRT and OpenVINO dont support "axes" input in opset 13, re-enable after
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
+    test.AddInput<int32_t>("input", {2, 3, 4}, std::vector<int32_t>(2 * 3 * 4, 1));
+    test.AddInput<int64_t>("axes", {1}, std::vector<int64_t>{1}, axes_is_initializer);
+    test.AddOutput<int32_t>("output", {2, 1, 3, 4}, std::vector<int32_t>(2 * 3 * 4, 1));
+    // TODO: TensorRT and OpenVINO dont support "axes" input in opset 13, re-enable after
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
+  };
+  run_test(false);
+  run_test(true);
 }
 
 TEST(TensorOpTest, Unsqueeze_3_axes_input) {
-  OpTester test("Unsqueeze", 13);
+  auto run_test = [](bool axes_is_initializer) {
+    OpTester test("Unsqueeze", 13);
 
-  test.AddInput<float>("input", {2, 3, 4}, std::vector<float>(2 * 3 * 4, 1.0f));
-  test.AddInput<int64_t>("axes", {3}, std::vector<int64_t>{2, 1, 0});
-  test.AddOutput<float>("output", {1, 1, 1, 2, 3, 4}, std::vector<float>(2 * 3 * 4, 1.0f));
-  // TODO: TensorRT and OpenVINO dont support "axes" input in opset 13, re-enable after
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
+    test.AddInput<float>("input", {2, 3, 4}, std::vector<float>(2 * 3 * 4, 1.0f));
+    test.AddInput<int64_t>("axes", {3}, std::vector<int64_t>{2, 1, 0}, axes_is_initializer);
+    test.AddOutput<float>("output", {1, 1, 1, 2, 3, 4}, std::vector<float>(2 * 3 * 4, 1.0f));
+    // TODO: TensorRT and OpenVINO dont support "axes" input in opset 13, re-enable after
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
+  };
+  run_test(false);
+  run_test(true);
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py
index 9520b885b9..fb6d129192 100644
--- a/onnxruntime/test/python/onnx_backend_test_series.py
+++ b/onnxruntime/test/python/onnx_backend_test_series.py
@@ -105,6 +105,7 @@ def create_backend_test(testname=None):
                 '^test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_cpu',
                 '^test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded_cpu',
                 '^test_asin_example_cpu',
+                '^test_dynamicquantizelinear_cpu',
                 '^test_dynamicquantizelinear_expanded_cpu',
                 '^test_resize_downsample_scales_linear_cpu',
                 '^test_resize_downsample_sizes_linear_pytorch_half_pixel_cpu',
diff --git a/onnxruntime/test/python/onnxruntime_test_python.py b/onnxruntime/test/python/onnxruntime_test_python.py
index e8005f5270..8618431ec7 100644
--- a/onnxruntime/test/python/onnxruntime_test_python.py
+++ b/onnxruntime/test/python/onnxruntime_test_python.py
@@ -884,7 +884,19 @@ class TestInferenceSession(unittest.TestCase):
 
             # The constructed OrtValue should still be valid after being used in a session
             self.assertTrue(np.array_equal(ortvalue2.numpy(), numpy_arr_input))
-            
+
+    def testOrtValue_ghIssue9799(self):
+        # Does nothing unless the CUDA EP is available. Presumably named after GitHub issue 9799 - confirm.
+        available_providers = onnxrt.get_available_providers()
+        if 'CUDAExecutionProvider' in available_providers:
+            session = onnxrt.InferenceSession(get_name("identity_9799.onnx"), providers=available_providers)
+            for seq_length in range(40, 200):
+                input_data = np.ones((seq_length, 16, 7, 5, 3, 3)).astype(np.float32)
+                input_ortvalue = onnxrt.OrtValue.ortvalue_from_numpy(input_data, 'cuda', 0)
+                feeds = {'input': input_ortvalue}
+                output_data = session.run(output_names=['output'], input_feed=feeds)[0]
+                self.assertTrue(np.allclose(input_data, output_data))
+
     def testSparseTensorCooFormat(self):
         cpu_device = onnxrt.OrtDevice.make('cpu', 0)
         shape = [9,9]
diff --git a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
index d7ead9fbb0..4d2660c053 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_iobinding.py
@@ -39,6 +39,9 @@ class TestIOBinding(unittest.TestCase):
         # Invoke Run
         session.run_with_iobinding(io_binding)
         
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
+
         # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
         ort_output = io_binding.copy_outputs_to_cpu()[0]
 
@@ -54,11 +57,17 @@ class TestIOBinding(unittest.TestCase):
         # Bind input to CUDA
         io_binding.bind_input('X', 'cuda', 0, np.float32, [3, 2], input.data_ptr())
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_inputs()
+
         # Bind output to CPU
         io_binding.bind_output('Y')
         
         # Invoke Run
         session.run_with_iobinding(io_binding)
+
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
         
         # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
         ort_output = io_binding.copy_outputs_to_cpu()[0]
@@ -79,8 +88,14 @@ class TestIOBinding(unittest.TestCase):
         output = self.create_uninitialized_ortvalue_input_on_gpu()
         io_binding.bind_output('Y', 'cuda', 0, np.float32, [3, 2], output.data_ptr())
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_inputs()
+
         # Invoke Run
         session.run_with_iobinding(io_binding)
+
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
         
         # Get outputs over to CPU (the outputs which were bound to CUDA will get copied over to the host here)
         ort_output_vals = io_binding.copy_outputs_to_cpu()[0]
@@ -104,9 +119,15 @@ class TestIOBinding(unittest.TestCase):
         # Bind output to CUDA
         io_binding.bind_output('Y', 'cuda')
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_inputs()
+
         # Invoke Run
         session.run_with_iobinding(io_binding)
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
+
         # This call returns an OrtValue which has data allocated by ORT on CUDA
         ort_outputs = io_binding.get_outputs()
         self.assertEqual(len(ort_outputs), 1)
@@ -124,10 +145,16 @@ class TestIOBinding(unittest.TestCase):
         # Change the bound input and validate the results in the same bound OrtValue
         # Bind alternate input to CUDA
         io_binding.bind_input('X', 'cuda', 0, np.float32, [3, 2], self.create_ortvalue_alternate_input_on_gpu().data_ptr())
-        
+
+        # Sync if different CUDA streams
+        io_binding.synchronize_inputs()
+
         # Invoke Run
         session.run_with_iobinding(io_binding)
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
+
         # This call returns an OrtValue which has data allocated by ORT on CUDA
         ort_outputs = io_binding.get_outputs()
         self.assertEqual(len(ort_outputs), 1)
@@ -147,9 +174,15 @@ class TestIOBinding(unittest.TestCase):
         output_ortvalue = self.create_uninitialized_ortvalue_input_on_gpu()
         io_binding.bind_ortvalue_output('Y', output_ortvalue)
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_inputs()
+
         # Invoke Run
         session.run_with_iobinding(io_binding)
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
+
         # Inspect contents of output_ortvalue and make sure that it has the right contents
         self.assertTrue(np.array_equal(self.create_expected_output(), output_ortvalue.numpy()))
 
@@ -157,9 +190,15 @@ class TestIOBinding(unittest.TestCase):
         input_ortvalue_2 = self.create_ortvalue_alternate_input_on_gpu()
         io_binding.bind_ortvalue_input('X', input_ortvalue_2)
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_inputs()
+
         # Invoke Run
         session.run_with_iobinding(io_binding)
 
+        # Sync if different CUDA streams
+        io_binding.synchronize_outputs()
+
         # Inspect contents of output_ortvalue and make sure that it has the right contents
         self.assertTrue(np.array_equal(self.create_expected_output_alternate(), output_ortvalue.numpy()))
 
diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
index e3935c4e40..ad046cce89 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py
@@ -36,6 +36,52 @@ class TestSymbolicShapeInference(unittest.TestCase):
                                                 int_max=100000,
                                                 guess_output_rank=True)
 
+    def test_mismatched_types(self):  # If node whose two branches disagree on the output element type
+        graph = helper.make_graph(
+            [helper.make_node(
+                "If",
+                ["x"],
+                ["out"],
+                name="if_node",  # node name that the expected error message must mention
+                then_branch=helper.make_graph(  # "then" subgraph: a single Constant typed FLOAT
+                    [helper.make_node(
+                        "Constant",
+                        [],
+                        ["one_float"],
+                        value=helper.make_tensor(
+                            "one_float_value",
+                            TensorProto.FLOAT,
+                            [],
+                            [1]),
+                    )],
+                    "then",
+                    [],
+                    [helper.make_tensor_value_info("one_float", TensorProto.FLOAT, [])],
+                ),
+                else_branch=helper.make_graph(  # "else" subgraph: a single Constant typed DOUBLE
+                    [helper.make_node(
+                        "Constant",
+                        [],
+                        ["one_double"],
+                        value=helper.make_tensor(
+                            "one_double",
+                            TensorProto.DOUBLE,
+                            [],
+                            [1]),
+                    )],
+                    "else",
+                    [],
+                    [helper.make_tensor_value_info("one_double", TensorProto.DOUBLE, [])],
+                ))],
+            "graph",
+            [helper.make_tensor_value_info("x", TensorProto.BOOL, [])],
+            [helper.make_tensor_value_info("out", TensorProto.FLOAT, [])],
+        )
+        model = helper.make_model(graph, producer_name="test_mismatched_types")
+
+        with self.assertRaisesRegex(ValueError, r"if_node.*FLOAT.*DOUBLE"):  # error must name the node and both types
+            SymbolicShapeInference.infer_shapes(model, auto_merge=True)
+
 
 class TestSymbolicShapeInferenceForOperators(unittest.TestCase):
     def _check_shapes(self, graph, inferred_graph, vis):  # type: (GraphProto, GraphProto, List[ValueInfoProto]) -> None
@@ -238,7 +284,7 @@ class TestSymbolicShapeInferenceForOperators(unittest.TestCase):
 
     def test_einsum_transpose(self):
         self._test_einsum_one_input_impl(['a', 'b'], ['b', 'a'], "ij -> ji")
-        
+
 
 class TestSymbolicShapeInferenceForSlice(unittest.TestCase):
     def check_slice_of_concat(self, input_dims, start, end, step, expected_output_dim):
diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py
index 445ff858c0..d8d4280e37 100644
--- a/onnxruntime/test/python/quantization/test_qdq.py
+++ b/onnxruntime/test/python/quantization/test_qdq.py
@@ -10,7 +10,7 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantType, QuantFormat
+from onnxruntime.quantization import quantize_static, QuantType, QuantFormat, QuantizationMode, QDQQuantizer
 from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order
 
 class TestQDQFormat(unittest.TestCase):
@@ -24,6 +24,177 @@ class TestQDQFormat(unittest.TestCase):
         dr = TestDataFeeds(input_data_list)
         return dr
 
+class TestQDQExtraOptions(unittest.TestCase):
+    def test_qdq_extra_options(self):
+        """Add1 must get Q/DQ on its inputs and a QuantizeLinear on its output 'P'; excluded Add2 must get none."""
+        #   (input)
+        #      |
+        #     Add
+        #      |
+        #     ReduceMean
+        #      |
+        #     Add
+        #      |
+        #   (output)
+        initializers = []
+
+        input_tensor = helper.make_tensor_value_info('L', TensorProto.FLOAT, [5, 5])
+        output_tensor = helper.make_tensor_value_info('O', TensorProto.FLOAT, [5, 5])
+
+        add_weight_data_1 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
+        initializers.append(onnx.numpy_helper.from_array(add_weight_data_1, name="M"))
+        add_weight_data_2 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
+        initializers.append(onnx.numpy_helper.from_array(add_weight_data_2, name="N"))
+
+        add_node_1 = onnx.helper.make_node('Add', ['L', 'M'], ['P'], name='Add1')
+        reduce_mean_node = onnx.helper.make_node('ReduceMean', ['P'], ['Q'], keepdims=1, name='ReduceMean')
+        add_node_2 = onnx.helper.make_node('Add', ['Q', 'N'], ['O'], name='Add2')
+
+        graph = helper.make_graph([add_node_1, reduce_mean_node, add_node_2], 'QDQ_Test_Finetune', [input_tensor], [output_tensor], initializer=initializers)
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        test_model_path = './test_qdq_finetune.onnx'
+        onnx.save(model, test_model_path)
+
+        compute_range = {
+            'P': [0.1, 0.1],
+            'Q': [0.1, 0.1],
+            'M': [0.1, 0.1],
+            'N': [0.1, 0.1],
+            'L': [0.1, 0.1],
+            'O': [0.1, 0.1],
+        }
+
+        op_types_to_quantize = ['Add']
+
+        mode = QuantizationMode.QLinearOps
+        model = onnx.load_model(test_model_path, False)
+        quantizer = QDQQuantizer(
+            model,
+            True, #per_channel
+            False, #reduce_range
+            mode,
+            True,  #static
+            QuantType.QInt8, #weight_type
+            QuantType.QInt8, #activation_type
+            compute_range,
+            [], #nodes_to_quantize
+            ['Add2'], #nodes_to_exclude
+            op_types_to_quantize,
+            {'ActivationSymmetric' : True, 'AddQDQPairToWeight' : True, 'OpTypesToExcludeOutputQuantizatioin': []}) #extra_options; NOTE(review): key spelling presumably matches what QDQQuantizer reads - confirm before renaming
+        quantizer.quantize_model()
+        qdq_model_path = './test_qdq_finetune_qdq.onnx'
+        quantizer.model.save_model_to_file(qdq_model_path, False)
+
+        # QDQ pair should be added to Add1 but not Add2
+        # QDQ pair should be added to Add1 output as well.
+        qdq_added_to_node_output_flag = False
+        for node in quantizer.model.nodes():
+            if node.name == 'Add1':
+                for input_name in node.input:
+                    self.assertIn("DequantizeLinear", input_name)
+                for output_name in node.output:
+                    self.assertNotIn("QuantizeLinear", output_name)
+
+            if node.name == 'Add2':
+                for input_name in node.input:
+                    self.assertNotIn("DequantizeLinear", input_name)
+                for output_name in node.output:
+                    self.assertNotIn("QuantizeLinear", output_name)
+
+            # This QuantizeLinear node should directly follow Add1, i.e. consume Add1's output 'P'
+            if node.name == 'P_QuantizeLinear':
+                qdq_added_to_node_output_flag = True
+                self.assertEqual(node.input[0], 'P')  # compare by value; 'is' would test object identity
+
+        self.assertTrue(qdq_added_to_node_output_flag)
+
+
+    def test_qdq_extra_options_2(self):
+        """With DedicatedQDQPair, each MatMul reading 'T' must get its own DequantizeLinear; excluded Add gets none."""
+        #         (input)
+        #           |
+        #          Add
+        #       /   |   \
+        #  MatMul MatMul MatMul
+        #     |     |      |
+        # (output)(output)(output)
+        initializers = []
+
+        input_tensor = helper.make_tensor_value_info('L', TensorProto.FLOAT, [5, 5])
+        output_tensor1 = helper.make_tensor_value_info('M', TensorProto.FLOAT, [5, 5])
+        output_tensor2 = helper.make_tensor_value_info('N', TensorProto.FLOAT, [5, 5])
+        output_tensor3 = helper.make_tensor_value_info('O', TensorProto.FLOAT, [5, 5])
+
+        add_weight_data = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
+        initializers.append(onnx.numpy_helper.from_array(add_weight_data, name="P"))
+        matmul_weight_data_1 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
+        initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_1, name="Q"))
+        matmul_weight_data_2 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
+        initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_2, name="R"))
+        matmul_weight_data_3 = np.random.normal(0, 0.1, [5, 5]).astype(np.float32)
+        initializers.append(onnx.numpy_helper.from_array(matmul_weight_data_3, name="S"))  # each MatMul gets its own weight
+
+        add_node = onnx.helper.make_node('Add', ['L', 'P'], ['T'], name='Add')
+        matmul_node_1 = onnx.helper.make_node('MatMul', ['T', 'Q'], ['M'], name='MatMul1')
+        matmul_node_2 = onnx.helper.make_node('MatMul', ['T', 'R'], ['N'], name='MatMul2')
+        matmul_node_3 = onnx.helper.make_node('MatMul', ['T', 'S'], ['O'], name='MatMul3')
+
+        graph = helper.make_graph([add_node, matmul_node_1, matmul_node_2, matmul_node_3], 'QDQ_Test_Finetune_2', [input_tensor], [output_tensor1, output_tensor2, output_tensor3], initializer=initializers)
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        test_model_path = './test_qdq_finetune_2.onnx'
+        onnx.save(model, test_model_path)
+
+        compute_range = {
+            'L': [0.1, 0.1],
+            'M': [0.1, 0.1],
+            'N': [0.1, 0.1],
+            'O': [0.1, 0.1],
+            'P': [0.1, 0.1],
+            'Q': [0.1, 0.1],
+            'R': [0.1, 0.1],
+            'S': [0.1, 0.1],
+            'T': [0.1, 0.1],
+        }
+
+        op_types_to_quantize = ['Add', 'MatMul']
+
+        mode = QuantizationMode.QLinearOps
+        model = onnx.load_model(test_model_path, False)
+        quantizer = QDQQuantizer(
+            model,
+            True, #per_channel
+            False, #reduce_range
+            mode,
+            True,  #static
+            QuantType.QInt8, #weight_type
+            QuantType.QInt8, #activation_type
+            compute_range,
+            [], #nodes_to_quantize
+            ['Add'], #nodes_to_exclude
+            op_types_to_quantize,
+            {'ActivationSymmetric' : True, 'AddQDQPairToWeight' : True, 'OpTypesToExcludeOutputQuantizatioin': op_types_to_quantize, 'DedicatedQDQPair': True}) #extra_options; NOTE(review): key spelling presumably matches what QDQQuantizer reads - confirm before renaming
+        quantizer.quantize_model()
+        qdq_model_path = './test_qdq_finetune_qdq_2.onnx'
+        quantizer.model.save_model_to_file(qdq_model_path, False)
+
+        # Three dedicated QDQ pairs should be generated, one feeding each MatMul node
+        # Also a QDQ pair should not be added to the Add node
+        # A QDQ pair should not be added to any node's output
+        for node in quantizer.model.nodes():
+            if node.name == 'MatMul1':
+                self.assertIn("T_DequantizeLinear_1", node.input)
+            if node.name == 'MatMul2':
+                self.assertIn("T_DequantizeLinear_2", node.input)
+            if node.name == 'MatMul3':
+                self.assertIn("T_DequantizeLinear_3", node.input)
+            if node.name == 'Add':
+                for input_name in node.input:
+                    self.assertNotIn("DequantizeLinear", input_name)
+
+            # A QDQ pair should not be added to MatMul's output
+            if node.op_type == 'QuantizeLinear':
+                self.assertNotIn(node.input[0], ['M_QuantizeLinearInput', 'N_QuantizeLinearInput', 'O_QuantizeLinearInput'])
+
 class TestQDQFormatConv(TestQDQFormat):
     def construct_model_conv(self, output_model_path, input_shape, weight_shape, output_shape, has_bias):
         #    (input)
diff --git a/onnxruntime/test/python/transformers/test_parity_decoder_attention.py b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py
new file mode 100644
index 0000000000..79934fbd94
--- /dev/null
+++ b/onnxruntime/test/python/transformers/test_parity_decoder_attention.py
@@ -0,0 +1,423 @@
+# --------------------------------------------------------------------------
+# Copyright 2020 The HuggingFace Inc. team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+# --------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation.  All rights reserved.
+# Licensed under the MIT License.  See License.txt in the project root for
+# license information.
+# -------------------------------------------------------------------------
+
+import math
+import numpy
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+from typing import Dict, List, Optional, Tuple
+import os
+
+torch.manual_seed(0)
+
+"""
+This is an example of export bart decoder attention with huggingface v3.5.1
+def my_bart_attention_forward(
+    self,
+    query,
+    key: Tensor,
+    key_padding_mask: Optional[Tensor],
+    layer_state: Optional[List[Tensor]],
+    attn_mask: Optional[Tensor] = None,
+    output_attentions: bool=False,
+    use_past=torch.tensor(False),
+):
+    static_kv: bool = self.encoder_decoder_attention
+    q_weight = self.q_proj.weight.transpose(0,1)
+    q_weight = q_weight.reshape(self.embed_dim, self.embed_dim)
+
+    kv_weight = torch.stack((self.k_v_proj.k_proj.weight.transpose(0,1), self.k_v_proj.v_proj.weight.transpose(0,1)), dim=1)
+    kv_weight = kv_weight.reshape(self.embed_dim, 2 * self.embed_dim)
+
+    bias = torch.stack((self.q_proj.bias, self.k_v_proj.k_proj.bias, self.k_v_proj.v_proj.bias), dim=0)
+    bias = bias.reshape(3 * self.embed_dim)
+
+    self_p_k, self_p_v, enc_dec_p_k, enc_dec_p_v = layer_state
+    if static_kv:
+        key_cache, value_cache = enc_dec_p_k, enc_dec_p_v
+    else:
+        key_cache, value_cache = self_p_k, self_p_v
+
+    if not static_kv:
+        key_padding_mask = torch.tensor(False)
+
+    attn_output, new_key_cache, new_value_cache = torch.ops.onnxruntime.DecoderAttention(
+                                                    query,
+                                                    key,
+                                                    q_weight,
+                                                    kv_weight,
+                                                    bias,
+                                                    key_padding_mask,
+                                                    key_cache,
+                                                    value_cache,
+                                                    torch.tensor(static_kv), #static_kv
+                                                    use_past, #use_past
+                                                    torch.tensor(True), #has_layer_state
+                                                    torch.tensor(static_kv), #has_key_padding_mask
+                                                    self.num_heads)
+
+    if not use_past:
+        if self.encoder_decoder_attention:
+            layer_state[2] = new_key_cache
+            layer_state[3] = new_value_cache
+        else:
+            layer_state[0] = new_key_cache
+            layer_state[1] = new_value_cache
+    else:
+        if not self.encoder_decoder_attention:
+            layer_state[0] = new_key_cache
+            layer_state[1] = new_value_cache
+
+    attn_output = self.out_proj(attn_output)
+
+    return attn_output, None, layer_state
+"""
+
+class Config:  # problem sizes for one DecoderAttention parity run
+    batch_size = 0  # class-level defaults; __init__ overwrites every one of them on the instance
+    sequence_length = 0
+    kv_sequence_length = 0
+    num_heads = 0
+    head_size = 0
+    embed_dim = 0
+
+    def __init__(self, b, s, s2, n, h):  # b: batch size, s: sequence length, s2: kv sequence length, n: heads, h: head size
+        self.batch_size = b
+        self.sequence_length = s
+        self.kv_sequence_length = s2
+        self.num_heads = n
+        self.head_size = h
+        self.embed_dim = self.num_heads * self.head_size  # derived from the head layout, not passed in
+
+class AttentionProjection(nn.Module):  # key/value projection that can reuse cached state from layer_state
+    def __init__(self, num_heads, head_dim, embed_dim, bias=True):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def shape_state(self, state, batch_size):  # cached state viewed as (batch * heads, -1, head_dim)
+        return state.view(batch_size * self.num_heads, -1, self.head_dim)
+
+    def shape_proj(self, proj, batch_size):  # viewed as (-1, batch * heads, head_dim), then first two dims swapped
+        return proj.view(-1, batch_size * self.num_heads, self.head_dim).transpose(0, 1)
+
+    def forward(
+        self,
+        query,
+        key,
+        layer_state: Optional[List[Tensor]],
+        encoder_decoder_attention: bool,
+        use_past=torch.tensor(False),
+    ):
+        bsz = torch._shape_as_tensor(query)[1]  # batch size is read from dim 1 of query
+        if layer_state is None or not use_past:  # nothing cached to reuse: project from scratch
+            if not encoder_decoder_attention:  # self-attention: k/v come from the query
+                k = self.k_proj(query)
+                v = self.v_proj(query)
+                k = self.shape_proj(k, bsz)
+                v = self.shape_proj(v, bsz)
+            else:  # cross-attention: k/v come from the key
+                k = self.k_proj(key)
+                v = self.v_proj(key)
+                k = self.shape_proj(k, bsz)
+                v = self.shape_proj(v, bsz)
+        else:  # cached state is available and requested
+            self_p_k, self_p_v, enc_dec_p_k, enc_dec_p_v = layer_state
+            if not encoder_decoder_attention:
+                k = self.k_proj(query)
+                v = self.v_proj(query)
+                k = self.shape_proj(k, bsz)
+                v = self.shape_proj(v, bsz)
+                k = torch.cat([self.shape_state(self_p_k, bsz), k], dim=1)  # cached entries first, new ones appended on dim 1
+                v = torch.cat([self.shape_state(self_p_v, bsz), v], dim=1)
+            else:  # cross-attention reuses the cached tensors without projecting anything new
+                k = self.shape_state(enc_dec_p_k, bsz)
+                v = self.shape_state(enc_dec_p_v, bsz)
+
+        return k, v
+
+class AttentionForONNX(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        bias=True,
+        encoder_decoder_attention=False,  # otherwise self_attention
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout  # stored only; neither forward nor ORT_forward reads it
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+        self.scaling = self.head_dim ** -0.5  # multiplied into the query projection in forward
+
+        self.encoder_decoder_attention = encoder_decoder_attention
+        self.k_v_proj = torch.jit.script(AttentionProjection(num_heads, self.head_dim, embed_dim, bias))  # K/V projection is compiled with TorchScript
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.cache_key = "encoder_decoder" if self.encoder_decoder_attention else "self"  # not referenced elsewhere in this file
+
+    def _shape(self, tensor, seq_len, bsz):  # view as (seq_len, bsz * heads, head_dim), then swap the first two dims
+        return tensor.contiguous().view(seq_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+
+    def forward(
+        self,
+        query,
+        key: Tensor,
+        key_padding_mask: Optional[Tensor] = None,
+        layer_state: Optional[List[Tensor]] = None,
+        attn_mask: Optional[Tensor] = None,
+        output_attentions: bool=False,
+        use_past=torch.tensor(False),
+        has_key_padding_mask: bool=False
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """PyTorch reference path. Input shape: Time(SeqLen) x Batch x Channel
+
+        attn_mask and output_attentions are accepted but not used by this implementation.
+        key_padding_mask is applied only when has_key_padding_mask is True.
+        Returns (attn_output, new_key_cache, new_value_cache), with both caches viewed as
+        (bsz, num_heads, -1, head_dim).
+        """
+        tgt_len, bsz, embed_dim = query.size()
+        # get here for encoder decoder cause of static_kv
+        k, v = self.k_v_proj(query, key, layer_state, self.encoder_decoder_attention, use_past)
+
+        q = self.q_proj(query) * self.scaling
+        q = self._shape(q, tgt_len, bsz)
+
+        # Update cache. Cross-attention and self-attention share one cache layout, so a
+        # single reshape serves both. The caches are built unconditionally so that the
+        # return statement stays valid when layer_state is None.
+        cached_shape = (bsz, self.num_heads, -1, self.head_dim)  # bsz must be first for reorder_cache
+        new_key_cache = k.view(*cached_shape)
+        new_value_cache = v.view(*cached_shape)
+
+        src_len = k.size(1)
+        assert key_padding_mask is None or key_padding_mask.shape == (bsz, src_len)
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        assert attn_weights.size() == (bsz * self.num_heads, tgt_len, src_len)
+
+        if has_key_padding_mask:  # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            reshaped = key_padding_mask.unsqueeze(1).unsqueeze(2)
+            attn_weights = attn_weights.masked_fill(reshaped, float("-inf"))
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        attn_probs = attn_weights
+
+        assert v is not None
+        attn_output = torch.bmm(attn_probs, v)
+        assert attn_output.size() == (bsz * self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, new_key_cache, new_value_cache
+
+    def ORT_forward(
+        self,
+        query,
+        key: Tensor,
+        key_padding_mask: Optional[Tensor] = None,
+        layer_state: Optional[List[Tensor]] = None,
+        attn_mask: Optional[Tensor] = None,
+        output_attentions: bool=False,
+        use_past=torch.tensor(False),
+        has_key_padding_mask: bool=False
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """ONNX Runtime counterpart of forward(). Input shape: Time(SeqLen) x Batch x Channel
+
+        Builds a DecoderAttention model from this module's weights and runs it on CUDAExecutionProvider.
+        layer_state and key_padding_mask are required here despite their None defaults (ValueError
+        otherwise); attn_mask and output_attentions are unused. Returns (attn_output, key cache, value cache).
+        """
+        if layer_state is None or key_padding_mask is None:
+            raise ValueError("ORT_forward requires both layer_state and key_padding_mask")
+
+        static_kv = bool(self.encoder_decoder_attention)
+        has_layer_state = layer_state is not None
+        use_past_cache = bool(use_past)
+
+        q_weight = self.q_proj.weight.transpose(0,1).reshape(self.embed_dim, self.embed_dim)
+
+        kv_weight = torch.stack((self.k_v_proj.k_proj.weight.transpose(0,1), self.k_v_proj.v_proj.weight.transpose(0,1)), dim=1)
+        kv_weight = kv_weight.reshape(self.embed_dim, 2 * self.embed_dim)
+
+        bias = torch.stack((self.q_proj.bias, self.k_v_proj.k_proj.bias, self.k_v_proj.v_proj.bias), dim=0)
+        bias = bias.reshape(3 * self.embed_dim)
+
+        onnx_model_str = create_decoder_attention_graph(query, key, q_weight, kv_weight, bias, self.num_heads, static_kv, use_past_cache, has_layer_state, has_key_padding_mask)
+
+        self_p_k, self_p_v, enc_dec_p_k, enc_dec_p_v = layer_state
+        # Cross-attention feeds the encoder-decoder cache, self-attention the self cache.
+        key_cache, value_cache = (enc_dec_p_k, enc_dec_p_v) if static_kv else (self_p_k, self_p_v)
+
+        ort_inputs = {
+            'query': numpy.ascontiguousarray(query.cpu().numpy()),
+            'key': numpy.ascontiguousarray(key.cpu().numpy()),
+            'key_padding_mask': numpy.ascontiguousarray(key_padding_mask.cpu().numpy()),
+            'key_cache': numpy.ascontiguousarray(key_cache.detach().cpu().numpy()),
+            'value_cache': numpy.ascontiguousarray(value_cache.detach().cpu().numpy())
+        }
+
+        from onnxruntime import SessionOptions, InferenceSession
+        ort_session = InferenceSession(onnx_model_str, SessionOptions(), providers=['CUDAExecutionProvider'])
+        output, new_key_cache, new_value_cache = ort_session.run(None, ort_inputs)
+        attn_output = self.out_proj(torch.tensor(output))
+
+        return attn_output, torch.tensor(new_key_cache), torch.tensor(new_value_cache)
+
+
+def create_decoder_attention_graph(query, key, q_weight, kv_weight, bias, num_heads_, static_kv, use_past, has_layer_state, has_key_padding_mask):
+    """Serialize a one-node com.microsoft DecoderAttention model; weights and the four bool flags become initializers."""
+    from onnx import helper, TensorProto
+    S, B, NH = query.size()  # the three dims of query
+    S2 = key.size()[0]  # first dim of key
+    N = num_heads_
+    H = NH // N  # per-head size; floor division avoids a float round trip
+
+    nodes = [
+        helper.make_node("DecoderAttention",
+                         ["query", "key", "q_weight", "kv_weight", "bias", "key_padding_mask", "key_cache", "value_cache", "static_kv", "use_past", "has_layer_state", "has_key_padding_mask"],
+                         ["output", "new_key_cache", "new_value_cache"],
+                         "DecoderAttention_0",
+                         num_heads=num_heads_,
+                         domain="com.microsoft"),
+    ]
+
+    initializers = [
+        helper.make_tensor('q_weight', TensorProto.FLOAT, [NH, NH],
+                           q_weight.flatten().tolist()),
+        helper.make_tensor('kv_weight', TensorProto.FLOAT, [NH, 2 * NH],
+                           kv_weight.flatten().tolist()),
+        helper.make_tensor('bias', TensorProto.FLOAT, [3 * NH],
+                           bias.flatten().tolist()),
+        helper.make_tensor('static_kv', TensorProto.BOOL, [1],
+                           [static_kv]),
+        helper.make_tensor('use_past', TensorProto.BOOL, [1],
+                           [use_past]),
+        helper.make_tensor('has_layer_state', TensorProto.BOOL, [1],
+                           [has_layer_state]),
+        helper.make_tensor('has_key_padding_mask', TensorProto.BOOL, [1],
+                           [has_key_padding_mask]),
+    ]
+
+    graph = helper.make_graph(nodes, "DecoderAttention_Graph", [
+        helper.make_tensor_value_info('query', TensorProto.FLOAT, [S, B, NH]),
+        helper.make_tensor_value_info('key', TensorProto.FLOAT, [S2, B, NH]),
+        helper.make_tensor_value_info('key_padding_mask', TensorProto.BOOL, [B, "mask_len"]),
+        helper.make_tensor_value_info('key_cache', TensorProto.FLOAT, [B, N, "cache_len", H]),
+        helper.make_tensor_value_info('value_cache', TensorProto.FLOAT, [B, N, "cache_len", H]),
+    ], [
+        helper.make_tensor_value_info('output', TensorProto.FLOAT, [S, B, NH]),
+        helper.make_tensor_value_info('new_key_cache', TensorProto.FLOAT, [B, N, "new_cache_len", H]),
+        helper.make_tensor_value_info('new_value_cache', TensorProto.FLOAT, [B, N, "new_cache_len", H]),
+    ], initializers)
+
+    model = helper.make_model(graph)
+    return model.SerializeToString()
+
+
+def create_inputs(config: Config, has_layer_state: bool, use_past: bool, encoder_decoder_attention:bool):
+    """Draw random inputs for one parity configuration.
+
+    query is (sequence_length, batch_size, embed_dim) and key is (kv_sequence_length,
+    batch_size, embed_dim). All four layer_state entries refer to the same cache tensor
+    of shape (batch_size, num_heads, kv_sequence_length, head_size).
+    Returns (query, key, key_padding_mask, layer_state, use_past wrapped in a tensor).
+    """
+    # Every tensor below is drawn with mean 0.0 and std 0.1, in this fixed order.
+    query_shape = (config.sequence_length,
+                   config.batch_size,
+                   config.embed_dim)
+    query = torch.normal(mean=0.0, std=0.1, size=query_shape).to(torch.float32)
+
+    key_shape = (config.kv_sequence_length,
+                 config.batch_size,
+                 config.embed_dim)
+    key = torch.normal(mean=0.0, std=0.1, size=key_shape).to(torch.float32)
+
+    # The mask width follows the number of key positions: cross-attention always uses
+    # kv_sequence_length, while self-attention uses sequence_length plus, when cached
+    # state is in use, kv_sequence_length more.
+    if encoder_decoder_attention:
+        key_length = config.kv_sequence_length
+    elif has_layer_state and use_past:
+        key_length = config.sequence_length + config.kv_sequence_length
+    else:
+        key_length = config.sequence_length
+
+    # Positions whose noise sample is positive are marked True in the mask.
+    mask_noise = torch.normal(mean=0.0, std=0.1, size=(config.batch_size, key_length))
+    key_padding_mask = mask_noise > 0
+    # Force one entry to False so the mask is never all True
+    key_padding_mask[0][0] = False
+
+    cache_shape = (config.batch_size,
+                   config.num_heads,
+                   config.kv_sequence_length,
+                   config.head_size)
+    cache = torch.normal(mean=0.0, std=0.1, size=cache_shape).to(torch.float32)
+    # The same tensor object stands in for all four cached states.
+    layer_state = [cache, cache, cache, cache]
+
+    return query, key, key_padding_mask, layer_state, torch.tensor(use_past)
+
+
+def parity_check(config, has_layer_state, use_past, static_kv, has_key_padding_mask, rtol = 1e-4, atol = 1e-4):  # prints parity results; nothing is asserted
+    query, key, key_padding_mask, layer_state, use_past = create_inputs(config,  # note: use_past is rebound to the tensor returned here
+                                                                        has_layer_state,
+                                                                        use_past,
+                                                                        static_kv)
+    attn = AttentionForONNX(config.embed_dim,
+                            config.num_heads,
+                            encoder_decoder_attention = static_kv)
+    attn_output, new_key_cache, new_value_cache = attn.forward(query, key, key_padding_mask, layer_state, None, False, use_past, has_key_padding_mask)  # PyTorch reference
+    attn_output_ort, new_key_cache_ort, new_value_cache_ort = attn.ORT_forward(query, key, key_padding_mask, layer_state, None, False, use_past, has_key_padding_mask)  # ONNX Runtime result
+    attn_output_ort_1, _, _ = attn.ORT_forward(query, key, key_padding_mask, layer_state, None, False, use_past, has_key_padding_mask)  # second ORT run, compared with the first as "randomness"
+    print(" B:", config.batch_size,
+          " S:", config.sequence_length,
+          " S*:", config.kv_sequence_length,
+          " h:", config.embed_dim,
+          " has_layer_state:", has_layer_state,
+          " use_past:", use_past,
+          " static_kv:", static_kv,
+          " has_key_padding_mask:", has_key_padding_mask,
+          "[attn_output, randomness, key, value] parity:",  # the four booleans below follow this order
+          numpy.allclose(attn_output.detach().numpy(), attn_output_ort.detach().numpy(), rtol = rtol, atol = atol, equal_nan = True),
+          numpy.allclose(attn_output_ort_1.detach().numpy(), attn_output_ort.detach().numpy(), rtol = rtol, atol = atol, equal_nan = True),
+          numpy.allclose(new_key_cache.detach().numpy(), new_key_cache_ort.detach().numpy(), rtol = rtol, atol = atol, equal_nan = True),
+          numpy.allclose(new_value_cache.detach().numpy(), new_value_cache_ort.detach().numpy(), rtol = rtol, atol = atol, equal_nan = True))
+
+
+if __name__ == '__main__':  # sweep batch, sequence and kv-sequence sizes with num_heads=8 and head_size=64
+    for b in [1, 32, 128]:
+        for s in [1, 2, 128]:
+            for s2 in [1, 64, 256]:
+                for n in [8]:
+                    for h in [64]:
+                        config = Config(b, s, s2, n, h)
+                        parity_check(config, has_layer_state = True, use_past = True, static_kv = True, has_key_padding_mask = False)  # all 8 combinations of use_past / static_kv / has_key_padding_mask follow
+                        parity_check(config, has_layer_state = True, use_past = True, static_kv = False, has_key_padding_mask = False)
+                        parity_check(config, has_layer_state = True, use_past = False, static_kv = True, has_key_padding_mask = False)
+                        parity_check(config, has_layer_state = True, use_past = False, static_kv = False, has_key_padding_mask = False)
+                        parity_check(config, has_layer_state = True, use_past = True, static_kv = True, has_key_padding_mask = True)
+                        parity_check(config, has_layer_state = True, use_past = True, static_kv = False, has_key_padding_mask = True)
+                        parity_check(config, has_layer_state = True, use_past = False, static_kv = True, has_key_padding_mask = True)
+                        parity_check(config, has_layer_state = True, use_past = False, static_kv = False, has_key_padding_mask = True)
\ No newline at end of file
diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc
index 3aca885ea2..73e436263c 100644
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@@ -196,6 +196,8 @@ static constexpr PATH_TYPE PYOP_MULTI_MODEL_URI = TSTR("testdata/pyop_2.onnx");
 static constexpr PATH_TYPE PYOP_KWARG_MODEL_URI = TSTR("testdata/pyop_3.onnx");
 #endif
 
+static constexpr PATH_TYPE RESIZE_AND_CROP_MODEL_URI = TSTR("testdata/crop_and_resize.onnx");
+
 class CApiTestWithProvider : public testing::Test, public ::testing::WithParamInterface<int> {
 };
 
@@ -1053,15 +1055,17 @@ TEST(CApiTest, io_binding_cuda) {
   Ort::Value bound_y = Ort::Value::CreateTensor(info_cuda, reinterpret_cast<float*>(output_data.get()),
                                                 expected_y.size(), expected_y_shape.data(), expected_y_shape.size());
 
-  // Sychronize to make sure the copy on default stream is done since TensorRT isn't using default stream.
-  cudaStreamSynchronize(nullptr);
 
   Ort::IoBinding binding(session);
   binding.BindInput("X", bound_x);
   binding.BindOutput("Y", bound_y);
+  // Synchronize to make sure the copy on default stream is done since TensorRT isn't using default stream.
+  binding.SynchronizeInputs();
 
   session.Run(Ort::RunOptions(), binding);
 
+  binding.SynchronizeOutputs();
+
   // Check the values against the bound raw memory (needs copying from device to host first)
   std::array<float, 3 * 2> y_values_0;
   cudaMemcpy(y_values_0.data(), output_data.get(), sizeof(float) * y_values_0.size(), cudaMemcpyDeviceToHost);
@@ -1881,5 +1885,42 @@ TEST(CApiTest, TestPerSessionCustomThreadPoolHooks) {
   ASSERT_TRUE(custom_join_hook_called == (thread_count - 1) << 1);
 }
 
+// Preventing resize transformer issue:
+// https://github.com/microsoft/onnxruntime/issues/9857
+TEST(CApiTest, crop_and_resize) {  // loads RESIZE_AND_CROP_MODEL_URI, runs it once, and checks the output's element type and shape
+  std::vector<float> input_value_0;
+  input_value_0.resize(2 * 36 * 36 * 3);  // element count of the {2, 36, 36, 3} shape declared below
+  for (int i = 0; i < 36 * 36 * 3; ++i) {
+    input_value_0[i] = 1.f;  // first half of the buffer is all ones
+    input_value_0[i + 36 * 36 * 3] = 2.f;  // second half is all twos
+  }
+  std::vector<int64_t> input_shape_0{2, 36, 36, 3};
+
+  std::vector<int32_t> input_value_1{1, 0};
+  std::vector<int64_t> input_shape_1{2};
+
+  std::vector<const char*> input_names{"input:0", "input2:0"};
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  std::vector<Ort::Value> ort_inputs;  // tensors wrap the vectors above, which stay alive until the end of the test
+  ort_inputs.emplace_back(Ort::Value::CreateTensor<float>(info, input_value_0.data(), input_value_0.size(), input_shape_0.data(), input_shape_0.size()));
+  ort_inputs.emplace_back(Ort::Value::CreateTensor<int32_t>(info, input_value_1.data(), input_value_1.size(), input_shape_1.data(), input_shape_1.size()));
+
+  Ort::SessionOptions session_options;
+  Ort::Session session(*ort_env, RESIZE_AND_CROP_MODEL_URI, session_options);
+
+  const char* output_names[] = {"output:0"};
+  std::vector<int64_t> output_shape{2, 20, 20, 3};  // shape the single output is expected to have
+
+  std::vector<Ort::Value> ort_outputs = session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), output_names, countof(output_names));
+  ASSERT_EQ(ort_outputs.size(), 1U);
+  const auto& output_0 = ort_outputs[0];
+  ASSERT_TRUE(output_0.IsTensor());
+
+  auto output_type_shape = output_0.GetTensorTypeAndShapeInfo();  // only type and shape are verified; output values are not inspected
+  ASSERT_EQ(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, output_type_shape.GetElementType());
+  ASSERT_EQ(output_shape, output_type_shape.GetShape());
+}
+
 }  // namespace TestPerSessionCustomThreadHooks
 #endif
diff --git a/onnxruntime/test/testdata/crop_and_resize.onnx b/onnxruntime/test/testdata/crop_and_resize.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..72c31fcd6a0404ce69474c819c5a8e1437e64eaa
GIT binary patch
literal 3107
zcma)8&u`;I6pj;bJejsM9onU>*rjVNQt@FG|47pmL_+|fH48#52NVt?H*vZVsT~{#
zx_jcnfj<FG{cpH*Py98U;LVKfu@gtyNLJq1Z@&3v-uK=-R3}<Iw!$E|J*=D0%+|Bk
z*&qA(7^6urTg28`i{Q)2YoF3~>#TK%Qd?J<byfjSjeBy(m)?wda>vBhA6zmEJO>za
zlRMz;VHnI~I*+|5o>O{)x5NuS7{RcU4!v6+hNnPcIvE0Xmt!Y3U^5G_|0V#Q(ZmR_
zMzD@a-U}Bo?1M($uQV|!wE?6qM!NbU3}?pg1glr!=#};F4nD`*iKjp@EkeW#J_eIg
z(>~mMKJtTj62GF<>FOvQ0hurGW&$t)PZJ+VIM{I*_;eZu*OXc<()eryJS`IlX^E-V
zzyOpVvGzGTqO;Z#oScIX3&^kiC`_QuQ_>J8He}F=DbMRUmY?xn0c9L6BA|3awl4g+
z2%>cgARD=%t|bHDd4Y?LxCzIUi>z(ocf=bJTH7WYNib~(a1$cbb|k$Y@P2_4J2UoB
ztBv!M$tE1p(1U>3Cjf=zNUsaq*ubiHJDKlS?v(0Vu(i5NY+I5oi&DnclxzoOap1wA
zmLvJX^c-K7B$97?9p7)T+#*cLjbOt$xK07CvktB&^LnkSQssC6nJCAToBbqJ4t&PD
zD^a3mHx)J1j#SiED(cIuTI698MZ<(8Es8@lJ1M0rN$E;b`Z~pe&t&`i$P4DPaL&}i
z5`C-iLT_Y4V+X4<Kf38vdbOU?tKKOZ2*kq62p|BF1~yBBMm7KfNoD&Ef-GHD5@1PE
zj+B&>mz0x})I}y6=7Nbxpm8`I(QpQG@0c?74%=lWCC6SD`+t;`t>xqd0>XP@8UJPm
zRc$;CSz=io>BLqdn~?gv(mGtia-`RVZFKPCVHifEN#MmkjafX#VRYlglQ5t+;mFrX
zJf_1aoKX+Tbj0!Ql$!Afzw6(E|16ve=U=?(!r!aF?@m!zy!h<;bQ1Vp1T!FB-{JzH
zLi`<;75Vr(o$~lsnF2)ozbqUQLm*q1KQDZr-GL4-gt?)vB?D1s{CN+Uf{e&Ua;d@v
z;1~v~HkE!!u}+JGi`X974}6uTR^}`24D&SS`viRJ{bKn}Q^r*}+%aEzZ8c?Jzn!zk
z_=qpO{;qP7!Wja`Bff5F^1CYYGCt&BMP}*6;v!CsxiKKEzkFeIb6YI;uv}EllUc#A
z@)(c#DuqkPY2^?~zT0+=W!d-q{70w);Vn-dw{o#}4T^qw1SxPN%H<L3GzN#o0t^id
zLK;zV6)A&*)fS|VRn#DrtM_q9GeXj@ALF7}iGh!iLY`tw&?5MWH7=()yDM#{b*DUl
zd>Y{$Q3V43THLwqc3J&_ek<$9WG!kJuHp8{x?1!VUe}n(YQwMy5=RDkEc=gkC+k15
z=GFLs2YI=xbWttU2}>5Ozq&aW{g?cBo@)J)V(T_{6PWbhc*Nm~O>0gwh0jS7o0Ii+
zv(e9*R2OQSiK<8i6{S6&djt!f+qTW0@pydx=KjT}Z|?8;^VgpT=ND@1&*Q<9>_#OI
q@h-%=2@0yh^E7#CgNO3c=PIjnl~u2FoSp8{++CQ<o?5@!-1r|ql52ec

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/identity_9799.onnx b/onnxruntime/test/testdata/identity_9799.onnx
new file mode 100644
index 0000000000..c54fa57c61
--- /dev/null
+++ b/onnxruntime/test/testdata/identity_9799.onnx
@@ -0,0 +1,20 @@
+pytorch1.10:�
+%
+inputoutput
+Identity_0"Identitytorch-jit-exportZ;
+input2
+0,
+input_dynamic_axes_1
+
+
+
+
+b<
+output2
+0,
+input_dynamic_axes_1
+
+
+
+
+B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
index bb0f31e904..3b1747a90f 100644
--- a/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
+++ b/onnxruntime/test/testdata/kernel_def_hashes/contrib.cpu.json
@@ -27,6 +27,10 @@
         "MeanVarianceNormalization ai.onnx CPUExecutionProvider",
         13114085849278607104
     ],
+    [
+        "NhwcMaxPool com.microsoft CPUExecutionProvider",
+        11773579655431087496
+    ],
     [
         "ParametricSoftplus ai.onnx CPUExecutionProvider",
         17971715260566574960
diff --git a/onnxruntime/test/testdata/kernel_def_hashes/training_ops.cpu.json b/onnxruntime/test/testdata/kernel_def_hashes/training_ops.cpu.json
index 764b882112..856996d074 100644
--- a/onnxruntime/test/testdata/kernel_def_hashes/training_ops.cpu.json
+++ b/onnxruntime/test/testdata/kernel_def_hashes/training_ops.cpu.json
@@ -99,6 +99,10 @@
         "LogSoftmaxGrad com.microsoft CPUExecutionProvider",
         2657523710083167200
     ],
+    [
+        "LogSoftmaxGrad_13 com.microsoft CPUExecutionProvider",
+        1917456134240183096
+    ],
     [
         "MaxPoolGrad ai.onnx CPUExecutionProvider",
         17526822836083413768
@@ -239,6 +243,10 @@
         "SoftmaxGrad com.microsoft CPUExecutionProvider",
         4483165757863027152
     ],
+    [
+        "SoftmaxGrad_13 com.microsoft CPUExecutionProvider",
+        8375491041422269560
+    ],
     [
         "SparseSoftmaxCrossEntropy ai.onnx CPUExecutionProvider",
         10638058507241762520
diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc
index 1fad171dae..342132e5c3 100755
--- a/orttraining/orttraining/core/graph/gradient_builder.cc
+++ b/orttraining/orttraining/core/graph/gradient_builder.cc
@@ -707,7 +707,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetSigmoidGradient) {
 
 IMPLEMENT_GRADIENT_BUILDER(GetSoftmaxGradient) {
   return std::vector<NodeDef>{
-      NodeDef(OpDef{"SoftmaxGrad", kMSDomain, 1},
+      NodeDef(OpDef{SrcNodeOpsetVersion() < 13 ? "SoftmaxGrad" : "SoftmaxGrad_13", kMSDomain, 1},
               {GO(0), O(0)},
               {GI(0)},
               SrcNodeAttributes())};
@@ -715,7 +715,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetSoftmaxGradient) {
 
 IMPLEMENT_GRADIENT_BUILDER(GetLogSoftmaxGradient) {
   return std::vector<NodeDef>{
-      NodeDef(OpDef{"LogSoftmaxGrad", kMSDomain, 1},
+      NodeDef(OpDef{SrcNodeOpsetVersion() < 13 ? "LogSoftmaxGrad" : "LogSoftmaxGrad_13", kMSDomain, 1},
               {GO(0), O(0)},
               {GI(0)},
               SrcNodeAttributes())};
diff --git a/orttraining/orttraining/core/graph/graph_augmenter.h b/orttraining/orttraining/core/graph/graph_augmenter.h
index 8409150599..3a6d7a4099 100644
--- a/orttraining/orttraining/core/graph/graph_augmenter.h
+++ b/orttraining/orttraining/core/graph/graph_augmenter.h
@@ -182,7 +182,7 @@ class GraphAugmenter {
     }
 
     TypeProto* CopyTypeProto(const NodeArg* node_arg) {
-      ORT_ENFORCE(node_arg != nullptr, "During CopyTypeProto, ", node_arg->Name(), "'s node_arg is null.");
+      ORT_ENFORCE(node_arg != nullptr, "During CopyTypeProto, node_arg is null.");
       TypeProto* type_proto = CreateTypeProto();
       type_proto->CopyFrom(*(node_arg->TypeAsProto()));
       return type_proto;
diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc
index e14f0f2d9e..749032ea9d 100644
--- a/orttraining/orttraining/core/graph/training_op_defs.cc
+++ b/orttraining/orttraining/core/graph/training_op_defs.cc
@@ -650,6 +650,24 @@ void RegisterTrainingOpSchemas() {
             return true;
           });
 
+  ONNX_CONTRIB_OPERATOR_SCHEMA(SoftmaxGrad_13)  // schema for the SoftmaxGrad variant registered under the com.microsoft domain
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .Input(0, "dY", "Gradient of output Y", "T")
+      .Input(1, "Y", "Input tensor", "T")
+      .Output(0, "dX", "Gradient of input X", "T")
+      .Attr(
+          "axis",
+          "Describes the dimension Softmax will be performed on. "  // trailing space: adjacent literals are concatenated
+          "Defaults to -1. Negative value means counting dimensions from the back.",
+          AttributeProto::INT,
+          static_cast<int64_t>(-1))
+      .TypeConstraint(
+          "T",
+          {"tensor(float16)", "tensor(float)", "tensor(double)", "tensor(bfloat16)"},
+          "Constrain input and output types to float tensors.")
+      .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput);  // dX takes its type and shape from dY (input 0)
+
   ONNX_CONTRIB_OPERATOR_SCHEMA(LogSoftmaxGrad)
       .SetDomain(kMSDomain)
       .SinceVersion(1)
@@ -669,6 +687,24 @@ void RegisterTrainingOpSchemas() {
           "Constrain input and output types to float tensors.")
       .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput);
 
+  ONNX_CONTRIB_OPERATOR_SCHEMA(LogSoftmaxGrad_13)  // schema for the LogSoftmaxGrad variant registered under the com.microsoft domain
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .Input(0, "dY", "Gradient of output Y", "T")
+      .Input(1, "X", "Input tensor", "T")
+      .Output(0, "dX", "Gradient of input X", "T")
+      .Attr(
+          "axis",
+          "Describes the dimension LogSoftmax will be performed on. "  // trailing space: adjacent literals are concatenated
+          "Defaults to -1. Negative value means counting dimensions from the back.",
+          AttributeProto::INT,
+          static_cast<int64_t>(-1))
+      .TypeConstraint(
+          "T",
+          {"tensor(float16)", "tensor(float)", "tensor(double)"},
+          "Constrain input and output types to float tensors.")
+      .TypeAndShapeInferenceFunction(propagateShapeAndTypeFromFirstInput);  // dX takes its type and shape from dY (input 0)
+
   ONNX_CONTRIB_OPERATOR_SCHEMA(AveragePoolGrad)
       .SinceVersion(9)
       .Input(0, "dY", "Gradient of output Y", "T")
diff --git a/orttraining/orttraining/eager/opgen/onnxgen.py b/orttraining/orttraining/eager/opgen/onnxgen.py
index d53b09b2c0..5579750a75 100755
--- a/orttraining/orttraining/eager/opgen/onnxgen.py
+++ b/orttraining/orttraining/eager/opgen/onnxgen.py
@@ -21,6 +21,24 @@ for schema in defs.get_all_schemas_with_history():
     onnx_ops[key].since_version < schema.since_version:
     onnx_ops[key] = schema
 
+def convert_to_aten_type(onnx_type_strs):  # maps ONNX tensor type strings to the set of matching ATen scalar-type names
+  type_map = {'tensor(float16)' : 'at::kHalf',
+              'tensor(float)' : 'at::kFloat',
+              'tensor(double)' : 'at::kDouble',
+              'tensor(bfloat16)' : 'at::kBFloat16',
+              'tensor(int32)' : 'at::kInt',
+              'tensor(int16)' : 'at::kShort',
+              'tensor(int8)' : 'at::kByte',  # NOTE(review): ATen's kByte is uint8 and kChar is int8 -- confirm this mapping is intended
+              'tensor(int64)' : 'at::kLong',
+              'tensor(bool)' : 'at::kBool',
+             }
+  result = set({})  # a set: unordered, and repeated ATen names collapse into one entry
+  for onnx_type in onnx_type_strs:
+    # ONNX has more types, like tensor(string); ignore those types at this moment
+    if onnx_type in type_map:
+      result.add(type_map[onnx_type])
+  return result
+  return result
+
 with open(out_file, 'wt') as fp:
   def write(s): fp.write(s)
   def writeline(s = ''): fp.write(s + '\n')
@@ -54,9 +72,17 @@ with open(out_file, 'wt') as fp:
 
     writeline('):')
     write(f'    super().__init__(\'{schema.name}\', {len(schema.outputs)}')
-
+    writeline(',')
+    write('      ')
+    input_types = []
     for input in schema.inputs:
-      write(f', {input.name}')
+      input_types.append(convert_to_aten_type(input.types))
+    write(str(input_types))
+    if len(schema.inputs) > 0:
+      writeline(',')
+      input_names = ','.join([input.name for input in schema.inputs])
+      write(f'      {input_names}')
+    
 
     if len(schema.attributes) > 0:
       writeline(',')
diff --git a/orttraining/orttraining/eager/opgen/opgen/atenops.py b/orttraining/orttraining/eager/opgen/opgen/atenops.py
index c07c7d281d..5da6ac607f 100644
--- a/orttraining/orttraining/eager/opgen/opgen/atenops.py
+++ b/orttraining/orttraining/eager/opgen/opgen/atenops.py
@@ -4,7 +4,7 @@ from opgen.generator import \
   ORTGen as ORTGen, \
   ONNXOp as ONNXOp, \
   SignatureOnly as SignatureOnly, \
-  MakeFallthrough as MakeFallthrough
+  MakeTorchFallback as MakeTorchFallback
 
 from opgen.onnxops import *
 
@@ -12,17 +12,17 @@ kMSDomain = 'onnxruntime::kMSDomain'
 
 class ReluGrad(ONNXOp):
   def __init__(self, dY, X):
-    super().__init__('ReluGrad', 1, dY, X)
+    super().__init__('ReluGrad', 1, [{'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kHalf', 'at::kFloat', 'at::kBFloat16'}], dY, X)
     self.domain = kMSDomain
 
 class Gelu(ONNXOp):
   def __init__(self, X):
-    super().__init__('Gelu', 1, X)
+    super().__init__('Gelu', 1, [{'at::kHalf', 'at::kFloat', 'at::kBFloat16'}], X)
     self.domain = kMSDomain
 
 class GeluGrad(ONNXOp):
   def __init__(self, dY, X):
-    super().__init__('GeluGrad', 1, dY, X)
+    super().__init__('GeluGrad', 1, [{'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kHalf', 'at::kFloat', 'at::kBFloat16'}], dY, X)
     self.domain = kMSDomain
 
 ops = {
@@ -31,8 +31,9 @@ ops = {
   'aten::empty_strided': SignatureOnly(),
   'aten::zero_': SignatureOnly(),
   'aten::copy_': SignatureOnly(),
-  'aten::reshape': SignatureOnly(),
+  'aten::_reshape_alias': SignatureOnly(),
   'aten::view': SignatureOnly(),
+  'aten::_copy_from_and_resize' : SignatureOnly(),
 
   'aten::addmm': Gemm('mat1', 'mat2', 'self', alpha='alpha', beta='beta'),
   'aten::t': Transpose('self'),
@@ -48,7 +49,20 @@ ops = {
   'aten::softshrink': Shrink('self', bias='lambd', lambd='lambd'), #yes, bias is set to 'lambd'
   'aten::hardshrink': Shrink('self', bias=0, lambd='lambd'),
   'aten::gelu' : Gelu('self'),
-  'aten::gelu_backward' : GeluGrad('grad', 'self')
+  'aten::gelu_backward' : GeluGrad('grad', 'self'),
+  'aten::max' : ReduceMax('self', keepdims=1),
+  'aten::min' : ReduceMin('self', keepdims=1),
+
+  'aten::ne.Scalar':MakeTorchFallback(),
+  'aten::ne.Scalar_out': MakeTorchFallback(),
+  'aten::ne.Tensor_out': MakeTorchFallback(),
+  'aten::eq.Tensor': MakeTorchFallback(),
+  'aten::eq.Tensor_out':MakeTorchFallback(),
+  'aten::bitwise_and.Tensor_out' : MakeTorchFallback(),
+  'aten::masked_select' : MakeTorchFallback(),
+  'aten::as_strided' : MakeTorchFallback(),
+  'aten::_local_scalar_dense' : MakeTorchFallback(),
+  'aten::gt.Scalar_out' : MakeTorchFallback(),
 }
 
 for binary_op, onnx_op in {
@@ -64,7 +78,7 @@ for unary_op in [
   'abs','acos','acosh', 'asinh', 'atanh', 'asin', 'atan', 'ceil', 'cos',
   'cosh', 'erf', 'exp', 'floor', 'isnan', 'log', 'reciprocal', 'neg', 'round',
   'relu', 'selu', 'sigmoid', 'sin', 'sinh', 'sqrt', 'tan', 'tanh', 'nonzero',
-  'sign', 'min', 'max', 'hardsigmoid', 'isinf', 'det']:
+  'sign', 'hardsigmoid', 'isinf', 'det']:
   aten_name = f'aten::{unary_op}'
   onnx_op = onnx_ops[unary_op]('self')
   ops[aten_name] = onnx_op
diff --git a/orttraining/orttraining/eager/opgen/opgen/custom_ops.py b/orttraining/orttraining/eager/opgen/opgen/custom_ops.py
index 90ed820c83..4fe53bbbf9 100644
--- a/orttraining/orttraining/eager/opgen/opgen/custom_ops.py
+++ b/orttraining/orttraining/eager/opgen/opgen/custom_ops.py
@@ -6,7 +6,7 @@ from opgen.generator import \
   ORTGen as ORTGen, \
   ONNXOp as ONNXOp, \
   SignatureOnly as SignatureOnly, \
-  MakeFallthrough as MakeFallthrough
+  MakeTorchFallback as MakeTorchFallback
 
 from opgen.onnxops import *
 
diff --git a/orttraining/orttraining/eager/opgen/opgen/generator.py b/orttraining/orttraining/eager/opgen/opgen/generator.py
index 467a4f50ad..eaa3a8deef 100644
--- a/orttraining/orttraining/eager/opgen/opgen/generator.py
+++ b/orttraining/orttraining/eager/opgen/opgen/generator.py
@@ -48,6 +48,7 @@ class ONNXOp:
   def __init__(self,
     name: str,
     outputs: int,
+    input_types: List,
     *inputs: Union[str, Outputs],
     **attributes: Optional[Union[str, Outputs]]):
     self.name = name
@@ -55,6 +56,7 @@ class ONNXOp:
     self.inputs = inputs
     self.attributes = attributes
     self.domain = None
+    self.input_types = input_types
 
   def eval(self, ctx: ONNXOpEvalContext):
     evaluated_inputs = []
@@ -71,10 +73,10 @@ class ONNXOp:
     return self.outputs
 
 class SignatureOnly(ONNXOp):
-  def __init__(self): super().__init__(None, 0)
+  def __init__(self): super().__init__(None, 0, [])
 
-class MakeFallthrough(ONNXOp):
-  def __init__(self): super().__init__(None, 0)
+class MakeTorchFallback(ONNXOp):
+  def __init__(self): super().__init__(None, 0, [])
 
 class FunctionGenerationError(NotImplementedError):
   def __init__(self, cpp_func: ast.FunctionDecl, message: str):
@@ -88,13 +90,13 @@ class MappedOpFunction:
     onnx_op: ONNXOp,
     cpp_func: ast.FunctionDecl,
     signature_only: bool,
-    make_fallthrough: bool):
+    make_torch_fallback: bool):
     self.op_namespace = op_namespace
     self.mapped_op_name = mapped_op_name
     self.onnx_op = onnx_op
     self.cpp_func = cpp_func
     self.signature_only = signature_only
-    self.make_fallthrough = make_fallthrough
+    self.make_torch_fallback = make_torch_fallback
 
 class ORTGen:
   _mapped_ops: Dict[str, ONNXOp]
@@ -126,9 +128,6 @@ class ORTGen:
       del self._mapped_ops[mapped_func.mapped_op_name]
       generated_funcs.append(mapped_func)
 
-      if mapped_func.make_fallthrough:
-        continue
-
       ns = mapped_func.op_namespace
       if current_ns and current_ns != ns:
         current_ns = None
@@ -173,6 +172,7 @@ class ORTGen:
     writer.writeline('#include "python/onnxruntime_pybind_state_common.h"')
     writer.writeline()
     writer.writeline('#include <torch/extension.h>')
+    writer.writeline('#include <ATen/native/CPUFallback.h>')
     writer.writeline()
     writer.writeline('#include <core/providers/dml/OperatorAuthorHelper/Attributes.h>')
     writer.writeline()
@@ -206,6 +206,27 @@ class ORTGen:
     writer.pop_indent()
     writer.write(')')
 
+  def _write_cpu_fall_back(self,
+                           writer: writer.SourceWriter,
+                           mapped_func: MappedOpFunction):
+      # Writes a `return` statement that hands the call to native::cpu_fallback, e.g.:
+      #   return native::call_fallback_fn<
+      #     &native::cpu_fallback,
+      #     ATEN_OP(eq_Tensor)>::call(self, other);
+      cpp_func = mapped_func.cpp_func  # the mapped ONNX op itself is not needed to emit the fallback
+      writer.writeline('return native::call_fallback_fn<')
+      writer.push_indent()
+      writer.writeline('&native::cpu_fallback,')
+      writer.write('ATEN_OP(')
+      writer.write(cpp_func.identifier.value)
+      writer.write(')>::call(')
+      # Forward every parameter that has an identifier, in declaration order.
+      params = ', '.join(p.member.identifier.value for p in cpp_func.parameters if p.member.identifier)
+      writer.write(params)
+      writer.writeline(');')
+      writer.pop_indent()
+
+
   def _write_function_body(
     self,
     writer: writer.SourceWriter,
@@ -214,6 +235,15 @@ class ORTGen:
 
     assert(len(cpp_func.parameters) > 0)
 
+    # Debug Logging
+    log_params = ', '.join([p.member.identifier.value for p \
+      in cpp_func.parameters if p.member.identifier])
+    writer.writeline(f'ORT_LOG_FN({log_params});')
+    writer.writeline()
+
+    if mapped_func.make_torch_fallback:
+      return self._write_cpu_fall_back(writer, mapped_func)
+
     return_alias_info = self._get_alias_info(cpp_func.torch_func.return_type) if cpp_func.torch_func else None
     if return_alias_info and not return_alias_info.is_writable:
       return_alias_info = None
@@ -224,11 +254,32 @@ class ORTGen:
     onnx_op.eval(ctx)
     ctx.prepare_outputs()
 
-    # Debug Logging
-    log_params = ', '.join([p.member.identifier.value for p \
-      in cpp_func.parameters if p.member.identifier])
-    writer.writeline(f'ORT_LOG_FN({log_params});')
-    writer.writeline()
+    # generate the type check
+    need_type_check = False
+    if not self._custom_ops:
+      for onnx_op_index, onnx_op in enumerate(ctx.ops):
+        for op_input in onnx_op.inputs:
+          if not isinstance(op_input, Outputs):
+            need_type_check = True
+            break
+    if need_type_check:
+      writer.write('if (')
+      i = 0
+      for onnx_op_index, onnx_op in enumerate(ctx.ops):
+        for idx, op_input in enumerate(onnx_op.inputs):
+          if isinstance(op_input, Outputs):
+            continue
+          writer.writeline(' || ' if i > 0 else '')
+          if i == 0:
+            writer.push_indent()
+          cpp_param = cpp_func.get_parameter(op_input)
+          supported_types = ','.join([type for type in onnx_op.input_types[idx]])
+          writer.write('!IsSupportedType(%s, {%s})' % (cpp_param.identifier.value, supported_types))
+          i += 1
+      writer.writeline(') {')
+      self._write_cpu_fall_back(writer, mapped_func)
+      writer.pop_indent()
+      writer.writeline('}')      
 
     # Fetch the ORT invoker from an at::Tensor.device()
     # FIXME: find the first at::Tensor param anywhere in the signature
@@ -258,10 +309,10 @@ class ORTGen:
           continue
         # See if this input is aliased as an in-place tensor
         cpp_param = cpp_func.get_parameter(op_input)
-        if return_alias_info and cpp_param and \
-          len(cpp_param.torch_param) == 1 and \
-          self._get_alias_info(cpp_param.torch_param[0]) == return_alias_info:
-          in_place_param = cpp_param
+        if return_alias_info and cpp_param:
+          for torch_p in cpp_param.torch_param:
+            if self._get_alias_info(torch_p) == return_alias_info:
+              in_place_param = cpp_param
 
         writer.write(f'auto ort_input_{op_input} = ')
         writer.writeline(f'create_ort_value(invoker, {op_input});')
@@ -367,18 +418,15 @@ class ORTGen:
     for mapped_func in generated_funcs:
       cpp_func, torch_func = mapped_func.cpp_func, mapped_func.cpp_func.torch_func
 
-      if mapped_func.make_fallthrough:
-        reg_function_arg = 'torch::CppFunction::makeFallthrough()'
+      
+      if mapped_func.op_namespace:
+        reg_function_arg = f'{mapped_func.op_namespace}::'
       else:
-        if mapped_func.op_namespace:
-          reg_function_arg = f'{mapped_func.op_namespace}::'
-        else:
-          reg_function_arg = ''
-        reg_function_arg += cpp_func.identifier.value
+        reg_function_arg = ''
+      reg_function_arg += cpp_func.identifier.value
 
       writer.write('m.impl(')
-      if not mapped_func.make_fallthrough:
-        reg_function_arg = f'TORCH_FN({reg_function_arg})'
+      reg_function_arg = f'TORCH_FN({reg_function_arg})'
 
       writer.writeline(f'"{torch_func.identifier.value}", {reg_function_arg});')
 
@@ -427,7 +475,7 @@ class ORTGen:
           op_namespace = None
           op_namewithoutnamespace = op_name
 
-        cpp_func.identifier.value = op_namewithoutnamespace.replace('.', '__')
+        cpp_func.identifier.value = op_namewithoutnamespace.replace('.', '_')
 
       onnx_op = self._mapped_ops.get(op_name)
       if not onnx_op:
@@ -439,7 +487,7 @@ class ORTGen:
         onnx_op,
         cpp_func,
         isinstance(onnx_op, SignatureOnly),
-        isinstance(onnx_op, MakeFallthrough))
+        isinstance(onnx_op, MakeTorchFallback))
 
   def _parse_function_decls(self, cpp_parser: parser.CPPParser):
     # Parse the C++ declarations
diff --git a/orttraining/orttraining/eager/opgen/opgen/onnxops.py b/orttraining/orttraining/eager/opgen/opgen/onnxops.py
index aa0530fd5f..97a83cf3be 100644
--- a/orttraining/orttraining/eager/opgen/opgen/onnxops.py
+++ b/orttraining/orttraining/eager/opgen/opgen/onnxops.py
@@ -1,5 +1,5 @@
 # AUTO-GENERATED CODE! - DO NOT EDIT!
-# $ python opgen/onnxgen.py
+# $ python onnxgen.py
 
 from opgen.generator import ONNXAttr, ONNXOp, AttrType
 
@@ -11,7 +11,9 @@ class Abs(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Abs', 1, X)
+    super().__init__('Abs', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      X)
 
 class Acos(ONNXOp):
   """
@@ -19,7 +21,9 @@ class Acos(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Acos', 1, input)
+    super().__init__('Acos', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Acosh(ONNXOp):
   """
@@ -27,7 +31,9 @@ class Acosh(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Acosh', 1, input)
+    super().__init__('Acosh', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Adagrad(ONNXOp):
   """
@@ -36,12 +42,12 @@ class Adagrad(ONNXOp):
   
       Let's define the behavior of this operator. As you can imagine, ADAGRAD requires
       some parameters:
-       
+  
        - The initial learning-rate "R".
        - The update count "T". That is, the number of training iterations conducted.
        - A L2-norm regularization coefficient "norm_coefficient".
        - A learning-rate decay factor "decay_factor".
-       - A small constant "epsilon" to avoid dividing-by-zero. 
+       - A small constant "epsilon" to avoid dividing-by-zero.
   
       At each ADAGRAD iteration, the optimized tensors are moved along a direction
       computed based on their estimated gradient and accumulated squared gradient. Assume
@@ -87,7 +93,9 @@ class Adagrad(ONNXOp):
     decay_factor=None, 
     epsilon=None, 
     norm_coefficient=None):
-    super().__init__('Adagrad', 1, R, T, inputs,
+    super().__init__('Adagrad', 1,
+      [{'at::kDouble', 'at::kFloat'}, {'at::kLong'}, {'at::kDouble', 'at::kFloat'}],
+      R,T,inputs,
       decay_factor=ONNXAttr(decay_factor, AttrType.FLOAT), 
       epsilon=ONNXAttr(epsilon, AttrType.FLOAT), 
       norm_coefficient=ONNXAttr(norm_coefficient, AttrType.FLOAT))
@@ -99,18 +107,18 @@ class Adam(ONNXOp):
   
       Let's define the behavior of this operator. First of all, Adam requires
       some parameters:
-       
+  
        - The learning-rate "R".
        - The update count "T". That is, the number of training iterations conducted.
        - A L2-norm regularization coefficient "norm_coefficient".
-       - A small constant "epsilon" to avoid dividing-by-zero. 
+       - A small constant "epsilon" to avoid dividing-by-zero.
        - Two coefficients, "alpha" and "beta".
   
       At each Adam iteration, the optimized tensors are moved along a direction
       computed based on their exponentially-averaged historical gradient and
       exponentially-averaged historical squared gradient. Assume that only a tensor
       "X" is being optimized. The rest of required information is
-      
+  
        - the value of "X",
        - "X"'s gradient (denoted by "G"),
        - "X"'s exponentially-averaged historical gradient (denoted by "V"), and
@@ -120,8 +128,8 @@ class Adam(ONNXOp):
       are stored as this operator's attributes. Specifically, this operator's input tensor
       list is ["R", "T", "X", "G", "V", "H"]. That is, "R" is the first input, "T" is
       the second input, and so on. Other parameters are given as attributes because they
-      are constants. Moreover, the corresponding output tensors are 
-      
+      are constants. Moreover, the corresponding output tensors are
+  
        - the new value of "X" (called "X_new"),
        - the new exponentially-averaged historical gradient (denoted by "V_new"), and
        - the new exponentially-averaged historical squared gradient (denoted by "H_new").
@@ -151,7 +159,7 @@ class Adam(ONNXOp):
         X_new = X - R_adjusted * V_new / H_sqrt
   
         // Post-update regularization.
-        X_final = (1 - norm_coefficient_post) * X_new 
+        X_final = (1 - norm_coefficient_post) * X_new
   
       If there are multiple inputs to be optimized, the pseudo code will be applied
       independently to each of them.
@@ -163,7 +171,9 @@ class Adam(ONNXOp):
     epsilon=None, 
     norm_coefficient=None, 
     norm_coefficient_post=None):
-    super().__init__('Adam', 1, R, T, inputs,
+    super().__init__('Adam', 1,
+      [{'at::kDouble', 'at::kFloat'}, {'at::kLong'}, {'at::kDouble', 'at::kFloat'}],
+      R,T,inputs,
       alpha=ONNXAttr(alpha, AttrType.FLOAT), 
       beta=ONNXAttr(beta, AttrType.FLOAT), 
       epsilon=ONNXAttr(epsilon, AttrType.FLOAT), 
@@ -175,10 +185,14 @@ class Add(ONNXOp):
   Performs element-wise binary addition (with Numpy-style broadcasting support).
   
   This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md).
+  
+  (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
   """
 
   def __init__(self, A, B):
-    super().__init__('Add', 1, A, B)
+    super().__init__('Add', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class And(ONNXOp):
   """
@@ -189,15 +203,17 @@ class And(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('And', 1, A, B)
+    super().__init__('And', 1,
+      [{'at::kBool'}, {'at::kBool'}],
+      A,B)
 
 class ArgMax(ONNXOp):
   """
-  Computes the indices of the max elements of the input tensor's element along the 
-  provided axis. The resulting tensor has the same rank as the input if keepdims equal 1. 
-  If keepdims equal 0, then the resulting tensor have the reduced dimension pruned. 
-  If select_last_index is True (default False), the index of the last occurrence of the max 
-  is selected if the max appears more than once in the input. Otherwise the index of the 
+  Computes the indices of the max elements of the input tensor's element along the
+  provided axis. The resulting tensor has the same rank as the input if keepdims equal 1.
+  If keepdims equal 0, then the resulting tensor have the reduced dimension pruned.
+  If select_last_index is True (default False), the index of the last occurrence of the max
+  is selected if the max appears more than once in the input. Otherwise the index of the
   first occurrence is selected.
   The type of the output tensor is integer.
   """
@@ -206,18 +222,20 @@ class ArgMax(ONNXOp):
     axis=None, 
     keepdims=None, 
     select_last_index=None):
-    super().__init__('ArgMax', 1, data,
+    super().__init__('ArgMax', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axis=ONNXAttr(axis, AttrType.INT), 
       keepdims=ONNXAttr(keepdims, AttrType.INT), 
       select_last_index=ONNXAttr(select_last_index, AttrType.INT))
 
 class ArgMin(ONNXOp):
   """
-  Computes the indices of the min elements of the input tensor's element along the 
-  provided axis. The resulting tensor has the same rank as the input if keepdims equal 1. 
-  If keepdims equal 0, then the resulting tensor have the reduced dimension pruned. 
-  If select_last_index is True (default False), the index of the last occurrence of the min 
-  is selected if the min appears more than once in the input. Otherwise the index of the 
+  Computes the indices of the min elements of the input tensor's element along the
+  provided axis. The resulting tensor has the same rank as the input if keepdims equal 1.
+  If keepdims equal 0, then the resulting tensor have the reduced dimension pruned.
+  If select_last_index is True (default False), the index of the last occurrence of the min
+  is selected if the min appears more than once in the input. Otherwise the index of the
   first occurrence is selected.
   The type of the output tensor is integer.
   """
@@ -226,7 +244,9 @@ class ArgMin(ONNXOp):
     axis=None, 
     keepdims=None, 
     select_last_index=None):
-    super().__init__('ArgMin', 1, data,
+    super().__init__('ArgMin', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axis=ONNXAttr(axis, AttrType.INT), 
       keepdims=ONNXAttr(keepdims, AttrType.INT), 
       select_last_index=ONNXAttr(select_last_index, AttrType.INT))
@@ -238,7 +258,9 @@ class ArrayFeatureExtractor(ONNXOp):
   """
 
   def __init__(self, X, Y):
-    super().__init__('ArrayFeatureExtractor', 1, X, Y)
+    super().__init__('ArrayFeatureExtractor', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}, {'at::kLong'}],
+      X,Y)
 
 class Asin(ONNXOp):
   """
@@ -246,7 +268,9 @@ class Asin(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Asin', 1, input)
+    super().__init__('Asin', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Asinh(ONNXOp):
   """
@@ -254,7 +278,9 @@ class Asinh(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Asinh', 1, input)
+    super().__init__('Asinh', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Atan(ONNXOp):
   """
@@ -262,7 +288,9 @@ class Atan(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Atan', 1, input)
+    super().__init__('Atan', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Atanh(ONNXOp):
   """
@@ -270,7 +298,9 @@ class Atanh(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Atanh', 1, input)
+    super().__init__('Atanh', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class AveragePool(ONNXOp):
   """
@@ -312,7 +342,9 @@ class AveragePool(ONNXOp):
     kernel_shape=None, 
     pads=None, 
     strides=None):
-    super().__init__('AveragePool', 1, X,
+    super().__init__('AveragePool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       ceil_mode=ONNXAttr(ceil_mode, AttrType.INT), 
       count_include_pad=ONNXAttr(count_include_pad, AttrType.INT), 
@@ -324,22 +356,55 @@ class BatchNormalization(ONNXOp):
   """
   Carries out batch normalization as described in the paper
   https://arxiv.org/abs/1502.03167. Depending on the mode it is being run,
-  there are multiple cases for the number of outputs, which we list below:
+  There are five required inputs 'X', 'scale', 'B', 'input_mean' and
+  'input_var'.
+  Note that 'input_mean' and 'input_var' are expected to be the estimated
+  statistics in inference mode (training_mode=False, default),
+  and the running statistics in training mode (training_mode=True).
+  There are multiple cases for the number of outputs, which we list below:
   
-  Output case #1: Y, mean, var, saved_mean, saved_var (training mode)
-  Output case #2: Y (test mode)
+  Output case #1: Y, running_mean, running_var (training_mode=True)
+  Output case #2: Y (training_mode=False)
+  
+  When training_mode=False, extra outputs are invalid.
+  The outputs are updated as follows when training_mode=True:
+  ```
+  running_mean = input_mean * momentum + current_mean * (1 - momentum)
+  running_var = input_var * momentum + current_var * (1 - momentum)
+  
+  Y = (X - current_mean) / sqrt(current_var + epsilon) * scale + B
+  
+  where:
+  
+  current_mean = ReduceMean(X, axis=all_except_channel_index)
+  current_var =  ReduceVar(X, axis=all_except_channel_index)
+  
+  Notice that ReduceVar refers to the population variance, and it equals to
+  sum(sqrd(x_i - x_avg)) / N
+  where N is the population size (this formula does not use sample size N - 1).
+  
+  ```
+  
+  When training_mode=False:
+  ```
+  Y = (X - input_mean) / sqrt(input_var + epsilon) * scale + B
+  ```
   
   For previous (depreciated) non-spatial cases, implementors are suggested
-  to flatten the input shape to (N x C*D1*D2 ..*Dn) before a BatchNormalization Op.
+  to flatten the input shape to (N x C * D1 * D2 * ... * Dn) before a BatchNormalization Op.
   This operator has **optional** inputs/outputs. See [the doc](IR.md) for more details about the representation of optional arguments. An empty string may be used in the place of an actual argument's name to indicate a missing argument. Trailing optional arguments (those not followed by an argument that is present) may also be simply omitted.
   """
 
-  def __init__(self, X, scale, B, mean, var,
+  def __init__(self, X, scale, B, input_mean, input_var,
     epsilon=None, 
-    momentum=None):
-    super().__init__('BatchNormalization', 5, X, scale, B, mean, var,
+    momentum=None, 
+    training_mode=None):
+    super().__init__('BatchNormalization', 3,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X,scale,B,input_mean,input_var,
       epsilon=ONNXAttr(epsilon, AttrType.FLOAT), 
-      momentum=ONNXAttr(momentum, AttrType.FLOAT))
+      momentum=ONNXAttr(momentum, AttrType.FLOAT), 
+      training_mode=ONNXAttr(training_mode, AttrType.INT))
 
 class Binarizer(ONNXOp):
   """
@@ -348,7 +413,9 @@ class Binarizer(ONNXOp):
 
   def __init__(self, X,
     threshold=None):
-    super().__init__('Binarizer', 1, X,
+    super().__init__('Binarizer', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       threshold=ONNXAttr(threshold, AttrType.FLOAT))
 
 class BitShift(ONNXOp):
@@ -361,7 +428,7 @@ class BitShift(ONNXOp):
    Y specifies the amounts of shifting. For example, if "direction" is "Right", X is [1, 4],
    and S is [1, 1], the corresponding output Z would be [0, 2]. If "direction" is "LEFT" with
    X=[1, 2] and S=[1, 2], the corresponding output Y would be [2, 8].
-   
+  
    Because this operator supports Numpy-style broadcasting, X's and Y's shapes are
    not necessarily identical.
   This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md).
@@ -369,7 +436,9 @@ class BitShift(ONNXOp):
 
   def __init__(self, X, Y,
     direction=None):
-    super().__init__('BitShift', 1, X, Y,
+    super().__init__('BitShift', 1,
+      [set(), set()],
+      X,Y,
       direction=ONNXAttr(direction, AttrType.STRING))
 
 class Cast(ONNXOp):
@@ -385,8 +454,8 @@ class Cast(ONNXOp):
   "+INF" (and "INF"), "-INF", and "NaN" are positive infinity, negative infinity, and not-a-number, respectively.
   Any string which can exactly match "+INF" in a case-insensitive way would be mapped to positive infinite. Similarly,
   this case-insensitive rule is applied to "INF" and "NaN". When casting from numeric tensors
-  to string tensors, plain floating-point representation (such as "314.15926") would be used. 
-  Converting non-numerical-literal string such as "Hello World!" is an undefined behavior. Cases 
+  to string tensors, plain floating-point representation (such as "314.15926") would be used.
+  Converting non-numerical-literal string such as "Hello World!" is an undefined behavior. Cases
   of converting string representing floating-point arithmetic value, such as "2.718", to INT is an undefined behavior.
   
   Conversion from a numerical type to any numerical type is always allowed.
@@ -397,7 +466,9 @@ class Cast(ONNXOp):
 
   def __init__(self, input,
     to=None):
-    super().__init__('Cast', 1, input,
+    super().__init__('Cast', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      input,
       to=ONNXAttr(to, AttrType.INT))
 
 class CastMap(ONNXOp):
@@ -411,7 +482,9 @@ class CastMap(ONNXOp):
     cast_to=None, 
     map_form=None, 
     max_map=None):
-    super().__init__('CastMap', 1, X,
+    super().__init__('CastMap', 1,
+      [set()],
+      X,
       cast_to=ONNXAttr(cast_to, AttrType.STRING), 
       map_form=ONNXAttr(map_form, AttrType.STRING), 
       max_map=ONNXAttr(max_map, AttrType.INT))
@@ -421,7 +494,7 @@ class CategoryMapper(ONNXOp):
       Converts strings to integers and vice versa.<br>
       Two sequences of equal length are used to map between integers and strings,
       with strings and integers at the same index detailing the mapping.<br>
-      Each operator converts either integers to strings or strings to integers, depending 
+      Each operator converts either integers to strings or strings to integers, depending
       on which default value attribute is provided. Only one default value attribute
       should be defined.<br>
       If the string default value is set, it will convert integers to strings.
@@ -433,7 +506,9 @@ class CategoryMapper(ONNXOp):
     cats_strings=None, 
     default_int64=None, 
     default_string=None):
-    super().__init__('CategoryMapper', 1, X,
+    super().__init__('CategoryMapper', 1,
+      [{'at::kLong'}],
+      X,
       cats_int64s=ONNXAttr(cats_int64s, AttrType.INTS), 
       cats_strings=ONNXAttr(cats_strings, AttrType.STRINGS), 
       default_int64=ONNXAttr(default_int64, AttrType.INT), 
@@ -447,13 +522,15 @@ class Ceil(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Ceil', 1, X)
+    super().__init__('Ceil', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Celu(ONNXOp):
   """
   Continuously Differentiable Exponential Linear Units:
   Perform the linear unit element-wise on the input tensor X
-  using formula: 
+  using formula:
   
   ```
   max(0,x) + min(0,alpha*(exp(x/alpha)-1))
@@ -462,7 +539,9 @@ class Celu(ONNXOp):
 
   def __init__(self, X,
     alpha=None):
-    super().__init__('Celu', 1, X,
+    super().__init__('Celu', 1,
+      [{'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT))
 
 class Clip(ONNXOp):
@@ -473,7 +552,9 @@ class Clip(ONNXOp):
   """
 
   def __init__(self, input, min, max):
-    super().__init__('Clip', 1, input, min, max)
+    super().__init__('Clip', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      input,min,max)
 
 class Compress(ONNXOp):
   """
@@ -485,7 +566,9 @@ class Compress(ONNXOp):
 
   def __init__(self, input, condition,
     axis=None):
-    super().__init__('Compress', 1, input, condition,
+    super().__init__('Compress', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kBool'}],
+      input,condition,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class Concat(ONNXOp):
@@ -495,7 +578,9 @@ class Concat(ONNXOp):
 
   def __init__(self, inputs,
     axis=None):
-    super().__init__('Concat', 1, inputs,
+    super().__init__('Concat', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      inputs,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class ConcatFromSequence(ONNXOp):
@@ -509,7 +594,9 @@ class ConcatFromSequence(ONNXOp):
   def __init__(self, input_sequence,
     axis=None, 
     new_axis=None):
-    super().__init__('ConcatFromSequence', 1, input_sequence,
+    super().__init__('ConcatFromSequence', 1,
+      [set()],
+      input_sequence,
       axis=ONNXAttr(axis, AttrType.INT), 
       new_axis=ONNXAttr(new_axis, AttrType.INT))
 
@@ -529,6 +616,7 @@ class Constant(ONNXOp):
     value_string=None, 
     value_strings=None):
     super().__init__('Constant', 1,
+      [],
       sparse_value=ONNXAttr(sparse_value, AttrType.SPARSE_TENSOR), 
       value=ONNXAttr(value, AttrType.TENSOR), 
       value_float=ONNXAttr(value_float, AttrType.FLOAT), 
@@ -545,7 +633,9 @@ class ConstantOfShape(ONNXOp):
 
   def __init__(self, input,
     value=None):
-    super().__init__('ConstantOfShape', 1, input,
+    super().__init__('ConstantOfShape', 1,
+      [{'at::kLong'}],
+      input,
       value=ONNXAttr(value, AttrType.TENSOR))
 
 class Conv(ONNXOp):
@@ -561,7 +651,9 @@ class Conv(ONNXOp):
     kernel_shape=None, 
     pads=None, 
     strides=None):
-    super().__init__('Conv', 1, X, W, B,
+    super().__init__('Conv', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,W,B,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       dilations=ONNXAttr(dilations, AttrType.INTS), 
       group=ONNXAttr(group, AttrType.INT), 
@@ -582,7 +674,9 @@ class ConvInteger(ONNXOp):
     kernel_shape=None, 
     pads=None, 
     strides=None):
-    super().__init__('ConvInteger', 1, x, w, x_zero_point, w_zero_point,
+    super().__init__('ConvInteger', 1,
+      [{'at::kByte'}, {'at::kByte'}, {'at::kByte'}, {'at::kByte'}],
+      x,w,x_zero_point,w_zero_point,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       dilations=ONNXAttr(dilations, AttrType.INTS), 
       group=ONNXAttr(group, AttrType.INT), 
@@ -617,7 +711,9 @@ class ConvTranspose(ONNXOp):
     output_shape=None, 
     pads=None, 
     strides=None):
-    super().__init__('ConvTranspose', 1, X, W, B,
+    super().__init__('ConvTranspose', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,W,B,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       dilations=ONNXAttr(dilations, AttrType.INTS), 
       group=ONNXAttr(group, AttrType.INT), 
@@ -633,7 +729,9 @@ class Cos(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Cos', 1, input)
+    super().__init__('Cos', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Cosh(ONNXOp):
   """
@@ -641,7 +739,9 @@ class Cosh(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Cosh', 1, input)
+    super().__init__('Cosh', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class CumSum(ONNXOp):
   """
@@ -670,7 +770,9 @@ class CumSum(ONNXOp):
   def __init__(self, x, axis,
     exclusive=None, 
     reverse=None):
-    super().__init__('CumSum', 1, x, axis,
+    super().__init__('CumSum', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong', 'at::kInt'}],
+      x,axis,
       exclusive=ONNXAttr(exclusive, AttrType.INT), 
       reverse=ONNXAttr(reverse, AttrType.INT))
 
@@ -707,7 +809,9 @@ class DepthToSpace(ONNXOp):
   def __init__(self, input,
     blocksize=None, 
     mode=None):
-    super().__init__('DepthToSpace', 1, input,
+    super().__init__('DepthToSpace', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      input,
       blocksize=ONNXAttr(blocksize, AttrType.INT), 
       mode=ONNXAttr(mode, AttrType.STRING))
 
@@ -722,7 +826,9 @@ class DequantizeLinear(ONNXOp):
 
   def __init__(self, x, x_scale, x_zero_point,
     axis=None):
-    super().__init__('DequantizeLinear', 1, x, x_scale, x_zero_point,
+    super().__init__('DequantizeLinear', 1,
+      [{'at::kByte', 'at::kInt'}, {'at::kFloat'}, {'at::kByte', 'at::kInt'}],
+      x,x_scale,x_zero_point,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class Det(ONNXOp):
@@ -735,7 +841,9 @@ class Det(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Det', 1, X)
+    super().__init__('Det', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class DictVectorizer(ONNXOp):
   """
@@ -756,7 +864,9 @@ class DictVectorizer(ONNXOp):
   def __init__(self, X,
     int64_vocabulary=None, 
     string_vocabulary=None):
-    super().__init__('DictVectorizer', 1, X,
+    super().__init__('DictVectorizer', 1,
+      [set()],
+      X,
       int64_vocabulary=ONNXAttr(int64_vocabulary, AttrType.INTS), 
       string_vocabulary=ONNXAttr(string_vocabulary, AttrType.STRINGS))
 
@@ -765,10 +875,14 @@ class Div(ONNXOp):
   Performs element-wise binary division (with Numpy-style broadcasting support).
   
   This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md).
+  
+  (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
   """
 
   def __init__(self, A, B):
-    super().__init__('Div', 1, A, B)
+    super().__init__('Div', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class Dropout(ONNXOp):
   """
@@ -788,7 +902,9 @@ class Dropout(ONNXOp):
 
   def __init__(self, data, ratio, training_mode,
     seed=None):
-    super().__init__('Dropout', 2, data, ratio, training_mode,
+    super().__init__('Dropout', 2,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kBool'}],
+      data,ratio,training_mode,
       seed=ONNXAttr(seed, AttrType.INT))
 
 class DynamicQuantizeLinear(ONNXOp):
@@ -818,7 +934,9 @@ class DynamicQuantizeLinear(ONNXOp):
   """
 
   def __init__(self, x):
-    super().__init__('DynamicQuantizeLinear', 3, x)
+    super().__init__('DynamicQuantizeLinear', 3,
+      [{'at::kFloat'}],
+      x)
 
 class Einsum(ONNXOp):
   """
@@ -849,7 +967,9 @@ class Einsum(ONNXOp):
 
   def __init__(self, Inputs,
     equation=None):
-    super().__init__('Einsum', 1, Inputs,
+    super().__init__('Einsum', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}],
+      Inputs,
       equation=ONNXAttr(equation, AttrType.STRING))
 
 class Elu(ONNXOp):
@@ -861,7 +981,9 @@ class Elu(ONNXOp):
 
   def __init__(self, X,
     alpha=None):
-    super().__init__('Elu', 1, X,
+    super().__init__('Elu', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT))
 
 class Equal(ONNXOp):
@@ -873,7 +995,9 @@ class Equal(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('Equal', 1, A, B)
+    super().__init__('Equal', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class Erf(ONNXOp):
   """
@@ -881,7 +1005,9 @@ class Erf(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Erf', 1, input)
+    super().__init__('Erf', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      input)
 
 class Exp(ONNXOp):
   """
@@ -889,7 +1015,9 @@ class Exp(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Exp', 1, input)
+    super().__init__('Exp', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Expand(ONNXOp):
   """
@@ -904,7 +1032,9 @@ class Expand(ONNXOp):
   """
 
   def __init__(self, input, shape):
-    super().__init__('Expand', 1, input, shape)
+    super().__init__('Expand', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      input,shape)
 
 class EyeLike(ONNXOp):
   """
@@ -920,7 +1050,9 @@ class EyeLike(ONNXOp):
   def __init__(self, input,
     dtype=None, 
     k=None):
-    super().__init__('EyeLike', 1, input,
+    super().__init__('EyeLike', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      input,
       dtype=ONNXAttr(dtype, AttrType.INT), 
       k=ONNXAttr(k, AttrType.INT))
 
@@ -934,7 +1066,9 @@ class FeatureVectorizer(ONNXOp):
 
   def __init__(self, X,
     inputdimensions=None):
-    super().__init__('FeatureVectorizer', 1, X,
+    super().__init__('FeatureVectorizer', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       inputdimensions=ONNXAttr(inputdimensions, AttrType.INTS))
 
 class Flatten(ONNXOp):
@@ -946,7 +1080,9 @@ class Flatten(ONNXOp):
 
   def __init__(self, input,
     axis=None):
-    super().__init__('Flatten', 1, input,
+    super().__init__('Flatten', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      input,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class Floor(ONNXOp):
@@ -957,7 +1093,9 @@ class Floor(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Floor', 1, X)
+    super().__init__('Floor', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Gather(ONNXOp):
   """
@@ -1020,7 +1158,9 @@ class Gather(ONNXOp):
 
   def __init__(self, data, indices,
     axis=None):
-    super().__init__('Gather', 1, data, indices,
+    super().__init__('Gather', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong', 'at::kInt'}],
+      data,indices,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class GatherElements(ONNXOp):
@@ -1034,7 +1174,7 @@ class GatherElements(ONNXOp):
   (gathered from the `data`) for each element in `indices`.
   
   For instance, in the 3-D case (r = 3), the output produced is determined
-  by the following equations: 
+  by the following equations:
   ```
     out[i][j][k] = input[index[i][j][k]][j][k] if axis = 0,
     out[i][j][k] = input[i][index[i][j][k]][k] if axis = 1,
@@ -1084,29 +1224,31 @@ class GatherElements(ONNXOp):
 
   def __init__(self, data, indices,
     axis=None):
-    super().__init__('GatherElements', 1, data, indices,
+    super().__init__('GatherElements', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong', 'at::kInt'}],
+      data,indices,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class GatherND(ONNXOp):
   """
-  Given `data` tensor of rank `r` >= 1, `indices` tensor of rank `q` >= 1, and `batch_dims` integer `b`, this operator gathers 
+  Given `data` tensor of rank `r` >= 1, `indices` tensor of rank `q` >= 1, and `batch_dims` integer `b`, this operator gathers
   slices of `data` into an output tensor of rank `q + r - indices_shape[-1] - 1 - b`.
   
-  `indices` is an q-dimensional integer tensor, best thought of as a `(q-1)`-dimensional tensor of index-tuples into `data`, 
+  `indices` is an q-dimensional integer tensor, best thought of as a `(q-1)`-dimensional tensor of index-tuples into `data`,
   where each element defines a slice of `data`
   
-  `batch_dims` (denoted as `b`) is an integer indicating the number of batch dimensions, i.e the leading `b` number of dimensions of 
-  `data` tensor and `indices` are representing the batches, and the gather starts from the `b+1` dimension. 
+  `batch_dims` (denoted as `b`) is an integer indicating the number of batch dimensions, i.e the leading `b` number of dimensions of
+  `data` tensor and `indices` are representing the batches, and the gather starts from the `b+1` dimension.
   
   Some salient points about the inputs' rank and shape:
-   
+  
   1) r >= 1 and q >= 1 are to be honored. There is no dependency condition to be met between ranks `r` and `q`
   
   2) The first `b` dimensions of the shape of `indices` tensor and `data` tensor must be equal.
   
   3) b < min(q, r) is to be honored.
   
-  4) The `indices_shape[-1]` should have a value between 1 (inclusive) and rank `r-b` (inclusive) 
+  4) The `indices_shape[-1]` should have a value between 1 (inclusive) and rank `r-b` (inclusive)
   
   5) All values in `indices` are expected to be within bounds [-s, s-1] along axis of size `s` (i.e.) `-data_shape[i] <= indices[...,i] <= data_shape[i] - 1`.
      It is an error if any of the index values are out of bounds.
@@ -1114,17 +1256,17 @@ class GatherND(ONNXOp):
   The output is computed as follows:
   
   The output tensor is obtained by mapping each index-tuple in the `indices` tensor to the corresponding slice of the input `data`.
-   
+  
   1) If `indices_shape[-1] > r-b` => error condition
   
   2) If `indices_shape[-1] == r-b`, since the rank of `indices` is `q`, `indices` can be thought of as `N` `(q-b-1)`-dimensional tensors
-     containing 1-D tensors of dimension `r-b`, where `N` is an integer equals to the product of 1 and all the elements in the batch dimensions 
-     of the indices_shape. Let us think of each such `r-b` ranked tensor as `indices_slice`. Each *scalar value* corresponding to `data[0:b-1,indices_slice]` 
+     containing 1-D tensors of dimension `r-b`, where `N` is an integer equals to the product of 1 and all the elements in the batch dimensions
+     of the indices_shape. Let us think of each such `r-b` ranked tensor as `indices_slice`. Each *scalar value* corresponding to `data[0:b-1,indices_slice]`
      is filled into the corresponding location of the `(q-b-1)`-dimensional tensor to form the `output` tensor (Example 1 below)
   
   3) If `indices_shape[-1] < r-b`, since the rank of `indices` is `q`, `indices` can be thought of as `N` `(q-b-1)`-dimensional tensor
-     containing 1-D tensors of dimension `< r-b`. Let us think of each such tensors as `indices_slice`. Each *tensor slice* corresponding 
-     to `data[0:b-1, indices_slice , :]` is filled into the corresponding location of the `(q-b-1)`-dimensional tensor 
+     containing 1-D tensors of dimension `< r-b`. Let us think of each such tensors as `indices_slice`. Each *tensor slice* corresponding
+     to `data[0:b-1, indices_slice , :]` is filled into the corresponding location of the `(q-b-1)`-dimensional tensor
      to form the `output` tensor (Examples 2, 3, 4 and 5 below)
   
   This operator is the inverse of `ScatterND`.
@@ -1157,7 +1299,7 @@ class GatherND(ONNXOp):
   
     indices = [[0,1],[1,0]]                 # indices_shape = [2, 2]
   
-    output  = [[2,3],[4,5]]                 # output_shape = [2, 2]   
+    output  = [[2,3],[4,5]]                 # output_shape = [2, 2]
   
   `Example 4`
   
@@ -1167,7 +1309,7 @@ class GatherND(ONNXOp):
   
     indices = [[[0,1]],[[1,0]]]             # indices_shape = [2, 1, 2]
   
-    output  = [[[2,3]],[[4,5]]]             # output_shape = [2, 1, 2] 
+    output  = [[[2,3]],[[4,5]]]             # output_shape = [2, 1, 2]
   
   `Example 5`
   
@@ -1177,12 +1319,14 @@ class GatherND(ONNXOp):
   
     indices = [[1],[0]]             # indices_shape = [2, 1]
   
-    output  = [[2,3],[4,5]]             # output_shape = [2, 2] 
+    output  = [[2,3],[4,5]]             # output_shape = [2, 2]
   """
 
   def __init__(self, data, indices,
     batch_dims=None):
-    super().__init__('GatherND', 1, data, indices,
+    super().__init__('GatherND', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      data,indices,
       batch_dims=ONNXAttr(batch_dims, AttrType.INT))
 
 class Gemm(ONNXOp):
@@ -1207,7 +1351,9 @@ class Gemm(ONNXOp):
     beta=None, 
     transA=None, 
     transB=None):
-    super().__init__('Gemm', 1, A, B, C,
+    super().__init__('Gemm', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      A,B,C,
       alpha=ONNXAttr(alpha, AttrType.FLOAT), 
       beta=ONNXAttr(beta, AttrType.FLOAT), 
       transA=ONNXAttr(transA, AttrType.INT), 
@@ -1221,7 +1367,9 @@ class GlobalAveragePool(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('GlobalAveragePool', 1, X)
+    super().__init__('GlobalAveragePool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class GlobalLpPool(ONNXOp):
   """
@@ -1232,7 +1380,9 @@ class GlobalLpPool(ONNXOp):
 
   def __init__(self, X,
     p=None):
-    super().__init__('GlobalLpPool', 1, X,
+    super().__init__('GlobalLpPool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       p=ONNXAttr(p, AttrType.INT))
 
 class GlobalMaxPool(ONNXOp):
@@ -1243,7 +1393,9 @@ class GlobalMaxPool(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('GlobalMaxPool', 1, X)
+    super().__init__('GlobalMaxPool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Gradient(ONNXOp):
   """
@@ -1347,7 +1499,7 @@ class Gradient(ONNXOp):
   The tensors named in attributes "xs", "zs", and "y" define the differentiated
   computation graph, and the inputs to Gradient node define the values at
   which the gradient is computed. We can feed different tensors to the identified
-  graph. For example, one can compute the gradient of Y with respect to H at 
+  graph. For example, one can compute the gradient of Y with respect to H at
   a specific value of H, H_1, by providing that value as an input to the Gradient
   node.
   
@@ -1375,7 +1527,9 @@ class Gradient(ONNXOp):
     xs=None, 
     y=None, 
     zs=None):
-    super().__init__('Gradient', 1, Inputs,
+    super().__init__('Gradient', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      Inputs,
       xs=ONNXAttr(xs, AttrType.STRINGS), 
       y=ONNXAttr(y, AttrType.STRING), 
       zs=ONNXAttr(zs, AttrType.STRINGS))
@@ -1389,7 +1543,9 @@ class Greater(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('Greater', 1, A, B)
+    super().__init__('Greater', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class GreaterOrEqual(ONNXOp):
   """
@@ -1400,7 +1556,9 @@ class GreaterOrEqual(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('GreaterOrEqual', 1, A, B)
+    super().__init__('GreaterOrEqual', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}],
+      A,B)
 
 class GRU(ONNXOp):
   """
@@ -1486,14 +1644,18 @@ class GRU(ONNXOp):
     clip=None, 
     direction=None, 
     hidden_size=None, 
+    layout=None, 
     linear_before_reset=None):
-    super().__init__('GRU', 2, X, W, R, B, sequence_lens, initial_h,
+    super().__init__('GRU', 2,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kInt'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,W,R,B,sequence_lens,initial_h,
       activation_alpha=ONNXAttr(activation_alpha, AttrType.FLOATS), 
       activation_beta=ONNXAttr(activation_beta, AttrType.FLOATS), 
       activations=ONNXAttr(activations, AttrType.STRINGS), 
       clip=ONNXAttr(clip, AttrType.FLOAT), 
       direction=ONNXAttr(direction, AttrType.STRING), 
       hidden_size=ONNXAttr(hidden_size, AttrType.INT), 
+      layout=ONNXAttr(layout, AttrType.INT), 
       linear_before_reset=ONNXAttr(linear_before_reset, AttrType.INT))
 
 class Hardmax(ONNXOp):
@@ -1510,7 +1672,9 @@ class Hardmax(ONNXOp):
 
   def __init__(self, input,
     axis=None):
-    super().__init__('Hardmax', 1, input,
+    super().__init__('Hardmax', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      input,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class HardSigmoid(ONNXOp):
@@ -1523,17 +1687,33 @@ class HardSigmoid(ONNXOp):
   def __init__(self, X,
     alpha=None, 
     beta=None):
-    super().__init__('HardSigmoid', 1, X,
+    super().__init__('HardSigmoid', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT), 
       beta=ONNXAttr(beta, AttrType.FLOAT))
 
+class HardSwish(ONNXOp):
+  """
+  HardSwish takes one input tensor X (Tensor<T>) and produces one output tensor (Tensor<T>) by applying
+  the HardSwish function, y = x * max(0, min(1, alpha * x + beta)) = x * HardSigmoid<alpha, beta>(x),
+  with alpha = 1/6 and beta = 0.5, to the input elementwise.
+  """
+
+  def __init__(self, X):
+    super().__init__('HardSwish', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],  # NOTE(review): looks like one set of allowed ATen dtypes per input (here only X) - confirm in ONNXOp
+      X)
+
 class Identity(ONNXOp):
   """
   Identity operator
   """
 
   def __init__(self, input):
-    super().__init__('Identity', 1, input)
+    super().__init__('Identity', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      input)
 
 class If(ONNXOp):
   """
@@ -1543,7 +1723,9 @@ class If(ONNXOp):
   def __init__(self, cond,
     else_branch=None, 
     then_branch=None):
-    super().__init__('If', 1, cond,
+    super().__init__('If', 1,
+      [{'at::kBool'}],
+      cond,
       else_branch=ONNXAttr(else_branch, AttrType.GRAPH), 
       then_branch=ONNXAttr(then_branch, AttrType.GRAPH))
 
@@ -1564,7 +1746,9 @@ class Imputer(ONNXOp):
     imputed_value_int64s=None, 
     replaced_value_float=None, 
     replaced_value_int64=None):
-    super().__init__('Imputer', 1, X,
+    super().__init__('Imputer', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       imputed_value_floats=ONNXAttr(imputed_value_floats, AttrType.FLOATS), 
       imputed_value_int64s=ONNXAttr(imputed_value_int64s, AttrType.INTS), 
       replaced_value_float=ONNXAttr(replaced_value_float, AttrType.FLOAT), 
@@ -1581,7 +1765,9 @@ class InstanceNormalization(ONNXOp):
 
   def __init__(self, input, scale, B,
     epsilon=None):
-    super().__init__('InstanceNormalization', 1, input, scale, B,
+    super().__init__('InstanceNormalization', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input,scale,B,
       epsilon=ONNXAttr(epsilon, AttrType.FLOAT))
 
 class IsInf(ONNXOp):
@@ -1592,7 +1778,9 @@ class IsInf(ONNXOp):
   def __init__(self, X,
     detect_negative=None, 
     detect_positive=None):
-    super().__init__('IsInf', 1, X,
+    super().__init__('IsInf', 1,
+      [{'at::kDouble', 'at::kFloat'}],
+      X,
       detect_negative=ONNXAttr(detect_negative, AttrType.INT), 
       detect_positive=ONNXAttr(detect_positive, AttrType.INT))
 
@@ -1602,7 +1790,9 @@ class IsNaN(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('IsNaN', 1, X)
+    super().__init__('IsNaN', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class LabelEncoder(ONNXOp):
   """
@@ -1635,7 +1825,9 @@ class LabelEncoder(ONNXOp):
     values_floats=None, 
     values_int64s=None, 
     values_strings=None):
-    super().__init__('LabelEncoder', 1, X,
+    super().__init__('LabelEncoder', 1,
+      [{'at::kLong', 'at::kFloat'}],
+      X,
       default_float=ONNXAttr(default_float, AttrType.FLOAT), 
       default_int64=ONNXAttr(default_int64, AttrType.INT), 
       default_string=ONNXAttr(default_string, AttrType.STRING), 
@@ -1655,7 +1847,9 @@ class LeakyRelu(ONNXOp):
 
   def __init__(self, X,
     alpha=None):
-    super().__init__('LeakyRelu', 1, X,
+    super().__init__('LeakyRelu', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT))
 
 class Less(ONNXOp):
@@ -1667,7 +1861,9 @@ class Less(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('Less', 1, A, B)
+    super().__init__('Less', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class LessOrEqual(ONNXOp):
   """
@@ -1678,7 +1874,9 @@ class LessOrEqual(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('LessOrEqual', 1, A, B)
+    super().__init__('LessOrEqual', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}],
+      A,B)
 
 class LinearClassifier(ONNXOp):
   """
@@ -1692,7 +1890,9 @@ class LinearClassifier(ONNXOp):
     intercepts=None, 
     multi_class=None, 
     post_transform=None):
-    super().__init__('LinearClassifier', 2, X,
+    super().__init__('LinearClassifier', 2,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       classlabels_ints=ONNXAttr(classlabels_ints, AttrType.INTS), 
       classlabels_strings=ONNXAttr(classlabels_strings, AttrType.STRINGS), 
       coefficients=ONNXAttr(coefficients, AttrType.FLOATS), 
@@ -1715,7 +1915,9 @@ class LinearRegressor(ONNXOp):
     intercepts=None, 
     post_transform=None, 
     targets=None):
-    super().__init__('LinearRegressor', 1, X,
+    super().__init__('LinearRegressor', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       coefficients=ONNXAttr(coefficients, AttrType.FLOATS), 
       intercepts=ONNXAttr(intercepts, AttrType.FLOATS), 
       post_transform=ONNXAttr(post_transform, AttrType.STRING), 
@@ -1727,7 +1929,9 @@ class Log(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Log', 1, input)
+    super().__init__('Log', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class LogSoftmax(ONNXOp):
   """
@@ -1743,7 +1947,9 @@ class LogSoftmax(ONNXOp):
 
   def __init__(self, input,
     axis=None):
-    super().__init__('LogSoftmax', 1, input,
+    super().__init__('LogSoftmax', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      input,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class Loop(ONNXOp):
@@ -1887,7 +2093,9 @@ class Loop(ONNXOp):
 
   def __init__(self, M, cond, v_initial,
     body=None):
-    super().__init__('Loop', 1, M, cond, v_initial,
+    super().__init__('Loop', 1,
+      [{'at::kLong'}, {'at::kBool'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      M,cond,v_initial,
       body=ONNXAttr(body, AttrType.GRAPH))
 
 class LpNormalization(ONNXOp):
@@ -1898,7 +2106,9 @@ class LpNormalization(ONNXOp):
   def __init__(self, input,
     axis=None, 
     p=None):
-    super().__init__('LpNormalization', 1, input,
+    super().__init__('LpNormalization', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input,
       axis=ONNXAttr(axis, AttrType.INT), 
       p=ONNXAttr(p, AttrType.INT))
 
@@ -1917,7 +2127,9 @@ class LpPool(ONNXOp):
     p=None, 
     pads=None, 
     strides=None):
-    super().__init__('LpPool', 1, X,
+    super().__init__('LpPool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       kernel_shape=ONNXAttr(kernel_shape, AttrType.INTS), 
       p=ONNXAttr(p, AttrType.INT), 
@@ -1943,7 +2155,9 @@ class LRN(ONNXOp):
     beta=None, 
     bias=None, 
     size=None):
-    super().__init__('LRN', 1, X,
+    super().__init__('LRN', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT), 
       beta=ONNXAttr(beta, AttrType.FLOAT), 
       bias=ONNXAttr(bias, AttrType.FLOAT), 
@@ -2041,15 +2255,19 @@ class LSTM(ONNXOp):
     clip=None, 
     direction=None, 
     hidden_size=None, 
-    input_forget=None):
-    super().__init__('LSTM', 3, X, W, R, B, sequence_lens, initial_h, initial_c, P,
+    input_forget=None, 
+    layout=None):
+    super().__init__('LSTM', 3,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kInt'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,W,R,B,sequence_lens,initial_h,initial_c,P,
       activation_alpha=ONNXAttr(activation_alpha, AttrType.FLOATS), 
       activation_beta=ONNXAttr(activation_beta, AttrType.FLOATS), 
       activations=ONNXAttr(activations, AttrType.STRINGS), 
       clip=ONNXAttr(clip, AttrType.FLOAT), 
       direction=ONNXAttr(direction, AttrType.STRING), 
       hidden_size=ONNXAttr(hidden_size, AttrType.INT), 
-      input_forget=ONNXAttr(input_forget, AttrType.INT))
+      input_forget=ONNXAttr(input_forget, AttrType.INT), 
+      layout=ONNXAttr(layout, AttrType.INT))
 
 class MatMul(ONNXOp):
   """
@@ -2057,7 +2275,9 @@ class MatMul(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('MatMul', 1, A, B)
+    super().__init__('MatMul', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class MatMulInteger(ONNXOp):
   """
@@ -2066,7 +2286,9 @@ class MatMulInteger(ONNXOp):
   """
 
   def __init__(self, A, B, a_zero_point, b_zero_point):
-    super().__init__('MatMulInteger', 1, A, B, a_zero_point, b_zero_point)
+    super().__init__('MatMulInteger', 1,
+      [{'at::kByte'}, {'at::kByte'}, {'at::kByte'}, {'at::kByte'}],
+      A,B,a_zero_point,b_zero_point)
 
 class Max(ONNXOp):
   """
@@ -2076,7 +2298,9 @@ class Max(ONNXOp):
   """
 
   def __init__(self, data_0):
-    super().__init__('Max', 1, data_0)
+    super().__init__('Max', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      data_0)
 
 class MaxPool(ONNXOp):
   """
@@ -2119,7 +2343,9 @@ class MaxPool(ONNXOp):
     pads=None, 
     storage_order=None, 
     strides=None):
-    super().__init__('MaxPool', 2, X,
+    super().__init__('MaxPool', 2,
+      [{'at::kDouble', 'at::kByte', 'at::kHalf', 'at::kFloat'}],
+      X,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       ceil_mode=ONNXAttr(ceil_mode, AttrType.INT), 
       dilations=ONNXAttr(dilations, AttrType.INTS), 
@@ -2138,7 +2364,9 @@ class MaxRoiPool(ONNXOp):
   def __init__(self, X, rois,
     pooled_shape=None, 
     spatial_scale=None):
-    super().__init__('MaxRoiPool', 1, X, rois,
+    super().__init__('MaxRoiPool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,rois,
       pooled_shape=ONNXAttr(pooled_shape, AttrType.INTS), 
       spatial_scale=ONNXAttr(spatial_scale, AttrType.FLOAT))
 
@@ -2168,7 +2396,9 @@ class MaxUnpool(ONNXOp):
     kernel_shape=None, 
     pads=None, 
     strides=None):
-    super().__init__('MaxUnpool', 1, X, I, output_shape,
+    super().__init__('MaxUnpool', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kLong'}, {'at::kLong'}],
+      X,I,output_shape,
       kernel_shape=ONNXAttr(kernel_shape, AttrType.INTS), 
       pads=ONNXAttr(pads, AttrType.INTS), 
       strides=ONNXAttr(strides, AttrType.INTS))
@@ -2181,7 +2411,9 @@ class Mean(ONNXOp):
   """
 
   def __init__(self, data_0):
-    super().__init__('Mean', 1, data_0)
+    super().__init__('Mean', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      data_0)
 
 class MeanVarianceNormalization(ONNXOp):
   """
@@ -2191,7 +2423,9 @@ class MeanVarianceNormalization(ONNXOp):
 
   def __init__(self, X,
     axes=None):
-    super().__init__('MeanVarianceNormalization', 1, X,
+    super().__init__('MeanVarianceNormalization', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X,
       axes=ONNXAttr(axes, AttrType.INTS))
 
 class Min(ONNXOp):
@@ -2202,20 +2436,22 @@ class Min(ONNXOp):
   """
 
   def __init__(self, data_0):
-    super().__init__('Min', 1, data_0)
+    super().__init__('Min', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      data_0)
 
 class Mod(ONNXOp):
   """
-    Performs element-wise binary modulus (with Numpy-style broadcasting support). 
+    Performs element-wise binary modulus (with Numpy-style broadcasting support).
       The sign of the remainder is the same as that of the Divisor.
-    
-      Mod operator can also behave like C fmod() or numpy.fmod. In this case, the sign of the remainder however, will be the same as the Dividend 
+  
+      Mod operator can also behave like C fmod() or numpy.fmod. In this case, however, the sign of the remainder will be the same as the Dividend
       (in contrast to integer mod). To force a behavior like numpy.fmod() an 'fmod' Attribute is provided.
-      This attribute is set to 0 by default causing the behavior to be like integer mod. 
+      This attribute is set to 0 by default causing the behavior to be like integer mod.
       Setting this attribute to 1 causes the remainder to be calculated similar to that of numpy.fmod().
   
       If the input type is floating point, then `fmod` attribute must be set to 1.
-    
+  
       In case of dividend being zero, the results will be platform dependent.
   
     This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md).
@@ -2223,7 +2459,9 @@ class Mod(ONNXOp):
 
   def __init__(self, A, B,
     fmod=None):
-    super().__init__('Mod', 1, A, B,
+    super().__init__('Mod', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B,
       fmod=ONNXAttr(fmod, AttrType.INT))
 
 class Momentum(ONNXOp):
@@ -2233,7 +2471,7 @@ class Momentum(ONNXOp):
   
       Let's define the behavior of this operator. As you can imagine, SG with momentum requires
       several parameters:
-       
+  
        - The learning-rate "R".
        - The update count "T". That is, the number of conducted training iterations. It should
          be zero in the first training iteration.
@@ -2247,7 +2485,7 @@ class Momentum(ONNXOp):
       Other necessary inputs are "X"'s gradient (called "G") and "X"'s momentum (called "V"). This
       Momentum operator maps all these inputs to the new value of "X" (called "X_new") and its new
       momentum (called "V_new").
-      
+  
       This operator supports two different momentum algorithms. Set the attribute "mode" to
       "nesterov" if Nesterov's momentum is desired. Otherwise, set the attribute "model" to
       "standard" to use standard momentum. Computation details are described subsequently.
@@ -2295,7 +2533,9 @@ class Momentum(ONNXOp):
     beta=None, 
     mode=None, 
     norm_coefficient=None):
-    super().__init__('Momentum', 1, R, T, inputs,
+    super().__init__('Momentum', 1,
+      [{'at::kDouble', 'at::kFloat'}, {'at::kLong'}, {'at::kDouble', 'at::kFloat'}],
+      R,T,inputs,
       alpha=ONNXAttr(alpha, AttrType.FLOAT), 
       beta=ONNXAttr(beta, AttrType.FLOAT), 
       mode=ONNXAttr(mode, AttrType.STRING), 
@@ -2306,10 +2546,14 @@ class Mul(ONNXOp):
   Performs element-wise binary multiplication (with Numpy-style broadcasting support).
   
   This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md).
+  
+  (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
   """
 
   def __init__(self, A, B):
-    super().__init__('Mul', 1, A, B)
+    super().__init__('Mul', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class Multinomial(ONNXOp):
   """
@@ -2321,7 +2565,9 @@ class Multinomial(ONNXOp):
     dtype=None, 
     sample_size=None, 
     seed=None):
-    super().__init__('Multinomial', 1, input,
+    super().__init__('Multinomial', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input,
       dtype=ONNXAttr(dtype, AttrType.INT), 
       sample_size=ONNXAttr(sample_size, AttrType.INT), 
       seed=ONNXAttr(seed, AttrType.FLOAT))
@@ -2334,7 +2580,9 @@ class Neg(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Neg', 1, X)
+    super().__init__('Neg', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      X)
 
 class NegativeLogLikelihoodLoss(ONNXOp):
   """
@@ -2352,7 +2600,7 @@ class NegativeLogLikelihoodLoss(ONNXOp):
       loss[n][d_1][d_2]...[d_k] = -input[n][c][d_1][d_2]...[d_k] * weight[c].
   
   loss is zero for the case when target-value equals ignore_index.
-      
+  
       loss[n][d_1][d_2]...[d_k] = 0, when target[n][d_1][d_2]...[d_k] = ignore_index
   
   If "reduction" attribute is set to "none", the operator's output will be the above loss with shape (N, d1, d2, ..., dk).
@@ -2429,7 +2677,9 @@ class NegativeLogLikelihoodLoss(ONNXOp):
   def __init__(self, input, target, weight,
     ignore_index=None, 
     reduction=None):
-    super().__init__('NegativeLogLikelihoodLoss', 1, input, target, weight,
+    super().__init__('NegativeLogLikelihoodLoss', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kLong', 'at::kInt'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input,target,weight,
       ignore_index=ONNXAttr(ignore_index, AttrType.INT), 
       reduction=ONNXAttr(reduction, AttrType.STRING))
 
@@ -2446,7 +2696,9 @@ class NonMaxSuppression(ONNXOp):
 
   def __init__(self, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold,
     center_point_box=None):
-    super().__init__('NonMaxSuppression', 1, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold,
+    super().__init__('NonMaxSuppression', 1,
+      [{'at::kFloat'}, {'at::kFloat'}, {'at::kLong'}, {'at::kFloat'}, {'at::kFloat'}],
+      boxes,scores,max_output_boxes_per_class,iou_threshold,score_threshold,
       center_point_box=ONNXAttr(center_point_box, AttrType.INT))
 
 class NonZero(ONNXOp):
@@ -2458,7 +2710,9 @@ class NonZero(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('NonZero', 1, X)
+    super().__init__('NonZero', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      X)
 
 class Normalizer(ONNXOp):
   """
@@ -2476,7 +2730,9 @@ class Normalizer(ONNXOp):
 
   def __init__(self, X,
     norm=None):
-    super().__init__('Normalizer', 1, X,
+    super().__init__('Normalizer', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       norm=ONNXAttr(norm, AttrType.STRING))
 
 class Not(ONNXOp):
@@ -2485,7 +2741,9 @@ class Not(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Not', 1, X)
+    super().__init__('Not', 1,
+      [{'at::kBool'}],
+      X)
 
 class OneHot(ONNXOp):
   """
@@ -2511,15 +2769,17 @@ class OneHot(ONNXOp):
 
   def __init__(self, indices, depth, values,
     axis=None):
-    super().__init__('OneHot', 1, indices, depth, values,
+    super().__init__('OneHot', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      indices,depth,values,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class OneHotEncoder(ONNXOp):
   """
       Replace each input element with an array of ones and zeros, where a single
-      one is placed at the index of the category that was passed in. The total category count 
+      one is placed at the index of the category that was passed in. The total category count
       will determine the size of the extra dimension of the output array Y.<br>
-      For example, if we pass a tensor with a single value of 4, and a category count of 8, 
+      For example, if we pass a tensor with a single value of 4, and a category count of 8,
       the output will be a tensor with ``[0,0,0,0,1,0,0,0]``.<br>
       This operator assumes every input feature is from the same set of categories.<br>
       If the input is a tensor of float, int32, or double, the data will be cast
@@ -2530,7 +2790,9 @@ class OneHotEncoder(ONNXOp):
     cats_int64s=None, 
     cats_strings=None, 
     zeros=None):
-    super().__init__('OneHotEncoder', 1, X,
+    super().__init__('OneHotEncoder', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       cats_int64s=ONNXAttr(cats_int64s, AttrType.INTS), 
       cats_strings=ONNXAttr(cats_strings, AttrType.STRINGS), 
       zeros=ONNXAttr(zeros, AttrType.INT))
@@ -2544,16 +2806,18 @@ class Or(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('Or', 1, A, B)
+    super().__init__('Or', 1,
+      [{'at::kBool'}, {'at::kBool'}],
+      A,B)
 
 class Pad(ONNXOp):
   """
-  Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`, 
+  Given a tensor containing the data to be padded (`data`), a tensor containing the number of start and end pad values for each axis (`pads`), (optionally) a `mode`, and (optionally) `constant_value`,
   a padded tensor (`output`) is generated.
   
   The three supported `modes` are (similar to corresponding modes supported by `numpy.pad`):
   
-  1) `constant`(default) - pads with a given constant value as specified by `constant_value` (which defaults to 0)
+  1) `constant` (default) - pads with a given constant value as specified by `constant_value` (which defaults to 0, empty string, or False)
   
   2) `reflect` - pads with the reflection of the vector mirrored on the first and last values of the vector along each axis
   
@@ -2563,12 +2827,12 @@ class Pad(ONNXOp):
   Example 1 (`constant` mode):
     Insert 0 pads to the beginning of the second dimension.
   
-    data = 
+    data =
     [
         [1.0, 1.2],
         [2.3, 3.4],
         [4.5, 5.7],
-    ] 
+    ]
   
     pads = [0, 2, 0, 0]
   
@@ -2576,7 +2840,7 @@ class Pad(ONNXOp):
   
     constant_value = 0.0
   
-    output = 
+    output =
     [
         [0.0, 0.0, 1.0, 1.2],
         [0.0, 0.0, 2.3, 3.4],
@@ -2585,18 +2849,18 @@ class Pad(ONNXOp):
   
   
   Example 2 (`reflect` mode):
-    data = 
+    data =
     [
         [1.0, 1.2],
         [2.3, 3.4],
         [4.5, 5.7],
-    ] 
+    ]
   
     pads = [0, 2, 0, 0]
   
     mode = 'reflect'
   
-    output = 
+    output =
     [
         [1.0, 1.2, 1.0, 1.2],
         [2.3, 3.4, 2.3, 3.4],
@@ -2605,18 +2869,18 @@ class Pad(ONNXOp):
   
   
   Example 3 (`edge` mode):
-    data = 
+    data =
     [
         [1.0, 1.2],
         [2.3, 3.4],
         [4.5, 5.7],
-    ] 
+    ]
   
     pads = [0, 2, 0, 0]
   
     mode = 'edge'
   
-    output = 
+    output =
     [
         [1.0, 1.0, 1.0, 1.2],
         [2.3, 2.3, 2.3, 3.4],
@@ -2626,7 +2890,9 @@ class Pad(ONNXOp):
 
   def __init__(self, data, pads, constant_value,
     mode=None):
-    super().__init__('Pad', 1, data, pads, constant_value,
+    super().__init__('Pad', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      data,pads,constant_value,
       mode=ONNXAttr(mode, AttrType.STRING))
 
 class Pow(ONNXOp):
@@ -2638,7 +2904,9 @@ class Pow(ONNXOp):
   """
 
   def __init__(self, X, Y):
-    super().__init__('Pow', 1, X, Y)
+    super().__init__('Pow', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}],
+      X,Y)
 
 class PRelu(ONNXOp):
   """
@@ -2649,7 +2917,9 @@ class PRelu(ONNXOp):
   """
 
   def __init__(self, X, slope):
-    super().__init__('PRelu', 1, X, slope)
+    super().__init__('PRelu', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat'}],
+      X,slope)
 
 class QLinearConv(ONNXOp):
   """
@@ -2658,7 +2928,7 @@ class QLinearConv(ONNXOp):
   and computes the quantized output. Each scale and zero-point pair must have same shape.
   It means they must be either scalars (per tensor) or 1-D tensors (per output channel).
   Each input or output and its related zero point must have same type.
-  When bias is present it must be quantized using scale = input scale * weight scale and 
+  When bias is present it must be quantized using scale = input scale * weight scale and
   zero point as 0.
   """
 
@@ -2669,7 +2939,9 @@ class QLinearConv(ONNXOp):
     kernel_shape=None, 
     pads=None, 
     strides=None):
-    super().__init__('QLinearConv', 1, x, x_scale, x_zero_point, w, w_scale, w_zero_point, y_scale, y_zero_point, B,
+    super().__init__('QLinearConv', 1,
+      [{'at::kByte'}, {'at::kFloat'}, {'at::kByte'}, {'at::kByte'}, {'at::kFloat'}, {'at::kByte'}, {'at::kFloat'}, {'at::kByte'}, {'at::kInt'}],
+      x,x_scale,x_zero_point,w,w_scale,w_zero_point,y_scale,y_zero_point,B,
       auto_pad=ONNXAttr(auto_pad, AttrType.STRING), 
       dilations=ONNXAttr(dilations, AttrType.INTS), 
       group=ONNXAttr(group, AttrType.INT), 
@@ -2690,7 +2962,9 @@ class QLinearMatMul(ONNXOp):
   """
 
   def __init__(self, a, a_scale, a_zero_point, b, b_scale, b_zero_point, y_scale, y_zero_point):
-    super().__init__('QLinearMatMul', 1, a, a_scale, a_zero_point, b, b_scale, b_zero_point, y_scale, y_zero_point)
+    super().__init__('QLinearMatMul', 1,
+      [{'at::kByte'}, {'at::kFloat'}, {'at::kByte'}, {'at::kByte'}, {'at::kFloat'}, {'at::kByte'}, {'at::kFloat'}, {'at::kByte'}],
+      a,a_scale,a_zero_point,b,b_scale,b_zero_point,y_scale,y_zero_point)
 
 class QuantizeLinear(ONNXOp):
   """
@@ -2702,7 +2976,9 @@ class QuantizeLinear(ONNXOp):
 
   def __init__(self, x, y_scale, y_zero_point,
     axis=None):
-    super().__init__('QuantizeLinear', 1, x, y_scale, y_zero_point,
+    super().__init__('QuantizeLinear', 1,
+      [{'at::kInt', 'at::kFloat'}, {'at::kFloat'}, {'at::kByte'}],
+      x,y_scale,y_zero_point,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class RandomNormal(ONNXOp):
@@ -2723,6 +2999,7 @@ class RandomNormal(ONNXOp):
     seed=None, 
     shape=None):
     super().__init__('RandomNormal', 1,
+      [],
       dtype=ONNXAttr(dtype, AttrType.INT), 
       mean=ONNXAttr(mean, AttrType.FLOAT), 
       scale=ONNXAttr(scale, AttrType.FLOAT), 
@@ -2745,7 +3022,9 @@ class RandomNormalLike(ONNXOp):
     mean=None, 
     scale=None, 
     seed=None):
-    super().__init__('RandomNormalLike', 1, input,
+    super().__init__('RandomNormalLike', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      input,
       dtype=ONNXAttr(dtype, AttrType.INT), 
       mean=ONNXAttr(mean, AttrType.FLOAT), 
       scale=ONNXAttr(scale, AttrType.FLOAT), 
@@ -2768,6 +3047,7 @@ class RandomUniform(ONNXOp):
     seed=None, 
     shape=None):
     super().__init__('RandomUniform', 1,
+      [],
       dtype=ONNXAttr(dtype, AttrType.INT), 
       high=ONNXAttr(high, AttrType.FLOAT), 
       low=ONNXAttr(low, AttrType.FLOAT), 
@@ -2790,7 +3070,9 @@ class RandomUniformLike(ONNXOp):
     high=None, 
     low=None, 
     seed=None):
-    super().__init__('RandomUniformLike', 1, input,
+    super().__init__('RandomUniformLike', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      input,
       dtype=ONNXAttr(dtype, AttrType.INT), 
       high=ONNXAttr(high, AttrType.FLOAT), 
       low=ONNXAttr(low, AttrType.FLOAT), 
@@ -2825,7 +3107,9 @@ class Range(ONNXOp):
   """
 
   def __init__(self, start, limit, delta):
-    super().__init__('Range', 1, start, limit, delta)
+    super().__init__('Range', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kShort', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kInt', 'at::kShort', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kInt', 'at::kShort', 'at::kFloat'}],
+      start,limit,delta)
 
 class Reciprocal(ONNXOp):
   """
@@ -2835,7 +3119,9 @@ class Reciprocal(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Reciprocal', 1, X)
+    super().__init__('Reciprocal', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class ReduceL1(ONNXOp):
   """
@@ -2850,7 +3136,9 @@ class ReduceL1(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceL1', 1, data,
+    super().__init__('ReduceL1', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2867,7 +3155,9 @@ class ReduceL2(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceL2', 1, data,
+    super().__init__('ReduceL2', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2884,7 +3174,9 @@ class ReduceLogSum(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceLogSum', 1, data,
+    super().__init__('ReduceLogSum', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2901,7 +3193,9 @@ class ReduceLogSumExp(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceLogSumExp', 1, data,
+    super().__init__('ReduceLogSumExp', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2918,7 +3212,9 @@ class ReduceMax(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceMax', 1, data,
+    super().__init__('ReduceMax', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2935,7 +3231,9 @@ class ReduceMean(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceMean', 1, data,
+    super().__init__('ReduceMean', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2952,7 +3250,9 @@ class ReduceMin(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceMin', 1, data,
+    super().__init__('ReduceMin', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2969,7 +3269,9 @@ class ReduceProd(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceProd', 1, data,
+    super().__init__('ReduceProd', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -2986,7 +3288,9 @@ class ReduceSum(ONNXOp):
   def __init__(self, data, axes,
     keepdims=None, 
     noop_with_empty_axes=None):
-    super().__init__('ReduceSum', 1, data, axes,
+    super().__init__('ReduceSum', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      data,axes,
       keepdims=ONNXAttr(keepdims, AttrType.INT), 
       noop_with_empty_axes=ONNXAttr(noop_with_empty_axes, AttrType.INT))
 
@@ -3003,7 +3307,9 @@ class ReduceSumSquare(ONNXOp):
   def __init__(self, data,
     axes=None, 
     keepdims=None):
-    super().__init__('ReduceSumSquare', 1, data,
+    super().__init__('ReduceSumSquare', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kHalf', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       axes=ONNXAttr(axes, AttrType.INTS), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -3015,7 +3321,9 @@ class Relu(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Relu', 1, X)
+    super().__init__('Relu', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      X)
 
 class Reshape(ONNXOp):
   """
@@ -3024,11 +3332,16 @@ class Reshape(ONNXOp):
   At most one dimension of the new shape can be -1. In this case, the value is
   inferred from the size of the tensor and the remaining dimensions. A dimension
   could also be 0, in which case the actual dimension value is unchanged (i.e. taken
-  from the input tensor).
+  from the input tensor). If 'allowzero' is set, and the new shape includes 0, the
+  dimension will be set explicitly to zero (i.e. not taken from input tensor).
   """
 
-  def __init__(self, data, shape):
-    super().__init__('Reshape', 1, data, shape)
+  def __init__(self, data, shape,
+    allowzero=None):
+    super().__init__('Reshape', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      data,shape,
+      allowzero=ONNXAttr(allowzero, AttrType.INT))
 
 class Resize(ONNXOp):
   """
@@ -3044,7 +3357,9 @@ class Resize(ONNXOp):
     extrapolation_value=None, 
     mode=None, 
     nearest_mode=None):
-    super().__init__('Resize', 1, X, roi, scales, sizes,
+    super().__init__('Resize', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kFloat'}, {'at::kLong'}],
+      X,roi,scales,sizes,
       coordinate_transformation_mode=ONNXAttr(coordinate_transformation_mode, AttrType.STRING), 
       cubic_coeff_a=ONNXAttr(cubic_coeff_a, AttrType.FLOAT), 
       exclude_outside=ONNXAttr(exclude_outside, AttrType.INT), 
@@ -3092,7 +3407,9 @@ class ReverseSequence(ONNXOp):
   def __init__(self, input, sequence_lens,
     batch_axis=None, 
     time_axis=None):
-    super().__init__('ReverseSequence', 1, input, sequence_lens,
+    super().__init__('ReverseSequence', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kLong'}],
+      input,sequence_lens,
       batch_axis=ONNXAttr(batch_axis, AttrType.INT), 
       time_axis=ONNXAttr(time_axis, AttrType.INT))
 
@@ -3167,14 +3484,18 @@ class RNN(ONNXOp):
     activations=None, 
     clip=None, 
     direction=None, 
-    hidden_size=None):
-    super().__init__('RNN', 2, X, W, R, B, sequence_lens, initial_h,
+    hidden_size=None, 
+    layout=None):
+    super().__init__('RNN', 2,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kInt'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,W,R,B,sequence_lens,initial_h,
       activation_alpha=ONNXAttr(activation_alpha, AttrType.FLOATS), 
       activation_beta=ONNXAttr(activation_beta, AttrType.FLOATS), 
       activations=ONNXAttr(activations, AttrType.STRINGS), 
       clip=ONNXAttr(clip, AttrType.FLOAT), 
       direction=ONNXAttr(direction, AttrType.STRING), 
-      hidden_size=ONNXAttr(hidden_size, AttrType.INT))
+      hidden_size=ONNXAttr(hidden_size, AttrType.INT), 
+      layout=ONNXAttr(layout, AttrType.INT))
 
 class RoiAlign(ONNXOp):
   """
@@ -3197,7 +3518,9 @@ class RoiAlign(ONNXOp):
     output_width=None, 
     sampling_ratio=None, 
     spatial_scale=None):
-    super().__init__('RoiAlign', 1, X, rois, batch_indices,
+    super().__init__('RoiAlign', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kDouble', 'at::kHalf', 'at::kFloat'}, {'at::kLong'}],
+      X,rois,batch_indices,
       mode=ONNXAttr(mode, AttrType.STRING), 
       output_height=ONNXAttr(output_height, AttrType.INT), 
       output_width=ONNXAttr(output_width, AttrType.INT), 
@@ -3222,7 +3545,9 @@ class Round(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Round', 1, X)
+    super().__init__('Round', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Scaler(ONNXOp):
   """
@@ -3232,7 +3557,9 @@ class Scaler(ONNXOp):
   def __init__(self, X,
     offset=None, 
     scale=None):
-    super().__init__('Scaler', 1, X,
+    super().__init__('Scaler', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       offset=ONNXAttr(offset, AttrType.FLOATS), 
       scale=ONNXAttr(scale, AttrType.FLOATS))
 
@@ -3367,7 +3694,9 @@ class Scan(ONNXOp):
     scan_input_directions=None, 
     scan_output_axes=None, 
     scan_output_directions=None):
-    super().__init__('Scan', 1, initial_state_and_scan_inputs,
+    super().__init__('Scan', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      initial_state_and_scan_inputs,
       body=ONNXAttr(body, AttrType.GRAPH), 
       num_scan_inputs=ONNXAttr(num_scan_inputs, AttrType.INT), 
       scan_input_axes=ONNXAttr(scan_input_axes, AttrType.INTS), 
@@ -3377,8 +3706,8 @@ class Scan(ONNXOp):
 
 class Scatter(ONNXOp):
   """
-  Given `data`, `updates` and `indices` input tensors of rank r >= 1, write the values provided by `updates` 
-  into the first input, `data`, along `axis` dimension of `data` (by default outer-most one as axis=0) at corresponding `indices`. 
+  Given `data`, `updates` and `indices` input tensors of rank r >= 1, write the values provided by `updates`
+  into the first input, `data`, along `axis` dimension of `data` (by default outer-most one as axis=0) at corresponding `indices`.
   For each entry in `updates`, the target index in `data` is specified by corresponding entry in `indices`
   for dimension = axis, and index in source for dimension != axis. For instance, in a 2-D tensor case,
   data[indices[i][j]][j] = updates[i][j] if axis = 0, or data[i][indices[i][j]] = updates[i][j] if axis = 1,
@@ -3412,7 +3741,9 @@ class Scatter(ONNXOp):
 
   def __init__(self, data, indices, updates,
     axis=None):
-    super().__init__('Scatter', 1, data, indices, updates,
+    super().__init__('Scatter', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kLong', 'at::kInt'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      data,indices,updates,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class ScatterElements(ONNXOp):
@@ -3433,7 +3764,7 @@ class ScatterElements(ONNXOp):
   For instance, in a 2-D tensor case, the update corresponding to the [i][j] entry
   is performed as below:
   ```
-    output[indices[i][j]][j] = updates[i][j] if axis = 0, 
+    output[indices[i][j]][j] = updates[i][j] if axis = 0,
     output[i][indices[i][j]] = updates[i][j] if axis = 1,
   ```
   
@@ -3472,7 +3803,9 @@ class ScatterElements(ONNXOp):
 
   def __init__(self, data, indices, updates,
     axis=None):
-    super().__init__('ScatterElements', 1, data, indices, updates,
+    super().__init__('ScatterElements', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong', 'at::kInt'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      data,indices,updates,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class ScatterND(ONNXOp):
@@ -3536,7 +3869,9 @@ class ScatterND(ONNXOp):
   """
 
   def __init__(self, data, indices, updates):
-    super().__init__('ScatterND', 1, data, indices, updates)
+    super().__init__('ScatterND', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      data,indices,updates)
 
 class Selu(ONNXOp):
   """
@@ -3549,7 +3884,9 @@ class Selu(ONNXOp):
   def __init__(self, X,
     alpha=None, 
     gamma=None):
-    super().__init__('Selu', 1, X,
+    super().__init__('Selu', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT), 
       gamma=ONNXAttr(gamma, AttrType.FLOAT))
 
@@ -3561,7 +3898,9 @@ class SequenceAt(ONNXOp):
   """
 
   def __init__(self, input_sequence, position):
-    super().__init__('SequenceAt', 1, input_sequence, position)
+    super().__init__('SequenceAt', 1,
+      [set(), {'at::kLong', 'at::kInt'}],
+      input_sequence,position)
 
 class SequenceConstruct(ONNXOp):
   """
@@ -3570,7 +3909,9 @@ class SequenceConstruct(ONNXOp):
   """
 
   def __init__(self, inputs):
-    super().__init__('SequenceConstruct', 1, inputs)
+    super().__init__('SequenceConstruct', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      inputs)
 
 class SequenceEmpty(ONNXOp):
   """
@@ -3580,6 +3921,7 @@ class SequenceEmpty(ONNXOp):
   def __init__(self,
     dtype=None):
     super().__init__('SequenceEmpty', 1,
+      [],
       dtype=ONNXAttr(dtype, AttrType.INT))
 
 class SequenceErase(ONNXOp):
@@ -3591,7 +3933,9 @@ class SequenceErase(ONNXOp):
   """
 
   def __init__(self, input_sequence, position):
-    super().__init__('SequenceErase', 1, input_sequence, position)
+    super().__init__('SequenceErase', 1,
+      [set(), {'at::kLong', 'at::kInt'}],
+      input_sequence,position)
 
 class SequenceInsert(ONNXOp):
   """
@@ -3603,7 +3947,9 @@ class SequenceInsert(ONNXOp):
   """
 
   def __init__(self, input_sequence, tensor, position):
-    super().__init__('SequenceInsert', 1, input_sequence, tensor, position)
+    super().__init__('SequenceInsert', 1,
+      [set(), {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kLong', 'at::kInt'}],
+      input_sequence,tensor,position)
 
 class SequenceLength(ONNXOp):
   """
@@ -3611,7 +3957,9 @@ class SequenceLength(ONNXOp):
   """
 
   def __init__(self, input_sequence):
-    super().__init__('SequenceLength', 1, input_sequence)
+    super().__init__('SequenceLength', 1,
+      [set()],
+      input_sequence)
 
 class Shape(ONNXOp):
   """
@@ -3619,7 +3967,9 @@ class Shape(ONNXOp):
   """
 
   def __init__(self, data):
-    super().__init__('Shape', 1, data)
+    super().__init__('Shape', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      data)
 
 class Shrink(ONNXOp):
   """
@@ -3632,7 +3982,9 @@ class Shrink(ONNXOp):
   def __init__(self, input,
     bias=None, 
     lambd=None):
-    super().__init__('Shrink', 1, input,
+    super().__init__('Shrink', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}],
+      input,
       bias=ONNXAttr(bias, AttrType.FLOAT), 
       lambd=ONNXAttr(lambd, AttrType.FLOAT))
 
@@ -3644,7 +3996,9 @@ class Sigmoid(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Sigmoid', 1, X)
+    super().__init__('Sigmoid', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Sign(ONNXOp):
   """
@@ -3653,7 +4007,9 @@ class Sign(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Sign', 1, input)
+    super().__init__('Sign', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      input)
 
 class Sin(ONNXOp):
   """
@@ -3661,7 +4017,9 @@ class Sin(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Sin', 1, input)
+    super().__init__('Sin', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Sinh(ONNXOp):
   """
@@ -3669,7 +4027,9 @@ class Sinh(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Sinh', 1, input)
+    super().__init__('Sinh', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Size(ONNXOp):
   """
@@ -3677,7 +4037,9 @@ class Size(ONNXOp):
   """
 
   def __init__(self, data):
-    super().__init__('Size', 1, data)
+    super().__init__('Size', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      data)
 
 class Slice(ONNXOp):
   """
@@ -3689,9 +4051,9 @@ class Slice(ONNXOp):
   start or end indices, it represents number of elements before the end of that
   dimension. If the value passed to start or end is larger than the `n` (the
   number of elements in this dimension), it represents `n`. For slicing to the
-  end of a dimension with unknown size, it is recommended to pass in `INT_MAX` 
+  end of a dimension with unknown size, it is recommended to pass in `INT_MAX`
   when sclicing forward and 'INT_MIN' when slicing backward.
-  If a negative value is passed for step, it represents slicing backward. 
+  If a negative value is passed for step, it represents slicing backward.
   However step value cannot be 0.
   If `axes` are omitted, they are set to `[0, ..., ndim-1]`.
   If `steps` are omitted, they are set to `[1, ..., 1]` of length `len(starts)`
@@ -3720,7 +4082,9 @@ class Slice(ONNXOp):
   """
 
   def __init__(self, data, starts, ends, axes, steps):
-    super().__init__('Slice', 1, data, starts, ends, axes, steps)
+    super().__init__('Slice', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong', 'at::kInt'}, {'at::kLong', 'at::kInt'}, {'at::kLong', 'at::kInt'}, {'at::kLong', 'at::kInt'}],
+      data,starts,ends,axes,steps)
 
 class Softmax(ONNXOp):
   """
@@ -3736,7 +4100,9 @@ class Softmax(ONNXOp):
 
   def __init__(self, input,
     axis=None):
-    super().__init__('Softmax', 1, input,
+    super().__init__('Softmax', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      input,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class SoftmaxCrossEntropyLoss(ONNXOp):
@@ -3777,7 +4143,9 @@ class SoftmaxCrossEntropyLoss(ONNXOp):
   def __init__(self, scores, labels, weights,
     ignore_index=None, 
     reduction=None):
-    super().__init__('SoftmaxCrossEntropyLoss', 2, scores, labels, weights,
+    super().__init__('SoftmaxCrossEntropyLoss', 2,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}, {'at::kLong', 'at::kInt'}, {'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      scores,labels,weights,
       ignore_index=ONNXAttr(ignore_index, AttrType.INT), 
       reduction=ONNXAttr(reduction, AttrType.STRING))
 
@@ -3789,7 +4157,9 @@ class Softplus(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Softplus', 1, X)
+    super().__init__('Softplus', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Softsign(ONNXOp):
   """
@@ -3797,7 +4167,9 @@ class Softsign(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Softsign', 1, input)
+    super().__init__('Softsign', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class SpaceToDepth(ONNXOp):
   """
@@ -3808,7 +4180,9 @@ class SpaceToDepth(ONNXOp):
 
   def __init__(self, input,
     blocksize=None):
-    super().__init__('SpaceToDepth', 1, input,
+    super().__init__('SpaceToDepth', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      input,
       blocksize=ONNXAttr(blocksize, AttrType.INT))
 
 class Split(ONNXOp):
@@ -3820,7 +4194,9 @@ class Split(ONNXOp):
 
   def __init__(self, input, split,
     axis=None):
-    super().__init__('Split', 1, input, split,
+    super().__init__('Split', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      input,split,
       axis=ONNXAttr(axis, AttrType.INT))
 
 class SplitToSequence(ONNXOp):
@@ -3840,7 +4216,9 @@ class SplitToSequence(ONNXOp):
   def __init__(self, input, split,
     axis=None, 
     keepdims=None):
-    super().__init__('SplitToSequence', 1, input, split,
+    super().__init__('SplitToSequence', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kLong', 'at::kInt'}],
+      input,split,
       axis=ONNXAttr(axis, AttrType.INT), 
       keepdims=ONNXAttr(keepdims, AttrType.INT))
 
@@ -3852,7 +4230,9 @@ class Sqrt(ONNXOp):
   """
 
   def __init__(self, X):
-    super().__init__('Sqrt', 1, X)
+    super().__init__('Sqrt', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      X)
 
 class Squeeze(ONNXOp):
   """
@@ -3863,7 +4243,9 @@ class Squeeze(ONNXOp):
   """
 
   def __init__(self, data, axes):
-    super().__init__('Squeeze', 1, data, axes)
+    super().__init__('Squeeze', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      data,axes)
 
 class StringNormalizer(ONNXOp):
   """
@@ -3883,7 +4265,9 @@ class StringNormalizer(ONNXOp):
     is_case_sensitive=None, 
     locale=None, 
     stopwords=None):
-    super().__init__('StringNormalizer', 1, X,
+    super().__init__('StringNormalizer', 1,
+      [set()],
+      X,
       case_change_action=ONNXAttr(case_change_action, AttrType.STRING), 
       is_case_sensitive=ONNXAttr(is_case_sensitive, AttrType.INT), 
       locale=ONNXAttr(locale, AttrType.STRING), 
@@ -3894,10 +4278,14 @@ class Sub(ONNXOp):
   Performs element-wise binary subtraction (with Numpy-style broadcasting support).
   
   This operator supports **multidirectional (i.e., Numpy-style) broadcasting**; for more details please check [the doc](Broadcasting.md).
+  
+  (Opset 14 change): Extend supported types to include uint8, int8, uint16, and int16.
   """
 
   def __init__(self, A, B):
-    super().__init__('Sub', 1, A, B)
+    super().__init__('Sub', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat', 'at::kBFloat16'}],
+      A,B)
 
 class Sum(ONNXOp):
   """
@@ -3907,7 +4295,9 @@ class Sum(ONNXOp):
   """
 
   def __init__(self, data_0):
-    super().__init__('Sum', 1, data_0)
+    super().__init__('Sum', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      data_0)
 
 class SVMClassifier(ONNXOp):
   """
@@ -3926,7 +4316,9 @@ class SVMClassifier(ONNXOp):
     rho=None, 
     support_vectors=None, 
     vectors_per_class=None):
-    super().__init__('SVMClassifier', 2, X,
+    super().__init__('SVMClassifier', 2,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       classlabels_ints=ONNXAttr(classlabels_ints, AttrType.INTS), 
       classlabels_strings=ONNXAttr(classlabels_strings, AttrType.STRINGS), 
       coefficients=ONNXAttr(coefficients, AttrType.FLOATS), 
@@ -3953,7 +4345,9 @@ class SVMRegressor(ONNXOp):
     post_transform=None, 
     rho=None, 
     support_vectors=None):
-    super().__init__('SVMRegressor', 1, X,
+    super().__init__('SVMRegressor', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       coefficients=ONNXAttr(coefficients, AttrType.FLOATS), 
       kernel_params=ONNXAttr(kernel_params, AttrType.FLOATS), 
       kernel_type=ONNXAttr(kernel_type, AttrType.STRING), 
@@ -3969,7 +4363,9 @@ class Tan(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Tan', 1, input)
+    super().__init__('Tan', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class Tanh(ONNXOp):
   """
@@ -3977,7 +4373,9 @@ class Tanh(ONNXOp):
   """
 
   def __init__(self, input):
-    super().__init__('Tanh', 1, input)
+    super().__init__('Tanh', 1,
+      [{'at::kDouble', 'at::kBFloat16', 'at::kHalf', 'at::kFloat'}],
+      input)
 
 class TfIdfVectorizer(ONNXOp):
   """
@@ -4020,7 +4418,9 @@ class TfIdfVectorizer(ONNXOp):
     pool_int64s=None, 
     pool_strings=None, 
     weights=None):
-    super().__init__('TfIdfVectorizer', 1, X,
+    super().__init__('TfIdfVectorizer', 1,
+      [{'at::kLong', 'at::kInt'}],
+      X,
       max_gram_length=ONNXAttr(max_gram_length, AttrType.INT), 
       max_skip_count=ONNXAttr(max_skip_count, AttrType.INT), 
       min_gram_length=ONNXAttr(min_gram_length, AttrType.INT), 
@@ -4040,7 +4440,9 @@ class ThresholdedRelu(ONNXOp):
 
   def __init__(self, X,
     alpha=None):
-    super().__init__('ThresholdedRelu', 1, X,
+    super().__init__('ThresholdedRelu', 1,
+      [{'at::kDouble', 'at::kHalf', 'at::kFloat'}],
+      X,
       alpha=ONNXAttr(alpha, AttrType.FLOAT))
 
 class Tile(ONNXOp):
@@ -4051,7 +4453,9 @@ class Tile(ONNXOp):
   """
 
   def __init__(self, input, repeats):
-    super().__init__('Tile', 1, input, repeats)
+    super().__init__('Tile', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      input,repeats)
 
 class TopK(ONNXOp):
   """
@@ -4075,7 +4479,9 @@ class TopK(ONNXOp):
     axis=None, 
     largest=None, 
     sorted=None):
-    super().__init__('TopK', 2, X, K,
+    super().__init__('TopK', 2,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kFloat'}, {'at::kLong'}],
+      X,K,
       axis=ONNXAttr(axis, AttrType.INT), 
       largest=ONNXAttr(largest, AttrType.INT), 
       sorted=ONNXAttr(sorted, AttrType.INT))
@@ -4089,13 +4495,15 @@ class Transpose(ONNXOp):
 
   def __init__(self, data,
     perm=None):
-    super().__init__('Transpose', 1, data,
+    super().__init__('Transpose', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}],
+      data,
       perm=ONNXAttr(perm, AttrType.INTS))
 
 class TreeEnsembleClassifier(ONNXOp):
   """
       Tree Ensemble classifier.  Returns the top class for each of N inputs.<br>
-      The attributes named 'nodes_X' form a sequence of tuples, associated by 
+      The attributes named 'nodes_X' form a sequence of tuples, associated by
       index into the sequences, which must all be of equal length. These tuples
       define the nodes.<br>
       Similarly, all fields prefixed with 'class_' are tuples of votes at the leaves.
@@ -4123,7 +4531,9 @@ class TreeEnsembleClassifier(ONNXOp):
     nodes_truenodeids=None, 
     nodes_values=None, 
     post_transform=None):
-    super().__init__('TreeEnsembleClassifier', 2, X,
+    super().__init__('TreeEnsembleClassifier', 2,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       base_values=ONNXAttr(base_values, AttrType.FLOATS), 
       class_ids=ONNXAttr(class_ids, AttrType.INTS), 
       class_nodeids=ONNXAttr(class_nodeids, AttrType.INTS), 
@@ -4174,7 +4584,9 @@ class TreeEnsembleRegressor(ONNXOp):
     target_nodeids=None, 
     target_treeids=None, 
     target_weights=None):
-    super().__init__('TreeEnsembleRegressor', 1, X,
+    super().__init__('TreeEnsembleRegressor', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kInt', 'at::kFloat'}],
+      X,
       aggregate_function=ONNXAttr(aggregate_function, AttrType.STRING), 
       base_values=ONNXAttr(base_values, AttrType.FLOATS), 
       n_targets=ONNXAttr(n_targets, AttrType.INT), 
@@ -4193,18 +4605,41 @@ class TreeEnsembleRegressor(ONNXOp):
       target_treeids=ONNXAttr(target_treeids, AttrType.INTS), 
       target_weights=ONNXAttr(target_weights, AttrType.FLOATS))
 
+class Trilu(ONNXOp):
+  """
+  Given a 2-D matrix or batches of 2-D matrices, returns the upper or lower triangular part of the tensor(s).
+  The attribute "upper" determines whether the upper or lower part is retained. If set to true,
+  the upper triangular matrix is retained. Lower triangular matrix is retained otherwise.
+  Default value for the "upper" attribute is true.
+  Trilu takes one input tensor of shape [*, N, M], where * is zero or more batch dimensions. The upper triangular part consists
+  of the elements on and above the given diagonal (k). The lower triangular part consists of elements on and below the diagonal.
+  All other elements in the matrix are set to zero.
+  If k = 0, the triangular part on and above/below the main diagonal is retained.
+  If upper is set to true, a positive k retains the upper triangular matrix excluding the main diagonal and (k-1) diagonals above it.
+  A negative k value retains the main diagonal and |k| diagonals below it.
+  If upper is set to false, a positive k retains the lower triangular matrix including the main diagonal and k diagonals above it.
+  A negative k value excludes the main diagonal and (|k|-1) diagonals below it.
+  """
+
+  def __init__(self, input, k,
+    upper=None):
+    super().__init__('Trilu', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      input,k,
+      upper=ONNXAttr(upper, AttrType.INT))
+
 class Unique(ONNXOp):
   """
-  Find the unique elements of a tensor. When an optional attribute 'axis' is provided, unique subtensors sliced along the 'axis' are returned. 
-  Otherwise the input tensor is flattened and unique values of the flattened tensor are returned. 
+  Find the unique elements of a tensor. When an optional attribute 'axis' is provided, unique subtensors sliced along the 'axis' are returned.
+  Otherwise the input tensor is flattened and unique values of the flattened tensor are returned.
   
-  This operator returns the unique values or sliced unique subtensors of the input tensor and three optional outputs. 
-  The first output tensor 'Y' contains all unique values or subtensors of the input. 
-  The second optional output tensor 'indices' contains indices of 'Y' elements' first occurance in 'X'.. 
-  The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'. ". 
-  The fourth optional output tensor 'counts' contains the count of each element of 'Y' in the input. 
+  This operator returns the unique values or sliced unique subtensors of the input tensor and three optional outputs.
+  The first output tensor 'Y' contains all unique values or subtensors of the input.
+  The second optional output tensor 'indices' contains indices of 'Y' elements' first occurrence in 'X'.
+  The third optional output tensor 'inverse_indices' contains, for elements of 'X', its corresponding indices in 'Y'.
+  The fourth optional output tensor 'counts' contains the count of each element of 'Y' in the input.
   
-  Outputs are either sorted in ascending order or optionally in the order of the first occurrence of the values in the input. 
+  Outputs are either sorted in ascending order or optionally in the order of the first occurrence of the values in the input.
   
   https://docs.scipy.org/doc/numpy/reference/generated/numpy.unique.html
   
@@ -4236,36 +4671,36 @@ class Unique(ONNXOp):
     output_counts = [2, 1]
   
   Example 4:
-    input_x = [[[1., 1.], [0., 1.], [2., 1.], [0., 1.]], 
+    input_x = [[[1., 1.], [0., 1.], [2., 1.], [0., 1.]],
                [[1., 1.], [0., 1.], [2., 1.], [0., 1.]]]
     attribute_sorted = 1
     attribute_axis = 1
   
-    intermediate data are presented below for better understanding: 
-    
+    intermediate data are presented below for better understanding:
+  
     there are 4 subtensors sliced along axis 1 of input_x (shape = (2, 4, 2)):
-    A: [[1, 1], [1, 1]], 
-       [[0, 1], [0, 1]], 
-       [[2, 1], [2, 1]], 
+    A: [[1, 1], [1, 1]],
+       [[0, 1], [0, 1]],
+       [[2, 1], [2, 1]],
        [[0, 1], [0, 1]].
-    
-    there are 3 unique subtensors: 
-    [[1, 1], [1, 1]], 
-    [[0, 1], [0, 1]], 
+  
+    there are 3 unique subtensors:
+    [[1, 1], [1, 1]],
+    [[0, 1], [0, 1]],
     [[2, 1], [2, 1]].
-    
+  
     sorted unique subtensors:
-    B: [[0, 1], [0, 1]], 
-       [[1, 1], [1, 1]], 
+    B: [[0, 1], [0, 1]],
+       [[1, 1], [1, 1]],
        [[2, 1], [2, 1]].
-    
+  
     output_Y is constructed from B:
-    [[[0. 1.], [1. 1.], [2. 1.]], 
+    [[[0. 1.], [1. 1.], [2. 1.]],
      [[0. 1.], [1. 1.], [2. 1.]]]
   
     output_indices is to map from B to A:
     [1, 0, 2]
-    
+  
     output_inverse_indices is to map from A to B:
     [1, 0, 2, 0]
   
@@ -4275,7 +4710,9 @@ class Unique(ONNXOp):
   def __init__(self, X,
     axis=None, 
     sorted=None):
-    super().__init__('Unique', 4, X,
+    super().__init__('Unique', 4,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      X,
       axis=ONNXAttr(axis, AttrType.INT), 
       sorted=ONNXAttr(sorted, AttrType.INT))
 
@@ -4290,12 +4727,14 @@ class Unsqueeze(ONNXOp):
   
   The input `axes` should not contain any duplicate entries. It is an error if it contains duplicates.
   The rank of the output tensor (`output_rank`) is the rank of the input tensor (`data`) plus the number of values in `axes`.
-  Each value in `axes` should be within the (inclusive) range [-output_rank , output_rank - 1]. 
-  The order of values in `axes` does not matter and can come in any order. 
+  Each value in `axes` should be within the (inclusive) range [-output_rank , output_rank - 1].
+  The order of values in `axes` does not matter and can come in any order.
   """
 
   def __init__(self, data, axes):
-    super().__init__('Unsqueeze', 1, data, axes)
+    super().__init__('Unsqueeze', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat', 'at::kBFloat16'}, {'at::kLong'}],
+      data,axes)
 
 class Upsample(ONNXOp):
   """
@@ -4306,7 +4745,9 @@ class Upsample(ONNXOp):
 
   def __init__(self, X, scales,
     mode=None):
-    super().__init__('Upsample', 1, X, scales,
+    super().__init__('Upsample', 1,
+      [{'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kFloat'}],
+      X,scales,
       mode=ONNXAttr(mode, AttrType.STRING))
 
 class Where(ONNXOp):
@@ -4318,7 +4759,9 @@ class Where(ONNXOp):
   """
 
   def __init__(self, condition, X, Y):
-    super().__init__('Where', 1, condition, X, Y)
+    super().__init__('Where', 1,
+      [{'at::kBool'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}, {'at::kDouble', 'at::kLong', 'at::kByte', 'at::kInt', 'at::kHalf', 'at::kShort', 'at::kBool', 'at::kFloat'}],
+      condition,X,Y)
 
 class Xor(ONNXOp):
   """
@@ -4329,7 +4772,9 @@ class Xor(ONNXOp):
   """
 
   def __init__(self, A, B):
-    super().__init__('Xor', 1, A, B)
+    super().__init__('Xor', 1,
+      [{'at::kBool'}, {'at::kBool'}],
+      A,B)
 
 class ZipMap(ONNXOp):
   """
@@ -4342,193 +4787,197 @@ class ZipMap(ONNXOp):
   def __init__(self, X,
     classlabels_int64s=None, 
     classlabels_strings=None):
-    super().__init__('ZipMap', 1, X,
+    super().__init__('ZipMap', 1,
+      [{'at::kFloat'}],
+      X,
       classlabels_int64s=ONNXAttr(classlabels_int64s, AttrType.INTS), 
       classlabels_strings=ONNXAttr(classlabels_strings, AttrType.STRINGS))
 
 onnx_ops = {
+  'adam': Adam,
+  'adagrad': Adagrad,
   'momentum': Momentum,
   'gradient': Gradient,
-  'treeensembleclassifier': TreeEnsembleClassifier,
-  'scaler': Scaler,
-  'linearregressor': LinearRegressor,
+  'zipmap': ZipMap,
+  'onehotencoder': OneHotEncoder,
+  'normalizer': Normalizer,
+  'linearclassifier': LinearClassifier,
   'labelencoder': LabelEncoder,
   'imputer': Imputer,
-  'linearclassifier': LinearClassifier,
-  'binarizer': Binarizer,
-  'lessorequal': LessOrEqual,
-  'celu': Celu,
-  'concatfromsequence': ConcatFromSequence,
-  'sequenceat': SequenceAt,
-  'sequenceinsert': SequenceInsert,
-  'gathernd': GatherND,
-  'scatternd': ScatterND,
-  'det': Det,
-  'scatterelements': ScatterElements,
-  'gatherelements': GatherElements,
-  'splittosequence': SplitToSequence,
-  'dynamicquantizelinear': DynamicQuantizeLinear,
-  'round': Round,
-  'cumsum': CumSum,
-  'bitshift': BitShift,
-  'roialign': RoiAlign,
-  'reversesequence': ReverseSequence,
-  'nonmaxsuppression': NonMaxSuppression,
-  'isinf': IsInf,
-  'quantizelinear': QuantizeLinear,
-  'qlinearconv': QLinearConv,
-  'convinteger': ConvInteger,
-  'qlinearmatmul': QLinearMatMul,
-  'matmulinteger': MatMulInteger,
-  'stringnormalizer': StringNormalizer,
-  'meanvariancenormalization': MeanVarianceNormalization,
-  'tfidfvectorizer': TfIdfVectorizer,
+  'featurevectorizer': FeatureVectorizer,
   'treeensembleregressor': TreeEnsembleRegressor,
-  'range': Range,
-  'nonzero': NonZero,
-  'sign': Sign,
-  'isnan': IsNaN,
-  'sequenceerase': SequenceErase,
-  'shrink': Shrink,
-  'sinh': Sinh,
-  'mod': Mod,
-  'scatter': Scatter,
-  'onehot': OneHot,
-  'maxunpool': MaxUnpool,
-  'eyelike': EyeLike,
-  'constantofshape': ConstantOfShape,
-  'compress': Compress,
-  'scan': Scan,
-  'dequantizelinear': DequantizeLinear,
-  'thresholdedrelu': ThresholdedRelu,
-  'expand': Expand,
-  'multinomial': Multinomial,
-  'asin': Asin,
-  'xor': Xor,
-  'einsum': Einsum,
-  'floor': Floor,
-  'reducesumsquare': ReduceSumSquare,
-  'upsample': Upsample,
-  'and': And,
-  'tile': Tile,
-  'sub': Sub,
-  'squeeze': Squeeze,
-  'acosh': Acosh,
-  'reducelogsum': ReduceLogSum,
-  'split': Split,
-  'where': Where,
-  'sqrt': Sqrt,
-  'softsign': Softsign,
-  'softplus': Softplus,
-  'cos': Cos,
-  'spacetodepth': SpaceToDepth,
-  'greaterorequal': GreaterOrEqual,
-  'softmax': Softmax,
-  'erf': Erf,
-  'size': Size,
-  'max': Max,
-  'tanh': Tanh,
-  'transpose': Transpose,
-  'shape': Shape,
-  'onehotencoder': OneHotEncoder,
-  'selu': Selu,
-  'adam': Adam,
-  'sum': Sum,
-  'relu': Relu,
-  'negativeloglikelihoodloss': NegativeLogLikelihoodLoss,
-  'sequencelength': SequenceLength,
-  'reducemin': ReduceMin,
-  'reducel1': ReduceL1,
-  'reciprocal': Reciprocal,
-  'mul': Mul,
-  'randomuniformlike': RandomUniformLike,
-  'sin': Sin,
-  'sigmoid': Sigmoid,
-  'randomnormallike': RandomNormalLike,
-  'asinh': Asinh,
-  'rnn': RNN,
-  'pad': Pad,
-  'slice': Slice,
-  'greater': Greater,
-  'reducelogsumexp': ReduceLogSumExp,
-  'or': Or,
-  'neg': Neg,
-  'mean': Mean,
-  'adagrad': Adagrad,
-  'reshape': Reshape,
   'dictvectorizer': DictVectorizer,
-  'reducel2': ReduceL2,
-  'arrayfeatureextractor': ArrayFeatureExtractor,
-  'flatten': Flatten,
+  'castmap': CastMap,
+  'shape': Shape,
+  'reshape': Reshape,
+  'binarizer': Binarizer,
+  'reciprocal': Reciprocal,
+  'leakyrelu': LeakyRelu,
+  'hardsigmoid': HardSigmoid,
+  'treeensembleclassifier': TreeEnsembleClassifier,
+  'reducemin': ReduceMin,
+  'div': Div,
+  'randomnormallike': RandomNormalLike,
   'randomnormal': RandomNormal,
+  'greaterorequal': GreaterOrEqual,
+  'pow': Pow,
+  'or': Or,
+  'mul': Mul,
+  'min': Min,
+  'floor': Floor,
+  'mean': Mean,
+  'lrn': LRN,
+  'scaler': Scaler,
+  'max': Max,
+  'round': Round,
+  'lppool': LpPool,
+  'sigmoid': Sigmoid,
+  'relu': Relu,
+  'quantizelinear': QuantizeLinear,
+  'logsoftmax': LogSoftmax,
+  'randomuniform': RandomUniform,
+  'depthtospace': DepthToSpace,
+  'concat': Concat,
+  'bitshift': BitShift,
+  'ceil': Ceil,
+  'gather': Gather,
+  'log': Log,
+  'reducesumsquare': ReduceSumSquare,
+  'dropout': Dropout,
+  'greater': Greater,
+  'reducesum': ReduceSum,
+  'sequenceempty': SequenceEmpty,
+  'neg': Neg,
+  'constant': Constant,
+  'maxpool': MaxPool,
+  'sub': Sub,
+  'reducelogsumexp': ReduceLogSumExp,
+  'xor': Xor,
+  'globallppool': GlobalLpPool,
+  'upsample': Upsample,
+  'prelu': PRelu,
+  'loop': Loop,
+  'lpnormalization': LpNormalization,
+  'dynamicquantizelinear': DynamicQuantizeLinear,
+  'splittosequence': SplitToSequence,
+  'linearregressor': LinearRegressor,
+  'add': Add,
+  'selu': Selu,
+  'reducemax': ReduceMax,
+  'and': And,
+  'abs': Abs,
+  'qlinearmatmul': QLinearMatMul,
+  'lessorequal': LessOrEqual,
+  'clip': Clip,
+  'argmax': ArgMax,
+  'einsum': Einsum,
+  'hardmax': Hardmax,
   'conv': Conv,
   'globalmaxpool': GlobalMaxPool,
-  'lppool': LpPool,
-  'reducemax': ReduceMax,
-  'loop': Loop,
-  'zipmap': ZipMap,
-  'log': Log,
-  'leakyrelu': LeakyRelu,
-  'batchnormalization': BatchNormalization,
-  'cosh': Cosh,
-  'cast': Cast,
-  'not': Not,
-  'lstm': LSTM,
-  'unsqueeze': Unsqueeze,
-  'topk': TopK,
-  'argmax': ArgMax,
-  'lrn': LRN,
-  'sequenceempty': SequenceEmpty,
-  'acos': Acos,
-  'randomuniform': RandomUniform,
-  'normalizer': Normalizer,
+  'maxunpool': MaxUnpool,
+  'argmin': ArgMin,
+  'averagepool': AveragePool,
+  'sqrt': Sqrt,
+  'size': Size,
   'instancenormalization': InstanceNormalization,
-  'softmaxcrossentropyloss': SoftmaxCrossEntropyLoss,
-  'concat': Concat,
-  'if': If,
-  'categorymapper': CategoryMapper,
-  'maxroipool': MaxRoiPool,
-  'clip': Clip,
+  'gemm': Gemm,
+  'reducelogsum': ReduceLogSum,
+  'cos': Cos,
+  'not': Not,
+  'eyelike': EyeLike,
+  'equal': Equal,
+  'cast': Cast,
+  'exp': Exp,
+  'flatten': Flatten,
+  'svmclassifier': SVMClassifier,
+  'roialign': RoiAlign,
+  'reducemean': ReduceMean,
+  'scatter': Scatter,
+  'split': Split,
   'identity': Identity,
-  'svmregressor': SVMRegressor,
-  'reduceprod': ReduceProd,
-  'prelu': PRelu,
-  'gather': Gather,
-  'atanh': Atanh,
-  'hardsigmoid': HardSigmoid,
-  'matmul': MatMul,
-  'gru': GRU,
-  'resize': Resize,
-  'globallppool': GlobalLpPool,
-  'sequenceconstruct': SequenceConstruct,
-  'elu': Elu,
+  'reducel2': ReduceL2,
   'globalaveragepool': GlobalAveragePool,
   'tan': Tan,
-  'exp': Exp,
+  'reducel1': ReduceL1,
+  'lstm': LSTM,
+  'slice': Slice,
+  'softmax': Softmax,
+  'softmaxcrossentropyloss': SoftmaxCrossEntropyLoss,
+  'categorymapper': CategoryMapper,
+  'maxroipool': MaxRoiPool,
+  'softsign': Softsign,
+  'gathernd': GatherND,
+  'batchnormalization': BatchNormalization,
+  'spacetodepth': SpaceToDepth,
+  'squeeze': Squeeze,
   'unique': Unique,
-  'argmin': ArgMin,
-  'add': Add,
-  'constant': Constant,
-  'equal': Equal,
-  'reducesum': ReduceSum,
-  'featurevectorizer': FeatureVectorizer,
-  'pow': Pow,
-  'maxpool': MaxPool,
-  'min': Min,
-  'div': Div,
-  'svmclassifier': SVMClassifier,
-  'reducemean': ReduceMean,
+  'sum': Sum,
+  'sinh': Sinh,
   'less': Less,
-  'dropout': Dropout,
-  'depthtospace': DepthToSpace,
-  'ceil': Ceil,
+  'tanh': Tanh,
+  'isnan': IsNaN,
+  'tile': Tile,
+  'multinomial': Multinomial,
+  'topk': TopK,
+  'reversesequence': ReverseSequence,
+  'transpose': Transpose,
+  'stringnormalizer': StringNormalizer,
+  'acos': Acos,
+  'asin': Asin,
+  'gru': GRU,
   'atan': Atan,
-  'logsoftmax': LogSoftmax,
-  'averagepool': AveragePool,
-  'hardmax': Hardmax,
-  'castmap': CastMap,
-  'abs': Abs,
+  'sign': Sign,
+  'trilu': Trilu,
+  'where': Where,
+  'sin': Sin,
+  'shrink': Shrink,
+  'matmul': MatMul,
+  'expand': Expand,
+  'scan': Scan,
+  'compress': Compress,
+  'elu': Elu,
+  'unsqueeze': Unsqueeze,
+  'constantofshape': ConstantOfShape,
+  'onehot': OneHot,
+  'sequenceat': SequenceAt,
+  'cosh': Cosh,
+  'asinh': Asinh,
+  'rnn': RNN,
+  'acosh': Acosh,
+  'atanh': Atanh,
+  'erf': Erf,
+  'nonzero': NonZero,
+  'meanvariancenormalization': MeanVarianceNormalization,
+  'scatternd': ScatterND,
+  'randomuniformlike': RandomUniformLike,
+  'resize': Resize,
+  'mod': Mod,
+  'thresholdedrelu': ThresholdedRelu,
+  'matmulinteger': MatMulInteger,
+  'pad': Pad,
+  'convinteger': ConvInteger,
+  'qlinearconv': QLinearConv,
+  'celu': Celu,
   'convtranspose': ConvTranspose,
-  'lpnormalization': LpNormalization,
-  'gemm': Gemm,
+  'dequantizelinear': DequantizeLinear,
+  'sequencelength': SequenceLength,
+  'nonmaxsuppression': NonMaxSuppression,
+  'isinf': IsInf,
+  'cumsum': CumSum,
+  'softplus': Softplus,
+  'gatherelements': GatherElements,
+  'scatterelements': ScatterElements,
+  'range': Range,
+  'svmregressor': SVMRegressor,
+  'negativeloglikelihoodloss': NegativeLogLikelihoodLoss,
+  'det': Det,
+  'sequenceconstruct': SequenceConstruct,
+  'if': If,
+  'sequenceinsert': SequenceInsert,
+  'tfidfvectorizer': TfIdfVectorizer,
+  'sequenceerase': SequenceErase,
+  'concatfromsequence': ConcatFromSequence,
+  'hardswish': HardSwish,
+  'reduceprod': ReduceProd,
+  'arrayfeatureextractor': ArrayFeatureExtractor,
 }
\ No newline at end of file
diff --git a/orttraining/orttraining/eager/ort_aten.cpp b/orttraining/orttraining/eager/ort_aten.cpp
index 79f33c41a2..00969e1339 100644
--- a/orttraining/orttraining/eager/ort_aten.cpp
+++ b/orttraining/orttraining/eager/ort_aten.cpp
@@ -3,6 +3,8 @@
 
 #include "ort_aten.h"
 #include "ort_tensor.h"
+#include <c10/core/TensorImpl.h>
+#include <ATen/native/CPUFallback.h>
 
 namespace torch_ort {
 namespace eager {
@@ -67,6 +69,8 @@ onnxruntime::MLDataType ort_scalar_type_from_aten(
       return onnxruntime::DataTypeImpl::GetType<int16_t>();
     case at::kLong:
       return onnxruntime::DataTypeImpl::GetType<int64_t>();
+    case at::kBool:
+      return onnxruntime::DataTypeImpl::GetType<bool>();
     default:
       ORT_THROW("Unsupport aten scalar type: ", dtype);
   }
@@ -156,13 +160,26 @@ onnx::AttributeProto create_ort_attribute(
   return attr;
 }
 
+bool IsSupportedType(at::Scalar scalar, const std::vector<at::ScalarType>& valid_types){
+  return std::find(valid_types.begin(), valid_types.end(), scalar.type()) != valid_types.end();
+}
+
+bool IsSupportedType(at::Tensor tensor, const std::vector<at::ScalarType>& valid_types){
+  return std::find(valid_types.begin(), valid_types.end(), tensor.scalar_type()) != valid_types.end();
+}
+
+bool IsSupportedType(at::IntArrayRef arrary, const std::vector<at::ScalarType>& valid_types){
+  return std::find(valid_types.begin(), valid_types.end(), at::kInt) != valid_types.end() ||
+         std::find(valid_types.begin(), valid_types.end(), at::kLong) != valid_types.end();
+}
+
 //#pragma endregion
 
 //#pragma region Hand-Implemented ATen Ops
 
 namespace aten {
 
-at::Tensor empty__memory_format(
+at::Tensor empty_memory_format(
   at::IntArrayRef size,
   // *,
   c10::optional<at::ScalarType> dtype_opt,
@@ -184,7 +201,7 @@ at::Tensor empty__memory_format(
     ort_scalar_type_from_aten(*dtype_opt),
     size.vec(),
     &ot);
-
+  
   return aten_tensor_from_ort(
     std::move(ot),
     at::TensorOptions()
@@ -222,15 +239,20 @@ at::Tensor empty_strided(
       .dtype(dtype));
 }
 
-at::Tensor reshape(at::Tensor const& self, at::IntArrayRef shape) {
-  ORT_LOG_FN(self, shape);
-
+at::Tensor _reshape_alias(
+  const at::Tensor& self, 
+  at::IntArrayRef size, 
+  at::IntArrayRef stride){
+  ORT_LOG_FN(self, size, stride);
+  // TODO: support stride; it is currently only logged and otherwise ignored by this implementation.
   auto& invoker = GetORTInvoker(self.device());
   return aten_tensor_from_ort(
     reshape_copy(
       invoker,
       create_ort_value(invoker, self),
-      shape.vec()),
+      at::infer_size(
+        size,
+        self.numel())),
     self.options());
 }
 
@@ -248,6 +270,29 @@ at::Tensor view(const at::Tensor& self, at::IntArrayRef size) {
     self.options());
 }
 
+ONNX_NAMESPACE::TensorProto_DataType GetONNXTensorProtoDataType(at::ScalarType dtype){
+  switch (dtype){
+    case at::kFloat:
+      return ONNX_NAMESPACE::TensorProto_DataType_FLOAT;
+    case at::kDouble:
+      return ONNX_NAMESPACE::TensorProto_DataType_DOUBLE;
+    case at::kHalf:
+      return ONNX_NAMESPACE::TensorProto_DataType_FLOAT16;
+    case at::kBFloat16:
+      return ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16;
+    case at::kInt:
+      return ONNX_NAMESPACE::TensorProto_DataType_INT32;
+    case at::kShort:
+      return ONNX_NAMESPACE::TensorProto_DataType_INT16;
+    case at::kLong:
+      return ONNX_NAMESPACE::TensorProto_DataType_INT64;
+    case at::kBool:
+      return ONNX_NAMESPACE::TensorProto_DataType_BOOL;
+    default:
+      ORT_THROW("Unsupport aten scalar type: ", dtype);
+  }
+}
+
 at::Tensor& copy_(
   at::Tensor& self,
   const at::Tensor& src,
@@ -262,8 +307,45 @@ at::Tensor& copy_(
     : src.device());
   const auto ort_src = create_ort_value(invoker, src);
   auto ort_self = create_ort_value(invoker, self);
+  if (self.scalar_type() != src.scalar_type()){
+    // invoke cast first
+    std::vector<OrtValue> ort_cast_output(1);
+    onnxruntime::NodeAttributes attrs(1);
+    attrs["to"] = create_ort_attribute(
+      "to", (int64_t)GetONNXTensorProtoDataType(self.scalar_type()), at::kLong);
 
-  copy(invoker, ort_src, ort_self);
+    auto status = invoker.Invoke("Cast", {
+      std::move(ort_src),
+    }, ort_cast_output, &attrs);
+  
+    if (!status.IsOK())
+      throw std::runtime_error(
+        "ORT return failure status:" + status.ErrorMessage());
+    
+    copy(invoker, ort_cast_output[0], ort_self);
+  }
+  else{
+    copy(invoker, ort_src, ort_self);
+  }
+  
+  return self;
+}
+
+at::Tensor _copy_from_and_resize(
+  const at::Tensor& self, 
+  const at::Tensor& dst){
+  ORT_LOG_FN(self, dst);
+
+  assert_tensor_supported(self);
+  assert_tensor_supported(dst);
+
+  auto& invoker = GetORTInvoker(self.device().type() == at::kORT
+    ? self.device()
+    : dst.device());
+  const auto ort_self = create_ort_value(invoker, self);
+  auto ort_dst = create_ort_value(invoker, dst);
+
+  copy(invoker, ort_self, ort_dst);
 
   return self;
 }
diff --git a/orttraining/orttraining/eager/ort_aten.h b/orttraining/orttraining/eager/ort_aten.h
index 2aea68b7a7..b91c9ab381 100644
--- a/orttraining/orttraining/eager/ort_aten.h
+++ b/orttraining/orttraining/eager/ort_aten.h
@@ -73,5 +73,11 @@ onnx::AttributeProto create_ort_attribute(
   const char* name,
   const char* value);
 
+bool IsSupportedType(at::Scalar scalar, const std::vector<at::ScalarType>& valid_types);
+
+bool IsSupportedType(at::Tensor tensor, const std::vector<at::ScalarType>& valid_types);
+
+bool IsSupportedType(at::IntArrayRef arrary, const std::vector<at::ScalarType>& valid_types);
+
 } // namespace eager
 } // namespace torch_ort
\ No newline at end of file
diff --git a/orttraining/orttraining/eager/ort_tensor.h b/orttraining/orttraining/eager/ort_tensor.h
index 59de400903..e947e2344b 100644
--- a/orttraining/orttraining/eager/ort_tensor.h
+++ b/orttraining/orttraining/eager/ort_tensor.h
@@ -5,6 +5,7 @@
 
 #include <c10/core/TensorImpl.h>
 #include <core/framework/ort_value.h>
+#include <iostream>
 
 namespace torch_ort {
 namespace eager {
diff --git a/orttraining/orttraining/eager/test/ort_eps_test.py b/orttraining/orttraining/eager/test/ort_eps_test.py
index e9d848be5d..9122b09b21 100644
--- a/orttraining/orttraining/eager/test/ort_eps_test.py
+++ b/orttraining/orttraining/eager/test/ort_eps_test.py
@@ -120,6 +120,13 @@ class OrtEPTests(unittest.TestCase):
         ort_device = torch_ort.device(1)
     assert 'My EP provider created, with device id: 0, some_option: val' in out.capturedtext
 
+  # The print test is disabled for now; it depends on a PyTorch PR that must be merged first.
+  #def test_print(self):
+  #  x = torch.ones(1, 2)
+  #  ort_x = x.to('ort')
+  #  with OutputGrabber() as out:
+  #      print(ort_x)
+  #  assert "tensor([[1., 1.]], device='ort:0')" in out.capturedtext
 
 if __name__ == '__main__':
   unittest.main()
\ No newline at end of file
diff --git a/orttraining/orttraining/eager/test/ort_ops.py b/orttraining/orttraining/eager/test/ort_ops.py
index 8e2461218e..f8f27e163a 100644
--- a/orttraining/orttraining/eager/test/ort_ops.py
+++ b/orttraining/orttraining/eager/test/ort_ops.py
@@ -25,6 +25,14 @@ class OrtOpTests(unittest.TestCase):
     assert torch.allclose(
       torch.add(cpu_ones, cpu_ones, alpha=2.5),
       torch.add(ort_ones, ort_ones, alpha=2.5).cpu())
+  
+  def test_mul_bool(self):
+    device = self.get_device()
+    cpu_ones = torch.ones(3, 3, dtype=bool)
+    ort_ones = cpu_ones.to(device)
+    assert torch.allclose(
+      torch.mul(cpu_ones, cpu_ones),
+      torch.mul(ort_ones, ort_ones).cpu())
 
   def test_add_(self):
     device = self.get_device()
@@ -68,6 +76,20 @@ class OrtOpTests(unittest.TestCase):
     cpu_ans = cpu_ones * 4
     ort_ans = torch_ort.custom_ops.gemm(ort_ones, ort_ones, ort_ones, 1.0, 1.0, 0, 0)
     assert torch.allclose(cpu_ans, ort_ans.cpu())
+  
+  def test_max(self):
+    cpu_tensor = torch.rand(10, 10)
+    ort_tensor = cpu_tensor.to('ort')
+    y = ort_tensor.max()
+    x = cpu_tensor.max()
+    assert torch.allclose(x, y.cpu())
+  
+  def test_min(self):
+    cpu_tensor = torch.rand(10, 10)
+    ort_tensor = cpu_tensor.to('ort')
+    y = ort_tensor.min()
+    x = cpu_tensor.min()
+    assert torch.allclose(x, y.cpu())
 
 if __name__ == '__main__':
   unittest.main()
\ No newline at end of file
diff --git a/orttraining/orttraining/eager/test/ort_tensor.py b/orttraining/orttraining/eager/test/ort_tensor.py
index c0a1b8eb5e..772c26287a 100644
--- a/orttraining/orttraining/eager/test/ort_tensor.py
+++ b/orttraining/orttraining/eager/test/ort_tensor.py
@@ -19,6 +19,13 @@ class OrtTensorTests(unittest.TestCase):
     ort_ones = cpu_ones.to('ort')
     assert ort_ones.is_ort
     assert torch.allclose(cpu_ones, ort_ones.cpu())
+  
+  def test_reshape(self):
+    cpu_ones = torch.ones(10, 10)
+    ort_ones = cpu_ones.to('ort')
+    y = ort_ones.reshape(-1)
+    assert len(y.size()) == 1
+    assert y.size()[0] == 100
 
 if __name__ == '__main__':
   unittest.main()
\ No newline at end of file
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
index 53f885d8eb..4535cd5e3f 100644
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -1574,13 +1574,13 @@ TEST(GradientCheckerTest, SigmoidGrad) {
   UnaryOpGradientTest("Sigmoid");
 }
 
-void GradientCheckerSoftmaxGradHelper(bool is_log_softmax) {
+void GradientCheckerSoftmaxGradHelper(bool is_log_softmax, int version = 11) {
   TensorShape shape({3, 4, 5});
   float max_error;
   GradientChecker<float, float, float> gradient_checker;
 
   const std::string op = is_log_softmax ? "LogSoftmax" : "Softmax";
-  OpDef op_def{op};
+  OpDef op_def{op, kOnnxDomain, version};
 
   // default_axis
   {
@@ -1594,6 +1594,12 @@ void GradientCheckerSoftmaxGradHelper(bool is_log_softmax) {
     EXPECT_IS_TINY(max_error);
   }
 
+  // axis=1
+  {
+    ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error, {MakeAttribute("axis", int64_t(1))}));
+    EXPECT_IS_TINY(max_error);
+  }
+
   // axis=2
   {
     ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {shape}, {shape}, &max_error, {MakeAttribute("axis", int64_t(2))}));
@@ -1603,10 +1609,12 @@ void GradientCheckerSoftmaxGradHelper(bool is_log_softmax) {
 
 TEST(GradientCheckerTest, SoftMaxGrad) {
   GradientCheckerSoftmaxGradHelper(false);
+  GradientCheckerSoftmaxGradHelper(false, 13);
 }
 
 TEST(GradientCheckerTest, LogSoftMaxGrad) {
   GradientCheckerSoftmaxGradHelper(true);
+  GradientCheckerSoftmaxGradHelper(true, 13);
 }
 
 void TestSoftmaxCrossEntropyGrad(const TensorShape& input_shape, const std::string& reduction) {
diff --git a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc
index 3665526277..13b186729d 100644
--- a/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc
+++ b/orttraining/orttraining/test/graph/optimizer_graph_builder_test.cc
@@ -196,7 +196,7 @@ static void TestOptimizerGraphBuilderWithInitialStates(OptimizerGraphConfig conf
   std::unordered_map<std::string, std::unordered_map<std::string, std::string>> opt_initializer_names_map;
   ASSERT_STATUS_OK(optimizer_graph_builder.Build(graph, opt_initializer_names_map, opt_graph_outputs));
 
-  const ONNX_NAMESPACE::TensorProto* tensor;
+  const ONNX_NAMESPACE::TensorProto* tensor{};
   for (auto& weight_item : opt_initializer_names_map) {
     for (auto& opt_item : weight_item.second) {
       ASSERT_TRUE(graph.GetInitializedTensor(opt_item.second, tensor));
diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
index d12fa746dd..121775d695 100644
--- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
+++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py
@@ -1118,8 +1118,8 @@ def test_gradient_correctness_einsum(equation):
         pt_prediction = run_step(pt_model, pt_input_left, pt_input_right)
         ort_prediction = run_step(ort_model, ort_input_left, ort_input_right)
 
-        _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-5)
-        _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model)
+        _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-3)
+        _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, atol=1e-3, rtol=1e-3)
 
 def test_gradient_correctness_einsum_2():
     class NeuralNetEinsum(torch.nn.Module):
@@ -1202,8 +1202,8 @@ def test_gradient_correctness_einsum_2():
             pt_prediction = run_step(pt_model, pt_input_left, pt_input_right)
             ort_prediction = run_step(ort_model, ort_input_left, ort_input_right)
 
-            _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-4, rtol=1e-5)
-            _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model)
+            _test_helpers.assert_values_are_close(ort_prediction, pt_prediction, atol=1e-3, rtol=1e-3)
+            _test_helpers.assert_gradients_match_and_reset_gradient(ort_model, pt_model, atol=1e-3, rtol=1e-3)
 
 # Since multinomial is a generator function, we do not have to test for gradient
 # Two consecutive calls on the torch.multinomail on a probability distribution with more
diff --git a/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc b/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc
index 76d4f9dde5..800fd48221 100644
--- a/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc
+++ b/orttraining/orttraining/training_ops/cpu/cpu_training_kernels.cc
@@ -40,6 +40,8 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, ConvG
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, ReluGrad);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SoftmaxGrad);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, LogSoftmaxGrad);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SoftmaxGrad_13);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, LogSoftmaxGrad_13);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, AveragePoolGrad);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, MaxPoolGrad);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, GatherGrad);
@@ -149,6 +151,8 @@ Status RegisterCpuTrainingKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, ReluGrad)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SoftmaxGrad)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, LogSoftmaxGrad)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, SoftmaxGrad_13)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, LogSoftmaxGrad_13)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, AveragePoolGrad)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 9, MaxPoolGrad)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, GatherGrad)>,
diff --git a/orttraining/orttraining/training_ops/cpu/op_gradients.cc b/orttraining/orttraining/training_ops/cpu/op_gradients.cc
index bbcfdc6ec8..a5ef415374 100644
--- a/orttraining/orttraining/training_ops/cpu/op_gradients.cc
+++ b/orttraining/orttraining/training_ops/cpu/op_gradients.cc
@@ -9,6 +9,7 @@
 #include "core/util/math.h"
 #include "core/providers/cpu/math/element_wise_ops.h"
 #include "core/providers/cpu/math/matmul_helper.h"
+#include "core/providers/cpu/tensor/transpose.h"
 #include "gsl/gsl"
 
 namespace onnxruntime {
@@ -58,6 +59,14 @@ ONNX_OPERATOR_KERNEL_EX(
     KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
     SoftmaxGrad<float>);
 
+ONNX_OPERATOR_KERNEL_EX(
+    SoftmaxGrad_13,
+    kMSDomain,
+    1,
+    kCpuExecutionProvider,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    SoftmaxGrad<float>);
+
 template <typename T>
 Status SoftmaxGrad<T>::Compute(OpKernelContext* context) const {
   auto& dY = *context->Input<Tensor>(0);
@@ -65,7 +74,8 @@ Status SoftmaxGrad<T>::Compute(OpKernelContext* context) const {
   const TensorShape input_shape{Y.Shape()};
   auto& dX = *context->Output(0, Y.Shape());
 
-  auto axis = HandleNegativeAxis(axis_, Y.Shape().NumDimensions());
+  size_t rank = input_shape.NumDimensions();
+  const size_t axis = static_cast<size_t>(HandleNegativeAxis(axis_, rank));
 
   size_t N = input_shape.SizeToDimension(axis);
   size_t D = input_shape.SizeFromDimension(axis);
@@ -74,30 +84,88 @@ Status SoftmaxGrad<T>::Compute(OpKernelContext* context) const {
     return Status::OK();
   }
 
-  std::vector<float> scale_(N);
-  std::vector<float> sum_multiplier_(D, 1.f);  // initialize all multiplier values to 1.0
+  bool is_transpose_required = opset_ >= 13 && axis != (rank - 1);
+
+  std::unique_ptr<Tensor> transposed_dY;
+  std::unique_ptr<Tensor> transposed_Y;
+  std::vector<int64_t> transposed_input_dims;
+  std::unique_ptr<Tensor> intermediate_output;  // output that the softmax implementation will write into while using transposed input
+  std::vector<size_t> permutation(rank);
+
+  if (is_transpose_required) {
+    AllocatorPtr alloc;
+    auto status = context->GetTempSpaceAllocator(&alloc);
+    if (!status.IsOK())
+      return status;
+
+    std::iota(std::begin(permutation), std::end(permutation), 0);
+
+    // swap the innermost dim with the dim corresponding to axis
+    permutation[axis] = rank - 1;
+    permutation[rank - 1] = axis;
+
+    transposed_input_dims.reserve(rank);
+    for (auto e : permutation) {
+      transposed_input_dims.push_back(input_shape[e]);
+    }
+    N = TensorShape(transposed_input_dims).SizeToDimension(rank - 1);
+    D = TensorShape(transposed_input_dims).SizeFromDimension(rank - 1);
+
+    // Allocate a temporary tensor to hold transposed input
+    auto temp_input0 = Tensor::Create(Y.DataType(), TensorShape(transposed_input_dims), alloc);
+
+    // Perform the transpose
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, Y, *temp_input0));
+    transposed_Y = std::move(temp_input0);
+
+    auto temp_input1 = Tensor::Create(Y.DataType(), TensorShape(transposed_input_dims), alloc);
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, dY, *temp_input1));
+    transposed_dY = std::move(temp_input1);
+
+    // Allocate memory for the intermediate output
+    intermediate_output = Tensor::Create(dX.DataType(), TensorShape(transposed_input_dims), alloc);
+  }
+
   const int n = gsl::narrow_cast<int>(N);
   const int d = gsl::narrow_cast<int>(D);
   const int nd = gsl::narrow_cast<int>(N * D);
-
-  float* scaledata = scale_.data();
-  const float* Ydata = Y.template Data<float>();
-  const float* dYdata = dY.template Data<float>();
-  float* dXdata = dX.template MutableData<float>();
+  const float* Ydata = is_transpose_required ? transposed_Y->template Data<T>() : Y.template Data<float>();
+  const float* dYdata = is_transpose_required ? transposed_dY->template Data<T>() : dY.template Data<float>();
+  float* dXdata = is_transpose_required ? intermediate_output->template MutableData<T>() : dX.template MutableData<float>();
 
   gsl::copy(gsl::make_span(dYdata, nd), gsl::make_span(dXdata, nd));
+  if (is_logsoftmaxgrad_) {
+    std::vector<float> eY(nd);
+    float* eYdata = eY.data();
 
-  for (size_t i = 0; i < N; ++i) {
-    math::Dot<float, CPUMathUtil>(d, Ydata + i * d, dYdata + i * d,
-                                  scaledata + i, nullptr);
+    // dX_ai = d(log Y_ai) - [sum_j d(log Y_aj)] exp(log Y_ai)
+    gsl::copy(gsl::make_span(dYdata, nd), gsl::make_span(dXdata, nd));
+    math::Exp<float, CPUMathUtil>(nd, Ydata, eYdata, nullptr);
+    for (size_t i = 0; i < N; ++i) {
+      float sdY;
+      math::Sum<float, CPUMathUtil>(d, dYdata + i * d, &sdY, nullptr, nullptr);
+      math::Axpy<float, CPUMathUtil>(d, -sdY, eYdata + i * d, dXdata + i * d, nullptr);
+    }
+  } else {
+    std::vector<float> scale_(N);
+    std::vector<float> sum_multiplier_(D, 1.f);  // initialize all multiplier values to 1.0
+    float* scaledata = scale_.data();
+    for (size_t i = 0; i < N; ++i) {
+      math::Dot<float, CPUMathUtil>(d, Ydata + i * d, dYdata + i * d,
+                                    scaledata + i, nullptr);
+    }
+
+    concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
+    math::Gemm<float>(CblasNoTrans, CblasNoTrans, n, d, 1, -1,
+                      scaledata, sum_multiplier_.data(), 1,
+                      dXdata, tp);
+
+    math::Mul<float, CPUMathUtil>(gsl::narrow_cast<int>(Y.Shape().Size()), dXdata, Ydata, dXdata, nullptr);
+  }
+  if (is_transpose_required) {
+    // Perform the transpose to get the axes back to the original ordering
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, *intermediate_output, dX));
   }
-
-  concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
-  math::Gemm<float>(CblasNoTrans, CblasNoTrans, n, d, 1, -1,
-                    scaledata, sum_multiplier_.data(), 1,
-                    dXdata, tp);
-
-  math::Mul<float, CPUMathUtil>(gsl::narrow_cast<int>(Y.Shape().Size()), dXdata, Ydata, dXdata, nullptr);
 
   return Status::OK();
 }
@@ -108,45 +176,15 @@ ONNX_OPERATOR_KERNEL_EX(
     1,
     kCpuExecutionProvider,
     KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
-    LogSoftmaxGrad<float>);
+    SoftmaxGrad<float>);
 
-template <typename T>
-Status LogSoftmaxGrad<T>::Compute(OpKernelContext* context) const {
-  auto& dY = *context->Input<Tensor>(0);
-  auto& Y = *context->Input<Tensor>(1);
-  const TensorShape input_shape{Y.Shape()};
-  auto& dX = *context->Output(0, Y.Shape());
-
-  auto axis = HandleNegativeAxis(axis_, Y.Shape().NumDimensions());
-
-  size_t N = input_shape.SizeToDimension(axis);
-  size_t D = input_shape.SizeFromDimension(axis);
-
-  if (N == 0) {
-    return Status::OK();
-  }
-
-  const int d = gsl::narrow_cast<int>(D);
-  const int nd = gsl::narrow_cast<int>(N * D);
-
-  const float* Ydata = Y.template Data<float>();
-  const float* dYdata = dY.template Data<float>();
-  float* dXdata = dX.template MutableData<float>();
-
-  std::vector<float> eY(nd);
-  float* eYdata = eY.data();
-
-  // dX_ai = d(log Y_ai) - [sum_j d(log Y_aj)] exp(log Y_ai)
-  gsl::copy(gsl::make_span(dYdata, nd), gsl::make_span(dXdata, nd));
-  math::Exp<float, CPUMathUtil>(nd, Ydata, eYdata, nullptr);
-  for (size_t i = 0; i < N; ++i) {
-    float sdY;
-    math::Sum<float, CPUMathUtil>(d, dYdata + i * d, &sdY, nullptr, nullptr);
-    math::Axpy<float, CPUMathUtil>(d, -sdY, eYdata + i * d, dXdata + i * d, nullptr);
-  }
-
-  return Status::OK();
-}
+ONNX_OPERATOR_KERNEL_EX(
+    LogSoftmaxGrad_13,
+    kMSDomain,
+    1,
+    kCpuExecutionProvider,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    SoftmaxGrad<float>);
 
 ONNX_OPERATOR_KERNEL_EX(
     SigmoidGrad,
diff --git a/orttraining/orttraining/training_ops/cpu/op_gradients.h b/orttraining/orttraining/training_ops/cpu/op_gradients.h
index 80081c15c4..4e519a7622 100644
--- a/orttraining/orttraining/training_ops/cpu/op_gradients.h
+++ b/orttraining/orttraining/training_ops/cpu/op_gradients.h
@@ -61,7 +61,10 @@ template <typename T>
 class SoftmaxGrad final : public OpKernel {
  public:
   explicit SoftmaxGrad(const OpKernelInfo& info) : OpKernel(info) {
-    axis_ = info.GetAttrOrDefault<int64_t>("axis", 0);
+    const auto& op_type = info.node().OpType();  // one kernel class serves all four gradient op types
+    is_logsoftmaxgrad_ = op_type == "LogSoftmaxGrad" || op_type == "LogSoftmaxGrad_13";
+    opset_ = (op_type == "SoftmaxGrad_13" || op_type == "LogSoftmaxGrad_13") ? 13 : 1;  // "_13" types map to 13
+    axis_ = info.GetAttrOrDefault("axis", static_cast<int64_t>(opset_ < 13 ? 1 : -1));  // default: 1 below 13, else -1
   }
 
   Status Compute(OpKernelContext* context) const override;
@@ -69,20 +72,8 @@ class SoftmaxGrad final : public OpKernel {
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SoftmaxGrad);
   int64_t axis_;
-};
-
-template <typename T>
-class LogSoftmaxGrad final : public OpKernel {
- public:
-  explicit LogSoftmaxGrad(const OpKernelInfo& info) : OpKernel(info) {
-    axis_ = info.GetAttrOrDefault<int64_t>("axis", 0);
-  }
-
-  Status Compute(OpKernelContext* context) const override;
-
- private:
-  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(LogSoftmaxGrad);
-  int64_t axis_;
+  int opset_;  // opset of the forward Softmax/LogSoftmax operator (13 for the "_13" op types, otherwise 1)
+  bool is_logsoftmaxgrad_;  // true when the node is LogSoftmaxGrad or LogSoftmaxGrad_13
 };
 
 }  // namespace contrib
diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
index 394b8ad7c2..8d0882f2ee 100644
--- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
+++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
@@ -69,6 +69,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, LogSoftmaxGrad);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, LogSoftmaxGrad);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, LogSoftmaxGrad);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, SoftmaxGrad_13);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, SoftmaxGrad_13);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, SoftmaxGrad_13);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, LogSoftmaxGrad_13);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, LogSoftmaxGrad_13);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, LogSoftmaxGrad_13);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float_float_float, BatchNormalizationGrad);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double_double_double, BatchNormalizationGrad);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16_MLFloat16_MLFloat16, BatchNormalizationGrad);
@@ -187,6 +193,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16_float, InPlaceAccumulator);
 
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, SoftmaxGrad);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, SoftmaxGrad_13);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, MixedPrecisionScale);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16_float, LayerNormalizationGrad);
 
@@ -281,6 +288,13 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, LogSoftmaxGrad)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, LogSoftmaxGrad)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, LogSoftmaxGrad)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, SoftmaxGrad_13)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, SoftmaxGrad_13)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, SoftmaxGrad_13)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, LogSoftmaxGrad_13)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, LogSoftmaxGrad_13)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, LogSoftmaxGrad_13)>,
+
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, MLFloat16, int64_t, SoftmaxCrossEntropyLoss)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, 12, float, int64_t, SoftmaxCrossEntropyLoss)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 13, MLFloat16, int64_t, SoftmaxCrossEntropyLoss)>,
@@ -397,6 +411,7 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16_float, InPlaceAccumulator)>,
 
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, SoftmaxGrad)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, SoftmaxGrad_13)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16, MixedPrecisionScale)>,
     BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BFloat16_float, LayerNormalizationGrad)>,
 
diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc
index 9e00f9ede1..7ce8f0184a 100644
--- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.cc
@@ -7,6 +7,7 @@
 #include "core/providers/cuda/cudnn_common.h"
 #include "core/providers/cuda/math/softmax.h"
 #include "core/providers/cuda/shared_inc/accumulation_type.h"
+#include "core/providers/cuda/tensor/transpose.h"
 
 namespace onnxruntime {
 namespace cuda {
@@ -98,6 +99,16 @@ SPECIALIZED_SOFTMAXGRAD_HELPER_IMPL_BFloat16(true)
       kCudaExecutionProvider,                                                              \
       (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
       SoftmaxGrad<T>);                                                                     \
+                                                                                           \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                           \
+      SoftmaxGrad_13,                                                                      \
+      kMSDomain,                                                                           \
+      1,                                                                                   \
+      T,                                                                                   \
+      kCudaExecutionProvider,                                                              \
+      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      SoftmaxGrad<T>);                                                                     \
+                                                                                           \
   ONNX_OPERATOR_TYPED_KERNEL_EX(                                                           \
       LogSoftmaxGrad,                                                                      \
       kMSDomain,                                                                           \
@@ -105,6 +116,15 @@ SPECIALIZED_SOFTMAXGRAD_HELPER_IMPL_BFloat16(true)
       T,                                                                                   \
       kCudaExecutionProvider,                                                              \
       (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      SoftmaxGrad<T>);                                                                     \
+                                                                                           \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                           \
+      LogSoftmaxGrad_13,                                                                   \
+      kMSDomain,                                                                           \
+      1,                                                                                   \
+      T,                                                                                   \
+      kCudaExecutionProvider,                                                              \
+      (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
       SoftmaxGrad<T>);
 
         template <typename T>
@@ -113,16 +133,75 @@ SPECIALIZED_SOFTMAXGRAD_HELPER_IMPL_BFloat16(true)
   const TensorShape& input_shape{dY->Shape()};
   const Tensor* Y = ctx->Input<Tensor>(1);
   Tensor* dX = ctx->Output(0, input_shape);
+  size_t rank = input_shape.NumDimensions();
+  const size_t axis = static_cast<size_t>(HandleNegativeAxis(axis_, rank));
+  bool is_transpose_required = opset_ >= 13 && axis != (rank - 1);
 
-  const T* dY_data = dY->template Data<T>();
-  const T* Y_data = Y->template Data<T>();
-  T* dX_data = dX->template MutableData<T>();
+  std::unique_ptr<Tensor> transposed_dY;
+  std::unique_ptr<Tensor> transposed_Y;
+  std::vector<int64_t> transposed_input_dims;
+  std::unique_ptr<Tensor> intermediate_output;  // output that the softmax implementation will write into while using transposed input
+  std::vector<size_t> permutation(rank);
 
-  if (log_softmax_) {
-    return SoftMaxGradComputeHelper<T, true>(Stream(), dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_);
-  } else {
-    return SoftMaxGradComputeHelper<T, false>(Stream(), dY_data, input_shape, Y_data, dX_data, CudnnHandle(), axis_);
+  if (is_transpose_required) {
+    AllocatorPtr alloc;
+    auto status = ctx->GetTempSpaceAllocator(&alloc);
+    if (!status.IsOK())
+      return status;
+
+    std::iota(std::begin(permutation), std::end(permutation), 0);
+
+    // swap the innermost dim with the dim corresponding to axis
+    permutation[axis] = rank - 1;
+    permutation[rank - 1] = axis;
+
+    transposed_input_dims.reserve(rank);
+    for (auto e : permutation) {
+      transposed_input_dims.push_back(input_shape[e]);
+    }
+
+    // Allocate a temporary tensor to hold transposed input
+    auto temp_input0 = Tensor::Create(Y->DataType(), TensorShape(transposed_input_dims), alloc);
+
+    // Perform the transpose
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(prop_,
+                                               Stream(),
+                                               CublasHandle(),
+                                               permutation, *Y, *temp_input0));
+    transposed_Y = std::move(temp_input0);
+    auto temp_input1 = Tensor::Create(Y->DataType(), TensorShape(transposed_input_dims), alloc);
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(prop_,
+                                               Stream(),
+                                               CublasHandle(),
+                                               permutation, *dY, *temp_input1));
+    transposed_dY = std::move(temp_input1);
+
+    // Allocate memory for the intermediate output
+    intermediate_output = Tensor::Create(dX->DataType(), TensorShape(transposed_input_dims), alloc);
   }
+  const T* dY_data = is_transpose_required ? transposed_dY->template Data<T>() : dY->template Data<T>();
+  const T* Y_data = is_transpose_required ? transposed_Y->template Data<T>() : Y->template Data<T>();
+  T* dX_data = is_transpose_required ? intermediate_output->template MutableData<T>() : dX->template MutableData<T>();
+  const TensorShape* compute_input_shape = is_transpose_required ? &transposed_Y->Shape() : &input_shape;
+  Status status;
+  if (log_softmax_) {
+    status = SoftMaxGradComputeHelper<T, true>(Stream(), dY_data, *compute_input_shape, Y_data, dX_data, CudnnHandle(), is_transpose_required ? static_cast<int64_t>(rank) - 1 : axis);
+  } else {
+    status = SoftMaxGradComputeHelper<T, false>(Stream(), dY_data, *compute_input_shape, Y_data, dX_data, CudnnHandle(), is_transpose_required ? static_cast<int64_t>(rank) - 1 : axis);
+  }
+
+  if (!status.IsOK()) {
+    return status;
+  }
+
+  if (is_transpose_required) {
+    // Perform the transpose to get the axes back to the original ordering
+    ORT_RETURN_IF_ERROR(Transpose::DoTranspose(prop_,
+                                               Stream(),
+                                               CublasHandle(),
+                                               permutation, *intermediate_output, *dX));
+  }
+  return Status::OK();
 }
 
 #define SPECIALIZED_GRADIENT(T)     \
diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h
index 4e50cf2cf4..21543ca9a2 100644
--- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h
+++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad.h
@@ -14,9 +14,12 @@ void dispatch_softmax_backward(cudaStream_t stream, output_t* grad_input, const
 template <typename T>
 class SoftmaxGrad final : public CudaKernel {
  public:
-  SoftmaxGrad(const OpKernelInfo& info) : CudaKernel{info} {
-    info.GetAttrOrDefault("axis", &axis_, static_cast<int64_t>(1));
-    log_softmax_ = info.GetKernelDef().OpName() == "LogSoftmaxGrad";
+  explicit SoftmaxGrad(const OpKernelInfo& info)  // explicit, matching the CPU SoftmaxGrad kernel
+      : CudaKernel{info}, prop_(static_cast<const CUDAExecutionProvider*>(info.GetExecutionProvider())->GetDeviceProp()) {
+    const auto& node = info.node();
+    opset_ = (node.OpType() == "SoftmaxGrad_13" || node.OpType() == "LogSoftmaxGrad_13") ? 13 : 1;  // "_13" types map to 13
+    axis_ = info.GetAttrOrDefault("axis", static_cast<int64_t>(opset_ < 13 ? 1 : -1));  // default: 1 below 13, else -1
+    log_softmax_ = info.GetKernelDef().OpName() == "LogSoftmaxGrad" || info.GetKernelDef().OpName() == "LogSoftmaxGrad_13";
   }
 
   Status ComputeInternal(OpKernelContext* context) const override;
@@ -24,6 +27,8 @@ class SoftmaxGrad final : public CudaKernel {
  private:
   int64_t axis_;
   bool log_softmax_;
+  int opset_;  // opset of the forward Softmax/LogSoftmax operator (13 for the "_13" op types, otherwise 1)
+  const cudaDeviceProp& prop_;  // device properties obtained from the CUDA execution provider in the constructor
 };
 
 }  // namespace cuda
diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc
index ccee072b0c..1d64f14d27 100644
--- a/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc
+++ b/orttraining/orttraining/training_ops/cuda/nn/conv_grad.cc
@@ -222,8 +222,8 @@ class AlgoIterator {
 
   Status TryAll(const CUDAExecutionProvider* provider, std::function<Status(const T_Perf& perf)> f) {
     auto& cache = AlgoSearch<T_Perf>::Cache();
-    T_Perf algo_perf;
-    if (cache.Find(args_.params, &algo_perf) && f(algo_perf) == Status::OK()) {
+    
+    if (T_Perf algo_perf; cache.Find(args_.params, &algo_perf) && f(algo_perf) == Status::OK()) {
       return Status::OK();
     }
 
diff --git a/orttraining/tools/ci_test/download_azure_blob.py b/orttraining/tools/ci_test/download_azure_blob.py
deleted file mode 100755
index 564dcc8007..0000000000
--- a/orttraining/tools/ci_test/download_azure_blob.py
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License.
-
-import argparse
-import hashlib
-import os
-import shutil
-import subprocess
-import sys
-import tempfile
-import urllib.request
-import zipfile
-
-SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
-REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..", ".."))
-
-sys.path.append(os.path.join(REPO_DIR, "tools", "python"))
-
-from util import get_azcopy  # noqa: E402
-
-def _download(azcopy_path, url, local_path):
-  subprocess.run([azcopy_path, "cp", "--log-level", "NONE", url, local_path], check=True)
-
-def _get_sha256_digest(file_path):
-  alg = hashlib.sha256()
-  read_bytes_length = 8192
-
-  with open(file_path, mode="rb") as archive:
-    while True:
-      read_bytes = archive.read(read_bytes_length)
-      if len(read_bytes) == 0: break
-      alg.update(read_bytes)
-
-  return alg.hexdigest()
-
-def _check_file_sha256_digest(path, expected_digest):
-  actual_digest = _get_sha256_digest(path)
-  match = actual_digest.lower() == expected_digest.lower()
-  if not match:
-    raise RuntimeError(
-        "SHA256 digest mismatch, expected: {}, actual: {}".format(
-            expected_digest.lower(), actual_digest.lower()))
-
-def main():
-  parser = argparse.ArgumentParser(
-      description="Downloads an Azure blob archive.")
-  parser.add_argument("--azure_blob_url", required=True,
-                      help="The Azure blob URL.")
-  parser.add_argument("--target_dir", required=True,
-                      help="The destination directory.")
-  parser.add_argument("--archive_sha256_digest",
-                      help="The SHA256 digest of the archive. Verified if provided.")
-  args = parser.parse_args()
-
-  with tempfile.TemporaryDirectory() as temp_dir, get_azcopy() as azcopy_path:
-    archive_path = os.path.join(temp_dir, "archive.zip")
-    print("Downloading archive from '{}'...".format(args.azure_blob_url))
-    _download(azcopy_path, args.azure_blob_url, archive_path)
-    if args.archive_sha256_digest:
-      _check_file_sha256_digest(archive_path, args.archive_sha256_digest)
-    print("Extracting to '{}'...".format(args.target_dir))
-    shutil.unpack_archive(archive_path, args.target_dir)
-    print("Done.")
-
-if __name__ == "__main__":
-  sys.exit(main())
diff --git a/orttraining/tools/scripts/performance_investigation.py b/orttraining/tools/scripts/performance_investigation.py
new file mode 100644
index 0000000000..b8fb648c18
--- /dev/null
+++ b/orttraining/tools/scripts/performance_investigation.py
@@ -0,0 +1,88 @@
+"""Report ops in an ONNX graph that are worth a look during a performance investigation.
+
+Usage: python performance_investigation.py <onnx_file>
+"""
+import argparse
+import onnx
+
+
+# Stand-alone ops that may be fusable with a leading Add, mapped to the name of the fused op.
+_ADD_FUSION_TARGETS = {"Dropout": "BiasDropout", "Softmax": "BiasSoftmax"}
+
+
+def _print_section(title, lines):
+    """Print title, each entry of lines and a separator; print nothing when lines is empty."""
+    if not lines:
+        return
+    print(title)
+    for line in lines:
+        print(line)
+    print(10 * '-')
+
+
+def _name_attributes(node):
+    """Return the UTF-8 decoded string value of every attribute of node that is called "name"."""
+    return [attr.s.decode('utf-8') for attr in node.attribute if attr.name == "name"]
+
+
+def process_file(onnx_file):
+    """Load the model stored at onnx_file and print the findings to stdout.
+
+    Listed are ATenOp and PythonOp nodes (with their "name" attribute), nodes whose op type
+    contains "Memcpy", Cast nodes, and single-input Dropout/Softmax nodes whose input is
+    produced by an Add node (candidates for BiasDropout/BiasSoftmax fusion).
+    """
+    model = onnx.load(onnx_file)
+
+    # Map from output arg to the producer of the output.
+    output_to_node = {}
+    for node in model.graph.node:
+        for o in node.output:
+            output_to_node[o] = node
+
+    aten_ops = []
+    python_ops = []
+    memcpy_ops = []
+    cast_ops = []
+    msgs = []
+
+    for node in model.graph.node:
+        if "Memcpy" in node.op_type:
+            memcpy_ops.append(f"{node.op_type} {node.name}")
+        if node.op_type == "Cast":
+            cast_ops.append(f"{node.name}")
+        if node.op_type == "ATenOp":
+            aten_ops.extend(f"{node.name}: {name}" for name in _name_attributes(node))
+        if node.op_type == "PythonOp":
+            python_ops.extend(f"{node.name}: {name}" for name in _name_attributes(node))
+
+        # Look for stand-alone Dropout/Softmax nodes in *_execution_model_<mode>.onnx graph.
+        # Examine whether they should be fused with the leading Add op into a BiasDropout/BiasSoftmax node.
+        fused_op = _ADD_FUSION_TARGETS.get(node.op_type)
+        if fused_op is not None and len(node.input) == 1:
+            # Graph inputs and initializers are not produced by any node, so look the producer up
+            # with .get(); a plain index would raise KeyError for them.
+            prev = output_to_node.get(node.input[0])
+            if prev is not None and prev.op_type == "Add":
+                msgs.append(f"Examine whether {node.name} should be fused with the leading {prev.name} op "
+                            f"into {fused_op} node.")
+
+    _print_section("ATenOp found:", aten_ops)
+    _print_section("PythonOp found:", python_ops)
+    _print_section("Memcpy ops found:", memcpy_ops)
+    _print_section("Cast ops found:", cast_ops)
+
+    for line in msgs:
+        print(line)
+
+
+def main():
+    """Parse the command line and analyze the given ONNX file."""
+    parser = argparse.ArgumentParser(description='ONNX file analyzer for performance investigation.')
+    parser.add_argument('onnx_file', type=str, help='ONNX file to analyze')
+    args = parser.parse_args()
+    process_file(args.onnx_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/packages.config b/packages.config
index 7a1e950b1e..4ca9b96802 100644
--- a/packages.config
+++ b/packages.config
@@ -1,6 +1,6 @@
 ﻿<?xml version="1.0" encoding="utf-8"?>
 <packages>
   <package id="GoogleTestAdapter" version="0.17.1" targetFramework="net46" />
-  <package id="Microsoft.AI.DirectML" version="1.5.1" targetFramework="native" />
+  <package id="Microsoft.AI.DirectML" version="1.8.0" targetFramework="native" />
   <package id="Microsoft.Windows.CppWinRT" version="2.0.201113.7" targetFramework="native" />
 </packages>
diff --git a/server/serializing/tensorprotoutils.cc b/server/serializing/tensorprotoutils.cc
index ee6d6abf7f..145e483d2c 100644
--- a/server/serializing/tensorprotoutils.cc
+++ b/server/serializing/tensorprotoutils.cc
@@ -47,16 +47,16 @@ inline std::string MakeString(const char* p_str) {
 
 
 namespace server {
-#ifdef __GNUC__
-constexpr inline bool IsLittleEndianOrder() noexcept { return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__; }
+constexpr bool IsLittleEndianOrder() noexcept {  // compile-time query: is the build target little-endian?
+#if defined(_WIN32)
+  return true;  // Windows builds are taken to be little-endian; no byte-order macro is consulted
+#elif defined(__GNUC__) || defined(__clang__)
+  return __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__;  // decided by the compiler's predefined byte-order macros
 #else
-// On Windows and Mac, this function should always return true
-GSL_SUPPRESS(type .1)  // allow use of reinterpret_cast for this special case
-inline bool IsLittleEndianOrder() noexcept {
-  static int n = 1;
-  return (*reinterpret_cast<char*>(&n) == 1);
-}
+#error server::IsLittleEndianOrder() is not implemented in this environment.
 #endif
+}
+
 std::vector<int64_t> GetTensorShapeFromTensorProto(const onnx::TensorProto& tensor_proto) {
   const auto& dims = tensor_proto.dims();
   std::vector<int64_t> tensor_shape_vec(static_cast<size_t>(dims.size()));
@@ -107,7 +107,7 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length
       throw Ort::Exception(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ",
                                       expected_size_in_bytes, ", got ", raw_data_length),
                            OrtErrorCode::ORT_FAIL);
-    if (IsLittleEndianOrder()) {
+    if constexpr (IsLittleEndianOrder()) {
       memcpy(p_data, raw_data, raw_data_length);
     } else {
       const size_t type_size = sizeof(T);
diff --git a/tools/ci_build/amd_hipify.py b/tools/ci_build/amd_hipify.py
index 8a388ae927..c1a227dc92 100644
--- a/tools/ci_build/amd_hipify.py
+++ b/tools/ci_build/amd_hipify.py
@@ -18,7 +18,9 @@ contrib_ops_excluded_files = [
                     'bert/attention_impl.cu',
                     'bert/attention_impl.h',
                     'bert/attention_transpose.cu',
-                    'bert/attention_past.cu',
+                    'bert/attention_concat.cu',
+                    'bert/decoder_attention.h',
+                    'bert/decoder_attention.cc',
                     'bert/embed_layer_norm.cc',
                     'bert/embed_layer_norm.h',
                     'bert/embed_layer_norm_impl.cu',
@@ -33,6 +35,7 @@ contrib_ops_excluded_files = [
                     'bert/longformer_attention_impl.h',
                     'bert/longformer_global_impl.cu',
                     'bert/longformer_global_impl.h',
+                    'bert/transformer_cuda_common.h',
                     'math/bias_softmax.cc',
                     'math/bias_softmax.h',
                     'math/bias_softmax_impl.cu',
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 6797638b08..35a95c2c90 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -567,6 +567,10 @@ def parse_arguments():
         "--test_external_transformer_example", action='store_true',
         help="run the example external transformer test, mainly used in CI pipeline.")
 
+    parser.add_argument(
+        "--enable_cuda_profiling", action='store_true', help="enable cuda kernel profiling, \
+        cupti library must be added to PATH beforehand.")
+
     return parser.parse_args()
 
 
@@ -817,6 +821,7 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
         "-Donnxruntime_ENABLE_EXTERNAL_CUSTOM_OP_SCHEMAS=" + ("ON" if args.enable_external_custom_op_schemas
                                                               else "OFF"),
         "-Donnxruntime_NVCC_THREADS=" + str(args.parallel),
+        "-Donnxruntime_ENABLE_CUDA_PROFILING=" + ("ON" if args.enable_cuda_profiling else "OFF"),
     ]
     if args.external_graph_transformer_path:
         cmake_args.append("-Donnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH=" + args.external_graph_transformer_path)
@@ -1759,6 +1764,11 @@ def build_nuget_package(source_dir, build_dir, configs, use_cuda, use_openvino,
     csharp_build_dir = os.path.join(source_dir, 'csharp')
     is_linux_build = derive_linux_build_property()
 
+    # in most cases we don't want/need to include the Xamarin mobile targets, as doing so means the Xamarin
+    # mobile workloads must be installed on the machine.
+    # they are only included in the Microsoft.ML.OnnxRuntime nuget package
+    sln = "OnnxRuntime.DesktopOnly.CSharp.sln"
+
     # derive package name and execution provider based on the build args
     target_name = "/t:CreatePackage"
     execution_provider = "/p:ExecutionProvider=\"None\""
@@ -1780,14 +1790,15 @@ def build_nuget_package(source_dir, build_dir, configs, use_cuda, use_openvino,
     elif use_nuphar:
         package_name = "/p:OrtPackageId=\"Microsoft.ML.OnnxRuntime.Nuphar\""
     else:
-        pass
+        # use the solution file that includes Xamarin mobile targets
+        sln = "OnnxRuntime.CSharp.sln"
 
     # set build directory based on build_dir arg
     native_dir = os.path.normpath(os.path.join(source_dir, build_dir))
     ort_build_dir = "/p:OnnxRuntimeBuildDirectory=\"" + native_dir + "\""
 
     # dotnet restore
-    cmd_args = ["dotnet", "restore", "OnnxRuntime.CSharp.sln", "--configfile", "Nuget.CSharp.config"]
+    cmd_args = ["dotnet", "restore", sln, "--configfile", "Nuget.CSharp.config"]
     run_subprocess(cmd_args, cwd=csharp_build_dir)
 
     # build csharp bindings and create nuget package for each config
@@ -1800,8 +1811,7 @@ def build_nuget_package(source_dir, build_dir, configs, use_cuda, use_openvino,
         configuration = "/p:Configuration=\"" + config + "\""
 
         if not use_winml:
-            cmd_args = ["dotnet", "msbuild", "OnnxRuntime.CSharp.sln", configuration, package_name, is_linux_build,
-                        ort_build_dir]
+            cmd_args = ["dotnet", "msbuild", sln, configuration, package_name, is_linux_build, ort_build_dir]
             run_subprocess(cmd_args, cwd=csharp_build_dir)
         else:
             winml_interop_dir = os.path.join(source_dir, "csharp", "src", "Microsoft.AI.MachineLearning.Interop")
@@ -1811,7 +1821,13 @@ def build_nuget_package(source_dir, build_dir, configs, use_cuda, use_openvino,
                         ort_build_dir, "-restore"]
             run_subprocess(cmd_args, cwd=csharp_build_dir)
 
-        nuget_exe = os.path.normpath(os.path.join(native_dir, config, "nuget_exe", "src", "nuget.exe"))
+        if is_windows():
+            # this path is setup by cmake/nuget_helpers.cmake for MSVC on Windows
+            nuget_exe = os.path.normpath(os.path.join(native_dir, config, "nuget_exe", "src", "nuget.exe"))
+        else:
+            # user needs to make sure nuget is installed and can be found
+            nuget_exe = "nuget"
+
         nuget_exe_arg = "/p:NugetExe=\"" + nuget_exe + "\""
 
         cmd_args = [
diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
index f93b076004..283bb70f67 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -444,11 +444,14 @@ jobs:
     inputs:
       versionSpec: 5.7.0
 
-  - task: NuGetCommand@2
-    displayName: Restore NuGet Packages
+  - task: MSBuild@1
+    displayName: 'Restore NuGet Packages'
     inputs:
-      command: "restore"
-      restoreSolution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      platform: 'Any CPU'
+      configuration: RelWithDebInfo
+      msbuildArguments: '-t:restore -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"'
+      workingDirectory: '$(Build.SourcesDirectory)\csharp'
 
   - task: MSBuild@1
     displayName: 'Build C#'
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 0a4c360f5c..59cdd99fc7 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -38,6 +38,7 @@ jobs:
               --parallel \
               --build_wheel \
               --enable_onnx_tests --use_cuda --cuda_version=11.4 --cuda_home=/usr/local/cuda-11.4 --cudnn_home=/usr/local/cuda-11.4 \
+              --enable_cuda_profiling \
               --enable_pybind --build_java \
               --cmake_extra_defines CMAKE_CUDA_HOST_COMPILER=/opt/rh/devtoolset-10/root/usr/bin/cc  CMAKE_CUDA_ARCHITECTURES=52
       workingDirectory: $(Build.SourcesDirectory)
diff --git a/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml b/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml
index 2c2701d8da..0400d2d035 100644
--- a/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml
+++ b/tools/ci_build/github/azure-pipelines/nodejs/templates/test.yml
@@ -22,7 +22,7 @@ steps:
   displayName: 'Extract package file name (POSIX)'
   inputs:
     script: |
-      echo "##vso[task.setvariable variable=NpmPackageFilesForTest;]`ls $(Build.BinariesDirectory)/nodejs-artifact/*.tgz`"
+      echo "##vso[task.setvariable variable=NpmPackageFilesForTest;]`ls $(Build.BinariesDirectory)/nodejs-artifact/*.tgz | tr '\n' ' '`"
     workingDirectory: '$(Build.BinariesDirectory)/e2e_test'
 
 - script: |
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
index 03b5dbac76..32a0552932 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
@@ -464,11 +464,14 @@ jobs:
     inputs:
       versionSpec: 5.7.0
 
-  - task: NuGetCommand@2
-    displayName: Restore NuGet Packages
+  - task: MSBuild@1
+    displayName: 'Restore NuGet Packages'
     inputs:
-      command: "restore"
-      restoreSolution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      platform: 'Any CPU'
+      configuration: RelWithDebInfo
+      msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)'
+      workingDirectory: '$(Build.SourcesDirectory)\csharp'
 
   - task: MSBuild@1
     displayName: 'Build C#'
@@ -897,30 +900,6 @@ jobs:
         python.exe $(Build.SourcesDirectory)\tools\ci_build\github\windows\post_binary_sizes_to_dashboard.py --commit_hash=$(Build.SourceVersion) --size_data_file=binary_size_data.txt --build_project=Lotus --build_id=$(Build.BuildId)
       workingDirectory: '$(Build.BinariesDirectory)'
 
-  - task: AzureFileCopy@3
-    displayName: 'Copy Signed Native NuGet Package to Blob Store'
-    condition: ne(variables['IsReleaseBuild'], 'true') # release build has a different package naming scheme
-    inputs:
-      sourcePath: '$(Build.BinariesDirectory)/nuget-artifact/final-package/${{ parameters.OrtNugetPackageId }}.$(NuGetPackageVersionNumber).nupkg'
-      azureSubscription: 'AIInfraBuildOnnxRuntimeOSS'
-      destination: azureBlob
-      storage: ortpackages
-      containerName: ortpackages
-      blobPrefix: '$(CurrentDate)/'
-    continueOnError: true
-
-  - task: AzureFileCopy@3
-    displayName: 'Copy Signed Managed NuGet Package to Blob Store'
-    condition: ne(variables['IsReleaseBuild'], 'true') # release build has a different package naming scheme
-    inputs:
-      sourcePath: '$(Build.BinariesDirectory)/nuget-artifact/final-package/Microsoft.ML.OnnxRuntime.Managed.$(NuGetPackageVersionNumber).nupkg'
-      azureSubscription: 'AIInfraBuildOnnxRuntimeOSS'
-      destination: azureBlob
-      storage: ortpackages
-      containerName: ortpackages
-      blobPrefix: '$(CurrentDate)/'
-    continueOnError: true
-
   - task: DotNetCoreCLI@2
     displayName: 'Copy Signed Native NuGet Package to Internal NuGet Feed'
     condition: ne(variables['IsReleaseBuild'], 'true') # release build has a different package naming scheme
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml
index 0b35f48249..04e21bcd22 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-ci-2019.yml
@@ -136,11 +136,14 @@ jobs:
           versionSpec: 5.7.0
 
     - ${{ if eq(parameters.BuildCSharp, true) }}:
-      - task: NuGetCommand@2
-        displayName: Restore NuGet Packages
+      - task: MSBuild@1
+        displayName: 'Restore NuGet Packages'
         inputs:
-          command: "restore"
-          restoreSolution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+          solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+          platform: 'Any CPU'
+          configuration: '$(BuildConfig)'
+          msbuildArguments: '-t:restore -p:OrtPackageId=${{ parameters.OrtPackageId }}'
+          workingDirectory: '$(Build.SourcesDirectory)\csharp'
 
     - ${{ if eq(parameters.BuildCSharp, true) }}:
       - task: MSBuild@1
@@ -211,6 +214,7 @@ jobs:
          ${{ parameters.NuPackScript }}
         workingDirectory: '$(Build.SourcesDirectory)\csharp'
         displayName: 'Create NuGet Package'
+        failOnStderr: true
 
       - task: PublishPipelineArtifact@0
         displayName: 'Publish Pipeline Artifact: ${{ parameters.ArtifactName }}'
diff --git a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
index 0a704ddf4f..3e952d1fd6 100644
--- a/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/win-cpu-ci.yml
@@ -145,12 +145,14 @@ jobs:
           arguments: 'bdist_wheel'
           workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}'
 
-
-  - task: NuGetCommand@2
-    displayName: Restore NuGet Packages
+  - task: MSBuild@1
+    displayName: 'Restore NuGet Packages'
     inputs:
-      command: "restore"
-      restoreSolution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      platform: 'Any CPU'
+      configuration: '${{ parameters.BuildConfig }}'
+      msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)'
+      workingDirectory: '$(Build.SourcesDirectory)\csharp'
 
   - task: MSBuild@1
     displayName: 'Build C#'    
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
index b435b74d96..a2d9dc3048 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml
@@ -13,7 +13,7 @@ stages:
       strategy:
         matrix:
           cuda:
-            additionalBuildFlags: --build_java --build_nodejs --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52 --gen_doc validate
+            additionalBuildFlags: --build_java --build_nodejs --use_cuda --cuda_version=11.4 --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.4" --cudnn_home="C:\local\cudnn-11.4-windows-x64-v8.2.2.26\cuda" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=52 --gen_doc validate --enable_cuda_profiling
             EnvSetupScript: setup_env_cuda_11.bat
             ORT_EP_NAME: CUDA
           dml:
@@ -130,11 +130,14 @@ stages:
           arguments: 'bdist_wheel'
           workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)'
 
-      - task: NuGetCommand@2
-        displayName: Restore NuGet Packages
+      - task: MSBuild@1
+        displayName: 'Restore NuGet Packages'
         inputs:
-          command: "restore"
-          restoreSolution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+          solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+          platform: 'Any CPU'
+          configuration: '$(BuildConfig)'
+          msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)'
+          workingDirectory: '$(Build.SourcesDirectory)\csharp'
 
       - task: MSBuild@1
         displayName: 'Build C#'
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-10-2-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-10-2-pipeline.yml
index 55561e4227..3150d8c4b9 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-10-2-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-10-2-pipeline.yml
@@ -81,11 +81,14 @@ jobs:
     inputs:
       versionSpec: 5.7.0
 
-  - task: NuGetCommand@2
-    displayName: Restore NuGet Packages
+  - task: MSBuild@1
+    displayName: 'Restore NuGet Packages'
     inputs:
-      command: "restore"
-      restoreSolution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln'
+      platform: 'Any CPU'
+      configuration: '$(BuildConfig)'
+      msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)'
+      workingDirectory: '$(Build.SourcesDirectory)\csharp'
 
   - task: MSBuild@1
     displayName: 'Build C#'
diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino
index eb46ecebdc..2c65ed4cb5 100644
--- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino
+++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino
@@ -12,7 +12,7 @@ RUN apt update && apt install -y libnuma1 ocl-icd-libopencl1 && \
 
 WORKDIR /root
 
-ENV INTEL_OPENVINO_DIR /opt/intel/openvino_${OPENVINO_VERSION}.689
+ENV INTEL_OPENVINO_DIR /opt/intel/openvino_${OPENVINO_VERSION}.752
 ENV LD_LIBRARY_PATH $INTEL_OPENVINO_DIR/deployment_tools/inference_engine/lib/intel64:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:$INTEL_OPENVINO_DIR/deployment_tools/inference_engine/external/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH
 ENV InferenceEngine_DIR $INTEL_OPENVINO_DIR/deployment_tools/inference_engine/share
 ENV ngraph_DIR $INTEL_OPENVINO_DIR/deployment_tools/ngraph/cmake
@@ -26,7 +26,7 @@ RUN wget https://apt.repos.intel.com/openvino/2021/GPG-PUB-KEY-INTEL-OPENVINO-20
     cd /etc/apt/sources.list.d && \
     echo "deb https://apt.repos.intel.com/openvino/2021 all main">intel-openvino-2021.list && \
     apt update && \ 
-    apt install -y intel-openvino-dev-ubuntu18-2021.4.689 && \
+    apt install -y intel-openvino-dev-ubuntu18-2021.4.752 && \
     cd ${INTEL_OPENVINO_DIR}/install_dependencies && ./install_openvino_dependencies.sh -y
 
 RUN wget https://github.com/intel/compute-runtime/releases/download/19.41.14441/intel-gmmlib_19.3.2_amd64.deb && \
diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py
index a1237f4786..1e99e22fda 100644
--- a/tools/nuget/generate_nuspec_for_native_nuget.py
+++ b/tools/nuget/generate_nuspec_for_native_nuget.py
@@ -164,7 +164,7 @@ def generate_repo_url(list, repo_url, commit_id):
 
 
 def generate_dependencies(list, package_name, version):
-    dml_dependency = '<dependency id="Microsoft.AI.DirectML" version="1.5.1"/>'
+    dml_dependency = '<dependency id="Microsoft.AI.DirectML" version="1.8.0"/>'
 
     if (package_name == 'Microsoft.AI.MachineLearning'):
         list.append('<dependencies>')
diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp
index 2ecfa297f3..1b8be103c0 100644
--- a/winml/lib/Api/ImageFeatureValue.cpp
+++ b/winml/lib/Api/ImageFeatureValue.cpp
@@ -123,6 +123,25 @@ static std::optional<wgi::BitmapBounds> GetBoundsFromMetadata(const wfc::IProper
   return {};
 }
 
+static std::optional<winml::LearningModelPixelRange> GetBitmapPixelRangeFromMetadata(const wfc::IPropertySet& properties) {  // Looks up the optional "PixelRange" bind property; yields an empty optional when it is missing.
+  if (properties != nullptr && properties.HasKey(L"PixelRange")) {
+    if (auto pixelRangeInspectable = properties.Lookup(L"PixelRange")) {
+      auto pixelRangeValue = pixelRangeInspectable.as<wf::IPropertyValue>();
+      auto pixelRange = static_cast<winml::LearningModelPixelRange>(pixelRangeValue.GetInt32());  // read as an Int32, then cast back to the enum
+      WINML_THROW_HR_IF_FALSE_MSG(  // validate: only the three enumerators below are accepted (macro presumably raises WINML_ERR_INVALID_BINDING otherwise)
+          WINML_ERR_INVALID_BINDING,
+          pixelRange == winml::LearningModelPixelRange::ZeroTo255 ||
+              pixelRange == winml::LearningModelPixelRange::ZeroToOne ||
+              pixelRange == winml::LearningModelPixelRange::MinusOneToOne,
+          "LearningModelPixelRange must be either ZeroTo255, ZeroToOne, or MinusOneToOne");
+
+      return pixelRange;
+    }
+  }
+
+  return {};  // property set is null, has no "PixelRange" key, or the lookup returned null
+}
+
 wgi::BitmapBounds ImageFeatureValue::CenterAndCropBounds(
     uint32_t idx,
     uint32_t desiredWidth,
@@ -366,7 +385,6 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
   // TODO: Validate Bounds
 
   // Set up BitmapPixelFormat
-
   auto pixelFormat = std::optional<wgi::BitmapPixelFormat>{};
   pixelFormat = GetBitmapPixelFormatFromMetadata(context.properties);
   if (!pixelFormat.has_value() && spImageDescriptor) {
@@ -387,13 +405,21 @@ std::optional<ImageFeatureValue::ImageResourceMetadata> ImageFeatureValue::GetIn
   }
 
   // Set up LearningModelPixelRange
-  winml::LearningModelPixelRange pixelRange = winml::LearningModelPixelRange::ZeroTo255;  //default;
-  if (spImageDescriptor) {
+  auto pixelRange = std::optional<winml::LearningModelPixelRange>{};
+  pixelRange = GetBitmapPixelRangeFromMetadata(context.properties);
+  if (pixelRange.has_value()) {
+    // The pixel range was set by the bind properties, skip all checks and honor
+    // the user provided normalization property. Do nothing.
+  } else if (!pixelRange.has_value() && spImageDescriptor) {
     pixelRange = spImageDescriptor->PixelRange();
+  } else if (!pixelRange.has_value() && spTensorDescriptor) {
+    pixelRange = winml::LearningModelPixelRange::ZeroTo255;  //default;
+  } else {
+    THROW_HR(WINML_ERR_INVALID_BINDING);
   }
   
   //NCHW layout
-  auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), pixelRange, m_batchSize, descriptorWidth, descriptorHeight);
+  auto imageTensorDescriptor = CreateImageTensorDescriptor(tensorKind, pixelFormat.value(), pixelRange.value(), m_batchSize, descriptorWidth, descriptorHeight);
 
   return ImageResourceMetadata{bounds, imageTensorDescriptor};
 }
diff --git a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
index 96fae01ff1..090dcf72ee 100644
--- a/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
+++ b/winml/test/scenario/cppwinrt/scenariotestscppwinrt.cpp
@@ -260,6 +260,14 @@ static void Scenario6BindWithProperties() {
     // insert it in the property set
     propertySet.Insert(L"BitmapPixelFormat", bitmapPixelFormatProperty);
 
+    // make a LearningModelPixelRange
+    LearningModelPixelRange pixelRange = LearningModelPixelRange::ZeroTo255;
+    // translate it to an int so it can be used as a PropertyValue;
+    int intFromLearningModelPixelRange = static_cast<int>(pixelRange);
+    auto pixelRangeProperty = wf::PropertyValue::CreateInt32(intFromLearningModelPixelRange);
+    // insert it in the property set
+    propertySet.Insert(L"PixelRange", pixelRangeProperty);
+
     // bind with properties
     WINML_EXPECT_NO_THROW(binding.Bind(input.Name(), imageValue, propertySet));
   }

From 7a3abd863fa6b99f966ac4134e5b6ae6f9f20527 Mon Sep 17 00:00:00 2001
From: Dwayne Robinson <dwayner@microsoft.com>
Date: Thu, 2 Dec 2021 00:48:54 -0800
Subject: [PATCH 2/5] Update WinML model test tolerances for tiny_yolov2 and
 FNS_Candy

---
 winml/test/model/model_tests.cpp    | 57 +++++++++++++++++++----------
 winml/test/model/skip_model_tests.h | 47 ++++++++++++++----------
 2 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp
index 3e940dd05e..33fe868f93 100644
--- a/winml/test/model/model_tests.cpp
+++ b/winml/test/model/model_tests.cpp
@@ -23,6 +23,8 @@ namespace WinML {
 // Global needed to keep the actual ITestCase alive while the tests are going on. Only ITestCase* are used as test parameters.
 std::vector<std::unique_ptr<ITestCase>> ownedTests;
 
+static std::string GetFullNameOfTest(ITestCase* testCase, winml::LearningModelDeviceKind deviceKind);
+
 class ModelTest : public testing::TestWithParam<std::tuple<ITestCase*, winml::LearningModelDeviceKind>> {
  protected:
   void SetUp() override {
@@ -30,7 +32,7 @@ class ModelTest : public testing::TestWithParam<std::tuple<ITestCase*, winml::Le
     winrt_activation_handler = WINRT_RoGetActivationFactory;
 #endif
     std::tie(m_testCase, m_deviceKind) = GetParam();
-    WINML_EXPECT_NO_THROW(m_testCase->GetPerSampleTolerance(&m_perSampleTolerance));
+    WINML_EXPECT_NO_THROW(m_testCase->GetPerSampleTolerance(&m_absolutePerSampleTolerance));
     WINML_EXPECT_NO_THROW(m_testCase->GetRelativePerSampleTolerance(&m_relativePerSampleTolerance));
     WINML_EXPECT_NO_THROW(m_testCase->GetPostProcessing(&m_postProcessing));
 
@@ -38,12 +40,15 @@ class ModelTest : public testing::TestWithParam<std::tuple<ITestCase*, winml::Le
 #ifdef USE_DML
     if (m_deviceKind == winml::LearningModelDeviceKind::DirectX) {
       m_relativePerSampleTolerance = 0.009;  // tolerate up to 0.9% difference of expected result.
-      auto gpuSampleTolerancePerTestsItr = gpuSampleTolerancePerTests.find(m_testCase->GetTestCaseName());
-      if (gpuSampleTolerancePerTestsItr != gpuSampleTolerancePerTests.end()) {
-        m_perSampleTolerance = gpuSampleTolerancePerTestsItr->second;
-      }
     }
 #endif
+
+    // Check for any specific tolerances with this test.
+    std::string fullTestName = GetFullNameOfTest(m_testCase, m_deviceKind);
+    auto sampleTolerancePerTestsIter = sampleTolerancePerTests.find(fullTestName);
+    if (sampleTolerancePerTestsIter != sampleTolerancePerTests.end()) {
+      m_absolutePerSampleTolerance = sampleTolerancePerTestsIter->second;
+    }
   }
   // Called after the last test in this test suite.
   static void TearDownTestSuite() {
@@ -51,7 +56,7 @@ class ModelTest : public testing::TestWithParam<std::tuple<ITestCase*, winml::Le
   }
   winml::LearningModelDeviceKind m_deviceKind;
   ITestCase* m_testCase;
-  double m_perSampleTolerance = 1e-3;
+  double m_absolutePerSampleTolerance = 1e-3;
   double m_relativePerSampleTolerance = 1e-3;
   bool m_postProcessing = false;
 
@@ -87,7 +92,7 @@ class ModelTest : public testing::TestWithParam<std::tuple<ITestCase*, winml::Le
         auto actualOutputTensorValue = results.Outputs().Lookup(outputName).as<ITensor>();
         Ort::Value actualOutput = OrtValueHelpers::CreateOrtValueFromITensor(actualOutputTensorValue);
         // Use the expected and actual OrtValues to compare
-        std::pair<COMPARE_RESULT, std::string> ret = CompareOrtValue(*actualOutput, *value, m_perSampleTolerance, m_relativePerSampleTolerance, m_postProcessing);
+        std::pair<COMPARE_RESULT, std::string> ret = CompareOrtValue(*actualOutput, *value, m_absolutePerSampleTolerance, m_relativePerSampleTolerance, m_postProcessing);
         WINML_EXPECT_EQUAL(COMPARE_RESULT::SUCCESS, ret.first) << ret.second;
       } else if (outputDescriptor.Kind() == LearningModelFeatureKind::Sequence) {
         auto sequenceOfMapsStringToFloat = results.Outputs().Lookup(outputName).try_as<IVectorView<IMap<winrt::hstring, float>>>();
@@ -95,7 +100,7 @@ class ModelTest : public testing::TestWithParam<std::tuple<ITestCase*, winml::Le
           WINML_EXPECT_TRUE(CompareFeatureValuesHelper::CompareSequenceOfMapsStringToFloat(
               sequenceOfMapsStringToFloat,
               value,
-              m_perSampleTolerance,
+              m_absolutePerSampleTolerance,
               m_relativePerSampleTolerance));
         } else {
           throw winrt::hresult_not_implemented(L"This particular type of sequence output hasn't been handled yet.");
@@ -319,10 +324,12 @@ bool ShouldSkipTestOnGpuAdapter(std::string& testName) {
   return false;
 }
 
-// determine if test should be disabled
-void DetermineIfDisableTest(std::string& testName, winml::LearningModelDeviceKind deviceKind) {
+// Determine if the test should be disabled; if so, prepend "DISABLED_" to the name and return true.
+bool ModifyNameIfDisabledTest(/*inout*/ std::string& testName, winml::LearningModelDeviceKind deviceKind) {
   bool shouldSkip = false;
   std::string reason = "Reason not found.";
+
+  // Check for any tests by name that should be disabled, for either CPU or GPU.
   if (disabledTests.find(testName) != disabledTests.end()) {
     reason = disabledTests.at(testName);
     shouldSkip = true;
@@ -330,9 +337,6 @@ void DetermineIfDisableTest(std::string& testName, winml::LearningModelDeviceKin
     if (SkipGpuTests()) {
       reason = "GPU tests are not enabled for this build.";
       shouldSkip = true;
-    } else if (disabledGpuTests.find(testName) != disabledGpuTests.end()) {
-      reason = disabledGpuTests.at(testName);
-      shouldSkip = true;
     } else if (disabledGpuAdapterTests.find(testName) != disabledGpuAdapterTests.end() && ShouldSkipTestOnGpuAdapter(testName)) {
       reason = disabledGpuAdapterTests[testName].second;
       shouldSkip = true;
@@ -342,12 +346,14 @@ void DetermineIfDisableTest(std::string& testName, winml::LearningModelDeviceKin
     printf("Disabling %s test because : %s\n", testName.c_str(), reason.c_str());
     testName = "DISABLED_" + testName;
   }
+
+  return shouldSkip;
 }
 
-// This function gets the name of the test
-static std::string GetNameOfTest(const testing::TestParamInfo<ModelTest::ParamType>& info) {
+// This function constructs the full name of the test from the file path and device kind.
+std::string GetFullNameOfTest(ITestCase* testCase, winml::LearningModelDeviceKind deviceKind) {
   std::string name = "";
-  auto modelPath = std::wstring(std::get<0>(info.param)->GetModelUrl());
+  auto modelPath = std::wstring(testCase->GetModelUrl());
   auto modelPathStr = _winml::Strings::UTF8FromUnicode(modelPath.c_str(), modelPath.length());
   std::vector<std::string> tokenizedModelPath;
   std::istringstream ss(modelPathStr);
@@ -362,18 +368,29 @@ static std::string GetNameOfTest(const testing::TestParamInfo<ModelTest::ParamTy
 
   std::replace_if(name.begin(), name.end(), [](char c) { return !google::protobuf::ascii_isalnum(c); }, '_');
 
-  auto deviceKind = std::get<1>(info.param);
-  // Determine if test should be skipped
-  DetermineIfDisableTest(name, deviceKind);
+  // Determine if test should be skipped, using the generic name (no CPU or GPU suffix yet).
+  bool isDisabled = ModifyNameIfDisabledTest(/*inout*/ name, deviceKind);
+
   if (deviceKind == winml::LearningModelDeviceKind::Cpu) {
     name += "_CPU";
   } else {
     name += "_GPU";
   }
 
+  // Check once more with the full name, in case any GPU-specific/CPU-specific entries exist.
+  if (!isDisabled)
+  {
+    ModifyNameIfDisabledTest(/*inout*/ name, deviceKind);
+  }
+
   return name;
 }
 
+// Test-name generator handed to INSTANTIATE_TEST_SUITE_P: unpacks the (ITestCase*, device kind) tuple from the param info and forwards it to GetFullNameOfTest.
+static std::string GetNameOfTestFromTestParam(const testing::TestParamInfo<ModelTest::ParamType>& info) {
+  return GetFullNameOfTest(std::get<0>(info.param), std::get<1>(info.param));
+}
+
 INSTANTIATE_TEST_SUITE_P(ModelTests, ModelTest, testing::Combine(testing::ValuesIn(GetAllTestCases()), testing::Values(winml::LearningModelDeviceKind::Cpu, winml::LearningModelDeviceKind::DirectX)),
-                         GetNameOfTest);
+                         GetNameOfTestFromTestParam);
 }  // namespace WinML
\ No newline at end of file
diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h
index 93047e87dd..2f29806200 100644
--- a/winml/test/model/skip_model_tests.h
+++ b/winml/test/model/skip_model_tests.h
@@ -8,6 +8,8 @@ static const std::string disabledGpuTestDefaultReason = "Model not working on GP
 // {"model test name", "reason for why it is happening and bug filed for it."}
 std::unordered_map<std::string, std::string> disabledTests(
     {
+     // Disabled cases common to both CPU&GPU (no _CPU/_GPU suffix):
+
      // Tier 3 models
      {"mxnet_arcface_opset8", disabledTestDefaultReason},
      {"XGBoost_XGClassifier_sklearn_load_wine_opset7", disabledTestDefaultReason},
@@ -109,34 +111,41 @@ std::unordered_map<std::string, std::string> disabledTests(
      {"coreml_DecisionTreeClassifier_sklearn_load_breast_cancer_opset7", disabledTestDefaultReason},
      {"coreml_DecisionTreeClassifier_OpenML_312_scene_opset7", disabledTestDefaultReason},
      {"coreml_DecisionTreeClassifier_OpenML_1464_blood_transfusion_opset7", disabledTestDefaultReason},
-     {"coreml_AgeNet_ImageNet_opset7", disabledTestDefaultReason}
-    });
+     {"coreml_AgeNet_ImageNet_opset7", disabledTestDefaultReason},
 
-std::unordered_map<std::string, std::string> disabledGpuTests(
-    {
-     // Onnx zoo models
-     {"mask_rcnn_opset10", "Bug 31005388: mask_rcnn opset 10 onnx zoo model fails to evaluate on DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005388"},
-     {"faster_rcnn_opset10", "Bug 31005511: Failed to extract tensor data from evaluate result of faster_rcnn opset 10 model in DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005511"},
+     // GPU specific cases:
+
+     // ONNX zoo models
+     {"mask_rcnn_opset10_GPU", "Bug 31005388: mask_rcnn opset 10 onnx zoo model fails to evaluate on DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005388"},
+     {"faster_rcnn_opset10_GPU", "Bug 31005511: Failed to extract tensor data from evaluate result of faster_rcnn opset 10 model in DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005511"},
 
      // Tier 2 models
-     {"fp16_test_tiny_yolov2_opset7", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"},
-     {"fp16_tiny_yolov2_opset8", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"},
-     {"fp16_coreml_FNS_Candy_opset7", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"},
-     {"mlperf_ssd_mobilenet_300_opset10", "Bug 31005624: mlperf_ssd_mobilenet_300 opset 10 model fails to evaluate in DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005624"}
-    });
+     {"fp16_test_tiny_yolov2_opset7_GPU", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"},
+     {"fp16_tiny_yolov2_opset8_GPU", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"},
+     {"fp16_coreml_FNS_Candy_opset7_GPU", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"},
+     {"mlperf_ssd_mobilenet_300_opset10_GPU", "Bug 31005624: mlperf_ssd_mobilenet_300 opset 10 model fails to evaluate in DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005624"},
+    }
+);
 
 /*
     model name -> (adapter name regex, skipped test reason)
 */
 std::unordered_map<std::string, std::pair<std::string, std::string>> disabledGpuAdapterTests(
     {
-      {"fp16_inception_v1_opset7", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
-      {"fp16_inception_v1_opset8", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
-      {"candy_opset9", std::make_pair("(Intel\\(R\\) (UHD )?Graphics)|(Adreno)", "Bug 31652854: Results of candy_opset9 aren't accurate enough on Intel Graphics and Qualcomm Adreno 685 https://microsoft.visualstudio.com/OS/_workitems/edit/31652854")},
-    });
+      //{"fp16_inception_v1_opset7_GPU", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
+      //{"fp16_inception_v1_opset8_GPU", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
+    }
+);
 
 /*
-    test name -> sampleTolerance
+    Override the default tolerances for these test cases (can be tailored to only CPU or GPU with suffix).
+    test name -> absolute difference sampleTolerance
 */
-std::unordered_map<std::string, double> gpuSampleTolerancePerTests(
-    {{"fp16_inception_v1", 0.005}});
+std::unordered_map<std::string, double> sampleTolerancePerTests(
+    {
+      {"fp16_inception_v1_opset7_GPU", 0.005},
+      {"fp16_inception_v1_opset8_GPU", 0.005},
+      {"candy_opset9_GPU", 0.00121000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/
+      {"fp16_tiny_yolov2_opset8_GPU", 0.109000}, // NOTE(review): presumably the same Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine is also inaccurate for fp16 tiny_yolov2 opset 8 (the earlier comment duplicated the FNS Candy one) - TODO confirm work item https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/ ; this name is also listed in disabledTests, so verify the override is reachable.
+    }
+);

From ef7671b938e10d2b4a9b39d368d13cdeea81e4b1 Mon Sep 17 00:00:00 2001
From: Dwayne Robinson <dwayner@microsoft.com>
Date: Thu, 2 Dec 2021 13:30:34 -0800
Subject: [PATCH 3/5] Comment out old lines

---
 winml/test/model/skip_model_tests.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h
index 2f29806200..21a249d9e6 100644
--- a/winml/test/model/skip_model_tests.h
+++ b/winml/test/model/skip_model_tests.h
@@ -132,8 +132,7 @@ std::unordered_map<std::string, std::string> disabledTests(
 */
 std::unordered_map<std::string, std::pair<std::string, std::string>> disabledGpuAdapterTests(
     {
-      //{"fp16_inception_v1_opset7_GPU", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
-      //{"fp16_inception_v1_opset8_GPU", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
+      // e.g. {"fp16_inception_v1_opset7_GPU", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
     }
 );
 

From 77e67a6de74fe05485e983b574f4b91540c8e443 Mon Sep 17 00:00:00 2001
From: Dwayne Robinson <dwayner@microsoft.com>
Date: Thu, 2 Dec 2021 13:34:01 -0800
Subject: [PATCH 4/5] Add one more example line

---
 winml/test/model/skip_model_tests.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h
index 21a249d9e6..080f05e50f 100644
--- a/winml/test/model/skip_model_tests.h
+++ b/winml/test/model/skip_model_tests.h
@@ -133,6 +133,7 @@ std::unordered_map<std::string, std::string> disabledTests(
 std::unordered_map<std::string, std::pair<std::string, std::string>> disabledGpuAdapterTests(
     {
       // e.g. {"fp16_inception_v1_opset7_GPU", std::make_pair("NVIDIA", "Bug 31144419: Results of fp16_inception_v1 opset7 and opset8 aren't accurate enough on AMD Radeon VII & Intel(R) UHD Graphics 630 & NVIDIA https://microsoft.visualstudio.com/OS/_workitems/edit/31144419")},
+      //      {"candy_opset9", std::make_pair("(Intel\\(R\\) (UHD )?Graphics)|(Adreno)", "Bug 31652854: Results of candy_opset9 aren't accurate enough on Intel Graphics and Qualcomm Adreno 685 https://microsoft.visualstudio.com/OS/_workitems/edit/31652854")},
     }
 );
 

From 6e4c534ce219f015024ecb60ecc7b08cba4b3981 Mon Sep 17 00:00:00 2001
From: Dwayne Robinson <dwayner@microsoft.com>
Date: Thu, 2 Dec 2021 19:42:31 -0800
Subject: [PATCH 5/5] Relax tolerance slightly more for Intel after autopilot
 run

---
 winml/test/model/skip_model_tests.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h
index 080f05e50f..e463b66f2f 100644
--- a/winml/test/model/skip_model_tests.h
+++ b/winml/test/model/skip_model_tests.h
@@ -145,7 +145,7 @@ std::unordered_map<std::string, double> sampleTolerancePerTests(
     {
       {"fp16_inception_v1_opset7_GPU", 0.005},
       {"fp16_inception_v1_opset8_GPU", 0.005},
-      {"candy_opset9_GPU", 0.00121000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/
+      {"candy_opset9_GPU", 0.00150000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/
       {"fp16_tiny_yolov2_opset8_GPU", 0.109000}, // Intel(R) UHD Graphics 630 (29.20.100.9020) AP machine has inaccurate GPU results for FNS Candy opset 9 https://microsoft.visualstudio.com/OS/_workitems/edit/30696168/
     }
 );