From ca8ff8c91c3ae6f7587df3cea3bd82b5578613a9 Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Tue, 26 Nov 2019 11:35:28 -0800
Subject: [PATCH 01/13] Enable conv/conv_transpose for opset 11 in cuda
 execution provider. (#2401)

Enable conv/conv_transpose and existing pooling for opset 11 in cuda execution provider.

They are of spec dilates/strides change related cuda pooling ops for op set 11.
---
 .../providers/cuda/cuda_execution_provider.cc | 72 ++++++++++++-------
 onnxruntime/core/providers/cuda/nn/conv.cc    | 12 +++-
 .../core/providers/cuda/nn/conv_transpose.cc  | 12 +++-
 onnxruntime/core/providers/cuda/nn/pool.cc    | 20 ++++--
 4 files changed, 84 insertions(+), 32 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index dce0c23ab8..3d55b58446 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -389,12 +389,12 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, LRN);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, LRN);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, LRN);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Conv);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Conv);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Conv);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ConvTranspose);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ConvTranspose);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ConvTranspose);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, Conv);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, Conv);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, Conv);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ConvTranspose);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ConvTranspose);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ConvTranspose);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, double, AveragePool);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, MLFloat16, AveragePool);
@@ -549,13 +549,13 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDom
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scan);
 
 // opset 10
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, AveragePool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, AveragePool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, AveragePool);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, AveragePool);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, AveragePool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, MaxPool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, MaxPool);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, MaxPool);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, MaxPool);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, MaxPool);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, Resize);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, Resize);
@@ -640,6 +640,18 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, S
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Squeeze);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, TopK);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Unsqueeze);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Conv);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Conv);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, ConvTranspose);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, ConvTranspose);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, ConvTranspose);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, AveragePool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, AveragePool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, AveragePool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool);
 
 static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -824,12 +836,12 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, LRN)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, LRN)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, LRN)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, Conv)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, Conv)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, Conv)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, float, ConvTranspose)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, double, ConvTranspose)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, MLFloat16, ConvTranspose)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, Conv)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, Conv)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, Conv)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, float, ConvTranspose)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, double, ConvTranspose)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 1, 10, MLFloat16, ConvTranspose)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, float, AveragePool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, double, AveragePool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 9, MLFloat16, AveragePool)>,
@@ -983,13 +995,13 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, 10, Scan)>,
 
       // opset 10
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, AveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, AveragePool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, AveragePool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, Dropout)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, MaxPool)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, MLFloat16, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, float, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, double, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, MLFloat16, MaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, 10, NonMaxSuppression)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, float, Resize)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 10, double, Resize)>,
@@ -1075,6 +1087,18 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Squeeze)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, TopK)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, Unsqueeze)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Conv)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, Conv)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, ConvTranspose)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, ConvTranspose)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, ConvTranspose)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, AveragePool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool)>,      
   };
 
   for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index 1eb9643f30..01b3929d78 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -9,11 +9,21 @@
 namespace onnxruntime {
 namespace cuda {
 
+// Op Set 11 for Conv only update document to clearify default dilations and strides value.
+// which are already convered by op set 11 cpu versoin, so simply add declaration.
 #define REGISTER_KERNEL_TYPED(T)                                                \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                      \
+      Conv,                                                                     \
+      kOnnxDomain,                                                              \
+      1, 10,                                                                    \
+      T,                                                                        \
+      kCudaExecutionProvider,                                                   \
+      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      Conv<T>);                                                                 \
   ONNX_OPERATOR_TYPED_KERNEL_EX(                                                \
       Conv,                                                                     \
       kOnnxDomain,                                                              \
-      1,                                                                        \
+      11,                                                                       \
       T,                                                                        \
       kCudaExecutionProvider,                                                   \
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
index 3e51d36223..92fcf8ac53 100644
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
@@ -6,11 +6,21 @@
 namespace onnxruntime {
 namespace cuda {
 
+// Op Set 11 for ConvTranspose only update document to clearify default dilations and strides value.
+// which are already covered by op set 11 cpu version, so simply add declaration.
 #define REGISTER_KERNEL_TYPED(T)                                                \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                      \
+      ConvTranspose,                                                            \
+      kOnnxDomain,                                                              \
+      1, 10,                                                                    \
+      T,                                                                        \
+      kCudaExecutionProvider,                                                   \
+      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      ConvTranspose<T>);                                                        \
   ONNX_OPERATOR_TYPED_KERNEL_EX(                                                \
       ConvTranspose,                                                            \
       kOnnxDomain,                                                              \
-      1,                                                                        \
+      11,                                                                       \
       T,                                                                        \
       kCudaExecutionProvider,                                                   \
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc
index ae6508e581..8dbdeca70c 100644
--- a/onnxruntime/core/providers/cuda/nn/pool.cc
+++ b/onnxruntime/core/providers/cuda/nn/pool.cc
@@ -30,12 +30,17 @@ namespace cuda {
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<data_type>()).TypeConstraint("I", DataTypeImpl::GetTensorType<int64_t>()), \
       Pool<data_type, pool_type>);
 
+
 POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9)
 POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9)
 POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9)
-POOLING_KERNEL(AveragePool, float, AveragePool, 10)
-POOLING_KERNEL(AveragePool, double, AveragePool, 10)
-POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 10)
+POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 10, 10)
+POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 10, 10)
+POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 10, 10)
+//AveragePool and MaxPool op set 11 only update spec document on default value for dilations and strides.
+POOLING_KERNEL(AveragePool, float, AveragePool, 11)
+POOLING_KERNEL(AveragePool, double, AveragePool, 11)
+POOLING_KERNEL(AveragePool, MLFloat16, AveragePool, 11)
 POOLING_KERNEL(GlobalAveragePool, float, AveragePool, 1)
 POOLING_KERNEL(GlobalAveragePool, double, AveragePool, 1)
 POOLING_KERNEL(GlobalAveragePool, MLFloat16, AveragePool, 1)
@@ -45,9 +50,12 @@ POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<1>, 1, 7)
 POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<8>, 8, 9)
 POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<8>, 8, 9)
 POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<8>, 8, 9)
-POOLING_KERNEL(MaxPool, float, MaxPool<8>, 10)
-POOLING_KERNEL(MaxPool, double, MaxPool<8>, 10)
-POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 10)
+POOLING_KERNEL_VERSIONED(MaxPool, float, MaxPool<8>, 10, 10)
+POOLING_KERNEL_VERSIONED(MaxPool, double, MaxPool<8>, 10, 10)
+POOLING_KERNEL_VERSIONED(MaxPool, MLFloat16, MaxPool<8>, 10, 10)
+POOLING_KERNEL(MaxPool, float, MaxPool<8>, 11)
+POOLING_KERNEL(MaxPool, double, MaxPool<8>, 11)
+POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 11)
 POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1)
 POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1)
 POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1)

From d6c84925d514a94d75ade49fb71a5f6f77885b0a Mon Sep 17 00:00:00 2001
From: Xiang Zhang <xianz@microsoft.com>
Date: Tue, 26 Nov 2019 19:09:22 -0800
Subject: [PATCH 02/13] move logSessionCreation after session is initialized
 (#2481)

---
 onnxruntime/core/session/inference_session.cc | 13 ++++++++-----
 onnxruntime/core/session/inference_session.h  |  1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index e18aa82de3..84126926ce 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -240,11 +240,7 @@ common::Status InferenceSession::Load(std::function<common::Status(std::shared_p
     // all steps complete, mark the model as loaded.
     is_model_loaded_ = true;
 
-    // and log telemetry
-    const Env& env = Env::Default();
-    env.GetTelemetryProvider().LogSessionCreation(session_id_, model_->IrVersion(), model_->ProducerName(), model_->ProducerVersion(),
-                                                  model_->Domain(), model_->MainGraph().DomainToVersionMap(), model_->MainGraph().Name(),
-                                                  model_->MetaData(), event_name, execution_providers_.GetIds());
+    event_name_ = event_name;
 
   } catch (const std::exception& ex) {
     status = Status(common::ONNXRUNTIME, common::FAIL, "Exception during loading: " + std::string(ex.what()));
@@ -633,6 +629,13 @@ common::Status InferenceSession::Initialize() {
     // handle any subgraphs
     ORT_RETURN_IF_ERROR_SESSIONID_(InitializeSubgraphSessions(graph, session_state_));
     is_inited_ = true;
+    
+    // and log telemetry
+    const Env& env = Env::Default();
+    env.GetTelemetryProvider().LogSessionCreation(session_id_, model_->IrVersion(), model_->ProducerName(), model_->ProducerVersion(),
+                                                  model_->Domain(), model_->MainGraph().DomainToVersionMap(), model_->MainGraph().Name(),
+                                                  model_->MetaData(), event_name_, execution_providers_.GetIds());
+
     LOGS(*session_logger_, INFO) << "Session successfully initialized.";
   } catch (const NotImplementedException& ex) {
     status = ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "Exception during initialization: ", ex.what());
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index 791d61e62e..a25cb5ebc2 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -445,6 +445,7 @@ class InferenceSession {
   long long total_run_duration_since_last_;         // the total duration (us) of Run() calls since the last report
   TimePoint time_sent_last_;                        // the TimePoint of the last report
   const long long kDurationBetweenSending = 1000* 1000 * 60 * 10;  // duration in (us).  send a report every 10 mins
+  std::string event_name_;                          // where the model is loaded from: ["model_loading_uri", "model_loading_proto", "model_loading_istream"]
 
 #ifdef ONNXRUNTIME_ENABLE_INSTRUMENT
   bool session_activity_started_ = false;

From e29fb5cef1d0e8ad27a4ede6f37606a368f3682d Mon Sep 17 00:00:00 2001
From: Yulong Wang <f.s@qq.com>
Date: Wed, 27 Nov 2019 00:32:26 -0800
Subject: [PATCH 03/13] Add BatchParallelFor, TryParallelFor,
 TryBatchParallelFor into ThreadPool (#2476)

---
 .../onnxruntime/core/platform/threadpool.h    |  42 ++++++++
 .../contrib_ops/cpu/crop_and_resize.cc        |  19 +---
 onnxruntime/core/common/threadpool.cc         |  30 ++++++
 .../cpu/object_detection/roialign.cc          |  19 +---
 onnxruntime/test/platform/threadpool_test.cc  | 101 ++++++++++++++++++
 5 files changed, 177 insertions(+), 34 deletions(-)
 create mode 100644 onnxruntime/test/platform/threadpool_test.cc

diff --git a/include/onnxruntime/core/platform/threadpool.h b/include/onnxruntime/core/platform/threadpool.h
index 3337583612..e6b442348a 100644
--- a/include/onnxruntime/core/platform/threadpool.h
+++ b/include/onnxruntime/core/platform/threadpool.h
@@ -46,6 +46,11 @@ class ThreadPool {
   */
   void ParallelFor(int32_t total, std::function<void(int32_t)> fn);
 
+  /*
+  Schedule work in the interval [0, total), with calls split into (num_batches) batches.
+  */
+  void BatchParallelFor(int32_t total, std::function<void(int32_t)> fn, int32_t num_batches = 0);
+
   /*
   Schedule work in the interval [first, last].
   */
@@ -54,6 +59,43 @@ class ThreadPool {
   // This is not supported until the latest Eigen
   // void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions);
 
+  /**
+  Tries to call the given function in parallel, with calls split into (num_batches) batches.
+  **/
+  template <typename F>
+  inline static void TryBatchParallelFor(concurrency::ThreadPool* tp, int32_t total, F&& fn, int32_t num_batches = 0) {
+    if (tp != nullptr) {
+      if (num_batches <= 0) {
+        num_batches = tp->NumThreads() + 1;
+      }
+      tp->BatchParallelFor(total, std::forward<F>(fn), num_batches);
+    } else {
+#ifdef USE_OPENMP
+#pragma omp parallel for
+#endif
+      for (int32_t i = 0; i < total; ++i) {
+        fn(i);
+      }
+    }
+  }
+
+  /**
+  Tries to call the given function in parallel.
+  **/
+  template <typename F>
+  inline static void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, F&& fn) {
+    if (tp != nullptr) {
+      tp->ParallelFor(total, std::forward<F>(fn));
+    } else {
+#ifdef USE_OPENMP
+#pragma omp parallel for
+#endif
+      for (int32_t i = 0; i < total; ++i) {
+        fn(i);
+      }
+    }
+  }
+
   int NumThreads() const;
 
   int CurrentThreadId() const;
diff --git a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc
index f966c9c821..f486d6a6a9 100644
--- a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc
+++ b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc
@@ -43,17 +43,6 @@ namespace contrib {
 
 ADD_TYPED_CROPANDRESIZE_OP(float);
 
-template <typename T>
-static void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, T&& fn) {
-  if (tp != nullptr)
-    tp->ParallelFor(total, fn);
-  else {
-    for (int32_t i = 0; i != total; ++i) {
-      fn(i);
-    }
-  }
-}
-
 template <typename T>
 void CropAndResizeForward(const TensorShape& output_shape,
                           const T* bottom_data,
@@ -71,9 +60,7 @@ void CropAndResizeForward(const TensorShape& output_shape,
   int64_t pooled_height = output_shape[2];
   int64_t pooled_width = output_shape[3];
 
-  // TODO: This should do blocks of work based on the number of threads in the threadpool with each block
-  // being n_rois / num_threads
-  std::function<void(int32_t)> work_object = [&](int32_t n) {
+  ThreadPool::TryBatchParallelFor(ttp, static_cast<int32_t>(n_rois), [&](int32_t n) {
     int64_t index_n = n * channels * pooled_width * pooled_height;
 
     const T* offset_bottom_rois = bottom_rois + n * num_roi_cols;
@@ -182,9 +169,7 @@ void CropAndResizeForward(const TensorShape& output_shape,
         }
       }  // for pw
     }    // for ph
-  };     // for n
-
-  TryParallelFor(ttp, static_cast<int32_t>(n_rois), work_object);
+  });    // for n
 }
 
 template <typename T>
diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc
index 0595f8c56e..9f08dc68bb 100644
--- a/onnxruntime/core/common/threadpool.cc
+++ b/onnxruntime/core/common/threadpool.cc
@@ -57,6 +57,36 @@ void ThreadPool::ParallelFor(int32_t total, std::function<void(int32_t)> fn) {
   barrier.Wait();
 }
 
+void ThreadPool::BatchParallelFor(int32_t total, std::function<void(int32_t)> fn, int32_t num_batches) {
+  if (total <= 0)
+    return;
+
+  if (total == 1) {
+    fn(0);
+    return;
+  }
+
+  if (num_batches <= 1) {
+    for (int i = 0; i < total; i++) {
+      fn(i);
+    }
+    return;
+  }
+
+  if (num_batches >= total) {
+    ParallelFor(total, fn);
+    return;
+  }
+
+  ParallelFor(num_batches, [&](int batch_index) {
+    int start = batch_index * total / num_batches;
+    int end = (batch_index + 1) * total / num_batches;
+    for (int i = start; i < end; i++) {
+      fn(i);
+    }
+  });
+}
+
 void ThreadPool::ParallelForRange(int64_t first, int64_t last, std::function<void(int64_t, int64_t)> fn) {
   if (last <= first) return;
   if (last - first == 1) {
diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc
index e68258cb3b..f3bab71ef5 100644
--- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc
+++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc
@@ -42,17 +42,6 @@ ADD_TYPED_ROIALIGN_OP(float);
 ADD_TYPED_ROIALIGN_OP(double);
 
 namespace {
-template <typename T>
-void TryParallelFor(concurrency::ThreadPool* tp, int32_t total, T&& fn) {
-  if (tp != nullptr)
-    tp->ParallelFor(total, fn);
-  else {
-    for (int32_t i = 0; i != total; ++i) {
-      fn(i);
-    }
-  }
-}
-
 template <typename T>
 struct PreCalc {
   int64_t pos1;
@@ -183,9 +172,7 @@ void RoiAlignForward(const TensorShape& output_shape,
   int64_t pooled_height = output_shape[2];
   int64_t pooled_width = output_shape[3];
 
-  // TODO: This should do blocks of work based on the number of threads in the threadpool with each block
-  // being n_rois / num_threads
-  std::function<void(int32_t)> work_object = [&](int32_t n) {
+  ThreadPool::TryBatchParallelFor(ttp, static_cast<int32_t>(n_rois), [&](int32_t n) {
     int64_t index_n = n * channels * pooled_width * pooled_height;
 
     const T* offset_bottom_rois = bottom_rois + n * num_roi_cols;
@@ -281,9 +268,7 @@ void RoiAlignForward(const TensorShape& output_shape,
         }  // for pw
       }    // for ph
     }      // for c
-  };       // for n
-
-  TryParallelFor(ttp, static_cast<int32_t>(n_rois), work_object);
+  });      // for n
 }
 }  // namespace
 
diff --git a/onnxruntime/test/platform/threadpool_test.cc b/onnxruntime/test/platform/threadpool_test.cc
new file mode 100644
index 0000000000..ec628bc293
--- /dev/null
+++ b/onnxruntime/test/platform/threadpool_test.cc
@@ -0,0 +1,101 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/platform/threadpool.h"
+
+#include <core/common/make_unique.h>
+
+#include "gtest/gtest.h"
+#include <algorithm>
+#include <memory>
+#include <functional>
+#include <mutex>
+
+using namespace onnxruntime::concurrency;
+
+namespace {
+
+struct TestData {
+  explicit TestData(int num) : data(num, 0) {}
+  std::vector<int> data;
+  std::mutex mutex;
+};
+
+// This unittest tests ThreadPool function by counting the number of calls to function with each index.
+// the function should be called exactly once for each element.
+
+std::unique_ptr<TestData> CreateTestData(int num) {
+  return onnxruntime::make_unique<TestData>(num);
+}
+
+void IncrementElement(TestData& test_data, int i) {
+  std::lock_guard<std::mutex> lock(test_data.mutex);
+  test_data.data[i]++;
+}
+
+void ValidateTestData(TestData& test_data) {
+  ASSERT_TRUE(std::count_if(test_data.data.cbegin(),
+                            test_data.data.cend(),
+                            [](int i) { return i != 1; }) == 0);
+}
+
+void CreateThreadPoolAndTest(const std::string& name, int num_threads, const std::function<void(ThreadPool*)>& test_body) {
+  auto tp = onnxruntime::make_unique<ThreadPool>(name, num_threads);
+  test_body(tp.get());
+}
+
+void TestParallelFor(const std::string& name, int num_threads, int num_tasks) {
+  auto test_data = CreateTestData(num_tasks);
+  CreateThreadPoolAndTest(name, num_threads, [&](ThreadPool* tp) {
+    tp->ParallelFor(num_tasks, [&](int i) {
+      IncrementElement(*test_data, i);
+    });
+  });
+  ValidateTestData(*test_data);
+}
+
+void TestBatchParallelFor(const std::string& name, int num_threads, int num_tasks, int batch_size) {
+  auto test_data = CreateTestData(num_tasks);
+  CreateThreadPoolAndTest(name, num_threads, [&](ThreadPool* tp) {
+    tp->BatchParallelFor(
+        num_tasks, [&](int i) {
+          IncrementElement(*test_data, i);
+        },
+        batch_size);
+  });
+  ValidateTestData(*test_data);
+}
+
+}  // namespace
+
+TEST(ThreadPoolTest, TestParallelFor_2_Thread_NoTask) {
+  TestParallelFor("TestParallelFor_2_Thread_NoTask", 2, 0);
+}
+
+TEST(ThreadPoolTest, TestParallelFor_2_Thread_50_Task) {
+  TestParallelFor("TestParallelFor_2_Thread_50_Task", 2, 50);
+}
+
+TEST(ThreadPoolTest, TestParallelFor_1_Thread_50_Task) {
+  TestParallelFor("TestParallelFor_1_Thread_50_Task", 1, 50);
+}
+
+TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_10_Batch) {
+  TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_10_Batch", 2, 50, 10);
+}
+
+TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_0_Batch) {
+  TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_0_Batch", 2, 50, 0);
+}
+
+TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_1_Batch) {
+  TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_1_Batch", 2, 50, 1);
+}
+
+TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_50_Task_100_Batch) {
+  TestBatchParallelFor("TestBatchParallelFor_2_Thread_50_Task_100_Batch", 2, 50, 100);
+}
+
+TEST(ThreadPoolTest, TestBatchParallelFor_2_Thread_81_Task_20_Batch) {
+  TestBatchParallelFor("TestBatchParallelFor_2_Thread_81_Task_20_Batch", 2, 81, 20);
+}

From 7c7d5a149c9ed52eec67304bae5c4b132166a8a1 Mon Sep 17 00:00:00 2001
From: Yulong Wang <f.s@qq.com>
Date: Wed, 27 Nov 2019 02:09:18 -0800
Subject: [PATCH 04/13] Disable thread pool creation when enabled OpenMP
 (#2485)

---
 onnxruntime/core/framework/session_options.h     | 1 +
 onnxruntime/core/session/inference_session.cc    | 4 ++++
 onnxruntime/test/perftest/README.md              | 2 +-
 onnxruntime/test/perftest/command_args_parser.cc | 7 ++++++-
 onnxruntime/test/perftest/ort_test_session.cc    | 5 ++++-
 5 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 3ab220b0b6..27ac93f403 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -55,6 +55,7 @@ struct SessionOptions {
   TransformerLevel graph_optimization_level = TransformerLevel::Level1;
 
   // controls the size of the thread pool used to parallelize the execution of tasks within individual nodes (ops)
+  // if OpenMP is enabled, this configuration will be ignored
   int intra_op_num_threads = 0;
 
   // controls the size of the thread pool used to parallelize the execution of nodes (ops)
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 84126926ce..5b13df4387 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -102,8 +102,12 @@ InferenceSession::InferenceSession(const SessionOptions& session_options,
     : session_options_(session_options),
       graph_transformation_mgr_(session_options.max_num_graph_transformation_steps),
       logging_manager_(logging_manager),
+#ifndef USE_OPENMP
       thread_pool_(concurrency::CreateThreadPool("intra_op_thread_pool",
                                                  session_options.intra_op_num_threads)),
+#else
+      thread_pool_(nullptr),
+#endif
       inter_op_thread_pool_(session_options.execution_mode == ExecutionMode::ORT_PARALLEL
                                 ? concurrency::CreateThreadPool("inter_op_thread_pool",
                                                                 session_options.inter_op_num_threads)
diff --git a/onnxruntime/test/perftest/README.md b/onnxruntime/test/perftest/README.md
index ae8851d546..9f3cba19c0 100644
--- a/onnxruntime/test/perftest/README.md
+++ b/onnxruntime/test/perftest/README.md
@@ -32,7 +32,7 @@ Options:
         
 	-v: Show verbose information.
         
-	-x: [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes. A value of 0 means the test will auto-select a default. Must >=0.
+	-x: [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes. A value of 0 means the test will auto-select a default. Must >=0. If OpenMP is enabled, this configuration will be ignored.
 	
 	-y: [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means the test will auto-select a default. Must >=0.
 	
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index 4b03aa7a12..30c8937828 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -41,7 +41,7 @@ namespace perftest {
       "\t-p [profile_file]: Specifies the profile name to enable profiling and dump the profile data to the file.\n"
       "\t-s: Show statistics result, like P75, P90.\n"
       "\t-v: Show verbose information.\n"
-      "\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0.\n"
+      "\t-x [intra_op_num_threads]: Sets the number of threads used to parallelize the execution within nodes, A value of 0 means ORT will pick a default. Must >=0. If OpenMP is enabled, this configuration will be ignored.\n"
       "\t-y [inter_op_num_threads]: Sets the number of threads used to parallelize the execution of the graph (across nodes), A value of 0 means ORT will pick a default. Must >=0.\n"
       "\t-P: Use parallel executor instead of sequential executor.\n"
       "\t-o [optimization level]: Default is 1. Valid values are 0 (disable), 1 (basic), 2 (extended), 99 (all).\n"
@@ -123,10 +123,15 @@ namespace perftest {
         test_config.run_config.f_verbose = true;
         break;
       case 'x':
+#ifdef USE_OPENMP
+        fprintf(stderr, "cannot use argument '-x' when OpenMP is enabled.\n");
+        return false;
+#else
         test_config.run_config.intra_op_num_threads = static_cast<int>(OrtStrtol<PATH_CHAR_TYPE>(optarg, nullptr));
         if (test_config.run_config.intra_op_num_threads < 0) {
           return false;
         }
+#endif
         break;
       case 'y':
         test_config.run_config.inter_op_num_threads = static_cast<int>(OrtStrtol<PATH_CHAR_TYPE>(optarg, nullptr));
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index 5c8c5a8a99..2e16451ae2 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -82,7 +82,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
   } else if (provider_name == onnxruntime::kAclExecutionProvider) {
 #ifdef USE_ACL
     Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ACL(session_options,
-	performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
+                                                                   performance_test_config.run_config.enable_cpu_mem_arena ? 1 : 0));
 #else
     ORT_THROW("Acl is not supported in this build\n");
 #endif
@@ -100,8 +100,11 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
   else
     session_options.DisableMemPattern();
   session_options.SetExecutionMode(performance_test_config.run_config.execution_mode);
+
+#ifndef USE_OPENMP
   fprintf(stdout, "Setting intra_op_num_threads to %d\n", performance_test_config.run_config.intra_op_num_threads);
   session_options.SetIntraOpNumThreads(performance_test_config.run_config.intra_op_num_threads);
+#endif
 
   if (performance_test_config.run_config.execution_mode == ExecutionMode::ORT_PARALLEL) {
     fprintf(stdout, "Setting inter_op_num_threads to %d\n", performance_test_config.run_config.inter_op_num_threads);

From e57b735bb9a4912eb38f9de9d272d42c03779569 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Wed, 27 Nov 2019 10:15:50 -0800
Subject: [PATCH 05/13] Add a transformer to use Gelu approximation for cuda
 provider (#2480)

* Add Gelu Approximation Transformer to convert Gelu or AddGeluFusion to FastGelu to get better inference performance.
---
 .../core/optimizer/graph_transformer_level.h  |   5 +-
 .../core/optimizer/gelu_approximation.cc      | 119 ++++++++++++++++++
 .../core/optimizer/gelu_approximation.h       |  24 ++++
 .../core/optimizer/graph_transformer_utils.cc |   3 +-
 .../core/session/abi_session_options.cc       |   2 +-
 onnxruntime/core/session/inference_session.cc |  20 ++-
 .../test/framework/inference_session_test.cc  |   2 +-
 .../test/optimizer/graph_transform_test.cc    |  70 +++++++++++
 .../transform/approximation/gelu.onnx         |  13 ++
 .../approximation/gelu_add_bias.onnx          |  17 +++
 .../approximation/gelu_add_matmul.onnx        |  25 ++++
 .../gelu_add_shape_not_match.onnx             |  19 +++
 .../approximation/gelu_approximation_gen.py   |  86 +++++++++++++
 13 files changed, 387 insertions(+), 18 deletions(-)
 create mode 100644 onnxruntime/core/optimizer/gelu_approximation.cc
 create mode 100644 onnxruntime/core/optimizer/gelu_approximation.h
 create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu.onnx
 create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx
 create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx
 create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx
 create mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py

diff --git a/include/onnxruntime/core/optimizer/graph_transformer_level.h b/include/onnxruntime/core/optimizer/graph_transformer_level.h
index 4f2d5b305c..7aeb00ba66 100644
--- a/include/onnxruntime/core/optimizer/graph_transformer_level.h
+++ b/include/onnxruntime/core/optimizer/graph_transformer_level.h
@@ -12,9 +12,8 @@ enum class TransformerLevel : int {
   Level1,
   Level2,
   Level3,
-  // Convenience enum to always get the max available value.
-  // This way when we add more levels code which iterates over this enum does not need to change.
-  MaxTransformerLevel
+  // The max level should always be same as the last level.
+  MaxLevel = Level3
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/gelu_approximation.cc b/onnxruntime/core/optimizer/gelu_approximation.cc
new file mode 100644
index 0000000000..9982a3836d
--- /dev/null
+++ b/onnxruntime/core/optimizer/gelu_approximation.cc
@@ -0,0 +1,119 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/optimizer/initializer.h"
+#include "core/optimizer/gelu_approximation.h"
+#include "core/framework/tensorprotoutils.h"
+#include "core/optimizer/utils.h"
+#include "core/graph/graph_utils.h"
+#include "float.h"
+
+using namespace ONNX_NAMESPACE;
+using namespace onnxruntime::common;
+namespace onnxruntime {
+
+// FastGelu supports limited data types.
+static std::vector<std::string> supported_data_types{"tensor(float16)", "tensor(float)"};
+
+static bool IsSupportedDataType(const Node& node) {
+  for (const auto& input_arg : node.InputDefs()) {
+    if (std::find(supported_data_types.begin(), supported_data_types.end(),
+                  *(input_arg->Type())) == supported_data_types.end()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool CheckInputShape(const Node& node, const NodeArg& input, const NodeArg& bias) {
+  const TensorShapeProto* bias_shape = bias.Shape();
+  if (nullptr == bias_shape ||
+      bias_shape->dim_size() != 1 || 
+      !utils::HasDimValue(bias_shape->dim(0))) {
+    return false;
+  }
+  auto bias_length = bias_shape->dim(0).dim_value();
+
+  const TensorShapeProto* input_shape = input.Shape();
+  if (nullptr != input_shape) {
+    if (input_shape->dim_size() >= 1) {
+      int last_dim = input_shape->dim_size() - 1;
+      if (utils::HasDimValue(input_shape->dim(last_dim)) &&
+          input_shape->dim(last_dim).dim_value() == bias_length) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Input does not have shape. We will check its parent node.
+  // When the parent is MatMul and its 2nd input has shape like {*, bias_length},
+  // it means that the shape of MatMul output is good for FastGelu.
+  const Node* parent_node = graph_utils::GetInputNode(node, 0);
+  if (nullptr != parent_node &&
+      graph_utils::IsSupportedOptypeVersionAndDomain(*parent_node, "MatMul", {1, 9}, kOnnxDomain)) {
+    const NodeArg& input_b = *(parent_node->InputDefs()[1]);
+    if (optimizer_utils::ValidateShape(input_b, {-1, bias_length})) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static bool CheckGeluInputShape(const NodeArg& input) {
+  const TensorShapeProto* input_shape = input.Shape();
+  return nullptr != input_shape && input_shape->dim_size() >= 1;
+}
+
+static bool IsCandidateNode(Node& node, const std::unordered_set<std::string>& compatible_providers) {
+  if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "AddGeluFusion", {1}, kMSDomain)) {
+    return graph_utils::IsSupportedProvider(node, compatible_providers) &&
+           IsSupportedDataType(node) &&
+           CheckInputShape(node, *(node.InputDefs()[0]), *(node.InputDefs()[1]));
+  } else if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "Gelu", {1}, kMSDomain)) {
+    return graph_utils::IsSupportedProvider(node, compatible_providers) &&
+           IsSupportedDataType(node) &&
+           CheckGeluInputShape(*(node.InputDefs()[0]));
+  }
+  return false;
+}
+
+Status GeluApproximation::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
+  GraphViewer graph_viewer(graph);
+  const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
+
+  int count = 0;
+  for (auto node_index : node_topology_list) {
+    auto* p_node = graph.GetNode(node_index);
+    if (p_node == nullptr)
+      continue;  // we removed the node as part of an earlier fusion
+
+    Node& node = *p_node;
+    ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger));
+
+    if (IsCandidateNode(node, GetCompatibleExecutionProviders())) {
+      Node& fastgelu = graph.AddNode(
+          graph.GenerateNodeName("FastGelu"),
+          "FastGelu",
+          "Gelu approximation",
+          node.MutableInputDefs(),
+          node.MutableOutputDefs(), nullptr, kMSDomain);
+
+      fastgelu.SetExecutionProviderType(node.GetExecutionProviderType());
+
+      graph_utils::RemoveNodeOutputEdges(graph, node);
+      graph.RemoveNode(node.Index());
+
+      count++;
+    }
+  }
+
+  if (count > 0) {
+    modified = true;
+    LOGS(logger, INFO) << "Total Gelu Approximation (FastGelu) node count: " << count;
+  }
+
+  return Status::OK();
+}
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/gelu_approximation.h b/onnxruntime/core/optimizer/gelu_approximation.h
new file mode 100644
index 0000000000..def16eec92
--- /dev/null
+++ b/onnxruntime/core/optimizer/gelu_approximation.h
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/optimizer/graph_transformer.h"
+
+namespace onnxruntime {
+
+/**
+@Class GeluApproximation
+
+Rewrite graph to replace Gelu or AddGeluFusion by FastGelu node. FastGelu uses approximation for Gelu,
+and it is faster.
+*/
+class GeluApproximation : public GraphTransformer {
+ public:
+  GeluApproximation(const std::unordered_set<std::string>& compatible_execution_providers={}) noexcept
+      : GraphTransformer("GeluApproximation", compatible_execution_providers) {}
+
+  Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 8845752d2c..3f0b6c1b43 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -20,6 +20,7 @@
 #include "core/optimizer/free_dim_override_transformer.h"
 #include "core/optimizer/add_gelu_fusion.h"
 #include "core/optimizer/gelu_fusion.h"
+#include "core/optimizer/gelu_approximation.h"
 #include "core/optimizer/layer_norm_fusion.h"
 #include "core/optimizer/skip_layer_norm_fusion.h"
 #include "core/optimizer/reshape_fusion.h"
@@ -130,6 +131,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
 
       std::unordered_set<std::string> cuda_execution_providers = {onnxruntime::kCudaExecutionProvider};
       transformers.emplace_back(onnxruntime::make_unique<AddGeluFusion>(cuda_execution_providers));
+      transformers.emplace_back(onnxruntime::make_unique<GeluApproximation>(cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<SkipLayerNormFusion>(cuda_execution_providers));
 
 #endif
@@ -142,7 +144,6 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
         transformers.emplace_back(onnxruntime::make_unique<NchwcTransformer>());
       }
 #endif
-
     } break;
 
     default:
diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
index 026e7425fc..d032e2dbd5 100644
--- a/onnxruntime/core/session/abi_session_options.cc
+++ b/onnxruntime/core/session/abi_session_options.cc
@@ -130,7 +130,7 @@ ORT_API_STATUS_IMPL(OrtApis::SetSessionGraphOptimizationLevel, _In_ OrtSessionOp
       options->value.graph_optimization_level = onnxruntime::TransformerLevel::Level2;
       break;
     case ORT_ENABLE_ALL:
-      options->value.graph_optimization_level = onnxruntime::TransformerLevel::Level3;
+      options->value.graph_optimization_level = onnxruntime::TransformerLevel::MaxLevel;
       break;
     default:
       return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "graph_optimization_level is not valid");
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 5b13df4387..c9aad03a7d 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -404,7 +404,7 @@ common::Status InferenceSession::TransformGraph(onnxruntime::Graph& graph,
 
   // apply transformers except default transformers
   // Default transformers are required for correctness and they are owned and run by inference session
-  for (int i = static_cast<int>(TransformerLevel::Level1); i < static_cast<int>(TransformerLevel::MaxTransformerLevel); i++) {
+  for (int i = static_cast<int>(TransformerLevel::Level1); i <= static_cast<int>(TransformerLevel::MaxLevel); i++) {
     ORT_RETURN_IF_ERROR_SESSIONID_(graph_transformer_mgr.ApplyTransformers(graph, static_cast<TransformerLevel>(i), *session_logger_));
   }
 
@@ -1141,21 +1141,17 @@ void InferenceSession::AddPredefinedTransformers(GraphTransformerManager& transf
     }
   };
 
-  ORT_ENFORCE(graph_optimization_level < TransformerLevel::MaxTransformerLevel,
-              "Allowed values are 1 and 2. Current level is set to " +
+  ORT_ENFORCE(graph_optimization_level <= TransformerLevel::MaxLevel,
+              "Exceeded max transformer level. Current level is set to " +
                   std::to_string(static_cast<uint32_t>(graph_optimization_level)));
 
-  if ((graph_optimization_level >= TransformerLevel::Level1) || !custom_list.empty()) {
-    add_transformers(TransformerLevel::Level1);
+  for (int i = static_cast<int>(TransformerLevel::Level1); i <= static_cast<int>(TransformerLevel::MaxLevel); i++) {
+    TransformerLevel level = static_cast<TransformerLevel>(i);
+    if ((graph_optimization_level >= level) || !custom_list.empty()) {
+      add_transformers(level);
+    }
   }
 
-  if ((graph_optimization_level >= TransformerLevel::Level2) || !custom_list.empty()) {
-    add_transformers(TransformerLevel::Level2);
-  }
-
-  if ((graph_optimization_level >= TransformerLevel::Level3) || !custom_list.empty()) {
-    add_transformers(TransformerLevel::Level3);
-  }
 }
 
 common::Status InferenceSession::WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms) {
diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc
index 44f221aa3e..e5abaa7176 100644
--- a/onnxruntime/test/framework/inference_session_test.cc
+++ b/onnxruntime/test/framework/inference_session_test.cc
@@ -1428,7 +1428,7 @@ TEST(InferenceSessionTests, TestCopyToFromDevices) {
 TEST(InferenceSessionTests, TestRegisterTransformers) {
   string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
 
-  for (int i = static_cast<int>(TransformerLevel::Default); i < static_cast<int>(TransformerLevel::MaxTransformerLevel); i++) {
+  for (int i = static_cast<int>(TransformerLevel::Default); i <= static_cast<int>(TransformerLevel::MaxLevel); i++) {
     SessionOptions so;
     so.session_logid = "InferenceSessionTests.TestL1AndL2Transformers";
     so.graph_optimization_level = static_cast<TransformerLevel>(i);
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 9ef55cab71..9915505477 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -16,6 +16,7 @@
 #include "core/optimizer/gemm_activation_fusion.h"
 #include "core/optimizer/add_gelu_fusion.h"
 #include "core/optimizer/gelu_fusion.h"
+#include "core/optimizer/gelu_approximation.h"
 #include "core/optimizer/layer_norm_fusion.h"
 #include "core/optimizer/skip_layer_norm_fusion.h"
 #include "core/optimizer/graph_transformer.h"
@@ -1101,6 +1102,75 @@ TEST(GraphTransformationTests, AddGeluFusionTest) {
   ASSERT_TRUE(op_to_count["GeluFusion"] == 0);
 }
 
+// Test Gelu -> FastGelu
+TEST(GraphTransformationTests, GeluApproximation_Gelu) {
+  auto model_uri = MODEL_FOLDER "approximation/gelu.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["Gelu"], 0);
+  EXPECT_EQ(op_to_count["FastGelu"], 1);
+}
+
+// Test AddGeluFusion -> FastGelu
+TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Bias) {
+  auto model_uri = MODEL_FOLDER "approximation/gelu_add_bias.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["AddGeluFusion"], 0);
+  EXPECT_EQ(op_to_count["FastGelu"], 1);
+}
+
+// Test MatMul & AddGeluFusion -> MatMul & FastGelu
+TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_MatMul) {
+  auto model_uri = MODEL_FOLDER "approximation/gelu_add_matmul.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["AddGeluFusion"], 0);
+  EXPECT_EQ(op_to_count["MatMul"], 1);
+  EXPECT_EQ(op_to_count["FastGelu"], 1);
+}
+
+// Test AddGeluFusion with mis-match bias shape cannot convert to FastGelu.
+TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Shape_Not_Match) {
+  auto model_uri = MODEL_FOLDER "approximation/gelu_add_shape_not_match.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["AddGeluFusion"], 1);
+  EXPECT_EQ(op_to_count["FastGelu"], 0);
+}
+
 TEST(GraphTransformationTests, LayerNormFusionTest) {
   auto model_uri = MODEL_FOLDER "fusion/layer_norm.onnx";
   std::shared_ptr<Model> p_model;
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu.onnx b/onnxruntime/test/testdata/transform/approximation/gelu.onnx
new file mode 100644
index 0000000000..b183a1fc84
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/approximation/gelu.onnx
@@ -0,0 +1,13 @@
+:~
+#
+ACGelu_1"Gelu:com.microsoftGelu_NoBiasZ$
+A
+
+batch
+	seq_len
+�b$
+C
+
+batch
+	seq_len
+�B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx
new file mode 100644
index 0000000000..b96cb4752a
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx
@@ -0,0 +1,17 @@
+:�
+8
+A
+BCAddGeluFusion_1"AddGeluFusion:com.microsoftGelu_AddBiasZ$
+A
+
+batch
+	seq_len
+�Z
+B
+	
+�b$
+C
+
+batch
+	seq_len
+�B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx
new file mode 100644
index 0000000000..688fa1cd68
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx
@@ -0,0 +1,25 @@
+:�
+
+A
+BCMatMul_1"MatMul
+8
+C
+DEAddGeluFusion_1"AddGeluFusion:com.microsoftMatMul_AddGeluFusionZ$
+A
+
+batch
+	seq_len
+xZ
+B
+
+
+�
+�Z
+D
+	
+�b$
+E
+
+batch
+	seq_len
+�B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx
new file mode 100644
index 0000000000..5061df5da8
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx
@@ -0,0 +1,19 @@
+:�
+8
+A
+BCAddGeluFusion_1"AddGeluFusion:com.microsoftGelu_Add_ShapeNotMatchZ$
+A
+
+batch
+	seq_len
+�Z$
+B
+
+batch
+	seq_len
+�b$
+C
+
+batch
+	seq_len
+�B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py
new file mode 100644
index 0000000000..65dae0cbe8
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py
@@ -0,0 +1,86 @@
+import onnx
+from onnx import helper
+from onnx import TensorProto
+
+graph = helper.make_graph(
+    [ # nodes
+        # Add node before Gelu
+        helper.make_node("Gelu", ["A"], ["C"], "Gelu_1", domain="com.microsoft"),
+    ],
+    "Gelu_NoBias",  #name
+    [  # inputs
+        helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+    ],
+    [  # outputs
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+    ],
+    [  # initializers
+    ]
+)
+
+model = helper.make_model(graph)
+onnx.save(model, r'gelu.onnx')
+
+graph = helper.make_graph(
+    [ # nodes
+        # Add node before Gelu
+        helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"),
+    ],
+    "Gelu_AddBias",  #name
+    [  # inputs
+        helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+        helper.make_tensor_value_info('B', TensorProto.FLOAT, [3072]),
+    ],
+    [  # outputs
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+    ],
+    [  # initializers
+    ]
+)
+
+model = helper.make_model(graph)
+onnx.save(model, r'gelu_add_bias.onnx')
+
+graph = helper.make_graph(
+    [ # nodes
+        # Add node before Gelu
+        helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"),
+    ],
+    "Gelu_Add_ShapeNotMatch",  #name
+    [  # inputs
+        helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+        helper.make_tensor_value_info('B', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), # Bias shape not matched for FastGelu
+    ],
+    [  # outputs
+        helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+    ],
+    [  # initializers
+    ]
+)
+
+model = helper.make_model(graph)
+onnx.save(model, r'gelu_add_shape_not_match.onnx')
+
+
+graph = helper.make_graph(
+    [ # nodes
+        # Add node before Gelu
+        helper.make_node("MatMul", ["A", "B"], ["C"], "MatMul_1"),
+        helper.make_node("AddGeluFusion", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"),
+    ],
+    "MatMul_AddGeluFusion",  #name
+    [  # inputs
+        helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 'x']),
+        helper.make_tensor_value_info('B', TensorProto.FLOAT, [128, 3072]),
+        helper.make_tensor_value_info('D', TensorProto.FLOAT, [3072]),
+    ],
+    [  # outputs
+        helper.make_tensor_value_info('E', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
+    ],
+    [  # initializers
+    ]
+)
+
+model = helper.make_model(graph)
+onnx.save(model, r'gelu_add_matmul.onnx')
+

From ccbd778d0d15f2597f2cc5d644a58e99b44cca27 Mon Sep 17 00:00:00 2001
From: Yulong Wang <yulongw@microsoft.com>
Date: Wed, 27 Nov 2019 12:34:57 -0800
Subject: [PATCH 06/13] optimize CPU implementation of EmbedLayerNorm (#2491)

* optimize CPU implementation of EmbedLayerNorm
* use atomic in parallelization
---
 .../contrib_ops/cpu/bert/embed_layer_norm.cc  | 84 ++++++++++++-------
 1 file changed, 55 insertions(+), 29 deletions(-)

diff --git a/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc b/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc
index b3b77aa0e1..21a24e7e4c 100644
--- a/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/bert/embed_layer_norm.cc
@@ -4,6 +4,9 @@
 #include "embed_layer_norm.h"
 #include "embed_layer_norm_helper.h"
 #include "core/util/math_cpuonly.h"
+#include "core/platform/threadpool.h"
+
+#include <atomic>
 
 namespace onnxruntime {
 namespace contrib {
@@ -60,40 +63,63 @@ Status EmbedLayerNorm<T>::Compute(OpKernelContext* context) const {
   int position_embedding_length = static_cast<int>(position_embedding->Shape()[0]);
   int segment_embedding_length = static_cast<int>(segment_embedding->Shape()[0]);
 
-  ConstEigenArrayMap<T> word_embedding_arr(word_embedding->template Data<T>(), hidden_size, word_embedding_length);
-  ConstEigenArrayMap<T> position_embedding_arr(position_embedding->template Data<T>(), hidden_size, position_embedding_length);
-  ConstEigenArrayMap<T> segment_embedding_arr(segment_embedding->template Data<T>(), hidden_size, segment_embedding_length);
-  ConstEigenVectorMap<T> gamma_vector(gamma->template Data<T>(), hidden_size);
-  ConstEigenVectorMap<T> beta_vector(beta->template Data<T>(), hidden_size);
-  EigenArrayMap<T> output_arr(output->template MutableData<T>(), hidden_size, batch_size * sequence_length);
+  auto input_ids_data = input_ids->template Data<int>();
+  auto segment_ids_data = segment_ids->template Data<int>();
+  auto word_embedding_data = word_embedding->template Data<T>();
+  auto position_embedding_data = position_embedding->template Data<T>();
+  auto segment_embedding_data = segment_embedding->template Data<T>();
+  auto gamma_data = gamma->template Data<T>();
+  auto beta_data = beta->template Data<T>();
+  auto output_data = output->template MutableData<T>();
 
   // Calculate output
   {
-    size_t index = 0;
-    for (int b = 0; b < batch_size; b++) {
-      for (int s = 0; s < sequence_length; s++) {
-        int word_col_index = input_ids->template Data<int>()[index];
-        if (word_col_index < 0 || word_col_index >= word_embedding_length) {
-          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "word_col_index out of range");
-        }
-        int position_col_index = s;
-        if (position_col_index >= position_embedding_length) {
-          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "position_col_index out of range");
-        }
-        int segment_col_index = segment_ids->template Data<int>()[index];
-        if (segment_col_index < 0 || segment_col_index >= segment_embedding_length) {
-          return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "segment_col_index out of range");
-        }
+    std::atomic_bool failed{false};
 
-        output_arr.col(index) = word_embedding_arr.col(word_col_index) +
-                                position_embedding_arr.col(position_col_index) +
-                                segment_embedding_arr.col(segment_col_index);
-        output_arr.col(index) -= output_arr.col(index).mean();
-        output_arr.col(index) /= static_cast<T>(sqrt(output_arr.col(index).pow(2).mean() + 1.0e-13));
-        output_arr.col(index) *= gamma_vector.array();
-        output_arr.col(index) += beta_vector.array();
-        index++;
+    int n = batch_size * sequence_length;
+    concurrency::ThreadPool::TryBatchParallelFor(context->GetOperatorThreadPool(), n, [=, &failed](int index) {
+      int word_col_index = input_ids_data[index];
+      if (word_col_index < 0 || word_col_index >= word_embedding_length) {
+        failed.store(true, std::memory_order_release);
+        return;
       }
+      int position_col_index = index % sequence_length;
+      if (position_col_index >= position_embedding_length) {
+        failed.store(true, std::memory_order_release);
+        return;
+      }
+      int segment_col_index = segment_ids_data[index];
+      if (segment_col_index < 0 || segment_col_index >= segment_embedding_length) {
+        failed.store(true, std::memory_order_release);
+        return;
+      }
+
+      T* y = output_data + index * hidden_size;
+      const T* input_word_embedding = word_embedding_data + word_col_index * hidden_size;
+      const T* input_position_embedding = position_embedding_data + position_col_index * hidden_size;
+      const T* input_segment_embedding = segment_embedding_data + segment_col_index * hidden_size;
+
+      T sum = static_cast<T>(0);
+      for (int i = 0; i < hidden_size; i++) {
+        T subtotal = input_word_embedding[i] + input_position_embedding[i] + input_segment_embedding[i];
+        y[i] = subtotal;
+        sum += subtotal;
+      }
+      T mean = sum / hidden_size;
+      sum = 0;
+      for (int i = 0; i < hidden_size; i++) {
+        T a = y[i] - mean;
+        y[i] = a;
+        sum += a * a;
+      }
+      T e = sqrt(sum / hidden_size + static_cast<T>(1.0e-13));
+      for (int i = 0; i < hidden_size; i++) {
+        y[i] = y[i] / e * gamma_data[i] + beta_data[i];
+      }
+    });
+
+    if (failed.load(std::memory_order_acquire)) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "input index out of range");
     }
   }
 

From 04b6097db45610ddfa2c534ff0ea04589a1cc6ef Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Wed, 27 Nov 2019 12:42:45 -0800
Subject: [PATCH 07/13] Cuda Clip() for op set 11. (#2411)

* Cuda Clip() for op set 11.
* make min_val and max_value input CPU memory directly.
* Remove original cu file useless "#pragma once"
* merge duplicate logic into one class.
---
 .../providers/cuda/cuda_execution_provider.cc |  6 +-
 onnxruntime/core/providers/cuda/math/clip.cc  | 57 +++++++++++++------
 onnxruntime/core/providers/cuda/math/clip.h   | 24 +++++---
 .../core/providers/cuda/math/clip_impl.cu     |  1 -
 4 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 3d55b58446..6b83987403 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -363,7 +363,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Ceil);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Ceil);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Ceil);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, 10, float, Clip);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Reciprocal);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Reciprocal);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Reciprocal);
@@ -652,6 +652,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip);
 
 static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -676,7 +677,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, float, MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, double, MatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, MLFloat16, MatMul)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Clip)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, 10, float, Clip)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, float, Tile)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, double, Tile)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 6, MLFloat16, Tile)>,
@@ -1099,6 +1100,7 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, MaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool)>,      
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip)>,
   };
 
   for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/core/providers/cuda/math/clip.cc b/onnxruntime/core/providers/cuda/math/clip.cc
index df3d4dd77e..7bfa0f1fe4 100644
--- a/onnxruntime/core/providers/cuda/math/clip.cc
+++ b/onnxruntime/core/providers/cuda/math/clip.cc
@@ -8,38 +8,61 @@
 namespace onnxruntime {
 namespace cuda {
 
-#define REGISTER_KERNEL_TYPED(T)                                                \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                                \
-      Clip,                                                                     \
-      kOnnxDomain,                                                              \
-      6,                                                                        \
-      T,                                                                        \
-      kCudaExecutionProvider,                                                   \
-      KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+#define REGISTER_KERNEL_TYPED(T)                                  \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                        \
+      Clip,                                                       \
+      kOnnxDomain,                                                \
+      6,                                                          \
+      10,                                                         \
+      T,                                                          \
+      kCudaExecutionProvider,                                     \
+      KernelDefBuilder()                                          \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      Clip<T>);                                                   \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
+      Clip,                                                       \
+      kOnnxDomain,                                                \
+      11,                                                         \
+      T,                                                          \
+      kCudaExecutionProvider,                                     \
+      KernelDefBuilder()                                          \
+          .InputMemoryType<OrtMemTypeCPUInput>(1)                 \
+          .InputMemoryType<OrtMemTypeCPUInput>(2)                 \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
       Clip<T>);
 
 template <typename T>
 Status Clip<T>::ComputeInternal(OpKernelContext* ctx) const {
+  T min_val = min_;
+  T max_val = max_;
+  if (is_min_max_input_) {
+    const auto* min_input = ctx->Input<Tensor>(1);
+    const auto* max_input = ctx->Input<Tensor>(2);
+    if (min_input) {
+      ORT_ENFORCE(min_input->Shape().NumDimensions() == 0, "min should be a scalar.");
+      min_val = *(min_input->template Data<T>());
+    }
+    if (max_input) {
+      ORT_ENFORCE(max_input->Shape().NumDimensions() == 0, "max should be a scalar.");
+      max_val = *(max_input->template Data<T>());
+    }
+    ORT_ENFORCE(min_val <= max_val);
+  }
+
   const Tensor& X = *ctx->Input<Tensor>(0);
-  const TensorShape input_shape{X.Shape()};
+  const TensorShape& input_shape{X.Shape()};
   Tensor* Y = ctx->Output(0, input_shape);
-
   size_t count = input_shape.Size();
-
   if (count > 0) {
     auto* y_data = Y->template MutableData<T>();
     const auto* x_data = X.template Data<T>();
-    ClipImpl<T>(x_data, y_data, min_, max_, count);
+    ClipImpl<T>(x_data, y_data, min_val, max_val, count);
   }
 
   return Status::OK();
 }
 
-#define SPECIALIZED_COMPUTE(T) \
-  REGISTER_KERNEL_TYPED(T)     \
-  template Status Clip<T>::ComputeInternal(OpKernelContext* ctx) const;
-
-SPECIALIZED_COMPUTE(float)
+REGISTER_KERNEL_TYPED(float)
 
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/math/clip.h b/onnxruntime/core/providers/cuda/math/clip.h
index 1f748990c1..d600dc1e9c 100644
--- a/onnxruntime/core/providers/cuda/math/clip.h
+++ b/onnxruntime/core/providers/cuda/math/clip.h
@@ -10,21 +10,29 @@ namespace cuda {
 template <typename T>
 class Clip final : public CudaKernel {
  public:
-  Clip(const OpKernelInfo& info) : CudaKernel{info} {
-    auto min_val = -std::numeric_limits<T>::infinity();
-    auto max_val = std::numeric_limits<T>::infinity();
+  Clip(const OpKernelInfo& info) : CudaKernel{info}, is_min_max_input_(false) {
+    int start_version;
+    int end_version;
+    info.GetKernelDef().SinceVersion(&start_version, &end_version);
 
-    info.GetAttrOrDefault("min", &min_, min_val);
-    info.GetAttrOrDefault("max", &max_, max_val);
-
-    // Make sure the range of interval is sensible 
-    ORT_ENFORCE(min_val <= max_val);
+    if (start_version < 11) {
+      auto min_val = -std::numeric_limits<T>::infinity();
+      auto max_val = std::numeric_limits<T>::infinity();
+      info.GetAttrOrDefault("min", &min_, min_val);
+      info.GetAttrOrDefault("max", &max_, max_val);
+      ORT_ENFORCE(min_ <= max_);
+    } else {
+      min_ = -std::numeric_limits<T>::infinity();
+      max_ = std::numeric_limits<T>::infinity();
+      is_min_max_input_ = true;
+    }
   }
 
   Status ComputeInternal(OpKernelContext* context) const override;
 
  private:
   T min_, max_;
+  bool is_min_max_input_;
 };
 
 }  // namespace cuda
diff --git a/onnxruntime/core/providers/cuda/math/clip_impl.cu b/onnxruntime/core/providers/cuda/math/clip_impl.cu
index 42d415e946..a73c5ad812 100644
--- a/onnxruntime/core/providers/cuda/math/clip_impl.cu
+++ b/onnxruntime/core/providers/cuda/math/clip_impl.cu
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#pragma once
 #include "core/providers/cuda/math/clip_impl.h"
 #include "core/providers/cuda/cu_inc/common.cuh"
 

From 1fdf1006ac312295d164b2cbc0f88698122a7b8f Mon Sep 17 00:00:00 2001
From: Scott McKay <skottmckay@gmail.com>
Date: Thu, 28 Nov 2019 07:03:23 +1000
Subject: [PATCH 08/13] Various fixes coming out of discussions in #2436
 (#2497)

- Add --skip_tests option to build.py based on github feedback
  - Add debug output at end of run_subprocess so it's clearer when the output is from a different process running
  - Add check for scipy as it's required by gen_test_models.py for the onnx tests
  - Use log.warning instead of warnings.warn for consistency. We use the logger almost everywhere and somewhat randomly used warnings.warn in two places.
  - Add check for 'wheel' dependency not being found in setup.py and handle more gracefully
  - Fix invalid input name in Keras tests
---
 .../python/onnxruntime_test_python_keras.py   |  2 +-
 setup.py                                      | 11 ++++++--
 tools/ci_build/build.py                       | 26 ++++++++++++++-----
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/test/python/onnxruntime_test_python_keras.py b/onnxruntime/test/python/onnxruntime_test_python_keras.py
index 715a2a3490..77b788f234 100644
--- a/onnxruntime/test/python/onnxruntime_test_python_keras.py
+++ b/onnxruntime/test/python/onnxruntime_test_python_keras.py
@@ -61,7 +61,7 @@ class TestInferenceSessionKeras(unittest.TestCase):
         # runtime
         content = converted_model.SerializeToString()
         rt = onnxrt.InferenceSession(content)
-        input = {'conv2d_1_input_0': x}
+        input = {rt.get_inputs()[0].name: x}
         actual_rt = rt.run(None, input)
         self.assertEqual(len(actual_rt), 1)
         np.testing.assert_allclose(actual, actual_rt[0], rtol=1e-05, atol=1e-08)
diff --git a/setup.py b/setup.py
index 0151c17457..a8b6282dac 100644
--- a/setup.py
+++ b/setup.py
@@ -110,7 +110,9 @@ try:
                     logger.info('removing %s', file)
                     remove(file)
 
-except ImportError:
+except ImportError as error:
+    print("Error importing dependencies:")
+    print(error)
     bdist_wheel = None
 
 # Additional binaries
@@ -182,6 +184,11 @@ if nightly_build:
     date_suffix = str(datetime.datetime.now().date().strftime("%m%d"))
     version_number = version_number + ".dev" + date_suffix
 
+cmd_classes = {}
+if bdist_wheel is not None :
+    cmd_classes['bdist_wheel'] = bdist_wheel
+cmd_classes['build_ext'] = build_ext
+
 # Setup
 setup(
     name=package_name,
@@ -190,7 +197,7 @@ setup(
     long_description=long_description,
     author='Microsoft Corporation',
     author_email='onnx@microsoft.com',
-    cmdclass={'bdist_wheel': bdist_wheel, 'build_ext': build_ext},
+    cmdclass=cmd_classes,
     license="MIT License",
     packages=['onnxruntime',
               'onnxruntime.backend',
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index be5fe0fc05..98d1d4a2d7 100755
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -14,7 +14,6 @@ import re
 import shutil
 import subprocess
 import sys
-import warnings
 import hashlib
 from os.path import expanduser
 
@@ -60,6 +59,7 @@ Use the individual flags to only run the specified stages.
     If you've done an update that fetched external dependencies you have to build without --parallel the first time.
     Once that's done, run with "--build --parallel --test" to just build in parallel and run tests.''')
     parser.add_argument("--test", action='store_true', help="Run unit tests.")
+    parser.add_argument("--skip_tests", action='store_true', help="Skip all tests.")
 
     # enable ONNX tests
     parser.add_argument("--enable_onnx_tests", action='store_true',
@@ -150,7 +150,7 @@ Use the individual flags to only run the specified stages.
     parser.add_argument("--tensorrt_home", help="Path to TensorRT installation dir")
     parser.add_argument("--use_full_protobuf", action='store_true', help="Use the full protobuf library")
     parser.add_argument("--disable_contrib_ops", action='store_true', help="Disable contrib ops (reduces binary size)")
-    parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests")
+    parser.add_argument("--skip_onnx_tests", action='store_true', help="Explicitly disable all onnx related tests. Note: Use --skip_tests to skip all tests.")
     parser.add_argument("--enable_msvc_static_runtime", action='store_true', help="Enable static linking of MSVC runtimes.")
     parser.add_argument("--enable_language_interop_ops", action='store_true', help="Enable operator implemented in language other than cpp")
     parser.add_argument("--cmake_generator", choices=['Visual Studio 15 2017', 'Visual Studio 16 2019'],
@@ -190,7 +190,9 @@ def run_subprocess(args, cwd=None, capture=False, dll_path=None, shell=False, en
 
     stdout, stderr = (subprocess.PIPE, subprocess.STDOUT) if capture else (None, None)
     my_env.update(env)
-    return subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell)
+    completed_process = subprocess.run(args, cwd=cwd, check=True, stdout=stdout, stderr=stderr, env=my_env, shell=shell)
+    log.debug("Subprocess completed. Return code=" + str(completed_process.returncode))
+    return completed_process
 
 def update_submodules(source_dir):
     run_subprocess(["git", "submodule", "sync", "--recursive"], cwd=source_dir)
@@ -549,26 +551,33 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs, enab
                 return
             if is_windows():
                 cwd = os.path.join(cwd, config)
+
             run_subprocess([sys.executable, 'onnxruntime_test_python.py'], cwd=cwd, dll_path=dll_path)
+
             try:
                 import onnx
+                import scipy  # gen_test_models.py used by onnx_test has a dependency on scipy
                 onnx_test = True
-            except ImportError:
-                warnings.warn("onnx is not installed. Following test cannot be run.")
+            except ImportError as error:
+                log.exception(error)
+                log.warning("onnx or scipy is not installed. The ONNX tests will be skipped.")
                 onnx_test = False
+
             if onnx_test:
                 run_subprocess([sys.executable, 'onnxruntime_test_python_backend.py'], cwd=cwd, dll_path=dll_path)
-                run_subprocess([sys.executable, os.path.join(source_dir,'onnxruntime','test','onnx','gen_test_models.py'),'--output_dir','test_models'], cwd=cwd)
+                run_subprocess([sys.executable, os.path.join(source_dir,'onnxruntime','test','onnx','gen_test_models.py'),
+                                '--output_dir','test_models'], cwd=cwd)
                 run_subprocess([os.path.join(cwd,'onnx_test_runner'), 'test_models'], cwd=cwd)
                 if config != 'Debug':
                     run_subprocess([sys.executable, 'onnx_backend_test_series.py'], cwd=cwd, dll_path=dll_path)
+
             if not args.skip_keras_test:
                 try:
                     import onnxmltools
                     import keras
                     onnxml_test = True
                 except ImportError:
-                    warnings.warn("onnxmltools and keras are not installed. Following test cannot be run.")
+                    log.warning("onnxmltools and keras are not installed. The keras tests will be skipped.")
                     onnxml_test = False
                 if onnxml_test:
                     run_subprocess([sys.executable, 'onnxruntime_test_python_keras.py'], cwd=cwd, dll_path=dll_path)
@@ -838,6 +847,9 @@ def main():
         else:
             args.test = True
 
+    if args.skip_tests:
+        args.test = False
+
     if args.use_tensorrt:
         args.use_cuda = True
 

From 75b4747701b818a37af0dddca0d175f854e15964 Mon Sep 17 00:00:00 2001
From: Dmitri Smirnov <yuslepukhin@users.noreply.github.com>
Date: Wed, 27 Nov 2019 15:32:05 -0800
Subject: [PATCH 09/13] Fix a memleak in pybind. (#2503)

---
 onnxruntime/python/onnxruntime_pybind_mlvalue.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
index 491ca6e105..e2b68e121e 100644
--- a/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
+++ b/onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -233,7 +233,7 @@ void CreateSequenceOfTensors(AllocatorPtr alloc, const std::string& name_input,
         throw std::runtime_error("CreateSequenceOfTensors: Input is not a tensor");
       }
       auto p_tensor = CreateTensor(alloc, name_input, reinterpret_cast<PyArrayObject*>(py_obj));
-      p_seq_tensors->tensors[i] = std::move(*(p_tensor.release()));
+      p_seq_tensors->tensors[i] = std::move(*p_tensor);
     }
   }
 

From ee0bde6b69154a18e8719fa6bf4324b0f38f1f38 Mon Sep 17 00:00:00 2001
From: Zhang Lei <zhang.huanning@hotmail.com>
Date: Thu, 28 Nov 2019 03:03:43 -0800
Subject: [PATCH 10/13] Enable three type of Equal() to version 11. (#2508)

---
 .../providers/cuda/cuda_execution_provider.cc  | 18 ++++++++++++------
 .../cuda/math/binary_elementwise_ops.cc        |  8 +++++++-
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 6b83987403..b33be3c7e0 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -300,9 +300,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Greater);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Greater);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Greater);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, bool, Equal);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int32_t, Equal);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int64_t, Equal);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, bool, Equal);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int32_t, Equal);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int64_t, Equal);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 8, Expand);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Greater);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Greater);
@@ -653,6 +653,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, bool, Equal);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Equal);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int64_t, Equal);
 
 static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -749,9 +752,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, float, Greater)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, double, Greater)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 8, MLFloat16, Greater)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, bool, Equal)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int32_t, Equal)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, int64_t, Equal)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, bool, Equal)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int32_t, Equal)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 7, 10, int64_t, Equal)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 8, Expand)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int32_t, Greater)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, int64_t, Greater)>,
@@ -1101,6 +1104,9 @@ static void RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, double, MaxPool)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, MLFloat16, MaxPool)>,      
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, float, Clip)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, bool, Equal)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int32_t, Equal)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 11, int64_t, Equal)>,
   };
 
   for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc
index f2941ab5e2..23865e28e5 100644
--- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc
+++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.cc
@@ -176,6 +176,11 @@ Status BinaryElementwise<ShouldBroadcast>::Prepare(OpKernelContext* context, Bin
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int32_t) \
   BINARY_ELEMENTWISE_REGISTER_KERNEL_TYPED(name, ver, int64_t)
 
+#define BINARY_OP_REGISTER_VERSIONED_OIL(name, startver, endver)                      \
+  BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, bool)    \
+  BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, int32_t) \
+  BINARY_ELEMENTWISE_REGISTER_KERNEL_VERSIONED_TYPED(name, startver, endver, int64_t)
+
 #define BINARY_LOGICALOP_REGISTER_OIL(name, ver)                         \
   BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, bool)    \
   BINARY_ELEMENTWISE_LOGICALOP_REGISTER_KERNEL_TYPED(name, ver, int32_t) \
@@ -379,7 +384,8 @@ Status Less<T>::ComputeInternal(OpKernelContext* context) const {
 BINARY_OP_REGISTER_UZILHFD(Sum, 8)
 BINARY_OP_REGISTER_VERSIONED_UZILHFD(Sum, 6, 7)
 BINARY_LOGICALOP_REGISTER_UZILHFD(Greater, 9)
-BINARY_OP_REGISTER_OIL(Equal, 7)
+BINARY_OP_REGISTER_VERSIONED_OIL(Equal, 7, 10)
+BINARY_OP_REGISTER_OIL(Equal, 11)
 BINARY_OP_REGISTER_VERSIONED_HFD(Greater, 7, 8)
 BINARY_OP_REGISTER_HFD(Max, 8)
 BINARY_OP_REGISTER_VERSIONED_HFD(Max, 6, 7)

From 005305be6e201c7ac0d8619f6813027165c521e5 Mon Sep 17 00:00:00 2001
From: Yufeng Li <liyufeng1987@gmail.com>
Date: Thu, 28 Nov 2019 08:29:59 -0800
Subject: [PATCH 11/13] Implement AddGelu and SkipLayerNorm (#2487)

* Implement AddGelu and SkipLayerNorm
---
 .../contrib_ops/cpu/bert/bias_gelu_fusion.cc  |  76 +++++++++++
 .../contrib_ops/cpu/bert/bias_gelu_fusion.h   |  20 +++
 onnxruntime/contrib_ops/cpu/layer_norm.cc     |  70 +++++-----
 .../contrib_ops/cpu/skip_layer_norm.cc        | 129 ++++++++++++++++++
 onnxruntime/contrib_ops/cpu/skip_layer_norm.h |  21 +++
 .../contrib_ops/cpu_contrib_kernels.cc        |   8 +-
 .../cuda/math/binary_elementwise_ops.cc       |   2 +-
 .../cuda/math/binary_elementwise_ops.h        |   4 +-
 .../cuda/math/binary_elementwise_ops_impl.cu  |   2 +-
 .../cuda/math/binary_elementwise_ops_impl.h   |   2 +-
 .../contrib_ops/cuda_contrib_kernels.cc       |  12 +-
 .../core/graph/contrib_ops/contrib_defs.cc    |  22 ++-
 ...add_gelu_fusion.cc => bias_gelu_fusion.cc} |  25 +++-
 .../{add_gelu_fusion.h => bias_gelu_fusion.h} |   6 +-
 .../core/optimizer/gelu_approximation.cc      |   4 +-
 .../core/optimizer/graph_transformer_utils.cc |   7 +-
 .../test/contrib_ops/element_wise_ops_test.cc |  24 +---
 .../test/contrib_ops/layer_norm_op_test.cc    |   4 +-
 .../test/contrib_ops/skiplayernorm_op_test.cc |  70 +++++-----
 .../test/optimizer/graph_transform_test.cc    |  31 +----
 .../optimizer/graph_transform_utils_test.cc   |   2 +-
 .../transform/approximation/gelu.onnx         |   4 +-
 .../approximation/gelu_add_bias.onnx          |   8 +-
 .../approximation/gelu_add_matmul.onnx        |   8 +-
 .../gelu_add_shape_not_match.onnx             |  19 ---
 .../approximation/gelu_approximation_gen.py   |  25 +---
 ...gelu_fusion.onnx => bias_gelu_fusion.onnx} | Bin 419 -> 401 bytes
 .../{gelu_add_gen.py => bias_gelu_gen.py}     |   4 +-
 28 files changed, 402 insertions(+), 207 deletions(-)
 create mode 100644 onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc
 create mode 100644 onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h
 create mode 100644 onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
 create mode 100644 onnxruntime/contrib_ops/cpu/skip_layer_norm.h
 rename onnxruntime/core/optimizer/{add_gelu_fusion.cc => bias_gelu_fusion.cc} (69%)
 rename onnxruntime/core/optimizer/{add_gelu_fusion.h => bias_gelu_fusion.h} (63%)
 delete mode 100644 onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx
 rename onnxruntime/test/testdata/transform/fusion/{add_gelu_fusion.onnx => bias_gelu_fusion.onnx} (89%)
 rename onnxruntime/test/testdata/transform/fusion/{gelu_add_gen.py => bias_gelu_gen.py} (90%)

diff --git a/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc
new file mode 100644
index 0000000000..8d193eaca9
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.cc
@@ -0,0 +1,76 @@
+#include "bias_gelu_fusion.h"
+
+#include "core/util/math_cpuonly.h"
+#include "core/platform/threadpool.h"
+#include "core/mlas/inc/mlas.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+ONNX_OPERATOR_KERNEL_EX(
+    BiasGelu,
+    kMSDomain,
+    1,
+    kCpuExecutionProvider,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    BiasGelu<float>);
+
+template <typename T>
+Status BiasGelu<T>::Compute(OpKernelContext* ctx) const {
+  const Tensor* X = ctx->Input<Tensor>(0);
+  const auto input_dims = X->Shape().GetDims();
+  if (input_dims.size() < 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Shape of Input 0 is expected to have at least 1 dimension, got ", input_dims.size());
+  }
+
+  const Tensor* B = ctx->Input<Tensor>(1);
+  const auto bias_dims = B->Shape().GetDims();
+  if (bias_dims.size() != 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Input 1 is expected to have 1 dimension, got ", bias_dims.size());
+  }
+
+  int64_t bias_len = bias_dims[0];
+  if (bias_len != input_dims[input_dims.size() - 1]) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "dimension 0 of Input 1 should have same length as the last dimension of input 0");
+  }
+
+  Tensor* Y = ctx->Output(0, X->Shape());
+
+  AllocatorPtr alloc;
+  ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc));
+
+  BufferUniquePtr temp_data_buf_ptr = BufferUniquePtr(alloc->Alloc(sizeof(T) * X->Shape().Size()), BufferDeleter(alloc));
+  T* tmp_data = static_cast<T*>(temp_data_buf_ptr.get());
+
+  const T* X_data = X->template Data<T>();
+  const T* B_data = B->template Data<T>();
+  T* Y_data = Y->template MutableData<T>();
+  int64_t task_count = X->Shape().Size() / bias_len;
+
+  concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(),
+                                               static_cast<int32_t>(task_count),
+                                               [&](int32_t task_idx) {
+                                                 const T* p_input = X_data + task_idx * bias_len;
+                                                 T* p_output = Y_data + task_idx * bias_len;
+                                                 T* p_output_tmp = tmp_data + task_idx * bias_len;
+
+                                                 for (int64_t h = 0; h < bias_len; h++) {
+                                                   T value = p_input[h] + B_data[h];
+                                                   p_output[h] = value * static_cast<T>(M_SQRT1_2);
+                                                   p_output_tmp[h] = value * 0.5f;
+                                                 }
+
+                                                 MlasComputeErf(p_output, p_output, bias_len);
+
+                                                 for (int64_t h = 0; h < bias_len; h++) {
+                                                   p_output[h] = p_output_tmp[h] * (p_output[h] + 1.0f);
+                                                 }
+                                               });
+
+  return Status::OK();
+}
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h
new file mode 100644
index 0000000000..f8ccb6c0ef
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/bert/bias_gelu_fusion.h
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+template <typename T>
+class BiasGelu : public OpKernel {
+ public:
+  explicit BiasGelu(const OpKernelInfo& info) : OpKernel(info) {}
+  Status Compute(OpKernelContext* context) const override;
+};
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/layer_norm.cc b/onnxruntime/contrib_ops/cpu/layer_norm.cc
index d6d4a16751..18f3664088 100644
--- a/onnxruntime/contrib_ops/cpu/layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/layer_norm.cc
@@ -1,11 +1,13 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "core/framework/tensor.h"
-#include "core/util/math_cpuonly.h"
-#include "core/providers/common.h"
 #include "layer_norm.h"
 
+#include "core/framework/tensor.h"
+#include "core/platform/threadpool.h"
+#include "core/providers/common.h"
+#include "core/util/math_cpuonly.h"
+
 namespace onnxruntime {
 namespace contrib {
 
@@ -31,19 +33,22 @@ LayerNorm<T>::LayerNorm(const OpKernelInfo& op_kernel_info)
 }
 
 template <typename T>
-Status LayerNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
+Status LayerNorm<T>::Compute(OpKernelContext* p_ctx) const {
   // Inputs
-  const Tensor* X = p_op_kernel_context->Input<Tensor>(0);
-  const Tensor* scale = p_op_kernel_context->Input<Tensor>(1);
-  const Tensor* bias = p_op_kernel_context->Input<Tensor>(2);
+  const Tensor* X = p_ctx->Input<Tensor>(0);
+  const Tensor* scale = p_ctx->Input<Tensor>(1);
+  const Tensor* bias = p_ctx->Input<Tensor>(2);
   auto X_data = X->template Data<T>();
   auto scale_data = scale->template Data<T>();
   auto bias_data = bias->template Data<T>();
 
   const TensorShape& x_shape = X->Shape();
   const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
-  auto N = x_shape.SizeToDimension(axis);
-  auto M = x_shape.SizeFromDimension(axis);
+  auto norm_count = x_shape.SizeToDimension(axis);
+  auto norm_size = x_shape.SizeFromDimension(axis);
+
+  Tensor* Y = p_ctx->Output(0, x_shape);
+  auto Y_data = Y->template MutableData<T>();
 
   std::vector<int64_t> mean_inv_std_var_dim;
   mean_inv_std_var_dim.reserve(x_shape.NumDimensions());
@@ -56,16 +61,16 @@ Status LayerNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
   }
 
   AllocatorPtr alloc;
-  ORT_RETURN_IF_ERROR(p_op_kernel_context->GetTempSpaceAllocator(&alloc));
+  ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));
 
   T* mean_data = nullptr;
   BufferUniquePtr mean_data_buf_ptr;
 
-  Tensor* mean = p_op_kernel_context->Output(1, TensorShape(mean_inv_std_var_dim));
+  Tensor* mean = p_ctx->Output(1, TensorShape(mean_inv_std_var_dim));
   if (mean != nullptr) {
     mean_data = mean->template MutableData<T>();
   } else {
-    auto mean_data_buf = alloc->Alloc(sizeof(T) * N);
+    auto mean_data_buf = alloc->Alloc(sizeof(T) * norm_count);
     mean_data_buf_ptr = BufferUniquePtr(mean_data_buf, BufferDeleter(alloc));
     mean_data = static_cast<T*>(mean_data_buf_ptr.get());
   }
@@ -73,38 +78,39 @@ Status LayerNorm<T>::Compute(OpKernelContext* p_op_kernel_context) const {
   T* inv_std_var_data = nullptr;
   BufferUniquePtr inv_std_var_data_buf_ptr;
 
-  Tensor* inv_std_var = p_op_kernel_context->Output(2, TensorShape(mean_inv_std_var_dim));
+  Tensor* inv_std_var = p_ctx->Output(2, TensorShape(mean_inv_std_var_dim));
   if (inv_std_var != nullptr) {
     inv_std_var_data = inv_std_var->template MutableData<T>();
   } else {
-    auto inv_std_var_data_buf = alloc->Alloc(sizeof(T) * N);
+    auto inv_std_var_data_buf = alloc->Alloc(sizeof(T) * norm_count);
     inv_std_var_data_buf_ptr = BufferUniquePtr(inv_std_var_data_buf, BufferDeleter(alloc));
     inv_std_var_data = static_cast<T*>(inv_std_var_data_buf_ptr.get());
   }
 
-  std::memset(mean_data, 0, sizeof(T) * N);
-  std::memset(inv_std_var_data, 0, sizeof(T) * N);
+  concurrency::ThreadPool::TryBatchParallelFor(p_ctx->GetOperatorThreadPool(),
+                                               static_cast<int32_t>(norm_count),
+                                               [&](int32_t task_idx) {
+                                                 const T* p_input = X_data + task_idx * norm_size;
+                                                 T* p_output = Y_data + task_idx * norm_size;
 
-  ConstEigenArrayMap<T> X_arr(X_data, M, N);
-  for (int i = 0; i < N; ++i) {
-    mean_data[i] = X_arr.col(i).mean();
-    inv_std_var_data[i] = X_arr.col(i).square().mean() - mean_data[i] * mean_data[i];
-  }
+                                                 T mean = 0;
+                                                 T mean_square = 0;
 
-  // Compute Y = ((x - mean) * (inv_var) * scale + bias
-  Tensor* Y = p_op_kernel_context->Output(0, x_shape);
-  auto Y_data = Y->template MutableData<T>();
-  EigenArrayMap<T> Y_arr(Y_data, M, N);
+                                                 for (int64_t h = 0; h < norm_size; h++) {
+                                                   mean += p_input[h];
+                                                   mean_square += p_input[h] * p_input[h];
+                                                 }
 
-  ConstEigenVectorArrayMap<T> mean_arr(mean_data, N);
-  EigenVectorArrayMap<T> inv_std_var_arr(inv_std_var_data, N);
-  inv_std_var_arr = (inv_std_var_arr + epsilon_).sqrt().inverse();
+                                                 mean = mean / norm_size;
+                                                 mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon_);
 
-  Y_arr = (X_arr.rowwise() - mean_arr.transpose()).rowwise() * inv_std_var_arr.transpose();
+                                                 for (int64_t h = 0; h < norm_size; h++) {
+                                                   p_output[h] = (p_input[h] - mean) / mean_square * scale_data[h] + bias_data[h];
+                                                 }
 
-  ConstEigenVectorArrayMap<T> scale_arr(scale_data, M);
-  ConstEigenVectorArrayMap<T> bias_arr(bias_data, M);
-  Y_arr = (Y_arr.colwise() * scale_arr).colwise() + bias_arr;
+                                                 mean_data[task_idx] = mean;
+                                                 inv_std_var_data[task_idx] = mean_square;
+                                               });
 
   return Status::OK();
 }
diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
new file mode 100644
index 0000000000..7dd0ff155f
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -0,0 +1,129 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/framework/tensor.h"
+#include "core/util/math_cpuonly.h"
+#include "core/providers/common.h"
+#include "core/platform/threadpool.h"
+#include "skip_layer_norm.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+#define REGISTER_KERNEL_TYPED(T)                                  \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
+      SkipLayerNormalization,                                     \
+      kMSDomain,                                                  \
+      1,                                                          \
+      T,                                                          \
+      kCpuExecutionProvider,                                      \
+      KernelDefBuilder()                                          \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      SkipLayerNorm<T>);
+
+REGISTER_KERNEL_TYPED(float)
+REGISTER_KERNEL_TYPED(double)
+
+template <typename T>
+SkipLayerNorm<T>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
+    : OpKernel(op_kernel_info) {
+}
+
+template <typename T>
+Status SkipLayerNorm<T>::Compute(OpKernelContext* p_ctx) const {
+  const Tensor* input = p_ctx->Input<Tensor>(0);
+  const Tensor* skip = p_ctx->Input<Tensor>(1);
+  const Tensor* gamma = p_ctx->Input<Tensor>(2);
+  const Tensor* beta = p_ctx->Input<Tensor>(3);
+  const Tensor* bias = p_ctx->Input<Tensor>(4);
+  Tensor* output = p_ctx->Output(0, input->Shape());
+
+  const auto input_dims = input->Shape().GetDims();
+  if (input_dims.size() != 3) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "input is expected to have 3 dimensions, got ", input_dims.size());
+  }
+
+  if (input->Shape() != skip->Shape()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "skip is expected to have same shape as input");
+  }
+
+  const auto gamma_dims = gamma->Shape().GetDims();
+  if (gamma_dims.size() != 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "gamma is expected to have 1 dimension, got ", gamma_dims.size());
+  }
+  if (gamma_dims[0] != input_dims[2]) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Last dimension of gamma and input does not match");
+  }
+
+  const auto beta_dims = beta->Shape().GetDims();
+  if (beta_dims.size() != 1) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "beta is expected to have 1 dimension, got ", beta_dims.size());
+  }
+  if (beta_dims[0] != input_dims[2]) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Last dimension of beta and input does not match");
+  }
+
+  if (nullptr != bias) {
+    const auto bias_dims = bias->Shape().GetDims();
+    if (bias_dims.size() != 1) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "bias is expected to have 1 dimension, got ", bias_dims.size());
+    }
+    if (bias_dims[0] != input_dims[2]) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                             "Last dimension of bias and input does not match");
+    }
+  }
+
+  int64_t batch_size = input_dims[0];
+  int64_t sequence_length = input_dims[1];
+  int64_t hidden_size = input_dims[2];
+  int64_t task_count = batch_size * sequence_length;
+
+  const T* input_data = input->Data<T>();
+  const T* skip_data = skip->Data<T>();
+  const T* gamma_data = gamma->Data<T>();
+  const T* beta_data = beta->Data<T>();
+  const T* bias_data = bias == nullptr ? nullptr : bias->Data<T>();
+
+  T* output_data = output->MutableData<T>();
+
+  concurrency::ThreadPool::TryBatchParallelFor(p_ctx->GetOperatorThreadPool(),
+                                               static_cast<int32_t>(task_count),
+                                               [&](int32_t task_idx) {
+                                                 const T* p_input = input_data + task_idx * hidden_size;
+                                                 const T* p_skip = skip_data + task_idx * hidden_size;
+                                                 T* p_output = output_data + task_idx * hidden_size;
+
+                                                 T mean = 0;
+                                                 T mean_square = 0;
+
+                                                 for (int64_t h = 0; h < hidden_size; h++) {
+                                                   T value = p_input[h] + p_skip[h];
+                                                   if (nullptr != bias_data) {
+                                                     value += bias_data[h];
+                                                   }
+                                                   p_output[h] = value;
+                                                   mean += value;
+                                                   mean_square += value * value;
+                                                 }
+
+                                                 mean = mean / hidden_size;
+                                                 mean_square = sqrt(mean_square / hidden_size - mean * mean + float(1e-12));
+
+                                                 for (int64_t h = 0; h < hidden_size; h++) {
+                                                   p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h];
+                                                 }
+                                               });
+
+  return Status::OK();
+}  // namespace contrib
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
new file mode 100644
index 0000000000..1ede7c8f23
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
@@ -0,0 +1,21 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+#include "core/framework/tensor.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+template <typename T>
+class SkipLayerNorm final : public OpKernel {
+ public:
+  SkipLayerNorm(const OpKernelInfo& op_kernel_info);
+  Status Compute(OpKernelContext* p_op_kernel_context) const override;
+};
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu_contrib_kernels.cc
index 5d7a9122c0..5e456aa653 100644
--- a/onnxruntime/contrib_ops/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu_contrib_kernels.cc
@@ -33,6 +33,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu);
 
 // This section includes all op kernel declarations for former experimental ops which have now been removed from onnx.
 // To maintain backward compatibility these are added as contrib ops.
@@ -68,6 +69,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomai
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSNchwcDomain, 1, float, GlobalAveragePool);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, LayerNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization);
 
 Status RegisterNchwcKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
@@ -112,6 +115,7 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QuantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, CDist)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, CDist)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, BiasGelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Gelu)>,
 
       // These ops were experimental ops in onnx domain which have been removed now. We add them here as
@@ -138,7 +142,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, 9, ThresholdedRelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, Scale)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, float, LayerNormalization)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization)>};
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 1, double, LayerNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, SkipLayerNormalization)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, double, SkipLayerNormalization)>};
 
   for (auto& function_table_entry : function_table) {
     ORT_RETURN_IF_ERROR(kernel_registry.Register(function_table_entry()));
diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc
index 5449bbb491..1d32d89810 100644
--- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc
+++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.cc
@@ -63,7 +63,7 @@ namespace cuda {
   CONTRIB_BINARY_OP_TYPED(name, ver, float)     \
   CONTRIB_BINARY_OP_TYPED(name, ver, double)
 
-CONTRIB_BINARY_OP_HFD(AddGeluFusion, 1)
+CONTRIB_BINARY_OP_HFD(BiasGelu, 1)
 
 }  // namespace cuda
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h
index 69dba848a5..9807321d43 100644
--- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h
+++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops.h
@@ -16,9 +16,9 @@ namespace cuda {
 
 // AddGelu fuse Add + Gelu
 template <typename T>
-class AddGeluFusion final : public BinaryElementwise<ShouldBroadcast> {
+class BiasGelu final : public BinaryElementwise<ShouldBroadcast> {
  public:
-  AddGeluFusion(const OpKernelInfo& info) : BinaryElementwise(info) {
+  BiasGelu(const OpKernelInfo& info) : BinaryElementwise(info) {
   }
 
   Status ComputeInternal(OpKernelContext* context) const override;
diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu
index db4fcb18bb..4f8cc34b59 100644
--- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.cu
@@ -87,7 +87,7 @@ CONTRIB_BINARY_OPS()
 // D: double
 // O: bool
 
-CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(AddGeluFusion)
+CONTRIB_SPECIALIZED_BINARY_ELEMENTWISE_IMPL_HFD(BiasGelu)
 
 }  // namespace cuda
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h
index ee6cd680ae..22f1daa8e1 100644
--- a/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h
+++ b/onnxruntime/contrib_ops/cuda/math/binary_elementwise_ops_impl.h
@@ -13,7 +13,7 @@ namespace cuda {
 // 4. Add op kernel class definition in binary_elementwise_ops.h
 // 5. Add op kernel registration and compute specialization in binary_elementwise_ops.cc
 #define CONTRIB_BINARY_OPS() \
-  CONTRIB_BINARY_OP_NAME_EXPR(AddGeluFusion, _Gelu(a + b))
+  CONTRIB_BINARY_OP_NAME_EXPR(BiasGelu, _Gelu(a + b))
 
 // NOTE that cu files are compiled with nvcc and should not refer to any onnxruntime headers
 // so struct BinaryElementwisePreparation cannot be used here
diff --git a/onnxruntime/contrib_ops/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda_contrib_kernels.cc
index 31790b899b..674546219a 100644
--- a/onnxruntime/contrib_ops/cuda_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cuda_contrib_kernels.cc
@@ -14,9 +14,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, Gelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, Gelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, Gelu);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, AddGeluFusion);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, AddGeluFusion);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, AddGeluFusion);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, BiasGelu);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BiasGelu);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasGelu);
 
 // These ops were experimental ops in onnx domain which have been removed now. We add them here as
 // contrib ops to maintain backward compatibility
@@ -58,9 +58,9 @@ void RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, Gelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, Gelu)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, Gelu)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, AddGeluFusion)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, AddGeluFusion)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, AddGeluFusion)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, BiasGelu)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, BiasGelu)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, BiasGelu)>,
 
       // These ops were experimental ops in onnx domain which have been removed now. We add them here as
       // contrib ops to maintain backward compatibility
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 0266703696..92937dad05 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -1945,28 +1945,22 @@ inputs by their magnitude, rather than gates inputs by their sign as in ReLUs.)D
           "Constrain input and output types to float tensors.")
       .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
 
-  ONNX_CONTRIB_OPERATOR_SCHEMA(AddGeluFusion)
+  static const char* BiasGelu_ver1_doc =
+      R"DOC(Bias Gelu.
+It's an extension of Gelu. It takes the sum of input A and bias input B as the input of Gelu activation. )DOC";
+  ONNX_CONTRIB_OPERATOR_SCHEMA(BiasGelu)
       .SetDomain(kMSDomain)
       .SinceVersion(1)
       .SetSupportLevel(OpSchema::SupportType::EXPERIMENTAL)
-      .SetDoc("AddGeluFusion fuses Add+Gelu. The fused Add op is the parent node of the fused Gelu.")
-      .Input(0, "A", "The input data as Tensor that is the first input of fused Add.", "T")
-      .Input(1, "B", "The input data as Tensor that is the second input of fused Add.", "T")
+      .SetDoc(BiasGelu_ver1_doc)
+      .Input(0, "A", "The normal input data.", "T")
+      .Input(1, "B", "The bias input data that is a 1D tensor.", "T")
       .Output(0, "C", "The output.", "T")
       .TypeConstraint(
           "T",
           {"tensor(float16)", "tensor(float)", "tensor(double)"},
           "Constrain input and output types to float tensors.")
-      .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
-        propagateElemTypeFromInputToOutput(ctx, 0, 0);
-
-        if (hasNInputShapes(ctx, 2)) {
-          bidirectionalBroadcastShapeInference(
-              ctx.getInputType(0)->tensor_type().shape(),
-              ctx.getInputType(1)->tensor_type().shape(),
-              *ctx.getOutputType(0)->mutable_tensor_type()->mutable_shape());
-        }
-      });
+      .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput);
 
   RegisterBertSchemas();
 
diff --git a/onnxruntime/core/optimizer/add_gelu_fusion.cc b/onnxruntime/core/optimizer/bias_gelu_fusion.cc
similarity index 69%
rename from onnxruntime/core/optimizer/add_gelu_fusion.cc
rename to onnxruntime/core/optimizer/bias_gelu_fusion.cc
index 2c26f3aded..94b8c601dc 100644
--- a/onnxruntime/core/optimizer/add_gelu_fusion.cc
+++ b/onnxruntime/core/optimizer/bias_gelu_fusion.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include "core/optimizer/initializer.h"
-#include "core/optimizer/add_gelu_fusion.h"
+#include "core/optimizer/bias_gelu_fusion.h"
 #include "core/graph/graph_utils.h"
 #include <deque>
 
@@ -10,7 +10,7 @@ using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
 namespace onnxruntime {
 
-Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
+Status BiasGelu::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
   GraphViewer graph_viewer(graph);
   const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
 
@@ -29,6 +29,21 @@ Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, c
       continue;
     }
 
+    std::vector<NodeArg*> gelu_input;
+    const TensorShapeProto* add_input1_shape = node.MutableInputDefs()[0]->Shape();
+    const TensorShapeProto* add_input2_shape = node.MutableInputDefs()[1]->Shape();
+    if (add_input1_shape != nullptr &&
+        add_input1_shape->dim_size() == 1) {
+      gelu_input.push_back(node.MutableInputDefs()[1]);
+      gelu_input.push_back(node.MutableInputDefs()[0]);
+    } else if (add_input2_shape != nullptr &&
+               add_input2_shape->dim_size() == 1) {
+      gelu_input.push_back(node.MutableInputDefs()[0]);
+      gelu_input.push_back(node.MutableInputDefs()[1]);
+    } else {
+      continue;
+    }
+
     auto next_node_itr = node.OutputNodesBegin();
     if (next_node_itr == node.OutputNodesEnd()) {
       continue;
@@ -47,10 +62,10 @@ Status AddGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, c
     Node& add_node = node;
     Node& gelu_node = const_cast<Node&>(next_node);
 
-    Node& gelu_add_fusion_node = graph.AddNode(graph.GenerateNodeName("AddGeluFusion"),
-                                               "AddGeluFusion",
+    Node& gelu_add_fusion_node = graph.AddNode(graph.GenerateNodeName("BiasGelu"),
+                                               "BiasGelu",
                                                "fused Add and Gelu",
-                                               {add_node.MutableInputDefs()},
+                                               gelu_input,
                                                {},
                                                {},
                                                kMSDomain);
diff --git a/onnxruntime/core/optimizer/add_gelu_fusion.h b/onnxruntime/core/optimizer/bias_gelu_fusion.h
similarity index 63%
rename from onnxruntime/core/optimizer/add_gelu_fusion.h
rename to onnxruntime/core/optimizer/bias_gelu_fusion.h
index 58caed2b87..9a2f8e5e91 100644
--- a/onnxruntime/core/optimizer/add_gelu_fusion.h
+++ b/onnxruntime/core/optimizer/bias_gelu_fusion.h
@@ -11,10 +11,10 @@ namespace onnxruntime {
 @Class GeluFusion
 Fuse Add + Gelu to GeluFusion
 */
-class AddGeluFusion : public GraphTransformer {
+class BiasGelu : public GraphTransformer {
  public:
-  AddGeluFusion(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
-      : GraphTransformer("AddGeluFusion", compatible_execution_providers) {
+  BiasGelu(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
+      : GraphTransformer("BiasGelu", compatible_execution_providers) {
   }
 
   Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
diff --git a/onnxruntime/core/optimizer/gelu_approximation.cc b/onnxruntime/core/optimizer/gelu_approximation.cc
index 9982a3836d..494666dd9e 100644
--- a/onnxruntime/core/optimizer/gelu_approximation.cc
+++ b/onnxruntime/core/optimizer/gelu_approximation.cc
@@ -66,8 +66,8 @@ static bool CheckGeluInputShape(const NodeArg& input) {
   return nullptr != input_shape && input_shape->dim_size() >= 1;
 }
 
-static bool IsCandidateNode(Node& node, const std::unordered_set<std::string>& compatible_providers) {
-  if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "AddGeluFusion", {1}, kMSDomain)) {
+static bool IsCandidateNode(const Node& node, const std::unordered_set<std::string>& compatible_providers) {
+  if (graph_utils::IsSupportedOptypeVersionAndDomain(node, "BiasGelu", {1}, kMSDomain)) {
     return graph_utils::IsSupportedProvider(node, compatible_providers) &&
            IsSupportedDataType(node) &&
            CheckInputShape(node, *(node.InputDefs()[0]), *(node.InputDefs()[1]));
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 3f0b6c1b43..e2ae44fa18 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -18,7 +18,7 @@
 #include "core/optimizer/shape_to_initializer.h"
 #include "core/optimizer/nchwc_transformer.h"
 #include "core/optimizer/free_dim_override_transformer.h"
-#include "core/optimizer/add_gelu_fusion.h"
+#include "core/optimizer/bias_gelu_fusion.h"
 #include "core/optimizer/gelu_fusion.h"
 #include "core/optimizer/gelu_approximation.h"
 #include "core/optimizer/layer_norm_fusion.h"
@@ -128,12 +128,11 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
       transformers.emplace_back(onnxruntime::make_unique<GeluFusion>(cpu_cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<LayerNormFusion>(cpu_cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<AttentionFusion>(cpu_cuda_execution_providers));
+      transformers.emplace_back(onnxruntime::make_unique<BiasGelu>(cpu_cuda_execution_providers));
+      transformers.emplace_back(onnxruntime::make_unique<SkipLayerNormFusion>(cpu_cuda_execution_providers));
 
       std::unordered_set<std::string> cuda_execution_providers = {onnxruntime::kCudaExecutionProvider};
-      transformers.emplace_back(onnxruntime::make_unique<AddGeluFusion>(cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<GeluApproximation>(cuda_execution_providers));
-      transformers.emplace_back(onnxruntime::make_unique<SkipLayerNormFusion>(cuda_execution_providers));
-
 #endif
     } break;
 
diff --git a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
index 8cd60df3f9..10da620c53 100644
--- a/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
+++ b/onnxruntime/test/contrib_ops/element_wise_ops_test.cc
@@ -84,7 +84,7 @@ const std::vector<float> ComputeGeluWithErf(const std::vector<float>& input_data
   return output;
 }
 
-static void RunAddGeluFusionTest(
+static void RunBiasGeluTest(
     const std::vector<float>& input_a_data,
     const std::vector<float>& input_b_data,
     const std::vector<int64_t>& input_a_dims,
@@ -92,20 +92,18 @@ static void RunAddGeluFusionTest(
   if (HasCudaEnvironment(0)) {
     std::vector<float> output_data = ComputeGeluWithErf(Add_Simple(input_a_data, input_b_data));
 
-    OpTester tester("AddGeluFusion", 1, onnxruntime::kMSDomain);
+    OpTester tester("BiasGelu", 1, onnxruntime::kMSDomain);
 
     const std::vector<int64_t>& output_dims = input_a_dims.size() >= input_b_dims.size() ? input_a_dims : input_b_dims;
     tester.AddInput<float>("A", input_a_dims, input_a_data);
     tester.AddInput<float>("B", input_b_dims, input_b_data);
     tester.AddOutput<float>("C", output_dims, output_data);
 
-    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
-    execution_providers.push_back(DefaultCudaExecutionProvider());
-    tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+    tester.Run();
   }
 }
 
-TEST(AddGeluFusionTest, Two_One_Dim) {
+TEST(BiasGeluTest, Two_One_Dim) {
   std::vector<float> input_a_data = {
       0.8f, -0.5f, 0.0f, 1.f,
       0.5f, 0.2f, 0.3f, -0.6f};
@@ -113,19 +111,7 @@ TEST(AddGeluFusionTest, Two_One_Dim) {
   std::vector<float> input_b_data = {
       -0.5f, 0.6f, 1.2f, 2.1f};
 
-  RunAddGeluFusionTest(input_a_data, input_b_data, {2, 4}, {4});
-}
-
-TEST(AddGeluFusionTest, Two_Two_Dim) {
-  std::vector<float> input_a_data = {
-      0.8f, -0.5f, 0.0f, 1.f,
-      0.5f, 0.2f, 0.3f, -0.6f};
-
-  std::vector<float> input_b_data = {
-      -0.5f, 0.6f, 1.2f, 2.1f,
-      0.4f, 0.6f, 0.2f, -0.4f};
-
-  RunAddGeluFusionTest(input_a_data, input_b_data, {2, 4}, {2, 4});
+  RunBiasGeluTest(input_a_data, input_b_data, {2, 4}, {4});
 }
 
 }  // namespace test
diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
index da0b447b48..4ae04d7b06 100644
--- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc
@@ -89,7 +89,7 @@ class LayerNormOpTester : public OpTester {
   }
   void Run() {
 #ifndef NDEBUG
-     run_called_ = true;
+    run_called_ = true;
 #endif
     std::vector<MLValue> cpu_fetches;
     std::vector<MLValue> cuda_fetches;
@@ -110,7 +110,7 @@ class LayerNormOpTester : public OpTester {
     // Compare GPU with original subgraph
     if (DefaultCudaExecutionProvider()) {
       ASSERT_TRUE(cuda_fetches.size() == subgraph_fetches.size());
-      for(size_t i = 0; i < cuda_fetches.size(); i++) {
+      for (size_t i = 0; i < cuda_fetches.size(); i++) {
         if (cuda_fetches[i].IsTensor() && subgraph_fetches[i].IsTensor()) {
           VLOGS_DEFAULT(1) << "Checking tensor " << i;
           CheckTensor(subgraph_fetches[i].Get<Tensor>(), cuda_fetches[i].Get<Tensor>(), 1e-3, 1e-3);
diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
index 2a557f800e..0d239ca4a1 100644
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@@ -20,45 +20,45 @@ static void RunTest(
     int sequence_length,
     int hidden_size,
     bool use_float16 = false) {
-  int min_cuda_architecture = use_float16 ? 530 : 0;
-  if (HasCudaEnvironment(min_cuda_architecture)) {
+  // Input and output shapes
+  //   Input 0 - input: (batch_size, sequence_length, hidden_size)
+  //   Input 1 - skip : (batch_size, sequence_length, hidden_size)
+  //   Input 2 - gamma: (hidden_size)
+  //   Input 3 - beta : (hidden_size)
+  //   Output         : (batch_size, sequence_length, hidden_size)
+  std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
+  std::vector<int64_t> skip_dims = input_dims;
+  std::vector<int64_t> gamma_dims = {hidden_size};
+  std::vector<int64_t> beta_dims = gamma_dims;
+  std::vector<int64_t> bias_dims = gamma_dims;
+  std::vector<int64_t> output_dims = input_dims;
+
+  if (!use_float16) {
     OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain);
+    test.AddInput<float>("input", input_dims, input_data);
+    test.AddInput<float>("skip", skip_dims, skip_data);
+    test.AddInput<float>("gamma", gamma_dims, gamma_data);
+    test.AddInput<float>("beta", beta_dims, beta_data);
 
-    // Input and output shapes
-    //   Input 0 - input: (batch_size, sequence_length, hidden_size)
-    //   Input 1 - skip : (batch_size, sequence_length, hidden_size)
-    //   Input 2 - gamma: (hidden_size)
-    //   Input 3 - beta : (hidden_size)
-    //   Output         : (batch_size, sequence_length, hidden_size)
-    std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
-    std::vector<int64_t> skip_dims = input_dims;
-    std::vector<int64_t> gamma_dims = {hidden_size};
-    std::vector<int64_t> beta_dims = gamma_dims;
-    std::vector<int64_t> bias_dims = gamma_dims;
-    std::vector<int64_t> output_dims = input_dims;
-
-    if (use_float16) {
-      test.AddInput<MLFloat16>("input", input_dims, ToFloat16(input_data));
-      test.AddInput<MLFloat16>("skip", skip_dims, ToFloat16(skip_data));
-      test.AddInput<MLFloat16>("gamma", gamma_dims, ToFloat16(gamma_data));
-      test.AddInput<MLFloat16>("beta", beta_dims, ToFloat16(beta_data));
-      if (!bias_data.empty()) {
-        test.AddInput<MLFloat16>("bias", bias_dims, ToFloat16(bias_data));
-      }
-
-      test.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
-    } else {
-      test.AddInput<float>("input", input_dims, input_data);
-      test.AddInput<float>("skip", skip_dims, skip_data);
-      test.AddInput<float>("gamma", gamma_dims, gamma_data);
-      test.AddInput<float>("beta", beta_dims, beta_data);
-      if (!bias_data.empty()) {
-        test.AddInput<float>("bias", bias_dims, bias_data);
-      }
-
-      test.AddOutput<float>("output", output_dims, output_data);
+    if (!bias_data.empty()) {
+      test.AddInput<float>("bias", bias_dims, bias_data);
     }
 
+    test.AddOutput<float>("output", output_dims, output_data);
+    test.Run();
+  } else if (HasCudaEnvironment(530 /*min_cuda_architecture*/)) {
+    OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain);
+    test.AddInput<MLFloat16>("input", input_dims, ToFloat16(input_data));
+    test.AddInput<MLFloat16>("skip", skip_dims, ToFloat16(skip_data));
+    test.AddInput<MLFloat16>("gamma", gamma_dims, ToFloat16(gamma_data));
+    test.AddInput<MLFloat16>("beta", beta_dims, ToFloat16(beta_data));
+
+    if (!bias_data.empty()) {
+      test.AddInput<MLFloat16>("bias", bias_dims, ToFloat16(bias_data));
+    }
+
+    test.AddOutput<MLFloat16>("output", output_dims, ToFloat16(output_data));
+
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
     execution_providers.push_back(DefaultCudaExecutionProvider());
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 9915505477..854365515c 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -14,7 +14,7 @@
 #include "core/optimizer/conv_activation_fusion.h"
 #include "core/optimizer/dropout_elimination.h"
 #include "core/optimizer/gemm_activation_fusion.h"
-#include "core/optimizer/add_gelu_fusion.h"
+#include "core/optimizer/bias_gelu_fusion.h"
 #include "core/optimizer/gelu_fusion.h"
 #include "core/optimizer/gelu_approximation.h"
 #include "core/optimizer/layer_norm_fusion.h"
@@ -1082,15 +1082,15 @@ TEST(GraphTransformationTests, GeluFusionTest) {
   ASSERT_TRUE(op_to_count["Gelu"] == 1);
 }
 
-TEST(GraphTransformationTests, AddGeluFusionTest) {
-  auto model_uri = MODEL_FOLDER "fusion/add_gelu_fusion.onnx";
+TEST(GraphTransformationTests, BiasGeluTest) {
+  auto model_uri = MODEL_FOLDER "fusion/bias_gelu_fusion.onnx";
   std::shared_ptr<Model> p_model;
   ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
   Graph& graph = p_model->MainGraph();
 
   onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
   graph_transformation_mgr.Register(onnxruntime::make_unique<GeluFusion>(), TransformerLevel::Level2);
-  graph_transformation_mgr.Register(onnxruntime::make_unique<AddGeluFusion>(), TransformerLevel::Level2);
+  graph_transformation_mgr.Register(onnxruntime::make_unique<BiasGelu>(), TransformerLevel::Level2);
   auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
   ASSERT_TRUE(ret.IsOK());
   std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
@@ -1099,7 +1099,7 @@ TEST(GraphTransformationTests, AddGeluFusionTest) {
   ASSERT_TRUE(op_to_count["Erf"] == 0);
   ASSERT_TRUE(op_to_count["Mul"] == 0);
   ASSERT_TRUE(op_to_count["Gelu"] == 0);
-  ASSERT_TRUE(op_to_count["GeluFusion"] == 0);
+  ASSERT_TRUE(op_to_count["BiasGelu"] == 1);
 }
 
 // Test Gelu -> FastGelu
@@ -1132,7 +1132,7 @@ TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Bias) {
   ASSERT_TRUE(ret.IsOK());
 
   std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
-  EXPECT_EQ(op_to_count["AddGeluFusion"], 0);
+  EXPECT_EQ(op_to_count["BiasGelu"], 0);
   EXPECT_EQ(op_to_count["FastGelu"], 1);
 }
 
@@ -1149,28 +1149,11 @@ TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_MatMul) {
   ASSERT_TRUE(ret.IsOK());
 
   std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
-  EXPECT_EQ(op_to_count["AddGeluFusion"], 0);
+  EXPECT_EQ(op_to_count["BiasGelu"], 0);
   EXPECT_EQ(op_to_count["MatMul"], 1);
   EXPECT_EQ(op_to_count["FastGelu"], 1);
 }
 
-// Test AddGeluFusion with mis-match bias shape cannot convert to FastGelu.
-TEST(GraphTransformationTests, GeluApproximation_Gelu_Add_Shape_Not_Match) {
-  auto model_uri = MODEL_FOLDER "approximation/gelu_add_shape_not_match.onnx";
-  std::shared_ptr<Model> p_model;
-  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
-  Graph& graph = p_model->MainGraph();
-
-  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
-  graph_transformation_mgr.Register(onnxruntime::make_unique<GeluApproximation>(), TransformerLevel::Level2);
-  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
-  ASSERT_TRUE(ret.IsOK());
-
-  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
-  EXPECT_EQ(op_to_count["AddGeluFusion"], 1);
-  EXPECT_EQ(op_to_count["FastGelu"], 0);
-}
-
 TEST(GraphTransformationTests, LayerNormFusionTest) {
   auto model_uri = MODEL_FOLDER "fusion/layer_norm.onnx";
   std::shared_ptr<Model> p_model;
diff --git a/onnxruntime/test/optimizer/graph_transform_utils_test.cc b/onnxruntime/test/optimizer/graph_transform_utils_test.cc
index 190a289807..4351fd873a 100644
--- a/onnxruntime/test/optimizer/graph_transform_utils_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_utils_test.cc
@@ -51,7 +51,7 @@ TEST(GraphTransformerUtilsTests, TestGenerateGraphTransformers) {
     }
   }
   ASSERT_TRUE(rule_transformer && rule_transformer->RulesCount() == 1);
-  
+
   transformers = optimizer_utils::GenerateTransformers(TransformerLevel::Level2, {}, custom_list);
 #ifndef DISABLE_CONTRIB_OPS
   ASSERT_TRUE(transformers.size() == 1);
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu.onnx b/onnxruntime/test/testdata/transform/approximation/gelu.onnx
index b183a1fc84..1f18ac6f51 100644
--- a/onnxruntime/test/testdata/transform/approximation/gelu.onnx
+++ b/onnxruntime/test/testdata/transform/approximation/gelu.onnx
@@ -1,4 +1,4 @@
-:~
+:~
 #
 ACGelu_1"Gelu:com.microsoftGelu_NoBiasZ$
 A
@@ -10,4 +10,4 @@
 
 batch
 	seq_len
-�B
\ No newline at end of file
+�B
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx
index b96cb4752a..0c9ad44787 100644
--- a/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_bias.onnx
@@ -1,7 +1,7 @@
-:�
-8
+:�
+3
 A
-BCAddGeluFusion_1"AddGeluFusion:com.microsoftGelu_AddBiasZ$
+BCAddGeluFusion_1"BiasGelu:com.microsoftGelu_AddBiasZ$
 A
 
 batch
@@ -14,4 +14,4 @@
 
 batch
 	seq_len
-�B
\ No newline at end of file
+�B
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx
index 688fa1cd68..ff9f5a0ce5 100644
--- a/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_add_matmul.onnx
@@ -1,10 +1,10 @@
-:�
+:�
 
 A
 BCMatMul_1"MatMul
-8
+3
 C
-DEAddGeluFusion_1"AddGeluFusion:com.microsoftMatMul_AddGeluFusionZ$
+DEAddGeluFusion_1"BiasGelu:com.microsoftMatMul_AddGeluFusionZ$
 A
 
 batch
@@ -22,4 +22,4 @@
 
 batch
 	seq_len
-�B
\ No newline at end of file
+�B
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx b/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx
deleted file mode 100644
index 5061df5da8..0000000000
--- a/onnxruntime/test/testdata/transform/approximation/gelu_add_shape_not_match.onnx
+++ /dev/null
@@ -1,19 +0,0 @@
-:�
-8
-A
-BCAddGeluFusion_1"AddGeluFusion:com.microsoftGelu_Add_ShapeNotMatchZ$
-A
-
-batch
-	seq_len
-�Z$
-B
-
-batch
-	seq_len
-�b$
-C
-
-batch
-	seq_len
-�B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py
index 65dae0cbe8..3f2d02af91 100644
--- a/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py
+++ b/onnxruntime/test/testdata/transform/approximation/gelu_approximation_gen.py
@@ -24,7 +24,7 @@ onnx.save(model, r'gelu.onnx')
 graph = helper.make_graph(
     [ # nodes
         # Add node before Gelu
-        helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"),
+        helper.make_node("BiasGelu", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"),
     ],
     "Gelu_AddBias",  #name
     [  # inputs
@@ -41,32 +41,11 @@ graph = helper.make_graph(
 model = helper.make_model(graph)
 onnx.save(model, r'gelu_add_bias.onnx')
 
-graph = helper.make_graph(
-    [ # nodes
-        # Add node before Gelu
-        helper.make_node("AddGeluFusion", ["A", "B"], ["C"], "AddGeluFusion_1", domain="com.microsoft"),
-    ],
-    "Gelu_Add_ShapeNotMatch",  #name
-    [  # inputs
-        helper.make_tensor_value_info('A', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
-        helper.make_tensor_value_info('B', TensorProto.FLOAT, ['batch', 'seq_len', 3072]), # Bias shape not matched for FastGelu
-    ],
-    [  # outputs
-        helper.make_tensor_value_info('C', TensorProto.FLOAT, ['batch', 'seq_len', 3072]),
-    ],
-    [  # initializers
-    ]
-)
-
-model = helper.make_model(graph)
-onnx.save(model, r'gelu_add_shape_not_match.onnx')
-
-
 graph = helper.make_graph(
     [ # nodes
         # Add node before Gelu
         helper.make_node("MatMul", ["A", "B"], ["C"], "MatMul_1"),
-        helper.make_node("AddGeluFusion", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"),
+        helper.make_node("BiasGelu", ["C", "D"], ["E"], "AddGeluFusion_1", domain="com.microsoft"),
     ],
     "MatMul_AddGeluFusion",  #name
     [  # inputs
diff --git a/onnxruntime/test/testdata/transform/fusion/add_gelu_fusion.onnx b/onnxruntime/test/testdata/transform/fusion/bias_gelu_fusion.onnx
similarity index 89%
rename from onnxruntime/test/testdata/transform/fusion/add_gelu_fusion.onnx
rename to onnxruntime/test/testdata/transform/fusion/bias_gelu_fusion.onnx
index d495584a739e7ff0a51925fffd9ec25f1e9e3633..5adf9e1de1a280b3d97fa8a6d9b66f2b8a2b9c9d 100644
GIT binary patch
delta 30
lcmZ3?Jdv4&gVm~IBTEP)j{p~=lMpu-CkLYt>*O*<T>w|+1qc8D

delta 26
hcmbQpyqKAVgVk!*MwSpp79}o5r^yzK@{=<eH2_aF1+4%8

diff --git a/onnxruntime/test/testdata/transform/fusion/gelu_add_gen.py b/onnxruntime/test/testdata/transform/fusion/bias_gelu_gen.py
similarity index 90%
rename from onnxruntime/test/testdata/transform/fusion/gelu_add_gen.py
rename to onnxruntime/test/testdata/transform/fusion/bias_gelu_gen.py
index e318ad954a..a262b95ea3 100644
--- a/onnxruntime/test/testdata/transform/fusion/gelu_add_gen.py
+++ b/onnxruntime/test/testdata/transform/fusion/bias_gelu_gen.py
@@ -17,7 +17,7 @@ graph = helper.make_graph(
     "Gelu_Add_Fusion",  #name
     [  # inputs
         helper.make_tensor_value_info('A', TensorProto.FLOAT, ['unk_1', 'unk_2', 3072]),
-        helper.make_tensor_value_info('B', TensorProto.FLOAT, ['unk_1', 'unk_2', 3072]),
+        helper.make_tensor_value_info('B', TensorProto.FLOAT, [3072]),
     ],
     [  # outputs
         helper.make_tensor_value_info('C', TensorProto.FLOAT, ['unk_3', 'unk_4', 3072]),
@@ -30,4 +30,4 @@ graph = helper.make_graph(
 )
 
 model = helper.make_model(graph)
-onnx.save(model, r'add_gelu_fusion.onnx')
\ No newline at end of file
+onnx.save(model, r'bias_gelu_fusion.onnx')
\ No newline at end of file

From 60208463a9bb0a96bbdd693a89d405a04c8e3235 Mon Sep 17 00:00:00 2001
From: KeDengMS <kedeng@microsoft.com>
Date: Thu, 28 Nov 2019 08:35:56 -0800
Subject: [PATCH 12/13] [NupharEP] Enable parallel schedule (#2505)

* [NupharEP] Enable parallel schedule
* Update TVM with the fix to TVM threadpool to use OpenMP if possible
* Add parallel schedule when trying to vectorize
With this change, BERT squad perf on a 4-core (8 HT) CPU goes from 187ms to 150ms

* Address CR, docs and cmake update

* Doc fix

* Fix mkl

* Fix TVM windows build when using mklml
---
 cmake/CMakeLists.txt                          | 22 ++++-
 cmake/external/tvm                            |  2 +-
 .../Nuphar-ExecutionProvider.md               |  6 ++
 .../passes/scheduler/schedule_utils.cc        | 21 +++--
 .../codegen/passes/scheduler/schedule_utils.h |  5 ++
 .../nuphar/common/nuphar_settings.cc          | 17 +++-
 .../providers/nuphar/common/nuphar_settings.h |  3 +
 .../providers/nuphar/compiler/nuphar_handle.h |  2 +-
 .../compiler/nuphar_schedule_builder.cc       |  4 +-
 .../compiler/x86/scheduler/nuphar_scheduler.h |  8 ++
 .../x86/scheduler/ort_type_schedule.cc        | 82 ++++++++++++++++++-
 .../x86/scheduler/tvm_rule_schedule.cc        | 12 ++-
 .../nuphar/mti_x86/math/reduce_ops.cc         |  3 +-
 .../nuphar/nuphar_execution_provider.cc       |  4 +-
 .../nuphar/nuphar_execution_provider.h        |  9 +-
 .../nuphar/nuphar_provider_factory.cc         |  2 +-
 16 files changed, 170 insertions(+), 32 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 3e3583b445..eba8c0a0bd 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -343,6 +343,11 @@ if (onnxruntime_USE_ACL)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES arm_compute acl arm_compute_graph arm_compute_core)
 endif()
 
+# MKLML
+if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
+  include(mkldnn)
+endif()
+
 # TVM
 if (onnxruntime_USE_TVM)
   if (onnxruntime_USE_CUDA)
@@ -352,6 +357,19 @@ if (onnxruntime_USE_TVM)
     set(USE_LLVM ON)
     add_definitions(-DUSE_TVM_WITH_LLVM)
   endif()
+  if (onnxruntime_USE_OPENMP)
+    set(USE_OPENMP "gnu")
+  endif()
+  if (onnxruntime_USE_MKLML)
+    set(USE_OPENMP "intel")
+    # make sure MKLML in ORT is used by TVM
+    if (WIN32)
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_IMPORT_LIB})
+    else()
+      set(OMP_LIBRARY ${MKLML_LIB_DIR}/${IOMP5MD_SHARED_LIB})
+    endif()
+  endif()
+
   add_subdirectory(${PROJECT_SOURCE_DIR}/external/tvm EXCLUDE_FROM_ALL)
   set_target_properties(tvm PROPERTIES FOLDER "External/tvm")
   set_target_properties(tvm_topi PROPERTIES FOLDER "External/tvm")
@@ -501,10 +519,6 @@ include_directories(
   ${REPO_ROOT}/include/onnxruntime/core/session
 )
 
-if (onnxruntime_USE_MKLDNN OR onnxruntime_USE_MKLML)
-  include(mkldnn)
-endif()
-
 if(onnxruntime_USE_GEMMLOWP)
   add_definitions(-DUSE_GEMMLOWP=1)
 endif()
diff --git a/cmake/external/tvm b/cmake/external/tvm
index 9b3a424a91..c6e3efcdb0 160000
--- a/cmake/external/tvm
+++ b/cmake/external/tvm
@@ -1 +1 @@
-Subproject commit 9b3a424a91d6003db1993cdd7121e46696f220e8
+Subproject commit c6e3efcdb09aeda961a6badf76093ceac69db64d
diff --git a/docs/execution_providers/Nuphar-ExecutionProvider.md b/docs/execution_providers/Nuphar-ExecutionProvider.md
index 05c6d57bfc..17244fcb8d 100644
--- a/docs/execution_providers/Nuphar-ExecutionProvider.md
+++ b/docs/execution_providers/Nuphar-ExecutionProvider.md
@@ -17,6 +17,12 @@ You can use the Nuphar execution provider via the python wheel from the ONNX Run
 ## Performance and Accuracy Testing
 You can test your ONNX model's performance with [onnxruntime_perf_test](../../onnxruntime/test/perftest/README.md), or test accuracy with [onnx_test_runner](../../onnxruntime/test/onnx/README.txt). To run these tools with the Nuphar execution provider, please pass `-e nuphar` in command line options.
 
+Please note that Nuphar uses TVM thread pool and parallel schedule for multi-thread inference performance. When building with OpenMP or MKLML, TVM thread pool would use gomp or iomp as its implementation; otherwise, TVM creates its own thread pool. Because of this, the current default parallel schedule policy is:
+- Default to on for USE_OPENMP or USE_MKLML. User can use OMP_NUM_THREADS/MKL_NUM_THREADS to control TVM thread pool, as well as TVM_NUM_THREADS
+- Default to off for none of above. User can use TVM_NUM_THREADS to control TVM thread pool.
+
+This choice is to ensure to get ideal performance with the different build options. When build with USE_OPENMP or USE_MKLML, users would have to avoid thread confliction from OpenMP or MKL with their inference invocations anyway, so parallel schedule is enable to leverage existing thread pool. When not building with gomp or iomp, TVM thread pool is turned off to avoid confliction with user threads. If needed, user can set env or settings with [NUPHAR_PARALLEL_MIN_WORKLOADS](../../onnxruntime/core/providers/nuphar/common/nuphar_settings.cc#L61) to 0 to disable parallel schedule, or to some non-zero value to enable parallel schedule. The non-zero value indicates the minimal number of elements being computed per thread when parallel schedule would be turned on.
+
 ## Model Conversion and Quantization
 You may use Python script [model_editor.py](../../onnxruntime/core/providers/nuphar/scripts/model_editor.py) to turn LSTM/GRU/RNN ops to Scan ops for a given model, and then use [model_quantizer.py](../../onnxruntime/core/providers/nuphar/scripts/model_quantizer.py) to quantize MatMul ops into MatMulInteger ops.
 
diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
index 8e86f13eaa..3595229bbe 100644
--- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
+++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc
@@ -58,6 +58,19 @@ bool InsertRootScheduleAndClosure(
   return true;
 }
 
+// Check precondition for vectorize schedule
+bool ShouldTryVectorization(
+    const tvm::Tensor& tensor,
+    ScheduleContext& ctx) {
+  auto it = ctx.scheduled_tensors.find(tensor->op.get());
+  if (it != ctx.scheduled_tensors.end()) {
+    if (it->second > ScheduleType::ScheduleInline) {
+      return false;
+    }
+  }
+  return true;
+}
+
 // Check the schedule of tensor
 // If it is not scheduled, try to vectorize it.
 // Note TryVectorization has to use with compute_root.
@@ -66,12 +79,8 @@ bool TryVectorization(
     const tvm::Tensor& tensor,
     int64_t natural_vector_size,
     ScheduleContext& ctx) {
-  auto it = ctx.scheduled_tensors.find(tensor->op.get());
-  if (it != ctx.scheduled_tensors.end()) {
-    if (it->second > ScheduleType::ScheduleInline) {
-      return false;
-    }
-  }
+  if (!ShouldTryVectorization(tensor, ctx))
+    return false;
 
   auto shape = tensor->shape;
   auto rank = shape.size();
diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
index f928928f30..757366b551 100644
--- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
+++ b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h
@@ -26,6 +26,11 @@ bool InsertRootScheduleAndClosure(
     const tvm::Tensor& tensor,
     ScheduleContext& ctx);
 
+// Check precondition for vectorize schedule
+bool ShouldTryVectorization(
+    const tvm::Tensor& tensor,
+    ScheduleContext& ctx);
+
 // Check the schedule of tensor
 // If it is not scheduled, try to vectorize it.
 // Note TryVectorization has to use with compute_root.
diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
index 389e4f817b..271e12f50c 100644
--- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
+++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.cc
@@ -38,7 +38,8 @@ static const std::unordered_set<std::string> valid_keys = {
     kNupharCacheSoName,
     kNupharCacheModelChecksum,
     kNupharCacheForceNoJIT,
-    kNupharCodeGenTarget};
+    kNupharCodeGenTarget,
+    kNupharParallelMinWorkloads};
 
 void SetDefaultOptions(std::map<std::string, std::string>& options) {
   // create two temporary strings to get rid of the odr-use issue introduced
@@ -56,6 +57,20 @@ void SetDefaultOptions(std::map<std::string, std::string>& options) {
   std::string cache_so_name_opt(kNupharCacheSoName);
   std::string cache_so_name_default(kNupharCacheSoName_Default);
   options.insert(std::make_pair(cache_so_name_opt, cache_so_name_default));
+
+  std::string parallel_min_workloads_opt(kNupharParallelMinWorkloads);
+#if defined(USE_OPENMP) || defined(USE_MKLML)
+  // a rough estimate of workloads based on static dimensions for each thread, when using parallel schedule
+  // user may change it to 0 to turn it off,
+  // or use OMP_NUM_THREADS to control TVM thread pool similar to control MKL
+  unsigned int parallel_min_workloads_default = 64;
+#else
+  // turn off parallel schedule by default to avoid TVM thread pool confliction with others
+  // this is to ensure performance when user runs multiple inference threads, with each runs as single thread
+  // if needed, user can override it with settings, and use TVM_NUM_THREADS to control the thread pool
+  unsigned int parallel_min_workloads_default = 0;
+#endif
+  options.insert(std::make_pair(parallel_min_workloads_opt, std::to_string(parallel_min_workloads_default)));
 }
 
 void CreateNupharCodeGenSettings(const NupharExecutionProviderInfo& info) {
diff --git a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
index 8e836b7e84..5d2c149186 100644
--- a/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
+++ b/onnxruntime/core/providers/nuphar/common/nuphar_settings.h
@@ -45,6 +45,9 @@ constexpr static const char* kNupharActivations_DeepCpu = "deep_cpu_activation";
 // Option to control nuphar code generation target (avx / avx2 / avx512)
 constexpr static const char* kNupharCodeGenTarget = "nuphar_codegen_target";
 
+// Option to control nuphar code to run with parallel schedule
+constexpr static const char* kNupharParallelMinWorkloads = "nuphar_parallel_min_workloads";
+
 // cache version number (MAJOR.MINOR.PATCH) following https://semver.org/
 // 1. MAJOR version when you make incompatible changes that old cache files no longer work,
 // 2. MINOR version when you add functionality in a backwards - compatible manner, and
diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
index 84be4555ba..1b5f91990d 100644
--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_handle.h
@@ -28,7 +28,7 @@ struct NupharCodeGenHandle : codegen::CodeGenHandle {
   std::shared_ptr<tvm_codegen::TVMScheduleBuilder> schedule_builder;  // keep
   // maybe add a layout
   tvm_codegen::WeightLayoutRegistry* layout_registry;
-  bool enable_per_node_parallelized;  // TODO: change to config
+  int64_t parallel_min_workloads;
 
   bool allow_unaligned_buffers;  // move to another place
 
diff --git a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
index 3f54d50a56..47e7030551 100644
--- a/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/nuphar_schedule_builder.cc
@@ -9,6 +9,7 @@
 
 #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
 #include "core/providers/nuphar/compiler/x86/x86_target_info.h"
+#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 
 // TODO change name space
 namespace onnxruntime {
@@ -37,9 +38,8 @@ static void Traverse(const tvm::Tensor& tensor,
   if (is_real_output) {
     CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
     ORT_ENFORCE(target != nullptr);
-    int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
 
-    TryVectorization(tensor, natural_vector_size, ctx_schedule);  // to x86
+    TryVectorizationX86(tensor, ctx_codegen, ctx_schedule);
     InsertRootScheduleAndClosure(tensor, ctx_schedule);
   }
 
diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
index a41642e7e3..96e0fbc584 100644
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h
@@ -39,5 +39,13 @@ bool InputRootScheduleWithVectorizationX86(
     tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched);
 
+bool TryParallelX86(
+    const tvm::Tensor& tensor,
+    int64_t to_dim,  // fuse dims before to_dim for parallel schedule, 0 to fuse all but last dim
+    tvm_codegen::CodeGenContext& ctx_codegen,
+    tvm_codegen::ScheduleContext& ctx_sched);
+
+constexpr auto kNupharScheduleNoParallel = "nuphar_schedule_no_parallel";
+
 }  // namespace nuphar
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
index d32e35796a..2cc444dc18 100644
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/ort_type_schedule.cc
@@ -3,26 +3,99 @@
 
 #include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 
+#include "core/codegen/passes/scheduler/schedule_utils.h"
+#include "core/framework/op_kernel_info.h"
+#include "core/providers/nuphar/common/nuphar_settings.h"
 #include "core/providers/nuphar/common/analysis/subgraph_codegen_stats.h"
 #include "core/providers/nuphar/compiler/nuphar_codegen_ctx.h"
-#include "core/codegen/passes/scheduler/schedule_utils.h"
 #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_extern.h"
 #include "core/providers/nuphar/compiler/x86/scheduler/tensorize/intrin_gemv_ll_ir.h"
 #include "core/providers/nuphar/compiler/x86/x86_target_info.h"
-#include "core/framework/op_kernel_info.h"
 #include <tvm/tvm.h>
+#include <tvm/ir_pass.h>
 
 namespace onnxruntime {
 namespace nuphar {
 
+bool TryParallelX86(
+    const tvm::Tensor& tensor,
+    int64_t to_dim,
+    tvm_codegen::CodeGenContext& ctx_codegen,
+    tvm_codegen::ScheduleContext& ctx_sched) {
+  auto compute_op = tensor->op.as<tvm::ComputeOpNode>();
+  if (compute_op == nullptr) {
+    return false;
+  }
+  if (compute_op->attrs.count(kNupharScheduleNoParallel)) {
+    return false;
+  }
+
+  const auto& shape = tensor->shape;
+
+  int rank = gsl::narrow<int>(shape.size());
+  tvm::Array<tvm::IterVar> to_fuse_for_parallel;
+  int64_t rank_to_parallel = (to_dim ? to_dim : rank - 1);
+  for (int64_t i = 0; i < rank_to_parallel && i < gsl::narrow<int64_t>(compute_op->axis.size()); ++i) {
+    tvm::IterVar axis = compute_op->axis[i];
+    auto dom = axis->dom;
+    if (!tvm::ir::Equal(dom->extent, shape[i])) {
+      // only do parallel schedule on axis not being fused or split yet
+      rank_to_parallel = i;
+      break;
+    }
+    to_fuse_for_parallel.push_back(axis);
+  }
+
+  if (to_fuse_for_parallel.size() < 1) {
+    return false;
+  }
+
+  int64_t per_thread_static_dims = 1;
+  for (const auto& reduce_axis : compute_op->reduce_axis) {
+    const int64_t* static_range = tvm::as_const_int(reduce_axis->dom->extent);
+    if (static_range != nullptr) {
+      per_thread_static_dims *= *static_range;
+    }
+  }
+  for (int64_t i = rank_to_parallel; i < rank; ++i) {
+    auto dim = tvm::as_const_int(shape[i]);
+    if (dim != nullptr) {
+      per_thread_static_dims *= *dim;
+    }
+  }
+
+  // skip small per thread workloads, note that symbolic dims are ignored (treated as 1)
+  int64_t workloads_threshold = Promote<NupharCodeGenCtx>(&ctx_codegen)->GetCodeGenHandle()->parallel_min_workloads;
+  if (workloads_threshold <= 0 || per_thread_static_dims < workloads_threshold) {
+    return false;
+  }
+
+  tvm::IterVar parallel_axis;
+  if (to_fuse_for_parallel.size() > 1) {
+    ctx_sched.schedule[tensor->op].fuse(to_fuse_for_parallel, &parallel_axis);
+  } else {
+    parallel_axis = to_fuse_for_parallel[0];
+  }
+  ctx_sched.schedule[tensor->op].parallel(parallel_axis);
+  return true;
+}
+
 bool TryVectorizationX86(
     const tvm::Tensor& tensor,
     tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched) {
+  if (!ShouldTryVectorization(tensor, ctx_sched))
+    return false;
+
   CodeGenTargetX86* target = dynamic_cast<CodeGenTargetX86*>(ctx_codegen.GetCodeGenHandle()->codegen_target);
   ORT_ENFORCE(target != nullptr);
   int64_t natural_vector_size = target->NaturalVectorWidth(tensor->dtype.bits());
 
+  // try to use parallel schedule when vectorizing
+  // note that we don't do logic-or in return value here
+  // to make sure vectorization is always tried
+  TryParallelX86(tensor, 0, ctx_codegen, ctx_sched);
+
   return TryVectorization(tensor, natural_vector_size, ctx_sched);
 }
 
@@ -176,7 +249,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
 
   ctx_sched.schedule[tensor->op].reorder({b, oc_chunk, y, xo, ic_chunk, m, n, ic_block, xi, oc_block});
 
-  if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
+  if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
     tvm::Array<tvm::IterVar> fused_axis;
     fused_axis.push_back(b);
     fused_axis.push_back(oc_chunk);
@@ -186,6 +259,7 @@ static Status ConvScheduleX86(const tvm::Tensor& tensor,
     ctx_sched.schedule[tensor->op].fuse(fused_axis, &parallel_axis);
     ctx_sched.schedule[tensor->op].parallel(parallel_axis);
   }
+
   ctx_sched.schedule[tensor->op].vectorize(oc_block);
 
   return Status::OK();
@@ -243,7 +317,7 @@ static Status MatMul_2DWeight_Schedule(
   ctx_sched.schedule[CC->op].unroll(ki);
   ctx_sched.schedule[CC->op].vectorize(yc);
 
-  if (ctx_codegen.GetCodeGenHandle()->enable_per_node_parallelized) {
+  if (ctx_codegen.GetCodeGenHandle()->parallel_min_workloads > 0) {
     // parallelize
     tvm::Array<tvm::IterVar> fused_axis;
     for (size_t d = 0; d < C_rank - 2; ++d)
diff --git a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
index 646f22d7d7..d3f923f4e6 100644
--- a/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
+++ b/onnxruntime/core/providers/nuphar/compiler/x86/scheduler/tvm_rule_schedule.cc
@@ -22,6 +22,7 @@ bool TVM_SCHEDULER_CLASS(Extern, NupharX86TVMRule)::Evaluate(
 
 static bool ReduceVScheduleNupharX86(
     const tvm::Tensor& tensor,
+    tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched) {
   InsertRootScheduleAndClosure(tensor, ctx_sched);
 
@@ -55,6 +56,8 @@ static bool ReduceVScheduleNupharX86(
     if (shape.size() > 0)
       head_dim = as_const_int(shape[0]);
 
+    bool try_parallel = true;
+
     // unroll packed reduce by checking head dim
     if (nullptr != head_dim) {
       // if head_dim is already fused, don't unroll
@@ -81,8 +84,13 @@ static bool ReduceVScheduleNupharX86(
 
         ctx_sched.schedule[tensor->op].reorder(reorder_axis);
         ctx_sched.schedule[tensor->op].unroll(x0);
+        try_parallel = false;
       }
     }
+
+    if (try_parallel) {
+      TryParallelX86(tensor, *fuse_dim, ctx_codegen, ctx_sched);
+    }
   } else if (compute_op->axis.size() > 0 &&
              tvm::as_const_int(tensor->shape[0]) != nullptr) {
     tvm::IterVar x = compute_op->axis[0];
@@ -101,7 +109,7 @@ static bool ReduceVScheduleNupharX86(
 bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
     const tvm::Tensor& tensor,
     const Node*,
-    tvm_codegen::CodeGenContext&,
+    tvm_codegen::CodeGenContext& ctx_codegen,
     tvm_codegen::ScheduleContext& ctx_sched) {
   // respect topi::kCommReduce
   if (tensor->op->tag == topi::kCommReduce) {
@@ -109,7 +117,7 @@ bool TVM_SCHEDULER_CLASS(Reduce, NupharX86TVMRule)::Evaluate(
   }
 
   if (tensor->op->tag == nuphar::kNupharVReduce) {
-    return ReduceVScheduleNupharX86(tensor, ctx_sched);
+    return ReduceVScheduleNupharX86(tensor, ctx_codegen, ctx_sched);
   }
 
   // unknown goes to InsertRootScheduleAndClosure
diff --git a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
index e816c77ed9..2bcb6f0010 100644
--- a/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
+++ b/onnxruntime/core/providers/nuphar/mti_x86/math/reduce_ops.cc
@@ -7,6 +7,7 @@
 #include "core/codegen/mti/mti_tvm_utils.h"
 #include "core/codegen/mti/tensor/pad_ops.h"
 #include "core/codegen/mti/tensor/reshape_ops.h"
+#include "core/providers/nuphar/compiler/x86/scheduler/nuphar_scheduler.h"
 #include <topi/reduction.h>
 
 namespace onnxruntime {
@@ -85,7 +86,7 @@ tvm::Tensor ReduceValueWithoutSplit(const tvm::Tensor& X,
 
   tvm::Map<std::string, tvm::NodeRef> attrs;
   attrs.Set(kNupharVReduceFuseDim, tvm::Expr(fuse_dim));
-
+  attrs.Set(kNupharScheduleNoParallel, tvm::Expr(true));
   return tvm::compute(output_shape, l_out, name + "_regular_reduce", kNupharVReduce, attrs);
 }
 
diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
index b36d526828..d244c51d30 100644
--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.cc
@@ -125,8 +125,8 @@ NupharExecutionProvider::NupharExecutionProvider(const NupharExecutionProviderIn
 
   handle->shape_inference = whole_graph_shape_infer_;
 
-  // TODO: remove
-  handle->enable_per_node_parallelized = info.enable_per_node_parallel;
+  handle->parallel_min_workloads = std::stoi(settings.GetOptionValue(kNupharParallelMinWorkloads));
+
   // TODO: remove
   handle->allow_unaligned_buffers = info.allow_unaligned_buffers;  // TODO remove this
 
diff --git a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
index 40c7821032..07ca11e4f0 100644
--- a/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
+++ b/onnxruntime/core/providers/nuphar/nuphar_execution_provider.h
@@ -31,9 +31,6 @@ constexpr const char* default_nuphar_target_str = stackvm_target_str;
 
 // Information needed to construct Nuphar execution providers.
 struct NupharExecutionProviderInfo {
-  // By default, let provider decide the target by passing in empty string.
-  bool enable_per_node_parallel;  // TODO: remove
-
   // this flag set TVM build_config with data_alignment=1, at the cost of performance
   bool allow_unaligned_buffers;
 
@@ -43,10 +40,8 @@ struct NupharExecutionProviderInfo {
   std::string settings;
 
   explicit NupharExecutionProviderInfo(bool unaligned_buffers,
-                                       const std::string& str_settings = "",
-                                       bool per_node_parallel = true)
-      : enable_per_node_parallel(per_node_parallel),
-        allow_unaligned_buffers(unaligned_buffers),
+                                       const std::string& str_settings = "")
+      : allow_unaligned_buffers(unaligned_buffers),
         settings(str_settings) {}
   NupharExecutionProviderInfo() = default;
 };
diff --git a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
index 4fdcbfa277..e5c2989e51 100644
--- a/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
+++ b/onnxruntime/core/providers/nuphar/nuphar_provider_factory.cc
@@ -21,7 +21,7 @@ struct NupharExecutionProviderFactory : IExecutionProviderFactory {
 };
 
 std::unique_ptr<IExecutionProvider> NupharExecutionProviderFactory::CreateProvider() {
-  NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_, /*per_node_parallel*/ true);
+  NupharExecutionProviderInfo info(allow_unaligned_buffers_, settings_);
   return onnxruntime::make_unique<NupharExecutionProvider>(info);
 }
 

From 0edd4ef6ca7323659693eb92b3c9b3a97f13970d Mon Sep 17 00:00:00 2001
From: liuziyue <ziyue.liu7@gmail.com>
Date: Thu, 28 Nov 2019 14:03:58 -0800
Subject: [PATCH 13/13] EmbedLayerNormalization fusion (#2452)

Embed Layer Normalization Fusion
---
 .../core/optimizer/embed_layer_norm_fusion.cc | 270 ++++++++++++++++++
 .../core/optimizer/embed_layer_norm_fusion.h  |  24 ++
 .../core/optimizer/graph_transformer_utils.cc |   2 +
 .../core/optimizer/skip_layer_norm_fusion.h   |   7 +-
 .../test/optimizer/graph_transform_test.cc    |  43 +++
 .../fusion/embed_layer_norm_format1.onnx      | Bin 0 -> 940 bytes
 .../fusion/embed_layer_norm_format2.onnx      | Bin 0 -> 1060 bytes
 7 files changed, 341 insertions(+), 5 deletions(-)
 create mode 100644 onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
 create mode 100644 onnxruntime/core/optimizer/embed_layer_norm_fusion.h
 create mode 100644 onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format1.onnx
 create mode 100644 onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format2.onnx

diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
new file mode 100644
index 0000000000..c2f6136297
--- /dev/null
+++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc
@@ -0,0 +1,270 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#include "core/optimizer/initializer.h"
+#include "core/optimizer/embed_layer_norm_fusion.h"
+#include "core/graph/graph_utils.h"
+#include "float.h"
+
+#define DEBUG_LOG(x) LOGS(logger, VERBOSE) << x
+
+using namespace ONNX_NAMESPACE;
+using namespace onnxruntime::common;
+namespace onnxruntime {
+
+// Add a Cast to convert Input from int64 to int32.
+static NodeArg* CastToInt32(Graph& graph, NodeArg* input, ProviderType provider_type) {
+  const TensorShapeProto* input_shape = input->Shape();
+  TypeProto input_int32;
+  input_int32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
+  auto dim0 = input_int32.mutable_tensor_type()->mutable_shape()->add_dim();
+  *dim0 = input_shape->dim(0);
+  auto dim1 = input_int32.mutable_tensor_type()->mutable_shape()->add_dim();
+  *dim1 = input_shape->dim(1);
+  auto& cast32 = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(input->Name() + "_Int32"), &input_int32);
+
+  Node& node = graph.AddNode(graph.GenerateNodeName(input->Name() + "_Cast"),
+                             "Cast",
+                             "Cast Input from int64 to int32",
+                             {input},
+                             {&cast32},
+                             nullptr,
+                             kOnnxDomain);
+
+  // Add attribute: "to" = 6
+  ONNX_NAMESPACE::AttributeProto to;
+  to.set_name("to");
+  to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
+  to.set_i(static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_INT32));
+  node.AddAttribute("to", to);
+
+  node.SetExecutionProviderType(provider_type);
+  return &cast32;
+}
+
+static NodeArg* CheckInput(Graph& graph, NodeArg* input, ProviderType provider_type, const logging::Logger& logger) {
+  // Validate input shape (batch_size, sequence_length) and data type.
+  // Note that batch_size and sequence_length could be symbolic.
+  const TensorShapeProto* input_shape = input->Shape();
+  if (input_shape == nullptr || input_shape->dim_size() != 2 || input->Type() == nullptr) {
+    DEBUG_LOG("Mask shape is unknown or not 2D, or data type unknown");
+    return nullptr;
+  }
+
+  auto data_type = input->TypeAsProto()->tensor_type().elem_type();
+  if (data_type != ONNX_NAMESPACE::TensorProto_DataType_INT64 &&
+      data_type != ONNX_NAMESPACE::TensorProto_DataType_INT32) {
+    DEBUG_LOG("Input data type is not int32 or int64");
+    return nullptr;
+  }
+
+  if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64) {
+    return CastToInt32(graph, input, provider_type);
+  }
+  return input;
+}
+
+/**
+Embed Layer Normalization will fuse embeddings and mask processing into one node : 
+The embeddings before conversion:
+  (input_ids) -------->  Gather ----------+       (segment_ids)
+    |                                    |            |
+    |                                    v            v
+    +--> Shape --> Expand -> Gather---->Add         Gather
+    |                ^                   |            |
+    |                |                   v            v
+    +---(optional graph)               SkipLayerNormalization
+
+*/
+Status EmbedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
+  GraphViewer graph_viewer(graph);
+  const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
+  for (auto node_index : node_topology_list) {
+    auto* p_layer_norm = graph.GetNode(node_index);
+    if (p_layer_norm == nullptr)
+      continue;  // we removed the node as part of an earlier fusion
+
+    Node& layer_norm_node = *p_layer_norm;
+    ORT_RETURN_IF_ERROR(Recurse(layer_norm_node, modified, graph_level, logger));
+    if (!graph_utils::IsSupportedOptypeVersionAndDomain(layer_norm_node, "LayerNormalization", {1}, kOnnxDomain) ||
+        !graph_utils::IsSupportedProvider(layer_norm_node, GetCompatibleExecutionProviders())) {
+      continue;
+    }
+    // Find Attention after SkipLayerNormalization
+    const Node* p_attention = graph_utils::FirstChildByType(layer_norm_node, "Attention");
+    // Stop EmbedLayerNormalization fusion if Attention is not found.
+    if (p_attention == nullptr) {
+      return Status::OK();
+    }
+    Node& attention_node = *graph.GetNode(p_attention->Index());
+    if (!graph_utils::IsSupportedOptypeVersionAndDomain(attention_node, "Attention", {1}, kMSDomain) ||
+        !graph_utils::IsSupportedProvider(attention_node, GetCompatibleExecutionProviders())) {
+      continue;
+    }
+    // Find ReduceSum --> Attention
+    std::vector<const Node::EdgeEnd*> edges;
+    if (!graph_utils::FindPath(attention_node, true, {{0, 3, "ReduceSum", {1, 11}, kOnnxDomain}}, edges, logger)) {
+      continue;
+    }
+    Node& reduce_sum_node = *graph.GetNode(edges[0]->GetNode().Index());
+
+    // Find Add --> LayerNormalization
+    if (!graph_utils::FindPath(layer_norm_node, true, {{0, 0, "Add", {7}, kOnnxDomain}}, edges, logger)) {
+      continue;
+    }
+    Node& layer_norm_add_node = *graph.GetNode(edges[0]->GetNode().Index());
+
+    // Traceback the SkipLayerNormalization node to find Gather --> SkipLayerNormalization
+    std::vector<graph_utils::EdgeEndToMatch> segment_embedding_path{
+        {0, 1, "Gather", {1, 11}, kOnnxDomain}};
+    if (!graph_utils::FindPath(layer_norm_add_node, true, segment_embedding_path, edges, logger)) {
+      continue;
+    }
+    Node& segment_gather_node = *graph.GetNode(edges[0]->GetNode().Index());
+    if (segment_gather_node.GetOutputEdgesCount() != 1) {
+      continue;
+    }
+    // The first input of segment_gather_node must be 2d.
+    auto sg_shape = segment_gather_node.MutableInputDefs()[0]->Shape();
+    if (sg_shape != nullptr && sg_shape->dim_size() != 2) {
+      continue;
+    }
+
+    // Traceback the SkipLayerNormalization node to find Gather --> Add --> SkipLayerNormalization
+    std::vector<graph_utils::EdgeEndToMatch> word_embedding_path{
+        {0, 0, "Add", {7}, kOnnxDomain},
+        {0, 0, "Gather", {1, 11}, kOnnxDomain}};
+    if (!graph_utils::FindPath(layer_norm_add_node, true, word_embedding_path, edges, logger)) {
+      continue;
+    }
+    Node& add_node = *graph.GetNode(edges[0]->GetNode().Index());
+    Node& word_gather_node = *graph.GetNode(edges[1]->GetNode().Index());
+    if (add_node.GetOutputEdgesCount() != 1 || word_gather_node.GetOutputEdgesCount() != 1) {
+      continue;
+    }
+    // The first input of word_gather_node must be 2d.
+    auto wg_shape = word_gather_node.MutableInputDefs()[0]->Shape();
+    if (wg_shape != nullptr && wg_shape->dim_size() != 2) {
+      continue;
+    }
+
+    // Traceback the Add node to find (Shape --> Expand -->) Gather --> Add.
+    // Constant folding removes Shape and Expand nodes when input does not have symbolic shape. In that
+    // case just look for Gather --> Add.
+    std::vector<graph_utils::EdgeEndToMatch> position_embedding_path{
+        {0, 1, "Gather", {1, 11}, kOnnxDomain}};
+    if (!graph_utils::FindPath(add_node, true, position_embedding_path, edges, logger)) {
+      continue;
+    }
+    Node& position_gather_node = *graph.GetNode(edges[0]->GetNode().Index());
+    if (position_gather_node.GetOutputEdgesCount() != 1) {
+      continue;
+    }
+    // The first input of position_gather_node must be 2d.
+    auto pg_shape = position_gather_node.MutableInputDefs()[0]->Shape();
+    if (pg_shape != nullptr && pg_shape->dim_size() != 2) {
+      continue;
+    }
+
+    // Match Shape --> Expand path if needed.
+    std::vector<graph_utils::EdgeEndToMatch> position_embedding_path_symbolic{
+        {0, 1, "Expand", {8}, kOnnxDomain},
+        {0, 1, "Shape", {1}, kOnnxDomain}};
+    Node* p_expand_node = nullptr;
+    Node* p_shape_node = nullptr;
+    if (graph_utils::FindPath(position_gather_node, true, position_embedding_path_symbolic, edges, logger)) {
+      if (edges[0]->GetNode().GetOutputEdgesCount() == 1 && edges[1]->GetNode().GetOutputEdgesCount() == 1) {
+        p_expand_node = graph.GetNode(edges[0]->GetNode().Index());
+        p_shape_node = graph.GetNode(edges[1]->GetNode().Index());
+      }
+    }
+
+    // Get input "input_ids" from node.
+    NodeArg* input_ids = CheckInput(graph, word_gather_node.MutableInputDefs()[1], layer_norm_node.GetExecutionProviderType(), logger);
+    if (input_ids == nullptr) {
+      DEBUG_LOG("Input id is not valid. ");
+      continue;
+    }
+
+    // Get input "segment_ids" from node.
+    NodeArg* segment_ids = CheckInput(graph, segment_gather_node.MutableInputDefs()[1], layer_norm_node.GetExecutionProviderType(), logger);
+    if (segment_ids == nullptr) {
+      DEBUG_LOG("Segment id is not valid. ");
+      continue;
+    }
+
+    // Get input "mask" from "ReduceSum" node.
+    NodeArg* mask = CheckInput(graph, reduce_sum_node.MutableInputDefs()[0], layer_norm_node.GetExecutionProviderType(), logger);
+    if (mask == nullptr) {
+      DEBUG_LOG("Mask is not valid. ");
+      continue;
+    }
+
+    const std::vector<NodeArg*> embed_layer_norm_input_defs{
+        input_ids,
+        segment_ids,
+        mask,
+        word_gather_node.MutableInputDefs()[0],
+        position_gather_node.MutableInputDefs()[0],
+        segment_gather_node.MutableInputDefs()[0],
+        layer_norm_node.MutableInputDefs()[1],
+        layer_norm_node.MutableInputDefs()[2]};
+    Node& embed_layer_norm_node = graph.AddNode(graph.GenerateNodeName("EmbedLayerNormalization"),
+                                                "EmbedLayerNormalization",
+                                                "fused EmbedLayerNorm subgraphs ",
+                                                embed_layer_norm_input_defs,
+                                                {layer_norm_node.MutableOutputDefs()[0], reduce_sum_node.MutableOutputDefs()[0]},
+                                                {}, kMSDomain);
+
+    // Assign provider to this new node. Provider should be same as the provider for old node.
+    embed_layer_norm_node.SetExecutionProviderType(layer_norm_node.GetExecutionProviderType());
+
+    // move input edges to gather (first in list) across to the embed_layer_norm_node.
+    // move output definitions and output edges to embed_layer_norm_node.
+    // remove all the other nodes.
+    std::vector<NodeIndex> nodes_to_remove;
+    if (p_shape_node != nullptr && p_expand_node != nullptr) {
+      // Match Shape --> Gather --> Unsqueeze --> ConstantOfShape --> NonZero --> Transpose --> Squeeze --> Cast --> Unsqueeze --> Expand
+      if (p_expand_node != nullptr) {
+        Node& expand_node = *graph.GetNode(p_expand_node->Index());
+        std::vector<graph_utils::EdgeEndToMatch> expand_parent_path{
+            {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain},
+            {0, 0, "Cast", {9}, kOnnxDomain},
+            {0, 0, "Squeeze", {1}, kOnnxDomain},
+            {0, 0, "Transpose", {1}, kOnnxDomain},
+            {0, 0, "NonZero", {9}, kOnnxDomain},
+            {0, 0, "ConstantOfShape", {9}, kOnnxDomain},
+            {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain},
+            {0, 0, "Gather", {1, 11}, kOnnxDomain},
+            {0, 0, "Shape", {1}, kOnnxDomain},
+        };
+        if (graph_utils::FindPath(expand_node, true, expand_parent_path, edges, logger)) {
+          for (size_t i = 0; i < edges.size(); i++) {
+            if (edges[i]->GetNode().GetOutputEdgesCount() != 1) {
+              nodes_to_remove.clear();
+              break;
+            }
+            nodes_to_remove.push_back(edges[i]->GetNode().Index());
+          }
+        }
+      }
+      nodes_to_remove.push_back(p_shape_node->Index());
+      nodes_to_remove.push_back(p_expand_node->Index());
+    }
+    nodes_to_remove.push_back(word_gather_node.Index());
+    nodes_to_remove.push_back(position_gather_node.Index());
+    nodes_to_remove.push_back(segment_gather_node.Index());
+    nodes_to_remove.push_back(add_node.Index());
+    nodes_to_remove.push_back(reduce_sum_node.Index());
+    nodes_to_remove.push_back(layer_norm_add_node.Index());
+    nodes_to_remove.push_back(layer_norm_node.Index());
+
+    for (const auto& index : nodes_to_remove) {
+      Node* node = graph.GetNode(index);
+      graph_utils::RemoveNodeOutputEdges(graph, *node);
+      graph.RemoveNode(node->Index());
+    }
+    modified = true;
+  }
+  return Status::OK();
+}
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.h b/onnxruntime/core/optimizer/embed_layer_norm_fusion.h
new file mode 100644
index 0000000000..6814bf4bea
--- /dev/null
+++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.h
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/optimizer/graph_transformer.h"
+
+namespace onnxruntime {
+
+/**
+@Class EmbedLayerNormFusion
+
+Rewrite graph fusing embeddings and mask processing into one node.
+
+*/
+class EmbedLayerNormFusion : public GraphTransformer {
+ public:
+  EmbedLayerNormFusion(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
+      : GraphTransformer("EmbedLayerNormFusion", compatible_execution_providers) {}
+
+  Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index e2ae44fa18..cd3712f633 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -23,6 +23,7 @@
 #include "core/optimizer/gelu_approximation.h"
 #include "core/optimizer/layer_norm_fusion.h"
 #include "core/optimizer/skip_layer_norm_fusion.h"
+#include "core/optimizer/embed_layer_norm_fusion.h"
 #include "core/optimizer/reshape_fusion.h"
 #include "core/optimizer/attention_fusion.h"
 #include "core/mlas/inc/mlas.h"
@@ -128,6 +129,7 @@ std::vector<std::unique_ptr<GraphTransformer>> GenerateTransformers(TransformerL
       transformers.emplace_back(onnxruntime::make_unique<GeluFusion>(cpu_cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<LayerNormFusion>(cpu_cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<AttentionFusion>(cpu_cuda_execution_providers));
+      transformers.emplace_back(onnxruntime::make_unique<EmbedLayerNormFusion>(cpu_cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<BiasGelu>(cpu_cuda_execution_providers));
       transformers.emplace_back(onnxruntime::make_unique<SkipLayerNormFusion>(cpu_cuda_execution_providers));
 
diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.h b/onnxruntime/core/optimizer/skip_layer_norm_fusion.h
index 99eb0b0ed1..7b634f51db 100644
--- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.h
+++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.h
@@ -8,12 +8,9 @@
 namespace onnxruntime {
 
 /**
-@Class LayerNormFusion
+@Class SkipLayerNormFusion
 
-Rewrite graph fusing Layer Normalization subgraph to a single LayerNormalization node.
-
-The formula corresponding to LayerNorm activation subgraph:
-(x - mean(x, axis)) / sqrt(var(x, axis)) * scale + bias, where x is the input.
+Rewrite graph fusing Add + Layer Normalization subgraph to a single SkipLayerNormalization node.
 
 */
 class SkipLayerNormFusion : public GraphTransformer {
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 854365515c..00220740a6 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -19,6 +19,7 @@
 #include "core/optimizer/gelu_approximation.h"
 #include "core/optimizer/layer_norm_fusion.h"
 #include "core/optimizer/skip_layer_norm_fusion.h"
+#include "core/optimizer/embed_layer_norm_fusion.h"
 #include "core/optimizer/graph_transformer.h"
 #include "core/optimizer/graph_transformer_mgr.h"
 #include "core/optimizer/identity_elimination.h"
@@ -1253,6 +1254,48 @@ TEST(GraphTransformationTests, SkipLayerNormFusionTest) {
   TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format2.onnx");
   TestSkipLayerNormFusion(MODEL_FOLDER "fusion/skip_layer_norm_format3.onnx");
 }
+
+TEST(GraphTransformationTests, EmbedLayerNormFusionFormat1) {
+  auto model_uri = MODEL_FOLDER "fusion/embed_layer_norm_format1.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<EmbedLayerNormFusion>(), TransformerLevel::Level2);
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Gather"] == 0);
+  ASSERT_TRUE(op_to_count["Add"] == 0);
+  ASSERT_TRUE(op_to_count["ReduceSum"] == 0);
+  ASSERT_TRUE(op_to_count["Attention"] == 1);
+  ASSERT_TRUE(op_to_count["SkipLayerNormalization"] == 0);
+  ASSERT_TRUE(op_to_count["EmbedLayerNormalization"] == 1);
+}
+
+TEST(GraphTransformationTests, EmbedLayerNormFusionFormat2) {
+  auto model_uri = MODEL_FOLDER "fusion/embed_layer_norm_format2.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_TRUE(Model::Load(model_uri, p_model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<EmbedLayerNormFusion>(), TransformerLevel::Level2);
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, DefaultLoggingManager().DefaultLogger());
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_TRUE(op_to_count["Shape"] == 0);
+  ASSERT_TRUE(op_to_count["Expand"] == 0);
+  ASSERT_TRUE(op_to_count["Gather"] == 0);
+  ASSERT_TRUE(op_to_count["Add"] == 0);
+  ASSERT_TRUE(op_to_count["ReduceSum"] == 0);
+  ASSERT_TRUE(op_to_count["Attention"] == 1);
+  ASSERT_TRUE(op_to_count["SkipLayerNormalization"] == 0);
+  ASSERT_TRUE(op_to_count["EmbedLayerNormalization"] == 1);
+}
 #endif
 
 }  // namespace test
diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format1.onnx b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format1.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..413c52bdb280d5976c5b3624353ce017a2129e4d
GIT binary patch
literal 940
zcmd;J6%x)bD$y+|O3W)x%P-1JEXmBzE4J!m=ThckD^JZ#&nV$yj1b~WPb|quEi#PH
zFD)r3Em2}~2lKHiGD25m#D%6E#)qjC;!aFSft$_jn395Qa3sujV|0V{xsWx&Oob@}
zn+mf+32dq_SOZLqi?uj8F(;LaB`GtpSco$x4{E-YkWXS|YLOo(IC3(pz(JyfMT|>@
zizPR)I9o`tC^e-tIW@kxG#BPX&LFT*aA_`AJ0lmuY%USUl9JRsP$<V2W|zf7gIEYv
z6yzW-OjCu}Ai*!i30JO!U}_0)apnPCkdd00QY^u<fRV|HH#t97FE=x}D8D#Atwcy7
zH8&|WB|Zn_yZAhy_u?T@Zm1=~!N|cVz^KH+z`)?(1dVttF`yWm79*1rZ)!zBVqOZ=
zwOWEOwGIwWkTBE|KoSCnr&bpu2QvpJ(9E1O-f?z&4Ek*Kr_8q7-M-t7`97C@#t~Ed
zcT+U&^=|Rov+#)8+lac`TfR)TxBqWsACVAZe_-unyVrKt?KXr;*)0l*vMcn;vA4RG
zX1~Qi!7eyX$$sVO)pqlfmF-z4G}sv}6|(>OgwaN6*)uyQtl_I=2X=`{?^${KURM!&
z-@*d><;PO(?=;@GHI4|iZ(w4zKX`SDT}Xb5trM2Gh!O^80xlj7HX#l!CJrVbW(ElY
n(<Yjr7&J|wi6=>LK_d^WgArj&mNK@OMrdaQ+QsO^B)|m#>ir&`

literal 0
HcmV?d00001

diff --git a/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format2.onnx b/onnxruntime/test/testdata/transform/fusion/embed_layer_norm_format2.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..58e4ee64e2f3f708d1770471c5757b7a6913cf23
GIT binary patch
literal 1060
zcmd;J6%x)bD$y+|O3W)x%P-1JEXmBzE4FIm;8NydD^JZ#&nV$yj1b~WPb|quEi#PH
zFD)r3Em2}~2lKfkK$1Mg8Hoj{Fe%nx5Rc1*i#N5RATci`9%>4TDj}F8%sf}HS}uLK
zrKk#E4lsh*0&{>77n;*xe3(ih?!=T7xVxDhQ&Nz976~)h7~NofE@X``Q(?-$royaH
z0-Nd!)&LXZVl7Ti%t_^9Ny<zt7UImwgPJcT<dayLTI2@`>YU6faBwSO5#v(fV#!S`
z&K43ZN=+$EPK_@v&4oFUGYBjcT$;<(&d7x@n@hy8q$D*D6oc`F*=4YR7eW;UIfx6>
zR3SD<R7i2cl`A2bS^`|0c|aFrq$Z{mOYkgUWU}H-&d=4$%}g%JFV0UZ5t2yFO-fCP
z&jI-^J`d=<ct~<G(h}id<X{wFRAOOZU~q7PCMqp4pctDLBa;&>q9MsdOAw~k!NCa<
zhFSthLg4V!>SE+z=HLXHnbVxP->!SHrOo7t&32u4Jnd!p!tF)(UbFMQR%>6+>uA61
zw66V(BYgH!S%&ruy1BMS|BdW;zb4u9KYeU7rI6F!OV-=&;RSPhx9D2?Idv}f-?Ap!
z#O`sm&)w%`uVSBNx7eB4?w+B(z3UHmdnc^nt7QjviOThPEcTr{j@Vf(UvIl;g|z*(
z_%C*sW~te;hlkrwWjbh=8>?^cge5Mbguxk%i-&_vh=YrXg9(V4L4v@vi6$rpO%rJ1
sNfKPp$OG$OL>QAL#f6&fxVSht*o9a?)?_JTO9cp{8G(uzotOl;0Ei()4FCWD

literal 0
HcmV?d00001