From eabf6dc9eebffaf4aa6b2ae2191453b0540bb29e Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Mon, 6 Jul 2020 19:37:37 -0700
Subject: [PATCH] Add Fusion for GPT Attention with both past state and
 attention mask (#4437)

 Add Fusion for GPT Attention with past state and attention mask
---
 .../core/optimizer/attention_fusion.cc        |  250 +---
 .../core/optimizer/attention_fusion_helper.h  | 1142 +++++++++++++++++
 .../test/optimizer/graph_transform_test.cc    |   18 +
 .../fusion/gpt2_past_mask_one_layer.onnx      |  Bin 0 -> 10363 bytes
 4 files changed, 1204 insertions(+), 206 deletions(-)
 create mode 100644 onnxruntime/core/optimizer/attention_fusion_helper.h
 create mode 100644 onnxruntime/test/testdata/transform/fusion/gpt2_past_mask_one_layer.onnx
diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc
index bffe2372c6..fb8d17b43d 100644
--- a/onnxruntime/core/optimizer/attention_fusion.cc
+++ b/onnxruntime/core/optimizer/attention_fusion.cc
@@ -5,12 +5,9 @@
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/attention_fusion.h"
 #include "core/optimizer/utils.h"
+#include "core/optimizer/attention_fusion_helper.h"
 #include <cmath>
 
-#define DEBUG_LOG(x) LOGS(logger, VERBOSE) << x
-
-using namespace ONNX_NAMESPACE;
-using namespace onnxruntime::common;
 namespace onnxruntime {
 
 static bool ValidateMatMulInitializer(const Graph& graph, const Node& matmul, int64_t hidden_size) {
@@ -31,7 +28,7 @@ static bool ValidateAddBiasInitializer(const Graph& graph, const Node& add, int6
   return optimizer_utils::ValidateShape(input_b, {hidden_size});
 }
 
-// Merge 1-D weights (q, k and v) by concanating them one by one.
+// Merge 1-D weights (q, k and v) by concatenating them one by one.
 template <typename T>
 void MergeWeights(const T* q, const T* k, const T* v, std::vector<T>& result, int64_t element_count) {
   for (int64_t i = 0; i < element_count; i++) {
@@ -50,7 +47,7 @@ void MergeWeights(const T* q, const T* k, const T* v, std::vector<T>& result, in
   }
 }
 
-// Merge 2-D weights (q, k and v) by concanating them row by row.
+// Merge 2-D weights (q, k and v) by concatenating them row by row.
 template <typename T>
 void MergeMatMulWeights(const T* q_weight, const T* k_weight, const T* v_weight, std::vector<T>& result, int64_t hidden_size) {
   const T* q = q_weight;
@@ -146,36 +143,6 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size,
   return graph_utils::AddInitializer(graph, initializer);
 }
 
-// Add a Cast to convert Mask from int64 to int32.
-static NodeArg& CastMaskToInt32(Graph& graph, NodeArg* mask_input, ProviderType provider_type) {
-  const TensorShapeProto* mask_shape = mask_input->Shape();
-  TypeProto mask_int32;
-  mask_int32.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT32);
-  auto dim0 = mask_int32.mutable_tensor_type()->mutable_shape()->add_dim();
-  *dim0 = mask_shape->dim(0);
-  auto dim1 = mask_int32.mutable_tensor_type()->mutable_shape()->add_dim();
-  *dim1 = mask_shape->dim(1);
-  auto& cast32 = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("Mask_Int32"), &mask_int32);
-
-  Node& node = graph.AddNode(graph.GenerateNodeName("MaskCast"),
-                             "Cast",
-                             "Cast mask from int64 to int32",
-                             {mask_input},
-                             {&cast32},
-                             nullptr,
-                             kOnnxDomain);
-
-  // Add attribute: "to" = 6
-  ONNX_NAMESPACE::AttributeProto to;
-  to.set_name("to");
-  to.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
-  to.set_i(static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_INT32));
-  node.AddAttribute("to", to);
-
-  node.SetExecutionProviderType(provider_type);
-  return cast32;
-}
-
 static NodeArg& AddMaskReduceSum(Graph& graph, NodeArg* reduce_sum_input, TypeProto& output_type, ProviderType provider_type) {
   NodeArg& reduce_sum_output = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("MaskIndex_Int32"), &output_type);
 
@@ -229,7 +196,7 @@ static NodeArg* ProcessMask(Graph& graph, NodeArg* mask_input, ProviderType prov
   NodeArg* reduce_sum_input = mask_input;
   if (data_type == ONNX_NAMESPACE::TensorProto_DataType_INT64 ||
     data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
-    NodeArg& cast_int32 = CastMaskToInt32(graph, mask_input, provider_type);
+    NodeArg& cast_int32 = AttentionFusionHelper::CastMaskToInt32(graph, mask_input, provider_type);
     reduce_sum_input = &cast_int32;
   }
 
@@ -272,6 +239,9 @@ Status AttentionFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level,
   // A map from mask input arg name to mask index output.
   std::map<std::string, NodeArg*> mask_index_map;
 
+  // A map from mask input arg name to the NodeArg of that mask cast to int32.
+  std::map<std::string, NodeArg*> mask_int32_map;
+
   int fused_count = 0;
   for (auto node_index : node_topology_list) {
     auto* p_node = graph.GetNode(node_index);
@@ -296,23 +266,31 @@ Status AttentionFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level,
       const Node* add_node = nullptr;
       int add_count = 0;
       int matmul_count = 0;
+      int shape_count = 0;
+      int reshape_count = 0;
       for (auto it = node.OutputNodesBegin(); it != node.OutputNodesEnd(); ++it) {
         if ((*it).OpType().compare("Add") == 0) {
           add_count++;
           add_node = &(*it);
         } else if ((*it).OpType().compare("MatMul") == 0) {
           matmul_count++;
+        } else if ((*it).OpType().compare("Shape") == 0) {
+          shape_count++;
+        } else if ((*it).OpType().compare("Reshape") == 0) {
+          reshape_count++;
         }
       }
 
-      if (add_count != 1 || matmul_count != 3) {
-        DEBUG_LOG("Attention subgraph expects 1 Add and 3 MatMul as children of LayerNormalization.");
-        continue;
-      }
-
-      if (AttentionFusion::FuseSubGraph(node, *add_node, graph, hidden_size, mask_index_map, logger)) {
-        fused_count++;
-        modified = true;
+      if (add_count == 1 && matmul_count == 3) { // BERT
+        if (AttentionFusion::FuseSubGraph(node, *add_node, graph, hidden_size, mask_index_map, logger)) {
+          fused_count++;
+          modified = true;
+        }
+      } else if (reshape_count == 1 && shape_count == 3) {  // GPT
+        if (AttentionFusionHelper::FuseGptAttention(node, graph, hidden_size, mask_int32_map, logger)) {
+          fused_count++;
+          modified = true;
+        }
       }
     }
   }
@@ -345,9 +323,9 @@ Status AttentionFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level,
           |  (0,2,1,3)  (0,2,3,1)    (perm=0,2,1,3)              |
           |         \       /         |                    mask_Unsqueeze(axes=2)
           |      qk_MatMul            |                          |
-          |           |    [B=2]      |              ([A=1] mask_Cast(to=1))
+          |           |    [B=2]      |              ([A=1.0] mask_Cast(to=1))
           |           |   /           |                   \     /
-          |        qk_Div             |                 mask_Sub   [A=1000]
+          |        qk_Div             |                 mask_Sub   [B=-10000.0]
           |            \              |                        \   /
           |       mask_Add <-------- /---------------------mask_Mul
           |             |           /
@@ -413,46 +391,16 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
     return false;
   }
 
-  // Internal nodes of attention subgraph only allow edges within the subgraph, and no graph output is allowed.
-  // No constraints for four nodes: reshape node is last node of Attention; and add, matmul and v_root are not in attention subgraph.
-  if (!optimizer_utils::CheckOutputEdges(graph, transpose, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, qkv_matmul, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, v_transpose, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, v_reshape, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, v_add, 1) ||
+  if (!optimizer_utils::CheckOutputEdges(graph, v_add, 1) ||
       !optimizer_utils::CheckOutputEdges(graph, v_matmul, 1)) {
-    DEBUG_LOG("Output edge count not expected for nodes in path v");
+    DEBUG_LOG("Output edge count not expected for Add or MatMul in path v");
     return false;
   }
 
-  std::vector<int64_t> perm;
-  if (!(graph_utils::GetRepeatedNodeAttributeValues(transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 2 && perm[2] == 1 && perm[3] == 3)) {
-    DEBUG_LOG("Failed in match Transpose attribute perm. Expected: 0, 2, 1, 3");
-    return false;
-  }
-  if (!(graph_utils::GetRepeatedNodeAttributeValues(v_transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 2 && perm[2] == 1 && perm[3] == 3)) {
-    DEBUG_LOG("Failed in match v_transpose attribute perm. Expected: 0, 2, 1, 3");
-    return false;
-  }
-
-  std::vector<int64_t> v_reshape_shape;
-  if (!optimizer_utils::AppendTensorFromInitializer(graph, *(v_reshape.InputDefs()[1]), v_reshape_shape) ||
-      v_reshape_shape.size() != 4 ||
-      v_reshape_shape[2] <= 0 ||
-      v_reshape_shape[3] <= 0 ||
-      hidden_size != v_reshape_shape[2] * v_reshape_shape[3]) {
-    DEBUG_LOG("v_reshape initializer value is not expected");
-    return false;
-  }
-
-  const int64_t num_attention_head = v_reshape_shape[2];
-  const int64_t attention_head_size = v_reshape_shape[3];
-
-  std::vector<int64_t> reshape_shape;
-  if (!optimizer_utils::AppendTensorFromInitializer(graph, *(reshape.InputDefs()[1]), reshape_shape) ||
-      reshape_shape.size() != 3 ||
-      reshape_shape[2] != hidden_size) {
-    DEBUG_LOG("reshape initializer value is not expected");
+  int64_t num_heads = 0;  // will be updated in CheckNodesInPathV
+  int64_t head_size = 0;  // will be updated in CheckNodesInPathV
+  if (!AttentionFusionHelper::CheckNodesInPathV(graph, reshape, transpose, qkv_matmul, v_transpose, v_reshape, num_heads, head_size, hidden_size, logger)) {
+    DEBUG_LOG("CheckNodesInPathV return false");
     return false;
   }
 
@@ -465,86 +413,11 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
     return false;
   }
 
-  // path 2 to find mask. Unsqueeze -> Unsqueeze -> (Cast) -> Sub -> Mul -> Add -> Softmax
-  // The "Cast" node in parentheses is optional. 
-  std::vector<graph_utils::EdgeEndToMatch> mask_path{
-      {0, 0, "Softmax", {1, 11}, kOnnxDomain},
-      {0, 0, "Add", {7}, kOnnxDomain},
-      {0, 1, "Mul", {7}, kOnnxDomain},
-      {0, 0, "Sub", {7}, kOnnxDomain}};
-
-  if (!graph_utils::FindPath(qkv_matmul, true, mask_path, edges, logger)) {
-    DEBUG_LOG("Failed to find path for mask");
-    return false;
-  }
-  
-  const Node& softmax = edges[0]->GetNode();
-  const Node& mask_add = edges[1]->GetNode();
-  const Node& mask_mul = edges[2]->GetNode();
-  const Node& mask_sub = edges[3]->GetNode();
-
-  // Match optional mask cast node
-  Node* p_mask_cast = nullptr;
-  Node* p_mask_unsqueeze_2 = nullptr;
-  Node* p_mask_unsqueeze_1 = nullptr;
-  std::vector<graph_utils::EdgeEndToMatch> mask_path_format_1{
-      {0, 1, "Cast", {9}, kOnnxDomain}, 
-      {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain},
-      {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain}};
-
-  std::vector<graph_utils::EdgeEndToMatch> mask_path_format_2{
-      {0, 1, "Unsqueeze", {1, 11}, kOnnxDomain},
-      {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain}};
-
-  if (graph_utils::FindPath(mask_sub, true, mask_path_format_1, edges, logger)) {
-    p_mask_cast = const_cast<Node*>(&edges[0]->GetNode());
-    p_mask_unsqueeze_2 = const_cast<Node*>(&edges[1]->GetNode());
-    p_mask_unsqueeze_1 = const_cast<Node*>(&edges[2]->GetNode());
-  } else if (graph_utils::FindPath(mask_sub, true, mask_path_format_2, edges, logger)) {
-    p_mask_unsqueeze_2 = const_cast<Node*>(&edges[0]->GetNode());
-    p_mask_unsqueeze_1 = const_cast<Node*>(&edges[1]->GetNode());
-  } else {
-    DEBUG_LOG("Failed to find path for mask");
-    return false;
-  }
-  
-  const Node& mask_unsqueeze_2 = *p_mask_unsqueeze_2;
-  const Node& mask_unsqueeze_1 = *p_mask_unsqueeze_1;
-
-
-  if (!optimizer_utils::CheckOutputEdges(graph, softmax, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, mask_add, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, mask_sub, 1) ||
-      (p_mask_cast != nullptr && !optimizer_utils::CheckOutputEdges(graph, *p_mask_cast, 1)) ||
-      !optimizer_utils::CheckOutputEdges(graph, mask_unsqueeze_2, 1) ||
-      !optimizer_utils::CheckOutputEdges(graph, mask_unsqueeze_1, 1)) {
-    DEBUG_LOG("Output edge count not expected for mask nodes");
-    return false;
-  }
-
-  if (!optimizer_utils::IsAttributeWithExpectedValue(softmax, "axis", 3)) {
-    DEBUG_LOG("Softmax attribute axis is expected to be 3");
-    return false;
-  }
-
-  std::vector<int64_t> axes;
-  if (!(graph_utils::GetRepeatedNodeAttributeValues(mask_unsqueeze_1, "axes", axes) && axes.size() == 1 && axes[0] == 1)) {
-    DEBUG_LOG("mask_unsqueeze_1 axes not matched. Expect: 1");
-    return false;
-  }
-
-  if (!(graph_utils::GetRepeatedNodeAttributeValues(mask_unsqueeze_2, "axes", axes) && axes.size() == 1 && axes[0] == 2)) {
-    DEBUG_LOG("mask_unsqueeze_2 axes not matched. Expect: 2");
-    return false;
-  }
-
-  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(mask_sub.InputDefs()[0]), float(1), false)) {
-    DEBUG_LOG("mask_sub const input not matched");
-    return false;
-  }
-
-  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(mask_mul.InputDefs()[1]), float(-10000), false)) {
-    DEBUG_LOG("mask_mul const input not matched");
+  // Find mask nodes: Unsqueeze -> Unsqueeze -> (Cast) -> Sub -> Mul -> Add -> Softmax --> [MatMul]
+  // The "Cast" node in parentheses is optional.
+  AttentionFusionHelper::AttentionMaskNodes mask_nodes;
+  if (!AttentionFusionHelper::MatchInputMaskSubgraph(graph, qkv_matmul, mask_nodes, logger)) {
+    DEBUG_LOG("Failed in match input mask subgraph");
     return false;
   }
 
@@ -558,7 +431,7 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
       {0, 0, "MatMul", {1, 9}, kOnnxDomain},
       {0, 0, "LayerNormalization", {1}, kOnnxDomain}};
 
-  if (!graph_utils::FindPath(mask_add, true, q_path, edges, logger)) {
+  if (!graph_utils::FindPath(*(mask_nodes.add), true, q_path, edges, logger)) {
     DEBUG_LOG("Failed to find path for q");
     return false;
   }
@@ -575,23 +448,8 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
     return false;
   }
 
-  std::vector<int64_t> q_reshape_shape;
-  if (!optimizer_utils::AppendTensorFromInitializer(graph, *(q_reshape.InputDefs()[1]), q_reshape_shape) ||
-      q_reshape_shape.size() != 4 ||
-      q_reshape_shape[2] != num_attention_head ||
-      q_reshape_shape[3] != attention_head_size) {
-    DEBUG_LOG("q_reshape const not matched");
-    return false;
-  }
-
-  float expected_value = std::sqrt(static_cast<float>(attention_head_size));
-  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(qk_div.InputDefs()[1]), expected_value, false)) {
-    DEBUG_LOG("qk_div const not matched.");
-    return false;
-  }
-
-  if (!(graph_utils::GetRepeatedNodeAttributeValues(q_transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 2 && perm[2] == 1 && perm[3] == 3)) {
-    DEBUG_LOG("q_transpose perm attribute not matched");
+  if (!AttentionFusionHelper::CheckNodesInPathQ(graph, qk_div, q_reshape, q_transpose, num_heads, head_size, logger)) {
+    DEBUG_LOG("CheckNodesInPathQ returns false");
     return false;
   }
 
@@ -624,8 +482,8 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
     return false;
   }
 
-  if (!(graph_utils::GetRepeatedNodeAttributeValues(k_transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 2 && perm[2] == 3 && perm[3] == 1)) {
-    DEBUG_LOG("k_transpose perm attribute not matched");
+  if (!AttentionFusionHelper::CheckNodesInPathK(graph, k_reshape, k_transpose, num_heads, head_size, logger)) {
+    DEBUG_LOG("CheckNodesInPathK returns false");
     return false;
   }
 
@@ -635,15 +493,6 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
     return false;
   }
 
-  std::vector<int64_t> k_reshape_shape;
-  if (!optimizer_utils::AppendTensorFromInitializer(graph, *(k_reshape.InputDefs()[1]), k_reshape_shape) ||
-      k_reshape_shape.size() != 4 ||
-      k_reshape_shape[2] != num_attention_head ||
-      k_reshape_shape[3] != attention_head_size) {
-    DEBUG_LOG("k_reshape const not matched");
-    return false;
-  }
-
   // Load q, k and v weights
   const ONNX_NAMESPACE::TensorProto* q_weight_tensor = nullptr;
   const ONNX_NAMESPACE::TensorProto* k_weight_tensor = nullptr;
@@ -662,7 +511,7 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
   }
 
   // Now everything is ready, we will start fusing subgraph.
-  NodeArg* mask_input = graph.GetNode(mask_unsqueeze_1.Index())->MutableInputDefs()[0];
+  NodeArg* mask_input = graph.GetNode(mask_nodes.unsqueeze_1->Index())->MutableInputDefs()[0];
   NodeArg* mask_index = GetOrCreateMaskIndex(graph, mask_input, mask_index_map, layer_norm.GetExecutionProviderType(), logger);
   if (nullptr == mask_index) {
     DEBUG_LOG("Failed to create mask index");
@@ -684,7 +533,7 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
       output_defs,
       nullptr,
       kMSDomain);
-  attention_node.AddAttribute("num_heads", num_attention_head);
+  attention_node.AddAttribute("num_heads", num_heads);
 
   // Assign provider to this new node.
   attention_node.SetExecutionProviderType(layer_norm.GetExecutionProviderType());
@@ -698,8 +547,6 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
       v_reshape.Index(),
       v_add.Index(),
       v_matmul.Index(),
-      softmax.Index(),
-      mask_add.Index(),
       qk_div.Index(),
       qk_matmul.Index(),
       q_transpose.Index(),
@@ -711,16 +558,7 @@ bool AttentionFusion::FuseSubGraph(Node& layer_norm, const Node& add_after_layer
       k_add.Index(),
       k_matmul.Index()};
 
-  // When the last Attention node is fused. Original mask processing nodes can be removed safely.
-  if (optimizer_utils::CheckOutputEdges(graph, mask_mul, 1)) {
-    nodes_to_remove.push_back(mask_mul.Index());
-    nodes_to_remove.push_back(mask_sub.Index());
-    if (p_mask_cast != nullptr) {
-      nodes_to_remove.push_back((*p_mask_cast).Index());
-    }
-    nodes_to_remove.push_back(mask_unsqueeze_2.Index());
-    nodes_to_remove.push_back(mask_unsqueeze_1.Index());
-  }
+  AttentionFusionHelper::SetMaskNodesToRemove(graph, mask_nodes, nodes_to_remove);
 
   for (const auto& node_index : nodes_to_remove) {
     Node* node = graph.GetNode(node_index);
diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h
new file mode 100644
index 0000000000..b588efcc10
--- /dev/null
+++ b/onnxruntime/core/optimizer/attention_fusion_helper.h
@@ -0,0 +1,1142 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#define DEBUG_LOG(x) LOGS(logger, VERBOSE) << x
+
+using namespace ONNX_NAMESPACE;
+using namespace onnxruntime::common;
+namespace onnxruntime {
+
+// Helpers for attention fusion, shared by the BERT and the GPT fusion paths.
+namespace AttentionFusionHelper {
+
+struct MatchGemmResult {
+  const Node* gemm = nullptr;           // the matched Gemm node; stays nullptr until a match succeeds.
+  const Node* input_node = nullptr;     // node that accepts the subgraph input (the Shape before Slice).
+  const Node* output_node = nullptr;    // node that produces the subgraph output (the Reshape after Gemm).
+  std::vector<NodeIndex> node_indices;  // indices of all nodes in the matched subgraph.
+};
+
+// Check that the Slice inputs at input_indices (e.g. starts, ends, axes, steps) are initializers holding the
+// matching expected_values. An expected value >= INT_MAX accepts any single-element initializer >= INT_MAX.
+inline bool CheckSliceParameters(const Graph& graph, const Node& slice, const std::vector<int>& input_indices, const std::vector<int64_t>& expected_values, const logging::Logger& logger) {
+  ORT_ENFORCE(input_indices.size() == expected_values.size() && input_indices.size() > 0);
+
+  for (size_t i = 0; i < expected_values.size(); i++) {
+    // Bounds-check every index (not only the last one), so callers need not pass indices in ascending order.
+    if (input_indices[i] < 0 || static_cast<size_t>(input_indices[i]) >= slice.InputDefs().size()) {
+      DEBUG_LOG("Slice does not have enough number of inputs");
+      return false;
+    }
+    const NodeArg& input = *(slice.InputDefs()[input_indices[i]]);
+    if (expected_values[i] >= static_cast<int64_t>(INT_MAX)) {
+      std::vector<int64_t> ends;
+      if (!(optimizer_utils::AppendTensorFromInitializer(graph, input, ends, true) && ends.size() == 1 && ends[0] >= INT_MAX)) {
+        DEBUG_LOG("Slice ends is less than INT_MAX");
+        return false;
+      }
+    } else if (!optimizer_utils::IsInitializerWithExpectedValue(graph, input, expected_values[i], true)) {
+      DEBUG_LOG("Slice parameter is not expected. Input index:" << input_indices[i] << " Expected value:" << expected_values[i]);
+      return false;
+    }
+  }
+
+  return true;
+}
+/** Match the GEMM subgraph below, whose output feeds input dst_arg_index of node_after_gemm_reshape.
+    On a full match, fills result and returns true; otherwise returns false and leaves result unmodified.
+      +-----------------------------------------------------------------------------------------+
+      |                                                                                         |
+      |                (*,-1,max,0)                                                             v
+[Input]--> Shape --> Slice ---------> Squeeze --> Unsqueeze (axes=0) --> Concat (-1, *) --> Reshape-->Gemm (B:W*4W, C:4W, or B:W*W, C:W, or B:4W*W, C:W)
+      |                                                                                                    |
+      |                                                             Concat (  ,  , 4W or W)-------------Reshape ----> [Output]
+      |                                                                     ^  ^
+      |                                                                     |  |
+      +----> Shape --> Gather (indices=0) --> Unsqueeze (axes=0) -----------+  |
+      |                                                                        |
+      +----> Shape --> Gather (indices=1) --> Unsqueeze (axes=0) --------------+
+*/
+inline bool MatchGemmSubgraph(Graph& graph,
+                              Node& node_after_gemm_reshape,
+                              int dst_arg_index,
+                              MatchGemmResult& result,
+                              const logging::Logger& logger) {
+  DEBUG_LOG("Start MatchGemmSubgraph");
+  // GPT Attention fusion supports opset version 9 or later.
+  std::vector<graph_utils::EdgeEndToMatch> parent_path{
+      {0, dst_arg_index, "Reshape", {5, 13}, kOnnxDomain},
+      {0, 0, "Gemm", {9, 11, 13}, kOnnxDomain},
+      {0, 0, "Reshape", {5, 13}, kOnnxDomain},
+      {0, 1, "Concat", {4, 11, 13}, kOnnxDomain},
+      {0, 1, "Unsqueeze", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Squeeze", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Slice", {1, 10, 11, 13}, kOnnxDomain},
+      {0, 0, "Shape", {1, 13}, kOnnxDomain}};
+
+  std::vector<const Node::EdgeEnd*> edges;
+  if (!graph_utils::FindPath(node_after_gemm_reshape, true, parent_path, edges, logger)) {
+    DEBUG_LOG("Faild to match gemm path");
+    return false;
+  }
+
+  const Node& reshape_after_gemm = edges[0]->GetNode();
+  const Node& gemm = edges[1]->GetNode();
+  const Node& reshape_before_gemm = edges[2]->GetNode();
+  const Node& concat = edges[3]->GetNode();
+  const Node& unsqueeze = edges[4]->GetNode();
+  const Node& squeeze = edges[5]->GetNode();
+  const Node& slice = edges[6]->GetNode();
+  const Node& shape_before_slice = edges[7]->GetNode();
+
+  const auto& subgraph_input = shape_before_slice.InputDefs()[0];
+  if (reshape_before_gemm.InputDefs()[0]->Name() != subgraph_input->Name()) {
+    DEBUG_LOG("Input of reshape_before_gemm is not the input of subgraph");
+    return false;
+  }
+
+  if (!optimizer_utils::CheckOutputEdges(graph, shape_before_slice, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, slice, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, squeeze, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, unsqueeze, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, concat, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, reshape_before_gemm, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, gemm, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, reshape_after_gemm, 1)) {
+    DEBUG_LOG("Output edge count not expected for nodes in gemm path");
+    return false;
+  }
+
+  if (gemm.InputDefs().size() != 3) {
+    DEBUG_LOG("Gemm does not have 3 inputs");
+    return false;
+  }
+
+  // Get the shape of bias, to be compared with the last input value of Concat
+  if (!graph_utils::IsInitializer(graph, gemm.InputDefs()[2]->Name(), true)) {
+    DEBUG_LOG("Gemm bias is not constant");
+    return false;
+  }
+  auto bias_shape = gemm.InputDefs()[2]->Shape();
+  if (bias_shape == nullptr || static_cast<size_t>(bias_shape->dim_size()) != 1 || !utils::HasDimValue(bias_shape->dim(0))) {
+    DEBUG_LOG("Gemm bias shape not expected");
+    return false;
+  }
+
+  if (!CheckSliceParameters(graph, slice, {1, 2, 3}, {-1, INT_MAX, 0}, logger)) {
+    DEBUG_LOG("CheckSliceParameters return false");
+    return false;
+  }
+
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(concat.InputDefs()[0]), static_cast<int64_t>(-1), true)) {
+    DEBUG_LOG("concat first input value is not -1");
+    return false;
+  }
+
+  // Find the concat node for Gather paths.
+  std::vector<graph_utils::EdgeEndToMatch> edge_to_match{{0, 1, "Concat", {4, 11, 13}, kOnnxDomain}};
+  if (!graph_utils::FindPath(reshape_after_gemm, true, edge_to_match, edges, logger)) {
+    DEBUG_LOG("Faild to match concat node for Gather paths");
+    return false;
+  }
+
+  const Node& concat_after_gather = edges[0]->GetNode();
+  if (concat_after_gather.InputDefs().size() != 3 ||
+      !optimizer_utils::CheckOutputEdges(graph, concat_after_gather, 1)) {
+    DEBUG_LOG("concat_after_gather does not have expected number of inputs or output edges");
+    return false;
+  }
+
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(concat_after_gather.InputDefs()[2]), bias_shape->dim(0).dim_value(), true)) {
+    DEBUG_LOG("concat_after_gather input 2 does not have expected value");
+    return false;
+  }
+
+  std::vector<NodeIndex> gather_path_indices;  // staged locally; copied to result only after a full match.
+
+  // Match: [Input] ----> Shape --> Gather (indices=0 or 1) --> Unsqueeze (axes=0) ----> Concat ( , , )
+  for (int i = 0; i < 2; i++) {
+    std::vector<graph_utils::EdgeEndToMatch> gather_path1{
+        {0, i, "Unsqueeze", {1, 11, 13}, kOnnxDomain},
+        {0, 0, "Gather", {1, 11, 13}, kOnnxDomain},
+        {0, 0, "Shape", {1, 13}, kOnnxDomain}};
+
+    if (!graph_utils::FindPath(concat_after_gather, true, gather_path1, edges, logger)) {
+      DEBUG_LOG("Faild to match gemm gather path");
+      return false;
+    }
+
+    const Node& unsqueeze_after_gather = edges[0]->GetNode();
+    const Node& gather = edges[1]->GetNode();
+    const Node& shape = edges[2]->GetNode();
+
+    if (!optimizer_utils::CheckOutputEdges(graph, unsqueeze_after_gather, 1) ||
+        !optimizer_utils::CheckOutputEdges(graph, gather, 1) ||
+        !optimizer_utils::CheckOutputEdges(graph, shape, 1)) {  //TODO: deal with shared Shape node which has output edges > 1
+      DEBUG_LOG("Output edge count not expected for nodes in gemm gather path");
+      return false;
+    }
+
+    gather_path_indices.insert(gather_path_indices.end(), {unsqueeze_after_gather.Index(), gather.Index(), shape.Index()});
+
+    if (shape.InputDefs()[0]->Name() != subgraph_input->Name()) {
+      DEBUG_LOG("Input of Shape in gemm gather path is not the input of subgraph");
+      return false;
+    }
+
+    std::vector<int64_t> axes;
+    if (!(graph_utils::GetRepeatedNodeAttributeValues(unsqueeze_after_gather, "axes", axes) && axes.size() == 1 && axes[0] == 0)) {
+      DEBUG_LOG("unsqueeze_after_gather axes value not expected");
+      return false;
+    }
+
+    if (!optimizer_utils::IsAttributeWithExpectedValue(gather, "axis", static_cast<int64_t>(0))) {
+      DEBUG_LOG("gather axis value not expected");
+      return false;
+    }
+
+    if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(gather.InputDefs()[1]), static_cast<int64_t>(i), true)) {
+      DEBUG_LOG("gather input 1 value is not expected");
+      return false;
+    }
+  }
+
+  result.node_indices.insert(result.node_indices.end(), gather_path_indices.begin(), gather_path_indices.end());
+  result.gemm = &gemm;
+  result.input_node = &shape_before_slice;
+  result.output_node = &reshape_after_gemm;
+  result.node_indices.insert(result.node_indices.end(),
+                             {reshape_after_gemm.Index(),
+                              gemm.Index(),
+                              reshape_before_gemm.Index(),
+                              concat.Index(),
+                              unsqueeze.Index(),
+                              squeeze.Index(),
+                              slice.Index(),
+                              shape_before_slice.Index(),
+                              concat_after_gather.Index()});
+  DEBUG_LOG("Pass MatchGemmSubgraph");
+  return true;
+}
+
+// Validate the Gemm weight (input 1) and bias (input 2): both must be initializers, the bias of shape {N} and the
+// weight of shape {hidden_size, N}, where N is 3 * hidden_size when is_before_split is true, else hidden_size.
+inline bool ValidateGemmInitializer(const Graph& graph, const Node& gemm, int64_t hidden_size, bool is_before_split, const logging::Logger& logger) {
+  DEBUG_LOG("Start ValidateGemmInitializer");
+  // Bias (C) is an optional Gemm input in newer opsets, so check the input count before indexing InputDefs()[2].
+  if (gemm.InputDefs().size() < 3 || !graph_utils::IsInitializer(graph, gemm.InputDefs()[2]->Name(), true)) {
+    DEBUG_LOG("Gemm bias is not constant initializer");
+    return false;
+  }
+  const NodeArg& bias = *(gemm.InputDefs()[2]);
+  const int64_t bias_length = (is_before_split ? 3 : 1) * hidden_size;
+  if (!optimizer_utils::ValidateShape(bias, {bias_length})) {
+    DEBUG_LOG("Gemm bias shape is not expected");
+    return false;
+  }
+
+  const NodeArg& weights = *(gemm.InputDefs()[1]);
+  if (!graph_utils::IsInitializer(graph, weights.Name(), true)) {
+    DEBUG_LOG("Gemm weight is not constant initializer");
+    return false;
+  }
+  if (!optimizer_utils::ValidateShape(weights, {hidden_size, bias_length})) {
+    DEBUG_LOG("Gemm weight shape is not expected");
+    return false;
+  }
+  DEBUG_LOG("Pass ValidateGemmInitializer");
+  return true;
+}
+
+struct MatchUnidirMaskResult {
+  const Node* div_node = nullptr;       // the root node (Div) of the subgraph; stays nullptr until a match succeeds.
+  std::vector<NodeIndex> node_indices;  // id of all nodes in the subgraph for removing later.
+};
+
+/**  Match Unidirectional Mask subgraph. 
+     In the below graph, ':' is followed by variable name in code. * means the input on the left side.
+
+
+                                                               (axes=0)
+                                        +---------------------Unsqueeze----------------------------------------+
+                                        |                     :unsqueeze2                                      |
+                                        |                      (axes=0)                                        |
+                                        +---------------------Unsqueeze-------------------+                    |
+                                        |                     :unsqueeze3                 |                    |
+                       (*,-1,max,0)     | (axes=0)  A          (axes=0)           starts  |ends                |
+ [Div] --> Shape --> Slice ---------> Squeeze -----> Sub -->  Unsqueeze ----------------+ |                    |ends
+      |    :shape1   :slice1          :squeeze1       ^       :unsqueeze1               v v                    v
+      |                                               |B                  Slice(1x1xWxW, , ,2,1) --> Slice(*,0, ,3, 1) :last_slice
+      |                                               |                   :mask_slice                  |
+      |                (*, -2, -1, 0)   (axes=0)      |                                              Cast(9)
+      +----> Shape --> Slice ---------> Squeeze-------+                                                |
+      |      :shape2   :slice2         :squeeze2                                                       v condition
+      +----------------------------------------------------------------------------------------->Where( ,*,-10000)--->[Add]
+*/
+bool MatchUnidirMaskSubgraph(const Graph& graph, const Node& add_node, MatchUnidirMaskResult& result, const logging::Logger& logger) {
+  DEBUG_LOG("Start MatchUnidirMaskSubgraph");  // returns true and fills result only when every check below passes
+  std::vector<graph_utils::EdgeEndToMatch> root_path{
+      {0, 0, "Where", {9}, kOnnxDomain},
+      {0, 1, "Div", {7, 13}, kOnnxDomain}};
+
+  std::vector<const Node::EdgeEnd*> edges;
+  if (!graph_utils::FindPath(add_node, true, root_path, edges, logger)) {
+    DEBUG_LOG("Failed to match the path (Div-->Where-->Add) for unidirectional mask");
+    return false;
+  }
+
+  const Node& where_node = edges[0]->GetNode();
+  const Node& div_node = edges[1]->GetNode();
+
+  const float expected_value = -10000.0f;  // constant expected as the third input of Where
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where_node.InputDefs()[2]), expected_value, true)) {
+    return false;
+  }
+
+  std::vector<graph_utils::EdgeEndToMatch> path1{
+      {0, 0, "Cast", {9, 13}, kOnnxDomain},
+      {0, 0, "Slice", {10, 11, 13}, kOnnxDomain},  // Last Slice
+      {0, 0, "Slice", {10, 11, 13}, kOnnxDomain},  // Mask Slice
+      {0, 1, "Unsqueeze", {9, 11, 13}, kOnnxDomain},
+      {0, 0, "Sub", {7, 13}, kOnnxDomain},
+      {0, 0, "Squeeze", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Slice", {10, 11, 13}, kOnnxDomain},  // Slice 1
+      {0, 0, "Shape", {1, 13}, kOnnxDomain}};
+
+  if (!graph_utils::FindPath(where_node, true, path1, edges, logger)) {
+    DEBUG_LOG("Failed to match path 1 for unidirectional mask");
+    return false;
+  }
+
+  const Node& cast = edges[0]->GetNode();
+  const Node& last_slice = edges[1]->GetNode();
+  const Node& mask_slice = edges[2]->GetNode();
+  const Node& unsqueeze1 = edges[3]->GetNode();
+  const Node& sub = edges[4]->GetNode();
+  const Node& squeeze1 = edges[5]->GetNode();
+  const Node& slice1 = edges[6]->GetNode();
+  const Node& shape1 = edges[7]->GetNode();
+
+  // squeeze1 feeds sub, unsqueeze2 and unsqueeze3 (3 edges); every other node on path 1 is expected to have one.
+  if (!optimizer_utils::CheckOutputEdges(graph, where_node, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, cast, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, last_slice, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, mask_slice, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, unsqueeze1, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, sub, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, squeeze1, 3) ||
+      !optimizer_utils::CheckOutputEdges(graph, slice1, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, shape1, 1)) {
+    DEBUG_LOG("Output edge count not expected for nodes in path 1 of unidirectional mask");
+    return false;
+  }
+
+  if (div_node.OutputDefs()[0]->Name() != shape1.InputDefs()[0]->Name()) {
+    DEBUG_LOG("Div and Shape1 does not have edge");
+    return false;
+  }
+
+  if (!CheckSliceParameters(graph, last_slice, {1, 3, 4}, {0, 3, 1}, logger)) {
+    DEBUG_LOG("CheckSliceParameters returns false for last_slice");
+    return false;
+  }
+
+  if (!CheckSliceParameters(graph, mask_slice, {3, 4}, {2, 1}, logger)) {
+    DEBUG_LOG("CheckSliceParameters returns false for mask_slice");
+    return false;
+  }
+
+  if (!CheckSliceParameters(graph, slice1, {1, 2, 3}, {-1, INT_MAX, 0}, logger)) {
+    DEBUG_LOG("CheckSliceParameters returns false for slice1");
+    return false;
+  }
+
+  std::vector<graph_utils::EdgeEndToMatch> slice_ends_path{  // input 2 of a Slice <- Unsqueeze <- Squeeze
+      {0, 2, "Unsqueeze", {9, 11, 13}, kOnnxDomain},
+      {0, 0, "Squeeze", {1, 11, 13}, kOnnxDomain}};
+
+  if (!graph_utils::FindPath(last_slice, true, slice_ends_path, edges, logger) ||
+      edges[1]->GetNode().Index() != squeeze1.Index()) {  // must end at the same Squeeze as path 1
+    DEBUG_LOG("Failed to match path 2 for unidirectional mask");
+    return false;
+  }
+
+  const Node& unsqueeze2 = edges[0]->GetNode();
+  if (!optimizer_utils::CheckOutputEdges(graph, unsqueeze2, 1)) {
+    DEBUG_LOG("Output edge count not expected for unsqueeze2 of unidirectional mask");
+    return false;
+  }
+
+  if (!graph_utils::FindPath(mask_slice, true, slice_ends_path, edges, logger) ||
+      edges[1]->GetNode().Index() != squeeze1.Index()) {  // must end at the same Squeeze as path 1
+    DEBUG_LOG("Failed to match path 3 for unidirectional mask");
+    return false;
+  }
+
+  const Node& unsqueeze3 = edges[0]->GetNode();
+  if (!optimizer_utils::CheckOutputEdges(graph, unsqueeze3, 1)) {
+    DEBUG_LOG("Output edge count not expected for unsqueeze3 of unidirectional mask");
+    return false;
+  }
+
+  std::vector<graph_utils::EdgeEndToMatch> path4{  // input 1 of Sub <- Squeeze <- Slice <- Shape
+      {0, 1, "Squeeze", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Slice", {10, 11, 13}, kOnnxDomain},  // Slice 2
+      {0, 0, "Shape", {1, 13}, kOnnxDomain}};
+
+  if (!graph_utils::FindPath(sub, true, path4, edges, logger)) {
+    DEBUG_LOG("Failed to match path 4 for unidirectional mask");
+    return false;
+  }
+
+  if (div_node.OutputDefs()[0]->Name() != edges[2]->GetNode().InputDefs()[0]->Name()) {
+    DEBUG_LOG("Div and Shape does not have edge");
+    return false;
+  }
+
+  const Node& squeeze2 = edges[0]->GetNode();
+  const Node& slice2 = edges[1]->GetNode();
+  const Node& shape2 = edges[2]->GetNode();
+  if (!optimizer_utils::CheckOutputEdges(graph, squeeze2, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, slice2, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, shape2, 1)) {
+    DEBUG_LOG("Output edge count not expected for squeeze_2/slices2/shape2 of unidirectional mask");
+    return false;
+  }
+
+  if (!CheckSliceParameters(graph, slice2, {1, 2, 3}, {-2, -1, 0}, logger)) {
+    DEBUG_LOG("CheckSliceParameters return false for slice2");
+    return false;
+  }
+
+  result.div_node = &div_node;  // result is written only after all checks have passed
+  result.node_indices = {
+      where_node.Index(),
+      cast.Index(),
+      last_slice.Index(),
+      mask_slice.Index(),
+      unsqueeze1.Index(),
+      sub.Index(),
+      squeeze1.Index(),
+      slice1.Index(),
+      shape1.Index(),
+      unsqueeze2.Index(),
+      unsqueeze3.Index(),
+      squeeze2.Index(),
+      slice2.Index(),
+      shape2.Index()};
+
+  DEBUG_LOG("Pass MatchUnidirMaskSubgraph");
+  return true;
+}
+
+struct AttentionMaskNodes {  // nodes matched by MatchInputMaskSubgraph; all null until a match succeeds
+  const Node* softmax = nullptr;
+  const Node* add = nullptr;
+  const Node* mul = nullptr;
+  const Node* sub = nullptr;
+  const Node* cast = nullptr;  // optional: remains null when the mask path has no Cast node
+  const Node* unsqueeze_2 = nullptr;
+  const Node* unsqueeze_1 = nullptr;
+};
+
+void SetMaskNodesToRemove(const Graph& graph, AttentionMaskNodes& mask_nodes, std::vector<NodeIndex>& nodes_to_remove) {
+  // Softmax and Add are always scheduled for removal.
+  nodes_to_remove.insert(nodes_to_remove.end(), {mask_nodes.softmax->Index(), mask_nodes.add->Index()});
+
+  // The mask processing nodes are removed only when the last Attention node is fused (Mul passes the 1-edge check).
+  if (!optimizer_utils::CheckOutputEdges(graph, *(mask_nodes.mul), 1)) {
+    return;
+  }
+  nodes_to_remove.push_back(mask_nodes.mul->Index());
+  nodes_to_remove.push_back(mask_nodes.sub->Index());
+  if (const Node* optional_cast = mask_nodes.cast) {
+    nodes_to_remove.push_back(optional_cast->Index());
+  }
+  nodes_to_remove.insert(nodes_to_remove.end(), {mask_nodes.unsqueeze_2->Index(), mask_nodes.unsqueeze_1->Index()});
+}
+
+/**  Match Input Mask subgraph:
+                                                                                                       {UnidirMask Subgraph}
+                                                                                                                   |
+                                                                  (optional)                                       v
+[Attention_mask] --> Unsqueeze (axes=1) --> Unsqueeze (axes=2) --> Cast ---->Sub(1,*) --> Mul(*, -10000.0) --> Add( ,*)--->SoftMax -->[MatMul]
+*/
+bool MatchInputMaskSubgraph(const Graph& graph, const Node& qkv_matmul, AttentionMaskNodes& result, const logging::Logger& logger) {
+  DEBUG_LOG("Start MatchInputMaskSubgraph");  // returns true and fills result only when every check below passes
+  std::vector<graph_utils::EdgeEndToMatch> mask_path{
+      {0, 0, "Softmax", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Add", {7, 13}, kOnnxDomain},
+      {0, 1, "Mul", {7, 13}, kOnnxDomain},
+      {0, 0, "Sub", {7, 13}, kOnnxDomain}};
+
+  std::vector<const Node::EdgeEnd*> edges;
+  if (!graph_utils::FindPath(qkv_matmul, true, mask_path, edges, logger)) {
+    DEBUG_LOG("Failed to find path for mask");
+    return false;
+  }
+
+  const Node& softmax = edges[0]->GetNode();
+  const Node& mask_add = edges[1]->GetNode();
+  const Node& mask_mul = edges[2]->GetNode();
+  const Node& mask_sub = edges[3]->GetNode();
+
+  // Match the mask path with (format 1) or without (format 2) the optional Cast node.
+  const Node* p_mask_cast = nullptr;
+  const Node* p_mask_unsqueeze_2 = nullptr;
+  const Node* p_mask_unsqueeze_1 = nullptr;
+  std::vector<graph_utils::EdgeEndToMatch> mask_path_format_1{
+      {0, 1, "Cast", {9}, kOnnxDomain},
+      {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain},
+      {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain}};
+
+  std::vector<graph_utils::EdgeEndToMatch> mask_path_format_2{
+      {0, 1, "Unsqueeze", {1, 11}, kOnnxDomain},
+      {0, 0, "Unsqueeze", {1, 11}, kOnnxDomain}};
+
+  if (graph_utils::FindPath(mask_sub, true, mask_path_format_1, edges, logger)) {
+    p_mask_cast = &edges[0]->GetNode();
+    p_mask_unsqueeze_2 = &edges[1]->GetNode();
+    p_mask_unsqueeze_1 = &edges[2]->GetNode();
+  } else if (graph_utils::FindPath(mask_sub, true, mask_path_format_2, edges, logger)) {
+    p_mask_unsqueeze_2 = &edges[0]->GetNode();
+    p_mask_unsqueeze_1 = &edges[1]->GetNode();
+  } else {
+    DEBUG_LOG("Failed to find path for mask");
+    return false;
+  }
+
+  const Node& mask_unsqueeze_2 = *p_mask_unsqueeze_2;
+  const Node& mask_unsqueeze_1 = *p_mask_unsqueeze_1;
+
+  if (!optimizer_utils::CheckOutputEdges(graph, softmax, 1) ||  // mask_mul is not checked here; see SetMaskNodesToRemove
+      !optimizer_utils::CheckOutputEdges(graph, mask_add, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, mask_sub, 1) ||
+      (p_mask_cast != nullptr && !optimizer_utils::CheckOutputEdges(graph, *p_mask_cast, 1)) ||
+      !optimizer_utils::CheckOutputEdges(graph, mask_unsqueeze_2, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, mask_unsqueeze_1, 1)) {
+    DEBUG_LOG("Output edge count not expected for mask nodes");
+    return false;
+  }
+
+  if (!optimizer_utils::IsAttributeWithExpectedValue(softmax, "axis", 3)) {
+    DEBUG_LOG("Softmax attribute axis is expected to be 3");
+    return false;
+  }
+
+  std::vector<int64_t> axes;
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(mask_unsqueeze_1, "axes", axes) && axes.size() == 1 && axes[0] == 1)) {
+    DEBUG_LOG("mask_unsqueeze_1 axes not matched. Expect: 1");
+    return false;
+  }
+
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(mask_unsqueeze_2, "axes", axes) && axes.size() == 1 && axes[0] == 2)) {
+    DEBUG_LOG("mask_unsqueeze_2 axes not matched. Expect: 2");
+    return false;
+  }
+
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(mask_sub.InputDefs()[0]), float(1), false)) {
+    DEBUG_LOG("mask_sub const input not matched");
+    return false;
+  }
+
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(mask_mul.InputDefs()[1]), float(-10000), false)) {
+    DEBUG_LOG("mask_mul const input not matched");
+    return false;
+  }
+
+  result.softmax = &softmax;
+  result.add = &mask_add;
+  result.mul = &mask_mul;
+  result.sub = &mask_sub;
+  result.cast = p_mask_cast;  // null when format 2 (no Cast) matched
+  result.unsqueeze_2 = p_mask_unsqueeze_2;
+  result.unsqueeze_1 = p_mask_unsqueeze_1;
+  DEBUG_LOG("Pass MatchInputMaskSubgraph");
+  return true;
+}
+
+struct MatchPastResult {  // output of MatchPastSubgraph
+  NodeArg* past = nullptr;     // input 0 of the Gather node reading past_v; null until a match succeeds
+  NodeArg* present = nullptr;  // output 0 of the Concat node producing present; null until a match succeeds
+  std::vector<NodeIndex> node_indices;  // indices of all matched nodes
+};
+
+/** Match Past Subgraph
+              --> Gather (indices=1) --> v_Concat(*, ) --> Unsqueeze(axes=0)--------------------------------------------------------------------+
+             /                                                                                                                                  v
+       [Past] --> Gather (indices=0) --> Transpose (perm=0,1,3,2) --> k_Concat(*, )--> Transpose(perm=0,1,3,2) --> Unsqueeze(axes=0)-->Concat(*, ) --> [Present]
+*/
+bool MatchPastSubgraph(Graph& graph, const Node& k_concat, const Node& v_concat, MatchPastResult& result, const logging::Logger& logger) {
+  DEBUG_LOG("Start MatchPastSubgraph");  // returns true and fills result only when every check below passes
+  std::vector<graph_utils::EdgeEndToMatch> past_k_path{  // input 0 of k_concat <- Transpose <- Gather
+      {0, 0, "Transpose", {1, 13}, kOnnxDomain},
+      {0, 0, "Gather", {1, 11, 13}, kOnnxDomain}};
+
+  std::vector<const Node::EdgeEnd*> edges;
+  if (!graph_utils::FindPath(k_concat, true, past_k_path, edges, logger)) {
+    DEBUG_LOG("Failed to find path for past_k");
+    return false;
+  }
+  const Node& past_k_transpose = edges[0]->GetNode();
+  const Node& past_k_gather = edges[1]->GetNode();
+
+  std::vector<graph_utils::EdgeEndToMatch> present_k_path{  // k_concat -> Transpose -> Unsqueeze -> Concat (output direction)
+      {0, 0, "Transpose", {1, 13}, kOnnxDomain},
+      {0, 0, "Unsqueeze", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Concat", {4, 11, 13}, kOnnxDomain}};
+  if (!graph_utils::FindPath(k_concat, false, present_k_path, edges, logger)) {
+    DEBUG_LOG("Failed to find path for present_k");
+    return false;
+  }
+  const Node& present_k_transpose = edges[0]->GetNode();
+  const Node& present_k_unsqueeze = edges[1]->GetNode();
+  const Node& present_concat = edges[2]->GetNode();
+
+  std::vector<graph_utils::EdgeEndToMatch> present_past_v_path{  // input 1 of present Concat <- Unsqueeze <- Concat <- Gather
+      {0, 1, "Unsqueeze", {1, 11, 13}, kOnnxDomain},
+      {0, 0, "Concat", {4, 11, 13}, kOnnxDomain},
+      {0, 0, "Gather", {1, 11, 13}, kOnnxDomain}};
+  if (!graph_utils::FindPath(present_concat, true, present_past_v_path, edges, logger)) {
+    DEBUG_LOG("Failed to find path for present_v and past_v");
+    return false;
+  }
+  const Node& present_v_unsqueeze = edges[0]->GetNode();
+  const Node& past_v_concat = edges[1]->GetNode();
+  const Node& past_v_gather = edges[2]->GetNode();
+  if (past_v_concat.Index() != v_concat.Index()) {  // the Concat found from the present side must be the caller's v_concat
+    DEBUG_LOG("Failed to match v_concat");
+    return false;
+  }
+
+  std::vector<int64_t> perm;
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(past_k_transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 1 && perm[2] == 3 && perm[3] == 2)) {
+    DEBUG_LOG("past_k_transpose perm attribute not matched");
+    return false;
+  }
+
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(present_k_transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 1 && perm[2] == 3 && perm[3] == 2)) {
+    DEBUG_LOG("present_k_transpose perm attribute not matched");
+    return false;
+  }
+
+  std::vector<int64_t> axes;
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(present_k_unsqueeze, "axes", axes) && axes.size() == 1 && axes[0] == 0)) {
+    DEBUG_LOG("present_k_unsqueeze axes value not expected");
+    return false;
+  }
+
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(present_v_unsqueeze, "axes", axes) && axes.size() == 1 && axes[0] == 0)) {
+    DEBUG_LOG("present_v_unsqueeze axes value not expected");
+    return false;
+  }
+
+  // Check Gather for past_v has indices == 1
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(past_v_gather.InputDefs()[1]), int64_t(1), true)) {
+    DEBUG_LOG("past_v_gather indices != 1");
+    return false;
+  }
+
+  // Check Gather for past_k has indices == 0
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(past_k_gather.InputDefs()[1]), int64_t(0), true)) {
+    DEBUG_LOG("past_k_gather indices != 0");
+    return false;
+  }
+
+  if (past_v_gather.InputDefs()[0]->Name() != past_k_gather.InputDefs()[0]->Name()) {
+    DEBUG_LOG("past_v_gather and past_k_gather does not have same past input");
+    return false;
+  }
+
+  if (!optimizer_utils::CheckOutputEdges(graph, k_concat, 2) ||  // k_concat and v_concat are each expected to have 2 output edges
+      !optimizer_utils::CheckOutputEdges(graph, past_k_transpose, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, past_k_gather, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, present_k_transpose, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, present_k_unsqueeze, 1) ||
+      present_concat.GetOutputEdgesCount() != 0 ||  // present_concat only has a graph output, but no output edges to other nodes.
+      !optimizer_utils::CheckOutputEdges(graph, present_v_unsqueeze, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, past_v_concat, 2) ||
+      !optimizer_utils::CheckOutputEdges(graph, past_v_gather, 1)) {
+    DEBUG_LOG("Output edge count not expected for nodes in past subgraph");
+    return false;
+  }
+  result.node_indices = {
+      k_concat.Index(),
+      past_k_transpose.Index(),
+      past_k_gather.Index(),
+      present_k_transpose.Index(),
+      present_k_unsqueeze.Index(),
+      present_concat.Index(),
+      present_v_unsqueeze.Index(),
+      past_v_concat.Index(),
+      past_v_gather.Index()};
+
+  result.past = graph.GetNode(past_v_gather.Index())->MutableInputDefs()[0];  // same name as past_k_gather's input 0 (checked above)
+  result.present = graph.GetNode(present_concat.Index())->MutableOutputDefs()[0];
+
+  DEBUG_LOG("Pass MatchPastSubgraph");
+  return true;
+}
+
+/** Check the following nodes (optional Concat is excluded) for path v:
+                                     v_Reshape  (shape=0,0,N,-1)
+                                      |
+                                    v_Transpose (perm=0,2,1,3)
+                                      |
+                                  [p_Concat?]
+                         \       /
+                       qkv_MatMul
+                              |
+                           Transpose (perm=0,2,1,3)
+                              |
+                           Reshape---[shape=0,0,-1]
+*/
+
+bool CheckNodesInPathV(const Graph& graph, const Node& reshape, const Node& transpose, const Node& qkv_matmul, const Node& v_transpose, const Node& v_reshape,
+                       int64_t& num_heads, int64_t& head_size, int64_t hidden_size, const logging::Logger& logger) {
+  DEBUG_LOG("Start CheckNodesInPathV");  // num_heads and head_size are in-out; they are overwritten from the v_reshape shape on success
+  // Internal nodes of attention subgraph only allow edges within the subgraph, and no graph output is allowed.
+  // No constraints for reshape node since it is the last node of Attention.
+  if (!optimizer_utils::CheckOutputEdges(graph, transpose, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, qkv_matmul, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, v_transpose, 1) ||
+      !optimizer_utils::CheckOutputEdges(graph, v_reshape, 1)) {
+    DEBUG_LOG("Output edge count not expected for nodes in path v");
+    return false;
+  }
+
+  std::vector<int64_t> perm;
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 2 && perm[2] == 1 && perm[3] == 3)) {
+    DEBUG_LOG("Failed in match Transpose attribute perm. Expected: 0, 2, 1, 3");
+    return false;
+  }
+
+  if (!(graph_utils::GetRepeatedNodeAttributeValues(v_transpose, "perm", perm) && perm.size() == 4 && perm[0] == 0 && perm[1] == 2 && perm[2] == 1 && perm[3] == 3)) {
+    DEBUG_LOG("Failed in match v_transpose attribute perm. Expected: 0, 2, 1, 3");
+    return false;
+  }
+
+  if (num_heads > 0 && head_size > 0 && hidden_size != num_heads * head_size) {  // caller-supplied values must be consistent with hidden_size
+    DEBUG_LOG("hidden_size != num_heads * head_size");
+    return false;
+  }
+
+  // Check reshape for q, k or v has shape input (0, 0, N, -1) or (0, 0, N, H)
+  std::vector<int64_t> v_reshape_shape;
+  if (!optimizer_utils::AppendTensorFromInitializer(graph, *(v_reshape.InputDefs()[1]), v_reshape_shape) ||
+      v_reshape_shape.size() != 4 ||
+      v_reshape_shape[0] != 0 ||
+      v_reshape_shape[1] != 0 ||
+      v_reshape_shape[2] <= 0 ||
+      v_reshape_shape[2] > hidden_size ||
+      (head_size < 0 && v_reshape_shape[3] != -1) ||  // a negative incoming head_size requires -1 in the last dimension
+      (head_size == 0 && v_reshape_shape[2] * v_reshape_shape[3] != hidden_size)) {  // zero requires N * H == hidden_size
+    DEBUG_LOG("v_reshape initializer value is not expected");
+    return false;
+  }
+
+  num_heads = v_reshape_shape[2];
+  head_size = v_reshape_shape[3];  // may be -1 when the shape constant ends with -1
+
+  // Check reshape for attention output has shape input (0, 0, -1) or (0, 0, N*H)
+  std::vector<int64_t> reshape_shape;
+  if (!optimizer_utils::AppendTensorFromInitializer(graph, *(reshape.InputDefs()[1]), reshape_shape) ||
+      reshape_shape.size() != 3 ||
+      reshape_shape[0] != 0 ||
+      reshape_shape[1] != 0 ||
+      (reshape_shape[2] != num_heads * head_size && reshape_shape[2] != -1)) {
+    DEBUG_LOG("reshape initializer value is not expected");
+    return false;
+  }
+
+  DEBUG_LOG("Pass CheckNodesInPathV");
+  return true;
+}
+
+bool CheckNodesInPathQ(const Graph& graph, const Node& qk_div, const Node& q_reshape, const Node& q_transpose, int64_t num_heads, int64_t head_size, const logging::Logger& logger) {
+  DEBUG_LOG("Start CheckNodesInPathQ");
+  // The shape input of q_reshape must be the constant (0, 0, num_heads, head_size).
+  const std::vector<int64_t> expected_shape{0, 0, num_heads, head_size};
+  std::vector<int64_t> actual_shape;
+  const bool has_const_shape = optimizer_utils::AppendTensorFromInitializer(graph, *(q_reshape.InputDefs()[1]), actual_shape);
+  if (!has_const_shape || actual_shape != expected_shape) {
+    DEBUG_LOG("q_reshape const not matched");
+    return false;
+  }
+  // The second input of qk_div must be a constant equal to sqrt(head_size).
+  // NOTE(review): a negative head_size (e.g. -1 taken from a Reshape shape ending in -1) makes sqrt return NaN - confirm.
+  const float sqrt_head_size = std::sqrt(static_cast<float>(head_size));
+  if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(qk_div.InputDefs()[1]), sqrt_head_size, false)) {
+    DEBUG_LOG("qk_div const not matched.");
+    return false;
+  }
+
+  const std::vector<int64_t> expected_perm{0, 2, 1, 3};
+  std::vector<int64_t> actual_perm;
+  if (!graph_utils::GetRepeatedNodeAttributeValues(q_transpose, "perm", actual_perm) || actual_perm != expected_perm) {
+    DEBUG_LOG("q_transpose perm attribute not matched");
+    return false;
+  }
+  DEBUG_LOG("Pass CheckNodesInPathQ");
+  return true;
+}
+
+bool CheckNodesInPathK(const Graph& graph, const Node& k_reshape, const Node& k_transpose, int64_t num_heads, int64_t head_size, const logging::Logger& logger) {
+  DEBUG_LOG("Start CheckNodesInPathK");
+  // k_transpose must carry perm = (0, 2, 3, 1).
+  const std::vector<int64_t> expected_perm{0, 2, 3, 1};
+  std::vector<int64_t> actual_perm;
+  if (!graph_utils::GetRepeatedNodeAttributeValues(k_transpose, "perm", actual_perm) || actual_perm != expected_perm) {
+    DEBUG_LOG("k_transpose perm attribute not matched");
+    return false;
+  }
+
+  // The shape input of k_reshape must be the constant (0, 0, num_heads, head_size).
+  const std::vector<int64_t> expected_shape{0, 0, num_heads, head_size};
+  std::vector<int64_t> actual_shape;
+  const bool has_const_shape = optimizer_utils::AppendTensorFromInitializer(graph, *(k_reshape.InputDefs()[1]), actual_shape);
+  if (!has_const_shape || actual_shape != expected_shape) {
+    DEBUG_LOG("k_reshape const not matched");
+    return false;
+  }
+  DEBUG_LOG("Pass CheckNodesInPathK");
+  return true;
+}
+
+// Add a Cast node that converts the mask to int32 and return the NodeArg holding the Cast output.
+NodeArg& CastMaskToInt32(Graph& graph, NodeArg* mask_input, ProviderType provider_type) {
+  // Describe the Cast output: an int32 tensor declared with two dimensions.
+  TypeProto int32_type;
+  auto* tensor_type = int32_type.mutable_tensor_type();
+  tensor_type->set_elem_type(TensorProto_DataType_INT32);
+  auto* out_dim0 = tensor_type->mutable_shape()->add_dim();
+  auto* out_dim1 = tensor_type->mutable_shape()->add_dim();
+  // The dimensions are copied from mask_input only when its shape is known and has exactly two dims.
+  const TensorShapeProto* input_shape = mask_input->Shape();
+  if (input_shape != nullptr && static_cast<size_t>(input_shape->dim_size()) == 2) {
+    *out_dim0 = input_shape->dim(0);
+    *out_dim1 = input_shape->dim(1);
+  }
+  NodeArg& int32_mask = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName("Mask_Int32"), &int32_type);
+  const std::vector<NodeArg*> cast_inputs{mask_input};
+  const std::vector<NodeArg*> cast_outputs{&int32_mask};
+  Node& cast_node = graph.AddNode(graph.GenerateNodeName("MaskCast"),
+                                  "Cast",
+                                  "Cast mask from int64 to int32",
+                                  cast_inputs,
+                                  cast_outputs,
+                                  nullptr,
+                                  kOnnxDomain);
+
+  // Attribute "to" selects the target element type (INT32).
+  ONNX_NAMESPACE::AttributeProto to_attr;
+  to_attr.set_name("to");
+  to_attr.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType::AttributeProto_AttributeType_INT);
+  to_attr.set_i(static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_INT32));
+  cast_node.AddAttribute("to", to_attr);
+  cast_node.SetExecutionProviderType(provider_type);
+  return int32_mask;
+}
+
+NodeArg* GetOrCreateMaskInt32(
+    Graph& graph,
+    NodeArg* mask_input,
+    std::map<std::string, NodeArg*>& mask_int32_map,
+    ProviderType provider_type) {
+  // Reuse the int32 mask already created for a mask input of the same name, if any.
+  const auto cached = mask_int32_map.find(mask_input->Name());
+  if (cached != mask_int32_map.end()) {
+    return cached->second;
+  }
+
+  // Otherwise add a Cast node for this mask input ...
+  NodeArg* mask_int32 = &CastMaskToInt32(graph, mask_input, provider_type);
+  // ... and cache its output under the mask input's name.
+  mask_int32_map.emplace(mask_input->Name(), mask_int32);
+  return mask_int32;
+}
+
+/** Fuse Attention SubGraph.
+@remark add_after_layer_norm is the Add node in the bottom of sub-graph.
+ Abbreviations: B is batch_size, S is sequence_length, W is hidden_size, P is past sequence length,
+               N is number of attention heads, H is head size, and W=N*H, h=Sqrt(H)
+               B and S could be symbolic. ? means it is optional.
+    Graph before Fusion (q_, k_, v_, qk_, qkv_ and mask_ prefix is added before Operator type):
+                  Add
+               /       \ [Input](BxSxW)
+              /         \ 
+             /   LayerNormalization
+            /            |
+           /       {Gemm_Subgraph} <---[weights](Wx3W); [Bias](3W)
+          |               |
+          |             Split
+          |          /     |     \
+          |         /      |      \
+          | q_Reshape   k_Reshape   v_Reshape  (shape=0,0,N,-1)
+          |         |        |        |
+          |q_Transpose  k_Transpose v_Transpose
+          |  (0,2,1,3)  (0,2,3,1)    (perm=0,2,1,3)
+          |   \          /            |                       [Past]?
+               \        /             |                          |
+          |     \    p_Concat? <------|---------------------{Past_Subgraph}?
+          |      \    /               |                          |
+          |      qk_MatMul            |                          |
+          |           |    [B=h]      |                          |
+          |           |   /           |                         /
+          |        qk_Div         p_Concat? <------------------
+          |            |              |
+          | {Unidir_Mask_Subgraph}    |                             [Mask]?
+          |            |              /                               |
+          |       mask_Add? <--------/---------------------{Attention_Mask_Subgraph}?
+          |            |            /
+          |          Softmax       /
+          |             \         /
+          |              \       /
+          |            qkv_MatMul
+          |                   |
+          |                Transpose (perm=0,2,1,3)
+          |                   |
+          |                Reshape---[shape=0,0,-1]
+          |                   |
+          |                 {Gemm_Subgraph} <---[weights](WxW); [Bias](W)
+          |                  /
+          +--------------> Add
+
+After Fusion:
+
+      Add
+      |   \
+      |  LayerNormalization [Weights] [Bias]   [Mask]?  [Past]?
+      |                 \   |        /         /         / 
+       \                 \  |       /         /         /
+        \                 Attention <------------------
+         \                |       |
+          \      {Gemm_Subgraph}  v
+           \              |       [Present]?
+            \             |
+             \            /
+              --------> Add
+TODO: replace Gemm_Subgraph by MatMul + Add
+*/
+bool FuseGptAttention(Node& layer_norm, Graph& graph, int64_t hidden_size, std::map<std::string, NodeArg*>& mask_int32_map, const logging::Logger& logger) {
+  DEBUG_LOG("Start FuseGptAttention");
+  const Node* parent_node = graph_utils::GetInputNode(layer_norm, 0);
+  if (nullptr == parent_node || !graph_utils::IsSupportedOptypeVersionAndDomain(*parent_node, "Add", {7, 13}, kOnnxDomain)) {
+    return false;
+  }
+
+  const Node* add_after_gemm = graph_utils::FirstChildByType(*graph.GetNode(parent_node->Index()), "Add");
+  if (nullptr == add_after_gemm) {
+    return false;
+  }
+
+  MatchGemmResult gemm1_result;
+  if (!MatchGemmSubgraph(graph, *graph.GetNode(add_after_gemm->Index()), 1, gemm1_result, logger) ||
+      !ValidateGemmInitializer(graph, *gemm1_result.gemm, hidden_size, false, logger)) {
+    return false;
+  }
+
+  std::vector<graph_utils::EdgeEndToMatch> path1{
+      {0, 0, "Reshape", {5, 13}, kOnnxDomain},
+      {0, 0, "Transpose", {1, 13}, kOnnxDomain},
+      {0, 0, "MatMul", {1, 9}, kOnnxDomain}};
+
+  std::vector<const Node::EdgeEnd*> edges;
+  if (!graph_utils::FindPath(*gemm1_result.input_node, true, path1, edges, logger)) {
+    DEBUG_LOG("Faild to find path to qkv_matmul");
+    return false;
+  }
+
+  const Node& reshape = edges[0]->GetNode();
+  const Node& transpose = edges[1]->GetNode();
+  const Node& qkv_matmul = edges[2]->GetNode();
+
+  const Node* v_concat = graph_utils::GetInputNode(qkv_matmul, 1);
+  if (v_concat == nullptr) {
+    return false;
+  }
+
+  bool has_past = graph_utils::IsSupportedOptypeVersionAndDomain(*v_concat, "Concat", {4, 11, 13}, kOnnxDomain);
+
+  std::vector<graph_utils::EdgeEndToMatch> path2{
+      {0, 1, "Transpose", {1, 13}, kOnnxDomain},
+      {0, 0, "Reshape", {5, 13}, kOnnxDomain},
+      {2, 0, "Split", {2, 11, 13}, kOnnxDomain}};
+
+  if (!graph_utils::FindPath(has_past ? *v_concat : qkv_matmul, true, path2, edges, logger)) {
+    DEBUG_LOG("Faild to find path v to Split");
+    return false;
+  }
+
+  const Node& v_transpose = edges[0]->GetNode();
+  const Node& v_reshape = edges[1]->GetNode();
+  const Node& v_split = edges[2]->GetNode();
+
+  MatchGemmResult gemm0_result;
+  if (!MatchGemmSubgraph(graph, *graph.GetNode(v_split.Index()), 0, gemm0_result, logger) ||
+      !ValidateGemmInitializer(graph, *gemm0_result.gemm, hidden_size, true, logger)) {
+    return false;
+  }
+
+  const Node* gemm0_parent = graph_utils::GetInputNode(*gemm0_result.input_node, 0);
+  if (gemm0_parent == nullptr || gemm0_parent->Index() != layer_norm.Index()) {
+    return false;
+  }
+
+  int64_t num_heads = 0;  // will be updated in CheckNodesInPathV
+  int64_t head_size = -1;
+  if (!CheckNodesInPathV(graph, reshape, transpose, qkv_matmul, v_transpose, v_reshape, num_heads, head_size, hidden_size, logger)) {
+    DEBUG_LOG("CheckNodesInPathV return false");
+    return false;
+  }
+
+  if (!optimizer_utils::CheckOutputEdges(graph, v_split, 3)) {
+    DEBUG_LOG("Output edge count not expected for nodes in path v");
+    return false;
+  }
+
+  // Find input mask. Unsqueeze -> Unsqueeze -> (Cast) -> Sub -> Mul -> Add -> Softmax
+  AttentionMaskNodes mask_nodes;
+  if (!MatchInputMaskSubgraph(graph, qkv_matmul, mask_nodes, logger)) {
+    DEBUG_LOG("MatchInputMaskSubgraph returns false");
+    return false;
+  }
+
+  MatchUnidirMaskResult unidir_mask_result;
+  if (!MatchUnidirMaskSubgraph(graph, *(mask_nodes.add), unidir_mask_result, logger)) {
+    DEBUG_LOG("MatchUnidirMaskSubgraph returns NULL");
+    return false;
+  }
+
+  // path to q
+  std::vector<graph_utils::EdgeEndToMatch> q_path{
+      {0, 0, "MatMul", {1, 9, 13}, kOnnxDomain},
+      {0, 0, "Transpose", {1, 13}, kOnnxDomain},
+      {0, 0, "Reshape", {5, 13}, kOnnxDomain},
+      {0, 0, "Split", {2, 11, 13}, kOnnxDomain}};
+
+  const Node* qk_div = unidir_mask_result.div_node;
+  if (!graph_utils::FindPath(*qk_div, true, q_path, edges, logger)) {
+    DEBUG_LOG("Failed to find path for q");
+    return false;
+  }
+
+  const Node& qk_matmul = edges[0]->GetNode();
+  const Node& q_transpose = edges[1]->GetNode();
+  const Node& q_reshape = edges[2]->GetNode();
+  const Node& q_split = edges[3]->GetNode();
+  if (q_split.Index() != v_split.Index()) {
+    DEBUG_LOG("q and v are not from same Split node");
+    return false;
+  }
+
+  if (!CheckNodesInPathQ(graph, *qk_div, q_reshape, q_transpose, num_heads, head_size, logger)) {
+    DEBUG_LOG("CheckNodesInPathQ returns false");
+    return false;
+  }
+
+  const Node* k_concat = nullptr;
+  if (has_past) {
+    k_concat = graph_utils::GetInputNode(qk_matmul, 1);
+    if (k_concat == nullptr || !graph_utils::IsSupportedOptypeVersionAndDomain(*k_concat, "Concat", {4, 11, 13}, kOnnxDomain)) {
+      return false;
+    }
+  }
+
+  // path to k
+  std::vector<graph_utils::EdgeEndToMatch> k_path{
+      {0, 1, "Transpose", {1, 13}, kOnnxDomain},
+      {0, 0, "Reshape", {5, 13}, kOnnxDomain},
+      {1, 0, "Split", {2, 11, 13}, kOnnxDomain}};
+
+  if (!graph_utils::FindPath(has_past ? *k_concat : qk_matmul, true, k_path, edges, logger)) {
+    DEBUG_LOG("Failed to find path for k");
+    return false;
+  }
+
+  const Node& k_transpose = edges[0]->GetNode();
+  const Node& k_reshape = edges[1]->GetNode();
+  const Node& k_split = edges[2]->GetNode();
+  if (k_split.Index() != v_split.Index()) {
+    DEBUG_LOG("k and v are not from same Split node");
+    return false;
+  }
+
+  if (!CheckNodesInPathK(graph, k_reshape, k_transpose, num_heads, head_size, logger)) {
+    DEBUG_LOG("CheckNodesInPathK returns false");
+    return false;
+  }
+
+  MatchPastResult past_result;
+  if (has_past && !MatchPastSubgraph(graph, *k_concat, *v_concat, past_result, logger)) {
+    DEBUG_LOG("MatchPastSubgraph returns false");
+    return false;
+  }
+
+  // Now everything is ready, we will start fusing subgraph.
+  NodeArg* mask_input = graph.GetNode(mask_nodes.unsqueeze_1->Index())->MutableInputDefs()[0];
+  NodeArg* mask_int32 = GetOrCreateMaskInt32(graph, mask_input, mask_int32_map, layer_norm.GetExecutionProviderType());
+
+  NodeArg* qkv_weights = graph.GetNode(gemm0_result.gemm->Index())->MutableInputDefs()[1];
+  NodeArg* qkv_bias = graph.GetNode(gemm0_result.gemm->Index())->MutableInputDefs()[2];
+
+  // Create Attention Node.
+  std::vector<NodeArg*> input_defs{layer_norm.MutableOutputDefs()[0], qkv_weights, qkv_bias, mask_int32};
+  std::vector<NodeArg*> output_defs{graph.GetNode(reshape.Index())->MutableOutputDefs()[0]};
+
+  if (has_past) {
+    input_defs.push_back(past_result.past);
+    output_defs.push_back(past_result.present);
+  }
+
+  Node& attention_node = graph.AddNode(
+      graph.GenerateNodeName("Attention"),
+      "Attention",
+      "Fused Attention subgraphs ",
+      input_defs,
+      output_defs,
+      nullptr,
+      kMSDomain);
+  attention_node.AddAttribute("num_heads", num_heads);
+  attention_node.AddAttribute("unidirectional", (int64_t)1);
+
+  // Assign provider to this new node.
+  attention_node.SetExecutionProviderType(layer_norm.GetExecutionProviderType());
+
+  // Remove nodes that are not used anymore.
+  std::vector<NodeIndex> nodes_to_remove{
+      reshape.Index(),
+      transpose.Index(),
+      qkv_matmul.Index(),
+      v_transpose.Index(),
+      v_reshape.Index(),
+      v_split.Index(),
+      qk_div->Index(),
+      qk_matmul.Index(),
+      q_transpose.Index(),
+      q_reshape.Index(),
+      k_transpose.Index(),
+      k_reshape.Index()};
+
+  nodes_to_remove.insert(nodes_to_remove.end(), unidir_mask_result.node_indices.begin(), unidir_mask_result.node_indices.end());
+  nodes_to_remove.insert(nodes_to_remove.end(), gemm0_result.node_indices.begin(), gemm0_result.node_indices.end());
+  if (has_past) {
+    nodes_to_remove.insert(nodes_to_remove.end(), past_result.node_indices.begin(), past_result.node_indices.end());
+  }
+  SetMaskNodesToRemove(graph, mask_nodes, nodes_to_remove);
+
+  for (const auto& node_index : nodes_to_remove) {
+    Node* node = graph.GetNode(node_index);
+    graph_utils::RemoveNodeOutputEdges(graph, *node);
+    graph.RemoveNode(node->Index());
+  }
+
+  DEBUG_LOG("Fused an attention node for GPT.");
+  return true;
+}
+
+};  // namespace AttentionFusionHelper
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 37e8f95465..6ab3de3888 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -1718,6 +1718,24 @@ TEST_F(GraphTransformationTests, AttentionFusionFloat32Test) {
   ValidateAttention(graph);
 }
 
+// Test GPT-2 Attention Fusion with float32 mask, using the gpt2_past_mask_one_layer.onnx model (past state + attention mask, per the test and file names)
+TEST_F(GraphTransformationTests, AttentionFusionGPTWithPastAndMaskTest) {
+  auto model_uri = MODEL_FOLDER "fusion/gpt2_past_mask_one_layer.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  graph_transformation_mgr.Register(onnxruntime::make_unique<AttentionFusion>(), TransformerLevel::Level2);  // AttentionFusion is the only transformer registered
+  auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_);
+  ASSERT_TRUE(ret.IsOK());
+
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["Transpose"], 0);  // no Transpose node may remain after the transform
+  EXPECT_EQ(op_to_count["Softmax"], 0);  // no Softmax node may remain after the transform
+  EXPECT_EQ(op_to_count["Attention"], 1);  // exactly one Attention node is expected in the graph
+}
+
 TEST_F(GraphTransformationTests, GeluFusionTest) {
   auto model_uri = MODEL_FOLDER "fusion/gelu.onnx";
   std::shared_ptr<Model> p_model;
diff --git a/onnxruntime/test/testdata/transform/fusion/gpt2_past_mask_one_layer.onnx b/onnxruntime/test/testdata/transform/fusion/gpt2_past_mask_one_layer.onnx
new file mode 100644
index 0000000000000000000000000000000000000000..bba15177ce30378af6c38a089ebe542e054ab8ff
GIT binary patch
literal 10363
zcmcIqd3;k<_9y9*^wHK5DMDDp00jb+x95fCzLtXGR2Byi{h;&7CoRE7yM!iHK-41Q
z2tQnY;({ZNjEs(gxZ(nR_fc`g1yM#FWn^Sz+&35%#%1QGzjI#lUS8US;SZ<%e3F-Y
z&%O7Yd(QW4iM!b2iZ4sHcQiG73%F`&${TYSo+*w;63H;j6bM4V122lVWJV;}9PJ46
zystR*=<m-IMwZ4BgZpiAI2jpf6{HP%OXqh)+7j{hL^R9`z7p$`e*jY$k9M@y^{aE#
z71TL5Ib6&cOi8RQ-kA)?79~6dT#)s;=QT&-uu}9D8Bfe)2FLM*Lkd2_i<FxegO>Lc
z@T}l1J+CdXq%#^_4m--e66;fn`)HyLxOXuuQz%G5k-;UnPV|;cYj10cB*T2bSDbpx
zmUj|U3IfKGvGz7(9aoCDQt~o$qluIRiqDmPW`4m8PvWI4ZtrM~c2r*;k5*qEja}TF
z40l8mvE@-*FTr{)n2|1+TROk(#_&>Pyp%xZSb@!dY_iX>uoREYC|KZfA3F?K<P>}f
z!UCT47EfKYC@k>40(fV795BPPjL$A*b2VGt(iY}=i8?YTO=B#QNM8o*F9f`mGb78Q
z9kb!WA}z7y5#y!>!RP5a40h#;#uKrY_O{B0o*x_7<Z$>0rj7F_^UncR%6k`gCZdZ*
zT3d`vbT(ex5s5b^MlwS|0<?Bs<V+hB>^2DFKp~=CiGrmiA-@d*8PH0K-3EpHHfWs>
zXiH;22-q;d3wZki-oD6Y&R24)3orm=czz)3{9s-TNU%Z*;EQ6Q0k1-agVI0{+1x;|
z*cL^TZILtC_9GVvD-rwyFXHYZnz@J~w#1sEVUY(~d}1czBzO>B40@S)Y0Qd(&y{}e
zjaVc^`FOj0qL^2{6vmMc33Vue4o%TOhq*958fXRMMhr!Y8H|Z?24fGQ=}RIrzPDLN
zlF7E}rm*pr4fe4oBFzSZBzavkqOGlAG2ko22mdKdQKTi_9I5Qrud5bKy__j*j3zUm
z%a~$g|EY#&_M0E;Q;r&il2F$&T8GTpgtGaoDyHbt=(1|Cc!2{tfwR#`HV1=lp98ln
zaG-|ymfM%O+-~0qZXbZ?V!)IkQs83Hg&=}YPwPU0_>$H|P6#vO7#G+^hrt`Kfb-)O
zoAZ;vUQC~hs#eg76@Z|PH!u<FlPx4UF(@H#K3s<kBGv^k)sk6}<gCt?umpCJdSt*G
z!VJEE4GI!?!|Ach!jkMOfOm8yFoqKW6BCjJWxxcC;|t0!V~~|FaDpn*n2;13CSV*V
z`3xqcV17&hAy5ah;-1W&028=1?l0R>B!l|$M9Pe4mY0+f!Wi)wLo~D*Nx&BXzJOR0
zz=+eL$%0)p34DPd$r;gPF^_0?-!hN;fyM<{@D|VOYz)hiuK?a5eB&5SNLga!ceb;b
zK2gZPsSSL%C*TCG2Hc^{qFAz_r=TSa>eTRE#$?c-LnZ@)@raiYKoJA7wZu{_;1I@f
zGT?Ph!*ojy`U>&EU&1((?So4&t*SyQ!&uOL$(UmZ0bt;K$i<@J0P8C<p3vfqNX`YV
z;1RIAu#rH4v2sClm~19^1o9+v7(7^X4{TQO2tYEI$1IE$0bo%IqJL*0jY$JKAX<RV
zN*E_I<#6NCHY2XLMiQ6enaCf<W?WK0%D@?qKwrdzFS2EIft*S<oQ$yo(8Hs1vKS8s
z0=dL6vQZ<+!FYvkiRYx0nFW-TnHjU_IyrPZT)<n5ULg?l6~MdU5&{5em;fay=>$n(
zQwb78HyFbS@TBwF7bjaIOT!8W=AL@CvP*D5&a4a^qqqP$iJ5^bd6Wi@@v=xuXVh@o
z0vd{7OGcG!#|U#s0I0WIVOaT3j?wUUXzBvmEha%}9V?jtYC7#7?RgivR1h3qFeMTc
zaQA7Ks>qg0#ZaQ4w@|=)*|-92S)X#bYXp)4P!n0;N(76w5^%*9&;TGX2Lxa}>Fi1g
zntlra0^|4s=Zjn>y9+|nldIYR%K|c><+HN8;0gU^>U`D$XwdSTcs>iaD<S|y0D!6M
zpm2io!7fBT3!uUL*$<d45aSDGcd~I;LMFQlW;B?E(#*A4*<DaJuMOP2$N^eJ4s<7S
zxH|`CB|<fkcK1QcLxTOoI8NHKyC5hiUv>wCC|?$GgZZ;p&XCHG5_QN6UM6jv9GfR|
z9Rn<}#+4YF6ute;%8g?Im}VNW4e~pY&!6ALJK8VlRpI2z?+j8~h(SgnG#dFG$MQc<
zf`?-9co6~`h``y!vN?g_;HjWsLkcA0y)LZhI95#gt7lPafzMgc@x8?D6H1}z<Eyjf
z1aC3OD!VcT9mh)ly^0R}wTKQ!6a(IpR9AsxE$c<uu_6(%=n%2!NVhmR7Ha-ns7DWh
z`JO0?-m*+Q;aDYCKrtE%85W?WV6tHO9fJh~5EezE2#Ju%0++|}7%ZTlgA~Zl0+-+O
zAU0+6kFscB1f_*^&w>Y+$+)wiZ?O*9t;srv>6?-2O@Iw0<X?h>N5p(jl+chQloNBJ
zFvnr_jE~GDoP~%aLL%7TEbTZBf_qN%qmY;yicmXe37Iq08~2=oKWY<mX;A`gn(*iH
zbZ`(SqQSSJQr3nO&K5ZSfIxsOy(($z-(Y8>wIvSK-QuQRU>Kd3E*lV-?x%pyLEiUs
zRCXwkjCUs+1`@Cd#M?@?nMi?=5K@%VvmoD`5pC%Vb3EsBo1ge0sXob!bLaCoYipBC
zDQjwPt!|Aqb+jiSbHS_!Hp1O91`a*Kw3VQNC}`jcs#CEUI5Y<HVhpP~iRH3%5ADEw
zxo3t2NRrYR<E1PX@K!Ll5Fxj)Fb4g={49XmkRVgGn;*0S^RWOJ18RIR3=z{*37U_V
z4f4>XN!g>7=YiQeZMT>4P`u?H5Mv73v;t|Rh%Hc2LF*2poGFY!v(JVxY><l3AkAP5
zx_S9A7Q`C~N?AfeLoc6@DU2Dcpf-ax9|C>3obsVbD#$l>DMjc~W<o6V)~pb_6)_eJ
zFynG#Cr5<MgT@>I77L;j&Y)HIv*9cV*&1jV)ojljpuJ}a1pyWVC{iF9^l^*;3q3uf
zV~Ta6kqv{}FSi8~9@;DUG+-p)pfmw`ZRJ){4A{-rZ?W3!#%Wj+h*Gc9^6MKc56JOF
z_LRh-PYuOd7~G`kSfPcMTkEqSwScaL*{AWOT1g(WJb~me-V#fOMKpALvM>QPnA2au
z6eV!9t`PsBA>*kqzMhzLNh~=jx-{P2k@OFAJ1SiDM$f*%ZB`n#Q7PxBoO3kL;4bZ#
z`m=U`a=*dt>^sUgxYL9EY_vjyyTGI^uW$fgP^dQ)_NyDV$3~TlC&<Ga+=b>^Q!1GS
zp87~7vr2=zqAzVQ7cIxy_MPA^gxi8%8{054z215=g|v<vfTZwMOM}~Co|PVggUn_A
zG7u9gU6c~Kkmg4@Ewr%!TGh4%sUZ<A(AxG?Ly6GFg`2T9wg&CWvIaBva<xA#74!g4
zRx0>1%XX5%ks!iFSsWQsK})Zp@SDOXYyIhCp@(H#W(|VY%_28LDcY1_QG(uj%Ljdu
z9F#IJlUBsl)}Asfw>UKS48WaPgN7tfJKBy(4RJZH8%i`AU3x#L;H;B>GEatz&KkVh
zpV~1?SLvY~{6Q|5;}E<@%g)S`CAddR=G2ag{l3#fc6rUEN{)+#SZVE-ktArp*ir5R
zv~lPcGz>QNV!m7K4Rt$HK#&_M%t4dLG(ABOPr~#bak?Gw4}TR7Ul-u5tJd5o_dCv?
zh{l}HM`NGQMbmq}H%+d2J59cMljcP8UA6Uns>R-oSAW=D3RlGz96he~TvNR3<`)m@
z@`rQv`(JVDm$pAgCv4fS@7%DF9xV*fE1w=h6W2zxrvgN;&sTQ$IpCPXU1*%q<EY>G
zwZ7|5tM&P_UZjtYenOX?jH;LKn@mIDx72qx@Ty}^o!;?7jT*jh;I0u%p3xRCPY}o9
zLF(AHq`LaS_3C;4s@e~CPEwEmYb9;GeZLyL<zL$OOI}xtcK(ihd2oXI<Of&PTrmAc
z^1;4t_1*2a(&_vVy<z><&{v)w`q5#xe%@J2$&SBA)FTrs=%vd-`riGG>Z0Ry`g0o=
zYq#(Ci#qkt)#_)vHtD){in`2qi+cURJ*0W*Ub0}+3{5}0O07Tnubuq05C<H-3g)9B
z>NEcwu3uO+NWY?BFU@;zy|XEQ0<I0WsJnC6CiRz}*6P!reUVhI{Z5-xwp<@~ZG*nx
z!}sWEN6JVI_gDS-(?+PjZ@7qTd3Kog`TOI^=CixC`giKpzw6KK%*%D(RL`%ZxSsJ-
zy!)}eheC(PuA=%q_jdQSBjc-PTao3Tl3Mw<^^4T-yeaD9;ltJTv!|=|C-Eh<dFtc;
zH;HWATtqfktgqQWWMwGlsad@a^VDyL7OTw{El|H*H&0!;c2jN6Q+st%{4MSZyxx+t
zeWdh??X>gmQ*`wu7k%NTRoeV7;@Yh827TbZf0LFak?z(vkJfbFHi8!4P^+#!|9|K!
zXK8fovm59LZ;MucQbnl$F(>E~5A1Rt`=W=;;9nveuPmU~U%Q6Xw0%LpyvASi^4g6x
z%`&5JeEWWFLf2F(|7w^z;Lr=?p6jO2KZZ&|YhIM-FJ8Ts^sI2`kF>0zv%mj7v}f})
z?Y60euK4}T?nUQ5LNxhtx?|@&+P~@>Qqd77Yron^D#y>DUzM$+^vE=__>vn#L#1;_
z+3vwK{^OZ++oPMi8>{|J=)^L$=AYx#bJkU9$8K=atFGTiznd2)<>iBR2Clo8ZY+O|
zK0k5`b^T#|_sknr?Sq+Ts8?KdjXL11)AY_!Yt%RE0(#*+tMq$|_R@8S=IJlre1Kj*
z=P_;0_Ca(`Nm6~cc%6E9=kN4pWrbd!r=aLv-uu;*P0hVTA=9f0$*l9Aq#tz`Y5CUo
zfzT69cZIP<m-Dju@6eXSF!j?7E^?%5cW7>9B?(SU(3Ly4kx~7FnycV$`fYPb$WyYL
z{(1Hq@>hkD$DeVKQIFP=S%a35e>^;d9vridl<ZnVcD=NZHZM3p9S2_vB_AG6rd`X>
z;}4#y4yfL*y}7!QupLL~sL$`BJwvZ0`_COgZiqcZPdVn)3)$DYSA8&=TsW{x8#L{{
z(5fk=x-ar6rDgY#>sP%Rn)=*V+Uz|yk;y%ky8ht-ZRN=K$)7&@j!b{}Y100}2JP;a
z`@1!=P@D0^E###K?j&d5x|K}1Qy^dOI7W`|+M*R4_?U{z_teZ47tmPaRC-(O3*@GS
zZ)>}vM`-;?mwWyPFn*%v>2iEJ={K5pT!a4GuUFMf`Xm@ye-_x`c^6S_&Te{e<vi{C
zs<k!SH*BM&=Y+|$?>2Pz+<fbfx5nH-r;Rv3E_~fTrD@s^HI-}kX@BYGRG;|#mF~CZ
z4i8P*%g{rSVpYFqMfX)VPOjNn{eZUb+hgR)gxWpx;Fy{ZetB;9`<MNL+|lpU?)nq|
zkjMXt(>>5&wa5ABm51q=w?<NWmy7HxDN#SF8AEOwIfI_rPKf*2m+1woo%$UMr<2oG
zht%DlJw@MrkJ9DyH|qP(yg>E5F@atan?q->dY^v1<p4SI<14iDXdtw?+pBN;vR?i5
z()ppscW<TRZgr9y%GVL!tHbr$$|C*Ow_TuL@o=%aX5ACyp*frB1GoP~x3nxK$<H>E
zWsRfh)aeVzd*gl}cYfDF_jdWHR`6%KqTo&PsP}aJ?u$a9*QQVHe&fPI^17>$c;i=)
zp<Ji-_Rfn*`8Q9|7nxde)uf=d<&;lCbKfE4#%0CyixrIie*ZJbfUuzY%dexeUg*$n
zdAf_Z&n~0)Z97EkT1V=g)Ao|;zq!=IKi)-;a}Ut-mlUi2JKjUyuIQ$NAN)pp?7oTW
z_~N7FS3MuouP5;IjgK`lPW%n+b=AFpyfnAp!il!uLU~SShPz#!Axx=<X^bSBn#0Bq
zt4chsM082GCEB)diut=X56e`$9Ued9bUS;I<4C!u)cE~ddL2?NRC|A2>6ys*VUMa_
z>j!xT+Sg7n&z_w9d$Iq>Y~z_srsno2j0|V-dHBt9CQ>&RVFN;{cUJVTKaDvxt9Mu3
z*3^7?6BKf%Geff{T3Z%JBgxJV_+eT{0tm_(CK9V|Z);l$UC#<<{XnL|{0tqNSX->^
cVxStv3^qrhTX0z{dU>^Nb^w#zIjnH}KX(CzIsgCw

literal 0
HcmV?d00001