diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index a62d757aa9..656f0e86d2 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -1126,6 +1126,8 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Attributes
+- mask_filter_value : float
+- The value to be filled in the attention mask. Default value is -10000.0f
- num_heads : int (required)
- Number of attention heads
- past_present_share_buffer : int
diff --git a/onnxruntime/core/graph/contrib_ops/bert_defs.cc b/onnxruntime/core/graph/contrib_ops/bert_defs.cc
index ae9e0c1324..b205b64954 100644
--- a/onnxruntime/core/graph/contrib_ops/bert_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/bert_defs.cc
@@ -473,6 +473,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
"Custom scale will be used if specified. Default value is 1/sqrt(head_size)",
AttributeProto::FLOAT,
OPTIONAL_VALUE)
+ .Attr("mask_filter_value",
+ "The value to be filled in the attention mask. Default value is -10000.0f",
+ AttributeProto::FLOAT,
+ OPTIONAL_VALUE)
.Input(0,
"input",
"Input tensor with shape (batch_size, 1, input_hidden_size)",
@@ -571,7 +575,7 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
.Input(1,
"key",
"Key with shape (batch_size, kv_sequence_length, hidden_size), or packed KV with shape (batch_size, kv_sequence_length, num_heads, 2, head_size), "
- "or past_key with shape (batch_size, num_heads, kv_sequence_length, head_size)",
+ "or past_key with shape (batch_size, num_heads, kv_sequence_length, head_size)",
"T",
OpSchema::Optional)
.Input(2,
diff --git a/onnxruntime/python/tools/transformers/convert_generation.py b/onnxruntime/python/tools/transformers/convert_generation.py
index a7ea3b4e05..22690dc18e 100644
--- a/onnxruntime/python/tools/transformers/convert_generation.py
+++ b/onnxruntime/python/tools/transformers/convert_generation.py
@@ -1106,6 +1106,7 @@ def update_decoder_subgraph_use_decoder_masked_multihead_attention(
"past_present_share_buffer",
"num_heads",
"scale",
+ "mask_filter_value",
"domain",
]