diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index 40f70e0b6b..1e6d46963c 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -2408,6 +2408,8 @@ This version of the operator has been available since version 1 of the 'com.micr
#### Attributes
+- mask_filter_value : float
+- The value to be filled in the attention mask. Default value is -10000.0f
- num_heads : int (required)
- Number of attention heads
- past_present_share_buffer : int
diff --git a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
index c45b5a79e5..6111afbd5d 100644
--- a/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/quantization_defs.cc
@@ -952,6 +952,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
.Attr("past_present_share_buffer", "Corresponding past and present are same tensor, its shape is "
"(2, batch_size, num_heads, max_sequence_length, head_size)",
AttributeProto::INT, OPTIONAL_VALUE)
+ .Attr("mask_filter_value",
+ "The value to be filled in the attention mask. Default value is -10000.0f",
+ AttributeProto::FLOAT,
+ OPTIONAL_VALUE)
.Attr("scale",
"Custom scale will be used if specified. Default value is 1/sqrt(head_size)",
AttributeProto::FLOAT,