diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 9f5cd4cc84..955957f295 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -1268,6 +1268,7 @@ Do not modify directly.*
|BiasSplitGelu|*in* X:**T**
*in* bias:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|ConvTransposeWithDynamicPads|*in* X:**T**
*in* W:**T**
*in* Pads:**tensor(int64)**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|DequantizeLinear|*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)|
+|DynamicQuantizeMatMul|*in* A:**T1**
*in* B:**T2**
*in* b_scale:**T1**
*in* b_zero_point:**T2**
*in* bias:**T1**
*out* Y:**T1**|1+|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)|
|EmbedLayerNormalization|*in* input_ids:**T1**
*in* segment_ids:**T1**
*in* word_embedding:**T**
*in* position_embedding:**T**
*in* segment_embedding:**T**
*in* gamma:**T**
*in* beta:**T**
*in* mask:**T1**
*in* position_ids:**T1**
*out* output:**T**
*out* mask_index:**T1**
*out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|FusedMatMul|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|FusedMatMulActivation|*in* A:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h
index da57c2aa23..64ea5b7801 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DirectMLSchema.h
@@ -1865,6 +1865,25 @@ constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA {
DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA_FIELDS,
};
+constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] { // seven input tensor fields followed by one output tensor field
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false },
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false },
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true }, // NOTE(review): the trailing 'true' presumably marks the tensor as optional - confirm against the DML_SCHEMA_FIELD declaration
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false },
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false },
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true },
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true },
+ DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false },
+};
+
+constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA { // schema entry tying the operator name and type to its field table
+ "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT",
+ static_cast(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT),
+ DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE,
+ 8, // field count; equals the [8] extent of DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS
+ DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS,
+};
+
constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS[9] {
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false },
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false },
@@ -1885,25 +1904,6 @@ constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHE
DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS,
};
-constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] {
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true },
- DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false },
-};
-
-constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA {
- "DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT",
- static_cast(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT),
- DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE,
- 8,
- DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS,
-};
-
constexpr DML_SCHEMA_FIELD DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA_FIELDS[11] {
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false },
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true },
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp
new file mode 100644
index 0000000000..c6a87da705
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorDynamicQuantizeMatMul.cpp
@@ -0,0 +1,173 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "precomp.h"
+
+namespace Dml
+{
+// DynamicQuantizeMatMul = MatrixMultiplyIntegerToFloat(DynamicQuantizeLinear(A), B), built here as a two-node DML operator graph (node 0: DynamicQuantizeLinear, node 1: MatrixMultiplyIntegerToFloat).
+class DmlOperatorDynamicQuantizeMatMul : public DmlOperator
+{
+ // Operator input positions; this order matches the ONNX schema (A, B, b_scale, b_zero_point, bias).
+ enum OnnxInputIndex
+ {
+ A, // Input that is dynamically quantized by node 0 before the matrix multiply
+ B,
+ B_scale,
+ B_zero_point,
+ Bias,
+ Count, // number of inputs; sizes the nodeToNodeInputIndex table in the constructor
+ };
+
+public:
+ DmlOperatorDynamicQuantizeMatMul(const MLOperatorKernelCreationContext& kernelCreationContext)
+ : DmlOperator(kernelCreationContext)
+ {
+ DmlOperator::Initialize(kernelCreationContext);
+
+ const bool hasBias = kernelCreationContext.IsInputValid(OnnxInputIndex::Bias); // Bias and B_zero_point are treated as optional inputs
+ const bool hasBZP = kernelCreationContext.IsInputValid(OnnxInputIndex::B_zero_point);
+
+ // Broadcast Bias tensor to the shape of the output tensor (output 0) by replacing its input tensor desc.
+ if (hasBias)
+ {
+ m_inputTensorDescs[OnnxInputIndex::Bias] = CreateTensorDescFromInput(
+ kernelCreationContext,
+ OnnxInputIndex::Bias,
+ TensorAxis::DoNotCoerce,
+ TensorAxis::W,
+ TensorAxis::RightAligned,
+ kernelCreationContext.GetTensorShapeDescription().GetOutputTensorShape(0)
+ );
+ }
+ MLOperatorTensorDataType BDatatype = kernelCreationContext.GetInputEdgeDescription(OnnxInputIndex::B).tensorDataType; // the quantized A and its zero point reuse B's data type (see the descs below)
+
+ std::vector ATensorShape = kernelCreationContext.GetTensorShapeDescription().GetInputTensorShape(OnnxInputIndex::A);
+ std::vector ExpectedAScaleTensorShape = {1, 1, 1, 1}; // A's scale and zero point are described as single-element 4D tensors
+ std::vector ExpectedAZeroPointTensorShape = {1, 1, 1, 1};
+
+ // Tensor descs for the three intermediate edges from the DynamicQuantizeLinear node to the MatrixMultiplyIntegerToFloat node: quantized A, A scale, A zero point.
+ TensorDesc intermediateQuantizedATensorDesc = TensorDesc(
+ BDatatype,
+ gsl::make_span(ATensorShape),
+ gsl::make_span(ATensorShape),
+ TensorAxis::DoNotCoerce,
+ TensorAxis::W,
+ TensorAxis::RightAligned,
+ NchwDimensionCount, // minDimensionCount
+ 0 // guaranteedBaseOffsetAlignment
+ );
+
+ TensorDesc intermediateQuantizedAScaleTensorDesc = TensorDesc(
+ MLOperatorTensorDataType::Float,
+ gsl::make_span(ExpectedAScaleTensorShape),
+ gsl::make_span(ExpectedAScaleTensorShape),
+ TensorAxis::DoNotCoerce,
+ TensorAxis::W,
+ TensorAxis::RightAligned,
+ NchwDimensionCount, // minDimensionCount
+ 0 // guaranteedBaseOffsetAlignment
+ );
+
+ TensorDesc intermediateQuantizedAZeroPointTensorDesc = TensorDesc(
+ BDatatype,
+ gsl::make_span(ExpectedAZeroPointTensorShape),
+ gsl::make_span(ExpectedAZeroPointTensorShape),
+ TensorAxis::DoNotCoerce,
+ TensorAxis::W,
+ TensorAxis::RightAligned,
+ NchwDimensionCount, // minDimensionCount
+ 0 // guaranteedBaseOffsetAlignment
+ );
+
+ DML_TENSOR_DESC namedIntermediateQuantizedATensorDesc = intermediateQuantizedATensorDesc.GetDmlDesc();
+ DML_TENSOR_DESC namedIntermediateQuantizedAScaleTensorDesc = intermediateQuantizedAScaleTensorDesc.GetDmlDesc();
+ DML_TENSOR_DESC namedIntermediateQuantizedAZeroPointTensorDesc = intermediateQuantizedAZeroPointTensorDesc.GetDmlDesc();
+
+ std::vector inputDescs = GetDmlInputDescs();
+ std::vector outputDescs = GetDmlOutputDescs();
+
+ DML_DYNAMIC_QUANTIZE_LINEAR_OPERATOR_DESC dynamicQuantizeLinearOperatorDesc = {};
+ dynamicQuantizeLinearOperatorDesc.InputTensor = &inputDescs[OnnxInputIndex::A];
+ dynamicQuantizeLinearOperatorDesc.OutputTensor = &namedIntermediateQuantizedATensorDesc;
+ dynamicQuantizeLinearOperatorDesc.OutputScaleTensor = &namedIntermediateQuantizedAScaleTensorDesc;
+ dynamicQuantizeLinearOperatorDesc.OutputZeroPointTensor = &namedIntermediateQuantizedAZeroPointTensorDesc;
+
+ const DML_OPERATOR_DESC opDesc1{DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR, &dynamicQuantizeLinearOperatorDesc}; // graph node 0
+
+ DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC matrixMultiplyIntergerToFloatOperatorDesc = {};
+ matrixMultiplyIntergerToFloatOperatorDesc.ATensor = dynamicQuantizeLinearOperatorDesc.OutputTensor;
+ matrixMultiplyIntergerToFloatOperatorDesc.AScaleTensor = dynamicQuantizeLinearOperatorDesc.OutputScaleTensor;
+ matrixMultiplyIntergerToFloatOperatorDesc.AZeroPointTensor = dynamicQuantizeLinearOperatorDesc.OutputZeroPointTensor;
+ matrixMultiplyIntergerToFloatOperatorDesc.BTensor = &inputDescs[OnnxInputIndex::B];
+ matrixMultiplyIntergerToFloatOperatorDesc.BScaleTensor = &inputDescs[OnnxInputIndex::B_scale];
+ matrixMultiplyIntergerToFloatOperatorDesc.BZeroPointTensor = hasBZP? &inputDescs[OnnxInputIndex::B_zero_point] : nullptr; // absent optional inputs are passed as nullptr
+ matrixMultiplyIntergerToFloatOperatorDesc.BiasTensor = hasBias? &inputDescs[OnnxInputIndex::Bias] : nullptr;
+ matrixMultiplyIntergerToFloatOperatorDesc.OutputTensor = &outputDescs[0];
+
+ const DML_OPERATOR_DESC opDesc2{ DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, &matrixMultiplyIntergerToFloatOperatorDesc}; // graph node 1
+
+ MLOperatorGraphDesc operatorGraphDesc = {};
+ std::vector opDescs{&opDesc1, &opDesc2};
+ operatorGraphDesc.nodeCount = static_cast(opDescs.size());
+ operatorGraphDesc.nodes = opDescs.data();
+
+ // set input edges: each ONNX input maps to {node index, node input index}; A feeds node 0 input 0, the others feed node 1 inputs 3-6 (BTensor, BScaleTensor, BZeroPointTensor, BiasTensor).
+ std::pair nodeToNodeInputIndex[OnnxInputIndex::Count] {{0, 0}, {1, 3}, {1, 4}, {1, 5}, {1, 6}};
+ std::vector inputEdges;
+ for (uint32_t inputIndex = 0; inputIndex < OnnxInputIndex::Count; inputIndex++)
+ {
+ if (inputIndex == OnnxInputIndex::B_zero_point && !hasBZP) continue; // an absent optional input gets no graph edge
+ if (inputIndex == OnnxInputIndex::Bias && !hasBias) continue;
+ DML_INPUT_GRAPH_EDGE_DESC inputEdge = {};
+ inputEdge.GraphInputIndex = inputIndex; // the graph input index is the ONNX input index unchanged for this operator
+ inputEdge.ToNodeIndex = nodeToNodeInputIndex[inputIndex].first;
+ inputEdge.ToNodeInputIndex = nodeToNodeInputIndex[inputIndex].second;
+ inputEdges.push_back(inputEdge);
+ }
+ operatorGraphDesc.inputEdgeCount = gsl::narrow_cast(inputEdges.size());
+ operatorGraphDesc.inputEdges = inputEdges.data();
+
+ // set intermediate edges: node 0 outputs 0/1/2 feed node 1 inputs 0/1/2 (ATensor, AScaleTensor, AZeroPointTensor).
+ std::vector intermediateEdges;
+
+ DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge1 = {};
+ dynQLToMMItofloatEdge1.FromNodeIndex = 0;
+ dynQLToMMItofloatEdge1.FromNodeOutputIndex = 0;
+ dynQLToMMItofloatEdge1.ToNodeIndex = 1;
+ dynQLToMMItofloatEdge1.ToNodeInputIndex = 0;
+ intermediateEdges.push_back(dynQLToMMItofloatEdge1);
+
+ DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge2 = {};
+ dynQLToMMItofloatEdge2.FromNodeIndex = 0;
+ dynQLToMMItofloatEdge2.FromNodeOutputIndex = 1;
+ dynQLToMMItofloatEdge2.ToNodeIndex = 1;
+ dynQLToMMItofloatEdge2.ToNodeInputIndex = 1;
+ intermediateEdges.push_back(dynQLToMMItofloatEdge2);
+
+ DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge3 = {};
+ dynQLToMMItofloatEdge3.FromNodeIndex = 0;
+ dynQLToMMItofloatEdge3.FromNodeOutputIndex = 2;
+ dynQLToMMItofloatEdge3.ToNodeIndex = 1;
+ dynQLToMMItofloatEdge3.ToNodeInputIndex = 2;
+ intermediateEdges.push_back(dynQLToMMItofloatEdge3);
+
+ operatorGraphDesc.intermediateEdgeCount = gsl::narrow_cast(intermediateEdges.size());
+ operatorGraphDesc.intermediateEdges = intermediateEdges.data();
+
+ // set the output edges: node 1 output 0 is the only graph output.
+ std::vector outputEdges;
+ DML_OUTPUT_GRAPH_EDGE_DESC outputEdge = {};
+ outputEdge.FromNodeIndex = 1;
+ outputEdge.FromNodeOutputIndex = 0;
+ outputEdge.GraphOutputIndex = 0;
+ outputEdges.push_back(outputEdge);
+ operatorGraphDesc.outputEdgeCount = gsl::narrow_cast(outputEdges.size());
+ operatorGraphDesc.outputEdges = outputEdges.data();
+
+ SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext);
+ }
+};
+
+DML_OP_DEFINE_CREATION_FUNCTION(DynamicQuantizeMatMul, DmlOperatorDynamicQuantizeMatMul);
+} // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
index f08151b611..38cf80b381 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
@@ -435,6 +435,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(Dropout);
DML_OP_EXTERN_CREATION_FUNCTION(MatMul);
DML_OP_EXTERN_CREATION_FUNCTION(FusedMatMul);
DML_OP_EXTERN_CREATION_FUNCTION(FusedMatMulActivation);
+DML_OP_EXTERN_CREATION_FUNCTION(DynamicQuantizeMatMul);
DML_OP_EXTERN_CREATION_FUNCTION(Cast);
DML_OP_EXTERN_CREATION_FUNCTION(CastLike15);
DML_OP_EXTERN_CREATION_FUNCTION(CastLike19);
@@ -1065,6 +1066,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation
{REG_INFO_MS( 1, Gelu, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
{REG_INFO_MS( 1, BiasGelu, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
{REG_INFO_MS( 1, FusedMatMul, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
+ {REG_INFO_MS( 1, DynamicQuantizeMatMul, typeNameListTwo, supportedTypeListDynamicQuantizeLinear, DmlGraphSupport::Supported)},
{REG_INFO_MS( 1, FusedMatMulActivation, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
{REG_INFO_MS( 1, QLinearSigmoid, typeNameListDefault, supportedTypeListQLinearSigmoid, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryQLinearSigmoid)},
{REG_INFO_MS( 1, Attention, typeNameListAttention, supportedTypeListAttention, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryAttention)},
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
index 06bacc1b28..1f5daed6ea 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
@@ -1776,6 +1776,7 @@ using ShapeInferenceHelper_Identity19 = GetOutputShapeAsInputShapeHelper;
using ShapeInferenceHelper_MatMul = MatMulHelper;
using ShapeInferenceHelper_MatMulInteger = MatMulHelper;
using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulHelper;
+using ShapeInferenceHelper_DynamicQuantizeMatMul = MatMulHelper;
using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper;
using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper;
using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper;
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
index d081aa2e29..8de43f2705 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
@@ -462,6 +462,7 @@ namespace OperatorHelper
static const int sc_sinceVer_RotaryEmbedding = 1;
static const int sc_sinceVer_QLinearAveragePool = 1;
static const int sc_sinceVer_QLinearGlobalAveragePool = 1;
+ static const int sc_sinceVer_DynamicQuantizeMatMul = 1;
} // namespace MsftOperatorSet1
} // namespace OperatorHelper
diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
index c70f659f1b..88bee5fe1b 100644
--- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
+++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc
@@ -23,20 +23,85 @@ namespace onnxruntime {
namespace test {
template
-void TestDynamicQuantizeMatMul(const std::vector& A_dims,
- std::vector B_dims,
- const std::string& reference_model,
- bool is_matrix_b_constant,
+static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, const int64_t K,
+ const std::vector& A_data, const std::vector& B_data,
+ std::vector& B_scale, std::vector& B_zero_point,
+ const std::vector& Bias, std::vector& Y_data,
+ bool per_column, bool has_zp, bool has_bias) {
+ // Reference result for the test: dynamically quantize A (M x K) with one scale/zero point for the whole tensor, then multiply dequantized A by dequantized B (K x N) into Y_data (M x N, row-major).
+ const uint32_t num_elements = static_cast(M * K);
+ std::vector QuantA_data(num_elements);
+ std::vector A_scale;
+ std::vector A_zero_point;
+
+ // Get max and min of A_data; qmax/qmin are the bounds of the quantized range used for the scale
+ float min = std::numeric_limits::max();
+ float max = std::numeric_limits::lowest();
+ float qmax = static_cast(std::numeric_limits::max());
+ float qmin = static_cast(std::numeric_limits::lowest());
+
+ for (uint32_t i = 0; i < num_elements; ++i) {
+ max = std::max(A_data[i], max);
+ min = std::min(A_data[i], min);
+ }
+
+ // Adjust the maximum and minimum to include zero
+ max = std::max(max, 0.0f);
+ min = std::min(min, 0.0f);
+
+ float scale = static_cast(max - min) / (qmax - qmin);
+ T zeroPoint = std::round(std::clamp(qmin - min / scale, qmin, qmax)); // NOTE(review): scale is 0 when every A value is 0, making min / scale a 0/0 - confirm callers never pass all-zero A
+
+ A_scale.push_back(scale);
+ A_zero_point.push_back(zeroPoint);
+
+ // Quantize A with the scale and zero point computed above (the matrix multiplication itself is the m/n/k loop nest further down)
+ for (uint32_t i = 0; i < num_elements; ++i) {
+ QuantA_data[i] = static_cast(std::round((A_data[i] / scale) + zeroPoint));
+ }
+ if (!per_column) { // single-scale case: replicate element 0 to N entries so the loops below can index by column n; this resizes the caller's vectors in place
+ B_zero_point.resize(N, B_zero_point[0]); // NOTE(review): reads B_zero_point[0] even when has_zp is false - confirm the vector is never empty here
+ B_scale.resize(N, B_scale[0]);
+ }
+
+ for (int64_t m = 0; m < M; m++) { // Y[m][n] = sum over k of dequantized A[m][k] * dequantized B[k][n], plus Bias[n] when has_bias
+ for (int64_t n = 0; n < N; n++) {
+ float sum = 0.0f;
+ for (int64_t k = 0; k < K; k++) {
+ float A_dequantized = (static_cast(QuantA_data[m * K + k]) - static_cast(A_zero_point[0])) * A_scale[0];
+
+ float B_dequantized = has_zp ? (static_cast(B_data[k * N + n]) - static_cast(B_zero_point[n])) * B_scale[n] : B_data[k * N + n] * B_scale[n];
+
+ sum += A_dequantized * B_dequantized;
+ }
+ if (has_bias) {
+ sum += Bias[n];
+ }
+ Y_data[m * N + n] = sum;
+ }
+ }
+}
+
+template
+void TestDynamicQuantizeMatMul(bool is_matrix_b_constant,
bool per_column = false,
bool has_zp = true,
- bool has_bias = false) {
+ bool has_bias = false,
+ bool empty_input = false) {
// create rand inputs
RandomValueGenerator random{};
+ int64_t M = empty_input ? 1 : 4;
+ int64_t N = 128;
+ int64_t K = 128;
+ std::vector A_dims{empty_input ? 0 : M, K};
+ std::vector B_dims{K, N};
+ std::vector Y_dims{empty_input ? 0 : M, K};
std::vector A_data = random.Uniform(A_dims, -1.0f, 1.0f);
-
std::vector B_data;
- std::vector tmp_B_data = random.Uniform(B_dims, std::numeric_limits::min(), std::numeric_limits::max());
+ std::vector tmp_B_data = random.Uniform(B_dims,
+ (std::is_same_v) ? std::numeric_limits::lowest() / 2 : std::numeric_limits::lowest(),
+ std::numeric_limits::max() / 2);
std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T {
return static_cast(v);
});
@@ -47,7 +112,9 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims,
std::for_each(B_zero_point.begin(),
B_zero_point.end(),
[&random](T& zp) {
- zp = static_cast(random.Uniform(std::array{1}, std::numeric_limits::min(), std::numeric_limits::max())[0]);
+ zp = static_cast(random.Uniform(std::array{1},
+ std::numeric_limits::min(),
+ std::numeric_limits::max())[0]);
});
std::vector Bias = random.Uniform(AsSpan({B_dims.back()}), -0.1f, 0.1f);
@@ -69,77 +136,85 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims,
test.AddOptionalInputEdge();
}
- test.AddReferenceOutputs(reference_model);
+ std::vector Y_data(M * N);
+ CalculateDynamicQuantizeMatMul(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data,
+ per_column, has_zp, has_bias);
+ test.AddOutput("Y", Y_dims, Y_data);
+ test.SetOutputRelErr("Y", 0.02f);
test.Run();
}
-template
-void RunDynamicQuantizeMatMulTest(const string& model_path) {
- std::vector A_dims{4, 128};
- std::vector B_dims{128, 128};
- std::vector Y_dims{4, 128};
-
- TestDynamicQuantizeMatMul(A_dims,
- B_dims,
- model_path,
- false, /*is_matrix_b_constant*/
- false, /*per_column*/
- HasZeroPoint, /*has_zp*/
- HasBias /*has_bias*/
+template
+void RunDynamicQuantizeMatMulTest() {
+ TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/
+ false, /*per_column*/
+ HasZeroPoint, /*has_zp*/
+ HasBias /*has_bias*/
);
- TestDynamicQuantizeMatMul(A_dims,
- B_dims,
- model_path,
- true, /*is_matrix_b_constant*/
- false, /*per_column*/
- HasZeroPoint, /*has_zp*/
- HasBias /*has_bias*/
+ TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/
+ false, /*per_column*/
+ HasZeroPoint, /*has_zp*/
+ HasBias /*has_bias*/
);
- TestDynamicQuantizeMatMul(A_dims,
- B_dims,
- model_path,
- false, /*is_matrix_b_constant*/
- true, /*per_column*/
- HasZeroPoint, /*has_zp*/
- HasBias /*has_bias*/
+ TestDynamicQuantizeMatMul(false, /*is_matrix_b_constant*/
+ true, /*per_column*/
+ HasZeroPoint, /*has_zp*/
+ HasBias /*has_bias*/
);
- TestDynamicQuantizeMatMul(A_dims,
- B_dims,
- model_path,
- true, /*is_matrix_b_constant*/
- true, /*per_column*/
- HasZeroPoint, /*has_zp*/
- HasBias /*has_bias*/
+ TestDynamicQuantizeMatMul(true, /*is_matrix_b_constant*/
+ true, /*per_column*/
+ HasZeroPoint, /*has_zp*/
+ HasBias /*has_bias*/
);
}
-TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test) {
- RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8.onnx");
- RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8.onnx");
+TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_S8) {
+ RunDynamicQuantizeMatMulTest();
}
-TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test) {
- RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_int8_bias.onnx");
- RunDynamicQuantizeMatMulTest("testdata/dynamic_quantize_matmul_uint8_bias.onnx");
+TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_U8) {
+ RunDynamicQuantizeMatMulTest();
+}
+
+TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_S8) {
+ RunDynamicQuantizeMatMulTest();
+}
+
+TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) {
+ RunDynamicQuantizeMatMulTest();
+}
+
+TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) {
+ RunDynamicQuantizeMatMulTest();
+}
+
+TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_U8) {
+ RunDynamicQuantizeMatMulTest();
+}
+
+TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_S8) {
+ RunDynamicQuantizeMatMulTest();
+}
+
+TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) {
+ RunDynamicQuantizeMatMulTest();
}
TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) {
- std::vector A_dims{0, 128};
- std::vector B_dims{128, 128};
- std::vector Y_dims{0, 128};
-
- TestDynamicQuantizeMatMul(A_dims,
- B_dims,
- "testdata/dynamic_quantize_matmul_uint8.onnx",
- false /*is_matrix_b_constant*/);
-
- TestDynamicQuantizeMatMul(A_dims,
- B_dims,
- "testdata/dynamic_quantize_matmul_uint8.onnx",
- true /*is_matrix_b_constant*/);
+ std::vector A_dims{0, 2};
+ std::vector B_dims{2, 2};
+ std::vector Y_dims{0, 2};
+ OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
+ test.AddInput("T1", A_dims, {});
+ test.AddInput("T2", B_dims, {1, 6, 0, 8});
+ test.AddInput("b_scale", {1}, {1.0f});
+ test.AddInput("b_zero_point", {1}, {0});
+ test.AddOptionalInputEdge();
+ test.AddOutput("Y", {0, 2}, {});
+ test.Run();
}
TEST(DynamicQuantizeMatMul, B_PerColumn_ND) {