mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
[DML] DynamicQuantizeMatMul (#19763)
### Description DML Implementation for [com.microsoft.DynamicQuantizeMatMul ](https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#com.microsoft.DynamicQuantizeMatMul) ``` .\onnxruntime_test_all.exe --gtest_filter="*DynamicQuantizeMatMul.*" Note: Google Test filter = *DynamicQuantizeMatMul.* [==========] Running 10 tests from 1 test suite. [----------] Global test environment set-up. [----------] 10 tests from DynamicQuantizeMatMul [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_S8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_S8 (635 ms) [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_U8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_NoBias_test_U8 (514 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_S8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_S8 (512 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_U8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_HasBias_test_U8 (505 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_S8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_S8 (526 ms) [ RUN ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_U8 [ OK ] DynamicQuantizeMatMul.NoZeroPoint_NoBias_test_U8 (504 ms) [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_S8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_S8 (512 ms) [ RUN ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_U8 [ OK ] DynamicQuantizeMatMul.HasZeroPoint_HasBias_test_U8 (512 ms) [ RUN ] DynamicQuantizeMatMul.UInt8_test_with_empty_input [ OK ] DynamicQuantizeMatMul.UInt8_test_with_empty_input (112 ms) [ RUN ] DynamicQuantizeMatMul.B_PerColumn_ND [ OK ] DynamicQuantizeMatMul.B_PerColumn_ND (348 ms) [----------] 10 tests from DynamicQuantizeMatMul (4685 ms total) [----------] Global test environment tear-down [==========] 10 tests from 1 test suite ran. (4686 ms total) [ PASSED ] 10 tests. 
memleakdbg: ----- No memory leaks detected ----- ``` ### Motivation and Context - CalculateDynamicQuantizeMatMul to replace CPU EP run reference - Added more FP32 testcases to isolate all input datatype combinations --------- Co-authored-by: Xiang Zhang <xianz@microsoft.com>
This commit is contained in:
parent
7deee944c0
commit
fa73d7cbf9
7 changed files with 334 additions and 81 deletions
|
|
@ -1268,6 +1268,7 @@ Do not modify directly.*
|
|||
|BiasSplitGelu|*in* X:**T**<br> *in* bias:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|
||||
|ConvTransposeWithDynamicPads|*in* X:**T**<br> *in* W:**T**<br> *in* Pads:**tensor(int64)**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|
||||
|DequantizeLinear|*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|1+|**T1** = tensor(int32), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|
||||
|DynamicQuantizeMatMul|*in* A:**T1**<br> *in* B:**T2**<br> *in* b_scale:**T1**<br> *in* b_zero_point:**T2**<br> *in* bias:**T1**<br> *out* Y:**T1**|1+|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|
||||
|EmbedLayerNormalization|*in* input_ids:**T1**<br> *in* segment_ids:**T1**<br> *in* word_embedding:**T**<br> *in* position_embedding:**T**<br> *in* segment_embedding:**T**<br> *in* gamma:**T**<br> *in* beta:**T**<br> *in* mask:**T1**<br> *in* position_ids:**T1**<br> *out* output:**T**<br> *out* mask_index:**T1**<br> *out* embedding_sum:**T**|1+|**T** = tensor(float), tensor(float16)|
|
||||
|FusedMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|
||||
|FusedMatMulActivation|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
|
||||
|
|
|
|||
|
|
@ -1865,6 +1865,25 @@ constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA {
|
|||
DML_MATRIX_MULTIPLY_INTEGER_OPERATOR_SCHEMA_FIELDS,
|
||||
};
|
||||
|
||||
constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] {
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false },
|
||||
};
|
||||
|
||||
constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA {
|
||||
"DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT",
|
||||
static_cast<DML_OPERATOR_TYPE>(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT),
|
||||
DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE,
|
||||
8,
|
||||
DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS,
|
||||
};
|
||||
|
||||
constexpr DML_SCHEMA_FIELD DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS[9] {
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false },
|
||||
|
|
@ -1885,25 +1904,6 @@ constexpr DML_OPERATOR_SCHEMA DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHE
|
|||
DML_QUANTIZED_LINEAR_MATRIX_MULTIPLY_OPERATOR_SCHEMA_FIELDS,
|
||||
};
|
||||
|
||||
constexpr DML_SCHEMA_FIELD DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS[8] {
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "ATensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AScaleTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "AZeroPointTensor", true },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BScaleTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BZeroPointTensor", true },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "BiasTensor", true },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_OUTPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "OutputTensor", false },
|
||||
};
|
||||
|
||||
constexpr DML_OPERATOR_SCHEMA DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA {
|
||||
"DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT",
|
||||
static_cast<DML_OPERATOR_TYPE>(DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT),
|
||||
DML_SCHEMA_OPERATOR_SUPPORT_FLAG_NONE,
|
||||
8,
|
||||
DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_SCHEMA_FIELDS,
|
||||
};
|
||||
|
||||
constexpr DML_SCHEMA_FIELD DML_CONVOLUTION_INTEGER_OPERATOR_SCHEMA_FIELDS[11] {
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputTensor", false },
|
||||
DML_SCHEMA_FIELD { DML_SCHEMA_FIELD_KIND_INPUT_TENSOR, DML_SCHEMA_FIELD_TYPE_TENSOR_DESC, "InputZeroPointTensor", true },
|
||||
|
|
|
|||
|
|
@ -0,0 +1,173 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "precomp.h"
|
||||
|
||||
namespace Dml
|
||||
{
|
||||
// DynamicQuantizeMatMul = MatrixMultiplyIntegerToFloat(DynamicQuantizeLinear(A), B)
|
||||
class DmlOperatorDynamicQuantizeMatMul : public DmlOperator
|
||||
{
|
||||
// This order matches the ONNX schema.
|
||||
enum OnnxInputIndex
|
||||
{
|
||||
A, // Input
|
||||
B,
|
||||
B_scale,
|
||||
B_zero_point,
|
||||
Bias,
|
||||
Count,
|
||||
};
|
||||
|
||||
public:
|
||||
DmlOperatorDynamicQuantizeMatMul(const MLOperatorKernelCreationContext& kernelCreationContext)
|
||||
: DmlOperator(kernelCreationContext)
|
||||
{
|
||||
DmlOperator::Initialize(kernelCreationContext);
|
||||
|
||||
const bool hasBias = kernelCreationContext.IsInputValid(OnnxInputIndex::Bias);
|
||||
const bool hasBZP = kernelCreationContext.IsInputValid(OnnxInputIndex::B_zero_point);
|
||||
|
||||
// Broadcast Bias tensor to the shape of the output tensor.
|
||||
if (hasBias)
|
||||
{
|
||||
m_inputTensorDescs[OnnxInputIndex::Bias] = CreateTensorDescFromInput(
|
||||
kernelCreationContext,
|
||||
OnnxInputIndex::Bias,
|
||||
TensorAxis::DoNotCoerce,
|
||||
TensorAxis::W,
|
||||
TensorAxis::RightAligned,
|
||||
kernelCreationContext.GetTensorShapeDescription().GetOutputTensorShape(0)
|
||||
);
|
||||
}
|
||||
MLOperatorTensorDataType BDatatype = kernelCreationContext.GetInputEdgeDescription(OnnxInputIndex::B).tensorDataType;
|
||||
|
||||
std::vector<uint32_t> ATensorShape = kernelCreationContext.GetTensorShapeDescription().GetInputTensorShape(OnnxInputIndex::A);
|
||||
std::vector<uint32_t> ExpectedAScaleTensorShape = {1, 1, 1, 1};
|
||||
std::vector<uint32_t> ExpectedAZeroPointTensorShape = {1, 1, 1, 1};
|
||||
|
||||
// output edges between DynQL and MMItoFloat node
|
||||
TensorDesc intermediateQuantizedATensorDesc = TensorDesc(
|
||||
BDatatype,
|
||||
gsl::make_span(ATensorShape),
|
||||
gsl::make_span(ATensorShape),
|
||||
TensorAxis::DoNotCoerce,
|
||||
TensorAxis::W,
|
||||
TensorAxis::RightAligned,
|
||||
NchwDimensionCount, // minDimensionCount
|
||||
0 // guaranteedBaseOffsetAlignment
|
||||
);
|
||||
|
||||
TensorDesc intermediateQuantizedAScaleTensorDesc = TensorDesc(
|
||||
MLOperatorTensorDataType::Float,
|
||||
gsl::make_span(ExpectedAScaleTensorShape),
|
||||
gsl::make_span(ExpectedAScaleTensorShape),
|
||||
TensorAxis::DoNotCoerce,
|
||||
TensorAxis::W,
|
||||
TensorAxis::RightAligned,
|
||||
NchwDimensionCount, // minDimensionCount
|
||||
0 // guaranteedBaseOffsetAlignment
|
||||
);
|
||||
|
||||
TensorDesc intermediateQuantizedAZeroPointTensorDesc = TensorDesc(
|
||||
BDatatype,
|
||||
gsl::make_span(ExpectedAZeroPointTensorShape),
|
||||
gsl::make_span(ExpectedAZeroPointTensorShape),
|
||||
TensorAxis::DoNotCoerce,
|
||||
TensorAxis::W,
|
||||
TensorAxis::RightAligned,
|
||||
NchwDimensionCount, // minDimensionCount
|
||||
0 // guaranteedBaseOffsetAlignment
|
||||
);
|
||||
|
||||
DML_TENSOR_DESC namedIntermediateQuantizedATensorDesc = intermediateQuantizedATensorDesc.GetDmlDesc();
|
||||
DML_TENSOR_DESC namedIntermediateQuantizedAScaleTensorDesc = intermediateQuantizedAScaleTensorDesc.GetDmlDesc();
|
||||
DML_TENSOR_DESC namedIntermediateQuantizedAZeroPointTensorDesc = intermediateQuantizedAZeroPointTensorDesc.GetDmlDesc();
|
||||
|
||||
std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
|
||||
std::vector<DML_TENSOR_DESC> outputDescs = GetDmlOutputDescs();
|
||||
|
||||
DML_DYNAMIC_QUANTIZE_LINEAR_OPERATOR_DESC dynamicQuantizeLinearOperatorDesc = {};
|
||||
dynamicQuantizeLinearOperatorDesc.InputTensor = &inputDescs[OnnxInputIndex::A];
|
||||
dynamicQuantizeLinearOperatorDesc.OutputTensor = &namedIntermediateQuantizedATensorDesc;
|
||||
dynamicQuantizeLinearOperatorDesc.OutputScaleTensor = &namedIntermediateQuantizedAScaleTensorDesc;
|
||||
dynamicQuantizeLinearOperatorDesc.OutputZeroPointTensor = &namedIntermediateQuantizedAZeroPointTensorDesc;
|
||||
|
||||
const DML_OPERATOR_DESC opDesc1{DML_OPERATOR_DYNAMIC_QUANTIZE_LINEAR, &dynamicQuantizeLinearOperatorDesc};
|
||||
|
||||
DML_MATRIX_MULTIPLY_INTEGER_TO_FLOAT_OPERATOR_DESC matrixMultiplyIntergerToFloatOperatorDesc = {};
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.ATensor = dynamicQuantizeLinearOperatorDesc.OutputTensor;
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.AScaleTensor = dynamicQuantizeLinearOperatorDesc.OutputScaleTensor;
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.AZeroPointTensor = dynamicQuantizeLinearOperatorDesc.OutputZeroPointTensor;
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.BTensor = &inputDescs[OnnxInputIndex::B];
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.BScaleTensor = &inputDescs[OnnxInputIndex::B_scale];
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.BZeroPointTensor = hasBZP? &inputDescs[OnnxInputIndex::B_zero_point] : nullptr;
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.BiasTensor = hasBias? &inputDescs[OnnxInputIndex::Bias] : nullptr;
|
||||
matrixMultiplyIntergerToFloatOperatorDesc.OutputTensor = &outputDescs[0];
|
||||
|
||||
const DML_OPERATOR_DESC opDesc2{ DML_OPERATOR_MATRIX_MULTIPLY_INTEGER_TO_FLOAT, &matrixMultiplyIntergerToFloatOperatorDesc};
|
||||
|
||||
MLOperatorGraphDesc operatorGraphDesc = {};
|
||||
std::vector<const DML_OPERATOR_DESC*> opDescs{&opDesc1, &opDesc2};
|
||||
operatorGraphDesc.nodeCount = static_cast<uint32_t>(opDescs.size());
|
||||
operatorGraphDesc.nodes = opDescs.data();
|
||||
|
||||
// set input edges
|
||||
std::pair<uint32_t, uint32_t> nodeToNodeInputIndex[OnnxInputIndex::Count] {{0, 0}, {1, 3}, {1, 4}, {1, 5}, {1, 6}};
|
||||
std::vector<DML_INPUT_GRAPH_EDGE_DESC> inputEdges;
|
||||
for (uint32_t inputIndex = 0; inputIndex < OnnxInputIndex::Count; inputIndex++)
|
||||
{
|
||||
if (inputIndex == OnnxInputIndex::B_zero_point && !hasBZP) continue;
|
||||
if (inputIndex == OnnxInputIndex::Bias && !hasBias) continue;
|
||||
DML_INPUT_GRAPH_EDGE_DESC inputEdge = {};
|
||||
inputEdge.GraphInputIndex = inputIndex; // OnnxInputIndex and DmlInputIndex are identity for QLinearSigmoid
|
||||
inputEdge.ToNodeIndex = nodeToNodeInputIndex[inputIndex].first;
|
||||
inputEdge.ToNodeInputIndex = nodeToNodeInputIndex[inputIndex].second;
|
||||
inputEdges.push_back(inputEdge);
|
||||
}
|
||||
operatorGraphDesc.inputEdgeCount = gsl::narrow_cast<uint32_t>(inputEdges.size());
|
||||
operatorGraphDesc.inputEdges = inputEdges.data();
|
||||
|
||||
// set intermediate edges
|
||||
std::vector<DML_INTERMEDIATE_GRAPH_EDGE_DESC> intermediateEdges;
|
||||
|
||||
DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge1 = {};
|
||||
dynQLToMMItofloatEdge1.FromNodeIndex = 0;
|
||||
dynQLToMMItofloatEdge1.FromNodeOutputIndex = 0;
|
||||
dynQLToMMItofloatEdge1.ToNodeIndex = 1;
|
||||
dynQLToMMItofloatEdge1.ToNodeInputIndex = 0;
|
||||
intermediateEdges.push_back(dynQLToMMItofloatEdge1);
|
||||
|
||||
DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge2 = {};
|
||||
dynQLToMMItofloatEdge2.FromNodeIndex = 0;
|
||||
dynQLToMMItofloatEdge2.FromNodeOutputIndex = 1;
|
||||
dynQLToMMItofloatEdge2.ToNodeIndex = 1;
|
||||
dynQLToMMItofloatEdge2.ToNodeInputIndex = 1;
|
||||
intermediateEdges.push_back(dynQLToMMItofloatEdge2);
|
||||
|
||||
DML_INTERMEDIATE_GRAPH_EDGE_DESC dynQLToMMItofloatEdge3 = {};
|
||||
dynQLToMMItofloatEdge3.FromNodeIndex = 0;
|
||||
dynQLToMMItofloatEdge3.FromNodeOutputIndex = 2;
|
||||
dynQLToMMItofloatEdge3.ToNodeIndex = 1;
|
||||
dynQLToMMItofloatEdge3.ToNodeInputIndex = 2;
|
||||
intermediateEdges.push_back(dynQLToMMItofloatEdge3);
|
||||
|
||||
operatorGraphDesc.intermediateEdgeCount = gsl::narrow_cast<uint32_t>(intermediateEdges.size());
|
||||
operatorGraphDesc.intermediateEdges = intermediateEdges.data();
|
||||
|
||||
// set the output edges
|
||||
std::vector<DML_OUTPUT_GRAPH_EDGE_DESC> outputEdges;
|
||||
DML_OUTPUT_GRAPH_EDGE_DESC outputEdge = {};
|
||||
outputEdge.FromNodeIndex = 1;
|
||||
outputEdge.FromNodeOutputIndex = 0;
|
||||
outputEdge.GraphOutputIndex = 0;
|
||||
outputEdges.push_back(outputEdge);
|
||||
operatorGraphDesc.outputEdgeCount = gsl::narrow_cast<uint32_t>(outputEdges.size());
|
||||
operatorGraphDesc.outputEdges = outputEdges.data();
|
||||
|
||||
SetDmlOperatorGraphDesc(std::move(operatorGraphDesc), kernelCreationContext);
|
||||
}
|
||||
};
|
||||
|
||||
// Emits the creation function that ties the operator name DynamicQuantizeMatMul to
// DmlOperatorDynamicQuantizeMatMul. NOTE(review): presumably this is the definition matching
// DML_OP_EXTERN_CREATION_FUNCTION(DynamicQuantizeMatMul) in the operator registration file — confirm
// against the macro definition.
DML_OP_DEFINE_CREATION_FUNCTION(DynamicQuantizeMatMul, DmlOperatorDynamicQuantizeMatMul);
|
||||
} // namespace Dml
|
||||
|
|
@ -435,6 +435,7 @@ DML_OP_EXTERN_CREATION_FUNCTION(Dropout);
|
|||
DML_OP_EXTERN_CREATION_FUNCTION(MatMul);
|
||||
DML_OP_EXTERN_CREATION_FUNCTION(FusedMatMul);
|
||||
DML_OP_EXTERN_CREATION_FUNCTION(FusedMatMulActivation);
|
||||
DML_OP_EXTERN_CREATION_FUNCTION(DynamicQuantizeMatMul);
|
||||
DML_OP_EXTERN_CREATION_FUNCTION(Cast);
|
||||
DML_OP_EXTERN_CREATION_FUNCTION(CastLike15);
|
||||
DML_OP_EXTERN_CREATION_FUNCTION(CastLike19);
|
||||
|
|
@ -1065,6 +1066,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation
|
|||
{REG_INFO_MS( 1, Gelu, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
|
||||
{REG_INFO_MS( 1, BiasGelu, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
|
||||
{REG_INFO_MS( 1, FusedMatMul, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
|
||||
{REG_INFO_MS( 1, DynamicQuantizeMatMul, typeNameListTwo, supportedTypeListDynamicQuantizeLinear, DmlGraphSupport::Supported)},
|
||||
{REG_INFO_MS( 1, FusedMatMulActivation, typeNameListDefault, supportedTypeListFloat16to32, DmlGraphSupport::Supported)},
|
||||
{REG_INFO_MS( 1, QLinearSigmoid, typeNameListDefault, supportedTypeListQLinearSigmoid, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryQLinearSigmoid)},
|
||||
{REG_INFO_MS( 1, Attention, typeNameListAttention, supportedTypeListAttention, DmlGraphSupport::Supported, requiredConstantCpuInputs(), std::nullopt, QueryAttention)},
|
||||
|
|
|
|||
|
|
@ -1776,6 +1776,7 @@ using ShapeInferenceHelper_Identity19 = GetOutputShapeAsInputShapeHelper;
|
|||
using ShapeInferenceHelper_MatMul = MatMulHelper;
|
||||
using ShapeInferenceHelper_MatMulInteger = MatMulHelper;
|
||||
using ShapeInferenceHelper_MatMulIntegerToFloat = MatMulHelper;
|
||||
using ShapeInferenceHelper_DynamicQuantizeMatMul = MatMulHelper;
|
||||
using ShapeInferenceHelper_QLinearMatMul = QLinearMatMulHelper;
|
||||
using ShapeInferenceHelper_QLinearAdd = GetBroadcastedOutputShapeHelper;
|
||||
using ShapeInferenceHelper_DynamicQuantizeLinear = GetOutputShapeAsInputShapeHelper;
|
||||
|
|
|
|||
|
|
@ -462,6 +462,7 @@ namespace OperatorHelper
|
|||
static const int sc_sinceVer_RotaryEmbedding = 1;
|
||||
static const int sc_sinceVer_QLinearAveragePool = 1;
|
||||
static const int sc_sinceVer_QLinearGlobalAveragePool = 1;
|
||||
static const int sc_sinceVer_DynamicQuantizeMatMul = 1;
|
||||
} // namespace MsftOperatorSet1
|
||||
|
||||
} // namespace OperatorHelper
|
||||
|
|
|
|||
|
|
@ -23,20 +23,85 @@ namespace onnxruntime {
|
|||
namespace test {
|
||||
|
||||
// Scalar reference implementation of DynamicQuantizeMatMul used to produce expected test output.
//
// A (M x K, float) is dynamically quantized to T with a single scale and zero point, then
// dequantized and multiplied with the dequantized B (K x N, stored row-major as T).
//
// Parameters:
//   M, N, K       matrix dimensions; A is M x K, B is K x N, Y is M x N.
//   A_data        at least M*K floats.
//   B_data        at least K*N quantized values.
//   B_scale       one scale (per-tensor) or N scales (per-column); must not be empty.
//                 When per_column is false it is resized in place to N entries.
//   B_zero_point  same layout as B_scale. May be empty when has_zp is false; otherwise it is
//                 resized in place to N entries when per_column is false.
//   Bias          at least N floats when has_bias is true; ignored otherwise.
//   Y_data        output, must already hold at least M*N elements.
//   per_column    true when B_scale / B_zero_point already hold one entry per column of B.
//   has_zp        false to treat B as having a zero point of 0.
//   has_bias      true to add Bias[n] to every element of column n.
template <typename T>
static void CalculateDynamicQuantizeMatMul(const int64_t M, const int64_t N, const int64_t K,
                                           const std::vector<float>& A_data, const std::vector<T>& B_data,
                                           std::vector<float>& B_scale, std::vector<T>& B_zero_point,
                                           const std::vector<float>& Bias, std::vector<float>& Y_data,
                                           bool per_column, bool has_zp, bool has_bias) {
  const size_t num_elements = static_cast<size_t>(M * K);
  const float qmin = static_cast<float>(std::numeric_limits<T>::lowest());
  const float qmax = static_cast<float>(std::numeric_limits<T>::max());

  // Get max and min of A.
  float min = std::numeric_limits<float>::max();
  float max = std::numeric_limits<float>::lowest();
  for (size_t i = 0; i < num_elements; ++i) {
    max = std::max(A_data[i], max);
    min = std::min(A_data[i], min);
  }

  // Adjust the maximum and minimum to include zero.
  max = std::max(max, 0.0f);
  min = std::min(min, 0.0f);

  // An all-zero A gives a zero range; fall back to a unit scale so the divisions below
  // never produce NaN.
  float scale = (max - min) / (qmax - qmin);
  if (scale == 0.0f) {
    scale = 1.0f;
  }
  const T zero_point = static_cast<T>(std::round(std::clamp(qmin - min / scale, qmin, qmax)));

  // Quantize A with saturation, and dequantize it once up front instead of once per output
  // column. Without the clamp a value that rounds to qmax + 1 (e.g. 255.5 -> 256 for uint8)
  // would be an out-of-range float-to-integer conversion.
  std::vector<float> A_dequantized(num_elements);
  for (size_t i = 0; i < num_elements; ++i) {
    const float rounded = std::round((A_data[i] / scale) + zero_point);
    const T quantized = static_cast<T>(std::clamp(rounded, qmin, qmax));
    A_dequantized[i] = (static_cast<int>(quantized) - static_cast<int>(zero_point)) * scale;
  }

  // Per-tensor parameters are broadcast so the loops below can always index by column.
  if (!per_column) {
    const size_t columns = static_cast<size_t>(N);
    if (!B_zero_point.empty()) {
      const T first_zero_point = B_zero_point[0];
      B_zero_point.resize(columns, first_zero_point);
    }
    const float first_scale = B_scale[0];
    B_scale.resize(columns, first_scale);
  }

  // Matrix multiplication on the dequantized values.
  for (int64_t m = 0; m < M; m++) {
    for (int64_t n = 0; n < N; n++) {
      float sum = 0.0f;
      for (int64_t k = 0; k < K; k++) {
        const float B_dequantized =
            has_zp ? (static_cast<int>(B_data[k * N + n]) - static_cast<int>(B_zero_point[n])) * B_scale[n]
                   : B_data[k * N + n] * B_scale[n];

        sum += A_dequantized[m * K + k] * B_dequantized;
      }
      if (has_bias) {
        sum += Bias[n];
      }
      Y_data[m * N + n] = sum;
    }
  }
}
|
||||
|
||||
template <typename T>
|
||||
void TestDynamicQuantizeMatMul(bool is_matrix_b_constant,
|
||||
bool per_column = false,
|
||||
bool has_zp = true,
|
||||
bool has_bias = false) {
|
||||
bool has_bias = false,
|
||||
bool empty_input = false) {
|
||||
// create rand inputs
|
||||
RandomValueGenerator random{};
|
||||
|
||||
int64_t M = empty_input ? 1 : 4;
|
||||
int64_t N = 128;
|
||||
int64_t K = 128;
|
||||
std::vector<int64_t> A_dims{empty_input ? 0 : M, K};
|
||||
std::vector<int64_t> B_dims{K, N};
|
||||
std::vector<int64_t> Y_dims{empty_input ? 0 : M, K};
|
||||
std::vector<float> A_data = random.Uniform<float>(A_dims, -1.0f, 1.0f);
|
||||
|
||||
std::vector<T> B_data;
|
||||
std::vector<int> tmp_B_data = random.Uniform<int32_t>(B_dims, std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
|
||||
std::vector<T> tmp_B_data = random.Uniform<T>(B_dims,
|
||||
(std::is_same_v<T, int8_t>) ? std::numeric_limits<int8_t>::lowest() / 2 : std::numeric_limits<uint8_t>::lowest(),
|
||||
std::numeric_limits<T>::max() / 2);
|
||||
std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T {
|
||||
return static_cast<T>(v);
|
||||
});
|
||||
|
|
@ -47,7 +112,9 @@ void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
|
|||
std::for_each(B_zero_point.begin(),
|
||||
B_zero_point.end(),
|
||||
[&random](T& zp) {
|
||||
zp = static_cast<T>(random.Uniform<int32_t>(std::array<int64_t, 1>{1}, std::numeric_limits<T>::min(), std::numeric_limits<T>::max())[0]);
|
||||
zp = static_cast<T>(random.Uniform<T>(std::array<int64_t, 1>{1},
|
||||
std::numeric_limits<T>::min(),
|
||||
std::numeric_limits<T>::max())[0]);
|
||||
});
|
||||
|
||||
std::vector<float> Bias = random.Uniform<float>(AsSpan({B_dims.back()}), -0.1f, 0.1f);
|
||||
|
|
@ -69,77 +136,85 @@ void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
|
|||
test.AddOptionalInputEdge<float>();
|
||||
}
|
||||
|
||||
test.AddReferenceOutputs(reference_model);
|
||||
std::vector<float> Y_data(M * N);
|
||||
CalculateDynamicQuantizeMatMul<T>(M, N, K, A_data, B_data, B_scale, B_zero_point, Bias, Y_data,
|
||||
per_column, has_zp, has_bias);
|
||||
test.AddOutput<float>("Y", Y_dims, Y_data);
|
||||
test.SetOutputRelErr("Y", 0.02f);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
template <typename Scalar, bool HasZeroPoint, bool HasBias>
|
||||
void RunDynamicQuantizeMatMulTest(const string& model_path) {
|
||||
std::vector<int64_t> A_dims{4, 128};
|
||||
std::vector<int64_t> B_dims{128, 128};
|
||||
std::vector<int64_t> Y_dims{4, 128};
|
||||
|
||||
TestDynamicQuantizeMatMul<Scalar>(A_dims,
|
||||
B_dims,
|
||||
model_path,
|
||||
false, /*is_matrix_b_constant*/
|
||||
false, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
template <typename T, bool HasZeroPoint, bool HasBias>
|
||||
void RunDynamicQuantizeMatMulTest() {
|
||||
TestDynamicQuantizeMatMul<T>(false, /*is_matrix_b_constant*/
|
||||
false, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
);
|
||||
|
||||
TestDynamicQuantizeMatMul<Scalar>(A_dims,
|
||||
B_dims,
|
||||
model_path,
|
||||
true, /*is_matrix_b_constant*/
|
||||
false, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
TestDynamicQuantizeMatMul<T>(true, /*is_matrix_b_constant*/
|
||||
false, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
);
|
||||
|
||||
TestDynamicQuantizeMatMul<Scalar>(A_dims,
|
||||
B_dims,
|
||||
model_path,
|
||||
false, /*is_matrix_b_constant*/
|
||||
true, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
TestDynamicQuantizeMatMul<T>(false, /*is_matrix_b_constant*/
|
||||
true, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
);
|
||||
|
||||
TestDynamicQuantizeMatMul<Scalar>(A_dims,
|
||||
B_dims,
|
||||
model_path,
|
||||
true, /*is_matrix_b_constant*/
|
||||
true, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
TestDynamicQuantizeMatMul<T>(true, /*is_matrix_b_constant*/
|
||||
true, /*per_column*/
|
||||
HasZeroPoint, /*has_zp*/
|
||||
HasBias /*has_bias*/
|
||||
);
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test) {
|
||||
RunDynamicQuantizeMatMulTest<int8_t, true, false>("testdata/dynamic_quantize_matmul_int8.onnx");
|
||||
RunDynamicQuantizeMatMulTest<uint8_t, true, false>("testdata/dynamic_quantize_matmul_uint8.onnx");
|
||||
TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_S8) {
|
||||
RunDynamicQuantizeMatMulTest<int8_t, true, false>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test) {
|
||||
RunDynamicQuantizeMatMulTest<int8_t, false, true>("testdata/dynamic_quantize_matmul_int8_bias.onnx");
|
||||
RunDynamicQuantizeMatMulTest<uint8_t, false, true>("testdata/dynamic_quantize_matmul_uint8_bias.onnx");
|
||||
TEST(DynamicQuantizeMatMul, HasZeroPoint_NoBias_test_U8) {
|
||||
RunDynamicQuantizeMatMulTest<uint8_t, true, false>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_S8) {
|
||||
RunDynamicQuantizeMatMulTest<int8_t, false, true>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, NoZeroPoint_HasBias_test_U8) {
|
||||
RunDynamicQuantizeMatMulTest<uint8_t, false, true>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_S8) {
|
||||
RunDynamicQuantizeMatMulTest<int8_t, false, false>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, NoZeroPoint_NoBias_test_U8) {
|
||||
RunDynamicQuantizeMatMulTest<uint8_t, false, false>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_S8) {
|
||||
RunDynamicQuantizeMatMulTest<int8_t, true, true>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, HasZeroPoint_HasBias_test_U8) {
|
||||
RunDynamicQuantizeMatMulTest<uint8_t, true, true>();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, UInt8_test_with_empty_input) {
|
||||
std::vector<int64_t> A_dims{0, 128};
|
||||
std::vector<int64_t> B_dims{128, 128};
|
||||
std::vector<int64_t> Y_dims{0, 128};
|
||||
|
||||
TestDynamicQuantizeMatMul<uint8_t>(A_dims,
|
||||
B_dims,
|
||||
"testdata/dynamic_quantize_matmul_uint8.onnx",
|
||||
false /*is_matrix_b_constant*/);
|
||||
|
||||
TestDynamicQuantizeMatMul<uint8_t>(A_dims,
|
||||
B_dims,
|
||||
"testdata/dynamic_quantize_matmul_uint8.onnx",
|
||||
true /*is_matrix_b_constant*/);
|
||||
std::vector<int64_t> A_dims{0, 2};
|
||||
std::vector<int64_t> B_dims{2, 2};
|
||||
std::vector<int64_t> Y_dims{0, 2};
|
||||
OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
|
||||
test.AddInput<float>("T1", A_dims, {});
|
||||
test.AddInput<uint8_t>("T2", B_dims, {1, 6, 0, 8});
|
||||
test.AddInput<float>("b_scale", {1}, {1.0f});
|
||||
test.AddInput<uint8_t>("b_zero_point", {1}, {0});
|
||||
test.AddOptionalInputEdge<float>();
|
||||
test.AddOutput<float>("Y", {0, 2}, {});
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, B_PerColumn_ND) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue