[CUDA] Support CUDA EP blocked quantization in Q/DQ ops. (#21846)

### Description
1. Added CUDA EP support for blocked quantization in QuantizeLinear and
DequantizeLinear ops.
2. Currently CUDA EP blocked quantization only supports int4/uint4
quantized types and float32/float16 unquantized types.
3. Added CUDA EP support in QDQ selector/action transformer. CUDA EP is
only added to DQ + MatMul -> MatMulNBits rule. Other rules' EP support
are not changed.



### Motivation and Context
ONNX opset 21 introduced blocked quantization for Q/DQ opts. ORT
originally only supports CPU EP blocked quantization.
This commit is contained in:
Jing Fang 2024-08-30 18:28:00 -07:00 committed by GitHub
parent 60b07623a2
commit 5dee95fa10
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 1922 additions and 193 deletions

View file

@ -587,7 +587,8 @@ Do not modify directly.*
|DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
|DequantizeLinear|*in* x:**T**<br> *in* x_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *out* y:**tensor(float)**<br><br>or<br><br>*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|19+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|DequantizeLinear|*in* x:**T**<br> *in* x_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *out* y:**tensor(float)**<br><br>or<br><br>*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|||[13, 18]|**T** = tensor(int8), tensor(uint8)|
|||[10, 12]|**T** = tensor(int8), tensor(uint8)|
|Div|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
@ -718,7 +719,8 @@ Do not modify directly.*
|||[13, 14]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)<br/> **T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)<br/> **T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
|||[7, 11]|**T** = tensor(double), tensor(float), tensor(float16)|
|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**<br><br>or<br><br>*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|19+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**<br><br>or<br><br>*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)|
|||[19, 20]|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
|||[13, 18]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|||[10, 12]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|RNN|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|

View file

@ -23,7 +23,10 @@ void SplitQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
const std::string action_name{"dropSplitQDQ"};
std::unique_ptr<Action> action = std::make_unique<QDQ::SplitReplaceWithQuant>();
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::SplitSelector>(true /*req_equal_quant_params*/);
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::SplitSelector>(true /*req_equal_quant_params*/,
false,
providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"Split", {}}},
std::move(selector),
@ -63,14 +66,18 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
//
// And cannot eliminate the QDQ for MaxPool if the scale is not positive, as a negative
// scale will change the ordering of the elements between quantized & de-quantized values.
std::unique_ptr<NodeSelector> selector_no_16bit = std::make_unique<QDQ::DropQDQNodesSelector>(false);
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
std::unique_ptr<NodeSelector> selector_no_16bit = std::make_unique<QDQ::DropQDQNodesSelector>(false,
false,
true,
providers);
qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_name,
{{"Resize", {}}},
std::move(selector_no_16bit),
std::move(drop_action_no_int16));
std::unique_ptr<NodeSelector> selector_no_16bit_and_positive_scale =
std::make_unique<QDQ::DropQDQNodesSelector>(false, true, false);
std::make_unique<QDQ::DropQDQNodesSelector>(false, true, false, providers);
qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name,
{{"MaxPool", {12}},
{"ReduceMax", {}},
@ -78,7 +85,7 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
std::move(selector_no_16bit_and_positive_scale),
std::move(drop_action_no_int16_and_positive_scale));
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropQDQNodesSelector>(true);
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropQDQNodesSelector>(true, false, true, providers);
// DepthToSpace and SpaceToDepth not included because there are no integer implementations.
// https://github.com/microsoft/onnxruntime/issues/21287
qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name,
@ -117,7 +124,8 @@ void DropDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
#if !defined(ORT_MINIMAL_BUILD)
// TODO: Enable 16-bit types in selector when ArgMax supports 16-bit integer input tensors.
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQNodesSelector>();
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQNodesSelector>(false, false, providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"ArgMax", {}}},
std::move(selector),
@ -200,7 +208,8 @@ void VariadicOpQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
#if !defined(ORT_MINIMAL_BUILD)
// TODO: Enable 16-bit types in selector when QLinearConcat supports 16-bit.
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::InputVariadicSelector>();
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::InputVariadicSelector>(false, false, providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"Concat", {}}},
@ -222,7 +231,11 @@ void ConvQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool is_
#if !defined(ORT_MINIMAL_BUILD)
// TODO: Enable 16-bit types in selector when QLinearConv supports 16-bit.
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed);
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed,
false,
false,
providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"Conv", {}}},
@ -245,7 +258,11 @@ void MatMulQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool i
#if !defined(ORT_MINIMAL_BUILD)
// TODO: Enable 16-bit types in selector when QLinearMatMul and MatMulInteger support 16-bit.
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed);
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed,
false,
false,
providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"MatMul", {}}},
std::move(selector),
@ -272,7 +289,8 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi
p_buffered_tensors);
#if !defined(ORT_MINIMAL_BUILD)
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DQMatMulToMatMulNBitsSelector>();
std::vector<const char*> providers = {kCpuExecutionProvider, kCudaExecutionProvider};
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DQMatMulToMatMulNBitsSelector>(providers);
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
{{"MatMul", {}}},
std::move(selector),
@ -363,8 +381,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer(
CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level,
intra_op_thread_pool, p_buffered_tensors),
apply_context,
// this transformer is only compatible with the CPU and DML EP
{kCpuExecutionProvider, kDmlExecutionProvider}} {
// this transformer is compatible with CPU, DML and CUDA EP.
// There is further EP control on the rule level.
{kCpuExecutionProvider, kDmlExecutionProvider, kCudaExecutionProvider}} {
}
} // namespace onnxruntime

View file

@ -302,14 +302,20 @@ class BaseSelector : public NodeSelector {
class DropQDQNodesSelector : public BaseSelector {
public:
explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, bool allow_nonpositive_scale = true)
: BaseSelector(std::make_unique<DropQDQNodeGroupSelector>(allow_16bit, allow_4bit, allow_nonpositive_scale)) {}
explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false,
bool allow_nonpositive_scale = true,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<DropQDQNodeGroupSelector>(allow_16bit, allow_4bit, allow_nonpositive_scale),
compatible_providers) {}
};
class DropDQNodesSelector : public BaseSelector {
public:
explicit DropDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false)
: BaseSelector(std::make_unique<DropDQNodeGroupSelector>(allow_16bit, allow_4bit)) {}
explicit DropDQNodesSelector(bool allow_16bit = false,
bool allow_4bit = false,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<DropDQNodeGroupSelector>(allow_16bit, allow_4bit),
compatible_providers) {}
};
class UnarySelector : public BaseSelector {
@ -329,8 +335,11 @@ class BinarySelector : public BaseSelector {
// Variadic DQ nodes -> node -> Q
class InputVariadicSelector : public BaseSelector {
public:
explicit InputVariadicSelector(bool allow_16bit = false, bool allow_4bit = false)
: BaseSelector(std::make_unique<VariadicNodeGroupSelector>(allow_16bit, allow_4bit)) {}
explicit InputVariadicSelector(bool allow_16bit = false,
bool allow_4bit = false,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<VariadicNodeGroupSelector>(allow_16bit, allow_4bit),
compatible_providers) {}
void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
};
@ -338,8 +347,10 @@ class InputVariadicSelector : public BaseSelector {
// DQ -> Split -> variadic Q nodes
class SplitSelector : public BaseSelector {
public:
SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false)
: BaseSelector(std::make_unique<SplitNodeGroupSelector>(req_equal_quant_params, allow_4bit)) {}
SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<SplitNodeGroupSelector>(req_equal_quant_params, allow_4bit),
compatible_providers) {}
void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
};
@ -347,8 +358,10 @@ class SplitSelector : public BaseSelector {
// DQ nodes for X, W and optionally B -> node -> Q
class ConvSelector : public BaseSelector {
public:
ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false)
: BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight)) {}
ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight),
compatible_providers) {}
void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
};
@ -363,9 +376,11 @@ class WhereSelector : public BaseSelector {
// 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not
class MatMulSelector : public BaseSelector {
public:
MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false)
MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false,
gsl::span<const char*> compatible_providers = {})
: BaseSelector(std::make_unique<MatMulNodeGroupSelector>(int8_allowed, /*matmulintegertofloat_allowed*/ true,
allow_16bit, allow_4bit)) {}
allow_16bit, allow_4bit),
compatible_providers) {}
};
// Convert "1 DQ node for input B -> MatMul" to "MatMulNBits"

View file

@ -4,6 +4,7 @@
#include "core/common/inlined_containers.h"
#include "core/common/parse_string.h"
#include "core/framework/int4.h"
#include "core/providers/shared_library/provider_api.h"
#include "core/platform/env_var_utils.h"
#include "core/providers/cuda/cuda_execution_provider.h"
@ -1348,38 +1349,37 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast);
#endif
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear);
#endif
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear);
#endif
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear);
#endif
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear);
#endif
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Scan);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape);
#endif
// Opset 20
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu);
@ -1388,6 +1388,40 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN);
// Opset 21.
// TODO(fajin): support other quantized types
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear);
#endif
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear);
#endif
#endif
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
return {};
@ -2265,34 +2299,34 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape)>,
@ -2305,6 +2339,37 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN)>,
// Opset 21
// TODO(fajin): support other quantized types
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear)>,
#endif
#endif
};

View file

@ -7,36 +7,181 @@
namespace onnxruntime {
namespace cuda {
void ValidateBlockQuantizationShapes(const TensorShape& input_shape,
const TensorShape& scale_shape,
const Tensor* zero_point,
size_t axis_no_neg,
int64_t block_size_) {
ORT_ENFORCE(scale_shape.NumDimensions() == input_shape.NumDimensions(),
"scale and input must have the same rank for blocked quantization");
for (size_t i = 0, ndim = input_shape.NumDimensions(); i < ndim; ++i) {
if (i == static_cast<size_t>(axis_no_neg)) {
ORT_ENFORCE(scale_shape[i] == (input_shape[i] + block_size_ - 1) / block_size_,
"scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization");
} else {
ORT_ENFORCE(scale_shape[i] == input_shape[i],
"scale and input must have the same shape despite the quantize axis for blocked quantization");
}
}
if (zero_point) {
ORT_ENFORCE(zero_point->Shape() == scale_shape,
"zero_point and scale must have the same shape for blocked quantization");
}
}
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool /*saturate*/) {
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
size_t num_of_element, bool /*saturate*/) {
ORT_UNUSED_PARAMETER(stream);
ORT_UNUSED_PARAMETER(input);
ORT_UNUSED_PARAMETER(output);
ORT_UNUSED_PARAMETER(scale);
ORT_UNUSED_PARAMETER(zero_point);
ORT_UNUSED_PARAMETER(num_of_element);
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
}
template <class T, class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
size_t num_of_element, size_t batch_size, size_t n_scales, bool /*saturate*/) {
ORT_UNUSED_PARAMETER(stream);
ORT_UNUSED_PARAMETER(input);
ORT_UNUSED_PARAMETER(output);
ORT_UNUSED_PARAMETER(scale);
ORT_UNUSED_PARAMETER(zero_point);
ORT_UNUSED_PARAMETER(num_of_element);
ORT_UNUSED_PARAMETER(batch_size);
ORT_UNUSED_PARAMETER(n_scales);
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
}
template <typename T, typename U>
Status CudaQuantizeLinearBlock(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
ORT_UNUSED_PARAMETER(stream);
ORT_UNUSED_PARAMETER(input);
ORT_UNUSED_PARAMETER(output);
ORT_UNUSED_PARAMETER(scale);
ORT_UNUSED_PARAMETER(zero_point);
ORT_UNUSED_PARAMETER(num_of_element);
ORT_UNUSED_PARAMETER(K);
ORT_UNUSED_PARAMETER(N);
ORT_UNUSED_PARAMETER(block_size);
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
}
template <class U>
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, int8_t* output, const U* scale,
const int8_t* zero_point, size_t num_of_element, bool /*saturate*/) {
return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
}
template <class U>
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, uint8_t* output, const U* scale,
const uint8_t* zero_point, size_t num_of_element, bool /*saturate*/) {
return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
}
template <class U>
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Int4x2* output, const U* scale,
const Int4x2* zero_point, size_t num_of_element, bool /*saturate*/) {
return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
num_of_element);
}
template <class U>
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale,
const UInt4x2* zero_point, size_t num_of_element, bool /*saturate*/) {
return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
num_of_element);
}
#if !defined(DISABLE_FLOAT8_TYPES)
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool saturate) {
template <class U>
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale,
const Float8E4M3FN* zero_point, size_t num_of_element, bool saturate) {
return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate);
}
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool saturate) {
return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales, saturate);
template <class U>
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale,
const Float8E5M2* zero_point, size_t num_of_element, bool saturate) {
return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate);
}
template <class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale,
const Float8E4M3FN* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool saturate) {
return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size,
n_scales, saturate);
}
template <class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale,
const Float8E5M2* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool saturate) {
return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size,
n_scales, saturate);
}
#endif
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool /*saturate*/) {
template <class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, int8_t* output, const U* scale,
const int8_t* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool /*saturate*/) {
return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, uint8_t* output, const U* scale,
const uint8_t* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool /*saturate*/) {
return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Int4x2* output, const U* scale,
const Int4x2* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool /*saturate*/) {
return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale,
const UInt4x2* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool /*saturate*/) {
return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
num_of_element, batch_size, n_scales);
}
template <typename U>
Status CudaQuantizeLinearBlock(cudaStream_t stream,
const U* input, Int4x2* output, const U* scale, const Int4x2* zero_point,
size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
num_of_element, K, N, block_size);
}
template <typename U>
Status CudaQuantizeLinearBlock(cudaStream_t stream,
const U* input, UInt4x2* output, const U* scale, const UInt4x2* zero_point,
size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
num_of_element, K, N, block_size);
}
template <class T, class U>
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToCudaType<U>::MappedType CudaU;
@ -48,21 +193,22 @@ Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
auto& y = *ctx->Output(0, x.Shape());
const auto& x_shape = x.Shape();
const auto num_of_elements = x_shape.Size();
const CudaU* input = reinterpret_cast<const CudaU*>(x.Data<U>());
T* output = y.MutableData<T>();
if (IsScalarOr1ElementVector(&y_scale)) {
if (IsScalarOr1ElementVector(&y_scale)) { // per-tensor quantization
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point),
"y_zero_point must be a scalar or 1D tensor of size 1.");
ORT_ENFORCE(block_size_ == 0, "block_size must be 0 for per-tensor quantization.");
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements, saturate_));
return Status::OK();
} else {
} else if (block_size_ == 0) { // per-axis quantization
ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1);
ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() &&
y_zero_point->Shape().NumDimensions() == 1),
@ -73,44 +219,184 @@ Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaQuantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements,
x_shape.SizeToDimension(axis), y_scale.Shape().Size(), saturate_));
return Status::OK();
} else { // blocked quantization
// validate shape
size_t axis_no_neg = SafeInt<size_t>(HandleNegativeAxis(axis_, x_shape.NumDimensions()));
const auto& y_scale_shape = y_scale.Shape();
ValidateBlockQuantizationShapes(x_shape,
y_scale_shape,
y_zero_point,
axis_no_neg,
block_size_);
// compute
const T* zero_point = y_zero_point ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
ORT_RETURN_IF_ERROR(CudaQuantizeLinearBlock(Stream(ctx), input, output, scale, zero_point,
num_of_elements, x_shape[axis_no_neg],
x_shape.SizeFromDimension(axis_no_neg + 1),
block_size_, saturate_));
return Status::OK();
}
}
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
Status CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element) {
ORT_UNUSED_PARAMETER(stream);
ORT_UNUSED_PARAMETER(input);
ORT_UNUSED_PARAMETER(output);
ORT_UNUSED_PARAMETER(scale);
ORT_UNUSED_PARAMETER(zero_point);
ORT_UNUSED_PARAMETER(num_of_element);
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
}
template <class T, class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
ORT_UNUSED_PARAMETER(stream);
ORT_UNUSED_PARAMETER(input);
ORT_UNUSED_PARAMETER(output);
ORT_UNUSED_PARAMETER(scale);
ORT_UNUSED_PARAMETER(zero_point);
ORT_UNUSED_PARAMETER(num_of_element);
ORT_UNUSED_PARAMETER(batch_size);
ORT_UNUSED_PARAMETER(n_scales);
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
}
template <class T, class U>
Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size) {
ORT_UNUSED_PARAMETER(stream);
ORT_UNUSED_PARAMETER(input);
ORT_UNUSED_PARAMETER(output);
ORT_UNUSED_PARAMETER(scale);
ORT_UNUSED_PARAMETER(zero_point);
ORT_UNUSED_PARAMETER(num_of_element);
ORT_UNUSED_PARAMETER(K);
ORT_UNUSED_PARAMETER(N);
ORT_UNUSED_PARAMETER(block_size);
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
}
template <class U>
Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, U* output, const U* scale,
const int8_t* zero_point, size_t num_of_element) {
return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
}
template <class U>
Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, U* output, const U* scale,
const uint8_t* zero_point, size_t num_of_element) {
return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
}
template <class U>
Status CudaDequantizeLinear(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
const Int4x2* zero_point, size_t num_of_element) {
return CudaDequantizeLinearStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
num_of_element);
}
template <class U>
Status CudaDequantizeLinear(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
const UInt4x2* zero_point, size_t num_of_element) {
return CudaDequantizeLinearStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
num_of_element);
}
#if !defined(DISABLE_FLOAT8_TYPES)
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
template <class U>
Status CudaDequantizeLinear(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale,
const Float8E4M3FN* zero_point, size_t num_of_element) {
return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element);
}
template <class U>
Status CudaDequantizeLinear(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale,
const Float8E5M2* zero_point, size_t num_of_element) {
return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element);
}
#endif
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
template <class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const int8_t* input, U* output, const U* scale,
const int8_t* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const uint8_t* input, U* output, const U* scale,
const uint8_t* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
const Int4x2* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
const UInt4x2* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
num_of_element, batch_size, n_scales);
}
#if !defined(DISABLE_FLOAT8_TYPES)
template <class T, class U>
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
template <class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale,
const Float8E4M3FN* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
}
template <class U>
Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale,
const Float8E5M2* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
}
#endif
template <class U>
Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
const UInt4x2* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size) {
return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
num_of_element, K, N, block_size);
}
template <class U>
Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
const Int4x2* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size) {
return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
num_of_element, K, N, block_size);
}
template <class T, class U>
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
typedef typename ToCudaType<U>::MappedType CudaU;
@ -120,6 +406,7 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
auto* y_zero_point = ctx->Input<Tensor>(2);
const auto& x_shape = x.Shape();
const auto num_of_elements = x_shape.Size();
auto& y = *ctx->Output(0, x_shape);
@ -131,12 +418,11 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements));
return Status::OK();
} else {
} else if (block_size_ == 0) { // per axis quantization
ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1);
ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() && y_zero_point->Shape().NumDimensions() == 1), "scale and zero_point must have the same shape.");
ORT_ENFORCE(x_shape.NumDimensions() > 1);
@ -145,11 +431,31 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
const auto num_of_elements = x_shape.Size();
ORT_RETURN_IF_ERROR(CudaDequantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements,
x_shape.SizeToDimension(axis), y_scale.Shape().Size()));
return Status::OK();
} else { // blocked quantization
// validate shape
auto axis_no_neg = SafeInt<size_t>(HandleNegativeAxis(axis_, x_shape.NumDimensions()));
const auto& y_scale_shape = y_scale.Shape();
ValidateBlockQuantizationShapes(x_shape,
y_scale_shape,
y_zero_point,
axis_no_neg,
block_size_);
// compute
const T* zero_point = y_zero_point ? y_zero_point->Data<T>() : nullptr;
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
ORT_RETURN_IF_ERROR(CudaDequantizeLinearBlockInt4(Stream(ctx), input, output, scale, zero_point,
num_of_elements, x_shape[axis_no_neg],
x_shape.SizeFromDimension(axis_no_neg + 1),
block_size_));
return Status::OK();
}
}
@ -183,33 +489,54 @@ REGISTER_Q_KERNEL_TYPED_10_12(uint8_t)
REGISTER_Q_KERNEL_TYPED_13_18(int8_t)
REGISTER_Q_KERNEL_TYPED_13_18(uint8_t)
#define REGISTER_Q_KERNEL_TYPED_19(T) \
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
19, \
T, float, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, float>); \
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
19, \
T, MLFloat16, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<MLFloat16>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, MLFloat16>);
#define REGISTER_Q_KERNEL_TWO_TYPED_19_20(T, U) \
ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
19, 20, \
T, U, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<U>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, U>);
REGISTER_Q_KERNEL_TYPED_19(int8_t)
REGISTER_Q_KERNEL_TYPED_19(uint8_t)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, float)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, float)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16)
#if !defined(DISABLE_FLOAT8_TYPES)
REGISTER_Q_KERNEL_TYPED_19(Float8E4M3FN)
REGISTER_Q_KERNEL_TYPED_19(Float8E5M2)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, float)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16)
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16)
#endif
#define REGISTER_Q_KERNEL_TWO_TYPED_21(T, U) \
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
QuantizeLinear, \
kOnnxDomain, \
21, \
T, U, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<U>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
QuantizeLinear<T, U>);
REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, float)
REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, float)
REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16)
REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, MLFloat16)
REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, float)
REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, float)
REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16)
REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16)
#if !defined(DISABLE_FLOAT8_TYPES)
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, float)
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, float)
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16)
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16)
#endif
// register DequantizeLinear kernels
@ -240,33 +567,54 @@ REGISTER_DQ_KERNEL_TYPED_10_12(uint8_t)
REGISTER_DQ_KERNEL_TYPED_13_18(int8_t)
REGISTER_DQ_KERNEL_TYPED_13_18(uint8_t)
#define REGISTER_DQ_KERNEL_TYPED_19(T) \
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
19, \
T, float, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<float>()), \
DequantizeLinear<T, float>); \
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
19, \
T, MLFloat16, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<MLFloat16>()), \
DequantizeLinear<T, MLFloat16>);
#define REGISTER_DQ_KERNEL_TWO_TYPED_19_20(T, U) \
ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
19, 20, \
T, U, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<U>()), \
DequantizeLinear<T, U>);
REGISTER_DQ_KERNEL_TYPED_19(int8_t)
REGISTER_DQ_KERNEL_TYPED_19(uint8_t)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, float)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, float)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16)
#if !defined(DISABLE_FLOAT8_TYPES)
REGISTER_DQ_KERNEL_TYPED_19(Float8E4M3FN)
REGISTER_DQ_KERNEL_TYPED_19(Float8E5M2)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, float)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16)
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16)
#endif
#define REGISTER_DQ_KERNEL_TWO_TYPED_21(T, U) \
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
DequantizeLinear, \
kOnnxDomain, \
21, \
T, U, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<U>()), \
DequantizeLinear<T, U>);
REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, float)
REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, float)
REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16)
REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, MLFloat16)
REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, float)
REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, float)
REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16)
REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16)
#if !defined(DISABLE_FLOAT8_TYPES)
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, float)
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, float)
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16)
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16)
#endif
// specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal
@ -278,6 +626,10 @@ SPECIALIZED_QDQ_COMPUTE(int8_t, float)
SPECIALIZED_QDQ_COMPUTE(uint8_t, float)
SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16)
SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16)
SPECIALIZED_QDQ_COMPUTE(Int4x2, float)
SPECIALIZED_QDQ_COMPUTE(UInt4x2, float)
SPECIALIZED_QDQ_COMPUTE(Int4x2, MLFloat16)
SPECIALIZED_QDQ_COMPUTE(UInt4x2, MLFloat16)
#if !defined(DISABLE_FLOAT8_TYPES)
SPECIALIZED_QDQ_COMPUTE(Float8E4M3FN, float)

View file

@ -9,6 +9,7 @@
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
#include "cuda_fp8.h"
#include "cuda_fp16.h"
#endif
namespace onnxruntime {
@ -17,9 +18,23 @@ namespace cuda {
template <typename InT, typename OutT>
struct RoundStd;
template <typename InT, typename OutT>
struct RoundStdInt4;
template <typename InT, typename OutT>
struct RoundSat;
template <typename T>
__device__ __forceinline__ int ExtractInt4FromByte(T byte, int index) {
return static_cast<int>((byte >> (index << 2)) & 0x0f);
}
template <>
__device__ __forceinline__ int ExtractInt4FromByte<int8_t>(int8_t byte, int index) {
constexpr auto shift = (sizeof(int) << 3) - 4;
return (static_cast<int>(((byte >> (index << 2)) & 0x0f)) << shift) >> shift;
}
template <>
struct RoundStd<float, int8_t> {
__device__ __forceinline__ int8_t operator()(float v, float scale, int8_t zero_point) const {
@ -28,6 +43,22 @@ struct RoundStd<float, int8_t> {
}
};
template <>
struct RoundStdInt4<float, int8_t> {
__device__ __forceinline__ int8_t operator()(float v0,
float v1,
float scale0,
float scale1,
int zp0,
int zp1) const {
int value0 = __float2int_rn(v0 / scale0) + zp0;
int value1 = __float2int_rn(v1 / scale1) + zp1;
int value0_clip = max(-8, min(7, value0));
int value1_clip = max(-8, min(7, value1));
return static_cast<int8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
}
};
template <>
struct RoundStd<float, uint8_t> {
__device__ __forceinline__ uint8_t operator()(float v, float scale, uint8_t zero_point) const {
@ -36,6 +67,22 @@ struct RoundStd<float, uint8_t> {
}
};
template <>
struct RoundStdInt4<float, uint8_t> {
__device__ __forceinline__ uint8_t operator()(float v0,
float v1,
float scale0,
float scale1,
int zp0,
int zp1) const {
int value0 = __float2int_rn(v0 / scale0) + zp0;
int value1 = __float2int_rn(v1 / scale1) + zp1;
int value0_clip = max(0, min(15, value0));
int value1_clip = max(0, min(15, value1));
return static_cast<uint8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
}
};
#if !defined(DISABLE_FLOAT8_TYPES)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
@ -104,7 +151,7 @@ struct RoundSat<half, Float8E5M2> {
#endif
#endif // DISABLE_FLOAT8_TYPES
#endif // DISABLE_FLOAT8_TYPES
template <>
struct RoundStd<half, int8_t> {
@ -114,6 +161,26 @@ struct RoundStd<half, int8_t> {
}
};
template <>
struct RoundStdInt4<half, int8_t> {
__device__ __forceinline__ int8_t operator()(half v0,
half v1,
half scale0,
half scale1,
int zp0,
int zp1) const {
half2 v = __halves2half2(v0, v1);
half2 scale = __halves2half2(scale0, scale1);
half2 scaled_v = v / scale;
int value0 = __half2int_rn(__low2half(scaled_v)) + zp0;
int value1 = __half2int_rn(__high2half(scaled_v)) + zp1;
int value0_clip = max(-8, min(7, value0));
int value1_clip = max(-8, min(7, value1));
return static_cast<int8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
}
};
template <>
struct RoundStd<half, uint8_t> {
__device__ __forceinline__ int8_t operator()(half v, half scale, uint8_t zero_point) const {
@ -122,6 +189,26 @@ struct RoundStd<half, uint8_t> {
}
};
template <>
struct RoundStdInt4<half, uint8_t> {
__device__ __forceinline__ uint8_t operator()(half v0,
half v1,
half scale0,
half scale1,
int zp0,
int zp1) const {
half2 v = __halves2half2(v0, v1);
half2 scale = __halves2half2(scale0, scale1);
half2 scaled_v = v / scale;
int value0 = __half2int_rn(__low2half(scaled_v)) + zp0;
int value1 = __half2int_rn(__high2half(scaled_v)) + zp1;
int value0_clip = max(0, min(15, value0));
int value1_clip = max(0, min(15, value1));
return static_cast<uint8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
}
};
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, RoundStd<InT, OutT> round) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
@ -137,6 +224,29 @@ __global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const In
}
}
// cuda kernel for int4 per tensor quantization with standard rounding
// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
// NumElementsPerThread must be multiple of 2.
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernelStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
const OutT* zero_point_ptr, CUDA_LONG N,
RoundStdInt4<InT, OutT> round) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
InT scale = *scale_ptr;
int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0;
int i = 0;
constexpr int step = NumThreadsPerBlock << 1;
#pragma unroll
for (; i + 1 < NumElementsPerThread && id + 1 < N; i += 2, id += step) {
output[id >> 1] = round(input[id], input[id + 1], scale, scale, zero_point, zero_point);
}
if (i < NumElementsPerThread && id < N) {
output[id >> 1] = round(input[id], 0.0, scale, 1.0, zero_point, 0);
}
}
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, size_t batch_size, size_t n_scales, RoundStd<InT, OutT> round) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
@ -154,6 +264,97 @@ __global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, cons
}
}
// cuda kernel for int4 per axis quantization with standard rounding
// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
// NumElementsPerThread must be multiple of 2.
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
const OutT* zero_point_ptr, CUDA_LONG num_element,
size_t batch_size, size_t n_scales,
RoundStdInt4<InT, OutT> round) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
// Process continuous NumElementsPerThread int4 per thread.
int i = 0;
// The scale needs to change every n_same_scale.
CUDA_LONG n_same_scale = num_element / (batch_size * n_scales);
constexpr int step = NumThreadsPerBlock << 1;
#pragma unroll
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
int scale_id0 = (id / n_same_scale) % n_scales;
int scale_id1 = ((id + 1) / n_same_scale) % n_scales;
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
output[id >> 1] = round(input[id],
input[id + 1],
scale_ptr[scale_id0],
scale_ptr[scale_id1],
zp0,
zp1);
}
if (i < NumElementsPerThread && id < num_element) {
int scale_id0 = (id / n_same_scale) % n_scales;
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
output[id >> 1] = round(input[id],
0.0,
scale_ptr[scale_id0],
1.0,
zp0,
0);
}
}
// cuda kernel for int4 block-wise quantization with standard rounding
// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
// NumElementsPerThread must be multiple of 2.
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
__global__ void QuantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
const OutT* zero_point_ptr, CUDA_LONG num_element, size_t KN,
size_t N, size_t scale_KN, size_t block_size,
RoundStdInt4<InT, OutT> round) {
// Process continuous NumElementsPerThread int4 per thread.
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
int i = 0;
constexpr int step = NumThreadsPerBlock << 1;
#pragma unroll
// Process two elements which belong to one byte at a time.
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
int x0 = id / KN, x1 = (id + 1) / KN;
int y0 = id % KN / N, y1 = (id + 1) % KN / N;
int z0 = id % N, z1 = (id + 1) % N;
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1;
output[id >> 1] = round(input[id],
input[id + 1],
scale_ptr[scale_id0],
scale_ptr[scale_id1],
zero_point_ptr == nullptr
? 0
: ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1),
zero_point_ptr == nullptr
? 0
: ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1));
}
// last non-paired element
if (i < NumElementsPerThread && id < num_element) {
int x0 = id / KN;
int y0 = id % KN / N;
int z0 = id % N;
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
output[id >> 1] = round(input[id],
0.0,
scale_ptr[scale_id0],
1.0,
zero_point_ptr == nullptr
? 0
: ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1),
0);
}
}
#if !defined(DISABLE_FLOAT8_TYPES)
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
@ -207,6 +408,27 @@ Status CudaQuantizeLinearStd(cudaStream_t stream, const InT* input, OutT* output
return Status::OK();
}
template <class OutT, class InT>
Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
const OutT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
static_assert((GridDim::maxElementsPerThread & 1) == 0);
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
QuantizeLinearKernelStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element),
RoundStdInt4<InT, OutT>());
return Status::OK();
}
template <class OutT, class InT>
Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
@ -226,6 +448,59 @@ Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* ou
return Status::OK();
}
template <class OutT, class InT>
Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
const OutT* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
if (num_of_element <= 0)
return Status::OK();
static_assert((GridDim::maxElementsPerThread & 1) == 0);
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
QuantizeLinearKernelAxisStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element),
batch_size,
n_scales,
RoundStdInt4<InT, OutT>());
return Status::OK();
}
template <class OutT, class InT>
Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
const OutT* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size) {
if (num_of_element <= 0)
return Status::OK();
static_assert((GridDim::maxElementsPerThread & 1) == 0);
size_t KN = K * N;
size_t num_block = (K + block_size - 1) / block_size;
size_t scale_KN = num_block * N;
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
QuantizeLinearKernelBlockStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element),
KN,
N,
scale_KN,
block_size,
RoundStdInt4<InT, OutT>());
return Status::OK();
}
#if !defined(DISABLE_FLOAT8_TYPES)
template <class OutT, class InT>
@ -282,6 +557,29 @@ __global__ void DequantizeLinearKernelStd(const InT* input, OutT* output, const
}
}
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernelStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
const InT* zero_point_ptr, CUDA_LONG num_element) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
OutT scale = *scale_ptr;
int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0;
int i = 0, v0, v1;
constexpr int step = NumThreadsPerBlock << 1;
#pragma unroll
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
v0 = ExtractInt4FromByte(input[id >> 1], 0);
v1 = ExtractInt4FromByte(input[id >> 1], 1);
output[id] = static_cast<OutT>(v0 - zero_point) * scale;
output[id + 1] = static_cast<OutT>(v1 - zero_point) * scale;
}
if (i < NumElementsPerThread && id < num_element) {
v0 = ExtractInt4FromByte(input[id >> 1], 0);
output[id] = static_cast<OutT>(v0 - zero_point) * scale;
}
}
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N,
size_t batch_size, size_t n_scales) {
@ -300,6 +598,80 @@ __global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, co
}
}
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
const InT* zero_point_ptr, CUDA_LONG num_element,
size_t batch_size, size_t n_scales) {
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
// The scale needs to change every n_same_scale.
CUDA_LONG n_same_scale = num_element / (batch_size * n_scales);
int i = 0;
int scale_id0, scale_id1, zp0, zp1, v0, v1;
constexpr int step = NumThreadsPerBlock << 1;
#pragma unroll
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
scale_id0 = (id / n_same_scale) % n_scales;
scale_id1 = ((id + 1) / n_same_scale) % n_scales;
v0 = ExtractInt4FromByte(input[id >> 1], 0);
v1 = ExtractInt4FromByte(input[id >> 1], 1);
zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
output[id + 1] = static_cast<OutT>(v1 - zp1) * scale_ptr[scale_id1];
}
if (i < NumElementsPerThread && id < num_element) {
scale_id0 = (id / n_same_scale) % n_scales;
v0 = ExtractInt4FromByte(input[id >> 1], 0);
zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
}
}
// cuda kernel for int4 block-wise dequantization with standard rounding
// IntT is int8_t for Int4x2 and uint8_t for UInt4x2
// NumElementsPerThread must be multiple of 2.
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
__global__ void DequantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
const InT* zero_point_ptr, CUDA_LONG num_element,
size_t KN, size_t N, size_t scale_KN, size_t block_size) {
// Process continuous NumElementsPerThread int4 per thread.
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
int i = 0;
constexpr int step = NumThreadsPerBlock << 1;
#pragma unroll
// Process two elements which belong to one byte at a time.
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
int x0 = id / KN, x1 = (id + 1) / KN;
int y0 = id % KN / N, y1 = (id + 1) % KN / N;
int z0 = id % N, z1 = (id + 1) % N;
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1;
int v0 = ExtractInt4FromByte(input[id >> 1], 0);
int v1 = ExtractInt4FromByte(input[id >> 1], 1);
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
output[id + 1] = static_cast<OutT>(v1 - zp1) * scale_ptr[scale_id1];
}
// last non-paired element
if (i < NumElementsPerThread && id < num_element) {
int x0 = id / KN;
int y0 = id % KN / N;
int z0 = id % N;
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
int v0 = ExtractInt4FromByte(input[id >> 1], 0);
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
}
}
template <typename InT, typename OutT>
struct DQFloat8;
@ -422,6 +794,26 @@ Status CudaDequantizeLinearStd(cudaStream_t stream, const InT* input, OutT* outp
return Status::OK();
}
template <class InT, class OutT>
Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale,
const InT* zero_point, size_t num_of_element) {
if (num_of_element <= 0)
return Status::OK();
static_assert((GridDim::maxElementsPerThread & 1) == 0);
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
DequantizeLinearKernelStdInt4<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element));
return Status::OK();
}
template <class InT, class OutT>
Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
@ -440,6 +832,57 @@ Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT*
return Status::OK();
}
template <class InT, class OutT>
Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale,
const InT* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales) {
if (num_of_element <= 0)
return Status::OK();
static_assert((GridDim::maxElementsPerThread & 1) == 0);
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
DequantizeLinearKernelAxisStdInt4<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input,
output,
scale,
zero_point,
static_cast<int>(num_of_element),
batch_size,
n_scales);
return Status::OK();
}
template <class T, class U>
Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size) {
if (num_of_element <= 0)
return Status::OK();
static_assert((GridDim::maxElementsPerThread & 1) == 0);
size_t KN = K * N;
size_t num_block = (K + block_size - 1) / block_size;
size_t scale_KN = num_block * N;
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
DequantizeLinearKernelBlockStdInt4<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
input,
output,
scale,
zero_point,
static_cast<CUDA_LONG>(num_of_element),
KN,
N,
scale_KN,
block_size);
return Status::OK();
}
#if !defined(DISABLE_FLOAT8_TYPES)
template <class InT, class OutT>
@ -481,11 +924,24 @@ template Status CudaQuantizeLinearStd<int8_t, float>(cudaStream_t stream, const
template Status CudaQuantizeLinearStd<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearStd<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearStd<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaQuantizeLinearAxisStd<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStd<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStd<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStd<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearAxisStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaQuantizeLinearBlockStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
template Status CudaQuantizeLinearBlockStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
template Status CudaQuantizeLinearBlockStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
template Status CudaQuantizeLinearBlockStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
#if !defined(DISABLE_FLOAT8_TYPES)
@ -505,11 +961,24 @@ template Status CudaDequantizeLinearStd<int8_t, float>(cudaStream_t stream, cons
template Status CudaDequantizeLinearStd<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearStd<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearStd<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
template Status CudaDequantizeLinearAxisStd<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStd<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStd<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStd<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearAxisStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template Status CudaDequantizeLinearBlockStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
template Status CudaDequantizeLinearBlockStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
template Status CudaDequantizeLinearBlockStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
template Status CudaDequantizeLinearBlockStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
#if !defined(DISABLE_FLOAT8_TYPES)

View file

@ -11,33 +11,96 @@ namespace onnxruntime {
namespace cuda {
template <class T, class U>
Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element);
Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element);
template <class T, class U>
Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
bool saturate);
Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element);
template <class T, class U>
Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales);
Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element, bool saturate);
template <class T, class U>
Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales, bool saturate);
Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template <class T, class U>
Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template <class T, class U>
Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales,
bool saturate);
/**
* @brief block-wise quantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization
* axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi)
* = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index
* is xi * ceil(K / block_size) * N + yi / block_size * N + zi.
* @tparam T quantized type, int8_t for Int4x2, uint8_t for UInt4x2
* @tparam U full precision type
* @param stream cuda stream
* @param input input tensor
* @param output output tensor
* @param scale scale tensor
* @param zero_point zero point tensor
* @param num_of_element number of elements in input tensor
* @param K K
* @param N N
* @param block_size block size
*/
template <class T, class U>
Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size);
template <class T, class U>
Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales);
Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element);
template <class T, class U>
Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
size_t batch_size, size_t n_scales);
Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element);
template <class T, class U>
Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
size_t num_of_element);
template <class T, class U>
Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
size_t num_of_element, size_t batch_size, size_t n_scales);
template <class T, class U>
Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
template <class T, class U>
Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
size_t num_of_element, size_t batch_size, size_t n_scales);
/**
* @brief block-wise dequantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization
* axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi)
* = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index
* is xi * ceil(K / block_size) * N + yi / block_size * N + zi.
* @tparam T quantized type, int8_t for Int4x2, uint8_t for UInt4x2
* @tparam U full precision type
* @param stream cuda stream
* @param input input tensor
* @param output output tensor
* @param scale scale tensor
* @param zero_point zero point tensor
* @param num_of_element number of elements in input tensor
* @param K K
* @param N N
* @param block_size block size
*/
template <class T, class U>
Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
const T* zero_point, size_t num_of_element, size_t K, size_t N,
size_t block_size);
} // namespace cuda
} // namespace onnxruntime

View file

@ -19,6 +19,11 @@ class QuantizeLinear final : public CudaKernel {
if (!info.GetAttr<int64_t>("saturate", &saturate_).IsOK()) {
saturate_ = 1;
}
if (!info.GetAttr<int64_t>("block_size", &block_size_).IsOK()) {
block_size_ = 0;
}
ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative.");
}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
@ -26,6 +31,7 @@ class QuantizeLinear final : public CudaKernel {
private:
int64_t axis_;
int64_t saturate_;
int64_t block_size_;
};
template <class T, class U>
@ -35,12 +41,18 @@ class DequantizeLinear final : public CudaKernel {
if (!info.GetAttr<int64_t>("axis", &axis_).IsOK()) {
axis_ = 1;
}
if (!info.GetAttr<int64_t>("block_size", &block_size_).IsOK()) {
block_size_ = 0;
}
ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative.");
}
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
private:
int64_t axis_;
int64_t block_size_;
};
} // namespace cuda

View file

@ -1370,18 +1370,26 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint64_t, Cast);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
float, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
float, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
MLFloat16, DequantizeLinear);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
float, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
float, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
MLFloat16, QuantizeLinear);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Shape);
@ -1390,6 +1398,24 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, S
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN);
// Opset 21
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float,
DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float,
DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16,
DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16,
DequantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float,
QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float,
QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16,
QuantizeLinear);
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16,
QuantizeLinear);
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
return {};
@ -2333,19 +2359,19 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, BFloat16, Cast)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan)>,
@ -2354,6 +2380,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
// opset 20
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN)>,
// opset 21
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear)>,
};
for (auto& function_table_entry : function_table) {

View file

@ -1289,6 +1289,10 @@ struct Tensor final {
template <>
inline bool Tensor::IsDataType<bool>() const { return g_host->Tensor__IsDataType_bool(this); }
template <>
inline bool Tensor::IsDataType<Int4x2>() const { return g_host->Tensor__IsDataType_Int4x2(this); }
template <>
inline bool Tensor::IsDataType<UInt4x2>() const { return g_host->Tensor__IsDataType_UInt4x2(this); }
template <>
inline bool Tensor::IsDataType<int8_t>() const { return g_host->Tensor__IsDataType_int8(this); }
template <>
inline bool Tensor::IsDataType<uint8_t>() const { return g_host->Tensor__IsDataType_uint8(this); }
@ -1327,6 +1331,10 @@ inline bool Tensor::IsDataType<Float8E5M2FNUZ>() const { return g_host->Tensor__
template <>
inline bool* Tensor::MutableData<bool>() { return g_host->Tensor__MutableData_bool(this); }
template <>
inline Int4x2* Tensor::MutableData<Int4x2>() { return g_host->Tensor__MutableData_Int4x2(this); }
template <>
inline UInt4x2* Tensor::MutableData<UInt4x2>() { return g_host->Tensor__MutableData_UInt4x2(this); }
template <>
inline int8_t* Tensor::MutableData<int8_t>() { return g_host->Tensor__MutableData_int8(this); }
template <>
inline uint8_t* Tensor::MutableData<uint8_t>() { return g_host->Tensor__MutableData_uint8(this); }
@ -1365,6 +1373,10 @@ inline Float8E5M2FNUZ* Tensor::MutableData<Float8E5M2FNUZ>() { return g_host->Te
template <>
inline const bool* Tensor::Data<bool>() const { return g_host->Tensor__Data_bool(this); }
template <>
inline const Int4x2* Tensor::Data<Int4x2>() const { return g_host->Tensor__Data_Int4x2(this); }
template <>
inline const UInt4x2* Tensor::Data<UInt4x2>() const { return g_host->Tensor__Data_UInt4x2(this); }
template <>
inline const int8_t* Tensor::Data<int8_t>() const { return g_host->Tensor__Data_int8(this); }
template <>
inline const uint8_t* Tensor::Data<uint8_t>() const { return g_host->Tensor__Data_uint8(this); }

View file

@ -6,6 +6,7 @@
#include <functional>
#include <string>
#include <vector>
#include <memory>
#include "core/common/inlined_containers_fwd.h"
#include "core/common/span_utils.h"
@ -140,7 +141,8 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
double relative_per_sample_tolerance,
std::unique_ptr<GraphTransformer> transformer,
const std::function<void(SessionOptions&)>& add_session_options,
const InlinedHashSet<std::string>& disabled_optimizers) {
const InlinedHashSet<std::string>& disabled_optimizers,
std::unique_ptr<IExecutionProvider> ep) {
// Build the model for this test.
std::unordered_map<std::string, int> domain_to_version;
domain_to_version[kOnnxDomain] = opset_version;
@ -157,6 +159,7 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
// Serialize the model to a string.
std::string model_data;
model.ToProto().SerializeToString(&model_data);
std::shared_ptr<IExecutionProvider> ep_shared = ep ? std::move(ep) : nullptr;
auto run_model = [&](TransformerLevel level, std::vector<OrtValue>& fetches,
std::unique_ptr<GraphTransformer> transformer = nullptr) {
@ -170,6 +173,10 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
add_session_options(session_options);
}
InferenceSessionWrapper session{session_options, GetEnvironment()};
if (ep_shared) {
ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep_shared));
}
ASSERT_STATUS_OK(session.Load(model_data.data(), static_cast<int>(model_data.size())));
if (transformer) {
ASSERT_STATUS_OK(session.RegisterGraphTransformer(std::move(transformer), level));

View file

@ -555,7 +555,8 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
double relative_per_sample_tolerance = 0.0,
std::unique_ptr<GraphTransformer> transformer = nullptr,
const std::function<void(SessionOptions&)>& add_session_options = {},
const InlinedHashSet<std::string>& disabled_optimizers = {});
const InlinedHashSet<std::string>& disabled_optimizers = {},
std::unique_ptr<IExecutionProvider> ep = nullptr);
void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& build_test_case,
const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,

View file

@ -17,6 +17,7 @@
#include "test/optimizer/qdq_test_utils.h"
#include "test/optimizer/graph_transform_test_builder.h"
#include "test/util/include/asserts.h"
#include "test/util/include/default_providers.h"
#include "test/util/include/inference_session_wrapper.h"
#include "gtest/gtest.h"
@ -55,7 +56,8 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector<int64_t>& input1_shape,
const std::vector<int64_t>& input2_shape,
const int64_t axis,
const int64_t block_size,
int64_t accuracy_level) {
int64_t accuracy_level,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
auto build_test_case = [&](ModelTestBuilder& builder) {
auto* input1_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
auto* input2_arg = builder.MakeInput(input2_shape, T(T::min_val, 0), T(T::max_val, 0));
@ -104,7 +106,9 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector<int64_t>& input1_shape,
1e-5 /*per_sample_tolerance*/,
1e-5 /*relative_per_sample_tolerance*/,
nullptr,
add_session_options_fn);
add_session_options_fn,
{},
ep ? std::move(ep) : nullptr);
}
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) {
@ -127,6 +131,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) {
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1);
}
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ_Cuda) {
// DQ contrib op schema is not updated to support blocked quantization
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
;
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
}
// Input2
// |
// DQ /
@ -140,7 +165,8 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector<int64_t>& weight_shape,
const std::vector<int64_t>& input2_shape,
const int64_t axis,
const int64_t block_size,
int64_t accuracy_level) {
int64_t accuracy_level,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
auto build_test_case = [&](ModelTestBuilder& builder) {
auto* weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0));
auto* input2_arg = builder.MakeInput(input2_shape, -100.0f, 100.0f);
@ -189,7 +215,9 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector<int64_t>& weight_shape,
1e-5 /*per_sample_tolerance*/,
1e-5 /*relative_per_sample_tolerance*/,
nullptr,
add_session_options_fn);
add_session_options_fn,
{},
ep ? std::move(ep) : nullptr);
}
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) {
@ -212,6 +240,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) {
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1);
}
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput_Cuda) {
// DQ contrib op schema is not updated to support blocked quantization
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
;
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
}
// Input1
// |
// \ DQ
@ -224,7 +273,8 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector<int64_t>& input
const std::vector<int64_t>& weight_shape,
const int64_t axis,
const int64_t block_size,
int64_t accuracy_level) {
int64_t accuracy_level,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
auto build_test_case = [&](ModelTestBuilder& builder) {
auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
auto* output_arg = builder.MakeOutput();
@ -287,7 +337,9 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector<int64_t>& input
1e-5 /*per_sample_tolerance*/,
1e-5 /*relative_per_sample_tolerance*/,
nullptr,
add_session_options_fn);
add_session_options_fn,
{},
ep ? std::move(ep) : nullptr);
}
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_TypeMismatch) {
@ -327,6 +379,31 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch) {
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0);
}
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch_Cuda) {
// DQ contrib op schema is not updated to support blocked quantization
// block size too small
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
// block size not 2's power
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
;
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
// not axis 0
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
// not rank 2
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
}
// Input1
// | DQ
// \ /
@ -343,7 +420,8 @@ RunDQMatMulConverted(const std::vector<int64_t>& input1_shape,
const std::vector<int64_t>& weight2_shape,
const int64_t axis,
const int64_t block_size,
int64_t accuracy_level) {
int64_t accuracy_level,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
auto build_test_case = [&](ModelTestBuilder& builder) {
auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
auto* output_arg = builder.MakeOutput();
@ -402,9 +480,11 @@ RunDQMatMulConverted(const std::vector<int64_t>& input1_shape,
TransformerLevel::Level2,
21 /*opset_version*/,
1e-5 /*per_sample_tolerance*/,
1e-5 /*relative_per_sample_tolerance*/,
2e-5 /*relative_per_sample_tolerance*/,
nullptr,
add_session_options_fn);
add_session_options_fn,
{},
ep ? std::move(ep) : nullptr);
}
TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) {
@ -419,6 +499,18 @@ TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) {
RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1);
}
TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits_Cuda) {
// DQ contrib op schema is not updated to support blocked quantization
RunDQMatMulConverted<Int4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulConverted<Int4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulConverted<UInt4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
RunDQMatMulConverted<Int4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulConverted<Int4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulConverted<UInt4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
}
#endif // !defined(DISABLE_CONTRIB_OPS)
} // namespace test

View file

@ -869,7 +869,8 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size,
template <typename Tin, typename Tout>
void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
int64_t scale_block_count,
int64_t zero_point_block_count) {
int64_t zero_point_block_count,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
OpTester test("DequantizeLinear", 21);
std::vector<int64_t> dims{2, 4};
std::vector<Tout> x_scale, y;
@ -877,7 +878,7 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
SessionOptions so;
std::vector<std::string> log_msgs; // redirect error messages
std::vector<std::unique_ptr<IExecutionProvider>> eps;
eps.push_back(DefaultCpuExecutionProvider());
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category,
const char* logid, const char* code_location, const char* message) {
ORT_UNUSED_PARAMETER(severity);
@ -970,6 +971,13 @@ TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int) {
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(-1, 2, 2);
}
TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) {
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(-1, 2, 2, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(-1, 2, 2, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(-2, 2, 2, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(-2, 2, 2, DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) {
constexpr int min_cuda_architecture = 11080;
@ -1013,6 +1021,13 @@ TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) {
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 1, 1);
}
TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) {
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 1, 1, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 3, 3, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 3, 3, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 1, 1, DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) {
constexpr int min_cuda_architecture = 11080;
@ -1052,6 +1067,13 @@ TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) {
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 2, 1);
}
TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) {
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 2, 1, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 2, 3, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 2, 3, DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 2, 1, DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) {
constexpr int min_cuda_architecture = 11080;
@ -1081,14 +1103,14 @@ void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector<int64_t>&& dims,
std::vector<int>& x_,
std::vector<float>& x_scale_,
std::vector<int>& x_zero_point_,
std::vector<float>& y_) {
std::vector<float>& y_,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
OpTester test("DequantizeLinear", 21);
std::vector<int64_t> x_scale_shape;
std::vector<Tout> x_scale, y;
std::vector<Tin> x, x_zero_point;
std::vector<std::unique_ptr<IExecutionProvider>> eps;
eps.push_back(DefaultCpuExecutionProvider());
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis;
bool use_zero_point = !x_zero_point_.empty();
@ -1216,6 +1238,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point;
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
std::vector<float> y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, 8.0};
std::vector<float> y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, 8.0};
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
@ -1237,6 +1276,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
std::vector<float> y_2{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, 2.0, 0, 3.5, 3.0, 10.0, 16.0, -10.5, 15};
std::vector<float> y_3{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, -14.0, -24, 21, 5, 10.0, 16.0, -10.5, 15};
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
std::vector<int> zero_point{};
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -1262,6 +1318,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis_Cuda) {
std::vector<int> zero_point{};
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
std::vector<float> y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, 8};
std::vector<float> y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, 8};
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
@ -1283,6 +1356,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
std::vector<float> y_2{2, 8, -2, 0, 0, -1, 7, 1, 2, 0, 6, 8, -3.5, 1, -10.5, 15};
std::vector<float> y_3{2, 8, -2, 0, -6, -8, 7, 1, 2, 0, 6, 8, 10, 16, -10.5, 15};
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
std::vector<int> zero_point{};
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -1308,6 +1398,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis_Cuda) {
std::vector<int> zero_point{};
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
std::vector<float> y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, 8};
std::vector<float> y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, 8};
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
@ -1329,6 +1436,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
std::vector<float> y_2{2, 0, 4, 0, 0, 3.5, 0, 1, 2, 4, 4, 8, -3.5, -7, 0, 15};
std::vector<float> y_3{2, 0, -2, 0, 0, 3.5, 7, 1, 2, 4, 6, 8, -3.5, -7, -10.5, 15};
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{};
@ -1350,6 +1474,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{};
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<float> y_2{0, -4, 7, 3, -8, -20, 21, 7, 16, 36, -35, -11, 24, 52, -49, -15};
std::vector<float> y_3{0, -4, 7, 3, -8, -20, 21, 7, -16, -36, 35, 11, 24, 52, -49, -15};
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
@ -1371,6 +1512,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<float> y_2{4, -4, 3.5, -6, -4, -20, 17.5, -2, -10, 16, 3.5, -5, -2, 32, -10.5, -9};
std::vector<float> y_3{4, -4, 3.5, -6, -4, -20, 17.5, -2, -12, -36, 31.5, 2, -2, 32, -10.5, -9};
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{};
@ -1392,6 +1550,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{};
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<float> y_2{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15};
std::vector<float> y_3{0, -4, -4, -12, -8, -20, 21, 7, 16, 36, 20, 44, 24, 52, -49, -15};
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
@ -1413,6 +1588,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<float> y_2{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9};
std::vector<float> y_3{4, -4, 0, -12, -4, -20, 17.5, -2, -10, 16, -6, 24, -2, 32, -10.5, -9};
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{};
@ -1434,6 +1626,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{};
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<float> y_2{0, -2, -8, -12, 14, 17.5, 6, 7, 16, 18, 40, 44, -42, -45.5, -14, -15};
std::vector<float> y_3{0, -2, -4, -12, 14, 17.5, 21, 7, 16, 18, 20, 44, -42, -45.5, -49, -15};
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
@ -1455,6 +1664,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
}
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis_Cuda) {
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<float> y_2{4, 2, -8, -12, 10.5, 14, -3, -2, -10, -8, 20, 24, -3.5, -7, -8, -9};
std::vector<float> y_3{4, 2, 0, -12, 10.5, 14, 17.5, -2, -10, -8, -6, 24, -3.5, -7, -10.5, -9};
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_FirstAxis) {
constexpr int min_cuda_architecture = 11080;
@ -1624,7 +1850,8 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size,
template <typename Tout, typename Tin>
void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
int64_t scale_block_count,
int64_t zero_point_block_count) {
int64_t zero_point_block_count,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
OpTester test("QuantizeLinear", 21);
std::vector<int64_t> dims{2, 4};
std::vector<Tout> x_zero_point, y;
@ -1632,7 +1859,7 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
SessionOptions so;
std::vector<std::string> log_msgs; // redirect error messages
std::vector<std::unique_ptr<IExecutionProvider>> eps;
eps.push_back(DefaultCpuExecutionProvider());
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category,
const char* logid, const char* code_location, const char* message) {
ORT_UNUSED_PARAMETER(severity);
@ -1725,6 +1952,13 @@ TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int) {
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(-1, 2, 2);
}
TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) {
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(-1, 2, 2, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(-1, 2, 2, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(-2, 2, 2, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(-2, 2, 2, DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) {
constexpr int min_cuda_architecture = 11080;
@ -1768,6 +2002,13 @@ TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) {
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 1, 1);
}
TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) {
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 1, 1, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 3, 3, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 3, 3, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 1, 1, DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) {
constexpr int min_cuda_architecture = 11080;
@ -1807,6 +2048,13 @@ TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) {
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 2, 1);
}
TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) {
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 2, 1, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 2, 3, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 2, 3, DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 2, 1, DefaultCudaExecutionProvider());
}
#if !defined(DISABLE_FLOAT8_TYPES)
TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) {
constexpr int min_cuda_architecture = 11080;
@ -1836,14 +2084,14 @@ void QuantizeLinearOp21BlockedTest_Int4_Succeed(std::vector<int64_t>&& dims,
std::vector<float>& x_,
std::vector<float>& scale_,
std::vector<int>& zero_point_,
std::vector<int>& y_) {
std::vector<int>& y_,
std::unique_ptr<IExecutionProvider> ep = nullptr) {
OpTester test("QuantizeLinear", 21);
std::vector<int64_t> scale_shape;
std::vector<Tout> zero_point, y;
std::vector<Tin> x, scale;
std::vector<std::unique_ptr<IExecutionProvider>> eps;
eps.push_back(DefaultCpuExecutionProvider());
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis;
bool use_zero_point = !zero_point_.empty();
@ -1970,6 +2218,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> x{14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4,
6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0,
2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0,
10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0};
std::vector<int> y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
std::vector<int> y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
-1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4,
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2022,6 +2298,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1,
0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7};
std::vector<float> x{2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3,
-6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1,
2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0,
10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15};
std::vector<int> y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
std::vector<int> y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
-7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2,
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2074,6 +2378,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis_Cuda) {
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0};
std::vector<float> x{14, 24, 14, 24, 14, 24, 14, 24, 10, 16, 10, 16, 10, 16, 10, 16,
-10.5, -2, -10.5, -2, -10.5, -2, -10.5, -2, -3.5, 0, -3.5, 0, -3.5, 0, -3.5, 0,
2, 8, 2, 8, 2, 8, 2, 8, 6, 16, 6, 16, 6, 16, 6, 16,
-17.5, -6, -17.5, -6, -17.5, -6, -17.5, -6, -24.5, 8, -24.5, 8, -24.5, 8, -24.5, 8};
std::vector<int> y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
-3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0,
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8};
std::vector<int> y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
5, 0, 5, 0, 5, 0, 5, 0, -1, 0, -1, 0, -1, 0, -1, 0,
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
-8, -2, -8, -2, -8, -2, -8, -2, 7, -8, 7, -8, 7, -8, 7, -8};
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@ -2126,6 +2458,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis_Cuda) {
std::vector<int> zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1,
0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7};
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0};
std::vector<float> x{2, 8, 2, 8, 2, 8, 2, 8, -2, 0, -2, 0, -2, 0, -2, 0,
0, -1, 0, -1, 0, -1, 0, -1, 7, 1, 7, 1, 7, 1, 7, 1,
2, 0, 2, 0, 2, 0, 2, 0, 6, 8, 6, 8, 6, 8, 6, 8,
-3.5, 1, -3.5, 1, -3.5, 1, -3.5, 1, -10.5, 15, -10.5, 15, -10.5, 15, -10.5, 15};
std::vector<int> y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
-3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0,
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8};
std::vector<int> y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
-6, -4, -6, -4, -6, -4, -6, -4, -1, 0, -1, 0, -1, 0, -1, 0,
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
-2, 2, -2, 2, -2, 2, -2, 2, 7, -8, 7, -8, 7, -8, 7, -8};
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
std::vector<int> zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1,
0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7};
@ -2178,6 +2538,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis_Cuda) {
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
std::vector<float> x{14, 12, 14, 12, 20, 16, 20, 16, 14, 12, 14, 12, 20, 16, 20, 16,
-10.5, -7, -10.5, -7, -1, 0, -1, 0, -10.5, -7, -10.5, -7, -1, 0, -1, 0,
2, 4, 2, 4, 12, 16, 12, 16, 2, 4, 2, 4, 12, 16, 12, 16,
-17.5, -21, -17.5, -21, -7, 8, -7, 8, -17.5, -21, -17.5, -21, -7, 8, -7, 8};
std::vector<int> y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4,
-3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0,
1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4,
5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8};
std::vector<int> y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4,
-3, -2, -3, -2, 0, 0, -1, 0, -3, -2, -3, -2, 0, 0, -1, 0,
1, 2, 1, 2, 6, 4, 3, 4, 1, 2, 1, 2, 6, 4, 3, 4,
5, 6, 5, 6, 2, -8, 7, -8, 5, 6, 5, 6, 2, -8, 7, -8};
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
@ -2230,6 +2618,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
std::vector<int> zero_point{-6, -4, -6, -4, -3, -1, -3, -1,
0, 2, 0, 2, 4, 7, 4, 7};
std::vector<float> x{2, 0, 2, 0, 4, 0, 4, 0, 2, 0, 2, 0, 4, 0, 4, 0,
0, 3.5, 0, 3.5, 0, 1, 0, 1, 0, 3.5, 0, 3.5, 0, 1, 0, 1,
2, 4, 2, 4, 4, 8, 4, 8, 2, 4, 2, 4, 4, 8, 4, 8,
-3.5, -7, -3.5, -7, 0, 15, 0, 15, -3.5, -7, -3.5, -7, 0, 15, 0, 15};
std::vector<int> y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4,
-3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0,
1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4,
5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8};
std::vector<int> y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4,
-3, -2, -3, -2, -3, 0, -1, 0, -3, -2, -3, -2, -3, 0, -1, 0,
1, 2, 1, 2, 2, 4, 3, 4, 1, 2, 1, 2, 2, 4, 3, 4,
5, 6, 5, 6, 4, -8, 7, -8, 5, 6, 5, 6, 4, -8, 7, -8};
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
@ -2282,6 +2698,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> x{0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3,
-8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7,
16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11,
24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15};
std::vector<int> y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
std::vector<int> y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2334,6 +2778,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9,
13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6};
std::vector<float> x{4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6,
-4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2,
-10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5,
-2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9};
std::vector<int> y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
std::vector<int> y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4,
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2386,6 +2858,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> x{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15};
std::vector<int> y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<int> y_3{0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15};
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -2438,6 +2938,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6,
2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6};
std::vector<float> x{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9};
std::vector<int> y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
std::vector<int> y_3{0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15};
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -2490,6 +3018,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0};
std::vector<float> x{0, -2, 0, -2, -8, -12, -8, -12, 0, -2, 0, -2, -8, -12, -8, -12,
14, 17.5, 14, 17.5, 6, 7, 6, 7, 14, 17.5, 14, 17.5, 6, 7, 6, 7,
16, 18, 16, 18, 40, 44, 40, 44, 16, 18, 16, 18, 40, 44, 40, 44,
-42, -45.5, -42, -45.5, -14, -15, -14, -15, -42, -45.5, -42, -45.5, -14, -15, -14, -15};
std::vector<int> y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11,
12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15};
std::vector<int> y_3{0, 1, 0, 1, 4, 3, 2, 3, 0, 1, 0, 1, 4, 3, 2, 3,
4, 5, 4, 5, 2, 7, 6, 7, 4, 5, 4, 5, 2, 7, 6, 7,
8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11,
12, 13, 12, 13, 4, 15, 14, 15, 12, 13, 12, 13, 4, 15, 14, 15};
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
@ -2542,6 +3098,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis) {
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis_Cuda) {
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
std::vector<int> zero_point{2, 0, 2, 0, 1, 9, 1, 9,
13, 5, 13, 5, 11, 6, 11, 6};
std::vector<float> x{4, 2, 4, 2, -8, -12, -8, -12, 4, 2, 4, 2, -8, -12, -8, -12,
10.5, 14, 10.5, 14, -3, -2, -3, -2, 10.5, 14, 10.5, 14, -3, -2, -3, -2,
-10, -8, -10, -8, 20, 24, 20, 24, -10, -8, -10, -8, 20, 24, 20, 24,
-3.5, -7, -3.5, -7, -8, -9, -8, -9, -3.5, -7, -3.5, -7, -8, -9, -8, -9};
std::vector<int> y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11,
12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15};
std::vector<int> y_3{0, 1, 0, 1, 6, 3, 2, 3, 0, 1, 0, 1, 6, 3, 2, 3,
4, 5, 4, 5, 0, 7, 6, 7, 4, 5, 4, 5, 0, 7, 6, 7,
8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11,
12, 13, 12, 13, 13, 15, 14, 15, 12, 13, 12, 13, 13, 15, 14, 15};
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
DefaultCudaExecutionProvider());
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
DefaultCudaExecutionProvider());
}
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};