mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-20 02:07:56 +00:00
[CUDA] Support CUDA EP blocked quantization in Q/DQ ops. (#21846)
### Description 1. Added CUDA EP support for blocked quantization in QuantizeLinear and DequantizeLinear ops. 2. Currently CUDA EP blocked quantization only supports int4/uint4 quantized types and float32/float16 unquantized types. 3. Added CUDA EP support in QDQ selector/action transformer. CUDA EP is only added to DQ + MatMul -> MatMulNBits rule. Other rules' EP support are not changed. ### Motivation and Context ONNX opset 21 introduced blocked quantization for Q/DQ opts. ORT originally only supports CPU EP blocked quantization.
This commit is contained in:
parent
60b07623a2
commit
5dee95fa10
14 changed files with 1922 additions and 193 deletions
|
|
@ -587,7 +587,8 @@ Do not modify directly.*
|
|||
|DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
|
||||
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|
||||
|||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
|
||||
|DequantizeLinear|*in* x:**T**<br> *in* x_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *out* y:**tensor(float)**<br><br>or<br><br>*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|19+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|
||||
|DequantizeLinear|*in* x:**T**<br> *in* x_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *out* y:**tensor(float)**<br><br>or<br><br>*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|
||||
|||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
|
||||
|||[13, 18]|**T** = tensor(int8), tensor(uint8)|
|
||||
|||[10, 12]|**T** = tensor(int8), tensor(uint8)|
|
||||
|Div|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
|
||||
|
|
@ -718,7 +719,8 @@ Do not modify directly.*
|
|||
|||[13, 14]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)<br/> **T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
|
||||
|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)<br/> **T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
|
||||
|||[7, 11]|**T** = tensor(double), tensor(float), tensor(float16)|
|
||||
|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**<br><br>or<br><br>*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|19+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
|
||||
|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**<br><br>or<br><br>*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)|
|
||||
|||[19, 20]|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
|
||||
|||[13, 18]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|
||||
|||[10, 12]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
|
||||
|RNN|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
|
||||
|
|
|
|||
|
|
@ -23,7 +23,10 @@ void SplitQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
|
|||
const std::string action_name{"dropSplitQDQ"};
|
||||
std::unique_ptr<Action> action = std::make_unique<QDQ::SplitReplaceWithQuant>();
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::SplitSelector>(true /*req_equal_quant_params*/);
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::SplitSelector>(true /*req_equal_quant_params*/,
|
||||
false,
|
||||
providers);
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
|
||||
{{"Split", {}}},
|
||||
std::move(selector),
|
||||
|
|
@ -63,14 +66,18 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
|
|||
//
|
||||
// And cannot eliminate the QDQ for MaxPool if the scale is not positive, as a negative
|
||||
// scale will change the ordering of the elements between quantized & de-quantized values.
|
||||
std::unique_ptr<NodeSelector> selector_no_16bit = std::make_unique<QDQ::DropQDQNodesSelector>(false);
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector_no_16bit = std::make_unique<QDQ::DropQDQNodesSelector>(false,
|
||||
false,
|
||||
true,
|
||||
providers);
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_name,
|
||||
{{"Resize", {}}},
|
||||
std::move(selector_no_16bit),
|
||||
std::move(drop_action_no_int16));
|
||||
|
||||
std::unique_ptr<NodeSelector> selector_no_16bit_and_positive_scale =
|
||||
std::make_unique<QDQ::DropQDQNodesSelector>(false, true, false);
|
||||
std::make_unique<QDQ::DropQDQNodesSelector>(false, true, false, providers);
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name,
|
||||
{{"MaxPool", {12}},
|
||||
{"ReduceMax", {}},
|
||||
|
|
@ -78,7 +85,7 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
|
|||
std::move(selector_no_16bit_and_positive_scale),
|
||||
std::move(drop_action_no_int16_and_positive_scale));
|
||||
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropQDQNodesSelector>(true);
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropQDQNodesSelector>(true, false, true, providers);
|
||||
// DepthToSpace and SpaceToDepth not included because there are no integer implementations.
|
||||
// https://github.com/microsoft/onnxruntime/issues/21287
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name,
|
||||
|
|
@ -117,7 +124,8 @@ void DropDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
|
|||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
// TODO: Enable 16-bit types in selector when ArgMax supports 16-bit integer input tensors.
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQNodesSelector>();
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQNodesSelector>(false, false, providers);
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
|
||||
{{"ArgMax", {}}},
|
||||
std::move(selector),
|
||||
|
|
@ -200,7 +208,8 @@ void VariadicOpQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
|
|||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
// TODO: Enable 16-bit types in selector when QLinearConcat supports 16-bit.
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::InputVariadicSelector>();
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::InputVariadicSelector>(false, false, providers);
|
||||
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
|
||||
{{"Concat", {}}},
|
||||
|
|
@ -222,7 +231,11 @@ void ConvQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool is_
|
|||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
// TODO: Enable 16-bit types in selector when QLinearConv supports 16-bit.
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed);
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed,
|
||||
false,
|
||||
false,
|
||||
providers);
|
||||
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
|
||||
{{"Conv", {}}},
|
||||
|
|
@ -245,7 +258,11 @@ void MatMulQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool i
|
|||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
// TODO: Enable 16-bit types in selector when QLinearMatMul and MatMulInteger support 16-bit.
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed);
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed,
|
||||
false,
|
||||
false,
|
||||
providers);
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
|
||||
{{"MatMul", {}}},
|
||||
std::move(selector),
|
||||
|
|
@ -272,7 +289,8 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi
|
|||
p_buffered_tensors);
|
||||
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DQMatMulToMatMulNBitsSelector>();
|
||||
std::vector<const char*> providers = {kCpuExecutionProvider, kCudaExecutionProvider};
|
||||
std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DQMatMulToMatMulNBitsSelector>(providers);
|
||||
qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
|
||||
{{"MatMul", {}}},
|
||||
std::move(selector),
|
||||
|
|
@ -363,8 +381,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer(
|
|||
CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level,
|
||||
intra_op_thread_pool, p_buffered_tensors),
|
||||
apply_context,
|
||||
// this transformer is only compatible with the CPU and DML EP
|
||||
{kCpuExecutionProvider, kDmlExecutionProvider}} {
|
||||
// this transformer is compatible with CPU, DML and CUDA EP.
|
||||
// There is further EP control on the rule level.
|
||||
{kCpuExecutionProvider, kDmlExecutionProvider, kCudaExecutionProvider}} {
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -302,14 +302,20 @@ class BaseSelector : public NodeSelector {
|
|||
|
||||
class DropQDQNodesSelector : public BaseSelector {
|
||||
public:
|
||||
explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, bool allow_nonpositive_scale = true)
|
||||
: BaseSelector(std::make_unique<DropQDQNodeGroupSelector>(allow_16bit, allow_4bit, allow_nonpositive_scale)) {}
|
||||
explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false,
|
||||
bool allow_nonpositive_scale = true,
|
||||
gsl::span<const char*> compatible_providers = {})
|
||||
: BaseSelector(std::make_unique<DropQDQNodeGroupSelector>(allow_16bit, allow_4bit, allow_nonpositive_scale),
|
||||
compatible_providers) {}
|
||||
};
|
||||
|
||||
class DropDQNodesSelector : public BaseSelector {
|
||||
public:
|
||||
explicit DropDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false)
|
||||
: BaseSelector(std::make_unique<DropDQNodeGroupSelector>(allow_16bit, allow_4bit)) {}
|
||||
explicit DropDQNodesSelector(bool allow_16bit = false,
|
||||
bool allow_4bit = false,
|
||||
gsl::span<const char*> compatible_providers = {})
|
||||
: BaseSelector(std::make_unique<DropDQNodeGroupSelector>(allow_16bit, allow_4bit),
|
||||
compatible_providers) {}
|
||||
};
|
||||
|
||||
class UnarySelector : public BaseSelector {
|
||||
|
|
@ -329,8 +335,11 @@ class BinarySelector : public BaseSelector {
|
|||
// Variadic DQ nodes -> node -> Q
|
||||
class InputVariadicSelector : public BaseSelector {
|
||||
public:
|
||||
explicit InputVariadicSelector(bool allow_16bit = false, bool allow_4bit = false)
|
||||
: BaseSelector(std::make_unique<VariadicNodeGroupSelector>(allow_16bit, allow_4bit)) {}
|
||||
explicit InputVariadicSelector(bool allow_16bit = false,
|
||||
bool allow_4bit = false,
|
||||
gsl::span<const char*> compatible_providers = {})
|
||||
: BaseSelector(std::make_unique<VariadicNodeGroupSelector>(allow_16bit, allow_4bit),
|
||||
compatible_providers) {}
|
||||
|
||||
void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
|
||||
};
|
||||
|
|
@ -338,8 +347,10 @@ class InputVariadicSelector : public BaseSelector {
|
|||
// DQ -> Split -> variadic Q nodes
|
||||
class SplitSelector : public BaseSelector {
|
||||
public:
|
||||
SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false)
|
||||
: BaseSelector(std::make_unique<SplitNodeGroupSelector>(req_equal_quant_params, allow_4bit)) {}
|
||||
SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false,
|
||||
gsl::span<const char*> compatible_providers = {})
|
||||
: BaseSelector(std::make_unique<SplitNodeGroupSelector>(req_equal_quant_params, allow_4bit),
|
||||
compatible_providers) {}
|
||||
|
||||
void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
|
||||
};
|
||||
|
|
@ -347,8 +358,10 @@ class SplitSelector : public BaseSelector {
|
|||
// DQ nodes for X, W and optionally B -> node -> Q
|
||||
class ConvSelector : public BaseSelector {
|
||||
public:
|
||||
ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false)
|
||||
: BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight)) {}
|
||||
ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false,
|
||||
gsl::span<const char*> compatible_providers = {})
|
||||
: BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight),
|
||||
compatible_providers) {}
|
||||
|
||||
void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
|
||||
};
|
||||
|
|
@ -363,9 +376,11 @@ class WhereSelector : public BaseSelector {
|
|||
// 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not
|
||||
class MatMulSelector : public BaseSelector {
|
||||
public:
|
||||
MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false)
|
||||
MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false,
|
||||
gsl::span<const char*> compatible_providers = {})
|
||||
: BaseSelector(std::make_unique<MatMulNodeGroupSelector>(int8_allowed, /*matmulintegertofloat_allowed*/ true,
|
||||
allow_16bit, allow_4bit)) {}
|
||||
allow_16bit, allow_4bit),
|
||||
compatible_providers) {}
|
||||
};
|
||||
|
||||
// Convert "1 DQ node for input B -> MatMul" to "MatMulNBits"
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include "core/common/inlined_containers.h"
|
||||
#include "core/common/parse_string.h"
|
||||
#include "core/framework/int4.h"
|
||||
#include "core/providers/shared_library/provider_api.h"
|
||||
#include "core/platform/env_var_utils.h"
|
||||
#include "core/providers/cuda/cuda_execution_provider.h"
|
||||
|
|
@ -1348,38 +1349,37 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast);
|
||||
#endif
|
||||
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear);
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear);
|
||||
#endif
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear);
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear);
|
||||
#endif
|
||||
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear);
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear);
|
||||
#endif
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear);
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear);
|
||||
#endif
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Scan);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape);
|
||||
#endif
|
||||
|
||||
// Opset 20
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu);
|
||||
|
|
@ -1388,6 +1388,40 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN);
|
||||
|
||||
// Opset 21.
|
||||
// TODO(fajin): support other quantized types
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear);
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear);
|
||||
#endif
|
||||
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear);
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
template <>
|
||||
KernelCreateInfo BuildKernelCreateInfo<void>() {
|
||||
return {};
|
||||
|
|
@ -2265,34 +2299,34 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast)>,
|
||||
#endif
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear)>,
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear)>,
|
||||
#endif
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear)>,
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear)>,
|
||||
#endif
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear)>,
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear)>,
|
||||
#endif
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear)>,
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear)>,
|
||||
#endif
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape)>,
|
||||
|
|
@ -2305,6 +2339,37 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN)>,
|
||||
|
||||
// Opset 21
|
||||
// TODO(fajin): support other quantized types
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear)>,
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear)>,
|
||||
#endif
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear)>,
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear)>,
|
||||
#endif
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -7,36 +7,181 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
void ValidateBlockQuantizationShapes(const TensorShape& input_shape,
|
||||
const TensorShape& scale_shape,
|
||||
const Tensor* zero_point,
|
||||
size_t axis_no_neg,
|
||||
int64_t block_size_) {
|
||||
ORT_ENFORCE(scale_shape.NumDimensions() == input_shape.NumDimensions(),
|
||||
"scale and input must have the same rank for blocked quantization");
|
||||
|
||||
for (size_t i = 0, ndim = input_shape.NumDimensions(); i < ndim; ++i) {
|
||||
if (i == static_cast<size_t>(axis_no_neg)) {
|
||||
ORT_ENFORCE(scale_shape[i] == (input_shape[i] + block_size_ - 1) / block_size_,
|
||||
"scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization");
|
||||
} else {
|
||||
ORT_ENFORCE(scale_shape[i] == input_shape[i],
|
||||
"scale and input must have the same shape despite the quantize axis for blocked quantization");
|
||||
}
|
||||
}
|
||||
|
||||
if (zero_point) {
|
||||
ORT_ENFORCE(zero_point->Shape() == scale_shape,
|
||||
"zero_point and scale must have the same shape for blocked quantization");
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
|
||||
CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool /*saturate*/) {
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
|
||||
size_t num_of_element, bool /*saturate*/) {
|
||||
ORT_UNUSED_PARAMETER(stream);
|
||||
ORT_UNUSED_PARAMETER(input);
|
||||
ORT_UNUSED_PARAMETER(output);
|
||||
ORT_UNUSED_PARAMETER(scale);
|
||||
ORT_UNUSED_PARAMETER(zero_point);
|
||||
ORT_UNUSED_PARAMETER(num_of_element);
|
||||
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
|
||||
size_t num_of_element, size_t batch_size, size_t n_scales, bool /*saturate*/) {
|
||||
ORT_UNUSED_PARAMETER(stream);
|
||||
ORT_UNUSED_PARAMETER(input);
|
||||
ORT_UNUSED_PARAMETER(output);
|
||||
ORT_UNUSED_PARAMETER(scale);
|
||||
ORT_UNUSED_PARAMETER(zero_point);
|
||||
ORT_UNUSED_PARAMETER(num_of_element);
|
||||
ORT_UNUSED_PARAMETER(batch_size);
|
||||
ORT_UNUSED_PARAMETER(n_scales);
|
||||
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
|
||||
}
|
||||
|
||||
template <typename T, typename U>
|
||||
Status CudaQuantizeLinearBlock(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
|
||||
size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
|
||||
ORT_UNUSED_PARAMETER(stream);
|
||||
ORT_UNUSED_PARAMETER(input);
|
||||
ORT_UNUSED_PARAMETER(output);
|
||||
ORT_UNUSED_PARAMETER(scale);
|
||||
ORT_UNUSED_PARAMETER(zero_point);
|
||||
ORT_UNUSED_PARAMETER(num_of_element);
|
||||
ORT_UNUSED_PARAMETER(K);
|
||||
ORT_UNUSED_PARAMETER(N);
|
||||
ORT_UNUSED_PARAMETER(block_size);
|
||||
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, int8_t* output, const U* scale,
|
||||
const int8_t* zero_point, size_t num_of_element, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, uint8_t* output, const U* scale,
|
||||
const uint8_t* zero_point, size_t num_of_element, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Int4x2* output, const U* scale,
|
||||
const Int4x2* zero_point, size_t num_of_element, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
|
||||
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
|
||||
num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale,
|
||||
const UInt4x2* zero_point, size_t num_of_element, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
|
||||
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
|
||||
num_of_element);
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
|
||||
CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool saturate) {
|
||||
template <class U>
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale,
|
||||
const Float8E4M3FN* zero_point, size_t num_of_element, bool saturate) {
|
||||
return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate);
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
|
||||
CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool saturate) {
|
||||
return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales, saturate);
|
||||
template <class U>
|
||||
Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale,
|
||||
const Float8E5M2* zero_point, size_t num_of_element, bool saturate) {
|
||||
return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale,
|
||||
const Float8E4M3FN* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool saturate) {
|
||||
return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size,
|
||||
n_scales, saturate);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale,
|
||||
const Float8E5M2* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool saturate) {
|
||||
return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size,
|
||||
n_scales, saturate);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
|
||||
CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool /*saturate*/) {
|
||||
template <class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, int8_t* output, const U* scale,
|
||||
const int8_t* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, uint8_t* output, const U* scale,
|
||||
const uint8_t* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Int4x2* output, const U* scale,
|
||||
const Int4x2* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
|
||||
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
|
||||
num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale,
|
||||
const UInt4x2* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
|
||||
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
|
||||
num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <typename U>
|
||||
Status CudaQuantizeLinearBlock(cudaStream_t stream,
|
||||
const U* input, Int4x2* output, const U* scale, const Int4x2* zero_point,
|
||||
size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
|
||||
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
|
||||
num_of_element, K, N, block_size);
|
||||
}
|
||||
|
||||
template <typename U>
|
||||
Status CudaQuantizeLinearBlock(cudaStream_t stream,
|
||||
const U* input, UInt4x2* output, const U* scale, const UInt4x2* zero_point,
|
||||
size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
|
||||
return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
|
||||
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
|
||||
num_of_element, K, N, block_size);
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
||||
typedef typename ToCudaType<U>::MappedType CudaU;
|
||||
|
|
@ -48,21 +193,22 @@ Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
auto& y = *ctx->Output(0, x.Shape());
|
||||
|
||||
const auto& x_shape = x.Shape();
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
const CudaU* input = reinterpret_cast<const CudaU*>(x.Data<U>());
|
||||
T* output = y.MutableData<T>();
|
||||
|
||||
if (IsScalarOr1ElementVector(&y_scale)) {
|
||||
if (IsScalarOr1ElementVector(&y_scale)) { // per-tensor quantization
|
||||
ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point),
|
||||
"y_zero_point must be a scalar or 1D tensor of size 1.");
|
||||
ORT_ENFORCE(block_size_ == 0, "block_size must be 0 for per-tensor quantization.");
|
||||
|
||||
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements, saturate_));
|
||||
return Status::OK();
|
||||
} else {
|
||||
} else if (block_size_ == 0) { // per-axis quantization
|
||||
ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1);
|
||||
ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() &&
|
||||
y_zero_point->Shape().NumDimensions() == 1),
|
||||
|
|
@ -73,44 +219,184 @@ Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
|
||||
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
ORT_RETURN_IF_ERROR(CudaQuantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements,
|
||||
x_shape.SizeToDimension(axis), y_scale.Shape().Size(), saturate_));
|
||||
return Status::OK();
|
||||
} else { // blocked quantization
|
||||
// validate shape
|
||||
size_t axis_no_neg = SafeInt<size_t>(HandleNegativeAxis(axis_, x_shape.NumDimensions()));
|
||||
const auto& y_scale_shape = y_scale.Shape();
|
||||
|
||||
ValidateBlockQuantizationShapes(x_shape,
|
||||
y_scale_shape,
|
||||
y_zero_point,
|
||||
axis_no_neg,
|
||||
block_size_);
|
||||
|
||||
// compute
|
||||
const T* zero_point = y_zero_point ? y_zero_point->Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
|
||||
|
||||
ORT_RETURN_IF_ERROR(CudaQuantizeLinearBlock(Stream(ctx), input, output, scale, zero_point,
|
||||
num_of_elements, x_shape[axis_no_neg],
|
||||
x_shape.SizeFromDimension(axis_no_neg + 1),
|
||||
block_size_, saturate_));
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
|
||||
CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element) {
|
||||
ORT_UNUSED_PARAMETER(stream);
|
||||
ORT_UNUSED_PARAMETER(input);
|
||||
ORT_UNUSED_PARAMETER(output);
|
||||
ORT_UNUSED_PARAMETER(scale);
|
||||
ORT_UNUSED_PARAMETER(zero_point);
|
||||
ORT_UNUSED_PARAMETER(num_of_element);
|
||||
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
ORT_UNUSED_PARAMETER(stream);
|
||||
ORT_UNUSED_PARAMETER(input);
|
||||
ORT_UNUSED_PARAMETER(output);
|
||||
ORT_UNUSED_PARAMETER(scale);
|
||||
ORT_UNUSED_PARAMETER(zero_point);
|
||||
ORT_UNUSED_PARAMETER(num_of_element);
|
||||
ORT_UNUSED_PARAMETER(batch_size);
|
||||
ORT_UNUSED_PARAMETER(n_scales);
|
||||
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size) {
|
||||
ORT_UNUSED_PARAMETER(stream);
|
||||
ORT_UNUSED_PARAMETER(input);
|
||||
ORT_UNUSED_PARAMETER(output);
|
||||
ORT_UNUSED_PARAMETER(scale);
|
||||
ORT_UNUSED_PARAMETER(zero_point);
|
||||
ORT_UNUSED_PARAMETER(num_of_element);
|
||||
ORT_UNUSED_PARAMETER(K);
|
||||
ORT_UNUSED_PARAMETER(N);
|
||||
ORT_UNUSED_PARAMETER(block_size);
|
||||
ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, U* output, const U* scale,
|
||||
const int8_t* zero_point, size_t num_of_element) {
|
||||
return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, U* output, const U* scale,
|
||||
const uint8_t* zero_point, size_t num_of_element) {
|
||||
return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
|
||||
const Int4x2* zero_point, size_t num_of_element) {
|
||||
return CudaDequantizeLinearStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
|
||||
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
|
||||
num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
|
||||
const UInt4x2* zero_point, size_t num_of_element) {
|
||||
return CudaDequantizeLinearStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
|
||||
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
|
||||
num_of_element);
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
|
||||
CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
|
||||
template <class U>
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale,
|
||||
const Float8E4M3FN* zero_point, size_t num_of_element) {
|
||||
return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinear(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale,
|
||||
const Float8E5M2* zero_point, size_t num_of_element) {
|
||||
return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
|
||||
CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const int8_t* input, U* output, const U* scale,
|
||||
const int8_t* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const uint8_t* input, U* output, const U* scale,
|
||||
const uint8_t* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
|
||||
const Int4x2* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
|
||||
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
|
||||
num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
|
||||
const UInt4x2* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
|
||||
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
|
||||
num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
template <class T, class U>
|
||||
typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
|
||||
CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale,
|
||||
const Float8E4M3FN* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale,
|
||||
const Float8E5M2* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
|
||||
}
|
||||
#endif
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
|
||||
const UInt4x2* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size) {
|
||||
return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
|
||||
zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
|
||||
num_of_element, K, N, block_size);
|
||||
}
|
||||
|
||||
template <class U>
|
||||
Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
|
||||
const Int4x2* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size) {
|
||||
return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
|
||||
zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
|
||||
num_of_element, K, N, block_size);
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
||||
typedef typename ToCudaType<U>::MappedType CudaU;
|
||||
|
|
@ -120,6 +406,7 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
auto* y_zero_point = ctx->Input<Tensor>(2);
|
||||
|
||||
const auto& x_shape = x.Shape();
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
auto& y = *ctx->Output(0, x_shape);
|
||||
|
||||
|
|
@ -131,12 +418,11 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
|
||||
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements));
|
||||
|
||||
return Status::OK();
|
||||
} else {
|
||||
} else if (block_size_ == 0) { // per axis quantization
|
||||
ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1);
|
||||
ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() && y_zero_point->Shape().NumDimensions() == 1), "scale and zero_point must have the same shape.");
|
||||
ORT_ENFORCE(x_shape.NumDimensions() > 1);
|
||||
|
|
@ -145,11 +431,31 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
|
|||
|
||||
const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
|
||||
const auto num_of_elements = x_shape.Size();
|
||||
|
||||
ORT_RETURN_IF_ERROR(CudaDequantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements,
|
||||
x_shape.SizeToDimension(axis), y_scale.Shape().Size()));
|
||||
return Status::OK();
|
||||
} else { // blocked quantization
|
||||
// validate shape
|
||||
auto axis_no_neg = SafeInt<size_t>(HandleNegativeAxis(axis_, x_shape.NumDimensions()));
|
||||
const auto& y_scale_shape = y_scale.Shape();
|
||||
|
||||
ValidateBlockQuantizationShapes(x_shape,
|
||||
y_scale_shape,
|
||||
y_zero_point,
|
||||
axis_no_neg,
|
||||
block_size_);
|
||||
|
||||
// compute
|
||||
const T* zero_point = y_zero_point ? y_zero_point->Data<T>() : nullptr;
|
||||
const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
|
||||
|
||||
ORT_RETURN_IF_ERROR(CudaDequantizeLinearBlockInt4(Stream(ctx), input, output, scale, zero_point,
|
||||
num_of_elements, x_shape[axis_no_neg],
|
||||
x_shape.SizeFromDimension(axis_no_neg + 1),
|
||||
block_size_));
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -183,33 +489,54 @@ REGISTER_Q_KERNEL_TYPED_10_12(uint8_t)
|
|||
REGISTER_Q_KERNEL_TYPED_13_18(int8_t)
|
||||
REGISTER_Q_KERNEL_TYPED_13_18(uint8_t)
|
||||
|
||||
#define REGISTER_Q_KERNEL_TYPED_19(T) \
|
||||
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
|
||||
QuantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
19, \
|
||||
T, float, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<float>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
|
||||
QuantizeLinear<T, float>); \
|
||||
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
|
||||
QuantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
19, \
|
||||
T, MLFloat16, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<MLFloat16>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
|
||||
QuantizeLinear<T, MLFloat16>);
|
||||
#define REGISTER_Q_KERNEL_TWO_TYPED_19_20(T, U) \
|
||||
ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \
|
||||
QuantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
19, 20, \
|
||||
T, U, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<U>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
|
||||
QuantizeLinear<T, U>);
|
||||
|
||||
REGISTER_Q_KERNEL_TYPED_19(int8_t)
|
||||
REGISTER_Q_KERNEL_TYPED_19(uint8_t)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16)
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
REGISTER_Q_KERNEL_TYPED_19(Float8E4M3FN)
|
||||
REGISTER_Q_KERNEL_TYPED_19(Float8E5M2)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16)
|
||||
#endif
|
||||
|
||||
#define REGISTER_Q_KERNEL_TWO_TYPED_21(T, U) \
|
||||
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
|
||||
QuantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
21, \
|
||||
T, U, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<U>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
|
||||
QuantizeLinear<T, U>);
|
||||
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, MLFloat16)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16)
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, float)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16)
|
||||
REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16)
|
||||
#endif
|
||||
|
||||
// register DequantizeLinear kernels
|
||||
|
|
@ -240,33 +567,54 @@ REGISTER_DQ_KERNEL_TYPED_10_12(uint8_t)
|
|||
REGISTER_DQ_KERNEL_TYPED_13_18(int8_t)
|
||||
REGISTER_DQ_KERNEL_TYPED_13_18(uint8_t)
|
||||
|
||||
#define REGISTER_DQ_KERNEL_TYPED_19(T) \
|
||||
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
|
||||
DequantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
19, \
|
||||
T, float, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<float>()), \
|
||||
DequantizeLinear<T, float>); \
|
||||
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
|
||||
DequantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
19, \
|
||||
T, MLFloat16, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<MLFloat16>()), \
|
||||
DequantizeLinear<T, MLFloat16>);
|
||||
#define REGISTER_DQ_KERNEL_TWO_TYPED_19_20(T, U) \
|
||||
ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \
|
||||
DequantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
19, 20, \
|
||||
T, U, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<U>()), \
|
||||
DequantizeLinear<T, U>);
|
||||
|
||||
REGISTER_DQ_KERNEL_TYPED_19(int8_t)
|
||||
REGISTER_DQ_KERNEL_TYPED_19(uint8_t)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16)
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
REGISTER_DQ_KERNEL_TYPED_19(Float8E4M3FN)
|
||||
REGISTER_DQ_KERNEL_TYPED_19(Float8E5M2)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16)
|
||||
#endif
|
||||
|
||||
#define REGISTER_DQ_KERNEL_TWO_TYPED_21(T, U) \
|
||||
ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \
|
||||
DequantizeLinear, \
|
||||
kOnnxDomain, \
|
||||
21, \
|
||||
T, U, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<T>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<U>()), \
|
||||
DequantizeLinear<T, U>);
|
||||
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, MLFloat16)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16)
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, float)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16)
|
||||
REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16)
|
||||
#endif
|
||||
|
||||
// specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal
|
||||
|
|
@ -278,6 +626,10 @@ SPECIALIZED_QDQ_COMPUTE(int8_t, float)
|
|||
SPECIALIZED_QDQ_COMPUTE(uint8_t, float)
|
||||
SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16)
|
||||
SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16)
|
||||
SPECIALIZED_QDQ_COMPUTE(Int4x2, float)
|
||||
SPECIALIZED_QDQ_COMPUTE(UInt4x2, float)
|
||||
SPECIALIZED_QDQ_COMPUTE(Int4x2, MLFloat16)
|
||||
SPECIALIZED_QDQ_COMPUTE(UInt4x2, MLFloat16)
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
SPECIALIZED_QDQ_COMPUTE(Float8E4M3FN, float)
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@
|
|||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
|
||||
#include "cuda_fp8.h"
|
||||
#include "cuda_fp16.h"
|
||||
#endif
|
||||
|
||||
namespace onnxruntime {
|
||||
|
|
@ -17,9 +18,23 @@ namespace cuda {
|
|||
template <typename InT, typename OutT>
|
||||
struct RoundStd;
|
||||
|
||||
template <typename InT, typename OutT>
|
||||
struct RoundStdInt4;
|
||||
|
||||
template <typename InT, typename OutT>
|
||||
struct RoundSat;
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ int ExtractInt4FromByte(T byte, int index) {
|
||||
return static_cast<int>((byte >> (index << 2)) & 0x0f);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ __forceinline__ int ExtractInt4FromByte<int8_t>(int8_t byte, int index) {
|
||||
constexpr auto shift = (sizeof(int) << 3) - 4;
|
||||
return (static_cast<int>(((byte >> (index << 2)) & 0x0f)) << shift) >> shift;
|
||||
}
|
||||
|
||||
template <>
|
||||
struct RoundStd<float, int8_t> {
|
||||
__device__ __forceinline__ int8_t operator()(float v, float scale, int8_t zero_point) const {
|
||||
|
|
@ -28,6 +43,22 @@ struct RoundStd<float, int8_t> {
|
|||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct RoundStdInt4<float, int8_t> {
|
||||
__device__ __forceinline__ int8_t operator()(float v0,
|
||||
float v1,
|
||||
float scale0,
|
||||
float scale1,
|
||||
int zp0,
|
||||
int zp1) const {
|
||||
int value0 = __float2int_rn(v0 / scale0) + zp0;
|
||||
int value1 = __float2int_rn(v1 / scale1) + zp1;
|
||||
int value0_clip = max(-8, min(7, value0));
|
||||
int value1_clip = max(-8, min(7, value1));
|
||||
return static_cast<int8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct RoundStd<float, uint8_t> {
|
||||
__device__ __forceinline__ uint8_t operator()(float v, float scale, uint8_t zero_point) const {
|
||||
|
|
@ -36,6 +67,22 @@ struct RoundStd<float, uint8_t> {
|
|||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct RoundStdInt4<float, uint8_t> {
|
||||
__device__ __forceinline__ uint8_t operator()(float v0,
|
||||
float v1,
|
||||
float scale0,
|
||||
float scale1,
|
||||
int zp0,
|
||||
int zp1) const {
|
||||
int value0 = __float2int_rn(v0 / scale0) + zp0;
|
||||
int value1 = __float2int_rn(v1 / scale1) + zp1;
|
||||
int value0_clip = max(0, min(15, value0));
|
||||
int value1_clip = max(0, min(15, value1));
|
||||
return static_cast<uint8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
|
||||
}
|
||||
};
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
|
||||
|
|
@ -104,7 +151,7 @@ struct RoundSat<half, Float8E5M2> {
|
|||
|
||||
#endif
|
||||
|
||||
#endif // DISABLE_FLOAT8_TYPES
|
||||
#endif // DISABLE_FLOAT8_TYPES
|
||||
|
||||
template <>
|
||||
struct RoundStd<half, int8_t> {
|
||||
|
|
@ -114,6 +161,26 @@ struct RoundStd<half, int8_t> {
|
|||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct RoundStdInt4<half, int8_t> {
|
||||
__device__ __forceinline__ int8_t operator()(half v0,
|
||||
half v1,
|
||||
half scale0,
|
||||
half scale1,
|
||||
int zp0,
|
||||
int zp1) const {
|
||||
half2 v = __halves2half2(v0, v1);
|
||||
half2 scale = __halves2half2(scale0, scale1);
|
||||
half2 scaled_v = v / scale;
|
||||
|
||||
int value0 = __half2int_rn(__low2half(scaled_v)) + zp0;
|
||||
int value1 = __half2int_rn(__high2half(scaled_v)) + zp1;
|
||||
int value0_clip = max(-8, min(7, value0));
|
||||
int value1_clip = max(-8, min(7, value1));
|
||||
return static_cast<int8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct RoundStd<half, uint8_t> {
|
||||
__device__ __forceinline__ int8_t operator()(half v, half scale, uint8_t zero_point) const {
|
||||
|
|
@ -122,6 +189,26 @@ struct RoundStd<half, uint8_t> {
|
|||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct RoundStdInt4<half, uint8_t> {
|
||||
__device__ __forceinline__ uint8_t operator()(half v0,
|
||||
half v1,
|
||||
half scale0,
|
||||
half scale1,
|
||||
int zp0,
|
||||
int zp1) const {
|
||||
half2 v = __halves2half2(v0, v1);
|
||||
half2 scale = __halves2half2(scale0, scale1);
|
||||
half2 scaled_v = v / scale;
|
||||
|
||||
int value0 = __half2int_rn(__low2half(scaled_v)) + zp0;
|
||||
int value1 = __half2int_rn(__high2half(scaled_v)) + zp1;
|
||||
int value0_clip = max(0, min(15, value0));
|
||||
int value1_clip = max(0, min(15, value1));
|
||||
return static_cast<uint8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
|
||||
}
|
||||
};
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
__global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, RoundStd<InT, OutT> round) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
|
@ -137,6 +224,29 @@ __global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const In
|
|||
}
|
||||
}
|
||||
|
||||
// cuda kernel for int4 per tensor quantization with standard rounding
|
||||
// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
|
||||
// NumElementsPerThread must be multiple of 2.
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
__global__ void QuantizeLinearKernelStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
|
||||
const OutT* zero_point_ptr, CUDA_LONG N,
|
||||
RoundStdInt4<InT, OutT> round) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
|
||||
InT scale = *scale_ptr;
|
||||
int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0;
|
||||
int i = 0;
|
||||
constexpr int step = NumThreadsPerBlock << 1;
|
||||
|
||||
#pragma unroll
|
||||
for (; i + 1 < NumElementsPerThread && id + 1 < N; i += 2, id += step) {
|
||||
output[id >> 1] = round(input[id], input[id + 1], scale, scale, zero_point, zero_point);
|
||||
}
|
||||
|
||||
if (i < NumElementsPerThread && id < N) {
|
||||
output[id >> 1] = round(input[id], 0.0, scale, 1.0, zero_point, 0);
|
||||
}
|
||||
}
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
__global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, size_t batch_size, size_t n_scales, RoundStd<InT, OutT> round) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
|
||||
|
|
@ -154,6 +264,97 @@ __global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, cons
|
|||
}
|
||||
}
|
||||
|
||||
// cuda kernel for int4 per axis quantization with standard rounding
|
||||
// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
|
||||
// NumElementsPerThread must be multiple of 2.
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
__global__ void QuantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
|
||||
const OutT* zero_point_ptr, CUDA_LONG num_element,
|
||||
size_t batch_size, size_t n_scales,
|
||||
RoundStdInt4<InT, OutT> round) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
|
||||
// Process continuous NumElementsPerThread int4 per thread.
|
||||
int i = 0;
|
||||
// The scale needs to change every n_same_scale.
|
||||
CUDA_LONG n_same_scale = num_element / (batch_size * n_scales);
|
||||
constexpr int step = NumThreadsPerBlock << 1;
|
||||
|
||||
#pragma unroll
|
||||
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
|
||||
int scale_id0 = (id / n_same_scale) % n_scales;
|
||||
int scale_id1 = ((id + 1) / n_same_scale) % n_scales;
|
||||
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
|
||||
int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
|
||||
output[id >> 1] = round(input[id],
|
||||
input[id + 1],
|
||||
scale_ptr[scale_id0],
|
||||
scale_ptr[scale_id1],
|
||||
zp0,
|
||||
zp1);
|
||||
}
|
||||
|
||||
if (i < NumElementsPerThread && id < num_element) {
|
||||
int scale_id0 = (id / n_same_scale) % n_scales;
|
||||
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
|
||||
output[id >> 1] = round(input[id],
|
||||
0.0,
|
||||
scale_ptr[scale_id0],
|
||||
1.0,
|
||||
zp0,
|
||||
0);
|
||||
}
|
||||
}
|
||||
|
||||
// cuda kernel for int4 block-wise quantization with standard rounding
|
||||
// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
|
||||
// NumElementsPerThread must be multiple of 2.
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
__global__ void QuantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
|
||||
const OutT* zero_point_ptr, CUDA_LONG num_element, size_t KN,
|
||||
size_t N, size_t scale_KN, size_t block_size,
|
||||
RoundStdInt4<InT, OutT> round) {
|
||||
// Process continuous NumElementsPerThread int4 per thread.
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
|
||||
int i = 0;
|
||||
constexpr int step = NumThreadsPerBlock << 1;
|
||||
|
||||
#pragma unroll
|
||||
// Process two elements which belong to one byte at a time.
|
||||
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
|
||||
int x0 = id / KN, x1 = (id + 1) / KN;
|
||||
int y0 = id % KN / N, y1 = (id + 1) % KN / N;
|
||||
int z0 = id % N, z1 = (id + 1) % N;
|
||||
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
|
||||
int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1;
|
||||
output[id >> 1] = round(input[id],
|
||||
input[id + 1],
|
||||
scale_ptr[scale_id0],
|
||||
scale_ptr[scale_id1],
|
||||
zero_point_ptr == nullptr
|
||||
? 0
|
||||
: ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1),
|
||||
zero_point_ptr == nullptr
|
||||
? 0
|
||||
: ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1));
|
||||
}
|
||||
|
||||
// last non-paired element
|
||||
if (i < NumElementsPerThread && id < num_element) {
|
||||
int x0 = id / KN;
|
||||
int y0 = id % KN / N;
|
||||
int z0 = id % N;
|
||||
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
|
||||
output[id >> 1] = round(input[id],
|
||||
0.0,
|
||||
scale_ptr[scale_id0],
|
||||
1.0,
|
||||
zero_point_ptr == nullptr
|
||||
? 0
|
||||
: ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1),
|
||||
0);
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
|
||||
|
|
@ -207,6 +408,27 @@ Status CudaQuantizeLinearStd(cudaStream_t stream, const InT* input, OutT* output
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class OutT, class InT>
|
||||
Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
|
||||
const OutT* zero_point, size_t num_of_element) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
static_assert((GridDim::maxElementsPerThread & 1) == 0);
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
|
||||
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
QuantizeLinearKernelStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
static_cast<int>(num_of_element),
|
||||
RoundStdInt4<InT, OutT>());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class OutT, class InT>
|
||||
Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
|
|
@ -226,6 +448,59 @@ Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* ou
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class OutT, class InT>
|
||||
Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
|
||||
const OutT* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
static_assert((GridDim::maxElementsPerThread & 1) == 0);
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
|
||||
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
QuantizeLinearKernelAxisStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
static_cast<int>(num_of_element),
|
||||
batch_size,
|
||||
n_scales,
|
||||
RoundStdInt4<InT, OutT>());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class OutT, class InT>
|
||||
Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
|
||||
const OutT* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
static_assert((GridDim::maxElementsPerThread & 1) == 0);
|
||||
|
||||
size_t KN = K * N;
|
||||
size_t num_block = (K + block_size - 1) / block_size;
|
||||
size_t scale_KN = num_block * N;
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
|
||||
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
QuantizeLinearKernelBlockStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
static_cast<int>(num_of_element),
|
||||
KN,
|
||||
N,
|
||||
scale_KN,
|
||||
block_size,
|
||||
RoundStdInt4<InT, OutT>());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
template <class OutT, class InT>
|
||||
|
|
@ -282,6 +557,29 @@ __global__ void DequantizeLinearKernelStd(const InT* input, OutT* output, const
|
|||
}
|
||||
}
|
||||
|
||||
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernelStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
|
||||
const InT* zero_point_ptr, CUDA_LONG num_element) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
|
||||
|
||||
OutT scale = *scale_ptr;
|
||||
int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0;
|
||||
int i = 0, v0, v1;
|
||||
constexpr int step = NumThreadsPerBlock << 1;
|
||||
#pragma unroll
|
||||
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
|
||||
v0 = ExtractInt4FromByte(input[id >> 1], 0);
|
||||
v1 = ExtractInt4FromByte(input[id >> 1], 1);
|
||||
output[id] = static_cast<OutT>(v0 - zero_point) * scale;
|
||||
output[id + 1] = static_cast<OutT>(v1 - zero_point) * scale;
|
||||
}
|
||||
|
||||
if (i < NumElementsPerThread && id < num_element) {
|
||||
v0 = ExtractInt4FromByte(input[id >> 1], 0);
|
||||
output[id] = static_cast<OutT>(v0 - zero_point) * scale;
|
||||
}
|
||||
}
|
||||
|
||||
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
|
|
@ -300,6 +598,80 @@ __global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, co
|
|||
}
|
||||
}
|
||||
|
||||
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
|
||||
const InT* zero_point_ptr, CUDA_LONG num_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
|
||||
// The scale needs to change every n_same_scale.
|
||||
CUDA_LONG n_same_scale = num_element / (batch_size * n_scales);
|
||||
int i = 0;
|
||||
int scale_id0, scale_id1, zp0, zp1, v0, v1;
|
||||
constexpr int step = NumThreadsPerBlock << 1;
|
||||
|
||||
#pragma unroll
|
||||
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
|
||||
scale_id0 = (id / n_same_scale) % n_scales;
|
||||
scale_id1 = ((id + 1) / n_same_scale) % n_scales;
|
||||
|
||||
v0 = ExtractInt4FromByte(input[id >> 1], 0);
|
||||
v1 = ExtractInt4FromByte(input[id >> 1], 1);
|
||||
zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
|
||||
zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
|
||||
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
|
||||
output[id + 1] = static_cast<OutT>(v1 - zp1) * scale_ptr[scale_id1];
|
||||
}
|
||||
|
||||
if (i < NumElementsPerThread && id < num_element) {
|
||||
scale_id0 = (id / n_same_scale) % n_scales;
|
||||
v0 = ExtractInt4FromByte(input[id >> 1], 0);
|
||||
zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
|
||||
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
|
||||
}
|
||||
}
|
||||
|
||||
// cuda kernel for int4 block-wise dequantization with standard rounding
|
||||
// IntT is int8_t for Int4x2 and uint8_t for UInt4x2
|
||||
// NumElementsPerThread must be multiple of 2.
|
||||
template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
|
||||
__global__ void DequantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
|
||||
const InT* zero_point_ptr, CUDA_LONG num_element,
|
||||
size_t KN, size_t N, size_t scale_KN, size_t block_size) {
|
||||
// Process continuous NumElementsPerThread int4 per thread.
|
||||
CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
|
||||
int i = 0;
|
||||
constexpr int step = NumThreadsPerBlock << 1;
|
||||
|
||||
#pragma unroll
|
||||
// Process two elements which belong to one byte at a time.
|
||||
for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
|
||||
int x0 = id / KN, x1 = (id + 1) / KN;
|
||||
int y0 = id % KN / N, y1 = (id + 1) % KN / N;
|
||||
int z0 = id % N, z1 = (id + 1) % N;
|
||||
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
|
||||
int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1;
|
||||
|
||||
int v0 = ExtractInt4FromByte(input[id >> 1], 0);
|
||||
int v1 = ExtractInt4FromByte(input[id >> 1], 1);
|
||||
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
|
||||
int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
|
||||
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
|
||||
output[id + 1] = static_cast<OutT>(v1 - zp1) * scale_ptr[scale_id1];
|
||||
}
|
||||
|
||||
// last non-paired element
|
||||
if (i < NumElementsPerThread && id < num_element) {
|
||||
int x0 = id / KN;
|
||||
int y0 = id % KN / N;
|
||||
int z0 = id % N;
|
||||
int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
|
||||
|
||||
int v0 = ExtractInt4FromByte(input[id >> 1], 0);
|
||||
int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
|
||||
output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InT, typename OutT>
|
||||
struct DQFloat8;
|
||||
|
||||
|
|
@ -422,6 +794,26 @@ Status CudaDequantizeLinearStd(cudaStream_t stream, const InT* input, OutT* outp
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class InT, class OutT>
|
||||
Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale,
|
||||
const InT* zero_point, size_t num_of_element) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
static_assert((GridDim::maxElementsPerThread & 1) == 0);
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
|
||||
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
DequantizeLinearKernelStdInt4<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
static_cast<int>(num_of_element));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class InT, class OutT>
|
||||
Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
|
|
@ -440,6 +832,57 @@ Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT*
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class InT, class OutT>
|
||||
Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale,
|
||||
const InT* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
static_assert((GridDim::maxElementsPerThread & 1) == 0);
|
||||
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
|
||||
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
DequantizeLinearKernelAxisStdInt4<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
static_cast<int>(num_of_element),
|
||||
batch_size,
|
||||
n_scales);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size) {
|
||||
if (num_of_element <= 0)
|
||||
return Status::OK();
|
||||
|
||||
static_assert((GridDim::maxElementsPerThread & 1) == 0);
|
||||
|
||||
size_t KN = K * N;
|
||||
size_t num_block = (K + block_size - 1) / block_size;
|
||||
size_t scale_KN = num_block * N;
|
||||
int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
|
||||
GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
|
||||
DequantizeLinearKernelBlockStdInt4<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
|
||||
<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
|
||||
input,
|
||||
output,
|
||||
scale,
|
||||
zero_point,
|
||||
static_cast<CUDA_LONG>(num_of_element),
|
||||
KN,
|
||||
N,
|
||||
scale_KN,
|
||||
block_size);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
template <class InT, class OutT>
|
||||
|
|
@ -481,11 +924,24 @@ template Status CudaQuantizeLinearStd<int8_t, float>(cudaStream_t stream, const
|
|||
template Status CudaQuantizeLinearStd<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinearStd<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinearStd<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinearStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinearStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinearStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaQuantizeLinearStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
|
||||
template Status CudaQuantizeLinearAxisStd<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStd<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStd<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStd<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaQuantizeLinearAxisStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
template Status CudaQuantizeLinearBlockStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
template Status CudaQuantizeLinearBlockStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
template Status CudaQuantizeLinearBlockStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
template Status CudaQuantizeLinearBlockStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
|
|
@ -505,11 +961,24 @@ template Status CudaDequantizeLinearStd<int8_t, float>(cudaStream_t stream, cons
|
|||
template Status CudaDequantizeLinearStd<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinearStd<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinearStd<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinearStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinearStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinearStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
|
||||
template Status CudaDequantizeLinearStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
|
||||
|
||||
template Status CudaDequantizeLinearAxisStd<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStd<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStd<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStd<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
template Status CudaDequantizeLinearAxisStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
template Status CudaDequantizeLinearBlockStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
template Status CudaDequantizeLinearBlockStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
template Status CudaDequantizeLinearBlockStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
template Status CudaDequantizeLinearBlockStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
|
||||
|
|
|
|||
|
|
@ -11,33 +11,96 @@ namespace onnxruntime {
|
|||
namespace cuda {
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element);
|
||||
Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
bool saturate);
|
||||
Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales);
|
||||
Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, bool saturate);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales, bool saturate);
|
||||
Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
|
||||
Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
|
||||
Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales,
|
||||
bool saturate);
|
||||
|
||||
/**
|
||||
* @brief block-wise quantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization
|
||||
* axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi)
|
||||
* = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index
|
||||
* is xi * ceil(K / block_size) * N + yi / block_size * N + zi.
|
||||
* @tparam T quantized type, int8_t for Int4x2, uint8_t for UInt4x2
|
||||
* @tparam U full precision type
|
||||
* @param stream cuda stream
|
||||
* @param input input tensor
|
||||
* @param output output tensor
|
||||
* @param scale scale tensor
|
||||
* @param zero_point zero point tensor
|
||||
* @param num_of_element number of elements in input tensor
|
||||
* @param K K
|
||||
* @param N N
|
||||
* @param block_size block size
|
||||
*/
|
||||
template <class T, class U>
|
||||
Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales);
|
||||
Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
|
||||
size_t batch_size, size_t n_scales);
|
||||
Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
|
||||
size_t num_of_element);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
|
||||
size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
|
||||
size_t num_of_element, size_t batch_size, size_t n_scales);
|
||||
|
||||
/**
|
||||
* @brief block-wise dequantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization
|
||||
* axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi)
|
||||
* = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index
|
||||
* is xi * ceil(K / block_size) * N + yi / block_size * N + zi.
|
||||
* @tparam T quantized type, int8_t for Int4x2, uint8_t for UInt4x2
|
||||
* @tparam U full precision type
|
||||
* @param stream cuda stream
|
||||
* @param input input tensor
|
||||
* @param output output tensor
|
||||
* @param scale scale tensor
|
||||
* @param zero_point zero point tensor
|
||||
* @param num_of_element number of elements in input tensor
|
||||
* @param K K
|
||||
* @param N N
|
||||
* @param block_size block size
|
||||
*/
|
||||
template <class T, class U>
|
||||
Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
|
||||
const T* zero_point, size_t num_of_element, size_t K, size_t N,
|
||||
size_t block_size);
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -19,6 +19,11 @@ class QuantizeLinear final : public CudaKernel {
|
|||
if (!info.GetAttr<int64_t>("saturate", &saturate_).IsOK()) {
|
||||
saturate_ = 1;
|
||||
}
|
||||
if (!info.GetAttr<int64_t>("block_size", &block_size_).IsOK()) {
|
||||
block_size_ = 0;
|
||||
}
|
||||
|
||||
ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative.");
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
|
||||
|
|
@ -26,6 +31,7 @@ class QuantizeLinear final : public CudaKernel {
|
|||
private:
|
||||
int64_t axis_;
|
||||
int64_t saturate_;
|
||||
int64_t block_size_;
|
||||
};
|
||||
|
||||
template <class T, class U>
|
||||
|
|
@ -35,12 +41,18 @@ class DequantizeLinear final : public CudaKernel {
|
|||
if (!info.GetAttr<int64_t>("axis", &axis_).IsOK()) {
|
||||
axis_ = 1;
|
||||
}
|
||||
if (!info.GetAttr<int64_t>("block_size", &block_size_).IsOK()) {
|
||||
block_size_ = 0;
|
||||
}
|
||||
|
||||
ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative.");
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
|
||||
|
||||
private:
|
||||
int64_t axis_;
|
||||
int64_t block_size_;
|
||||
};
|
||||
|
||||
} // namespace cuda
|
||||
|
|
|
|||
|
|
@ -1370,18 +1370,26 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint64_t, Cast);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast);
|
||||
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
|
||||
float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
|
||||
float, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
|
||||
MLFloat16, DequantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
|
||||
MLFloat16, DequantizeLinear);
|
||||
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
|
||||
float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
|
||||
float, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
|
||||
MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
|
||||
MLFloat16, QuantizeLinear);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Shape);
|
||||
|
|
@ -1390,6 +1398,24 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, S
|
|||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN);
|
||||
|
||||
// Opset 21
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float,
|
||||
DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float,
|
||||
DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16,
|
||||
DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16,
|
||||
DequantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float,
|
||||
QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float,
|
||||
QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16,
|
||||
QuantizeLinear);
|
||||
class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16,
|
||||
QuantizeLinear);
|
||||
|
||||
template <>
|
||||
KernelCreateInfo BuildKernelCreateInfo<void>() {
|
||||
return {};
|
||||
|
|
@ -2333,19 +2359,19 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, BFloat16, Cast)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan)>,
|
||||
|
|
@ -2354,6 +2380,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
|
|||
// opset 20
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN)>,
|
||||
|
||||
// opset 21
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear)>,
|
||||
};
|
||||
|
||||
for (auto& function_table_entry : function_table) {
|
||||
|
|
|
|||
|
|
@ -1289,6 +1289,10 @@ struct Tensor final {
|
|||
template <>
|
||||
inline bool Tensor::IsDataType<bool>() const { return g_host->Tensor__IsDataType_bool(this); }
|
||||
template <>
|
||||
inline bool Tensor::IsDataType<Int4x2>() const { return g_host->Tensor__IsDataType_Int4x2(this); }
|
||||
template <>
|
||||
inline bool Tensor::IsDataType<UInt4x2>() const { return g_host->Tensor__IsDataType_UInt4x2(this); }
|
||||
template <>
|
||||
inline bool Tensor::IsDataType<int8_t>() const { return g_host->Tensor__IsDataType_int8(this); }
|
||||
template <>
|
||||
inline bool Tensor::IsDataType<uint8_t>() const { return g_host->Tensor__IsDataType_uint8(this); }
|
||||
|
|
@ -1327,6 +1331,10 @@ inline bool Tensor::IsDataType<Float8E5M2FNUZ>() const { return g_host->Tensor__
|
|||
template <>
|
||||
inline bool* Tensor::MutableData<bool>() { return g_host->Tensor__MutableData_bool(this); }
|
||||
template <>
|
||||
inline Int4x2* Tensor::MutableData<Int4x2>() { return g_host->Tensor__MutableData_Int4x2(this); }
|
||||
template <>
|
||||
inline UInt4x2* Tensor::MutableData<UInt4x2>() { return g_host->Tensor__MutableData_UInt4x2(this); }
|
||||
template <>
|
||||
inline int8_t* Tensor::MutableData<int8_t>() { return g_host->Tensor__MutableData_int8(this); }
|
||||
template <>
|
||||
inline uint8_t* Tensor::MutableData<uint8_t>() { return g_host->Tensor__MutableData_uint8(this); }
|
||||
|
|
@ -1365,6 +1373,10 @@ inline Float8E5M2FNUZ* Tensor::MutableData<Float8E5M2FNUZ>() { return g_host->Te
|
|||
template <>
|
||||
inline const bool* Tensor::Data<bool>() const { return g_host->Tensor__Data_bool(this); }
|
||||
template <>
|
||||
inline const Int4x2* Tensor::Data<Int4x2>() const { return g_host->Tensor__Data_Int4x2(this); }
|
||||
template <>
|
||||
inline const UInt4x2* Tensor::Data<UInt4x2>() const { return g_host->Tensor__Data_UInt4x2(this); }
|
||||
template <>
|
||||
inline const int8_t* Tensor::Data<int8_t>() const { return g_host->Tensor__Data_int8(this); }
|
||||
template <>
|
||||
inline const uint8_t* Tensor::Data<uint8_t>() const { return g_host->Tensor__Data_uint8(this); }
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
#include "core/common/inlined_containers_fwd.h"
|
||||
#include "core/common/span_utils.h"
|
||||
|
|
@ -140,7 +141,8 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
|
|||
double relative_per_sample_tolerance,
|
||||
std::unique_ptr<GraphTransformer> transformer,
|
||||
const std::function<void(SessionOptions&)>& add_session_options,
|
||||
const InlinedHashSet<std::string>& disabled_optimizers) {
|
||||
const InlinedHashSet<std::string>& disabled_optimizers,
|
||||
std::unique_ptr<IExecutionProvider> ep) {
|
||||
// Build the model for this test.
|
||||
std::unordered_map<std::string, int> domain_to_version;
|
||||
domain_to_version[kOnnxDomain] = opset_version;
|
||||
|
|
@ -157,6 +159,7 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
|
|||
// Serialize the model to a string.
|
||||
std::string model_data;
|
||||
model.ToProto().SerializeToString(&model_data);
|
||||
std::shared_ptr<IExecutionProvider> ep_shared = ep ? std::move(ep) : nullptr;
|
||||
|
||||
auto run_model = [&](TransformerLevel level, std::vector<OrtValue>& fetches,
|
||||
std::unique_ptr<GraphTransformer> transformer = nullptr) {
|
||||
|
|
@ -170,6 +173,10 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
|
|||
add_session_options(session_options);
|
||||
}
|
||||
InferenceSessionWrapper session{session_options, GetEnvironment()};
|
||||
if (ep_shared) {
|
||||
ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep_shared));
|
||||
}
|
||||
|
||||
ASSERT_STATUS_OK(session.Load(model_data.data(), static_cast<int>(model_data.size())));
|
||||
if (transformer) {
|
||||
ASSERT_STATUS_OK(session.RegisterGraphTransformer(std::move(transformer), level));
|
||||
|
|
|
|||
|
|
@ -555,7 +555,8 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
|
|||
double relative_per_sample_tolerance = 0.0,
|
||||
std::unique_ptr<GraphTransformer> transformer = nullptr,
|
||||
const std::function<void(SessionOptions&)>& add_session_options = {},
|
||||
const InlinedHashSet<std::string>& disabled_optimizers = {});
|
||||
const InlinedHashSet<std::string>& disabled_optimizers = {},
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr);
|
||||
|
||||
void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& build_test_case,
|
||||
const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
#include "test/optimizer/qdq_test_utils.h"
|
||||
#include "test/optimizer/graph_transform_test_builder.h"
|
||||
#include "test/util/include/asserts.h"
|
||||
#include "test/util/include/default_providers.h"
|
||||
#include "test/util/include/inference_session_wrapper.h"
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
|
|
@ -55,7 +56,8 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector<int64_t>& input1_shape,
|
|||
const std::vector<int64_t>& input2_shape,
|
||||
const int64_t axis,
|
||||
const int64_t block_size,
|
||||
int64_t accuracy_level) {
|
||||
int64_t accuracy_level,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
auto build_test_case = [&](ModelTestBuilder& builder) {
|
||||
auto* input1_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
|
||||
auto* input2_arg = builder.MakeInput(input2_shape, T(T::min_val, 0), T(T::max_val, 0));
|
||||
|
|
@ -104,7 +106,9 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector<int64_t>& input1_shape,
|
|||
1e-5 /*per_sample_tolerance*/,
|
||||
1e-5 /*relative_per_sample_tolerance*/,
|
||||
nullptr,
|
||||
add_session_options_fn);
|
||||
add_session_options_fn,
|
||||
{},
|
||||
ep ? std::move(ep) : nullptr);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) {
|
||||
|
|
@ -127,6 +131,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) {
|
|||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ_Cuda) {
|
||||
// DQ contrib op schema is not updated to support blocked quantization
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
;
|
||||
RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
// Input2
|
||||
// |
|
||||
// DQ /
|
||||
|
|
@ -140,7 +165,8 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector<int64_t>& weight_shape,
|
|||
const std::vector<int64_t>& input2_shape,
|
||||
const int64_t axis,
|
||||
const int64_t block_size,
|
||||
int64_t accuracy_level) {
|
||||
int64_t accuracy_level,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
auto build_test_case = [&](ModelTestBuilder& builder) {
|
||||
auto* weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0));
|
||||
auto* input2_arg = builder.MakeInput(input2_shape, -100.0f, 100.0f);
|
||||
|
|
@ -189,7 +215,9 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector<int64_t>& weight_shape,
|
|||
1e-5 /*per_sample_tolerance*/,
|
||||
1e-5 /*relative_per_sample_tolerance*/,
|
||||
nullptr,
|
||||
add_session_options_fn);
|
||||
add_session_options_fn,
|
||||
{},
|
||||
ep ? std::move(ep) : nullptr);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) {
|
||||
|
|
@ -212,6 +240,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) {
|
|||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput_Cuda) {
|
||||
// DQ contrib op schema is not updated to support blocked quantization
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
;
|
||||
RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
// Input1
|
||||
// |
|
||||
// \ DQ
|
||||
|
|
@ -224,7 +273,8 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector<int64_t>& input
|
|||
const std::vector<int64_t>& weight_shape,
|
||||
const int64_t axis,
|
||||
const int64_t block_size,
|
||||
int64_t accuracy_level) {
|
||||
int64_t accuracy_level,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
auto build_test_case = [&](ModelTestBuilder& builder) {
|
||||
auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
|
||||
auto* output_arg = builder.MakeOutput();
|
||||
|
|
@ -287,7 +337,9 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector<int64_t>& input
|
|||
1e-5 /*per_sample_tolerance*/,
|
||||
1e-5 /*relative_per_sample_tolerance*/,
|
||||
nullptr,
|
||||
add_session_options_fn);
|
||||
add_session_options_fn,
|
||||
{},
|
||||
ep ? std::move(ep) : nullptr);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_TypeMismatch) {
|
||||
|
|
@ -327,6 +379,31 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch) {
|
|||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch_Cuda) {
|
||||
// DQ contrib op schema is not updated to support blocked quantization
|
||||
// block size too small
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
|
||||
// block size not 2's power
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
|
||||
;
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
|
||||
// not axis 0
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
|
||||
// not rank 2
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
// Input1
|
||||
// | DQ
|
||||
// \ /
|
||||
|
|
@ -343,7 +420,8 @@ RunDQMatMulConverted(const std::vector<int64_t>& input1_shape,
|
|||
const std::vector<int64_t>& weight2_shape,
|
||||
const int64_t axis,
|
||||
const int64_t block_size,
|
||||
int64_t accuracy_level) {
|
||||
int64_t accuracy_level,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
auto build_test_case = [&](ModelTestBuilder& builder) {
|
||||
auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
|
||||
auto* output_arg = builder.MakeOutput();
|
||||
|
|
@ -402,9 +480,11 @@ RunDQMatMulConverted(const std::vector<int64_t>& input1_shape,
|
|||
TransformerLevel::Level2,
|
||||
21 /*opset_version*/,
|
||||
1e-5 /*per_sample_tolerance*/,
|
||||
1e-5 /*relative_per_sample_tolerance*/,
|
||||
2e-5 /*relative_per_sample_tolerance*/,
|
||||
nullptr,
|
||||
add_session_options_fn);
|
||||
add_session_options_fn,
|
||||
{},
|
||||
ep ? std::move(ep) : nullptr);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) {
|
||||
|
|
@ -419,6 +499,18 @@ TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) {
|
|||
RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1);
|
||||
}
|
||||
|
||||
TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits_Cuda) {
|
||||
// DQ contrib op schema is not updated to support blocked quantization
|
||||
RunDQMatMulConverted<Int4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<Int4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<UInt4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<Int4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<Int4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<UInt4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#endif // !defined(DISABLE_CONTRIB_OPS)
|
||||
|
||||
} // namespace test
|
||||
|
|
|
|||
|
|
@ -869,7 +869,8 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size,
|
|||
template <typename Tin, typename Tout>
|
||||
void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
|
||||
int64_t scale_block_count,
|
||||
int64_t zero_point_block_count) {
|
||||
int64_t zero_point_block_count,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
OpTester test("DequantizeLinear", 21);
|
||||
std::vector<int64_t> dims{2, 4};
|
||||
std::vector<Tout> x_scale, y;
|
||||
|
|
@ -877,7 +878,7 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
|
|||
SessionOptions so;
|
||||
std::vector<std::string> log_msgs; // redirect error messages
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> eps;
|
||||
eps.push_back(DefaultCpuExecutionProvider());
|
||||
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
|
||||
so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category,
|
||||
const char* logid, const char* code_location, const char* message) {
|
||||
ORT_UNUSED_PARAMETER(severity);
|
||||
|
|
@ -970,6 +971,13 @@ TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int) {
|
|||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(-1, 2, 2);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) {
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(-1, 2, 2, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(-1, 2, 2, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(-2, 2, 2, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(-2, 2, 2, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1013,6 +1021,13 @@ TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) {
|
|||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 1, 1);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) {
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 1, 1, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 3, 3, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 3, 3, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 1, 1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1052,6 +1067,13 @@ TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) {
|
|||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 2, 1);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) {
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 2, 1, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 2, 3, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 2, 3, DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 2, 1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1081,14 +1103,14 @@ void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector<int64_t>&& dims,
|
|||
std::vector<int>& x_,
|
||||
std::vector<float>& x_scale_,
|
||||
std::vector<int>& x_zero_point_,
|
||||
std::vector<float>& y_) {
|
||||
std::vector<float>& y_,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
OpTester test("DequantizeLinear", 21);
|
||||
std::vector<int64_t> x_scale_shape;
|
||||
std::vector<Tout> x_scale, y;
|
||||
std::vector<Tin> x, x_zero_point;
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> eps;
|
||||
eps.push_back(DefaultCpuExecutionProvider());
|
||||
|
||||
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
|
||||
int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis;
|
||||
bool use_zero_point = !x_zero_point_.empty();
|
||||
|
||||
|
|
@ -1216,6 +1238,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point;
|
||||
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
|
||||
std::vector<float> y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, 8.0};
|
||||
std::vector<float> y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, 8.0};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
|
||||
|
|
@ -1237,6 +1276,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
|
||||
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
|
||||
std::vector<float> y_2{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, 2.0, 0, 3.5, 3.0, 10.0, 16.0, -10.5, 15};
|
||||
std::vector<float> y_3{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, -14.0, -24, 21, 5, 10.0, 16.0, -10.5, 15};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -1262,6 +1318,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
|
||||
std::vector<float> y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, 8};
|
||||
std::vector<float> y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, 8};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
|
||||
|
|
@ -1283,6 +1356,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
|
||||
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
|
||||
std::vector<float> y_2{2, 8, -2, 0, 0, -1, 7, 1, 2, 0, 6, 8, -3.5, 1, -10.5, 15};
|
||||
std::vector<float> y_3{2, 8, -2, 0, -6, -8, 7, 1, 2, 0, 6, 8, 10, 16, -10.5, 15};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -1308,6 +1398,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
|
||||
std::vector<float> y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, 8};
|
||||
std::vector<float> y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, 8};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
|
||||
|
|
@ -1329,6 +1436,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
|
||||
std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
|
||||
std::vector<float> y_2{2, 0, 4, 0, 0, 3.5, 0, 1, 2, 4, 4, 8, -3.5, -7, 0, 15};
|
||||
std::vector<float> y_3{2, 0, -2, 0, 0, 3.5, 7, 1, 2, 4, 6, 8, -3.5, -7, -10.5, 15};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{};
|
||||
|
|
@ -1350,6 +1474,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<float> y_2{0, -4, 7, 3, -8, -20, 21, 7, 16, 36, -35, -11, 24, 52, -49, -15};
|
||||
std::vector<float> y_3{0, -4, 7, 3, -8, -20, 21, 7, -16, -36, 35, 11, 24, 52, -49, -15};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
|
||||
|
|
@ -1371,6 +1512,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
|
||||
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<float> y_2{4, -4, 3.5, -6, -4, -20, 17.5, -2, -10, 16, 3.5, -5, -2, 32, -10.5, -9};
|
||||
std::vector<float> y_3{4, -4, 3.5, -6, -4, -20, 17.5, -2, -12, -36, 31.5, 2, -2, 32, -10.5, -9};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{};
|
||||
|
|
@ -1392,6 +1550,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<float> y_2{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15};
|
||||
std::vector<float> y_3{0, -4, -4, -12, -8, -20, 21, 7, 16, 36, 20, 44, 24, 52, -49, -15};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
|
||||
|
|
@ -1413,6 +1588,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
|
||||
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<float> y_2{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9};
|
||||
std::vector<float> y_3{4, -4, 0, -12, -4, -20, 17.5, -2, -10, 16, -6, 24, -2, 32, -10.5, -9};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{};
|
||||
|
|
@ -1434,6 +1626,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{};
|
||||
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<float> y_2{0, -2, -8, -12, 14, 17.5, 6, 7, 16, 18, 40, 44, -42, -45.5, -14, -15};
|
||||
std::vector<float> y_3{0, -2, -4, -12, 14, 17.5, 21, 7, 16, 18, 20, 44, -42, -45.5, -49, -15};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
|
||||
|
|
@ -1455,6 +1664,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
|
|||
DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
|
||||
std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<float> y_2{4, 2, -8, -12, 10.5, 14, -3, -2, -10, -8, 20, 24, -3.5, -7, -8, -9};
|
||||
std::vector<float> y_3{4, 2, 0, -12, 10.5, 14, 17.5, -2, -10, -8, -6, 24, -3.5, -7, -10.5, -9};
|
||||
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_FirstAxis) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1624,7 +1850,8 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size,
|
|||
template <typename Tout, typename Tin>
|
||||
void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
|
||||
int64_t scale_block_count,
|
||||
int64_t zero_point_block_count) {
|
||||
int64_t zero_point_block_count,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
OpTester test("QuantizeLinear", 21);
|
||||
std::vector<int64_t> dims{2, 4};
|
||||
std::vector<Tout> x_zero_point, y;
|
||||
|
|
@ -1632,7 +1859,7 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
|
|||
SessionOptions so;
|
||||
std::vector<std::string> log_msgs; // redirect error messages
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> eps;
|
||||
eps.push_back(DefaultCpuExecutionProvider());
|
||||
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
|
||||
so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category,
|
||||
const char* logid, const char* code_location, const char* message) {
|
||||
ORT_UNUSED_PARAMETER(severity);
|
||||
|
|
@ -1725,6 +1952,13 @@ TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int) {
|
|||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(-1, 2, 2);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) {
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(-1, 2, 2, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(-1, 2, 2, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(-2, 2, 2, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(-2, 2, 2, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1768,6 +2002,13 @@ TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) {
|
|||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 1, 1);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) {
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 1, 1, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 3, 3, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 3, 3, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 1, 1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1807,6 +2048,13 @@ TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) {
|
|||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 2, 1);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) {
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 2, 1, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 2, 3, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 2, 3, DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 2, 1, DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
#if !defined(DISABLE_FLOAT8_TYPES)
|
||||
TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) {
|
||||
constexpr int min_cuda_architecture = 11080;
|
||||
|
|
@ -1836,14 +2084,14 @@ void QuantizeLinearOp21BlockedTest_Int4_Succeed(std::vector<int64_t>&& dims,
|
|||
std::vector<float>& x_,
|
||||
std::vector<float>& scale_,
|
||||
std::vector<int>& zero_point_,
|
||||
std::vector<int>& y_) {
|
||||
std::vector<int>& y_,
|
||||
std::unique_ptr<IExecutionProvider> ep = nullptr) {
|
||||
OpTester test("QuantizeLinear", 21);
|
||||
std::vector<int64_t> scale_shape;
|
||||
std::vector<Tout> zero_point, y;
|
||||
std::vector<Tin> x, scale;
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> eps;
|
||||
eps.push_back(DefaultCpuExecutionProvider());
|
||||
|
||||
eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
|
||||
int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis;
|
||||
bool use_zero_point = !zero_point_.empty();
|
||||
|
||||
|
|
@ -1970,6 +2218,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
std::vector<float> x{14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4,
|
||||
6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0,
|
||||
2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0,
|
||||
10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0};
|
||||
std::vector<int> y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
|
||||
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
|
||||
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
|
||||
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
|
||||
std::vector<int> y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
|
||||
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
|
||||
-1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4,
|
||||
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -2022,6 +2298,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1,
|
||||
0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7};
|
||||
std::vector<float> x{2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3,
|
||||
-6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1,
|
||||
2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0,
|
||||
10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15};
|
||||
std::vector<int> y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
|
||||
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
|
||||
1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
|
||||
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
|
||||
std::vector<int> y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
|
||||
-3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
|
||||
-7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2,
|
||||
5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -2074,6 +2378,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0};
|
||||
std::vector<float> x{14, 24, 14, 24, 14, 24, 14, 24, 10, 16, 10, 16, 10, 16, 10, 16,
|
||||
-10.5, -2, -10.5, -2, -10.5, -2, -10.5, -2, -3.5, 0, -3.5, 0, -3.5, 0, -3.5, 0,
|
||||
2, 8, 2, 8, 2, 8, 2, 8, 6, 16, 6, 16, 6, 16, 6, 16,
|
||||
-17.5, -6, -17.5, -6, -17.5, -6, -17.5, -6, -24.5, 8, -24.5, 8, -24.5, 8, -24.5, 8};
|
||||
std::vector<int> y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
|
||||
-3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0,
|
||||
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
|
||||
5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8};
|
||||
std::vector<int> y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
|
||||
5, 0, 5, 0, 5, 0, 5, 0, -1, 0, -1, 0, -1, 0, -1, 0,
|
||||
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
|
||||
-8, -2, -8, -2, -8, -2, -8, -2, 7, -8, 7, -8, 7, -8, 7, -8};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
|
@ -2126,6 +2458,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<int> zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1,
|
||||
0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7};
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0};
|
||||
std::vector<float> x{2, 8, 2, 8, 2, 8, 2, 8, -2, 0, -2, 0, -2, 0, -2, 0,
|
||||
0, -1, 0, -1, 0, -1, 0, -1, 7, 1, 7, 1, 7, 1, 7, 1,
|
||||
2, 0, 2, 0, 2, 0, 2, 0, 6, 8, 6, 8, 6, 8, 6, 8,
|
||||
-3.5, 1, -3.5, 1, -3.5, 1, -3.5, 1, -10.5, 15, -10.5, 15, -10.5, 15, -10.5, 15};
|
||||
std::vector<int> y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
|
||||
-3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0,
|
||||
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
|
||||
5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8};
|
||||
std::vector<int> y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
|
||||
-6, -4, -6, -4, -6, -4, -6, -4, -1, 0, -1, 0, -1, 0, -1, 0,
|
||||
1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
|
||||
-2, 2, -2, 2, -2, 2, -2, 2, 7, -8, 7, -8, 7, -8, 7, -8};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
|
||||
std::vector<int> zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1,
|
||||
0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7};
|
||||
|
|
@ -2178,6 +2538,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0};
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
std::vector<float> x{14, 12, 14, 12, 20, 16, 20, 16, 14, 12, 14, 12, 20, 16, 20, 16,
|
||||
-10.5, -7, -10.5, -7, -1, 0, -1, 0, -10.5, -7, -10.5, -7, -1, 0, -1, 0,
|
||||
2, 4, 2, 4, 12, 16, 12, 16, 2, 4, 2, 4, 12, 16, 12, 16,
|
||||
-17.5, -21, -17.5, -21, -7, 8, -7, 8, -17.5, -21, -17.5, -21, -7, 8, -7, 8};
|
||||
std::vector<int> y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4,
|
||||
-3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0,
|
||||
1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4,
|
||||
5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8};
|
||||
std::vector<int> y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4,
|
||||
-3, -2, -3, -2, 0, 0, -1, 0, -3, -2, -3, -2, 0, 0, -1, 0,
|
||||
1, 2, 1, 2, 6, 4, 3, 4, 1, 2, 1, 2, 6, 4, 3, 4,
|
||||
5, 6, 5, 6, 2, -8, 7, -8, 5, 6, 5, 6, 2, -8, 7, -8};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
|
@ -2230,6 +2618,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{-6, -4, -6, -4, -3, -1, -3, -1,
|
||||
0, 2, 0, 2, 4, 7, 4, 7};
|
||||
std::vector<float> x{2, 0, 2, 0, 4, 0, 4, 0, 2, 0, 2, 0, 4, 0, 4, 0,
|
||||
0, 3.5, 0, 3.5, 0, 1, 0, 1, 0, 3.5, 0, 3.5, 0, 1, 0, 1,
|
||||
2, 4, 2, 4, 4, 8, 4, 8, 2, 4, 2, 4, 4, 8, 4, 8,
|
||||
-3.5, -7, -3.5, -7, 0, 15, 0, 15, -3.5, -7, -3.5, -7, 0, 15, 0, 15};
|
||||
std::vector<int> y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4,
|
||||
-3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0,
|
||||
1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4,
|
||||
5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8};
|
||||
std::vector<int> y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4,
|
||||
-3, -2, -3, -2, -3, 0, -1, 0, -3, -2, -3, -2, -3, 0, -1, 0,
|
||||
1, 2, 1, 2, 2, 4, 3, 4, 1, 2, 1, 2, 2, 4, 3, 4,
|
||||
5, 6, 5, 6, 4, -8, 7, -8, 5, 6, 5, 6, 4, -8, 7, -8};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
|
|
@ -2282,6 +2698,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
std::vector<float> x{0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3,
|
||||
-8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7,
|
||||
16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11,
|
||||
24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15};
|
||||
std::vector<int> y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
|
||||
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
|
||||
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
|
||||
std::vector<int> y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
|
||||
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -2334,6 +2778,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9,
|
||||
13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6};
|
||||
std::vector<float> x{4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6,
|
||||
-4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2,
|
||||
-10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5,
|
||||
-2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9};
|
||||
std::vector<int> y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
|
||||
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
|
||||
8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
|
||||
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
|
||||
std::vector<int> y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
|
||||
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
|
||||
7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4,
|
||||
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
|
||||
2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -2386,6 +2858,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
|
||||
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
std::vector<float> x{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
|
||||
0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
|
||||
0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
|
||||
0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15};
|
||||
std::vector<int> y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<int> y_3{0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
|
||||
0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
|
||||
0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
|
||||
0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
|
||||
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -2438,6 +2938,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
|
||||
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6,
|
||||
2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6};
|
||||
std::vector<float> x{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
|
||||
4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
|
||||
4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
|
||||
4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9};
|
||||
std::vector<int> y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
std::vector<int> y_3{0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
|
||||
0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
|
||||
0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
|
||||
0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
|
||||
-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
|
||||
|
|
@ -2490,6 +3018,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0};
|
||||
std::vector<float> x{0, -2, 0, -2, -8, -12, -8, -12, 0, -2, 0, -2, -8, -12, -8, -12,
|
||||
14, 17.5, 14, 17.5, 6, 7, 6, 7, 14, 17.5, 14, 17.5, 6, 7, 6, 7,
|
||||
16, 18, 16, 18, 40, 44, 40, 44, 16, 18, 16, 18, 40, 44, 40, 44,
|
||||
-42, -45.5, -42, -45.5, -14, -15, -14, -15, -42, -45.5, -42, -45.5, -14, -15, -14, -15};
|
||||
std::vector<int> y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
|
||||
8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11,
|
||||
12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15};
|
||||
std::vector<int> y_3{0, 1, 0, 1, 4, 3, 2, 3, 0, 1, 0, 1, 4, 3, 2, 3,
|
||||
4, 5, 4, 5, 2, 7, 6, 7, 4, 5, 4, 5, 2, 7, 6, 7,
|
||||
8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11,
|
||||
12, 13, 12, 13, 4, 15, 14, 15, 12, 13, 12, 13, 4, 15, 14, 15};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
|
|
@ -2542,6 +3098,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis) {
|
|||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis_Cuda) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
std::vector<int> zero_point{2, 0, 2, 0, 1, 9, 1, 9,
|
||||
13, 5, 13, 5, 11, 6, 11, 6};
|
||||
std::vector<float> x{4, 2, 4, 2, -8, -12, -8, -12, 4, 2, 4, 2, -8, -12, -8, -12,
|
||||
10.5, 14, 10.5, 14, -3, -2, -3, -2, 10.5, 14, 10.5, 14, -3, -2, -3, -2,
|
||||
-10, -8, -10, -8, 20, 24, 20, 24, -10, -8, -10, -8, 20, 24, 20, 24,
|
||||
-3.5, -7, -3.5, -7, -8, -9, -8, -9, -3.5, -7, -3.5, -7, -8, -9, -8, -9};
|
||||
std::vector<int> y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
|
||||
4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
|
||||
8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11,
|
||||
12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15};
|
||||
std::vector<int> y_3{0, 1, 0, 1, 6, 3, 2, 3, 0, 1, 0, 1, 6, 3, 2, 3,
|
||||
4, 5, 4, 5, 0, 7, 6, 7, 4, 5, 4, 5, 0, 7, 6, 7,
|
||||
8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11,
|
||||
12, 13, 12, 13, 13, 15, 14, 15, 12, 13, 12, 13, 13, 15, 14, 15};
|
||||
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
|
||||
DefaultCudaExecutionProvider());
|
||||
QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
|
||||
DefaultCudaExecutionProvider());
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
|
||||
std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
|
||||
2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
|
||||
|
|
|
|||
Loading…
Reference in a new issue