diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 46d9e217bf..d57394b3e7 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -587,7 +587,8 @@ Do not modify directly.* |DepthToSpace|*in* input:**T**
*out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)| |||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)| |||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)| -|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|19+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|DequantizeLinear|*in* x:**T**
*in* x_scale:**tensor(float)**
*in* x_zero_point:**T**
*out* y:**tensor(float)**

or

*in* x:**T1**
*in* x_scale:**T2**
*in* x_zero_point:**T1**
*out* y:**T2**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)
**T2** = tensor(float), tensor(float16)| +|||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)
**T2** = tensor(float), tensor(float16)| |||[13, 18]|**T** = tensor(int8), tensor(uint8)| |||[10, 12]|**T** = tensor(int8), tensor(uint8)| |Div|*in* A:**T**
*in* B:**T**
*out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| @@ -718,7 +719,8 @@ Do not modify directly.* |||[13, 14]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)
**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| |||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)
**T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)| |||[7, 11]|**T** = tensor(double), tensor(float), tensor(float16)| -|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|19+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)| +|QuantizeLinear|*in* x:**T1**
*in* y_scale:**T1**
*in* y_zero_point:**T2**
*out* y:**T2**

or

*in* x:**T1**
*in* y_scale:**tensor(float)**
*in* y_zero_point:**T2**
*out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)| +|||[19, 20]|**T1** = tensor(float), tensor(float16)
**T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)| |||[13, 18]|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |||[10, 12]|**T1** = tensor(float)
**T2** = tensor(int8), tensor(uint8)| |RNN|*in* X:**T**
*in* W:**T**
*in* R:**T**
*in* B:**T**
*in* sequence_lens:**T1**
*in* initial_h:**T**
*out* Y:**T**
*out* Y_h:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)
**T1** = tensor(int32)| diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index f1b30da01f..adfa680878 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -23,7 +23,10 @@ void SplitQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { const std::string action_name{"dropSplitQDQ"}; std::unique_ptr action = std::make_unique(); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector = std::make_unique(true /*req_equal_quant_params*/); + std::vector providers = {kCpuExecutionProvider, kDmlExecutionProvider}; + std::unique_ptr selector = std::make_unique(true /*req_equal_quant_params*/, + false, + providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"Split", {}}}, std::move(selector), @@ -63,14 +66,18 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { // // And cannot eliminate the QDQ for MaxPool if the scale is not positive, as a negative // scale will change the ordering of the elements between quantized & de-quantized values. - std::unique_ptr selector_no_16bit = std::make_unique(false); + std::vector providers = {kCpuExecutionProvider, kDmlExecutionProvider}; + std::unique_ptr selector_no_16bit = std::make_unique(false, + false, + true, + providers); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_name, {{"Resize", {}}}, std::move(selector_no_16bit), std::move(drop_action_no_int16)); std::unique_ptr selector_no_16bit_and_positive_scale = - std::make_unique(false, true, false); + std::make_unique(false, true, false, providers); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name, {{"MaxPool", {12}}, {"ReduceMax", {}}, @@ -78,7 +85,7 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::move(selector_no_16bit_and_positive_scale), std::move(drop_action_no_int16_and_positive_scale)); - std::unique_ptr selector = std::make_unique(true); + std::unique_ptr selector = std::make_unique(true, false, true, providers); // DepthToSpace and SpaceToDepth not included because there are no integer implementations. // https://github.com/microsoft/onnxruntime/issues/21287 qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name, @@ -117,7 +124,8 @@ void DropDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { #if !defined(ORT_MINIMAL_BUILD) // TODO: Enable 16-bit types in selector when ArgMax supports 16-bit integer input tensors. - std::unique_ptr selector = std::make_unique(); + std::vector providers = {kCpuExecutionProvider, kDmlExecutionProvider}; + std::unique_ptr selector = std::make_unique(false, false, providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"ArgMax", {}}}, std::move(selector), @@ -200,7 +208,8 @@ void VariadicOpQDQRules(SelectorActionRegistry& qdq_selector_action_registry) { #if !defined(ORT_MINIMAL_BUILD) // TODO: Enable 16-bit types in selector when QLinearConcat supports 16-bit. - std::unique_ptr selector = std::make_unique(); + std::vector providers = {kCpuExecutionProvider, kDmlExecutionProvider}; + std::unique_ptr selector = std::make_unique(false, false, providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"Concat", {}}}, @@ -222,7 +231,11 @@ void ConvQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool is_ #if !defined(ORT_MINIMAL_BUILD) // TODO: Enable 16-bit types in selector when QLinearConv supports 16-bit. - std::unique_ptr selector = std::make_unique(is_int8_allowed); + std::vector providers = {kCpuExecutionProvider, kDmlExecutionProvider}; + std::unique_ptr selector = std::make_unique(is_int8_allowed, + false, + false, + providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"Conv", {}}}, @@ -245,7 +258,11 @@ void MatMulQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool i #if !defined(ORT_MINIMAL_BUILD) // TODO: Enable 16-bit types in selector when QLinearMatMul and MatMulInteger support 16-bit. - std::unique_ptr selector = std::make_unique(is_int8_allowed); + std::vector providers = {kCpuExecutionProvider, kDmlExecutionProvider}; + std::unique_ptr selector = std::make_unique(is_int8_allowed, + false, + false, + providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"MatMul", {}}}, std::move(selector), @@ -272,7 +289,8 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi p_buffered_tensors); #if !defined(ORT_MINIMAL_BUILD) - std::unique_ptr selector = std::make_unique(); + std::vector providers = {kCpuExecutionProvider, kCudaExecutionProvider}; + std::unique_ptr selector = std::make_unique(providers); qdq_selector_action_registry.RegisterSelectorAndAction(action_name, {{"MatMul", {}}}, std::move(selector), @@ -363,8 +381,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer( CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level, intra_op_thread_pool, p_buffered_tensors), apply_context, - // this transformer is only compatible with the CPU and DML EP - {kCpuExecutionProvider, kDmlExecutionProvider}} { + // this transformer is compatible with CPU, DML and CUDA EP. + // There is further EP control on the rule level. + {kCpuExecutionProvider, kDmlExecutionProvider, kCudaExecutionProvider}} { } } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 7e009da394..0ba5436e69 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -302,14 +302,20 @@ class BaseSelector : public NodeSelector { class DropQDQNodesSelector : public BaseSelector { public: - explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, bool allow_nonpositive_scale = true) - : BaseSelector(std::make_unique(allow_16bit, allow_4bit, allow_nonpositive_scale)) {} + explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, + bool allow_nonpositive_scale = true, + gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(allow_16bit, allow_4bit, allow_nonpositive_scale), + compatible_providers) {} }; class DropDQNodesSelector : public BaseSelector { public: - explicit DropDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false) - : BaseSelector(std::make_unique(allow_16bit, allow_4bit)) {} + explicit DropDQNodesSelector(bool allow_16bit = false, + bool allow_4bit = false, + gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(allow_16bit, allow_4bit), + compatible_providers) {} }; class UnarySelector : public BaseSelector { @@ -329,8 +335,11 @@ class BinarySelector : public BaseSelector { // Variadic DQ nodes -> node -> Q class InputVariadicSelector : public BaseSelector { public: - explicit InputVariadicSelector(bool allow_16bit = false, bool allow_4bit = false) - : BaseSelector(std::make_unique(allow_16bit, allow_4bit)) {} + explicit InputVariadicSelector(bool allow_16bit = false, + bool allow_4bit = false, + gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(allow_16bit, allow_4bit), + compatible_providers) {} void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override; }; @@ -338,8 +347,10 @@ class InputVariadicSelector : public BaseSelector { // DQ -> Split -> variadic Q nodes class SplitSelector : public BaseSelector { public: - SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false) - : BaseSelector(std::make_unique(req_equal_quant_params, allow_4bit)) {} + SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false, + gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(req_equal_quant_params, allow_4bit), + compatible_providers) {} void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override; }; @@ -347,8 +358,10 @@ class SplitSelector : public BaseSelector { // DQ nodes for X, W and optionally B -> node -> Q class ConvSelector : public BaseSelector { public: - ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false) - : BaseSelector(std::make_unique(int8_allowed, allow_16bit, allow_4bit_weight)) {} + ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false, + gsl::span compatible_providers = {}) + : BaseSelector(std::make_unique(int8_allowed, allow_16bit, allow_4bit_weight), + compatible_providers) {} void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override; }; @@ -363,9 +376,11 @@ class WhereSelector : public BaseSelector { // 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not class MatMulSelector : public BaseSelector { public: - MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false) + MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false, + gsl::span compatible_providers = {}) : BaseSelector(std::make_unique(int8_allowed, /*matmulintegertofloat_allowed*/ true, - allow_16bit, allow_4bit)) {} + allow_16bit, allow_4bit), + compatible_providers) {} }; // Convert "1 DQ node for input B -> MatMul" to "MatMulNBits" diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index f74754c3cd..b54c572556 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -4,6 +4,7 @@ #include "core/common/inlined_containers.h" #include "core/common/parse_string.h" +#include "core/framework/int4.h" #include "core/providers/shared_library/provider_api.h" #include "core/platform/env_var_utils.h" #include "core/providers/cuda/cuda_execution_provider.h" @@ -1348,38 +1349,37 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast); #endif -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear); #if !defined(DISABLE_FLOAT8_TYPES) -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear); #endif -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear); #if !defined(DISABLE_FLOAT8_TYPES) -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear); #endif class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear); #if !defined(DISABLE_FLOAT8_TYPES) -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear); #endif -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear); #if !defined(DISABLE_FLOAT8_TYPES) -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear); #endif class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Scan); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape); -#endif // Opset 20 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu); @@ -1388,6 +1388,40 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN); +// Opset 21. +// TODO(fajin): support other quantized types +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear); +#if !defined(DISABLE_FLOAT8_TYPES) +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear); +#endif + +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear); +#if !defined(DISABLE_FLOAT8_TYPES) +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear); +#endif + +#endif + template <> KernelCreateInfo BuildKernelCreateInfo() { return {}; @@ -2265,34 +2299,34 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif BuildKernelCreateInfo, @@ -2305,6 +2339,37 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + // Opset 21 + // TODO(fajin): support other quantized types + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#if !defined(DISABLE_FLOAT8_TYPES) + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#endif + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#if !defined(DISABLE_FLOAT8_TYPES) + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, +#endif #endif }; diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc index d4b6d1bc49..6a5dbc433f 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc @@ -7,36 +7,181 @@ namespace onnxruntime { namespace cuda { +void ValidateBlockQuantizationShapes(const TensorShape& input_shape, + const TensorShape& scale_shape, + const Tensor* zero_point, + size_t axis_no_neg, + int64_t block_size_) { + ORT_ENFORCE(scale_shape.NumDimensions() == input_shape.NumDimensions(), + "scale and input must have the same rank for blocked quantization"); + + for (size_t i = 0, ndim = input_shape.NumDimensions(); i < ndim; ++i) { + if (i == static_cast(axis_no_neg)) { + ORT_ENFORCE(scale_shape[i] == (input_shape[i] + block_size_ - 1) / block_size_, + "scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization"); + } else { + ORT_ENFORCE(scale_shape[i] == input_shape[i], + "scale and input must have the same shape despite the quantize axis for blocked quantization"); + } + } + + if (zero_point) { + ORT_ENFORCE(zero_point->Shape() == scale_shape, + "zero_point and scale must have the same shape for blocked quantization"); + } +} + template -typename std::enable_if, T>::value, Status>::type -CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool /*saturate*/) { +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, + size_t num_of_element, bool /*saturate*/) { + ORT_UNUSED_PARAMETER(stream); + ORT_UNUSED_PARAMETER(input); + ORT_UNUSED_PARAMETER(output); + ORT_UNUSED_PARAMETER(scale); + ORT_UNUSED_PARAMETER(zero_point); + ORT_UNUSED_PARAMETER(num_of_element); + ORT_NOT_IMPLEMENTED("Unsupported quantization type."); +} + +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, + size_t num_of_element, size_t batch_size, size_t n_scales, bool /*saturate*/) { + ORT_UNUSED_PARAMETER(stream); + ORT_UNUSED_PARAMETER(input); + ORT_UNUSED_PARAMETER(output); + ORT_UNUSED_PARAMETER(scale); + ORT_UNUSED_PARAMETER(zero_point); + ORT_UNUSED_PARAMETER(num_of_element); + ORT_UNUSED_PARAMETER(batch_size); + ORT_UNUSED_PARAMETER(n_scales); + ORT_NOT_IMPLEMENTED("Unsupported quantization type."); +} + +template +Status CudaQuantizeLinearBlock(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, + size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) { + ORT_UNUSED_PARAMETER(stream); + ORT_UNUSED_PARAMETER(input); + ORT_UNUSED_PARAMETER(output); + ORT_UNUSED_PARAMETER(scale); + ORT_UNUSED_PARAMETER(zero_point); + ORT_UNUSED_PARAMETER(num_of_element); + ORT_UNUSED_PARAMETER(K); + ORT_UNUSED_PARAMETER(N); + ORT_UNUSED_PARAMETER(block_size); + ORT_NOT_IMPLEMENTED("Unsupported quantization type."); +} + +template +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, int8_t* output, const U* scale, + const int8_t* zero_point, size_t num_of_element, bool /*saturate*/) { return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element); } +template +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, uint8_t* output, const U* scale, + const uint8_t* zero_point, size_t num_of_element, bool /*saturate*/) { + return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element); +} + +template +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Int4x2* output, const U* scale, + const Int4x2* zero_point, size_t num_of_element, bool /*saturate*/) { + return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast(output), scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element); +} + +template +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale, + const UInt4x2* zero_point, size_t num_of_element, bool /*saturate*/) { + return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast(output), scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element); +} + #if !defined(DISABLE_FLOAT8_TYPES) -template -typename std::enable_if, T>::value, Status>::type -CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool saturate) { +template +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale, + const Float8E4M3FN* zero_point, size_t num_of_element, bool saturate) { return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate); } -template -typename std::enable_if, T>::value, Status>::type -CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales, bool saturate) { - return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales, saturate); +template +Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale, + const Float8E5M2* zero_point, size_t num_of_element, bool saturate) { + return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate); +} + +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale, + const Float8E4M3FN* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales, bool saturate) { + return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, + n_scales, saturate); +} + +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale, + const Float8E5M2* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales, bool saturate) { + return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, + n_scales, saturate); } #endif -template -typename std::enable_if, T>::value, Status>::type -CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales, bool /*saturate*/) { +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, int8_t* output, const U* scale, + const int8_t* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales, bool /*saturate*/) { return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales); } +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, uint8_t* output, const U* scale, + const uint8_t* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales, bool /*saturate*/) { + return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales); +} + +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Int4x2* output, const U* scale, + const Int4x2* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales, bool /*saturate*/) { + return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast(output), scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, batch_size, n_scales); +} + +template +Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale, + const UInt4x2* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales, bool /*saturate*/) { + return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast(output), scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, batch_size, n_scales); +} + +template +Status CudaQuantizeLinearBlock(cudaStream_t stream, + const U* input, Int4x2* output, const U* scale, const Int4x2* zero_point, + size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) { + return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast(output), scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, K, N, block_size); +} + +template +Status CudaQuantizeLinearBlock(cudaStream_t stream, + const U* input, UInt4x2* output, const U* scale, const UInt4x2* zero_point, + size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) { + return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast(output), scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, K, N, block_size); +} + template Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaU; @@ -48,21 +193,22 @@ Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { auto& y = *ctx->Output(0, x.Shape()); const auto& x_shape = x.Shape(); + const auto num_of_elements = x_shape.Size(); const CudaU* input = reinterpret_cast(x.Data()); T* output = y.MutableData(); - if (IsScalarOr1ElementVector(&y_scale)) { + if (IsScalarOr1ElementVector(&y_scale)) { // per-tensor quantization ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point), "y_zero_point must be a scalar or 1D tensor of size 1."); + ORT_ENFORCE(block_size_ == 0, "block_size must be 0 for per-tensor quantization."); const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data() : nullptr; const CudaU* scale = reinterpret_cast(y_scale.Data()); - const auto num_of_elements = x_shape.Size(); ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements, saturate_)); return Status::OK(); - } else { + } else if (block_size_ == 0) { // per-axis quantization ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1); ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() && y_zero_point->Shape().NumDimensions() == 1), @@ -73,44 +219,184 @@ Status QuantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data() : nullptr; const CudaU* scale = reinterpret_cast(y_scale.Data()); - const auto num_of_elements = x_shape.Size(); ORT_RETURN_IF_ERROR(CudaQuantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements, x_shape.SizeToDimension(axis), y_scale.Shape().Size(), saturate_)); return Status::OK(); + } else { // blocked quantization + // validate shape + size_t axis_no_neg = SafeInt(HandleNegativeAxis(axis_, x_shape.NumDimensions())); + const auto& y_scale_shape = y_scale.Shape(); + + ValidateBlockQuantizationShapes(x_shape, + y_scale_shape, + y_zero_point, + axis_no_neg, + block_size_); + + // compute + const T* zero_point = y_zero_point ? y_zero_point->Data() : nullptr; + const CudaU* scale = reinterpret_cast(y_scale.Data()); + + ORT_RETURN_IF_ERROR(CudaQuantizeLinearBlock(Stream(ctx), input, output, scale, zero_point, + num_of_elements, x_shape[axis_no_neg], + x_shape.SizeFromDimension(axis_no_neg + 1), + block_size_, saturate_)); + return Status::OK(); } } template -typename std::enable_if, T>::value, Status>::type -CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) { +Status CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element) { + ORT_UNUSED_PARAMETER(stream); + ORT_UNUSED_PARAMETER(input); + ORT_UNUSED_PARAMETER(output); + ORT_UNUSED_PARAMETER(scale); + ORT_UNUSED_PARAMETER(zero_point); + ORT_UNUSED_PARAMETER(num_of_element); + ORT_NOT_IMPLEMENTED("Unsupported quantization type."); +} + +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + ORT_UNUSED_PARAMETER(stream); + ORT_UNUSED_PARAMETER(input); + ORT_UNUSED_PARAMETER(output); + ORT_UNUSED_PARAMETER(scale); + ORT_UNUSED_PARAMETER(zero_point); + ORT_UNUSED_PARAMETER(num_of_element); + ORT_UNUSED_PARAMETER(batch_size); + ORT_UNUSED_PARAMETER(n_scales); + ORT_NOT_IMPLEMENTED("Unsupported quantization type."); +} + +template +Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size) { + ORT_UNUSED_PARAMETER(stream); + ORT_UNUSED_PARAMETER(input); + ORT_UNUSED_PARAMETER(output); + ORT_UNUSED_PARAMETER(scale); + ORT_UNUSED_PARAMETER(zero_point); + ORT_UNUSED_PARAMETER(num_of_element); + ORT_UNUSED_PARAMETER(K); + ORT_UNUSED_PARAMETER(N); + ORT_UNUSED_PARAMETER(block_size); + ORT_NOT_IMPLEMENTED("Unsupported quantization type."); +} + +template +Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, U* output, const U* scale, + const int8_t* zero_point, size_t num_of_element) { return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element); } +template +Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, U* output, const U* scale, + const uint8_t* zero_point, size_t num_of_element) { + return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element); +} + +template +Status CudaDequantizeLinear(cudaStream_t stream, const Int4x2* input, U* output, const U* scale, + const Int4x2* zero_point, size_t num_of_element) { + return CudaDequantizeLinearStdInt4(stream, reinterpret_cast(input), output, scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element); +} + +template +Status CudaDequantizeLinear(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale, + const UInt4x2* zero_point, size_t num_of_element) { + return CudaDequantizeLinearStdInt4(stream, reinterpret_cast(input), output, scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element); +} + #if !defined(DISABLE_FLOAT8_TYPES) -template -typename std::enable_if, T>::value, Status>::type -CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) { +template +Status CudaDequantizeLinear(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale, + const Float8E4M3FN* zero_point, size_t num_of_element) { + return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element); +} + +template +Status CudaDequantizeLinear(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale, + const Float8E5M2* zero_point, size_t num_of_element) { return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element); } #endif -template -typename std::enable_if, T>::value, Status>::type -CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales) { +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const int8_t* input, U* output, const U* scale, + const int8_t* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales); } +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const uint8_t* input, U* output, const U* scale, + const uint8_t* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales); +} + +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const Int4x2* input, U* output, const U* scale, + const Int4x2* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast(input), output, scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, batch_size, n_scales); +} + +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale, + const UInt4x2* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast(input), output, scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, batch_size, n_scales); +} + #if !defined(DISABLE_FLOAT8_TYPES) -template -typename std::enable_if, T>::value, Status>::type -CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales) { +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale, + const Float8E4M3FN* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales); +} + +template +Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale, + const Float8E5M2* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales); } #endif +template +Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale, + const UInt4x2* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size) { + return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast(input), output, scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, K, N, block_size); +} + +template +Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const Int4x2* input, U* output, const U* scale, + const Int4x2* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size) { + return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast(input), output, scale, + zero_point ? reinterpret_cast(zero_point) : nullptr, + num_of_element, K, N, block_size); +} + template Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { typedef typename ToCudaType::MappedType CudaU; @@ -120,6 +406,7 @@ Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { auto* y_zero_point = ctx->Input(2); const auto& x_shape = x.Shape(); + const auto num_of_elements = x_shape.Size(); auto& y = *ctx->Output(0, x_shape); @@ -131,12 +418,11 @@ Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data() : nullptr; const CudaU* scale = reinterpret_cast(y_scale.Data()); - const auto num_of_elements = x_shape.Size(); ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements)); return Status::OK(); - } else { + } else if (block_size_ == 0) { // per axis quantization ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1); ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() && y_zero_point->Shape().NumDimensions() == 1), "scale and zero_point must have the same shape."); ORT_ENFORCE(x_shape.NumDimensions() > 1); @@ -145,11 +431,31 @@ Status DequantizeLinear::ComputeInternal(OpKernelContext* ctx) const { const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data() : nullptr; const CudaU* scale = reinterpret_cast(y_scale.Data()); - const auto num_of_elements = x_shape.Size(); ORT_RETURN_IF_ERROR(CudaDequantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements, x_shape.SizeToDimension(axis), y_scale.Shape().Size())); return Status::OK(); + } else { // blocked quantization + // validate shape + auto axis_no_neg = SafeInt(HandleNegativeAxis(axis_, x_shape.NumDimensions())); + const auto& y_scale_shape = y_scale.Shape(); + + ValidateBlockQuantizationShapes(x_shape, + y_scale_shape, + y_zero_point, + axis_no_neg, + block_size_); + + // compute + const T* zero_point = y_zero_point ? y_zero_point->Data() : nullptr; + const CudaU* scale = reinterpret_cast(y_scale.Data()); + + ORT_RETURN_IF_ERROR(CudaDequantizeLinearBlockInt4(Stream(ctx), input, output, scale, zero_point, + num_of_elements, x_shape[axis_no_neg], + x_shape.SizeFromDimension(axis_no_neg + 1), + block_size_)); + + return Status::OK(); } } @@ -183,33 +489,54 @@ REGISTER_Q_KERNEL_TYPED_10_12(uint8_t) REGISTER_Q_KERNEL_TYPED_13_18(int8_t) REGISTER_Q_KERNEL_TYPED_13_18(uint8_t) -#define REGISTER_Q_KERNEL_TYPED_19(T) \ - ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ - QuantizeLinear, \ - kOnnxDomain, \ - 19, \ - T, float, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - QuantizeLinear); \ - ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ - QuantizeLinear, \ - kOnnxDomain, \ - 19, \ - T, MLFloat16, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - QuantizeLinear); +#define REGISTER_Q_KERNEL_TWO_TYPED_19_20(T, U) \ + ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ + QuantizeLinear, \ + kOnnxDomain, \ + 19, 20, \ + T, U, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + QuantizeLinear); -REGISTER_Q_KERNEL_TYPED_19(int8_t) -REGISTER_Q_KERNEL_TYPED_19(uint8_t) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, float) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, float) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16) #if !defined(DISABLE_FLOAT8_TYPES) -REGISTER_Q_KERNEL_TYPED_19(Float8E4M3FN) -REGISTER_Q_KERNEL_TYPED_19(Float8E5M2) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, float) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16) +REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16) +#endif + +#define REGISTER_Q_KERNEL_TWO_TYPED_21(T, U) \ + ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ + QuantizeLinear, \ + kOnnxDomain, \ + 21, \ + T, U, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + QuantizeLinear); + +REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, float) +REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, float) +REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16) +REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, MLFloat16) +REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, float) +REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, float) +REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16) +REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16) +#if !defined(DISABLE_FLOAT8_TYPES) +REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, float) +REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, float) +REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16) +REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16) #endif // register DequantizeLinear kernels @@ -240,33 +567,54 @@ REGISTER_DQ_KERNEL_TYPED_10_12(uint8_t) REGISTER_DQ_KERNEL_TYPED_13_18(int8_t) REGISTER_DQ_KERNEL_TYPED_13_18(uint8_t) -#define REGISTER_DQ_KERNEL_TYPED_19(T) \ - ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ - DequantizeLinear, \ - kOnnxDomain, \ - 19, \ - T, float, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - DequantizeLinear); \ - ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ - DequantizeLinear, \ - kOnnxDomain, \ - 19, \ - T, MLFloat16, \ - kCudaExecutionProvider, \ - (*KernelDefBuilder::Create()) \ - .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ - .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ - DequantizeLinear); +#define REGISTER_DQ_KERNEL_TWO_TYPED_19_20(T, U) \ + ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + 19, 20, \ + T, U, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + DequantizeLinear); -REGISTER_DQ_KERNEL_TYPED_19(int8_t) -REGISTER_DQ_KERNEL_TYPED_19(uint8_t) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16) #if !defined(DISABLE_FLOAT8_TYPES) -REGISTER_DQ_KERNEL_TYPED_19(Float8E4M3FN) -REGISTER_DQ_KERNEL_TYPED_19(Float8E5M2) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16) +#endif + +#define REGISTER_DQ_KERNEL_TWO_TYPED_21(T, U) \ + ONNX_OPERATOR_TWO_TYPED_KERNEL_EX( \ + DequantizeLinear, \ + kOnnxDomain, \ + 21, \ + T, U, \ + kCudaExecutionProvider, \ + (*KernelDefBuilder::Create()) \ + .TypeConstraint("T1", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ + DequantizeLinear); + +REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16) +#if !defined(DISABLE_FLOAT8_TYPES) +REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, float) +REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16) +REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16) #endif // specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal @@ -278,6 +626,10 @@ SPECIALIZED_QDQ_COMPUTE(int8_t, float) SPECIALIZED_QDQ_COMPUTE(uint8_t, float) SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16) SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16) +SPECIALIZED_QDQ_COMPUTE(Int4x2, float) +SPECIALIZED_QDQ_COMPUTE(UInt4x2, float) +SPECIALIZED_QDQ_COMPUTE(Int4x2, MLFloat16) +SPECIALIZED_QDQ_COMPUTE(UInt4x2, MLFloat16) #if !defined(DISABLE_FLOAT8_TYPES) SPECIALIZED_QDQ_COMPUTE(Float8E4M3FN, float) diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu index 1da308811f..19b148d919 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu @@ -9,6 +9,7 @@ #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 #include "cuda_fp8.h" +#include "cuda_fp16.h" #endif namespace onnxruntime { @@ -17,9 +18,23 @@ namespace cuda { template struct RoundStd; +template +struct RoundStdInt4; + template struct RoundSat; +template +__device__ __forceinline__ int ExtractInt4FromByte(T byte, int index) { + return static_cast((byte >> (index << 2)) & 0x0f); +} + +template <> +__device__ __forceinline__ int ExtractInt4FromByte(int8_t byte, int index) { + constexpr auto shift = (sizeof(int) << 3) - 4; + return (static_cast(((byte >> (index << 2)) & 0x0f)) << shift) >> shift; +} + template <> struct RoundStd { __device__ __forceinline__ int8_t operator()(float v, float scale, int8_t zero_point) const { @@ -28,6 +43,22 @@ struct RoundStd { } }; +template <> +struct RoundStdInt4 { + __device__ __forceinline__ int8_t operator()(float v0, + float v1, + float scale0, + float scale1, + int zp0, + int zp1) const { + int value0 = __float2int_rn(v0 / scale0) + zp0; + int value1 = __float2int_rn(v1 / scale1) + zp1; + int value0_clip = max(-8, min(7, value0)); + int value1_clip = max(-8, min(7, value1)); + return static_cast((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4)); + } +}; + template <> struct RoundStd { __device__ __forceinline__ uint8_t operator()(float v, float scale, uint8_t zero_point) const { @@ -36,6 +67,22 @@ struct RoundStd { } }; +template <> +struct RoundStdInt4 { + __device__ __forceinline__ uint8_t operator()(float v0, + float v1, + float scale0, + float scale1, + int zp0, + int zp1) const { + int value0 = __float2int_rn(v0 / scale0) + zp0; + int value1 = __float2int_rn(v1 / scale1) + zp1; + int value0_clip = max(0, min(15, value0)); + int value1_clip = max(0, min(15, value1)); + return static_cast((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4)); + } +}; + #if !defined(DISABLE_FLOAT8_TYPES) #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080 @@ -104,7 +151,7 @@ struct RoundSat { #endif -#endif // DISABLE_FLOAT8_TYPES +#endif // DISABLE_FLOAT8_TYPES template <> struct RoundStd { @@ -114,6 +161,26 @@ struct RoundStd { } }; +template <> +struct RoundStdInt4 { + __device__ __forceinline__ int8_t operator()(half v0, + half v1, + half scale0, + half scale1, + int zp0, + int zp1) const { + half2 v = __halves2half2(v0, v1); + half2 scale = __halves2half2(scale0, scale1); + half2 scaled_v = v / scale; + + int value0 = __half2int_rn(__low2half(scaled_v)) + zp0; + int value1 = __half2int_rn(__high2half(scaled_v)) + zp1; + int value0_clip = max(-8, min(7, value0)); + int value1_clip = max(-8, min(7, value1)); + return static_cast((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4)); + } +}; + template <> struct RoundStd { __device__ __forceinline__ int8_t operator()(half v, half scale, uint8_t zero_point) const { @@ -122,6 +189,26 @@ struct RoundStd { } }; +template <> +struct RoundStdInt4 { + __device__ __forceinline__ uint8_t operator()(half v0, + half v1, + half scale0, + half scale1, + int zp0, + int zp1) const { + half2 v = __halves2half2(v0, v1); + half2 scale = __halves2half2(scale0, scale1); + half2 scaled_v = v / scale; + + int value0 = __half2int_rn(__low2half(scaled_v)) + zp0; + int value1 = __half2int_rn(__high2half(scaled_v)) + zp1; + int value0_clip = max(0, min(15, value0)); + int value1_clip = max(0, min(15, value1)); + return static_cast((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4)); + } +}; + template __global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, RoundStd round) { CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; @@ -137,6 +224,29 @@ __global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const In } } +// cuda kernel for int4 per tensor quantization with standard rounding +// OutT is int8_t for Int4x2 and uint8_t for UInt4x2 +// NumElementsPerThread must be multiple of 2. +template +__global__ void QuantizeLinearKernelStdInt4(const InT* input, OutT* output, const InT* scale_ptr, + const OutT* zero_point_ptr, CUDA_LONG N, + RoundStdInt4 round) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1); + InT scale = *scale_ptr; + int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0; + int i = 0; + constexpr int step = NumThreadsPerBlock << 1; + +#pragma unroll + for (; i + 1 < NumElementsPerThread && id + 1 < N; i += 2, id += step) { + output[id >> 1] = round(input[id], input[id + 1], scale, scale, zero_point, zero_point); + } + + if (i < NumElementsPerThread && id < N) { + output[id >> 1] = round(input[id], 0.0, scale, 1.0, zero_point, 0); + } +} + template __global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, size_t batch_size, size_t n_scales, RoundStd round) { CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; @@ -154,6 +264,97 @@ __global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, cons } } +// cuda kernel for int4 per axis quantization with standard rounding +// OutT is int8_t for Int4x2 and uint8_t for UInt4x2 +// NumElementsPerThread must be multiple of 2. +template +__global__ void QuantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const InT* scale_ptr, + const OutT* zero_point_ptr, CUDA_LONG num_element, + size_t batch_size, size_t n_scales, + RoundStdInt4 round) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1); + // Process continuous NumElementsPerThread int4 per thread. + int i = 0; + // The scale needs to change every n_same_scale. + CUDA_LONG n_same_scale = num_element / (batch_size * n_scales); + constexpr int step = NumThreadsPerBlock << 1; + +#pragma unroll + for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) { + int scale_id0 = (id / n_same_scale) % n_scales; + int scale_id1 = ((id + 1) / n_same_scale) % n_scales; + int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1); + int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1); + output[id >> 1] = round(input[id], + input[id + 1], + scale_ptr[scale_id0], + scale_ptr[scale_id1], + zp0, + zp1); + } + + if (i < NumElementsPerThread && id < num_element) { + int scale_id0 = (id / n_same_scale) % n_scales; + int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1); + output[id >> 1] = round(input[id], + 0.0, + scale_ptr[scale_id0], + 1.0, + zp0, + 0); + } +} + +// cuda kernel for int4 block-wise quantization with standard rounding +// OutT is int8_t for Int4x2 and uint8_t for UInt4x2 +// NumElementsPerThread must be multiple of 2. +template +__global__ void QuantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const InT* scale_ptr, + const OutT* zero_point_ptr, CUDA_LONG num_element, size_t KN, + size_t N, size_t scale_KN, size_t block_size, + RoundStdInt4 round) { + // Process continuous NumElementsPerThread int4 per thread. + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1); + int i = 0; + constexpr int step = NumThreadsPerBlock << 1; + +#pragma unroll + // Process two elements which belong to one byte at a time. + for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) { + int x0 = id / KN, x1 = (id + 1) / KN; + int y0 = id % KN / N, y1 = (id + 1) % KN / N; + int z0 = id % N, z1 = (id + 1) % N; + int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0; + int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1; + output[id >> 1] = round(input[id], + input[id + 1], + scale_ptr[scale_id0], + scale_ptr[scale_id1], + zero_point_ptr == nullptr + ? 0 + : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1), + zero_point_ptr == nullptr + ? 0 + : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1)); + } + + // last non-paired element + if (i < NumElementsPerThread && id < num_element) { + int x0 = id / KN; + int y0 = id % KN / N; + int z0 = id % N; + int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0; + output[id >> 1] = round(input[id], + 0.0, + scale_ptr[scale_id0], + 1.0, + zero_point_ptr == nullptr + ? 0 + : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1), + 0); + } +} + #if !defined(DISABLE_FLOAT8_TYPES) template @@ -207,6 +408,27 @@ Status CudaQuantizeLinearStd(cudaStream_t stream, const InT* input, OutT* output return Status::OK(); } +template +Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, + const OutT* zero_point, size_t num_of_element) { + if (num_of_element <= 0) + return Status::OK(); + + static_assert((GridDim::maxElementsPerThread & 1) == 0); + + int blocksPerGrid = static_cast(CeilDiv(num_of_element, + GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + QuantizeLinearKernelStdInt4 + <<>>( + input, + output, + scale, + zero_point, + static_cast(num_of_element), + RoundStdInt4()); + return Status::OK(); +} + template Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales) { @@ -226,6 +448,59 @@ Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* ou return Status::OK(); } +template +Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, + const OutT* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + if (num_of_element <= 0) + return Status::OK(); + + static_assert((GridDim::maxElementsPerThread & 1) == 0); + + int blocksPerGrid = static_cast(CeilDiv(num_of_element, + GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + QuantizeLinearKernelAxisStdInt4 + <<>>( + input, + output, + scale, + zero_point, + static_cast(num_of_element), + batch_size, + n_scales, + RoundStdInt4()); + return Status::OK(); +} + +template +Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, + const OutT* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size) { + if (num_of_element <= 0) + return Status::OK(); + + static_assert((GridDim::maxElementsPerThread & 1) == 0); + + size_t KN = K * N; + size_t num_block = (K + block_size - 1) / block_size; + size_t scale_KN = num_block * N; + int blocksPerGrid = static_cast(CeilDiv(num_of_element, + GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + QuantizeLinearKernelBlockStdInt4 + <<>>( + input, + output, + scale, + zero_point, + static_cast(num_of_element), + KN, + N, + scale_KN, + block_size, + RoundStdInt4()); + return Status::OK(); +} + #if !defined(DISABLE_FLOAT8_TYPES) template @@ -282,6 +557,29 @@ __global__ void DequantizeLinearKernelStd(const InT* input, OutT* output, const } } +template +__global__ void DequantizeLinearKernelStdInt4(const InT* input, OutT* output, const OutT* scale_ptr, + const InT* zero_point_ptr, CUDA_LONG num_element) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1); + + OutT scale = *scale_ptr; + int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0; + int i = 0, v0, v1; + constexpr int step = NumThreadsPerBlock << 1; +#pragma unroll + for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) { + v0 = ExtractInt4FromByte(input[id >> 1], 0); + v1 = ExtractInt4FromByte(input[id >> 1], 1); + output[id] = static_cast(v0 - zero_point) * scale; + output[id + 1] = static_cast(v1 - zero_point) * scale; + } + + if (i < NumElementsPerThread && id < num_element) { + v0 = ExtractInt4FromByte(input[id >> 1], 0); + output[id] = static_cast(v0 - zero_point) * scale; + } +} + template __global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N, size_t batch_size, size_t n_scales) { @@ -300,6 +598,80 @@ __global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, co } } +template +__global__ void DequantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const OutT* scale_ptr, + const InT* zero_point_ptr, CUDA_LONG num_element, + size_t batch_size, size_t n_scales) { + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1); + // The scale needs to change every n_same_scale. + CUDA_LONG n_same_scale = num_element / (batch_size * n_scales); + int i = 0; + int scale_id0, scale_id1, zp0, zp1, v0, v1; + constexpr int step = NumThreadsPerBlock << 1; + +#pragma unroll + for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) { + scale_id0 = (id / n_same_scale) % n_scales; + scale_id1 = ((id + 1) / n_same_scale) % n_scales; + + v0 = ExtractInt4FromByte(input[id >> 1], 0); + v1 = ExtractInt4FromByte(input[id >> 1], 1); + zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1); + zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1); + output[id] = static_cast(v0 - zp0) * scale_ptr[scale_id0]; + output[id + 1] = static_cast(v1 - zp1) * scale_ptr[scale_id1]; + } + + if (i < NumElementsPerThread && id < num_element) { + scale_id0 = (id / n_same_scale) % n_scales; + v0 = ExtractInt4FromByte(input[id >> 1], 0); + zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1); + output[id] = static_cast(v0 - zp0) * scale_ptr[scale_id0]; + } +} + +// cuda kernel for int4 block-wise dequantization with standard rounding +// IntT is int8_t for Int4x2 and uint8_t for UInt4x2 +// NumElementsPerThread must be multiple of 2. +template +__global__ void DequantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const OutT* scale_ptr, + const InT* zero_point_ptr, CUDA_LONG num_element, + size_t KN, size_t N, size_t scale_KN, size_t block_size) { + // Process continuous NumElementsPerThread int4 per thread. + CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1); + int i = 0; + constexpr int step = NumThreadsPerBlock << 1; + +#pragma unroll + // Process two elements which belong to one byte at a time. + for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) { + int x0 = id / KN, x1 = (id + 1) / KN; + int y0 = id % KN / N, y1 = (id + 1) % KN / N; + int z0 = id % N, z1 = (id + 1) % N; + int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0; + int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1; + + int v0 = ExtractInt4FromByte(input[id >> 1], 0); + int v1 = ExtractInt4FromByte(input[id >> 1], 1); + int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1); + int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1); + output[id] = static_cast(v0 - zp0) * scale_ptr[scale_id0]; + output[id + 1] = static_cast(v1 - zp1) * scale_ptr[scale_id1]; + } + + // last non-paired element + if (i < NumElementsPerThread && id < num_element) { + int x0 = id / KN; + int y0 = id % KN / N; + int z0 = id % N; + int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0; + + int v0 = ExtractInt4FromByte(input[id >> 1], 0); + int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1); + output[id] = static_cast(v0 - zp0) * scale_ptr[scale_id0]; + } +} + template struct DQFloat8; @@ -422,6 +794,26 @@ Status CudaDequantizeLinearStd(cudaStream_t stream, const InT* input, OutT* outp return Status::OK(); } +template +Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, + const InT* zero_point, size_t num_of_element) { + if (num_of_element <= 0) + return Status::OK(); + + static_assert((GridDim::maxElementsPerThread & 1) == 0); + + int blocksPerGrid = static_cast(CeilDiv(num_of_element, + GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + DequantizeLinearKernelStdInt4 + <<>>( + input, + output, + scale, + zero_point, + static_cast(num_of_element)); + return Status::OK(); +} + template Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales) { @@ -440,6 +832,57 @@ Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* return Status::OK(); } +template +Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, + const InT* zero_point, size_t num_of_element, + size_t batch_size, size_t n_scales) { + if (num_of_element <= 0) + return Status::OK(); + + static_assert((GridDim::maxElementsPerThread & 1) == 0); + + int blocksPerGrid = static_cast(CeilDiv(num_of_element, + GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + DequantizeLinearKernelAxisStdInt4 + <<>>( + input, + output, + scale, + zero_point, + static_cast(num_of_element), + batch_size, + n_scales); + return Status::OK(); +} + +template +Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size) { + if (num_of_element <= 0) + return Status::OK(); + + static_assert((GridDim::maxElementsPerThread & 1) == 0); + + size_t KN = K * N; + size_t num_block = (K + block_size - 1) / block_size; + size_t scale_KN = num_block * N; + int blocksPerGrid = static_cast(CeilDiv(num_of_element, + GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread)); + DequantizeLinearKernelBlockStdInt4 + <<>>( + input, + output, + scale, + zero_point, + static_cast(num_of_element), + KN, + N, + scale_KN, + block_size); + return Status::OK(); +} + #if !defined(DISABLE_FLOAT8_TYPES) template @@ -481,11 +924,24 @@ template Status CudaQuantizeLinearStd(cudaStream_t stream, const template Status CudaQuantizeLinearStd(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); template Status CudaQuantizeLinearStd(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element); template Status CudaQuantizeLinearStd(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); template Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); + +template Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); +template Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); +template Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); +template Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); #if !defined(DISABLE_FLOAT8_TYPES) @@ -505,11 +961,24 @@ template Status CudaDequantizeLinearStd(cudaStream_t stream, cons template Status CudaDequantizeLinearStd(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); template Status CudaDequantizeLinearStd(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element); template Status CudaDequantizeLinearStd(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element); +template Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element); template Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); +template Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); + +template Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); +template Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); +template Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); +template Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size); #if !defined(DISABLE_FLOAT8_TYPES) diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh index e8cd5d416f..cd14625d58 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh @@ -11,33 +11,96 @@ namespace onnxruntime { namespace cuda { template -Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element); template -Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, - bool saturate); +Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element); template -Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales); +Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element, bool saturate); template -Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales, bool saturate); +Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template -Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); template -Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element); +Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales, + bool saturate); + +/** + * @brief block-wise quantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization + * axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi) + * = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index + * is xi * ceil(K / block_size) * N + yi / block_size * N + zi. + * @tparam T quantized type, int8_t for Int4x2, uint8_t for UInt4x2 + * @tparam U full precision type + * @param stream cuda stream + * @param input input tensor + * @param output output tensor + * @param scale scale tensor + * @param zero_point zero point tensor + * @param num_of_element number of elements in input tensor + * @param K K + * @param N N + * @param block_size block size + */ +template +Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size); template -Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales); +Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element); template -Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element, - size_t batch_size, size_t n_scales); +Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element); +template +Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, + size_t num_of_element); + +template +Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, + size_t num_of_element, size_t batch_size, size_t n_scales); + +template +Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales); + +template +Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, + size_t num_of_element, size_t batch_size, size_t n_scales); + +/** + * @brief block-wise dequantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization + * axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi) + * = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index + * is xi * ceil(K / block_size) * N + yi / block_size * N + zi. + * @tparam T quantized type, int8_t for Int4x2, uint8_t for UInt4x2 + * @tparam U full precision type + * @param stream cuda stream + * @param input input tensor + * @param output output tensor + * @param scale scale tensor + * @param zero_point zero point tensor + * @param num_of_element number of elements in input tensor + * @param K K + * @param N N + * @param block_size block size + */ +template +Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale, + const T* zero_point, size_t num_of_element, size_t K, size_t N, + size_t block_size); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h index 86036f28ef..2d44fcc522 100644 --- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h +++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h @@ -19,6 +19,11 @@ class QuantizeLinear final : public CudaKernel { if (!info.GetAttr("saturate", &saturate_).IsOK()) { saturate_ = 1; } + if (!info.GetAttr("block_size", &block_size_).IsOK()) { + block_size_ = 0; + } + + ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative."); } Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override; @@ -26,6 +31,7 @@ class QuantizeLinear final : public CudaKernel { private: int64_t axis_; int64_t saturate_; + int64_t block_size_; }; template @@ -35,12 +41,18 @@ class DequantizeLinear final : public CudaKernel { if (!info.GetAttr("axis", &axis_).IsOK()) { axis_ = 1; } + if (!info.GetAttr("block_size", &block_size_).IsOK()) { + block_size_ = 0; + } + + ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative."); } Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override; private: int64_t axis_; + int64_t block_size_; }; } // namespace cuda diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc index c1cedd4750..7d741a6604 100644 --- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc +++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc @@ -1370,18 +1370,26 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint64_t, Cast); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, + float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, + float, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, + MLFloat16, DequantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, + MLFloat16, DequantizeLinear); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear); -class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, + float, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, + float, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, + MLFloat16, QuantizeLinear); +class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, + MLFloat16, QuantizeLinear); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Shape); @@ -1390,6 +1398,24 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, S class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN); +// Opset 21 +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, + DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, + DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, + DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, + DequantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, + QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, + QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, + QuantizeLinear); +class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, + QuantizeLinear); + template <> KernelCreateInfo BuildKernelCreateInfo() { return {}; @@ -2333,19 +2359,19 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -2354,6 +2380,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) { // opset 20 BuildKernelCreateInfo, BuildKernelCreateInfo, + + // opset 21 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index a0bc73a478..dd6f024247 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -1289,6 +1289,10 @@ struct Tensor final { template <> inline bool Tensor::IsDataType() const { return g_host->Tensor__IsDataType_bool(this); } template <> +inline bool Tensor::IsDataType() const { return g_host->Tensor__IsDataType_Int4x2(this); } +template <> +inline bool Tensor::IsDataType() const { return g_host->Tensor__IsDataType_UInt4x2(this); } +template <> inline bool Tensor::IsDataType() const { return g_host->Tensor__IsDataType_int8(this); } template <> inline bool Tensor::IsDataType() const { return g_host->Tensor__IsDataType_uint8(this); } @@ -1327,6 +1331,10 @@ inline bool Tensor::IsDataType() const { return g_host->Tensor__ template <> inline bool* Tensor::MutableData() { return g_host->Tensor__MutableData_bool(this); } template <> +inline Int4x2* Tensor::MutableData() { return g_host->Tensor__MutableData_Int4x2(this); } +template <> +inline UInt4x2* Tensor::MutableData() { return g_host->Tensor__MutableData_UInt4x2(this); } +template <> inline int8_t* Tensor::MutableData() { return g_host->Tensor__MutableData_int8(this); } template <> inline uint8_t* Tensor::MutableData() { return g_host->Tensor__MutableData_uint8(this); } @@ -1365,6 +1373,10 @@ inline Float8E5M2FNUZ* Tensor::MutableData() { return g_host->Te template <> inline const bool* Tensor::Data() const { return g_host->Tensor__Data_bool(this); } template <> +inline const Int4x2* Tensor::Data() const { return g_host->Tensor__Data_Int4x2(this); } +template <> +inline const UInt4x2* Tensor::Data() const { return g_host->Tensor__Data_UInt4x2(this); } +template <> inline const int8_t* Tensor::Data() const { return g_host->Tensor__Data_int8(this); } template <> inline const uint8_t* Tensor::Data() const { return g_host->Tensor__Data_uint8(this); } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 03a71868a3..756cc4159e 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "core/common/inlined_containers_fwd.h" #include "core/common/span_utils.h" @@ -140,7 +141,8 @@ void TransformerTester(const std::function& buil double relative_per_sample_tolerance, std::unique_ptr transformer, const std::function& add_session_options, - const InlinedHashSet& disabled_optimizers) { + const InlinedHashSet& disabled_optimizers, + std::unique_ptr ep) { // Build the model for this test. std::unordered_map domain_to_version; domain_to_version[kOnnxDomain] = opset_version; @@ -157,6 +159,7 @@ void TransformerTester(const std::function& buil // Serialize the model to a string. std::string model_data; model.ToProto().SerializeToString(&model_data); + std::shared_ptr ep_shared = ep ? std::move(ep) : nullptr; auto run_model = [&](TransformerLevel level, std::vector& fetches, std::unique_ptr transformer = nullptr) { @@ -170,6 +173,10 @@ void TransformerTester(const std::function& buil add_session_options(session_options); } InferenceSessionWrapper session{session_options, GetEnvironment()}; + if (ep_shared) { + ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep_shared)); + } + ASSERT_STATUS_OK(session.Load(model_data.data(), static_cast(model_data.size()))); if (transformer) { ASSERT_STATUS_OK(session.RegisterGraphTransformer(std::move(transformer), level)); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index b9af675afe..f641c597ac 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -555,7 +555,8 @@ void TransformerTester(const std::function& buil double relative_per_sample_tolerance = 0.0, std::unique_ptr transformer = nullptr, const std::function& add_session_options = {}, - const InlinedHashSet& disabled_optimizers = {}); + const InlinedHashSet& disabled_optimizers = {}, + std::unique_ptr ep = nullptr); void TransformerTester(const std::function& build_test_case, const std::function& check_transformed_graph, diff --git a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc index 3d11779410..e9c7b11fe9 100644 --- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc @@ -17,6 +17,7 @@ #include "test/optimizer/qdq_test_utils.h" #include "test/optimizer/graph_transform_test_builder.h" #include "test/util/include/asserts.h" +#include "test/util/include/default_providers.h" #include "test/util/include/inference_session_wrapper.h" #include "gtest/gtest.h" @@ -55,7 +56,8 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector& input1_shape, const std::vector& input2_shape, const int64_t axis, const int64_t block_size, - int64_t accuracy_level) { + int64_t accuracy_level, + std::unique_ptr ep = nullptr) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input1_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); auto* input2_arg = builder.MakeInput(input2_shape, T(T::min_val, 0), T(T::max_val, 0)); @@ -104,7 +106,9 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector& input1_shape, 1e-5 /*per_sample_tolerance*/, 1e-5 /*relative_per_sample_tolerance*/, nullptr, - add_session_options_fn); + add_session_options_fn, + {}, + ep ? std::move(ep) : nullptr); } TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) { @@ -127,6 +131,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) { RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1); } +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ_Cuda) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); + ; + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_NonConstDQ({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); +} + // Input2 // | // DQ / @@ -140,7 +165,8 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector& weight_shape, const std::vector& input2_shape, const int64_t axis, const int64_t block_size, - int64_t accuracy_level) { + int64_t accuracy_level, + std::unique_ptr ep = nullptr) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0)); auto* input2_arg = builder.MakeInput(input2_shape, -100.0f, 100.0f); @@ -189,7 +215,9 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector& weight_shape, 1e-5 /*per_sample_tolerance*/, 1e-5 /*relative_per_sample_tolerance*/, nullptr, - add_session_options_fn); + add_session_options_fn, + {}, + ep ? std::move(ep) : nullptr); } TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) { @@ -212,6 +240,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) { RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1); } +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput_Cuda) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); + ; + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_FirstDQInput({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider()); +} + // Input1 // | // \ DQ @@ -224,7 +273,8 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector& input const std::vector& weight_shape, const int64_t axis, const int64_t block_size, - int64_t accuracy_level) { + int64_t accuracy_level, + std::unique_ptr ep = nullptr) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); auto* output_arg = builder.MakeOutput(); @@ -287,7 +337,9 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector& input 1e-5 /*per_sample_tolerance*/, 1e-5 /*relative_per_sample_tolerance*/, nullptr, - add_session_options_fn); + add_session_options_fn, + {}, + ep ? std::move(ep) : nullptr); } TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_TypeMismatch) { @@ -327,6 +379,31 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch) { RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0); } +TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch_Cuda) { + // DQ contrib op schema is not updated to support blocked quantization + // block size too small + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider()); + // block size not 2's power + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider()); + ; + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider()); + // not axis 0 + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider()); + // not rank 2 + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulNotConverted_TypeShapeMismatch({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); +} + // Input1 // | DQ // \ / @@ -343,7 +420,8 @@ RunDQMatMulConverted(const std::vector& input1_shape, const std::vector& weight2_shape, const int64_t axis, const int64_t block_size, - int64_t accuracy_level) { + int64_t accuracy_level, + std::unique_ptr ep = nullptr) { auto build_test_case = [&](ModelTestBuilder& builder) { auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f); auto* output_arg = builder.MakeOutput(); @@ -402,9 +480,11 @@ RunDQMatMulConverted(const std::vector& input1_shape, TransformerLevel::Level2, 21 /*opset_version*/, 1e-5 /*per_sample_tolerance*/, - 1e-5 /*relative_per_sample_tolerance*/, + 2e-5 /*relative_per_sample_tolerance*/, nullptr, - add_session_options_fn); + add_session_options_fn, + {}, + ep ? std::move(ep) : nullptr); } TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) { @@ -419,6 +499,18 @@ TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) { RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1); } +TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits_Cuda) { + // DQ contrib op schema is not updated to support blocked quantization + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); + RunDQMatMulConverted({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider()); +} + #endif // !defined(DISABLE_CONTRIB_OPS) } // namespace test diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index cc34f7e18c..51aae0cfd4 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -869,7 +869,8 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size, template void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, int64_t scale_block_count, - int64_t zero_point_block_count) { + int64_t zero_point_block_count, + std::unique_ptr ep = nullptr) { OpTester test("DequantizeLinear", 21); std::vector dims{2, 4}; std::vector x_scale, y; @@ -877,7 +878,7 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, SessionOptions so; std::vector log_msgs; // redirect error messages std::vector> eps; - eps.push_back(DefaultCpuExecutionProvider()); + eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider()); so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { ORT_UNUSED_PARAMETER(severity); @@ -970,6 +971,13 @@ TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int) { DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); } +TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2, DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) { constexpr int min_cuda_architecture = 11080; @@ -1013,6 +1021,13 @@ TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) { DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); } +TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1, DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) { constexpr int min_cuda_architecture = 11080; @@ -1052,6 +1067,13 @@ TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) { DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); } +TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3, DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1, DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) { constexpr int min_cuda_architecture = 11080; @@ -1081,14 +1103,14 @@ void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector&& dims, std::vector& x_, std::vector& x_scale_, std::vector& x_zero_point_, - std::vector& y_) { + std::vector& y_, + std::unique_ptr ep = nullptr) { OpTester test("DequantizeLinear", 21); std::vector x_scale_shape; std::vector x_scale, y; std::vector x, x_zero_point; std::vector> eps; - eps.push_back(DefaultCpuExecutionProvider()); - + eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider()); int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; bool use_zero_point = !x_zero_point_.empty(); @@ -1216,6 +1238,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, 8.0}; + std::vector y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, 8.0}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; @@ -1237,6 +1276,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, 2.0, 0, 3.5, 3.0, 10.0, 16.0, -10.5, 15}; + std::vector y_3{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, -14.0, -24, 21, 5, 10.0, 16.0, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) { std::vector zero_point{}; std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; @@ -1262,6 +1318,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis_Cuda) { + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, 8}; + std::vector y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, 8}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; @@ -1283,6 +1356,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2, 8, -2, 0, 0, -1, 7, 1, 2, 0, 6, 8, -3.5, 1, -10.5, 15}; + std::vector y_3{2, 8, -2, 0, -6, -8, 7, 1, 2, 0, 6, 8, 10, 16, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) { std::vector zero_point{}; std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; @@ -1308,6 +1398,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis_Cuda) { + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, 8}; + std::vector y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, 8}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; @@ -1329,6 +1436,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2, 0, 4, 0, 0, 3.5, 0, 1, 2, 4, 4, 8, -3.5, -7, 0, 15}; + std::vector y_3{2, 0, -2, 0, 0, 3.5, 7, 1, 2, 4, 6, 8, -3.5, -7, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{}; @@ -1350,6 +1474,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -4, 7, 3, -8, -20, 21, 7, 16, 36, -35, -11, 24, 52, -49, -15}; + std::vector y_3{0, -4, 7, 3, -8, -20, 21, 7, -16, -36, 35, 11, 24, 52, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; @@ -1371,6 +1512,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, -4, 3.5, -6, -4, -20, 17.5, -2, -10, 16, 3.5, -5, -2, 32, -10.5, -9}; + std::vector y_3{4, -4, 3.5, -6, -4, -20, 17.5, -2, -12, -36, 31.5, 2, -2, 32, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{}; @@ -1392,6 +1550,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15}; + std::vector y_3{0, -4, -4, -12, -8, -20, 21, 7, 16, 36, 20, 44, 24, 52, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; @@ -1413,6 +1588,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9}; + std::vector y_3{4, -4, 0, -12, -4, -20, 17.5, -2, -10, 16, -6, 24, -2, 32, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{}; @@ -1434,6 +1626,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -2, -8, -12, 14, 17.5, 6, 7, 16, 18, 40, 44, -42, -45.5, -14, -15}; + std::vector y_3{0, -2, -4, -12, 14, 17.5, 21, 7, 16, 18, 20, 44, -42, -45.5, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) { std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; @@ -1455,6 +1664,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) { DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); } +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis_Cuda) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, 2, -8, -12, 10.5, 14, -3, -2, -10, -8, 20, 24, -3.5, -7, -8, -9}; + std::vector y_3{4, 2, 0, -12, 10.5, 14, 17.5, -2, -10, -8, -6, 24, -3.5, -7, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_FirstAxis) { constexpr int min_cuda_architecture = 11080; @@ -1624,7 +1850,8 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size, template void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, int64_t scale_block_count, - int64_t zero_point_block_count) { + int64_t zero_point_block_count, + std::unique_ptr ep = nullptr) { OpTester test("QuantizeLinear", 21); std::vector dims{2, 4}; std::vector x_zero_point, y; @@ -1632,7 +1859,7 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, SessionOptions so; std::vector log_msgs; // redirect error messages std::vector> eps; - eps.push_back(DefaultCpuExecutionProvider()); + eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider()); so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { ORT_UNUSED_PARAMETER(severity); @@ -1725,6 +1952,13 @@ TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int) { QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); } +TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) { + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2, DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) { constexpr int min_cuda_architecture = 11080; @@ -1768,6 +2002,13 @@ TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) { QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); } +TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) { + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1, DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) { constexpr int min_cuda_architecture = 11080; @@ -1807,6 +2048,13 @@ TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) { QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); } +TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) { + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3, DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1, DefaultCudaExecutionProvider()); +} + #if !defined(DISABLE_FLOAT8_TYPES) TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) { constexpr int min_cuda_architecture = 11080; @@ -1836,14 +2084,14 @@ void QuantizeLinearOp21BlockedTest_Int4_Succeed(std::vector&& dims, std::vector& x_, std::vector& scale_, std::vector& zero_point_, - std::vector& y_) { + std::vector& y_, + std::unique_ptr ep = nullptr) { OpTester test("QuantizeLinear", 21); std::vector scale_shape; std::vector zero_point, y; std::vector x, scale; std::vector> eps; - eps.push_back(DefaultCpuExecutionProvider()); - + eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider()); int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; bool use_zero_point = !zero_point_.empty(); @@ -1970,6 +2218,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, + 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector x{14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, + 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, + 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, + 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0}; + std::vector y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, + -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8}; + std::vector y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, + -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, + -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, + 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) { std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; @@ -2022,6 +2298,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, + 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1, + 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7}; + std::vector x{2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, + -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, + 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, + 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15}; + std::vector y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, + -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, + 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8}; + std::vector y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, + -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, + -7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2, + 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) { std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; @@ -2074,6 +2378,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis_Cuda) { + std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, + 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0}; + std::vector x{14, 24, 14, 24, 14, 24, 14, 24, 10, 16, 10, 16, 10, 16, 10, 16, + -10.5, -2, -10.5, -2, -10.5, -2, -10.5, -2, -3.5, 0, -3.5, 0, -3.5, 0, -3.5, 0, + 2, 8, 2, 8, 2, 8, 2, 8, 6, 16, 6, 16, 6, 16, 6, 16, + -17.5, -6, -17.5, -6, -17.5, -6, -17.5, -6, -24.5, 8, -24.5, 8, -24.5, 8, -24.5, 8}; + std::vector y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4, + -3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0, + 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, + 5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8}; + std::vector y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4, + 5, 0, 5, 0, 5, 0, 5, 0, -1, 0, -1, 0, -1, 0, -1, 0, + 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, + -8, -2, -8, -2, -8, -2, -8, -2, 7, -8, 7, -8, 7, -8, 7, -8}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) { std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; @@ -2126,6 +2458,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis_Cuda) { + std::vector zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1, + 0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7}; + std::vector y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, + 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0}; + std::vector x{2, 8, 2, 8, 2, 8, 2, 8, -2, 0, -2, 0, -2, 0, -2, 0, + 0, -1, 0, -1, 0, -1, 0, -1, 7, 1, 7, 1, 7, 1, 7, 1, + 2, 0, 2, 0, 2, 0, 2, 0, 6, 8, 6, 8, 6, 8, 6, 8, + -3.5, 1, -3.5, 1, -3.5, 1, -3.5, 1, -10.5, 15, -10.5, 15, -10.5, 15, -10.5, 15}; + std::vector y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4, + -3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0, + 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, + 5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8}; + std::vector y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4, + -6, -4, -6, -4, -6, -4, -6, -4, -1, 0, -1, 0, -1, 0, -1, 0, + 1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, + -2, 2, -2, 2, -2, 2, -2, 2, 7, -8, 7, -8, 7, -8, 7, -8}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) { std::vector zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1, 0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7}; @@ -2178,6 +2538,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis_Cuda) { + std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, + 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0}; + std::vector x{14, 12, 14, 12, 20, 16, 20, 16, 14, 12, 14, 12, 20, 16, 20, 16, + -10.5, -7, -10.5, -7, -1, 0, -1, 0, -10.5, -7, -10.5, -7, -1, 0, -1, 0, + 2, 4, 2, 4, 12, 16, 12, 16, 2, 4, 2, 4, 12, 16, 12, 16, + -17.5, -21, -17.5, -21, -7, 8, -7, 8, -17.5, -21, -17.5, -21, -7, 8, -7, 8}; + std::vector y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4, + -3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0, + 1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4, + 5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8}; + std::vector y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4, + -3, -2, -3, -2, 0, 0, -1, 0, -3, -2, -3, -2, 0, 0, -1, 0, + 1, 2, 1, 2, 6, 4, 3, 4, 1, 2, 1, 2, 6, 4, 3, 4, + 5, 6, 5, 6, 2, -8, 7, -8, 5, 6, 5, 6, 2, -8, 7, -8}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) { std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; @@ -2230,6 +2618,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, + 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -6, -4, -3, -1, -3, -1, + 0, 2, 0, 2, 4, 7, 4, 7}; + std::vector x{2, 0, 2, 0, 4, 0, 4, 0, 2, 0, 2, 0, 4, 0, 4, 0, + 0, 3.5, 0, 3.5, 0, 1, 0, 1, 0, 3.5, 0, 3.5, 0, 1, 0, 1, + 2, 4, 2, 4, 4, 8, 4, 8, 2, 4, 2, 4, 4, 8, 4, 8, + -3.5, -7, -3.5, -7, 0, 15, 0, 15, -3.5, -7, -3.5, -7, 0, 15, 0, 15}; + std::vector y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4, + -3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0, + 1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4, + 5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8}; + std::vector y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4, + -3, -2, -3, -2, -3, 0, -1, 0, -3, -2, -3, -2, -3, 0, -1, 0, + 1, 2, 1, 2, 2, 4, 3, 4, 1, 2, 1, 2, 2, 4, 3, 4, + 5, 6, 5, 6, 4, -8, 7, -8, 5, 6, 5, 6, 4, -8, 7, -8}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) { std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0}; @@ -2282,6 +2698,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, + 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector x{0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3, + -8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7, + 16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11, + 24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15}; + std::vector y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15}; + std::vector y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) { std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; @@ -2334,6 +2778,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, + 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9, + 13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6}; + std::vector x{4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6, + -4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2, + -10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5, + -2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9}; + std::vector y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15}; + std::vector y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, + 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, + 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, + 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) { std::vector y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0}; @@ -2386,6 +2858,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, + -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector x{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15, + 0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15, + 0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15, + 0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15}; + std::vector y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_3{0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15, + 0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15, + 0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15, + 0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) { std::vector y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; @@ -2438,6 +2938,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, + -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6, + 2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9, + 4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9, + 4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9, + 4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9}; + std::vector y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_3{0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15, + 0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15, + 0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15, + 0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) { std::vector y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; @@ -2490,6 +3018,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, + 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0}; + std::vector zero_point{0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector x{0, -2, 0, -2, -8, -12, -8, -12, 0, -2, 0, -2, -8, -12, -8, -12, + 14, 17.5, 14, 17.5, 6, 7, 6, 7, 14, 17.5, 14, 17.5, 6, 7, 6, 7, + 16, 18, 16, 18, 40, 44, 40, 44, 16, 18, 16, 18, 40, 44, 40, 44, + -42, -45.5, -42, -45.5, -14, -15, -14, -15, -42, -45.5, -42, -45.5, -14, -15, -14, -15}; + std::vector y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7, + 8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11, + 12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15}; + std::vector y_3{0, 1, 0, 1, 4, 3, 2, 3, 0, 1, 0, 1, 4, 3, 2, 3, + 4, 5, 4, 5, 2, 7, 6, 7, 4, 5, 4, 5, 2, 7, 6, 7, + 8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11, + 12, 13, 12, 13, 4, 15, 14, 15, 12, 13, 12, 13, 4, 15, 14, 15}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) { std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0}; @@ -2542,6 +3098,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis) { QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3); } +TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis_Cuda) { + std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, + 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 2, 0, 1, 9, 1, 9, + 13, 5, 13, 5, 11, 6, 11, 6}; + std::vector x{4, 2, 4, 2, -8, -12, -8, -12, 4, 2, 4, 2, -8, -12, -8, -12, + 10.5, 14, 10.5, 14, -3, -2, -3, -2, 10.5, 14, 10.5, 14, -3, -2, -3, -2, + -10, -8, -10, -8, 20, 24, 20, 24, -10, -8, -10, -8, 20, 24, 20, 24, + -3.5, -7, -3.5, -7, -8, -9, -8, -9, -3.5, -7, -3.5, -7, -8, -9, -8, -9}; + std::vector y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, + 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7, + 8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11, + 12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15}; + std::vector y_3{0, 1, 0, 1, 6, 3, 2, 3, 0, 1, 0, 1, 6, 3, 2, 3, + 4, 5, 4, 5, 0, 7, 6, 7, 4, 5, 4, 5, 0, 7, 6, 7, + 8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11, + 12, 13, 12, 13, 13, 15, 14, 15, 12, 13, 12, 13, 13, 15, 14, 15}; + + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2, + DefaultCudaExecutionProvider()); + QuantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3, + DefaultCudaExecutionProvider()); +} + TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) { std::vector y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};