[CUDA] Support CUDA EP blocked quantization in Q/DQ ops. (#21846)

### Description 1. Added CUDA EP support for blocked quantization in QuantizeLinear and DequantizeLinear ops. 2. Currently CUDA EP blocked quantization only supports int4/uint4 quantized types and float32/float16 unquantized types. 3. Added CUDA EP support in QDQ selector/action transformer. CUDA EP is only added to DQ + MatMul -> MatMulNBits rule. Other rules' EP support are not changed. ### Motivation and Context ONNX opset 21 introduced blocked quantization for Q/DQ opts. ORT originally only supports CPU EP blocked quantization.
2026-06-20 02:07:56 +00:00 · 2024-08-30 18:28:00 -07:00 · 2024-08-30 18:28:00 -07:00 · 5dee95fa10
commit 5dee95fa10
parent 60b07623a2
14 changed files with 1922 additions and 193 deletions
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@ -587,7 +587,8 @@ Do not modify directly.*
 |DepthToSpace|*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[1, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
-|DequantizeLinear|*in* x:**T**<br> *in* x_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *out* y:**tensor(float)**<br><br>or<br><br>*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|19+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
+|DequantizeLinear|*in* x:**T**<br> *in* x_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *out* y:**tensor(float)**<br><br>or<br><br>*in* x:**T1**<br> *in* x_scale:**T2**<br> *in* x_zero_point:**T1**<br> *out* y:**T2**|21+|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
+|||[19, 20]|**T1** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
 |||[13, 18]|**T** = tensor(int8), tensor(uint8)|
 |||[10, 12]|**T** = tensor(int8), tensor(uint8)|
 |Div|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T**|14+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
@ -718,7 +719,8 @@ Do not modify directly.*
 |||[13, 14]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)<br/> **T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
 |||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)<br/> **T1** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64)|
 |||[7, 11]|**T** = tensor(double), tensor(float), tensor(float16)|
-|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**<br><br>or<br><br>*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|19+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
+|QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**<br><br>or<br><br>*in* x:**T1**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int4), tensor(int8), tensor(uint4), tensor(uint8)|
+|||[19, 20]|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e5m2), tensor(int8), tensor(uint8)|
 |||[13, 18]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
 |||[10, 12]|**T1** = tensor(float)<br/> **T2** = tensor(int8), tensor(uint8)|
 |RNN|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc
@ -23,7 +23,10 @@ void SplitQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {
  const std::string action_name{"dropSplitQDQ"};
  std::unique_ptr<Action> action = std::make_unique<QDQ::SplitReplaceWithQuant>();
 #if !defined(ORT_MINIMAL_BUILD)
-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::SplitSelector>(true /*req_equal_quant_params*/);
+  std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::SplitSelector>(true /*req_equal_quant_params*/,
+                                                                                false,
+                                                                                providers);
  qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
                                                         {{"Split", {}}},
                                                         std::move(selector),
@ -63,14 +66,18 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
  //
  // And cannot eliminate the QDQ for MaxPool if the scale is not positive, as a negative
  // scale will change the ordering of the elements between quantized & de-quantized values.
-  std::unique_ptr<NodeSelector> selector_no_16bit = std::make_unique<QDQ::DropQDQNodesSelector>(false);
+  std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
+  std::unique_ptr<NodeSelector> selector_no_16bit = std::make_unique<QDQ::DropQDQNodesSelector>(false,
+                                                                                                false,
+                                                                                                true,
+                                                                                                providers);
  qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_name,
                                                         {{"Resize", {}}},
                                                         std::move(selector_no_16bit),
                                                         std::move(drop_action_no_int16));

  std::unique_ptr<NodeSelector> selector_no_16bit_and_positive_scale =
-      std::make_unique<QDQ::DropQDQNodesSelector>(false, true, false);
+      std::make_unique<QDQ::DropQDQNodesSelector>(false, true, false, providers);
  qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name,
                                                         {{"MaxPool", {12}},
                                                          {"ReduceMax", {}},
@ -78,7 +85,7 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {
                                                         std::move(selector_no_16bit_and_positive_scale),
                                                         std::move(drop_action_no_int16_and_positive_scale));

-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropQDQNodesSelector>(true);
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropQDQNodesSelector>(true, false, true, providers);
  // DepthToSpace and SpaceToDepth not included because there are no integer implementations.
  // https://github.com/microsoft/onnxruntime/issues/21287
  qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name,
@ -117,7 +124,8 @@ void DropDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) {

 #if !defined(ORT_MINIMAL_BUILD)
  // TODO: Enable 16-bit types in selector when ArgMax supports 16-bit integer input tensors.
-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQNodesSelector>();
+  std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DropDQNodesSelector>(false, false, providers);
  qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
                                                         {{"ArgMax", {}}},
                                                         std::move(selector),
@ -200,7 +208,8 @@ void VariadicOpQDQRules(SelectorActionRegistry& qdq_selector_action_registry) {

 #if !defined(ORT_MINIMAL_BUILD)
  // TODO: Enable 16-bit types in selector when QLinearConcat supports 16-bit.
-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::InputVariadicSelector>();
+  std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::InputVariadicSelector>(false, false, providers);

  qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
                                                         {{"Concat", {}}},
@ -222,7 +231,11 @@ void ConvQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool is_

 #if !defined(ORT_MINIMAL_BUILD)
  // TODO: Enable 16-bit types in selector when QLinearConv supports 16-bit.
-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed);
+  std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::ConvSelector>(is_int8_allowed,
+                                                                               false,
+                                                                               false,
+                                                                               providers);

  qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
                                                         {{"Conv", {}}},
@ -245,7 +258,11 @@ void MatMulQDQRules(SelectorActionRegistry& qdq_selector_action_registry, bool i

 #if !defined(ORT_MINIMAL_BUILD)
  // TODO: Enable 16-bit types in selector when QLinearMatMul and MatMulInteger support 16-bit.
-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed);
+  std::vector<const char*> providers = {kCpuExecutionProvider, kDmlExecutionProvider};
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::MatMulSelector>(is_int8_allowed,
+                                                                                 false,
+                                                                                 false,
+                                                                                 providers);
  qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
                                                         {{"MatMul", {}}},
                                                         std::move(selector),
@ -272,7 +289,8 @@ void DQMatMulToMatMulNBitsRules(SelectorActionRegistry& qdq_selector_action_regi
                                                         p_buffered_tensors);

 #if !defined(ORT_MINIMAL_BUILD)
-  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DQMatMulToMatMulNBitsSelector>();
+  std::vector<const char*> providers = {kCpuExecutionProvider, kCudaExecutionProvider};
+  std::unique_ptr<NodeSelector> selector = std::make_unique<QDQ::DQMatMulToMatMulNBitsSelector>(providers);
  qdq_selector_action_registry.RegisterSelectorAndAction(action_name,
                                                         {{"MatMul", {}}},
                                                         std::move(selector),
@ -363,8 +381,9 @@ QDQSelectorActionTransformer::QDQSelectorActionTransformer(
          CreateSelectorActionRegistry(is_int8_allowed, qdq_matmulnbits_accuracy_level,
                                       intra_op_thread_pool, p_buffered_tensors),
          apply_context,
-          // this transformer is only compatible with the CPU and DML EP
-          {kCpuExecutionProvider, kDmlExecutionProvider}} {
+          // this transformer is compatible with CPU, DML and CUDA EP.
+          // There is further EP control on the rule level.
+          {kCpuExecutionProvider, kDmlExecutionProvider, kCudaExecutionProvider}} {
 }

 }  // namespace onnxruntime
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h
@ -302,14 +302,20 @@ class BaseSelector : public NodeSelector {

 class DropQDQNodesSelector : public BaseSelector {
 public:
-  explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, bool allow_nonpositive_scale = true)
-      : BaseSelector(std::make_unique<DropQDQNodeGroupSelector>(allow_16bit, allow_4bit, allow_nonpositive_scale)) {}
+  explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false,
+                                bool allow_nonpositive_scale = true,
+                                gsl::span<const char*> compatible_providers = {})
+      : BaseSelector(std::make_unique<DropQDQNodeGroupSelector>(allow_16bit, allow_4bit, allow_nonpositive_scale),
+                     compatible_providers) {}
 };

 class DropDQNodesSelector : public BaseSelector {
 public:
-  explicit DropDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false)
-      : BaseSelector(std::make_unique<DropDQNodeGroupSelector>(allow_16bit, allow_4bit)) {}
+  explicit DropDQNodesSelector(bool allow_16bit = false,
+                               bool allow_4bit = false,
+                               gsl::span<const char*> compatible_providers = {})
+      : BaseSelector(std::make_unique<DropDQNodeGroupSelector>(allow_16bit, allow_4bit),
+                     compatible_providers) {}
 };

 class UnarySelector : public BaseSelector {
@ -329,8 +335,11 @@ class BinarySelector : public BaseSelector {
 // Variadic DQ nodes -> node -> Q
 class InputVariadicSelector : public BaseSelector {
 public:
-  explicit InputVariadicSelector(bool allow_16bit = false, bool allow_4bit = false)
-      : BaseSelector(std::make_unique<VariadicNodeGroupSelector>(allow_16bit, allow_4bit)) {}
+  explicit InputVariadicSelector(bool allow_16bit = false,
+                                 bool allow_4bit = false,
+                                 gsl::span<const char*> compatible_providers = {})
+      : BaseSelector(std::make_unique<VariadicNodeGroupSelector>(allow_16bit, allow_4bit),
+                     compatible_providers) {}

  void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
 };
@ -338,8 +347,10 @@ class InputVariadicSelector : public BaseSelector {
 //  DQ -> Split -> variadic Q nodes
 class SplitSelector : public BaseSelector {
 public:
-  SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false)
-      : BaseSelector(std::make_unique<SplitNodeGroupSelector>(req_equal_quant_params, allow_4bit)) {}
+  SplitSelector(bool req_equal_quant_params = false, bool allow_4bit = false,
+                gsl::span<const char*> compatible_providers = {})
+      : BaseSelector(std::make_unique<SplitNodeGroupSelector>(req_equal_quant_params, allow_4bit),
+                     compatible_providers) {}

  void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
 };
@ -347,8 +358,10 @@ class SplitSelector : public BaseSelector {
 // DQ nodes for X, W and optionally B -> node -> Q
 class ConvSelector : public BaseSelector {
 public:
-  ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false)
-      : BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight)) {}
+  ConvSelector(bool int8_allowed = false, bool allow_16bit = false, bool allow_4bit_weight = false,
+               gsl::span<const char*> compatible_providers = {})
+      : BaseSelector(std::make_unique<ConvNodeGroupSelector>(int8_allowed, allow_16bit, allow_4bit_weight),
+                     compatible_providers) {}

  void UpdateBuilder(NodesToOptimizeIndicesBuilder&) const override;
 };
@ -363,9 +376,11 @@ class WhereSelector : public BaseSelector {
 // 2 DQ nodes for input -> node -> optional Q if QLinearMatMul, MatMulIntegerToFloat if not
 class MatMulSelector : public BaseSelector {
 public:
-  MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false)
+  MatMulSelector(bool int8_allowed, bool allow_16bit = false, bool allow_4bit = false,
+                 gsl::span<const char*> compatible_providers = {})
      : BaseSelector(std::make_unique<MatMulNodeGroupSelector>(int8_allowed, /*matmulintegertofloat_allowed*/ true,
-                                                               allow_16bit, allow_4bit)) {}
+                                                               allow_16bit, allow_4bit),
+                     compatible_providers) {}
 };

 // Convert "1 DQ node for input B -> MatMul" to "MatMulNBits"
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -4,6 +4,7 @@

 #include "core/common/inlined_containers.h"
 #include "core/common/parse_string.h"
+#include "core/framework/int4.h"
 #include "core/providers/shared_library/provider_api.h"
 #include "core/platform/env_var_utils.h"
 #include "core/providers/cuda/cuda_execution_provider.h"
@ -1348,38 +1349,37 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast);
 #endif

-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear);
 #if !defined(DISABLE_FLOAT8_TYPES)
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear);
 #endif
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear);
 #if !defined(DISABLE_FLOAT8_TYPES)
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear);
 #endif

 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear);
 #if !defined(DISABLE_FLOAT8_TYPES)
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear);
 #endif
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear);
 #if !defined(DISABLE_FLOAT8_TYPES)
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear);
 #endif
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Scan);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape);
-#endif

 // Opset 20
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu);
@ -1388,6 +1388,40 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN);

+// Opset 21.
+// TODO(fajin): support other quantized types
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear);
+#if !defined(DISABLE_FLOAT8_TYPES)
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear);
+#endif
+
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear);
+#if !defined(DISABLE_FLOAT8_TYPES)
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear);
+#endif
+
+#endif
+
 template <>
 KernelCreateInfo BuildKernelCreateInfo<void>() {
  return {};
@ -2265,34 +2299,34 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, Cast)>,
 #endif

-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear)>,
 #if !defined(DISABLE_FLOAT8_TYPES)
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, DequantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, DequantizeLinear)>,
 #endif
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear)>,
 #if !defined(DISABLE_FLOAT8_TYPES)
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, DequantizeLinear)>,
 #endif

      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Identity)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, If)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Loop)>,

-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear)>,
 #if !defined(DISABLE_FLOAT8_TYPES)
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, float, QuantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, float, QuantizeLinear)>,
 #endif
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear)>,
 #if !defined(DISABLE_FLOAT8_TYPES)
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Float8E5M2, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, 20, Float8E5M2, MLFloat16, QuantizeLinear)>,
 #endif

      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Reshape)>,
@ -2305,6 +2339,37 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsInf)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, IsNaN)>,
+
+      // Opset 21
+      // TODO(fajin): support other quantized types
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, DequantizeLinear)>,
+#if !defined(DISABLE_FLOAT8_TYPES)
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, DequantizeLinear)>,
+#endif
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, UInt4x2, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Int4x2, MLFloat16, QuantizeLinear)>,
+#if !defined(DISABLE_FLOAT8_TYPES)
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, float, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, MLFloat16, QuantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 21, Float8E5M2, MLFloat16, QuantizeLinear)>,
+#endif
 #endif
  };

--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cc
@ -7,36 +7,181 @@
 namespace onnxruntime {
 namespace cuda {

+void ValidateBlockQuantizationShapes(const TensorShape& input_shape,
+                                     const TensorShape& scale_shape,
+                                     const Tensor* zero_point,
+                                     size_t axis_no_neg,
+                                     int64_t block_size_) {
+  ORT_ENFORCE(scale_shape.NumDimensions() == input_shape.NumDimensions(),
+              "scale and input must have the same rank for blocked quantization");
+
+  for (size_t i = 0, ndim = input_shape.NumDimensions(); i < ndim; ++i) {
+    if (i == static_cast<size_t>(axis_no_neg)) {
+      ORT_ENFORCE(scale_shape[i] == (input_shape[i] + block_size_ - 1) / block_size_,
+                  "scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization");
+    } else {
+      ORT_ENFORCE(scale_shape[i] == input_shape[i],
+                  "scale and input must have the same shape despite the quantize axis for blocked quantization");
+    }
+  }
+
+  if (zero_point) {
+    ORT_ENFORCE(zero_point->Shape() == scale_shape,
+                "zero_point and scale must have the same shape for blocked quantization");
+  }
+}
+
 template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
-CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool /*saturate*/) {
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
+                          size_t num_of_element, bool /*saturate*/) {
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(input);
+  ORT_UNUSED_PARAMETER(output);
+  ORT_UNUSED_PARAMETER(scale);
+  ORT_UNUSED_PARAMETER(zero_point);
+  ORT_UNUSED_PARAMETER(num_of_element);
+  ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
+}
+
+template <class T, class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
+                              size_t num_of_element, size_t batch_size, size_t n_scales, bool /*saturate*/) {
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(input);
+  ORT_UNUSED_PARAMETER(output);
+  ORT_UNUSED_PARAMETER(scale);
+  ORT_UNUSED_PARAMETER(zero_point);
+  ORT_UNUSED_PARAMETER(num_of_element);
+  ORT_UNUSED_PARAMETER(batch_size);
+  ORT_UNUSED_PARAMETER(n_scales);
+  ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
+}
+
+template <typename T, typename U>
+Status CudaQuantizeLinearBlock(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point,
+                               size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(input);
+  ORT_UNUSED_PARAMETER(output);
+  ORT_UNUSED_PARAMETER(scale);
+  ORT_UNUSED_PARAMETER(zero_point);
+  ORT_UNUSED_PARAMETER(num_of_element);
+  ORT_UNUSED_PARAMETER(K);
+  ORT_UNUSED_PARAMETER(N);
+  ORT_UNUSED_PARAMETER(block_size);
+  ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
+}
+
+template <class U>
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, int8_t* output, const U* scale,
+                          const int8_t* zero_point, size_t num_of_element, bool /*saturate*/) {
  return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
 }

+template <class U>
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, uint8_t* output, const U* scale,
+                          const uint8_t* zero_point, size_t num_of_element, bool /*saturate*/) {
+  return CudaQuantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
+}
+
+template <class U>
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Int4x2* output, const U* scale,
+                          const Int4x2* zero_point, size_t num_of_element, bool /*saturate*/) {
+  return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
+                                   zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
+                                   num_of_element);
+}
+
+template <class U>
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale,
+                          const UInt4x2* zero_point, size_t num_of_element, bool /*saturate*/) {
+  return CudaQuantizeLinearStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
+                                   zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
+                                   num_of_element);
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)

-template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
-CudaQuantizeLinear(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element, bool saturate) {
+template <class U>
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale,
+                          const Float8E4M3FN* zero_point, size_t num_of_element, bool saturate) {
  return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate);
 }

-template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
-CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
-                       size_t batch_size, size_t n_scales, bool saturate) {
-  return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales, saturate);
+template <class U>
+Status CudaQuantizeLinear(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale,
+                          const Float8E5M2* zero_point, size_t num_of_element, bool saturate) {
+  return CudaQuantizeLinearSat(stream, input, output, scale, zero_point, num_of_element, saturate);
+}
+
+template <class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E4M3FN* output, const U* scale,
+                              const Float8E4M3FN* zero_point, size_t num_of_element,
+                              size_t batch_size, size_t n_scales, bool saturate) {
+  return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size,
+                                   n_scales, saturate);
+}
+
+template <class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Float8E5M2* output, const U* scale,
+                              const Float8E5M2* zero_point, size_t num_of_element,
+                              size_t batch_size, size_t n_scales, bool saturate) {
+  return CudaQuantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size,
+                                   n_scales, saturate);
 }

 #endif

-template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
-CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
-                       size_t batch_size, size_t n_scales, bool /*saturate*/) {
+template <class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, int8_t* output, const U* scale,
+                              const int8_t* zero_point, size_t num_of_element,
+                              size_t batch_size, size_t n_scales, bool /*saturate*/) {
  return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
 }

+template <class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, uint8_t* output, const U* scale,
+                              const uint8_t* zero_point, size_t num_of_element,
+                              size_t batch_size, size_t n_scales, bool /*saturate*/) {
+  return CudaQuantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
+}
+
+template <class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, Int4x2* output, const U* scale,
+                              const Int4x2* zero_point, size_t num_of_element,
+                              size_t batch_size, size_t n_scales, bool /*saturate*/) {
+  return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
+                                       zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
+                                       num_of_element, batch_size, n_scales);
+}
+
+template <class U>
+Status CudaQuantizeLinearAxis(cudaStream_t stream, const U* input, UInt4x2* output, const U* scale,
+                              const UInt4x2* zero_point, size_t num_of_element,
+                              size_t batch_size, size_t n_scales, bool /*saturate*/) {
+  return CudaQuantizeLinearAxisStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
+                                       zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
+                                       num_of_element, batch_size, n_scales);
+}
+
+template <typename U>
+Status CudaQuantizeLinearBlock(cudaStream_t stream,
+                               const U* input, Int4x2* output, const U* scale, const Int4x2* zero_point,
+                               size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
+  return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast<int8_t*>(output), scale,
+                                        zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
+                                        num_of_element, K, N, block_size);
+}
+
+template <typename U>
+Status CudaQuantizeLinearBlock(cudaStream_t stream,
+                               const U* input, UInt4x2* output, const U* scale, const UInt4x2* zero_point,
+                               size_t num_of_element, size_t K, size_t N, size_t block_size, bool /*saturate*/) {
+  return CudaQuantizeLinearBlockStdInt4(stream, input, reinterpret_cast<uint8_t*>(output), scale,
+                                        zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
+                                        num_of_element, K, N, block_size);
+}
+
 template <class T, class U>
 Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToCudaType<U>::MappedType CudaU;
@ -48,21 +193,22 @@ Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  auto& y = *ctx->Output(0, x.Shape());

  const auto& x_shape = x.Shape();
+  const auto num_of_elements = x_shape.Size();

  const CudaU* input = reinterpret_cast<const CudaU*>(x.Data<U>());
  T* output = y.MutableData<T>();

-  if (IsScalarOr1ElementVector(&y_scale)) {
+  if (IsScalarOr1ElementVector(&y_scale)) {  // per-tensor quantization
    ORT_ENFORCE(y_zero_point == nullptr || IsScalarOr1ElementVector(y_zero_point),
                "y_zero_point must be a scalar or 1D tensor of size 1.");
+    ORT_ENFORCE(block_size_ == 0, "block_size must be 0 for per-tensor quantization.");

    const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
    const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
-    const auto num_of_elements = x_shape.Size();

    ORT_RETURN_IF_ERROR(CudaQuantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements, saturate_));
    return Status::OK();
-  } else {
+  } else if (block_size_ == 0) {  // per-axis quantization
    ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1);
    ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() &&
                                            y_zero_point->Shape().NumDimensions() == 1),
@ -73,44 +219,184 @@ Status QuantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {

    const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
    const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
-    const auto num_of_elements = x_shape.Size();

    ORT_RETURN_IF_ERROR(CudaQuantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements,
                                               x_shape.SizeToDimension(axis), y_scale.Shape().Size(), saturate_));
    return Status::OK();
+  } else {  // blocked quantization
+    // validate shape
+    size_t axis_no_neg = SafeInt<size_t>(HandleNegativeAxis(axis_, x_shape.NumDimensions()));
+    const auto& y_scale_shape = y_scale.Shape();
+
+    ValidateBlockQuantizationShapes(x_shape,
+                                    y_scale_shape,
+                                    y_zero_point,
+                                    axis_no_neg,
+                                    block_size_);
+
+    // compute
+    const T* zero_point = y_zero_point ? y_zero_point->Data<T>() : nullptr;
+    const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
+
+    ORT_RETURN_IF_ERROR(CudaQuantizeLinearBlock(Stream(ctx), input, output, scale, zero_point,
+                                                num_of_elements, x_shape[axis_no_neg],
+                                                x_shape.SizeFromDimension(axis_no_neg + 1),
+                                                block_size_, saturate_));
+    return Status::OK();
  }
 }

 template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
-CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
+Status CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale,
+                            const T* zero_point, size_t num_of_element) {
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(input);
+  ORT_UNUSED_PARAMETER(output);
+  ORT_UNUSED_PARAMETER(scale);
+  ORT_UNUSED_PARAMETER(zero_point);
+  ORT_UNUSED_PARAMETER(num_of_element);
+  ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
+}
+
+template <class T, class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale,
+                                const T* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(input);
+  ORT_UNUSED_PARAMETER(output);
+  ORT_UNUSED_PARAMETER(scale);
+  ORT_UNUSED_PARAMETER(zero_point);
+  ORT_UNUSED_PARAMETER(num_of_element);
+  ORT_UNUSED_PARAMETER(batch_size);
+  ORT_UNUSED_PARAMETER(n_scales);
+  ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
+}
+
+template <class T, class U>
+Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
+                                     const T* zero_point, size_t num_of_element, size_t K, size_t N,
+                                     size_t block_size) {
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(input);
+  ORT_UNUSED_PARAMETER(output);
+  ORT_UNUSED_PARAMETER(scale);
+  ORT_UNUSED_PARAMETER(zero_point);
+  ORT_UNUSED_PARAMETER(num_of_element);
+  ORT_UNUSED_PARAMETER(K);
+  ORT_UNUSED_PARAMETER(N);
+  ORT_UNUSED_PARAMETER(block_size);
+  ORT_NOT_IMPLEMENTED("Unsupported quantization type.");
+}
+
+template <class U>
+Status CudaDequantizeLinear(cudaStream_t stream, const int8_t* input, U* output, const U* scale,
+                            const int8_t* zero_point, size_t num_of_element) {
  return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
 }

+template <class U>
+Status CudaDequantizeLinear(cudaStream_t stream, const uint8_t* input, U* output, const U* scale,
+                            const uint8_t* zero_point, size_t num_of_element) {
+  return CudaDequantizeLinearStd(stream, input, output, scale, zero_point, num_of_element);
+}
+
+template <class U>
+Status CudaDequantizeLinear(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
+                            const Int4x2* zero_point, size_t num_of_element) {
+  return CudaDequantizeLinearStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
+                                     zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
+                                     num_of_element);
+}
+
+template <class U>
+Status CudaDequantizeLinear(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
+                            const UInt4x2* zero_point, size_t num_of_element) {
+  return CudaDequantizeLinearStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
+                                     zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
+                                     num_of_element);
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
-template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
-CudaDequantizeLinear(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element) {
+template <class U>
+Status CudaDequantizeLinear(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale,
+                            const Float8E4M3FN* zero_point, size_t num_of_element) {
+  return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element);
+}
+
+template <class U>
+Status CudaDequantizeLinear(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale,
+                            const Float8E5M2* zero_point, size_t num_of_element) {
  return CudaDequantizeLinearSat(stream, input, output, scale, zero_point, num_of_element);
 }
 #endif

-template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<int8_t, uint8_t>, T>::value, Status>::type
-CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
-                         size_t batch_size, size_t n_scales) {
+template <class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const int8_t* input, U* output, const U* scale,
+                                const int8_t* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
  return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
 }

+template <class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const uint8_t* input, U* output, const U* scale,
+                                const uint8_t* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
+  return CudaDequantizeLinearAxisStd(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
+}
+
+template <class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
+                                const Int4x2* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
+  return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
+                                         zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
+                                         num_of_element, batch_size, n_scales);
+}
+
+template <class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
+                                const UInt4x2* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
+  return CudaDequantizeLinearAxisStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
+                                         zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
+                                         num_of_element, batch_size, n_scales);
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
-template <class T, class U>
-typename std::enable_if<boost::mp11::mp_set_contains<TypeList<Float8E4M3FN, Float8E5M2>, T>::value, Status>::type
-CudaDequantizeLinearAxis(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
-                         size_t batch_size, size_t n_scales) {
+template <class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E4M3FN* input, U* output, const U* scale,
+                                const Float8E4M3FN* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
+  return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
+}
+
+template <class U>
+Status CudaDequantizeLinearAxis(cudaStream_t stream, const Float8E5M2* input, U* output, const U* scale,
+                                const Float8E5M2* zero_point, size_t num_of_element,
+                                size_t batch_size, size_t n_scales) {
  return CudaDequantizeLinearAxisSat(stream, input, output, scale, zero_point, num_of_element, batch_size, n_scales);
 }
 #endif

+template <class U>
+Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const UInt4x2* input, U* output, const U* scale,
+                                     const UInt4x2* zero_point, size_t num_of_element, size_t K, size_t N,
+                                     size_t block_size) {
+  return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast<const uint8_t*>(input), output, scale,
+                                          zero_point ? reinterpret_cast<const uint8_t*>(zero_point) : nullptr,
+                                          num_of_element, K, N, block_size);
+}
+
+template <class U>
+Status CudaDequantizeLinearBlockInt4(cudaStream_t stream, const Int4x2* input, U* output, const U* scale,
+                                     const Int4x2* zero_point, size_t num_of_element, size_t K, size_t N,
+                                     size_t block_size) {
+  return CudaDequantizeLinearBlockStdInt4(stream, reinterpret_cast<const int8_t*>(input), output, scale,
+                                          zero_point ? reinterpret_cast<const int8_t*>(zero_point) : nullptr,
+                                          num_of_element, K, N, block_size);
+}
+
 template <class T, class U>
 Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  typedef typename ToCudaType<U>::MappedType CudaU;
@ -120,6 +406,7 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {
  auto* y_zero_point = ctx->Input<Tensor>(2);

  const auto& x_shape = x.Shape();
+  const auto num_of_elements = x_shape.Size();

  auto& y = *ctx->Output(0, x_shape);

@ -131,12 +418,11 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {

    const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
    const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
-    const auto num_of_elements = x_shape.Size();

    ORT_RETURN_IF_ERROR(CudaDequantizeLinear(Stream(ctx), input, output, scale, zero_point, num_of_elements));

    return Status::OK();
-  } else {
+  } else if (block_size_ == 0) {  // per axis quantization
    ORT_ENFORCE(y_scale.Shape().NumDimensions() == 1);
    ORT_ENFORCE(y_zero_point == nullptr || (y_scale.Shape().Size() == y_zero_point->Shape().Size() && y_zero_point->Shape().NumDimensions() == 1), "scale and zero_point must have the same shape.");
    ORT_ENFORCE(x_shape.NumDimensions() > 1);
@ -145,11 +431,31 @@ Status DequantizeLinear<T, U>::ComputeInternal(OpKernelContext* ctx) const {

    const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data<T>() : nullptr;
    const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
-    const auto num_of_elements = x_shape.Size();

    ORT_RETURN_IF_ERROR(CudaDequantizeLinearAxis(Stream(ctx), input, output, scale, zero_point, num_of_elements,
                                                 x_shape.SizeToDimension(axis), y_scale.Shape().Size()));
    return Status::OK();
+  } else {  // blocked quantization
+    // validate shape
+    auto axis_no_neg = SafeInt<size_t>(HandleNegativeAxis(axis_, x_shape.NumDimensions()));
+    const auto& y_scale_shape = y_scale.Shape();
+
+    ValidateBlockQuantizationShapes(x_shape,
+                                    y_scale_shape,
+                                    y_zero_point,
+                                    axis_no_neg,
+                                    block_size_);
+
+    // compute
+    const T* zero_point = y_zero_point ? y_zero_point->Data<T>() : nullptr;
+    const CudaU* scale = reinterpret_cast<const CudaU*>(y_scale.Data<U>());
+
+    ORT_RETURN_IF_ERROR(CudaDequantizeLinearBlockInt4(Stream(ctx), input, output, scale, zero_point,
+                                                      num_of_elements, x_shape[axis_no_neg],
+                                                      x_shape.SizeFromDimension(axis_no_neg + 1),
+                                                      block_size_));
+
+    return Status::OK();
  }
 }

@ -183,33 +489,54 @@ REGISTER_Q_KERNEL_TYPED_10_12(uint8_t)
 REGISTER_Q_KERNEL_TYPED_13_18(int8_t)
 REGISTER_Q_KERNEL_TYPED_13_18(uint8_t)

-#define REGISTER_Q_KERNEL_TYPED_19(T)                                     \
-  ONNX_OPERATOR_TWO_TYPED_KERNEL_EX(                                      \
-      QuantizeLinear,                                                     \
-      kOnnxDomain,                                                        \
-      19,                                                                 \
-      T, float,                                                           \
-      kCudaExecutionProvider,                                             \
-      (*KernelDefBuilder::Create())                                       \
-          .TypeConstraint("T1", DataTypeImpl::GetTensorType<float>())     \
-          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()),        \
-      QuantizeLinear<T, float>);                                          \
-  ONNX_OPERATOR_TWO_TYPED_KERNEL_EX(                                      \
-      QuantizeLinear,                                                     \
-      kOnnxDomain,                                                        \
-      19,                                                                 \
-      T, MLFloat16,                                                       \
-      kCudaExecutionProvider,                                             \
-      (*KernelDefBuilder::Create())                                       \
-          .TypeConstraint("T1", DataTypeImpl::GetTensorType<MLFloat16>()) \
-          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()),        \
-      QuantizeLinear<T, MLFloat16>);
+#define REGISTER_Q_KERNEL_TWO_TYPED_19_20(T, U)                    \
+  ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX(                     \
+      QuantizeLinear,                                              \
+      kOnnxDomain,                                                 \
+      19, 20,                                                      \
+      T, U,                                                        \
+      kCudaExecutionProvider,                                      \
+      (*KernelDefBuilder::Create())                                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<U>())  \
+          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
+      QuantizeLinear<T, U>);

-REGISTER_Q_KERNEL_TYPED_19(int8_t)
-REGISTER_Q_KERNEL_TYPED_19(uint8_t)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, float)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, float)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16)
 #if !defined(DISABLE_FLOAT8_TYPES)
-REGISTER_Q_KERNEL_TYPED_19(Float8E4M3FN)
-REGISTER_Q_KERNEL_TYPED_19(Float8E5M2)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, float)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16)
+REGISTER_Q_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16)
+#endif
+
+#define REGISTER_Q_KERNEL_TWO_TYPED_21(T, U)                       \
+  ONNX_OPERATOR_TWO_TYPED_KERNEL_EX(                               \
+      QuantizeLinear,                                              \
+      kOnnxDomain,                                                 \
+      21,                                                          \
+      T, U,                                                        \
+      kCudaExecutionProvider,                                      \
+      (*KernelDefBuilder::Create())                                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<U>())  \
+          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()), \
+      QuantizeLinear<T, U>);
+
+REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, float)
+REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, float)
+REGISTER_Q_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16)
+REGISTER_Q_KERNEL_TWO_TYPED_21(int8_t, MLFloat16)
+REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, float)
+REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, float)
+REGISTER_Q_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16)
+REGISTER_Q_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16)
+#if !defined(DISABLE_FLOAT8_TYPES)
+REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, float)
+REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, float)
+REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16)
+REGISTER_Q_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16)
 #endif

 // register DequantizeLinear kernels
@ -240,33 +567,54 @@ REGISTER_DQ_KERNEL_TYPED_10_12(uint8_t)
 REGISTER_DQ_KERNEL_TYPED_13_18(int8_t)
 REGISTER_DQ_KERNEL_TYPED_13_18(uint8_t)

-#define REGISTER_DQ_KERNEL_TYPED_19(T)                                     \
-  ONNX_OPERATOR_TWO_TYPED_KERNEL_EX(                                       \
-      DequantizeLinear,                                                    \
-      kOnnxDomain,                                                         \
-      19,                                                                  \
-      T, float,                                                            \
-      kCudaExecutionProvider,                                              \
-      (*KernelDefBuilder::Create())                                        \
-          .TypeConstraint("T1", DataTypeImpl::GetTensorType<T>())          \
-          .TypeConstraint("T2", DataTypeImpl::GetTensorType<float>()),     \
-      DequantizeLinear<T, float>);                                         \
-  ONNX_OPERATOR_TWO_TYPED_KERNEL_EX(                                       \
-      DequantizeLinear,                                                    \
-      kOnnxDomain,                                                         \
-      19,                                                                  \
-      T, MLFloat16,                                                        \
-      kCudaExecutionProvider,                                              \
-      (*KernelDefBuilder::Create())                                        \
-          .TypeConstraint("T1", DataTypeImpl::GetTensorType<T>())          \
-          .TypeConstraint("T2", DataTypeImpl::GetTensorType<MLFloat16>()), \
-      DequantizeLinear<T, MLFloat16>);
+#define REGISTER_DQ_KERNEL_TWO_TYPED_19_20(T, U)                   \
+  ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_EX(                     \
+      DequantizeLinear,                                            \
+      kOnnxDomain,                                                 \
+      19, 20,                                                      \
+      T, U,                                                        \
+      kCudaExecutionProvider,                                      \
+      (*KernelDefBuilder::Create())                                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<T>())  \
+          .TypeConstraint("T2", DataTypeImpl::GetTensorType<U>()), \
+      DequantizeLinear<T, U>);

-REGISTER_DQ_KERNEL_TYPED_19(int8_t)
-REGISTER_DQ_KERNEL_TYPED_19(uint8_t)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(int8_t, MLFloat16)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(uint8_t, MLFloat16)
 #if !defined(DISABLE_FLOAT8_TYPES)
-REGISTER_DQ_KERNEL_TYPED_19(Float8E4M3FN)
-REGISTER_DQ_KERNEL_TYPED_19(Float8E5M2)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E4M3FN, MLFloat16)
+REGISTER_DQ_KERNEL_TWO_TYPED_19_20(Float8E5M2, MLFloat16)
+#endif
+
+#define REGISTER_DQ_KERNEL_TWO_TYPED_21(T, U)                      \
+  ONNX_OPERATOR_TWO_TYPED_KERNEL_EX(                               \
+      DequantizeLinear,                                            \
+      kOnnxDomain,                                                 \
+      21,                                                          \
+      T, U,                                                        \
+      kCudaExecutionProvider,                                      \
+      (*KernelDefBuilder::Create())                                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<T>())  \
+          .TypeConstraint("T2", DataTypeImpl::GetTensorType<U>()), \
+      DequantizeLinear<T, U>);
+
+REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(uint8_t, MLFloat16)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(int8_t, MLFloat16)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(UInt4x2, MLFloat16)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(Int4x2, MLFloat16)
+#if !defined(DISABLE_FLOAT8_TYPES)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, float)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E4M3FN, MLFloat16)
+REGISTER_DQ_KERNEL_TWO_TYPED_21(Float8E5M2, MLFloat16)
 #endif

 // specialize QuantizeLinear::ComputeInternal and DequantizeLinear::ComputeInternal
@ -278,6 +626,10 @@ SPECIALIZED_QDQ_COMPUTE(int8_t, float)
 SPECIALIZED_QDQ_COMPUTE(uint8_t, float)
 SPECIALIZED_QDQ_COMPUTE(int8_t, MLFloat16)
 SPECIALIZED_QDQ_COMPUTE(uint8_t, MLFloat16)
+SPECIALIZED_QDQ_COMPUTE(Int4x2, float)
+SPECIALIZED_QDQ_COMPUTE(UInt4x2, float)
+SPECIALIZED_QDQ_COMPUTE(Int4x2, MLFloat16)
+SPECIALIZED_QDQ_COMPUTE(UInt4x2, MLFloat16)

 #if !defined(DISABLE_FLOAT8_TYPES)
 SPECIALIZED_QDQ_COMPUTE(Float8E4M3FN, float)
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cu
@ -9,6 +9,7 @@

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
 #include "cuda_fp8.h"
+#include "cuda_fp16.h"
 #endif

 namespace onnxruntime {
@ -17,9 +18,23 @@ namespace cuda {
 template <typename InT, typename OutT>
 struct RoundStd;

+template <typename InT, typename OutT>
+struct RoundStdInt4;
+
 template <typename InT, typename OutT>
 struct RoundSat;

+template <typename T>
+__device__ __forceinline__ int ExtractInt4FromByte(T byte, int index) {
+  return static_cast<int>((byte >> (index << 2)) & 0x0f);
+}
+
+template <>
+__device__ __forceinline__ int ExtractInt4FromByte<int8_t>(int8_t byte, int index) {
+  constexpr auto shift = (sizeof(int) << 3) - 4;
+  return (static_cast<int>(((byte >> (index << 2)) & 0x0f)) << shift) >> shift;
+}
+
 template <>
 struct RoundStd<float, int8_t> {
  __device__ __forceinline__ int8_t operator()(float v, float scale, int8_t zero_point) const {
@ -28,6 +43,22 @@ struct RoundStd<float, int8_t> {
  }
 };

+template <>
+struct RoundStdInt4<float, int8_t> {
+  __device__ __forceinline__ int8_t operator()(float v0,
+                                               float v1,
+                                               float scale0,
+                                               float scale1,
+                                               int zp0,
+                                               int zp1) const {
+    int value0 = __float2int_rn(v0 / scale0) + zp0;
+    int value1 = __float2int_rn(v1 / scale1) + zp1;
+    int value0_clip = max(-8, min(7, value0));
+    int value1_clip = max(-8, min(7, value1));
+    return static_cast<int8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
+  }
+};
+
 template <>
 struct RoundStd<float, uint8_t> {
  __device__ __forceinline__ uint8_t operator()(float v, float scale, uint8_t zero_point) const {
@ -36,6 +67,22 @@ struct RoundStd<float, uint8_t> {
  }
 };

+template <>
+struct RoundStdInt4<float, uint8_t> {
+  __device__ __forceinline__ uint8_t operator()(float v0,
+                                                float v1,
+                                                float scale0,
+                                                float scale1,
+                                                int zp0,
+                                                int zp1) const {
+    int value0 = __float2int_rn(v0 / scale0) + zp0;
+    int value1 = __float2int_rn(v1 / scale1) + zp1;
+    int value0_clip = max(0, min(15, value0));
+    int value1_clip = max(0, min(15, value1));
+    return static_cast<uint8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
+  }
+};
+
 #if !defined(DISABLE_FLOAT8_TYPES)

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 11080
@ -104,7 +151,7 @@ struct RoundSat<half, Float8E5M2> {

 #endif

-#endif  // DISABLE_FLOAT8_TYPES 
+#endif  // DISABLE_FLOAT8_TYPES

 template <>
 struct RoundStd<half, int8_t> {
@ -114,6 +161,26 @@ struct RoundStd<half, int8_t> {
  }
 };

+template <>
+struct RoundStdInt4<half, int8_t> {
+  __device__ __forceinline__ int8_t operator()(half v0,
+                                               half v1,
+                                               half scale0,
+                                               half scale1,
+                                               int zp0,
+                                               int zp1) const {
+    half2 v = __halves2half2(v0, v1);
+    half2 scale = __halves2half2(scale0, scale1);
+    half2 scaled_v = v / scale;
+
+    int value0 = __half2int_rn(__low2half(scaled_v)) + zp0;
+    int value1 = __half2int_rn(__high2half(scaled_v)) + zp1;
+    int value0_clip = max(-8, min(7, value0));
+    int value1_clip = max(-8, min(7, value1));
+    return static_cast<int8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
+  }
+};
+
 template <>
 struct RoundStd<half, uint8_t> {
  __device__ __forceinline__ int8_t operator()(half v, half scale, uint8_t zero_point) const {
@ -122,6 +189,26 @@ struct RoundStd<half, uint8_t> {
  }
 };

+template <>
+struct RoundStdInt4<half, uint8_t> {
+  __device__ __forceinline__ uint8_t operator()(half v0,
+                                                half v1,
+                                                half scale0,
+                                                half scale1,
+                                                int zp0,
+                                                int zp1) const {
+    half2 v = __halves2half2(v0, v1);
+    half2 scale = __halves2half2(scale0, scale1);
+    half2 scaled_v = v / scale;
+
+    int value0 = __half2int_rn(__low2half(scaled_v)) + zp0;
+    int value1 = __half2int_rn(__high2half(scaled_v)) + zp1;
+    int value0_clip = max(0, min(15, value0));
+    int value1_clip = max(0, min(15, value1));
+    return static_cast<uint8_t>((value0_clip & 0x0f) | ((value1_clip & 0x0f) << 4));
+  }
+};
+
 template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
 __global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, RoundStd<InT, OutT> round) {
  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
@ -137,6 +224,29 @@ __global__ void QuantizeLinearKernelStd(const InT* input, OutT* output, const In
  }
 }

+// cuda kernel for int4 per tensor quantization with standard rounding
+// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
+// NumElementsPerThread must be multiple of 2.
+template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
+__global__ void QuantizeLinearKernelStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
+                                            const OutT* zero_point_ptr, CUDA_LONG N,
+                                            RoundStdInt4<InT, OutT> round) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
+  InT scale = *scale_ptr;
+  int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0;
+  int i = 0;
+  constexpr int step = NumThreadsPerBlock << 1;
+
+#pragma unroll
+  for (; i + 1 < NumElementsPerThread && id + 1 < N; i += 2, id += step) {
+    output[id >> 1] = round(input[id], input[id + 1], scale, scale, zero_point, zero_point);
+  }
+
+  if (i < NumElementsPerThread && id < N) {
+    output[id >> 1] = round(input[id], 0.0, scale, 1.0, zero_point, 0);
+  }
+}
+
 template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
 __global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, const InT* scale_ptr, const OutT* zero_point_ptr, CUDA_LONG N, size_t batch_size, size_t n_scales, RoundStd<InT, OutT> round) {
  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
@ -154,6 +264,97 @@ __global__ void QuantizeLinearKernelAxisStd(const InT* input, OutT* output, cons
  }
 }

+// cuda kernel for int4 per axis quantization with standard rounding
+// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
+// NumElementsPerThread must be multiple of 2.
+template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
+__global__ void QuantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
+                                                const OutT* zero_point_ptr, CUDA_LONG num_element,
+                                                size_t batch_size, size_t n_scales,
+                                                RoundStdInt4<InT, OutT> round) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
+  // Process continuous NumElementsPerThread int4 per thread.
+  int i = 0;
+  // The scale needs to change every n_same_scale.
+  CUDA_LONG n_same_scale = num_element / (batch_size * n_scales);
+  constexpr int step = NumThreadsPerBlock << 1;
+
+#pragma unroll
+  for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
+    int scale_id0 = (id / n_same_scale) % n_scales;
+    int scale_id1 = ((id + 1) / n_same_scale) % n_scales;
+    int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
+    int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
+    output[id >> 1] = round(input[id],
+                            input[id + 1],
+                            scale_ptr[scale_id0],
+                            scale_ptr[scale_id1],
+                            zp0,
+                            zp1);
+  }
+
+  if (i < NumElementsPerThread && id < num_element) {
+    int scale_id0 = (id / n_same_scale) % n_scales;
+    int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
+    output[id >> 1] = round(input[id],
+                            0.0,
+                            scale_ptr[scale_id0],
+                            1.0,
+                            zp0,
+                            0);
+  }
+}
+
+// cuda kernel for int4 block-wise quantization with standard rounding
+// OutT is int8_t for Int4x2 and uint8_t for UInt4x2
+// NumElementsPerThread must be multiple of 2.
+template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
+__global__ void QuantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const InT* scale_ptr,
+                                                 const OutT* zero_point_ptr, CUDA_LONG num_element, size_t KN,
+                                                 size_t N, size_t scale_KN, size_t block_size,
+                                                 RoundStdInt4<InT, OutT> round) {
+  // Process continuous NumElementsPerThread int4 per thread.
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
+  int i = 0;
+  constexpr int step = NumThreadsPerBlock << 1;
+
+#pragma unroll
+  // Process two elements which belong to one byte at a time.
+  for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
+    int x0 = id / KN, x1 = (id + 1) / KN;
+    int y0 = id % KN / N, y1 = (id + 1) % KN / N;
+    int z0 = id % N, z1 = (id + 1) % N;
+    int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
+    int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1;
+    output[id >> 1] = round(input[id],
+                            input[id + 1],
+                            scale_ptr[scale_id0],
+                            scale_ptr[scale_id1],
+                            zero_point_ptr == nullptr
+                                ? 0
+                                : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1),
+                            zero_point_ptr == nullptr
+                                ? 0
+                                : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1));
+  }
+
+  // last non-paired element
+  if (i < NumElementsPerThread && id < num_element) {
+    int x0 = id / KN;
+    int y0 = id % KN / N;
+    int z0 = id % N;
+    int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
+    output[id >> 1] = round(input[id],
+                            0.0,
+                            scale_ptr[scale_id0],
+                            1.0,
+                            zero_point_ptr == nullptr
+                                ? 0
+                                : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1),
+                            0);
+  }
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)

 template <int NumThreadsPerBlock, int NumElementsPerThread, typename OutT, typename InT>
@ -207,6 +408,27 @@ Status CudaQuantizeLinearStd(cudaStream_t stream, const InT* input, OutT* output
  return Status::OK();
 }

+template <class OutT, class InT>
+Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
+                                 const OutT* zero_point, size_t num_of_element) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  static_assert((GridDim::maxElementsPerThread & 1) == 0);
+
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
+                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  QuantizeLinearKernelStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+  <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      input,
+      output,
+      scale,
+      zero_point,
+      static_cast<int>(num_of_element),
+      RoundStdInt4<InT, OutT>());
+  return Status::OK();
+}
+
 template <class OutT, class InT>
 Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const InT* scale, const OutT* zero_point, size_t num_of_element,
                                 size_t batch_size, size_t n_scales) {
@ -226,6 +448,59 @@ Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* ou
  return Status::OK();
 }

+template <class OutT, class InT>
+Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
+                                     const OutT* zero_point, size_t num_of_element,
+                                     size_t batch_size, size_t n_scales) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  static_assert((GridDim::maxElementsPerThread & 1) == 0);
+
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
+                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  QuantizeLinearKernelAxisStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          static_cast<int>(num_of_element),
+          batch_size,
+          n_scales,
+          RoundStdInt4<InT, OutT>());
+  return Status::OK();
+}
+
+template <class OutT, class InT>
+Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const InT* input, OutT* output, const InT* scale,
+                                      const OutT* zero_point, size_t num_of_element, size_t K, size_t N,
+                                      size_t block_size) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  static_assert((GridDim::maxElementsPerThread & 1) == 0);
+
+  size_t KN = K * N;
+  size_t num_block = (K + block_size - 1) / block_size;
+  size_t scale_KN = num_block * N;
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
+                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  QuantizeLinearKernelBlockStdInt4<GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          static_cast<int>(num_of_element),
+          KN,
+          N,
+          scale_KN,
+          block_size,
+          RoundStdInt4<InT, OutT>());
+  return Status::OK();
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)

 template <class OutT, class InT>
@ -282,6 +557,29 @@ __global__ void DequantizeLinearKernelStd(const InT* input, OutT* output, const
  }
 }

+template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DequantizeLinearKernelStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
+                                              const InT* zero_point_ptr, CUDA_LONG num_element) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
+
+  OutT scale = *scale_ptr;
+  int zero_point = zero_point_ptr ? ExtractInt4FromByte(*zero_point_ptr, 0) : 0;
+  int i = 0, v0, v1;
+  constexpr int step = NumThreadsPerBlock << 1;
+#pragma unroll
+  for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
+    v0 = ExtractInt4FromByte(input[id >> 1], 0);
+    v1 = ExtractInt4FromByte(input[id >> 1], 1);
+    output[id] = static_cast<OutT>(v0 - zero_point) * scale;
+    output[id + 1] = static_cast<OutT>(v1 - zero_point) * scale;
+  }
+
+  if (i < NumElementsPerThread && id < num_element) {
+    v0 = ExtractInt4FromByte(input[id >> 1], 0);
+    output[id] = static_cast<OutT>(v0 - zero_point) * scale;
+  }
+}
+
 template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
 __global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, const OutT* scale_ptr, const InT* zero_point_ptr, CUDA_LONG N,
                                              size_t batch_size, size_t n_scales) {
@ -300,6 +598,80 @@ __global__ void DequantizeLinearKernelAxisStd(const InT* input, OutT* output, co
  }
 }

+template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DequantizeLinearKernelAxisStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
+                                                  const InT* zero_point_ptr, CUDA_LONG num_element,
+                                                  size_t batch_size, size_t n_scales) {
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
+  // The scale needs to change every n_same_scale.
+  CUDA_LONG n_same_scale = num_element / (batch_size * n_scales);
+  int i = 0;
+  int scale_id0, scale_id1, zp0, zp1, v0, v1;
+  constexpr int step = NumThreadsPerBlock << 1;
+
+#pragma unroll
+  for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
+      scale_id0 = (id / n_same_scale) % n_scales;
+      scale_id1 = ((id + 1) / n_same_scale) % n_scales;
+
+      v0 = ExtractInt4FromByte(input[id >> 1], 0);
+      v1 = ExtractInt4FromByte(input[id >> 1], 1);
+      zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
+      zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
+      output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
+      output[id + 1] = static_cast<OutT>(v1 - zp1) * scale_ptr[scale_id1];
+  }
+
+  if (i < NumElementsPerThread && id < num_element) {
+    scale_id0 = (id / n_same_scale) % n_scales;
+    v0 = ExtractInt4FromByte(input[id >> 1], 0);
+    zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
+    output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
+  }
+}
+
+// cuda kernel for int4 block-wise dequantization with standard rounding
+// IntT is int8_t for Int4x2 and uint8_t for UInt4x2
+// NumElementsPerThread must be multiple of 2.
+template <class InT, class OutT, int NumThreadsPerBlock, int NumElementsPerThread>
+__global__ void DequantizeLinearKernelBlockStdInt4(const InT* input, OutT* output, const OutT* scale_ptr,
+                                                   const InT* zero_point_ptr, CUDA_LONG num_element,
+                                                   size_t KN, size_t N, size_t scale_KN, size_t block_size) {
+  // Process continuous NumElementsPerThread int4 per thread.
+  CUDA_LONG id = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + (threadIdx.x << 1);
+  int i = 0;
+  constexpr int step = NumThreadsPerBlock << 1;
+
+#pragma unroll
+  // Process two elements which belong to one byte at a time.
+  for (; i + 1 < NumElementsPerThread && id + 1 < num_element; i += 2, id += step) {
+    int x0 = id / KN, x1 = (id + 1) / KN;
+    int y0 = id % KN / N, y1 = (id + 1) % KN / N;
+    int z0 = id % N, z1 = (id + 1) % N;
+    int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
+    int scale_id1 = x1 * scale_KN + y1 / block_size * N + z1;
+
+    int v0 = ExtractInt4FromByte(input[id >> 1], 0);
+    int v1 = ExtractInt4FromByte(input[id >> 1], 1);
+    int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
+    int zp1 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id1 >> 1], scale_id1 & 1);
+    output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
+    output[id + 1] = static_cast<OutT>(v1 - zp1) * scale_ptr[scale_id1];
+  }
+
+  // last non-paired element
+  if (i < NumElementsPerThread && id < num_element) {
+    int x0 = id / KN;
+    int y0 = id % KN / N;
+    int z0 = id % N;
+    int scale_id0 = x0 * scale_KN + y0 / block_size * N + z0;
+
+    int v0 = ExtractInt4FromByte(input[id >> 1], 0);
+    int zp0 = zero_point_ptr == nullptr ? 0 : ExtractInt4FromByte(zero_point_ptr[scale_id0 >> 1], scale_id0 & 1);
+    output[id] = static_cast<OutT>(v0 - zp0) * scale_ptr[scale_id0];
+  }
+}
+
 template <typename InT, typename OutT>
 struct DQFloat8;

@ -422,6 +794,26 @@ Status CudaDequantizeLinearStd(cudaStream_t stream, const InT* input, OutT* outp
  return Status::OK();
 }

+template <class InT, class OutT>
+Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale,
+                                   const InT* zero_point, size_t num_of_element) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  static_assert((GridDim::maxElementsPerThread & 1) == 0);
+
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
+                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  DequantizeLinearKernelStdInt4<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          static_cast<int>(num_of_element));
+  return Status::OK();
+}
+
 template <class InT, class OutT>
 Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale, const InT* zero_point, size_t num_of_element,
                                   size_t batch_size, size_t n_scales) {
@ -440,6 +832,57 @@ Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const InT* input, OutT*
  return Status::OK();
 }

+template <class InT, class OutT>
+Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const InT* input, OutT* output, const OutT* scale,
+                                       const InT* zero_point, size_t num_of_element,
+                                       size_t batch_size, size_t n_scales) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  static_assert((GridDim::maxElementsPerThread & 1) == 0);
+
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
+                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  DequantizeLinearKernelAxisStdInt4<InT, OutT, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          static_cast<int>(num_of_element),
+          batch_size,
+          n_scales);
+  return Status::OK();
+}
+
+template <class T, class U>
+Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
+                                        const T* zero_point, size_t num_of_element, size_t K, size_t N,
+                                        size_t block_size) {
+  if (num_of_element <= 0)
+    return Status::OK();
+
+  static_assert((GridDim::maxElementsPerThread & 1) == 0);
+
+  size_t KN = K * N;
+  size_t num_block = (K + block_size - 1) / block_size;
+  size_t scale_KN = num_block * N;
+  int blocksPerGrid = static_cast<int>(CeilDiv(num_of_element,
+                                               GridDim::maxThreadsPerBlock * GridDim::maxElementsPerThread));
+  DequantizeLinearKernelBlockStdInt4<T, U, GridDim::maxThreadsPerBlock, GridDim::maxElementsPerThread>
+      <<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+          input,
+          output,
+          scale,
+          zero_point,
+          static_cast<CUDA_LONG>(num_of_element),
+          KN,
+          N,
+          scale_KN,
+          block_size);
+  return Status::OK();
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)

 template <class InT, class OutT>
@ -481,11 +924,24 @@ template Status CudaQuantizeLinearStd<int8_t, float>(cudaStream_t stream, const
 template Status CudaQuantizeLinearStd<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
 template Status CudaQuantizeLinearStd<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
 template Status CudaQuantizeLinearStd<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
+template Status CudaQuantizeLinearStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
+template Status CudaQuantizeLinearStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
+template Status CudaQuantizeLinearStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
+template Status CudaQuantizeLinearStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);

 template Status CudaQuantizeLinearAxisStd<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
 template Status CudaQuantizeLinearAxisStd<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
 template Status CudaQuantizeLinearAxisStd<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
 template Status CudaQuantizeLinearAxisStd<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaQuantizeLinearAxisStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaQuantizeLinearAxisStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaQuantizeLinearAxisStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaQuantizeLinearAxisStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+
+template Status CudaQuantizeLinearBlockStdInt4<int8_t, float>(cudaStream_t stream, const float* input, int8_t* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
+template Status CudaQuantizeLinearBlockStdInt4<uint8_t, float>(cudaStream_t stream, const float* input, uint8_t* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
+template Status CudaQuantizeLinearBlockStdInt4<int8_t, half>(cudaStream_t stream, const half* input, int8_t* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
+template Status CudaQuantizeLinearBlockStdInt4<uint8_t, half>(cudaStream_t stream, const half* input, uint8_t* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);

 #if !defined(DISABLE_FLOAT8_TYPES)

@ -505,11 +961,24 @@ template Status CudaDequantizeLinearStd<int8_t, float>(cudaStream_t stream, cons
 template Status CudaDequantizeLinearStd<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
 template Status CudaDequantizeLinearStd<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
 template Status CudaDequantizeLinearStd<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);
+template Status CudaDequantizeLinearStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element);
+template Status CudaDequantizeLinearStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element);
+template Status CudaDequantizeLinearStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element);
+template Status CudaDequantizeLinearStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element);

 template Status CudaDequantizeLinearAxisStd<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
 template Status CudaDequantizeLinearAxisStd<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
 template Status CudaDequantizeLinearAxisStd<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
 template Status CudaDequantizeLinearAxisStd<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaDequantizeLinearAxisStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaDequantizeLinearAxisStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaDequantizeLinearAxisStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+template Status CudaDequantizeLinearAxisStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+
+template Status CudaDequantizeLinearBlockStdInt4<int8_t, float>(cudaStream_t stream, const int8_t* input, float* output, const float* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
+template Status CudaDequantizeLinearBlockStdInt4<uint8_t, float>(cudaStream_t stream, const uint8_t* input, float* output, const float* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
+template Status CudaDequantizeLinearBlockStdInt4<int8_t, half>(cudaStream_t stream, const int8_t* input, half* output, const half* scale, const int8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);
+template Status CudaDequantizeLinearBlockStdInt4<uint8_t, half>(cudaStream_t stream, const uint8_t* input, half* output, const half* scale, const uint8_t* zero_point, size_t num_of_element, size_t K, size_t N, size_t block_size);

 #if !defined(DISABLE_FLOAT8_TYPES)

--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.cuh
@ -11,33 +11,96 @@ namespace onnxruntime {
 namespace cuda {

 template <class T, class U>
-Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element);
+Status CudaQuantizeLinearStd(cudaStream_t stream, const U* input, T* output, const U* scale,
+                             const T* zero_point, size_t num_of_element);

 template <class T, class U>
-Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
-                             bool saturate);
+Status CudaQuantizeLinearStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
+                                 const T* zero_point, size_t num_of_element);

 template <class T, class U>
-Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
-                                 size_t batch_size, size_t n_scales);
+Status CudaQuantizeLinearSat(cudaStream_t stream, const U* input, T* output, const U* scale,
+                             const T* zero_point, size_t num_of_element, bool saturate);

 template <class T, class U>
-Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale, const T* zero_point, size_t num_of_element,
-                                 size_t batch_size, size_t n_scales, bool saturate);
+Status CudaQuantizeLinearAxisStd(cudaStream_t stream, const U* input, T* output, const U* scale,
+                                 const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);

 template <class T, class U>
-Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
+Status CudaQuantizeLinearAxisStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
+                                     const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);

 template <class T, class U>
-Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element);
+Status CudaQuantizeLinearAxisSat(cudaStream_t stream, const U* input, T* output, const U* scale,
+                                 const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales,
+                                 bool saturate);
+
+/**
+ * @brief block-wise quantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization
+ *        axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi)
+ *        = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index
+ *        is xi * ceil(K / block_size) * N + yi / block_size * N + zi.
+ * @tparam T              quantized type, int8_t for Int4x2, uint8_t for UInt4x2
+ * @tparam U              full precision type
+ * @param stream          cuda stream
+ * @param input           input tensor
+ * @param output          output tensor
+ * @param scale           scale tensor
+ * @param zero_point      zero point tensor
+ * @param num_of_element  number of elements in input tensor
+ * @param K               K
+ * @param N               N
+ * @param block_size      block size
+ */
+template <class T, class U>
+Status CudaQuantizeLinearBlockStdInt4(cudaStream_t stream, const U* input, T* output, const U* scale,
+                                      const T* zero_point, size_t num_of_element, size_t K, size_t N,
+                                      size_t block_size);

 template <class T, class U>
-Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
-                                   size_t batch_size, size_t n_scales);
+Status CudaDequantizeLinearStd(cudaStream_t stream, const T* input, U* output, const U* scale,
+                               const T* zero_point, size_t num_of_element);

 template <class T, class U>
-Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point, size_t num_of_element,
-                                   size_t batch_size, size_t n_scales);
+Status CudaDequantizeLinearStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
+                                   const T* zero_point, size_t num_of_element);

+template <class T, class U>
+Status CudaDequantizeLinearSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
+                               size_t num_of_element);
+
+template <class T, class U>
+Status CudaDequantizeLinearAxisStd(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
+                                   size_t num_of_element, size_t batch_size, size_t n_scales);
+
+template <class T, class U>
+Status CudaDequantizeLinearAxisStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
+                                       const T* zero_point, size_t num_of_element, size_t batch_size, size_t n_scales);
+
+template <class T, class U>
+Status CudaDequantizeLinearAxisSat(cudaStream_t stream, const T* input, U* output, const U* scale, const T* zero_point,
+                                   size_t num_of_element, size_t batch_size, size_t n_scales);
+
+/**
+ * @brief block-wise dequantization with standard rounding to int4. Input is reshaped to [M, K, N]. K the quantization
+ *        axis. Scale is reshaped to [M, ceil(K/block_size), N]. For an index i in input, the coordiate is (xi, yi, zi)
+ *        = (i / (K * N), i % (K * N) / N, i % N). The scale coordiate is (xi, yi / block_size, zi). The scale index
+ *        is xi * ceil(K / block_size) * N + yi / block_size * N + zi.
+ * @tparam T              quantized type, int8_t for Int4x2, uint8_t for UInt4x2
+ * @tparam U              full precision type
+ * @param stream          cuda stream
+ * @param input           input tensor
+ * @param output          output tensor
+ * @param scale           scale tensor
+ * @param zero_point      zero point tensor
+ * @param num_of_element  number of elements in input tensor
+ * @param K               K
+ * @param N               N
+ * @param block_size      block size
+ */
+template <class T, class U>
+Status CudaDequantizeLinearBlockStdInt4(cudaStream_t stream, const T* input, U* output, const U* scale,
+                                        const T* zero_point, size_t num_of_element, size_t K, size_t N,
+                                        size_t block_size);
 }  // namespace cuda
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
+++ b/onnxruntime/core/providers/cuda/tensor/quantize_linear.h
@ -19,6 +19,11 @@ class QuantizeLinear final : public CudaKernel {
    if (!info.GetAttr<int64_t>("saturate", &saturate_).IsOK()) {
      saturate_ = 1;
    }
+    if (!info.GetAttr<int64_t>("block_size", &block_size_).IsOK()) {
+      block_size_ = 0;
+    }
+
+    ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative.");
  }

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;
@ -26,6 +31,7 @@ class QuantizeLinear final : public CudaKernel {
 private:
  int64_t axis_;
  int64_t saturate_;
+  int64_t block_size_;
 };

 template <class T, class U>
@ -35,12 +41,18 @@ class DequantizeLinear final : public CudaKernel {
    if (!info.GetAttr<int64_t>("axis", &axis_).IsOK()) {
      axis_ = 1;
    }
+    if (!info.GetAttr<int64_t>("block_size", &block_size_).IsOK()) {
+      block_size_ = 0;
+    }
+
+    ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative.");
  }

  Status ComputeInternal(OpKernelContext* p_op_kernel_context) const override;

 private:
  int64_t axis_;
+  int64_t block_size_;
 };

 }  // namespace cuda
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@ -1370,18 +1370,26 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint64_t, Cast);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast);

-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
+                                                          float, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
+                                                          float, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
+                                                          MLFloat16, DequantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
+                                                          MLFloat16, DequantizeLinear);

 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear);
-class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
+                                                          float, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
+                                                          float, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t,
+                                                          MLFloat16, QuantizeLinear);
+class ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t,
+                                                          MLFloat16, QuantizeLinear);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Shape);
@ -1390,6 +1398,24 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, S
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN);

+// Opset 21
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float,
+                                                DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float,
+                                                DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16,
+                                                DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16,
+                                                DequantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float,
+                                                QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float,
+                                                QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16,
+                                                QuantizeLinear);
+class ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16,
+                                                QuantizeLinear);
+
 template <>
 KernelCreateInfo BuildKernelCreateInfo<void>() {
  return {};
@ -2333,19 +2359,19 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, bool, Cast)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, BFloat16, Cast)>,

-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, DequantizeLinear)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, DequantizeLinear)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, DequantizeLinear)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, DequantizeLinear)>,

    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Identity)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, If)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Loop)>,

-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, float, QuantizeLinear)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, float, QuantizeLinear)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, uint8_t, MLFloat16, QuantizeLinear)>,
-    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, int8_t, MLFloat16, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, float, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, float, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, uint8_t, MLFloat16, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, 20, int8_t, MLFloat16, QuantizeLinear)>,

    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Reshape)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 19, Scan)>,
@ -2354,6 +2380,16 @@ static Status RegisterRocmKernels(KernelRegistry& kernel_registry) {
    // opset 20
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsInf)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 20, IsNaN)>,
+
+    // opset 21
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, DequantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, float, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, float, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, uint8_t, MLFloat16, QuantizeLinear)>,
+    BuildKernelCreateInfo<ONNX_OPERATOR_TWO_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 21, int8_t, MLFloat16, QuantizeLinear)>,
  };

  for (auto& function_table_entry : function_table) {
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@ -1289,6 +1289,10 @@ struct Tensor final {
 template <>
 inline bool Tensor::IsDataType<bool>() const { return g_host->Tensor__IsDataType_bool(this); }
 template <>
+inline bool Tensor::IsDataType<Int4x2>() const { return g_host->Tensor__IsDataType_Int4x2(this); }
+template <>
+inline bool Tensor::IsDataType<UInt4x2>() const { return g_host->Tensor__IsDataType_UInt4x2(this); }
+template <>
 inline bool Tensor::IsDataType<int8_t>() const { return g_host->Tensor__IsDataType_int8(this); }
 template <>
 inline bool Tensor::IsDataType<uint8_t>() const { return g_host->Tensor__IsDataType_uint8(this); }
@ -1327,6 +1331,10 @@ inline bool Tensor::IsDataType<Float8E5M2FNUZ>() const { return g_host->Tensor__
 template <>
 inline bool* Tensor::MutableData<bool>() { return g_host->Tensor__MutableData_bool(this); }
 template <>
+inline Int4x2* Tensor::MutableData<Int4x2>() { return g_host->Tensor__MutableData_Int4x2(this); }
+template <>
+inline UInt4x2* Tensor::MutableData<UInt4x2>() { return g_host->Tensor__MutableData_UInt4x2(this); }
+template <>
 inline int8_t* Tensor::MutableData<int8_t>() { return g_host->Tensor__MutableData_int8(this); }
 template <>
 inline uint8_t* Tensor::MutableData<uint8_t>() { return g_host->Tensor__MutableData_uint8(this); }
@ -1365,6 +1373,10 @@ inline Float8E5M2FNUZ* Tensor::MutableData<Float8E5M2FNUZ>() { return g_host->Te
 template <>
 inline const bool* Tensor::Data<bool>() const { return g_host->Tensor__Data_bool(this); }
 template <>
+inline const Int4x2* Tensor::Data<Int4x2>() const { return g_host->Tensor__Data_Int4x2(this); }
+template <>
+inline const UInt4x2* Tensor::Data<UInt4x2>() const { return g_host->Tensor__Data_UInt4x2(this); }
+template <>
 inline const int8_t* Tensor::Data<int8_t>() const { return g_host->Tensor__Data_int8(this); }
 template <>
 inline const uint8_t* Tensor::Data<uint8_t>() const { return g_host->Tensor__Data_uint8(this); }
--- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc
@ -6,6 +6,7 @@
 #include <functional>
 #include <string>
 #include <vector>
+#include <memory>

 #include "core/common/inlined_containers_fwd.h"
 #include "core/common/span_utils.h"
@ -140,7 +141,8 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
                       double relative_per_sample_tolerance,
                       std::unique_ptr<GraphTransformer> transformer,
                       const std::function<void(SessionOptions&)>& add_session_options,
-                       const InlinedHashSet<std::string>& disabled_optimizers) {
+                       const InlinedHashSet<std::string>& disabled_optimizers,
+                       std::unique_ptr<IExecutionProvider> ep) {
  // Build the model for this test.
  std::unordered_map<std::string, int> domain_to_version;
  domain_to_version[kOnnxDomain] = opset_version;
@ -157,6 +159,7 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
  // Serialize the model to a string.
  std::string model_data;
  model.ToProto().SerializeToString(&model_data);
+  std::shared_ptr<IExecutionProvider> ep_shared = ep ? std::move(ep) : nullptr;

  auto run_model = [&](TransformerLevel level, std::vector<OrtValue>& fetches,
                       std::unique_ptr<GraphTransformer> transformer = nullptr) {
@ -170,6 +173,10 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
      add_session_options(session_options);
    }
    InferenceSessionWrapper session{session_options, GetEnvironment()};
+    if (ep_shared) {
+      ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep_shared));
+    }
+
    ASSERT_STATUS_OK(session.Load(model_data.data(), static_cast<int>(model_data.size())));
    if (transformer) {
      ASSERT_STATUS_OK(session.RegisterGraphTransformer(std::move(transformer), level));
--- a/onnxruntime/test/optimizer/graph_transform_test_builder.h
+++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h
@ -555,7 +555,8 @@ void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& buil
                       double relative_per_sample_tolerance = 0.0,
                       std::unique_ptr<GraphTransformer> transformer = nullptr,
                       const std::function<void(SessionOptions&)>& add_session_options = {},
-                       const InlinedHashSet<std::string>& disabled_optimizers = {});
+                       const InlinedHashSet<std::string>& disabled_optimizers = {},
+                       std::unique_ptr<IExecutionProvider> ep = nullptr);

 void TransformerTester(const std::function<void(ModelTestBuilder& helper)>& build_test_case,
                       const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,
--- a/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc
+++ b/onnxruntime/test/optimizer/qdq_matmulnbits_transformer_test.cc
@ -17,6 +17,7 @@
 #include "test/optimizer/qdq_test_utils.h"
 #include "test/optimizer/graph_transform_test_builder.h"
 #include "test/util/include/asserts.h"
+#include "test/util/include/default_providers.h"
 #include "test/util/include/inference_session_wrapper.h"

 #include "gtest/gtest.h"
@ -55,7 +56,8 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector<int64_t>& input1_shape,
                                   const std::vector<int64_t>& input2_shape,
                                   const int64_t axis,
                                   const int64_t block_size,
-                                   int64_t accuracy_level) {
+                                   int64_t accuracy_level,
+                                   std::unique_ptr<IExecutionProvider> ep = nullptr) {
  auto build_test_case = [&](ModelTestBuilder& builder) {
    auto* input1_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
    auto* input2_arg = builder.MakeInput(input2_shape, T(T::min_val, 0), T(T::max_val, 0));
@ -104,7 +106,9 @@ RunDQMatMulNotConverted_NonConstDQ(const std::vector<int64_t>& input1_shape,
                    1e-5 /*per_sample_tolerance*/,
                    1e-5 /*relative_per_sample_tolerance*/,
                    nullptr,
-                    add_session_options_fn);
+                    add_session_options_fn,
+                    {},
+                    ep ? std::move(ep) : nullptr);
 }

 TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) {
@ -127,6 +131,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ) {
  RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1);
 }

+TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_NonConstDQ_Cuda) {
+  // DQ contrib op schema is not updated to support blocked quantization
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+  ;
+  RunDQMatMulNotConverted_NonConstDQ<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_NonConstDQ<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+}
+
 //        Input2
 //           |
 //    DQ     /
@ -140,7 +165,8 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector<int64_t>& weight_shape,
                                     const std::vector<int64_t>& input2_shape,
                                     const int64_t axis,
                                     const int64_t block_size,
-                                     int64_t accuracy_level) {
+                                     int64_t accuracy_level,
+                                     std::unique_ptr<IExecutionProvider> ep = nullptr) {
  auto build_test_case = [&](ModelTestBuilder& builder) {
    auto* weight_arg = builder.MakeInitializer(weight_shape, T(T::min_val, 0), T(T::max_val, 0));
    auto* input2_arg = builder.MakeInput(input2_shape, -100.0f, 100.0f);
@ -189,7 +215,9 @@ RunDQMatMulNotConverted_FirstDQInput(const std::vector<int64_t>& weight_shape,
                    1e-5 /*per_sample_tolerance*/,
                    1e-5 /*relative_per_sample_tolerance*/,
                    nullptr,
-                    add_session_options_fn);
+                    add_session_options_fn,
+                    {},
+                    ep ? std::move(ep) : nullptr);
 }

 TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) {
@ -212,6 +240,27 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput) {
  RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1);
 }

+TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_FirstDQInput_Cuda) {
+  // DQ contrib op schema is not updated to support blocked quantization
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, 4, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+  ;
+  RunDQMatMulNotConverted_FirstDQInput<UInt4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, true>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_FirstDQInput<Int4x2, false>({12, 37}, {37, 12}, 0, 16, -1, DefaultCudaExecutionProvider());
+}
+
 // Input1
 //   |
 //    \      DQ
@ -224,7 +273,8 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector<int64_t>& input
                                               const std::vector<int64_t>& weight_shape,
                                               const int64_t axis,
                                               const int64_t block_size,
-                                               int64_t accuracy_level) {
+                                               int64_t accuracy_level,
+                                               std::unique_ptr<IExecutionProvider> ep = nullptr) {
  auto build_test_case = [&](ModelTestBuilder& builder) {
    auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
    auto* output_arg = builder.MakeOutput();
@ -287,7 +337,9 @@ void RunDQMatMulNotConverted_TypeShapeMismatch(const std::vector<int64_t>& input
                    1e-5 /*per_sample_tolerance*/,
                    1e-5 /*relative_per_sample_tolerance*/,
                    nullptr,
-                    add_session_options_fn);
+                    add_session_options_fn,
+                    {},
+                    ep ? std::move(ep) : nullptr);
 }

 TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_TypeMismatch) {
@ -327,6 +379,31 @@ TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch) {
  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0);
 }

+TEST(QDQTransformerTests, DQMatMulNotConvertedToMatMulNBits_ShapeMismatch_Cuda) {
+  // DQ contrib op schema is not updated to support blocked quantization
+  // block size too small
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 12}, 0, 8, 0, DefaultCudaExecutionProvider());
+  // block size not 2's power
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
+  ;
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 12}, 0, 17, 0, DefaultCudaExecutionProvider());
+  // not axis 0
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({12, 37}, {37, 37}, 1, 16, 0, DefaultCudaExecutionProvider());
+  // not rank 2
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, true>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<UInt4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, true>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulNotConverted_TypeShapeMismatch<Int4x2, false>({2, 12, 37}, {2, 37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+}
+
 //  Input1
 //    |      DQ
 //     \    /
@ -343,7 +420,8 @@ RunDQMatMulConverted(const std::vector<int64_t>& input1_shape,
                     const std::vector<int64_t>& weight2_shape,
                     const int64_t axis,
                     const int64_t block_size,
-                     int64_t accuracy_level) {
+                     int64_t accuracy_level,
+                     std::unique_ptr<IExecutionProvider> ep = nullptr) {
  auto build_test_case = [&](ModelTestBuilder& builder) {
    auto* input_arg = builder.MakeInput(input1_shape, -100.0f, 100.0f);
    auto* output_arg = builder.MakeOutput();
@ -402,9 +480,11 @@ RunDQMatMulConverted(const std::vector<int64_t>& input1_shape,
                    TransformerLevel::Level2,
                    21 /*opset_version*/,
                    1e-5 /*per_sample_tolerance*/,
-                    1e-5 /*relative_per_sample_tolerance*/,
+                    2e-5 /*relative_per_sample_tolerance*/,
                    nullptr,
-                    add_session_options_fn);
+                    add_session_options_fn,
+                    {},
+                    ep ? std::move(ep) : nullptr);
 }

 TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) {
@ -419,6 +499,18 @@ TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits) {
  RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1);
 }

+TEST(QDQTransformerTests, DQMatMulConvertedToMatMulNBits_Cuda) {
+  // DQ contrib op schema is not updated to support blocked quantization
+  RunDQMatMulConverted<Int4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<Int4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<UInt4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 0, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<Int4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<Int4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<UInt4x2, true>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+  RunDQMatMulConverted<UInt4x2, false>({12, 12}, {12, 37}, {37, 12}, 0, 16, 1, DefaultCudaExecutionProvider());
+}
+
 #endif  // !defined(DISABLE_CONTRIB_OPS)

 }  // namespace test
--- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc
@ -869,7 +869,8 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size,
 template <typename Tin, typename Tout>
 void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
                                                           int64_t scale_block_count,
-                                                           int64_t zero_point_block_count) {
+                                                           int64_t zero_point_block_count,
+                                                           std::unique_ptr<IExecutionProvider> ep = nullptr) {
  OpTester test("DequantizeLinear", 21);
  std::vector<int64_t> dims{2, 4};
  std::vector<Tout> x_scale, y;
@ -877,7 +878,7 @@ void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
  SessionOptions so;
  std::vector<std::string> log_msgs;  // redirect error messages
  std::vector<std::unique_ptr<IExecutionProvider>> eps;
-  eps.push_back(DefaultCpuExecutionProvider());
+  eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
  so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category,
                                const char* logid, const char* code_location, const char* message) {
    ORT_UNUSED_PARAMETER(severity);
@ -970,6 +971,13 @@ TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int) {
  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(-1, 2, 2);
 }

+TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) {
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(-1, 2, 2, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(-1, 2, 2, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(-2, 2, 2, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(-2, 2, 2, DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) {
  constexpr int min_cuda_architecture = 11080;
@ -1013,6 +1021,13 @@ TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) {
  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 1, 1);
 }

+TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) {
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 1, 1, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 3, 3, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 3, 3, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 1, 1, DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) {
  constexpr int min_cuda_architecture = 11080;
@ -1052,6 +1067,13 @@ TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) {
  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 2, 1);
 }

+TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) {
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 2, 1, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 2, 3, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 2, 3, DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 2, 1, DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) {
  constexpr int min_cuda_architecture = 11080;
@ -1081,14 +1103,14 @@ void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector<int64_t>&& dims,
                                                  std::vector<int>& x_,
                                                  std::vector<float>& x_scale_,
                                                  std::vector<int>& x_zero_point_,
-                                                  std::vector<float>& y_) {
+                                                  std::vector<float>& y_,
+                                                  std::unique_ptr<IExecutionProvider> ep = nullptr) {
  OpTester test("DequantizeLinear", 21);
  std::vector<int64_t> x_scale_shape;
  std::vector<Tout> x_scale, y;
  std::vector<Tin> x, x_zero_point;
  std::vector<std::unique_ptr<IExecutionProvider>> eps;
-  eps.push_back(DefaultCpuExecutionProvider());
-
+  eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
  int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis;
  bool use_zero_point = !x_zero_point_.empty();

@ -1216,6 +1238,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point;
+  std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
+  std::vector<float> y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, 8.0};
+  std::vector<float> y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, 8.0};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                                  DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                                  DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
@ -1237,6 +1276,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
+  std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
+  std::vector<float> y_2{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, 2.0, 0, 3.5, 3.0, 10.0, 16.0, -10.5, 15};
+  std::vector<float> y_3{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, -14.0, -24, 21, 5, 10.0, 16.0, -10.5, 15};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                                  DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                                  DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
  std::vector<int> zero_point{};
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -1262,6 +1318,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis_Cuda) {
+  std::vector<int> zero_point{};
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
+  std::vector<float> y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, 8};
+  std::vector<float> y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, 8};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                                  DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                                  DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
@ -1283,6 +1356,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
+  std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
+  std::vector<float> y_2{2, 8, -2, 0, 0, -1, 7, 1, 2, 0, 6, 8, -3.5, 1, -10.5, 15};
+  std::vector<float> y_3{2, 8, -2, 0, -6, -8, 7, 1, 2, 0, 6, 8, 10, 16, -10.5, 15};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                                  DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                                  DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
  std::vector<int> zero_point{};
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -1308,6 +1398,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<int32_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis_Cuda) {
+  std::vector<int> zero_point{};
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
+  std::vector<float> y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, 8};
+  std::vector<float> y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, 8};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                                  DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                                  DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
@ -1329,6 +1436,23 @@ TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<int16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{-6, -4, -3, -1, 0, 2, 4, 7};
+  std::vector<int> x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8};
+  std::vector<float> y_2{2, 0, 4, 0, 0, 3.5, 0, 1, 2, 4, 4, 8, -3.5, -7, 0, 15};
+  std::vector<float> y_3{2, 0, -2, 0, 0, 3.5, 7, 1, 2, 4, 6, 8, -3.5, -7, -10.5, 15};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                              DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                                  DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                                  DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{};
@ -1350,6 +1474,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{};
+  std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<float> y_2{0, -4, 7, 3, -8, -20, 21, 7, 16, 36, -35, -11, 24, 52, -49, -15};
+  std::vector<float> y_3{0, -4, 7, 3, -8, -20, 21, 7, -16, -36, 35, 11, 24, 52, -49, -15};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                                   DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                                   DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
@ -1371,6 +1512,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
+  std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<float> y_2{4, -4, 3.5, -6, -4, -20, 17.5, -2, -10, 16, 3.5, -5, -2, 32, -10.5, -9};
+  std::vector<float> y_3{4, -4, 3.5, -6, -4, -20, 17.5, -2, -12, -36, 31.5, 2, -2, 32, -10.5, -9};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2,
+                                                                   DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3,
+                                                                   DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{};
@ -1392,6 +1550,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{};
+  std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<float> y_2{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15};
+  std::vector<float> y_3{0, -4, -4, -12, -8, -20, 21, 7, 16, 36, 20, 44, 24, 52, -49, -15};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                                   DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                                   DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
@ -1413,6 +1588,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
+  std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<float> y_2{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9};
+  std::vector<float> y_3{4, -4, 0, -12, -4, -20, 17.5, -2, -10, 16, -6, 24, -2, 32, -10.5, -9};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2,
+                                                                   DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3,
+                                                                   DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{};
@ -1434,6 +1626,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{};
+  std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<float> y_2{0, -2, -8, -12, 14, 17.5, 6, 7, 16, 18, 40, 44, -42, -45.5, -14, -15};
+  std::vector<float> y_3{0, -2, -4, -12, 14, 17.5, 21, 7, 16, 18, 20, 44, -42, -45.5, -49, -15};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                                   DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                                   DefaultCudaExecutionProvider());
+}
+
 TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
@ -1455,6 +1664,23 @@ TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
  DequantizeLinearOp21BlockedTest_Int_Succeed<uint16_t, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3);
 }

+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis_Cuda) {
+  std::vector<float> x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6};
+  std::vector<int> x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<float> y_2{4, 2, -8, -12, 10.5, 14, -3, -2, -10, -8, 20, 24, -3.5, -7, -8, -9};
+  std::vector<float> y_3{4, 2, 0, -12, 10.5, 14, 17.5, -2, -10, -8, -6, 24, -3.5, -7, -10.5, -9};
+
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                               DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2,
+                                                                   DefaultCudaExecutionProvider());
+  DequantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3,
+                                                                   DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_FirstAxis) {
  constexpr int min_cuda_architecture = 11080;
@ -1624,7 +1850,8 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size,
 template <typename Tout, typename Tin>
 void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
                                                         int64_t scale_block_count,
-                                                         int64_t zero_point_block_count) {
+                                                         int64_t zero_point_block_count,
+                                                         std::unique_ptr<IExecutionProvider> ep = nullptr) {
  OpTester test("QuantizeLinear", 21);
  std::vector<int64_t> dims{2, 4};
  std::vector<Tout> x_zero_point, y;
@ -1632,7 +1859,7 @@ void QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size,
  SessionOptions so;
  std::vector<std::string> log_msgs;  // redirect error messages
  std::vector<std::unique_ptr<IExecutionProvider>> eps;
-  eps.push_back(DefaultCpuExecutionProvider());
+  eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
  so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category,
                                const char* logid, const char* code_location, const char* message) {
    ORT_UNUSED_PARAMETER(severity);
@ -1725,6 +1952,13 @@ TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int) {
  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(-1, 2, 2);
 }

+TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Int_Cuda) {
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(-1, 2, 2, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(-1, 2, 2, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(-2, 2, 2, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(-2, 2, 2, DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(QuantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) {
  constexpr int min_cuda_architecture = 11080;
@ -1768,6 +2002,13 @@ TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) {
  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 1, 1);
 }

+TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int_Cuda) {
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 1, 1, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 3, 3, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 3, 3, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 1, 1, DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(QuantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) {
  constexpr int min_cuda_architecture = 11080;
@ -1807,6 +2048,13 @@ TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) {
  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int<int32_t, MLFloat16>(3, 2, 1);
 }

+TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int_Cuda) {
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, float>(3, 2, 1, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<Int4x2, MLFloat16>(3, 2, 3, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, float>(3, 2, 3, DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_InvalidBlockSize_Int4<UInt4x2, MLFloat16>(3, 2, 1, DefaultCudaExecutionProvider());
+}
+
 #if !defined(DISABLE_FLOAT8_TYPES)
 TEST(QuantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) {
  constexpr int min_cuda_architecture = 11080;
@ -1836,14 +2084,14 @@ void QuantizeLinearOp21BlockedTest_Int4_Succeed(std::vector<int64_t>&& dims,
                                                std::vector<float>& x_,
                                                std::vector<float>& scale_,
                                                std::vector<int>& zero_point_,
-                                                std::vector<int>& y_) {
+                                                std::vector<int>& y_,
+                                                std::unique_ptr<IExecutionProvider> ep = nullptr) {
  OpTester test("QuantizeLinear", 21);
  std::vector<int64_t> scale_shape;
  std::vector<Tout> zero_point, y;
  std::vector<Tin> x, scale;
  std::vector<std::unique_ptr<IExecutionProvider>> eps;
-  eps.push_back(DefaultCpuExecutionProvider());
-
+  eps.push_back(ep ? std::move(ep) : DefaultCpuExecutionProvider());
  int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis;
  bool use_zero_point = !zero_point_.empty();

@ -1970,6 +2218,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
+                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> x{14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4, 14.0, 24.0, -17.5, -4,
+                       6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0, 6.0, 8.0, -3.5, 0.0,
+                       2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0, 2.0, 8.0, -10.5, -4.0,
+                       10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0, 10.0, 24.0, -24.5, 8.0};
+  std::vector<int> y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
+                       -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
+                       1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+                       5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
+  std::vector<int> y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
+                       -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
+                       -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4,
+                       5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                                DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                                DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) {
  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2022,6 +2298,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
+                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{-6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1, -6, -4, -3, -1,
+                              0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7};
+  std::vector<float> x{2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3, 2.0, 8.0, -7.0, -3,
+                       -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1, -6.0, -8.0, 7.0, 1,
+                       2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0, 2.0, 0, 3.5, 3.0,
+                       10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15, 10.0, 16.0, -10.5, 15};
+  std::vector<int> y_2{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
+                       -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
+                       1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+                       5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
+  std::vector<int> y_3{-7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4, -7, -6, -5, -4,
+                       -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0, -3, -2, -1, 0,
+                       -7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2, -7, -4, -2, 2,
+                       5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8, 5, 6, 7, -8};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                                DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                                DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) {
  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2074,6 +2378,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_MiddleAxis_Cuda) {
+  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0,
+                             2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0};
+  std::vector<float> x{14, 24, 14, 24, 14, 24, 14, 24, 10, 16, 10, 16, 10, 16, 10, 16,
+                       -10.5, -2, -10.5, -2, -10.5, -2, -10.5, -2, -3.5, 0, -3.5, 0, -3.5, 0, -3.5, 0,
+                       2, 8, 2, 8, 2, 8, 2, 8, 6, 16, 6, 16, 6, 16, 6, 16,
+                       -17.5, -6, -17.5, -6, -17.5, -6, -17.5, -6, -24.5, 8, -24.5, 8, -24.5, 8, -24.5, 8};
+  std::vector<int> y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
+                       -3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0,
+                       1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
+                       5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8};
+  std::vector<int> y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
+                       5, 0, 5, 0, 5, 0, 5, 0, -1, 0, -1, 0, -1, 0, -1, 0,
+                       1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
+                       -8, -2, -8, -2, -8, -2, -8, -2, 7, -8, 7, -8, 7, -8, 7, -8};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
+                                                                DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
+                                                                DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) {
  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@ -2126,6 +2458,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_MiddleAxis_Cuda) {
+  std::vector<int> zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1,
+                              0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7};
+  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, -2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0, 3.5, 1.0,
+                             2.0, 4.0, 2.0, 4.0, 2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0, -3.5, -1.0};
+  std::vector<float> x{2, 8, 2, 8, 2, 8, 2, 8, -2, 0, -2, 0, -2, 0, -2, 0,
+                       0, -1, 0, -1, 0, -1, 0, -1, 7, 1, 7, 1, 7, 1, 7, 1,
+                       2, 0, 2, 0, 2, 0, 2, 0, 6, 8, 6, 8, 6, 8, 6, 8,
+                       -3.5, 1, -3.5, 1, -3.5, 1, -3.5, 1, -10.5, 15, -10.5, 15, -10.5, 15, -10.5, 15};
+  std::vector<int> y_2{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
+                       -3, -2, -3, -2, -3, -2, -3, -2, -1, 0, -1, 0, -1, 0, -1, 0,
+                       1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
+                       5, 6, 5, 6, 5, 6, 5, 6, 7, -8, 7, -8, 7, -8, 7, -8};
+  std::vector<int> y_3{-7, -6, -7, -6, -7, -6, -7, -6, -5, -4, -5, -4, -5, -4, -5, -4,
+                       -6, -4, -6, -4, -6, -4, -6, -4, -1, 0, -1, 0, -1, 0, -1, 0,
+                       1, 2, 1, 2, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4,
+                       -2, 2, -2, 2, -2, 2, -2, 2, 7, -8, 7, -8, 7, -8, 7, -8};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 2, x, y_scale, zero_point, y_2,
+                                                                DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 1, 3, x, y_scale, zero_point, y_3,
+                                                                DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) {
  std::vector<int> zero_point{-6, -4, -6, -4, -6, -4, -6, -4, -3, -1, -3, -1, -3, -1, -3, -1,
                              0, 2, 0, 2, 0, 2, 0, 2, 4, 7, 4, 7, 4, 7, 4, 7};
@ -2178,6 +2538,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, SignedInt4_NoZeroPoint_LastAxis_Cuda) {
+  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
+                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
+  std::vector<float> x{14, 12, 14, 12, 20, 16, 20, 16, 14, 12, 14, 12, 20, 16, 20, 16,
+                       -10.5, -7, -10.5, -7, -1, 0, -1, 0, -10.5, -7, -10.5, -7, -1, 0, -1, 0,
+                       2, 4, 2, 4, 12, 16, 12, 16, 2, 4, 2, 4, 12, 16, 12, 16,
+                       -17.5, -21, -17.5, -21, -7, 8, -7, 8, -17.5, -21, -17.5, -21, -7, 8, -7, 8};
+  std::vector<int> y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4,
+                       -3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0,
+                       1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4,
+                       5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8};
+  std::vector<int> y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4,
+                       -3, -2, -3, -2, 0, 0, -1, 0, -3, -2, -3, -2, 0, 0, -1, 0,
+                       1, 2, 1, 2, 6, 4, 3, 4, 1, 2, 1, 2, 6, 4, 3, 4,
+                       5, 6, 5, 6, 2, -8, 7, -8, 5, 6, 5, 6, 2, -8, 7, -8};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                                DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                                DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) {
  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0};
@ -2230,6 +2618,34 @@ TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, SignedInt4_UseZeroPoint_LastAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
+                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
+  std::vector<int> zero_point{-6, -4, -6, -4, -3, -1, -3, -1,
+                              0, 2, 0, 2, 4, 7, 4, 7};
+  std::vector<float> x{2, 0, 2, 0, 4, 0, 4, 0, 2, 0, 2, 0, 4, 0, 4, 0,
+                       0, 3.5, 0, 3.5, 0, 1, 0, 1, 0, 3.5, 0, 3.5, 0, 1, 0, 1,
+                       2, 4, 2, 4, 4, 8, 4, 8, 2, 4, 2, 4, 4, 8, 4, 8,
+                       -3.5, -7, -3.5, -7, 0, 15, 0, 15, -3.5, -7, -3.5, -7, 0, 15, 0, 15};
+  std::vector<int> y_2{-7, -6, -7, -6, -5, -4, -5, -4, -7, -6, -7, -6, -5, -4, -5, -4,
+                       -3, -2, -3, -2, -1, 0, -1, 0, -3, -2, -3, -2, -1, 0, -1, 0,
+                       1, 2, 1, 2, 3, 4, 3, 4, 1, 2, 1, 2, 3, 4, 3, 4,
+                       5, 6, 5, 6, 7, -8, 7, -8, 5, 6, 5, 6, 7, -8, 7, -8};
+  std::vector<int> y_3{-7, -6, -7, -6, -8, -4, -5, -4, -7, -6, -7, -6, -8, -4, -5, -4,
+                       -3, -2, -3, -2, -3, 0, -1, 0, -3, -2, -3, -2, -3, 0, -1, 0,
+                       1, 2, 1, 2, 2, 4, 3, 4, 1, 2, 1, 2, 2, 4, 3, 4,
+                       5, 6, 5, 6, 4, -8, 7, -8, 5, 6, 5, 6, 4, -8, 7, -8};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                            DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                                DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<Int4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                                DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) {
  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
@ -2282,6 +2698,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
+                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> x{0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3, 0, -4, 7, 3,
+                       -8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7, -8, -20, 21, 7,
+                       16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11, 16, 36, -35, -11,
+                       24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15, 24, 52, -49, -15};
+  std::vector<int> y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+                       4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+                       8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+                       12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
+  std::vector<int> y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+                       4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+                       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                       12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                                 DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                                 DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) {
  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2334,6 +2778,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_FirstAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
+                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9, 2, 0, 1, 9,
+                              13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6, 13, 5, 11, 6};
+  std::vector<float> x{4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6, 4, -4, 3.5, -6,
+                       -4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2, -4, -20, 17.5, -2,
+                       -10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5, -10, 16, 3.5, -5,
+                       -2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9, -2, 32, -10.5, -9};
+  std::vector<int> y_2{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+                       4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+                       8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11,
+                       12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
+  std::vector<int> y_3{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+                       4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7,
+                       7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4, 7, 0, 2, 4,
+                       12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 2, x, y_scale, zero_point, y_2,
+                                                                 DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({4, 8, 2}, 0, 3, x, y_scale, zero_point, y_3,
+                                                                 DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) {
  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0, -2.0, -4.0, 3.5, 1.0,
                             2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0, 2.0, 4.0, -3.5, -1.0};
@ -2386,6 +2858,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_MiddleAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
+                             -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> x{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
+                       0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
+                       0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15,
+                       0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15};
+  std::vector<int> y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<int> y_3{0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
+                       0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
+                       0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15,
+                       0, 1, 2, 3, 0, 0, 6, 7, 8, 9, 10, 11, 0, 0, 14, 15};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
+                                                                 DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
+                                                                 DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) {
  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
                             -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -2438,6 +2938,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_MiddleAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
+                             -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
+  std::vector<int> zero_point{2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6,
+                              2, 0, 1, 9, 13, 5, 11, 6, 2, 0, 1, 9, 13, 5, 11, 6};
+  std::vector<float> x{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
+                       4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
+                       4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9,
+                       4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9};
+  std::vector<int> y_2{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+                       0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  std::vector<int> y_3{0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
+                       0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
+                       0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15,
+                       0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 11, 3, 14, 15};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 2, x, y_scale, zero_point, y_2,
+                                                                 DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({8, 4, 2}, 1, 3, x, y_scale, zero_point, y_3,
+                                                                 DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) {
  std::vector<float> y_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0,
                             -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0, -2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0};
@ -2490,6 +3018,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_NoZeroPoint_LastAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
+                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
+  std::vector<int> zero_point{0, 0, 0, 0, 0, 0, 0, 0,
+                              0, 0, 0, 0, 0, 0, 0, 0};
+  std::vector<float> x{0, -2, 0, -2, -8, -12, -8, -12, 0, -2, 0, -2, -8, -12, -8, -12,
+                       14, 17.5, 14, 17.5, 6, 7, 6, 7, 14, 17.5, 14, 17.5, 6, 7, 6, 7,
+                       16, 18, 16, 18, 40, 44, 40, 44, 16, 18, 16, 18, 40, 44, 40, 44,
+                       -42, -45.5, -42, -45.5, -14, -15, -14, -15, -42, -45.5, -42, -45.5, -14, -15, -14, -15};
+  std::vector<int> y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
+                       4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
+                       8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11,
+                       12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15};
+  std::vector<int> y_3{0, 1, 0, 1, 4, 3, 2, 3, 0, 1, 0, 1, 4, 3, 2, 3,
+                       4, 5, 4, 5, 2, 7, 6, 7, 4, 5, 4, 5, 2, 7, 6, 7,
+                       8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11,
+                       12, 13, 12, 13, 4, 15, 14, 15, 12, 13, 12, 13, 4, 15, 14, 15};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                                 DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                                 DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) {
  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
@ -2542,6 +3098,34 @@ TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis) {
  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3);
 }

+TEST(QuantizeLinearOp21BlockedTest, UnsignedInt4_UseZeroPoint_LastAxis_Cuda) {
+  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
+                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};
+  std::vector<int> zero_point{2, 0, 2, 0, 1, 9, 1, 9,
+                              13, 5, 13, 5, 11, 6, 11, 6};
+  std::vector<float> x{4, 2, 4, 2, -8, -12, -8, -12, 4, 2, 4, 2, -8, -12, -8, -12,
+                       10.5, 14, 10.5, 14, -3, -2, -3, -2, 10.5, 14, 10.5, 14, -3, -2, -3, -2,
+                       -10, -8, -10, -8, 20, 24, 20, 24, -10, -8, -10, -8, 20, 24, 20, 24,
+                       -3.5, -7, -3.5, -7, -8, -9, -8, -9, -3.5, -7, -3.5, -7, -8, -9, -8, -9};
+  std::vector<int> y_2{0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3,
+                       4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7,
+                       8, 9, 8, 9, 10, 11, 10, 11, 8, 9, 8, 9, 10, 11, 10, 11,
+                       12, 13, 12, 13, 14, 15, 14, 15, 12, 13, 12, 13, 14, 15, 14, 15};
+  std::vector<int> y_3{0, 1, 0, 1, 6, 3, 2, 3, 0, 1, 0, 1, 6, 3, 2, 3,
+                       4, 5, 4, 5, 0, 7, 6, 7, 4, 5, 4, 5, 0, 7, 6, 7,
+                       8, 9, 8, 9, 15, 11, 10, 11, 8, 9, 8, 9, 15, 11, 10, 11,
+                       12, 13, 12, 13, 13, 15, 14, 15, 12, 13, 12, 13, 13, 15, 14, 15};
+
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, float>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                             DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 4, x, y_scale, zero_point, y_2,
+                                                                 DefaultCudaExecutionProvider());
+  QuantizeLinearOp21BlockedTest_Int4_Succeed<UInt4x2, MLFloat16>({2, 4, 8}, 2, 5, x, y_scale, zero_point, y_3,
+                                                                 DefaultCudaExecutionProvider());
+}
+
 TEST(QuantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) {
  std::vector<float> y_scale{-2.0, -4.0, -2.0, -4.0, 3.5, 1.0, 3.5, 1.0,
                             2.0, 4.0, 2.0, 4.0, -3.5, -1.0, -3.5, -1.0};