Transformer model CUDA EP align with CPU on corner case (#9889)

* align with cpu on no input data * review comments and add tests Co-authored-by: Ubuntu <wy@linux-v100.aidmrjtolptuzevavgwhrapqcd.jx.internal.cloudapp.net>
2026-07-08 17:17:15 +00:00 · 2022-02-03 12:58:49 -08:00 · 2022-02-03 12:58:49 -08:00 · bb09acffed
commit bb09acffed
parent 63198a6566
6 changed files with 59 additions and 15 deletions
--- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc
@ -44,6 +44,9 @@ Status FastGelu<T>::ComputeInternal(OpKernelContext* context) const {
  Tensor* output = context->Output(0, input->Shape());

  int64_t input_length = input->Shape().Size();
+  if (input_length == 0) {
+    return Status::OK();
+  }
  int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size();
  typedef typename ToCudaType<T>::MappedType CudaT;

--- a/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/skip_layer_norm.cc
@ -41,12 +41,13 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {

  Tensor* output = ctx->Output(0, input->Shape());

-  if (input->SizeInBytes() == 0) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'input' has no data from upstream nodes");
+  if (input->Shape() != skip->Shape()) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "skip is expected to have same shape as input");
  }

-  if (skip->SizeInBytes() == 0) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'skip' has no data from upstream nodes");
+  if (input->Shape().Size() == 0) {
+    return Status::OK();
  }

  const auto& input_dims = input->Shape().GetDims();
@ -55,11 +56,6 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {
                           "input is expected to have 3 dimensions, got ", input_dims.size());
  }

-  if (input->Shape() != skip->Shape()) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                           "skip is expected to have same shape as input");
-  }
-
  const auto& gamma_dims = gamma->Shape().GetDims();
  if (gamma_dims.size() != 1) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
--- a/onnxruntime/contrib_ops/cuda/layer_norm.cc
+++ b/onnxruntime/contrib_ops/cuda/layer_norm.cc
@ -59,12 +59,6 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
  auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaT*>(bias->template Data<T>());

  const TensorShape& x_shape = X->Shape();
-  // Sometimes due to conversion issue, the input 'X' has no data which is a case that cuda kernel cannot handle.
-  // Provide more error infomation here instead of CUDA errors.
-  if (X->SizeInBytes() == 0) {
-    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'X' has no data from upstream nodes");
-  }
-
  const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());

  int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
@ -101,6 +95,10 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
    inv_var_data = reinterpret_cast<CudaU*>(var->template MutableData<U>());
  }

+  if (x_shape.Size() == 0) {
+    return Status::OK();
+  }
+
  HostApplyLayerNorm<CudaT, CudaU, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data);
  return Status::OK();
 }
--- a/onnxruntime/test/contrib_ops/fastgelu_op_test.cc
+++ b/onnxruntime/test/contrib_ops/fastgelu_op_test.cc
@ -110,6 +110,19 @@ static void RunFastGeluTest(
  RunFastGeluTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, has_bias);
 }

+TEST(FastGeluTest, FastGeluWithNullInput) {
+  int batch_size = 1;
+  int sequence_length = 0;
+  int hidden_size = 4;
+
+  std::vector<float> input_data = {};
+
+  std::vector<float> bias_data = {
+      -0.5f, 0.6f, 1.2f, 2.1f};
+
+  RunFastGeluTest(input_data, bias_data, batch_size, sequence_length, hidden_size);
+}
+
 TEST(FastGeluTest, FastGeluWithBiasFloat32) {
  int batch_size = 1;
  int sequence_length = 2;
--- a/onnxruntime/test/contrib_ops/layer_norm_test.cc
+++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc
@ -80,6 +80,11 @@ static void TestLayerNorm(const std::vector<int64_t>& x_dims,
 #endif
 }

+TEST(CudaKernelTest, LayerNorm_NullInput) {
+  const std::vector<int64_t> X_dims{0, 20, 128};
+  TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default);
+}
+
 TEST(CudaKernelTest, LayerNorm_SmallSizeTensor) {
  const std::vector<int64_t> X_dims{4, 20, 128};
  TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default);
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@ -83,6 +83,35 @@ static void RunTest(
  }
 }

+TEST(SkipLayerNormTest, SkipLayerNormNullInput) {
+  int batch_size = 1;
+  int sequence_length = 0;
+  int hidden_size = 4;
+
+  std::vector<float> input_data = {};
+
+  std::vector<float> skip_data = {};
+
+  std::vector<float> gamma_data = {
+      0.3f, 0.2f, 4.0f, 2.2f};
+
+  std::vector<float> beta_data = {
+      0.2f, 0.1f, 0.4f, 1.6f};
+
+  std::vector<float> output_data = {};
+
+  RunTest(input_data,
+          skip_data,
+          gamma_data,
+          beta_data,
+          std::vector<float>(),
+          output_data,
+          epsilon_,
+          batch_size,
+          sequence_length,
+          hidden_size);
+}
+
 TEST(SkipLayerNormTest, SkipLayerNormBatch1) {
  int batch_size = 1;
  int sequence_length = 2;