Align Transformer model CUDA EP with CPU on corner cases (#9889)

* align with cpu on no input data

* review comments and add tests

Co-authored-by: Ubuntu <wy@linux-v100.aidmrjtolptuzevavgwhrapqcd.jx.internal.cloudapp.net>
This commit is contained in:
Ye Wang 2022-02-03 12:58:49 -08:00 committed by GitHub
parent 63198a6566
commit bb09acffed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 59 additions and 15 deletions

View file

@ -44,6 +44,9 @@ Status FastGelu<T>::ComputeInternal(OpKernelContext* context) const {
Tensor* output = context->Output(0, input->Shape());
int64_t input_length = input->Shape().Size();
if (input_length == 0) {
return Status::OK();
}
int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size();
typedef typename ToCudaType<T>::MappedType CudaT;

View file

@ -41,12 +41,13 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {
Tensor* output = ctx->Output(0, input->Shape());
if (input->SizeInBytes() == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'input' has no data from upstream nodes");
if (input->Shape() != skip->Shape()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"skip is expected to have same shape as input");
}
if (skip->SizeInBytes() == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'skip' has no data from upstream nodes");
if (input->Shape().Size() == 0) {
return Status::OK();
}
const auto& input_dims = input->Shape().GetDims();
@ -55,11 +56,6 @@ Status SkipLayerNorm<T>::ComputeInternal(OpKernelContext* ctx) const {
"input is expected to have 3 dimensions, got ", input_dims.size());
}
if (input->Shape() != skip->Shape()) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
"skip is expected to have same shape as input");
}
const auto& gamma_dims = gamma->Shape().GetDims();
if (gamma_dims.size() != 1) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,

View file

@ -59,12 +59,6 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
auto bias_data = (simplified || (nullptr == bias)) ? nullptr : reinterpret_cast<const CudaT*>(bias->template Data<T>());
const TensorShape& x_shape = X->Shape();
// Sometimes, due to a conversion issue, the input 'X' has no data, which is a case that the CUDA kernel cannot handle.
// Provide more error information here instead of CUDA errors.
if (X->SizeInBytes() == 0) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Inputs 'X' has no data from upstream nodes");
}
const int64_t axis = HandleNegativeAxis(axis_, x_shape.NumDimensions());
int n1 = gsl::narrow<int>(x_shape.SizeToDimension(axis));
@ -101,6 +95,10 @@ Status LayerNorm<T, U, simplified>::ComputeInternal(OpKernelContext* ctx) const
inv_var_data = reinterpret_cast<CudaU*>(var->template MutableData<U>());
}
if (x_shape.Size() == 0) {
return Status::OK();
}
HostApplyLayerNorm<CudaT, CudaU, simplified>(GetDeviceProp(), Stream(), Y_data, mean_data, inv_var_data, X_data, n1, n2, epsilon_, scale_data, bias_data);
return Status::OK();
}

View file

@ -110,6 +110,19 @@ static void RunFastGeluTest(
RunFastGeluTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, has_bias);
}
// Corner case: FastGelu is driven with an empty input vector and a
// sequence_length of 0, while a 4-element bias is still supplied.
// NOTE(review): presumably RunFastGeluTest derives the tensor shapes from
// batch_size / sequence_length / hidden_size -- confirm against the helper.
TEST(FastGeluTest, FastGeluWithNullInput) {
  // No input values at all.
  std::vector<float> input_data;
  // One bias value per hidden unit (hidden_size == 4).
  std::vector<float> bias_data{-0.5f, 0.6f, 1.2f, 2.1f};

  int batch_size{1};
  int sequence_length{0};
  int hidden_size{4};

  RunFastGeluTest(input_data, bias_data, batch_size, sequence_length, hidden_size);
}
TEST(FastGeluTest, FastGeluWithBiasFloat32) {
int batch_size = 1;
int sequence_length = 2;

View file

@ -80,6 +80,11 @@ static void TestLayerNorm(const std::vector<int64_t>& x_dims,
#endif
}
// Corner case: LayerNorm is run on an input whose leading dimension is 0,
// so the tensor described by the dims holds no elements.
TEST(CudaKernelTest, LayerNorm_NullInput) {
  const std::vector<int64_t> empty_x_dims = {0, 20, 128};
  TestLayerNorm(empty_x_dims, LAYER_NORM_OP, k_epsilon_default);
}
TEST(CudaKernelTest, LayerNorm_SmallSizeTensor) {
const std::vector<int64_t> X_dims{4, 20, 128};
TestLayerNorm(X_dims, LAYER_NORM_OP, k_epsilon_default);

View file

@ -83,6 +83,35 @@ static void RunTest(
}
}
// Corner case: SkipLayerNorm is driven with empty 'input' and 'skip' data and
// a sequence_length of 0; gamma and beta still carry hidden_size (4) values,
// and the expected output is empty as well.
TEST(SkipLayerNormTest, SkipLayerNormNullInput) {
  int batch_size{1};
  int sequence_length{0};
  int hidden_size{4};

  // Empty data for the input, the skip connection, and the expected output.
  std::vector<float> input_data;
  std::vector<float> skip_data;
  std::vector<float> output_data;

  // Per-hidden-unit parameters.
  std::vector<float> gamma_data{0.3f, 0.2f, 4.0f, 2.2f};
  std::vector<float> beta_data{0.2f, 0.1f, 0.4f, 1.6f};

  RunTest(input_data, skip_data, gamma_data, beta_data,
          std::vector<float>(),  // fifth argument is intentionally left empty
          output_data, epsilon_, batch_size, sequence_length, hidden_size);
}
TEST(SkipLayerNormTest, SkipLayerNormBatch1) {
int batch_size = 1;
int sequence_length = 2;