[ROCm] Bugfix of BFloat16-float conversion and Add FastGelu Kernel for AMD (#10557)

* bf16 bugfix on amd

* enable fastgelu ut on amd
This commit is contained in:
Vincent Wang 2022-02-16 11:11:08 +08:00 committed by GitHub
parent f22cd3af5d
commit ceb1e2b1a6
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 86 additions and 37 deletions

View file

@ -50,6 +50,20 @@ struct BFloat16 {
inline ORT_HOST_DEVICE BFloat16(float v) {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
val = __bfloat16_as_ushort(__float2bfloat16(v));
#elif defined(USE_ROCM)
// We should be using memcpy in order to respect the strict aliasing rule but it fails in the HIP environment.
if (v != v) { // isnan
val = UINT16_C(0x7FC0);
} else {
union {
uint32_t U32;
float F32;
};
F32 = v;
uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF);
val = static_cast<uint16_t>((U32 + rounding_bias) >> 16);
}
#else
ORT_IF_CONSTEXPR(endian::native == endian::little) {
std::memcpy(&val, reinterpret_cast<char*>(&v) + sizeof(uint16_t), sizeof(uint16_t));
@ -63,6 +77,14 @@ struct BFloat16 {
inline ORT_HOST_DEVICE float ToFloat() const {
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11000
return __bfloat162float(*reinterpret_cast<const __nv_bfloat16*>(&val));
#elif defined(USE_ROCM)
// We should be using memcpy in order to respect the strict aliasing rule but it fails in the HIP environment.
float result = 0;
uint32_t tmp = val;
tmp <<= 16;
float* tempRes = reinterpret_cast<float*>(&tmp);
result = *tempRes;
return result;
#else
float result;
char* const first = reinterpret_cast<char*>(&result);

View file

@ -113,6 +113,16 @@ bool LaunchFastGeluKernel(const hipDeviceProp_t& prop, hipStream_t stream, int i
return HIP_CALL(hipPeekAtLastError());
}
template <>
bool LaunchFastGeluKernel(const hipDeviceProp_t& prop, hipStream_t stream, int input_length, int bias_length,
const BFloat16* input, const BFloat16* bias, BFloat16* output, bool /*use_half2*/) {
constexpr int blockSize = 256;
const int gridSize = (input_length + blockSize - 1) / blockSize;
hipLaunchKernelGGL(HIP_KERNEL_NAME(FastGeluKernel<BFloat16, blockSize>), dim3(gridSize), dim3(blockSize), 0, stream,
A, B, C, input_length, bias_length, input, bias, output);
return HIP_CALL(hipPeekAtLastError());
}
} // namespace rocm
} // namespace contrib
} // namespace onnxruntime

View file

@ -80,7 +80,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_int8_t, QAttention);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16_int8_t, QAttention);
// class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, TransposeMatMul); // backward compatibility
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kOnnxDomain, 1, BFloat16_float, LayerNormalization);
@ -166,7 +166,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) {
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_int8_t, QAttention)>,
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16_int8_t, QAttention)>
// BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu)>,
// TransposedMatMul is still here for backward compatibility
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, TransposeMatMul)>, // backward compatibility
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul)>,

View file

@ -40,41 +40,51 @@ const std::vector<float> GetExpectedResult(const std::vector<float>& input_data,
return ComputeGelu(add_bias_data);
}
static void RunFastGeluTest(
const std::vector<float>& input_data,
const std::vector<float>& bias_data,
const std::vector<float>& output_data,
const std::vector<int64_t>& input_dims,
const std::vector<int64_t>& bias_dims,
const std::vector<int64_t>& output_dims,
bool has_bias = true,
bool use_float16 = false) {
#if defined(USE_CUDA) || defined(USE_ROCM)
static void RunFastGeluGpuTest(const std::vector<float>& input_data, const std::vector<float>& bias_data,
const std::vector<float>& output_data, const std::vector<int64_t>& input_dims,
const std::vector<int64_t>& bias_dims, const std::vector<int64_t>& output_dims,
bool has_bias = true, bool use_float16 = false) {
#ifdef USE_CUDA
// Test CUDA operator.
int min_cuda_architecture = use_float16 ? 530 : 0;
if (HasCudaEnvironment(min_cuda_architecture)) {
OpTester tester("FastGelu", 1, onnxruntime::kMSDomain);
if (!HasCudaEnvironment(min_cuda_architecture)) {
LOGS_DEFAULT(WARNING) << "Hardware NOT support FP16";
return;
}
#endif
OpTester tester("FastGelu", 1, onnxruntime::kMSDomain);
if (use_float16) {
tester.AddInput<MLFloat16>("X", input_dims, ToFloat16(input_data));
if (has_bias) {
tester.AddInput<MLFloat16>("bias", bias_dims, ToFloat16(bias_data));
}
tester.AddOutput<MLFloat16>("Y", output_dims, ToFloat16(output_data));
} else {
tester.AddInput<float>("X", input_dims, input_data);
if (has_bias) {
tester.AddInput<float>("bias", bias_dims, bias_data);
}
tester.AddOutput<float>("Y", output_dims, output_data);
if (use_float16) {
tester.AddInput<MLFloat16>("X", input_dims, ToFloat16(input_data));
if (has_bias) {
tester.AddInput<MLFloat16>("bias", bias_dims, ToFloat16(bias_data));
}
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
tester.AddOutput<MLFloat16>("Y", output_dims, ToFloat16(output_data));
} else {
tester.AddInput<float>("X", input_dims, input_data);
if (has_bias) {
tester.AddInput<float>("bias", bias_dims, bias_data);
}
tester.AddOutput<float>("Y", output_dims, output_data);
}
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
#ifdef USE_CUDA
execution_providers.push_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
execution_providers.push_back(DefaultRocmExecutionProvider());
#endif
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
}
#endif
static void RunFastGeluCpuTest(const std::vector<float>& input_data, const std::vector<float>& bias_data,
const std::vector<float>& output_data, const std::vector<int64_t>& input_dims,
const std::vector<int64_t>& bias_dims, const std::vector<int64_t>& output_dims,
bool has_bias = true) {
// Test CPU operator: only float32 is implemented for FastGelu CPU.
if (nullptr != DefaultCpuExecutionProvider().get() && !use_float16) {
if (nullptr != DefaultCpuExecutionProvider().get()) {
OpTester tester("FastGelu", 1, onnxruntime::kMSDomain);
tester.AddInput<float>("X", input_dims, input_data);
@ -107,7 +117,10 @@ static void RunFastGeluTest(
std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
std::vector<int64_t> bias_dims = {hidden_size};
std::vector<int64_t> output_dims = input_dims;
RunFastGeluTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, has_bias);
#if defined(USE_CUDA) || defined(USE_ROCM)
RunFastGeluGpuTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, has_bias);
#endif
RunFastGeluCpuTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, has_bias);
}
TEST(FastGeluTest, FastGeluWithNullInput) {
@ -152,6 +165,8 @@ TEST(FastGeluTest, FastGeluWithoutBiasFloat32) {
RunFastGeluTest(input_data, bias_data, batch_size, sequence_length, hidden_size);
}
// CUDA and ROCm only for Float16 and BFloat16 type.
#if defined(USE_CUDA) || defined(USE_ROCM)
TEST(FastGeluTest, FastGeluWithBiasFloat16) {
int batch_size = 1;
int sequence_length = 2;
@ -172,7 +187,7 @@ TEST(FastGeluTest, FastGeluWithBiasFloat16) {
std::vector<int64_t> bias_dims = {hidden_size};
std::vector<int64_t> output_dims = input_dims;
RunFastGeluTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, true, true);
RunFastGeluGpuTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, true, true);
}
TEST(FastGeluTest, FastGeluWithoutBiasFloat16) {
@ -194,17 +209,17 @@ TEST(FastGeluTest, FastGeluWithoutBiasFloat16) {
std::vector<int64_t> bias_dims = {};
std::vector<int64_t> output_dims = input_dims;
RunFastGeluTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, false, true);
RunFastGeluGpuTest(input_data, bias_data, output_data, input_dims, bias_dims, output_dims, false, true);
}
// CUDA only, ROCM has not been supported yet
#ifdef USE_CUDA
TEST(FastGeluTest, FastGeluWithBias_BFloat16) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;
if (!HasCudaEnvironment(min_cuda_architecture)) {
LOGS_DEFAULT(WARNING) << "Hardware NOT support BFP16";
return;
}
#endif
OpTester tester("FastGelu", 1, onnxruntime::kMSDomain);
int batch_size = 1;
@ -235,12 +250,14 @@ TEST(FastGeluTest, FastGeluWithBias_BFloat16) {
tester.AddOutput<BFloat16>("Y", output_dims, f_Y);
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
#ifdef USE_CUDA
execution_providers.push_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
execution_providers.push_back(DefaultRocmExecutionProvider());
#endif
tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
}
#endif
} // namespace test
} // namespace onnxruntime