diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index 97ad1ca3a5..2ae593ec96 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -995,7 +995,7 @@ Return Value: if (LogSoftmax) { dispatch->LogSoftmax_Fp16(Input, Output, D, NegativeMaximum, MLAS_FP16(std::log(accumulation_fp32))); } else { - dispatch->Softmax_Fp16(Output, Output, D, MLAS_FP16(1.0f / accumulation_fp32)); + dispatch->Softmax_Fp16(Output, Output, D, MLAS_FP16(accumulation_fp32)); } Input += D; diff --git a/onnxruntime/core/mlas/lib/softmax.h b/onnxruntime/core/mlas/lib/softmax.h index 04bd1f990b..69fe1ae86d 100644 --- a/onnxruntime/core/mlas/lib/softmax.h +++ b/onnxruntime/core/mlas/lib/softmax.h @@ -94,17 +94,17 @@ struct MLAS_SOFTMAX_DISPATCH { SumExp_Fp16_Fn* SumExp_Fp16 = nullptr; /** - * @brief Compute the softmax output for each element of the input array. input * scale. + * @brief Compute the softmax output for each element of the input array. input / sum. * @param Input Address of the input array. Values of exp(x) * @param Output Address of the output array. Could be the same as the input array. 
* @param N Number of elements in the input array - * @param scale The scale factor to apply to the output + * @param Sum Sum of exp(input) */ typedef void(Softmax_Fp16_Fn)( const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, - const MLAS_FP16 scale + const MLAS_FP16 Sum ); Softmax_Fp16_Fn* Softmax_Fp16 = nullptr; diff --git a/onnxruntime/core/mlas/lib/softmax_kernel_neon.h b/onnxruntime/core/mlas/lib/softmax_kernel_neon.h index 53207a0448..e362e5d4cc 100644 --- a/onnxruntime/core/mlas/lib/softmax_kernel_neon.h +++ b/onnxruntime/core/mlas/lib/softmax_kernel_neon.h @@ -33,7 +33,7 @@ MLAS_FP16 ReduceMax_Kernel_Fp16(const MLAS_FP16* Input, size_t N); MLAS_FP16 SumExp_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 NegativeMaximum); -void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 scale); +void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 Sum); void LogSoftmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 NegativeMaximum, const MLAS_FP16 LogSum); diff --git a/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp index 876128b48c..819b7fe941 100644 --- a/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp +++ b/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp @@ -747,11 +747,13 @@ MLAS_FP16 ReduceMax_Kernel_Fp16(const MLAS_FP16* Input, size_t N) { return MLAS_FP16::FromBits(result); } -void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 scale) { +void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 Sum) { const auto* input = reinterpret_cast(Input); auto* output = reinterpret_cast<_mlas_fp16_*>(Output); - auto scale8 = MlasBroadcastFloat16x8(scale.val); - auto scale4 = MlasBroadcastFloat16x4(scale.val); + auto sum8 = MlasBroadcastFloat16x8(Sum.val); + auto sum4 = 
MlasBroadcastFloat16x4(Sum.val); + auto scale8 = MlasDivide(MlasBroadcastFloat16x8((_mlas_fp16_)0x3c00), sum8); + auto scale4 = MlasDivide(MlasBroadcastFloat16x4((_mlas_fp16_)0x3c00), sum4); while (N >= 32) { auto v0 = MlasLoadFloat16x8(input); diff --git a/onnxruntime/test/mlas/unittest/test_exp.cpp b/onnxruntime/test/mlas/unittest/test_exp.cpp deleted file mode 100644 index e69dd4376c..0000000000 --- a/onnxruntime/test/mlas/unittest/test_exp.cpp +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "test_util.h" - -class MlasComputeExpTest : public MlasTestBase { - private: - MatrixGuardBuffer BufferInput; - MatrixGuardBuffer BufferOutput; - MatrixGuardBuffer BufferOutputReference; - - void Test(size_t N, float MinimumValue, float MaximumValue) { - float* Input = BufferInput.GetBuffer(N); - float* Output = BufferOutput.GetBuffer(N); - float* OutputReference = BufferOutputReference.GetBuffer(N); - - std::default_random_engine generator(static_cast(N)); - std::uniform_real_distribution distribution(MinimumValue, MaximumValue); - - for (size_t n = 0; n < N; n++) { - Input[n] = distribution(generator); - } - - for (size_t n = 0; n < N; n++) { - OutputReference[n] = std::exp(Input[n]); - } - - MlasComputeExp(Input, Output, N); - - constexpr float AbsoluteTolerance = 1e-6f; - constexpr float RelativeTolerance = 1e-6f; - - for (size_t n = 0; n < N; n++) { - float diff = std::fabs(Output[n] - OutputReference[n]); - ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(OutputReference[n]) * RelativeTolerance) - << " @" << n << " of " << N << ", got: " << Output[n] << ", expecting: " << OutputReference[n]; - } - } - - public: - static const char* GetTestSuiteName() { - static const std::string suite_name("Exp"); - return suite_name.c_str(); - } - - void ExecuteShort(void) override { - for (size_t n = 1; n < 128; n++) { - Test(n, -10.f, 10.f); - } - } -}; - -static UNUSED_VARIABLE 
bool added_to_main = AddTestRegister([](bool is_short_execute) { - // no long execute needed - if (is_short_execute) { - return MlasDirectShortExecuteTests::RegisterShortExecute(); - } - return 0ul; -}); diff --git a/onnxruntime/test/mlas/unittest/test_softcap.cpp b/onnxruntime/test/mlas/unittest/test_softcap.cpp new file mode 100644 index 0000000000..9c40e3c381 --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_softcap.cpp @@ -0,0 +1,113 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "test_util.h" +#include "core/mlas/lib/mlasi.h" +#include "core/mlas/lib/softmax.h" + +class MlasComputeTanhTest : public MlasTestBase { /* Compares fp16 MlasComputeTanh with std::tanh; the check is compiled only when MLAS_F16VEC_INTRINSICS_SUPPORTED and MLAS_TARGET_ARM64 are both defined. */ +private: + MatrixGuardBuffer BufferInputFp16; + MatrixGuardBuffer BufferOutputFp16; + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + void TestFp16(size_t N, float MinimumValue, float MaximumValue) { + MLAS_FP16* Input = BufferInputFp16.GetBuffer(N); + MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N); + + std::default_random_engine generator(static_cast(N)); /* seeded from N, so the inputs for a given length are fixed by the seed */ + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + for (size_t n = 0; n < N; n++) { + Input[n] = MLAS_FP16(distribution(generator)); + } + + MlasComputeTanh(Input, Output, N); + + constexpr float AbsoluteTolerance = 5e-3f; + constexpr float RelativeTolerance = 5e-3f; /* an element passes when it is within either the absolute or the relative bound */ + + for (size_t n = 0; n < N; n++) { + float in = Input[n].ToFloat(); + float ref = std::tanh(in); + float out = Output[n].ToFloat(); + float diff = std::fabs(out - ref); + ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance) + << " @ " << in << ", got: " << out << ", expecting: " << ref + << ", diff: " << diff << ", r-diff: " << diff / std::fabs(ref); + } + } +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + +public: + static const char* GetTestSuiteName() { + static const std::string suite_name("Tanh"); + return suite_name.c_str(); + } 
+ + void ExecuteShort(void) override { + for (size_t n = 1; n < 128; n++) { +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + TestFp16(n, -3.51562f, 3.51562f); +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + } + } +}; + +class MlasComputeSoftcapTest : public MlasTestBase { /* Compares fp16 MlasComputeSoftcap with the float reference tanh(in / cap) * cap. */ +private: + MatrixGuardBuffer BufferInputFp16; + MatrixGuardBuffer BufferOutputFp16; + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + void TestFp16(size_t N, float MinimumValue, float MaximumValue, float cap) { + MLAS_FP16* Input = BufferInputFp16.GetBuffer(N); + MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N); + + std::default_random_engine generator(static_cast(N)); + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + for (size_t n = 0; n < N; n++) { + Input[n] = MLAS_FP16(distribution(generator)); + } + + MlasComputeSoftcap(Input, Output, N, MLAS_FP16(cap)); + + constexpr float AbsoluteTolerance = 5e-3f; + constexpr float RelativeTolerance = 5e-3f; + + for (size_t n = 0; n < N; n++) { + float in = Input[n].ToFloat(); + float ref = std::tanh(in/cap) * cap; + float out = Output[n].ToFloat(); + float diff = std::fabs(out - ref); + ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance) + << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff " << diff / std::fabs(ref); + } + } +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + +public: + static const char* GetTestSuiteName() { + static const std::string suite_name("Softcap"); + return suite_name.c_str(); + } + + void ExecuteShort(void) override { + for (size_t n = 1; n < 128; n++) { +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + TestFp16(n, -10.f, 10.f, 3.2f); /* inputs drawn from [-10, 10), cap 3.2 */ +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + } + } +}; + + +static UNUSED_VARIABLE bool added_to_main = 
AddTestRegister([](bool is_short_execute) { + size_t count = 0; + if (is_short_execute) { + count += MlasDirectShortExecuteTests::RegisterShortExecute(); + count += MlasDirectShortExecuteTests::RegisterShortExecute(); + } + return count; +}); diff --git a/onnxruntime/test/mlas/unittest/test_softmax.cpp b/onnxruntime/test/mlas/unittest/test_softmax.cpp index fb4ebbee77..e0820517d4 100644 --- a/onnxruntime/test/mlas/unittest/test_softmax.cpp +++ b/onnxruntime/test/mlas/unittest/test_softmax.cpp @@ -2,6 +2,126 @@ // Licensed under the MIT License. #include "test_util.h" +#include "core/mlas/lib/mlasi.h" +#include "core/mlas/lib/softmax.h" + +class MlasComputeExpTest : public MlasTestBase { /* Compares MlasComputeExp (float, and fp16 where built) and the fp16 SumExp dispatch kernel with std::exp. */ + private: + MatrixGuardBuffer BufferInput; + MatrixGuardBuffer BufferOutput; + MatrixGuardBuffer BufferOutputReference; + MatrixGuardBuffer BufferInputFp16; + MatrixGuardBuffer BufferOutputFp16; + + void Test(size_t N, float MinimumValue, float MaximumValue) { + float* Input = BufferInput.GetBuffer(N); + float* Output = BufferOutput.GetBuffer(N); + float* OutputReference = BufferOutputReference.GetBuffer(N); + + std::default_random_engine generator(static_cast(N)); + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + for (size_t n = 0; n < N; n++) { + Input[n] = distribution(generator); + } + + for (size_t n = 0; n < N; n++) { + OutputReference[n] = std::exp(Input[n]); + } + + MlasComputeExp(Input, Output, N); + + constexpr float AbsoluteTolerance = 1e-6f; + constexpr float RelativeTolerance = 1e-6f; + + for (size_t n = 0; n < N; n++) { + float diff = std::fabs(Output[n] - OutputReference[n]); + ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(OutputReference[n]) * RelativeTolerance) + << " @" << n << " of " << N << ", got: " << Output[n] << ", expecting: " << OutputReference[n]; + } + } + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + + void TestFp16(size_t N, float MinimumValue, float MaximumValue) { + MLAS_FP16* 
Input = BufferInputFp16.GetBuffer(N); + MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N); + + std::default_random_engine generator(N); + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + for (size_t n = 0; n < N; n++) { + Input[n] = MLAS_FP16(distribution(generator)); + } + + MlasComputeExp(Input, Output, N); + + constexpr float AbsoluteTolerance = 5e-4f; + constexpr float RelativeTolerance = 1e-3f; + + for (size_t n = 0; n < N; n++) { + float in = Input[n].ToFloat(); + float ref = std::exp(in); + float out = Output[n].ToFloat(); + float diff = std::fabs(out - ref); + ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance) + << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff: " << diff / std::fabs(ref); + } + } + + void TestSumFp16(size_t N, float MinimumValue, float MaximumValue) { /* Checks SumExp_Fp16: every output against exp(in minus max_val), and the returned value against the sum of those references. */ + MLAS_FP16* Input = BufferInputFp16.GetBuffer(N); + MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N); + + std::default_random_engine generator(N); + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + float max_val = std::numeric_limits<float>::lowest(); /* lowest(), not min(): min() is the smallest positive float and would win over an all-negative input set, leaving max_val wrong */ + for (size_t n = 0; n < N; n++) { + Input[n] = MLAS_FP16(distribution(generator)); + max_val = std::max(max_val, Input[n].ToFloat()); + } + + const auto* dispatch = GetMlasPlatform().SoftmaxDispatch; + auto sum = dispatch->SumExp_Fp16(Input, Output, N, MLAS_FP16(-max_val)); + + constexpr float AbsoluteTolerance = 5e-4f; + constexpr float RelativeTolerance = 1e-3f; + + float sum_ref = 0.0f; + for (size_t n = 0; n < N; n++) { + float in = Input[n].ToFloat(); + float ref = std::exp(in - max_val); + sum_ref += ref; + float out = Output[n].ToFloat(); + float diff = std::fabs(out - ref); + ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance) + << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff: " << diff / std::fabs(ref); + } + + float diff = std::fabs(sum.ToFloat() - sum_ref); + 
ASSERT_TRUE(diff <= 1e-3f || diff <= std::fabs(sum_ref) * 5e-3f) + << " sum: " << sum.ToFloat() << ", expecting: " << sum_ref << ", r-diff: " << diff / std::fabs(sum_ref); + } + +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + + public: + static const char* GetTestSuiteName() { + static const std::string suite_name("Exp"); + return suite_name.c_str(); + } + + void ExecuteShort(void) override { + for (size_t n = 1; n < 128; n++) { + Test(n, -10.f, 10.f); +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + TestFp16(n, -17.f, 11.f); + TestSumFp16(n, -10.f, 10.f); +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + } + } +}; template class MlasSoftmaxTest : public MlasTestBase { @@ -9,6 +129,8 @@ class MlasSoftmaxTest : public MlasTestBase { MatrixGuardBuffer BufferInput; MatrixGuardBuffer BufferOutput; MatrixGuardBuffer BufferOutputReference; + MatrixGuardBuffer BufferInputFp16; + MatrixGuardBuffer BufferOutputFp16; MLAS_THREADPOOL* threadpool_; void Test(size_t N, size_t D, float MinimumValue, float MaximumValue) { @@ -44,6 +166,65 @@ class MlasSoftmaxTest : public MlasTestBase { } } +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + void TestReduceMaxFp16(size_t N, float MinimumValue, float MaximumValue) { /* Compares the ReduceMax_Fp16 dispatch kernel with a scalar std::max scan over the same fp16 inputs. */ + MLAS_FP16* Input = BufferInputFp16.GetBuffer(N); + + std::default_random_engine generator(static_cast(N)); + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + float ref = std::numeric_limits<float>::lowest(); + + for (size_t nd = 0; nd < N; nd++) { + Input[nd] = MLAS_FP16(distribution(generator)); + ref = std::max(ref, Input[nd].ToFloat()); + } + + const auto* dispatch = GetMlasPlatform().SoftmaxDispatch; + auto out = dispatch->ReduceMax_Fp16(Input, N).ToFloat(); + + constexpr float AbsoluteTolerance = 1e-3f; + constexpr float RelativeTolerance = 1e-3f; + + float diff = std::fabs(out - ref); + ASSERT_TRUE(diff <= 
AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance) + << "ReduceMaxFp16: " << N << ", got: " << out << ", expecting: " << ref + << ", diff: " << diff << ", r-diff: " << diff / std::fabs(ref); + } + + void TestFp16(size_t N, size_t D, float MinimumValue, float MaximumValue, bool LogSoftmax, bool SmoothSoftmax) { /* Runs fp16 MlasComputeSoftmax and compares it with ReferenceSoftmax evaluated in float on the same fp16-rounded inputs. */ + MLAS_FP16* Input = BufferInputFp16.GetBuffer(N * D); + MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N * D); + float* InputReference = BufferInput.GetBuffer(N * D); + float* OutputReference = BufferOutputReference.GetBuffer(N * D); + + std::default_random_engine generator(static_cast(N * D)); + std::uniform_real_distribution distribution(MinimumValue, MaximumValue); + + for (size_t nd = 0; nd < N * D; nd++) { + Input[nd] = MLAS_FP16(distribution(generator)); + InputReference[nd] = Input[nd].ToFloat(); + } + + MlasComputeSoftmax(Input, Output, N, D, LogSoftmax, SmoothSoftmax, threadpool_); + ReferenceSoftmax(InputReference, OutputReference, N, D, LogSoftmax, SmoothSoftmax); + + constexpr float AbsoluteTolerance = 5e-3f; + constexpr float RelativeTolerance = 5e-3f; + + for (size_t nd = 0; nd < N * D; nd++) { + float in = Input[nd].ToFloat(); + float ref = OutputReference[nd]; + float out = Output[nd].ToFloat(); + float diff = std::fabs(out - ref); + ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance) + << "LogSoftmax:" << LogSoftmax << ", SmoothSoftmax: " << SmoothSoftmax << ", input " << in + << ", got: " << out << ", expecting: " << ref << ", diff: " << diff << ", r-diff: " << diff / std::fabs(ref); + } + + } +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + void ReferenceSoftmax(const float* Input, float* Output, size_t N, size_t D, bool LogSoftmax, bool SmoothSoftmax) { for (size_t n = 0; n < N; n++) { float MaximumValue = std::numeric_limits::lowest(); @@ -99,11 +280,32 @@ class MlasSoftmaxTest : public MlasTestBase { void ExecuteShort(void) override { for (size_t d = 1; d 
< 128; d++) { Test(1, d, -10.f, 10.f); +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + TestReduceMaxFp16(d, -10.f, 10.f); + TestFp16(1, d, -10.f, 10.f, false, true); /* the four calls cover every LogSoftmax / SmoothSoftmax combination */ + TestFp16(1, d, -10.f, 10.f, true, true); + TestFp16(1, d, -10.f, 10.f, false, false); + TestFp16(1, d, -10.f, 10.f, true, false); +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) } Test(3, 128, 20.f, 30.f); Test(63, 95, -150.f, 190.f); Test(16, 211, 20.f, 30.f); +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + TestFp16(3, 128, 3.f, 7.f, false, true); + TestFp16(3, 128, 3.f, 7.f, true, true); + TestFp16(3, 128, 3.f, 7.f, false, false); + TestFp16(3, 128, 3.f, 7.f, true, false); + TestFp16(63, 95, -15.f, 19.f, false, true); + TestFp16(63, 95, -15.f, 19.f, true, true); + TestFp16(63, 95, -15.f, 19.f, false, false); + TestFp16(63, 95, -15.f, 19.f, true, false); + TestFp16(16, 211, -7.f, -3.f, false, true); + TestFp16(16, 211, -7.f, -3.f, true, true); + TestFp16(16, 211, -7.f, -3.f, false, false); + TestFp16(16, 211, -7.f, -3.f, true, false); +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) } }; @@ -111,6 +313,7 @@ static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_exe size_t count = 0; if (is_short_execute) { count += MlasDirectShortExecuteTests>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests::RegisterShortExecute(); if (GetMlasThreadPool() != nullptr) { count += MlasDirectShortExecuteTests>::RegisterShortExecute(); }