Finished unit tests

This commit is contained in:
Jing Fang 2025-02-07 00:06:02 +00:00
parent c93c17a6d9
commit d9919fb35d
7 changed files with 326 additions and 67 deletions

View file

@ -995,7 +995,7 @@ Return Value:
if (LogSoftmax) {
dispatch->LogSoftmax_Fp16(Input, Output, D, NegativeMaximum, MLAS_FP16(std::log(accumulation_fp32)));
} else {
dispatch->Softmax_Fp16(Output, Output, D, MLAS_FP16(1.0f / accumulation_fp32));
dispatch->Softmax_Fp16(Output, Output, D, MLAS_FP16(accumulation_fp32));
}
Input += D;

View file

@ -94,17 +94,17 @@ struct MLAS_SOFTMAX_DISPATCH {
SumExp_Fp16_Fn* SumExp_Fp16 = nullptr;
/**
* @brief Compute the softmax output for each element of the input array. input * scale.
* @brief Compute the softmax output for each element of the input array. input / sum.
* @param Input Address of the input array. Values of exp(x)
* @param Output Address of the output array. Could be the same as the input array.
* @param N Number of elements in the input array
* @param scale The scale factor to apply to the output
* @param Sum Sum of exp(input)
*/
typedef void(Softmax_Fp16_Fn)(
const MLAS_FP16* Input,
MLAS_FP16* Output,
size_t N,
const MLAS_FP16 scale
const MLAS_FP16 Sum
);
Softmax_Fp16_Fn* Softmax_Fp16 = nullptr;

View file

@ -33,7 +33,7 @@ MLAS_FP16 ReduceMax_Kernel_Fp16(const MLAS_FP16* Input, size_t N);
MLAS_FP16 SumExp_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 NegativeMaximum);
void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 scale);
void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 Sum);
void LogSoftmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 NegativeMaximum, const MLAS_FP16 LogSum);

View file

@ -747,11 +747,13 @@ MLAS_FP16 ReduceMax_Kernel_Fp16(const MLAS_FP16* Input, size_t N) {
return MLAS_FP16::FromBits(result);
}
void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 scale) {
void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 Sum) {
const auto* input = reinterpret_cast<const _mlas_fp16_*>(Input);
auto* output = reinterpret_cast<_mlas_fp16_*>(Output);
auto scale8 = MlasBroadcastFloat16x8(scale.val);
auto scale4 = MlasBroadcastFloat16x4(scale.val);
auto sum8 = MlasBroadcastFloat16x8(Sum.val);
auto sum4 = MlasBroadcastFloat16x4(Sum.val);
auto scale8 = MlasDivide(MlasBroadcastFloat16x8((_mlas_fp16_)0x3c00), sum8);
auto scale4 = MlasDivide(MlasBroadcastFloat16x4((_mlas_fp16_)0x3c00), sum4);
while (N >= 32) {
auto v0 = MlasLoadFloat16x8(input);

View file

@ -1,59 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "test_util.h"
// Checks the fp32 MlasComputeExp kernel against std::exp on reproducible random data.
class MlasComputeExpTest : public MlasTestBase {
 private:
  MatrixGuardBuffer<float> BufferInput;
  MatrixGuardBuffer<float> BufferOutput;
  MatrixGuardBuffer<float> BufferOutputReference;

  // Runs one comparison over N values drawn from the range MinimumValue..MaximumValue.
  void Test(size_t N, float MinimumValue, float MaximumValue) {
    float* const Input = BufferInput.GetBuffer(N);
    float* const Output = BufferOutput.GetBuffer(N);
    float* const OutputReference = BufferOutputReference.GetBuffer(N);

    // The element count doubles as the seed, so a given size sees the same data on every run.
    std::default_random_engine rng(static_cast<unsigned>(N));
    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);

    // Draw each input and record its reference exponential in a single pass.
    for (size_t i = 0; i < N; ++i) {
      const float sample = uniform(rng);
      Input[i] = sample;
      OutputReference[i] = std::exp(sample);
    }

    MlasComputeExp(Input, Output, N);

    constexpr float AbsoluteTolerance = 1e-6f;
    constexpr float RelativeTolerance = 1e-6f;

    // An element passes when it satisfies either the absolute or the relative bound.
    for (size_t n = 0; n < N; ++n) {
      const float diff = std::fabs(Output[n] - OutputReference[n]);
      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(OutputReference[n]) * RelativeTolerance)
          << " @" << n << " of " << N << ", got: " << Output[n] << ", expecting: " << OutputReference[n];
    }
  }

 public:
  // Name under which this suite is registered.
  static const char* GetTestSuiteName() {
    static const std::string suite_name("Exp");
    return suite_name.c_str();
  }

  // Sweeps every length from 1 to 127 with inputs between -10 and 10.
  void ExecuteShort(void) override {
    for (size_t length = 1; length < 128; ++length) {
      Test(length, -10.f, 10.f);
    }
  }
};
// Registers the Exp suite with the test runner. Only a short-execute variant exists, so a
// long-execute request registers nothing and reports a count of zero.
static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
  // Accumulate into a size_t so the lambda has a single return type. Returning a `0ul` literal on
  // one path and the registration count on the other only compiles where unsigned long and size_t
  // are the same type (they differ on LLP64 targets such as 64-bit Windows).
  // NOTE(review): assumes RegisterShortExecute() returns size_t, as the other registrations imply.
  size_t count = 0;
  if (is_short_execute) {
    count += MlasDirectShortExecuteTests<MlasComputeExpTest>::RegisterShortExecute();
  }
  return count;
});

View file

@ -0,0 +1,113 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "test_util.h"
#include "core/mlas/lib/mlasi.h"
#include "core/mlas/lib/softmax.h"
// Compares the fp16 MlasComputeTanh kernel with std::tanh. The comparison is compiled only for
// ARM64 builds that have fp16 vector intrinsics; elsewhere the suite runs no checks.
class MlasComputeTanhTest : public MlasTestBase {
 private:
  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

  // Runs the kernel on N seeded random fp16 values and checks each result against the fp32
  // reference computed from the rounded fp16 input.
  void TestFp16(size_t N, float MinimumValue, float MaximumValue) {
    MLAS_FP16* const Input = BufferInputFp16.GetBuffer(N);
    MLAS_FP16* const Output = BufferOutputFp16.GetBuffer(N);

    // Seeding with N keeps each length reproducible.
    std::default_random_engine rng(static_cast<unsigned>(N));
    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);
    for (size_t i = 0; i < N; ++i) {
      Input[i] = MLAS_FP16(uniform(rng));
    }

    MlasComputeTanh(Input, Output, N);

    constexpr float AbsoluteTolerance = 5e-3f;
    constexpr float RelativeTolerance = 5e-3f;

    for (size_t i = 0; i < N; ++i) {
      const float in = Input[i].ToFloat();
      const float ref = std::tanh(in);
      const float out = Output[i].ToFloat();
      const float diff = std::fabs(out - ref);
      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
          << " @ " << in << ", got: " << out << ", expecting: " << ref
          << ", diff: " << diff << ", r-diff: " << diff / std::fabs(ref);
    }
  }

#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

 public:
  // Name under which this suite is registered.
  static const char* GetTestSuiteName() {
    static const std::string suite_name("Tanh");
    return suite_name.c_str();
  }

  // Sweeps every length from 1 to 127 when the fp16 path is available.
  void ExecuteShort(void) override {
#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
    for (size_t length = 1; length < 128; ++length) {
      TestFp16(length, -3.51562f, 3.51562f);
    }
#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
  }
};
// Compares the fp16 MlasComputeSoftcap kernel with the reference tanh(x / cap) * cap. The
// comparison is compiled only for ARM64 builds that have fp16 vector intrinsics.
class MlasComputeSoftcapTest : public MlasTestBase {
 private:
  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

  // Runs the kernel on N seeded random fp16 values with the given cap and checks each result
  // against the fp32 reference computed from the rounded fp16 input.
  void TestFp16(size_t N, float MinimumValue, float MaximumValue, float cap) {
    MLAS_FP16* const Input = BufferInputFp16.GetBuffer(N);
    MLAS_FP16* const Output = BufferOutputFp16.GetBuffer(N);

    // Seeding with N keeps each length reproducible.
    std::default_random_engine rng(static_cast<unsigned>(N));
    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);
    for (size_t i = 0; i < N; ++i) {
      Input[i] = MLAS_FP16(uniform(rng));
    }

    MlasComputeSoftcap(Input, Output, N, MLAS_FP16(cap));

    constexpr float AbsoluteTolerance = 5e-3f;
    constexpr float RelativeTolerance = 5e-3f;

    for (size_t i = 0; i < N; ++i) {
      const float in = Input[i].ToFloat();
      const float ref = std::tanh(in/cap) * cap;
      const float out = Output[i].ToFloat();
      const float diff = std::fabs(out - ref);
      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
          << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff " << diff / std::fabs(ref);
    }
  }

#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

 public:
  // Name under which this suite is registered.
  static const char* GetTestSuiteName() {
    static const std::string suite_name("Softcap");
    return suite_name.c_str();
  }

  // Sweeps every length from 1 to 127 with a cap of 3.2 when the fp16 path is available.
  void ExecuteShort(void) override {
#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
    for (size_t length = 1; length < 128; ++length) {
      TestFp16(length, -10.f, 10.f, 3.2f);
    }
#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
  }
};
// Registers the Tanh and Softcap suites with the test runner and reports how many tests were
// added. Neither suite has a long-execute variant, so that mode registers nothing.
static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) -> size_t {
  if (!is_short_execute) {
    return 0;
  }
  // Separate statements keep the registration order fixed: Tanh first, then Softcap.
  const size_t tanh_tests = MlasDirectShortExecuteTests<MlasComputeTanhTest>::RegisterShortExecute();
  const size_t softcap_tests = MlasDirectShortExecuteTests<MlasComputeSoftcapTest>::RegisterShortExecute();
  return tanh_tests + softcap_tests;
});

View file

@ -2,6 +2,126 @@
// Licensed under the MIT License.
#include "test_util.h"
#include "core/mlas/lib/mlasi.h"
#include "core/mlas/lib/softmax.h"
// Validates the exponential kernels against std::exp: the fp32 MlasComputeExp path on every build,
// plus the fp16 MlasComputeExp and SumExp_Fp16 paths on ARM64 builds with fp16 vector intrinsics.
class MlasComputeExpTest : public MlasTestBase {
 private:
  MatrixGuardBuffer<float> BufferInput;
  MatrixGuardBuffer<float> BufferOutput;
  MatrixGuardBuffer<float> BufferOutputReference;
  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;

  // fp32 check: N seeded random values from the range MinimumValue..MaximumValue, compared
  // element by element with std::exp.
  void Test(size_t N, float MinimumValue, float MaximumValue) {
    float* Input = BufferInput.GetBuffer(N);
    float* Output = BufferOutput.GetBuffer(N);
    float* OutputReference = BufferOutputReference.GetBuffer(N);

    // The element count doubles as the seed, so a given size sees the same data on every run.
    std::default_random_engine generator(static_cast<unsigned>(N));
    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);

    for (size_t n = 0; n < N; n++) {
      Input[n] = distribution(generator);
    }

    for (size_t n = 0; n < N; n++) {
      OutputReference[n] = std::exp(Input[n]);
    }

    MlasComputeExp(Input, Output, N);

    constexpr float AbsoluteTolerance = 1e-6f;
    constexpr float RelativeTolerance = 1e-6f;

    // An element passes when it satisfies either the absolute or the relative bound.
    for (size_t n = 0; n < N; n++) {
      float diff = std::fabs(Output[n] - OutputReference[n]);
      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(OutputReference[n]) * RelativeTolerance)
          << " @" << n << " of " << N << ", got: " << Output[n] << ", expecting: " << OutputReference[n];
    }
  }

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

  // fp16 check of MlasComputeExp. The reference is std::exp of the rounded fp16 input, so only
  // the kernel's own error is measured.
  void TestFp16(size_t N, float MinimumValue, float MaximumValue) {
    MLAS_FP16* Input = BufferInputFp16.GetBuffer(N);
    MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N);

    // Explicit cast: the engine's seed type is narrower than size_t on 64-bit targets.
    std::default_random_engine generator(static_cast<unsigned>(N));
    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);

    for (size_t n = 0; n < N; n++) {
      Input[n] = MLAS_FP16(distribution(generator));
    }

    MlasComputeExp(Input, Output, N);

    constexpr float AbsoluteTolerance = 5e-4f;
    constexpr float RelativeTolerance = 1e-3f;

    for (size_t n = 0; n < N; n++) {
      float in = Input[n].ToFloat();
      float ref = std::exp(in);
      float out = Output[n].ToFloat();
      float diff = std::fabs(out - ref);
      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
          << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff: " << diff / std::fabs(ref);
    }
  }

  // fp16 check of the dispatch table's SumExp_Fp16 entry. The kernel is called with the negated
  // maximum of the inputs; the test expects Output[n] == exp(Input[n] - max) and the returned
  // value to equal the sum of those terms.
  void TestSumFp16(size_t N, float MinimumValue, float MaximumValue) {
    MLAS_FP16* Input = BufferInputFp16.GetBuffer(N);
    MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N);

    // Explicit cast: the engine's seed type is narrower than size_t on 64-bit targets.
    std::default_random_engine generator(static_cast<unsigned>(N));
    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);

    // Start from lowest(), not min(): min() is the smallest positive float, so it would be kept as
    // the "maximum" whenever every sample is negative and the true maximum would never be used.
    float max_val = std::numeric_limits<float>::lowest();
    for (size_t n = 0; n < N; n++) {
      Input[n] = MLAS_FP16(distribution(generator));
      max_val = std::max(max_val, Input[n].ToFloat());
    }

    const auto* dispatch = GetMlasPlatform().SoftmaxDispatch;
    auto sum = dispatch->SumExp_Fp16(Input, Output, N, MLAS_FP16(-max_val));

    constexpr float AbsoluteTolerance = 5e-4f;
    constexpr float RelativeTolerance = 1e-3f;

    float sum_ref = 0.0f;
    for (size_t n = 0; n < N; n++) {
      float in = Input[n].ToFloat();
      float ref = std::exp(in - max_val);
      sum_ref += ref;
      float out = Output[n].ToFloat();
      float diff = std::fabs(out - ref);
      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
          << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff: " << diff / std::fabs(ref);
    }

    // The accumulated sum gets looser bounds than the individual elements.
    float diff = std::fabs(sum.ToFloat() - sum_ref);
    ASSERT_TRUE(diff <= 1e-3f || diff <= std::fabs(sum_ref) * 5e-3f)
        << " sum: " << sum.ToFloat() << ", expecting: " << sum_ref << ", r-diff: " << diff / std::fabs(sum_ref);
  }

#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

 public:
  // Name under which this suite is registered.
  static const char* GetTestSuiteName() {
    static const std::string suite_name("Exp");
    return suite_name.c_str();
  }

  // Sweeps every length from 1 to 127; the fp16 checks run only where the fp16 path is compiled.
  void ExecuteShort(void) override {
    for (size_t n = 1; n < 128; n++) {
      Test(n, -10.f, 10.f);

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
      TestFp16(n, -17.f, 11.f);
      TestSumFp16(n, -10.f, 10.f);
#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
    }
  }
};
template <bool Threaded>
class MlasSoftmaxTest : public MlasTestBase {
@ -9,6 +129,8 @@ class MlasSoftmaxTest : public MlasTestBase {
MatrixGuardBuffer<float> BufferInput;
MatrixGuardBuffer<float> BufferOutput;
MatrixGuardBuffer<float> BufferOutputReference;
MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;
MLAS_THREADPOOL* threadpool_;
void Test(size_t N, size_t D, float MinimumValue, float MaximumValue) {
@ -44,6 +166,65 @@ class MlasSoftmaxTest : public MlasTestBase {
}
}
#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
void TestReduceMaxFp16(size_t N, float MinimumValue, float MaximumValue) {
MLAS_FP16* Input = BufferInputFp16.GetBuffer(N);
std::default_random_engine generator(static_cast<unsigned>(N));
std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);
float ref = std::numeric_limits<float>::lowest();
for (size_t nd = 0; nd < N; nd++) {
Input[nd] = MLAS_FP16(distribution(generator));
ref = std::max(ref, Input[nd].ToFloat());
}
const auto* dispatch = GetMlasPlatform().SoftmaxDispatch;
auto out = dispatch->ReduceMax_Fp16(Input, N).ToFloat();
constexpr float AbsoluteTolerance = 1e-3f;
constexpr float RelativeTolerance = 1e-3f;
float diff = std::fabs(out - ref);
ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
<< "ReduceMaxFp16: " << N << ", got: " << out << ", expecting: " << ref
<< ", diff: " << diff << ", r-diff: " << diff / std::fabs(ref);
}
void TestFp16(size_t N, size_t D, float MinimumValue, float MaximumValue, bool LogSoftmax, bool SmoothSoftmax) {
MLAS_FP16* Input = BufferInputFp16.GetBuffer(N * D);
MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N * D);
float* InputReference = BufferInput.GetBuffer(N * D);
float* OutputReference = BufferOutputReference.GetBuffer(N * D);
std::default_random_engine generator(static_cast<unsigned>(N * D));
std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);
for (size_t nd = 0; nd < N * D; nd++) {
Input[nd] = MLAS_FP16(distribution(generator));
InputReference[nd] = Input[nd].ToFloat();
}
MlasComputeSoftmax(Input, Output, N, D, LogSoftmax, SmoothSoftmax, threadpool_);
ReferenceSoftmax(InputReference, OutputReference, N, D, LogSoftmax, SmoothSoftmax);
constexpr float AbsoluteTolerance = 5e-3f;
constexpr float RelativeTolerance = 5e-3f;
for (size_t nd = 0; nd < N * D; nd++) {
float in = Input[nd].ToFloat();
float ref = OutputReference[nd];
float out = Output[nd].ToFloat();
float diff = std::fabs(out - ref);
ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
<< "LogSoftmax:" << LogSoftmax << ", SmoothSoftmax: " << SmoothSoftmax << ", input " << in
<< ", got: " << out << ", expecting: " << ref << ", diff: " << diff << ", r-diff: " << diff / std::fabs(ref);
}
}
#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
void ReferenceSoftmax(const float* Input, float* Output, size_t N, size_t D, bool LogSoftmax, bool SmoothSoftmax) {
for (size_t n = 0; n < N; n++) {
float MaximumValue = std::numeric_limits<float>::lowest();
@ -99,11 +280,32 @@ class MlasSoftmaxTest : public MlasTestBase {
// Short test run: a sweep over row widths 1..127 followed by three larger shapes. The fp16
// variants run only on ARM64 builds with fp16 vector intrinsics.
void ExecuteShort(void) override {
#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
  // Runs TestFp16 for all four flag combinations, in the order
  // (LogSoftmax, SmoothSoftmax) = (false, true), (true, true), (false, false), (true, false).
  const auto run_fp16_modes = [this](size_t N, size_t D, float MinimumValue, float MaximumValue) {
    const bool smooth_modes[] = {true, false};
    const bool log_modes[] = {false, true};
    for (const bool smooth_softmax : smooth_modes) {
      for (const bool log_softmax : log_modes) {
        this->TestFp16(N, D, MinimumValue, MaximumValue, log_softmax, smooth_softmax);
      }
    }
  };
#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)

  for (size_t d = 1; d < 128; ++d) {
    Test(1, d, -10.f, 10.f);

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
    TestReduceMaxFp16(d, -10.f, 10.f);
    run_fp16_modes(1, d, -10.f, 10.f);
#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
  }

  Test(3, 128, 20.f, 30.f);
  Test(63, 95, -150.f, 190.f);
  Test(16, 211, 20.f, 30.f);

#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
  run_fp16_modes(3, 128, 3.f, 7.f);
  run_fp16_modes(63, 95, -15.f, 19.f);
  run_fp16_modes(16, 211, -7.f, -3.f);
#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
}
};
@ -111,6 +313,7 @@ static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_exe
size_t count = 0;
if (is_short_execute) {
count += MlasDirectShortExecuteTests<MlasSoftmaxTest<false>>::RegisterShortExecute();
count += MlasDirectShortExecuteTests<MlasComputeExpTest>::RegisterShortExecute();
if (GetMlasThreadPool() != nullptr) {
count += MlasDirectShortExecuteTests<MlasSoftmaxTest<true>>::RegisterShortExecute();
}