diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp
index 97ad1ca3a5..2ae593ec96 100644
--- a/onnxruntime/core/mlas/lib/compute.cpp
+++ b/onnxruntime/core/mlas/lib/compute.cpp
@@ -995,7 +995,7 @@ Return Value:
         if (LogSoftmax) {
             dispatch->LogSoftmax_Fp16(Input, Output, D, NegativeMaximum, MLAS_FP16(std::log(accumulation_fp32)));
         } else {
-            dispatch->Softmax_Fp16(Output, Output, D, MLAS_FP16(1.0f / accumulation_fp32));
+            dispatch->Softmax_Fp16(Output, Output, D, MLAS_FP16(accumulation_fp32));
         }
 
         Input += D;
diff --git a/onnxruntime/core/mlas/lib/softmax.h b/onnxruntime/core/mlas/lib/softmax.h
index 04bd1f990b..69fe1ae86d 100644
--- a/onnxruntime/core/mlas/lib/softmax.h
+++ b/onnxruntime/core/mlas/lib/softmax.h
@@ -94,17 +94,17 @@ struct MLAS_SOFTMAX_DISPATCH {
     SumExp_Fp16_Fn* SumExp_Fp16 = nullptr;
 
     /**
-     * @brief Compute the softmax output for each element of the input array. input * scale.
+     * @brief Compute the softmax output for each element of the input array: Output[i] = Input[i] / Sum.
      * @param Input         Address of the input array. Values of exp(x)
      * @param Output        Address of the output array. Could be the same as the input array.
      * @param N             Number of elements in the input array
-     * @param scale         The scale factor to apply to the output
+     * @param Sum           Divisor applied to every element: the accumulated sum of the exp(x) values
      */
     typedef void(Softmax_Fp16_Fn)(
         const MLAS_FP16* Input,
         MLAS_FP16* Output,
         size_t N,
-        const MLAS_FP16 scale
+        const MLAS_FP16 Sum
     );
 
     Softmax_Fp16_Fn* Softmax_Fp16 = nullptr;
diff --git a/onnxruntime/core/mlas/lib/softmax_kernel_neon.h b/onnxruntime/core/mlas/lib/softmax_kernel_neon.h
index 53207a0448..e362e5d4cc 100644
--- a/onnxruntime/core/mlas/lib/softmax_kernel_neon.h
+++ b/onnxruntime/core/mlas/lib/softmax_kernel_neon.h
@@ -33,7 +33,7 @@ MLAS_FP16 ReduceMax_Kernel_Fp16(const MLAS_FP16* Input, size_t N);
 
 MLAS_FP16 SumExp_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 NegativeMaximum);
 
-void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 scale);
+void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 Sum);  // takes the sum itself; the kernel forms 1/Sum internally
 
 void LogSoftmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 NegativeMaximum, const MLAS_FP16 LogSum);
 
diff --git a/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp
index 876128b48c..819b7fe941 100644
--- a/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp
+++ b/onnxruntime/core/mlas/lib/softmax_kernel_neon_fp16.cpp
@@ -747,11 +747,13 @@ MLAS_FP16 ReduceMax_Kernel_Fp16(const MLAS_FP16* Input, size_t N) {
     return MLAS_FP16::FromBits(result);
 }
 
-void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 scale) {
+void Softmax_Kernel_Fp16(const MLAS_FP16* Input, MLAS_FP16* Output, size_t N, const MLAS_FP16 Sum) {
     const auto* input = reinterpret_cast<const _mlas_fp16_*>(Input);
     auto* output = reinterpret_cast<_mlas_fp16_*>(Output);
-    auto scale8 = MlasBroadcastFloat16x8(scale.val);
-    auto scale4 = MlasBroadcastFloat16x4(scale.val);
+    auto sum8 = MlasBroadcastFloat16x8(Sum.val);
+    auto sum4 = MlasBroadcastFloat16x4(Sum.val);
+    auto scale8 = MlasDivide(MlasBroadcastFloat16x8((_mlas_fp16_)0x3c00), sum8);
+    auto scale4 = MlasDivide(MlasBroadcastFloat16x4((_mlas_fp16_)0x3c00), sum4);
 
     while (N >= 32) {
         auto v0 = MlasLoadFloat16x8(input);
diff --git a/onnxruntime/test/mlas/unittest/test_exp.cpp b/onnxruntime/test/mlas/unittest/test_exp.cpp
deleted file mode 100644
index e69dd4376c..0000000000
--- a/onnxruntime/test/mlas/unittest/test_exp.cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "test_util.h"
-
-class MlasComputeExpTest : public MlasTestBase {
- private:
-  MatrixGuardBuffer<float> BufferInput;
-  MatrixGuardBuffer<float> BufferOutput;
-  MatrixGuardBuffer<float> BufferOutputReference;
-
-  void Test(size_t N, float MinimumValue, float MaximumValue) {
-    float* Input = BufferInput.GetBuffer(N);
-    float* Output = BufferOutput.GetBuffer(N);
-    float* OutputReference = BufferOutputReference.GetBuffer(N);
-
-    std::default_random_engine generator(static_cast<unsigned>(N));
-    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);
-
-    for (size_t n = 0; n < N; n++) {
-      Input[n] = distribution(generator);
-    }
-
-    for (size_t n = 0; n < N; n++) {
-      OutputReference[n] = std::exp(Input[n]);
-    }
-
-    MlasComputeExp(Input, Output, N);
-
-    constexpr float AbsoluteTolerance = 1e-6f;
-    constexpr float RelativeTolerance = 1e-6f;
-
-    for (size_t n = 0; n < N; n++) {
-      float diff = std::fabs(Output[n] - OutputReference[n]);
-      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(OutputReference[n]) * RelativeTolerance)
-          << " @" << n << " of " << N << ", got: " << Output[n] << ", expecting: " << OutputReference[n];
-    }
-  }
-
- public:
-  static const char* GetTestSuiteName() {
-    static const std::string suite_name("Exp");
-    return suite_name.c_str();
-  }
-
-  void ExecuteShort(void) override {
-    for (size_t n = 1; n < 128; n++) {
-      Test(n, -10.f, 10.f);
-    }
-  }
-};
-
-static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
-  // no long execute needed
-  if (is_short_execute) {
-    return MlasDirectShortExecuteTests<MlasComputeExpTest>::RegisterShortExecute();
-  }
-  return 0ul;
-});
diff --git a/onnxruntime/test/mlas/unittest/test_softcap.cpp b/onnxruntime/test/mlas/unittest/test_softcap.cpp
new file mode 100644
index 0000000000..9c40e3c381
--- /dev/null
+++ b/onnxruntime/test/mlas/unittest/test_softcap.cpp
@@ -0,0 +1,113 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "test_util.h"
+#include "core/mlas/lib/mlasi.h"
+#include "core/mlas/lib/softmax.h"
+
+class MlasComputeTanhTest : public MlasTestBase {
+ private:
+  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
+  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;
+
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+  // Runs the fp16 MlasComputeTanh on N random inputs and checks every result against fp32 std::tanh.
+  void TestFp16(size_t N, float MinimumValue, float MaximumValue) {
+    MLAS_FP16* src = BufferInputFp16.GetBuffer(N);
+    MLAS_FP16* dst = BufferOutputFp16.GetBuffer(N);
+    std::default_random_engine rng(static_cast<unsigned>(N));
+    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);
+
+    for (size_t i = 0; i != N; ++i) {
+      src[i] = MLAS_FP16(uniform(rng));
+    }
+
+    MlasComputeTanh(src, dst, N);
+
+    constexpr float kAbsTol = 5e-3f;
+    constexpr float kRelTol = 5e-3f;
+
+    for (size_t i = 0; i != N; ++i) {
+      const float x = src[i].ToFloat();
+      const float expected = std::tanh(x);
+      const float actual = dst[i].ToFloat();
+      const float error = std::fabs(actual - expected);
+      ASSERT_TRUE(error <= kAbsTol || error <= std::fabs(expected) * kRelTol)
+          << " @ " << x << ", got: " << actual << ", expecting: " << expected
+          << ", diff: " << error << ", r-diff: " << error / std::fabs(expected);
+    }
+  }
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+
+ public:
+  static const char* GetTestSuiteName() {
+    static const std::string kSuiteName = "Tanh";
+    return kSuiteName.c_str();
+  }
+
+  void ExecuteShort(void) override {
+    for (size_t count = 1; count <= 127; ++count) {
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+      TestFp16(count, -3.51562f, 3.51562f);
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+    }
+  }
+};
+
+class MlasComputeSoftcapTest : public MlasTestBase {
+ private:
+  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
+  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;
+
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+  // Runs the fp16 MlasComputeSoftcap on N random inputs and checks every result against tanh(x / cap) * cap in fp32.
+  void TestFp16(size_t N, float MinimumValue, float MaximumValue, float cap) {
+    MLAS_FP16* src = BufferInputFp16.GetBuffer(N);
+    MLAS_FP16* dst = BufferOutputFp16.GetBuffer(N);
+    std::default_random_engine rng(static_cast<unsigned>(N));
+    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);
+
+    for (size_t i = 0; i != N; ++i) {
+      src[i] = MLAS_FP16(uniform(rng));
+    }
+
+    MlasComputeSoftcap(src, dst, N, MLAS_FP16(cap));
+
+    constexpr float kAbsTol = 5e-3f;
+    constexpr float kRelTol = 5e-3f;
+
+    for (size_t i = 0; i != N; ++i) {
+      const float x = src[i].ToFloat();
+      const float expected = std::tanh(x / cap) * cap;
+      const float actual = dst[i].ToFloat();
+      const float error = std::fabs(actual - expected);
+      ASSERT_TRUE(error <= kAbsTol || error <= std::fabs(expected) * kRelTol)
+          << " @ " << x << ", got: " << actual << ", expecting: " << expected << ", r-diff " << error / std::fabs(expected);
+    }
+  }
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+
+ public:
+  static const char* GetTestSuiteName() {
+    static const std::string kSuiteName = "Softcap";
+    return kSuiteName.c_str();
+  }
+
+  void ExecuteShort(void) override {
+    for (size_t count = 1; count <= 127; ++count) {
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+      TestFp16(count, -10.f, 10.f, 3.2f);
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+    }
+  }
+};
+
+
+static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) {
+  size_t count = 0;  // running total of the values returned by RegisterShortExecute()
+  if (is_short_execute) {  // nothing is registered when is_short_execute is false
+    count += MlasDirectShortExecuteTests<MlasComputeTanhTest>::RegisterShortExecute();
+    count += MlasDirectShortExecuteTests<MlasComputeSoftcapTest>::RegisterShortExecute();
+  }
+  return count;
+});
diff --git a/onnxruntime/test/mlas/unittest/test_softmax.cpp b/onnxruntime/test/mlas/unittest/test_softmax.cpp
index fb4ebbee77..e0820517d4 100644
--- a/onnxruntime/test/mlas/unittest/test_softmax.cpp
+++ b/onnxruntime/test/mlas/unittest/test_softmax.cpp
@@ -2,6 +2,126 @@
 // Licensed under the MIT License.
 
 #include "test_util.h"
+#include "core/mlas/lib/mlasi.h"
+#include "core/mlas/lib/softmax.h"
+
+class MlasComputeExpTest : public MlasTestBase {  // "Exp" suite: MlasComputeExp (fp32/fp16) and SumExp_Fp16
+ private:
+  MatrixGuardBuffer<float> BufferInput;
+  MatrixGuardBuffer<float> BufferOutput;
+  MatrixGuardBuffer<float> BufferOutputReference;
+  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
+  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;
+  // Compares the fp32 MlasComputeExp with std::exp on N uniform random values from [MinimumValue, MaximumValue).
+  void Test(size_t N, float MinimumValue, float MaximumValue) {
+    float* Input = BufferInput.GetBuffer(N);
+    float* Output = BufferOutput.GetBuffer(N);
+    float* OutputReference = BufferOutputReference.GetBuffer(N);
+
+    std::default_random_engine generator(static_cast<unsigned>(N));
+    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);
+
+    for (size_t n = 0; n < N; n++) {
+      Input[n] = distribution(generator);
+    }
+
+    for (size_t n = 0; n < N; n++) {
+      OutputReference[n] = std::exp(Input[n]);
+    }
+
+    MlasComputeExp(Input, Output, N);
+
+    constexpr float AbsoluteTolerance = 1e-6f;
+    constexpr float RelativeTolerance = 1e-6f;
+
+    for (size_t n = 0; n < N; n++) {
+      float diff = std::fabs(Output[n] - OutputReference[n]);
+      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(OutputReference[n]) * RelativeTolerance)
+          << " @" << n << " of " << N << ", got: " << Output[n] << ", expecting: " << OutputReference[n];
+    }
+  }
+
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+  // Compares the fp16 MlasComputeExp with std::exp evaluated in fp32 on the already-rounded fp16 inputs.
+  void TestFp16(size_t N, float MinimumValue, float MaximumValue) {
+    MLAS_FP16* Input = BufferInputFp16.GetBuffer(N);
+    MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N);
+
+    std::default_random_engine generator(static_cast<unsigned>(N));  // explicit narrowing, as in Test()
+    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);
+
+    for (size_t n = 0; n < N; n++) {
+      Input[n] = MLAS_FP16(distribution(generator));
+    }
+
+    MlasComputeExp(Input, Output, N);
+
+    constexpr float AbsoluteTolerance = 5e-4f;
+    constexpr float RelativeTolerance = 1e-3f;
+
+    for (size_t n = 0; n < N; n++) {
+      float in = Input[n].ToFloat();
+      float ref = std::exp(in);
+      float out = Output[n].ToFloat();
+      float diff = std::fabs(out - ref);
+      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
+          << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff: " << diff / std::fabs(ref);
+    }
+  }
+  // Checks that SumExp_Fp16 writes exp(x - max) to Output for every element and returns the sum of those values.
+  void TestSumFp16(size_t N, float MinimumValue, float MaximumValue) {
+    MLAS_FP16* Input = BufferInputFp16.GetBuffer(N);
+    MLAS_FP16* Output = BufferOutputFp16.GetBuffer(N);
+
+    std::default_random_engine generator(static_cast<unsigned>(N));  // explicit narrowing, as in Test()
+    std::uniform_real_distribution<float> distribution(MinimumValue, MaximumValue);
+    // Seed with lowest(): min() is the smallest positive float and would beat an all-negative input row.
+    float max_val = std::numeric_limits<float>::lowest();
+    for (size_t n = 0; n < N; n++) {
+      Input[n] = MLAS_FP16(distribution(generator));
+      max_val = std::max(max_val, Input[n].ToFloat());
+    }
+
+    const auto* dispatch = GetMlasPlatform().SoftmaxDispatch;
+    auto sum = dispatch->SumExp_Fp16(Input, Output, N, MLAS_FP16(-max_val));
+
+    constexpr float AbsoluteTolerance = 5e-4f;
+    constexpr float RelativeTolerance = 1e-3f;
+
+    float sum_ref = 0.0f;
+    for (size_t n = 0; n < N; n++) {
+      float in = Input[n].ToFloat();
+      float ref = std::exp(in - max_val);
+      sum_ref += ref;
+      float out = Output[n].ToFloat();
+      float diff = std::fabs(out - ref);
+      ASSERT_TRUE(diff <= AbsoluteTolerance || diff <= std::fabs(ref) * RelativeTolerance)
+          << " @ " << in << ", got: " << out << ", expecting: " << ref << ", r-diff: " << diff / std::fabs(ref);
+    }
+
+    float diff = std::fabs(sum.ToFloat() - sum_ref);
+    ASSERT_TRUE(diff <= 1e-3f || diff <= std::fabs(sum_ref) * 5e-3f)
+        << " sum: " << sum.ToFloat() << ", expecting: " << sum_ref << ", r-diff: " << diff / std::fabs(sum_ref);
+  }
+
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+
+ public:
+  static const char* GetTestSuiteName() {
+    static const std::string suite_name("Exp");
+    return suite_name.c_str();
+  }
+
+  void ExecuteShort(void) override {
+    for (size_t n = 1; n < 128; n++) {
+      Test(n, -10.f, 10.f);
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+      TestFp16(n, -17.f, 11.f);
+      TestSumFp16(n, -10.f, 10.f);
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+    }
+  }
+};
 
 template <bool Threaded>
 class MlasSoftmaxTest : public MlasTestBase {
@@ -9,6 +129,8 @@ class MlasSoftmaxTest : public MlasTestBase {
   MatrixGuardBuffer<float> BufferInput;
   MatrixGuardBuffer<float> BufferOutput;
   MatrixGuardBuffer<float> BufferOutputReference;
+  MatrixGuardBuffer<MLAS_FP16> BufferInputFp16;
+  MatrixGuardBuffer<MLAS_FP16> BufferOutputFp16;
   MLAS_THREADPOOL* threadpool_;
 
   void Test(size_t N, size_t D, float MinimumValue, float MaximumValue) {
@@ -44,6 +166,65 @@ class MlasSoftmaxTest : public MlasTestBase {
     }
   }
 
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+  // Checks the ReduceMax_Fp16 dispatch kernel against a scalar running maximum over N random fp16 values.
+  void TestReduceMaxFp16(size_t N, float MinimumValue, float MaximumValue) {
+    MLAS_FP16* src = BufferInputFp16.GetBuffer(N);
+    std::default_random_engine rng(static_cast<unsigned>(N));
+    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);
+
+    float expected = std::numeric_limits<float>::lowest();
+    for (size_t i = 0; i != N; ++i) {
+      src[i] = MLAS_FP16(uniform(rng));
+      const float value = src[i].ToFloat();
+      expected = expected < value ? value : expected;
+    }
+
+    const auto* softmax_dispatch = GetMlasPlatform().SoftmaxDispatch;
+    const float actual = softmax_dispatch->ReduceMax_Fp16(src, N).ToFloat();
+
+    constexpr float kAbsTol = 1e-3f;
+    constexpr float kRelTol = 1e-3f;
+
+    const float error = std::fabs(actual - expected);
+    ASSERT_TRUE(error <= kAbsTol || error <= std::fabs(expected) * kRelTol)
+        << "ReduceMaxFp16: " << N << ", got: " << actual << ", expecting: " << expected
+        << ", diff: " << error << ", r-diff: " << error / std::fabs(expected);
+  }
+
+  // Runs the fp16 MlasComputeSoftmax on N * D random values and compares it with ReferenceSoftmax computed in fp32.
+  void TestFp16(size_t N, size_t D, float MinimumValue, float MaximumValue, bool LogSoftmax, bool SmoothSoftmax) {
+    const size_t total = N * D;
+    MLAS_FP16* src = BufferInputFp16.GetBuffer(total);
+    MLAS_FP16* dst = BufferOutputFp16.GetBuffer(total);
+    float* ref_in = BufferInput.GetBuffer(total);
+    float* ref_out = BufferOutputReference.GetBuffer(total);
+    std::default_random_engine rng(static_cast<unsigned>(total));
+    std::uniform_real_distribution<float> uniform(MinimumValue, MaximumValue);
+
+    for (size_t i = 0; i != total; ++i) {
+      src[i] = MLAS_FP16(uniform(rng));
+      ref_in[i] = src[i].ToFloat();
+    }
+
+    MlasComputeSoftmax(src, dst, N, D, LogSoftmax, SmoothSoftmax, threadpool_);
+    ReferenceSoftmax(ref_in, ref_out, N, D, LogSoftmax, SmoothSoftmax);
+
+    constexpr float kAbsTol = 5e-3f;
+    constexpr float kRelTol = 5e-3f;
+
+    for (size_t i = 0; i != total; ++i) {
+      const float x = src[i].ToFloat();
+      const float expected = ref_out[i];
+      const float actual = dst[i].ToFloat();
+      const float error = std::fabs(actual - expected);
+      ASSERT_TRUE(error <= kAbsTol || error <= std::fabs(expected) * kRelTol)
+          << "LogSoftmax:" << LogSoftmax << ", SmoothSoftmax: " << SmoothSoftmax << ", input " << x
+          << ", got: " << actual << ", expecting: " << expected << ", diff: " << error << ", r-diff: " << error / std::fabs(expected);
+    }
+  }
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+
   void ReferenceSoftmax(const float* Input, float* Output, size_t N, size_t D, bool LogSoftmax, bool SmoothSoftmax) {
     for (size_t n = 0; n < N; n++) {
       float MaximumValue = std::numeric_limits<float>::lowest();
@@ -99,11 +280,32 @@ class MlasSoftmaxTest : public MlasTestBase {
   void ExecuteShort(void) override {
     for (size_t d = 1; d < 128; d++) {
       Test(1, d, -10.f, 10.f);
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+      TestReduceMaxFp16(d, -10.f, 10.f);
+      TestFp16(1, d, -10.f, 10.f, false, true);   // LogSoftmax=false, SmoothSoftmax=true
+      TestFp16(1, d, -10.f, 10.f, true, true);    // LogSoftmax=true,  SmoothSoftmax=true
+      TestFp16(1, d, -10.f, 10.f, false, false);  // LogSoftmax=false, SmoothSoftmax=false
+      TestFp16(1, d, -10.f, 10.f, true, false);   // LogSoftmax=true,  SmoothSoftmax=false
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
     }
 
     Test(3, 128, 20.f, 30.f);
     Test(63, 95, -150.f, 190.f);
     Test(16, 211, 20.f, 30.f);
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
+    TestFp16(3, 128, 3.f, 7.f, false, true);
+    TestFp16(3, 128, 3.f, 7.f, true, true);
+    TestFp16(3, 128, 3.f, 7.f, false, false);
+    TestFp16(3, 128, 3.f, 7.f, true, false);
+    TestFp16(63, 95, -15.f, 19.f, false, true);
+    TestFp16(63, 95, -15.f, 19.f, true, true);
+    TestFp16(63, 95, -15.f, 19.f, false, false);
+    TestFp16(63, 95, -15.f, 19.f, true, false);
+    TestFp16(16, 211, -7.f, -3.f, false, true);
+    TestFp16(16, 211, -7.f, -3.f, true, true);
+    TestFp16(16, 211, -7.f, -3.f, false, false);
+    TestFp16(16, 211, -7.f, -3.f, true, false);
+#endif  // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)
   }
 };
 
@@ -111,6 +313,7 @@ static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_exe
   size_t count = 0;
   if (is_short_execute) {
     count += MlasDirectShortExecuteTests<MlasSoftmaxTest<false>>::RegisterShortExecute();
+    count += MlasDirectShortExecuteTests<MlasComputeExpTest>::RegisterShortExecute();
     if (GetMlasThreadPool() != nullptr) {
       count += MlasDirectShortExecuteTests<MlasSoftmaxTest<true>>::RegisterShortExecute();
     }