MLAS: more code cleanup (#4101)

Cleanup vector intrinsics, optimized SSE quantized GEMM.
2026-05-16 21:00:14 +00:00 · 2020-06-01 21:19:42 -07:00 · 2020-06-01 21:19:42 -07:00 · 3f7b97a63d
commit 3f7b97a63d
parent 08e5f89b37
11 changed files with 449 additions and 313 deletions
--- a/onnxruntime/core/mlas/lib/activate.cpp
+++ b/onnxruntime/core/mlas/lib/activate.cpp
@ -109,7 +109,7 @@ struct MLAS_ACTIVATION_FUNCTION<MlasReluActivation>
 #if defined(MLAS_SSE2_INTRINSICS)
        return _mm_cvtss_f32(Activate(_mm_set_ss(Value)));
 #else
-        return (std::max)(Value, 0.0f);
+        return std::max(Value, 0.0f);
 #endif
    }
 };
@ -140,12 +140,11 @@ struct MLAS_ACTIVATION_FUNCTION<MlasLeakyReluActivation>
 #elif defined(MLAS_AVX_INTRINSICS)
        return _mm_blendv_ps(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
 #elif defined(MLAS_SSE2_INTRINSICS)
-        __m128 Selection = _mm_cmple_ps(ZeroFloat32x4, Value);
-        return _mm_or_ps(_mm_and_ps(Value, Selection), _mm_andnot_ps(Selection, ValueTimesAlpha));
+        return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
 #elif defined(MLAS_VSX_INTRINSICS)
        return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value));
 #else
-#error Unsupported architecture.
+        return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value);
 #endif
    }

@ -186,8 +185,8 @@ struct MLAS_ACTIVATION_FUNCTION<MlasClipActivation>
 #if defined(MLAS_SSE2_INTRINSICS)
        return _mm_cvtss_f32(Activate(_mm_set_ss(Value)));
 #else
-        Value = (std::max)(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast));
-        Value = (std::min)(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast));
+        Value = std::max(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast));
+        Value = std::min(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast));

        return Value;
 #endif
--- a/onnxruntime/core/mlas/lib/compute.cpp
+++ b/onnxruntime/core/mlas/lib/compute.cpp
@ -552,7 +552,7 @@ Return Value:

    while (N > 0) {

-        Maximum = (std::max)(Maximum, *Input);
+        Maximum = std::max(Maximum, *Input);

        Input += 1;
        N -= 1;
--- a/onnxruntime/core/mlas/lib/erf.cpp
+++ b/onnxruntime/core/mlas/lib/erf.cpp
@ -191,7 +191,7 @@ Return Value:

        float r;
        if (AbsValue > MlasErfConstants.ErfSplitBoundary) {
-            AbsValue = (std::min)(MlasErfConstants.ErfUpperAbsRange, AbsValue);
+            AbsValue = std::min(MlasErfConstants.ErfUpperAbsRange, AbsValue);
            float r_big = MlasErfConstants.ErfBIG_P0;
            r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P1;
            r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P2;
@ -201,7 +201,7 @@ Return Value:
            r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P6_Minus_One;
            r_big = r_big * AbsValue + AbsValue;

-            r_big = (std::max)(-r_big, MlasErfConstants.Exp_LowerRange);
+            r_big = std::max(-r_big, MlasErfConstants.Exp_LowerRange);
            r = MlasErfConstants.Exp_Log2Reciprocal * r_big + MlasErfConstants.Exp_C;
            r -= MlasErfConstants.Exp_C;
            float fx = r * MlasErfConstants.Exp_log2_hi + r_big;
--- a/onnxruntime/core/mlas/lib/logistic.cpp
+++ b/onnxruntime/core/mlas/lib/logistic.cpp
@ -121,7 +121,7 @@ Return Value:

        float Value = *Input++;

-        Value = (std::min)(MlasLogisticConstants.UpperRange, (std::max)(MlasLogisticConstants.LowerRange, Value));
+        Value = std::min(MlasLogisticConstants.UpperRange, std::max(MlasLogisticConstants.LowerRange, Value));

        float ValueSquared = Value * Value;

--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@ -25,6 +25,12 @@ Abstract:
 #include <type_traits>

 #if defined(_WIN32)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
 #include <windows.h>
 #include <intrin.h>
 #else
@ -753,8 +759,6 @@ MlasPartitionWork(
 #if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
 #define MLAS_FMA3_INTRINSICS
 #endif
-#else
-#error Unsupported architecture.
 #endif

 #if defined(MLAS_NEON_INTRINSICS)
@ -785,6 +789,21 @@ MlasReinterpretAsInt32x4(MLAS_FLOAT32X4 Vector)
 #endif
 }

+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasCastToInt32x4(MLAS_FLOAT32X4 Vector)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vcvtq_s32_f32(Vector);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_cvttps_epi32(Vector);
+#elif defined(MLAS_VSX_INTRINSICS)
+    return vec_cts(Vector, 0);
+#else
+    return MLAS_INT32X4{int32_t(Vector[0]), int32_t(Vector[1]), int32_t(Vector[2]), int32_t(Vector[3])};
+#endif
+}
+
 MLAS_FORCEINLINE
 MLAS_INT32X4
 MlasBroadcastInt32x4(int32_t Value)
@ -798,6 +817,182 @@ MlasBroadcastInt32x4(int32_t Value)
 #endif
 }

+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasLoadInt32x4(const int32_t* Buffer)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vld1q_s32(Buffer);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_loadu_si128((const __m128i*)Buffer);
+#elif defined(MLAS_VSX_INTRINSICS)
+    return vec_vsx_ld(0, Buffer);
+#else
+    return *((MLAS_INT32X4*)Buffer);
+#endif
+}
+
+MLAS_FORCEINLINE
+void
+MlasStoreInt32x4(int32_t* Buffer, MLAS_INT32X4 Vector)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    vst1q_s32(Buffer, Vector);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    _mm_storeu_si128((__m128i*)Buffer, Vector);
+#elif defined(MLAS_VSX_INTRINSICS)
+    vec_vsx_st(Vector, 0, Buffer);
+#else
+    *((MLAS_INT32X4*)Buffer) = Vector;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vaddq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_add_epi32(Vector1, Vector2);
+#else
+    return Vector1 + Vector2;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vsubq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_sub_epi32(Vector1, Vector2);
+#else
+    return Vector1 - Vector2;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasAndInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vandq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_and_si128(Vector1, Vector2);
+#else
+    return Vector1 & Vector2;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasOrInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vorrq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_or_si128(Vector1, Vector2);
+#else
+    return Vector1 | Vector2;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasAndNotInt32x4(MLAS_INT32X4 VectorNot, MLAS_INT32X4 Vector)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vandq_s32(vmvnq_s32(VectorNot), Vector);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_andnot_si128(VectorNot, Vector);
+#else
+    return (~VectorNot) & Vector;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasXorInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return veorq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_xor_si128(Vector1, Vector2);
+#else
+    return Vector1 ^ Vector2;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasBlendInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2, MLAS_INT32X4 Selection)
+{
+    return MlasOrInt32x4(MlasAndInt32x4(Vector2, Selection), MlasAndNotInt32x4(Selection, Vector1));
+}
+
+template<unsigned ShiftCount>
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasShiftLeftInt32x4(MLAS_INT32X4 Vector)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vshlq_n_s32(Vector, ShiftCount);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_slli_epi32(Vector, ShiftCount);
+#else
+    return Vector << ShiftCount;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vmaxq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE41_INTRINSICS)
+    return _mm_max_epi32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2));
+#elif defined(MLAS_VSX_INTRINSICS)
+    return vec_vmaxsw(Vector1, Vector2);
+#else
+    return MlasBlendInt32x4(Vector2, Vector1, Vector1 > Vector2);
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_INT32X4
+MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vminq_s32(Vector1, Vector2);
+#elif defined(MLAS_SSE41_INTRINSICS)
+    return _mm_min_epi32(Vector1, Vector2);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1));
+#elif defined(MLAS_VSX_INTRINSICS)
+    return vec_vminsw(Vector1, Vector2);
+#else
+    return MlasBlendInt32x4(Vector2, Vector1, Vector2 > Vector1);
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vreinterpretq_f32_s32(Vector);
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_castsi128_ps(Vector);
+#else
+    return MLAS_FLOAT32X4(Vector);
+#endif
+}
+
 MLAS_FORCEINLINE
 MLAS_FLOAT32X4
 MlasBroadcastFloat32x4(float Value)
@ -847,6 +1042,8 @@ MlasLoadFloat32x4(const float* Buffer)
    return _mm_loadu_ps(Buffer);
 #elif defined(MLAS_VSX_INTRINSICS)
    return vec_vsx_ld(0, Buffer);
+#else
+    return *((MLAS_FLOAT32X4*)Buffer);
 #endif
 }

@ -860,6 +1057,8 @@ MlasStoreFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector)
    _mm_storeu_ps(Buffer, Vector);
 #elif defined(MLAS_VSX_INTRINSICS)
    vec_vsx_st(Vector, 0, Buffer);
+#else
+    *((MLAS_FLOAT32X4*)Buffer) = Vector;
 #endif
 }

@ -945,8 +1144,76 @@ MlasExtractLaneFloat32x4<0>(MLAS_FLOAT32X4 Vector)
    return _mm_cvtss_f32(Vector);
 }

+template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector)
+{
+    return _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Index3, Index2, Index1, Index0));
+}
+
 #endif

+#if !defined(MLAS_SSE2_INTRINSICS) && !defined(_MSC_VER)
+
+template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(__clang__)
+    return __builtin_shufflevector(Vector1, Vector2, Index0, Index1, Index2, Index3);
+#else
+    return __builtin_shuffle(Vector1, Vector2, MLAS_INT32X4{Index0, Index1, Index2, Index3});
+#endif
+}
+
+template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector)
+{
+    return MlasShuffleFloat32x4<Index0, Index1, Index2, Index3>(Vector, Vector);
+}
+
+#endif
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasInterleaveLowFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(MLAS_NEON64_INTRINSICS)
+    return vzip1q_f32(Vector1, Vector2);
+#elif defined(MLAS_NEON32_INTRINSICS)
+    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
+    return zipped.val[0];
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_unpacklo_ps(Vector1, Vector2);
+#elif defined(MLAS_VSX_INTRINSICS)
+    return vec_mergeh(Vector1, Vector2);
+#else
+    return MlasShuffleFloat32x4<0, 4, 1, 5>(Vector1, Vector2);
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasInterleaveHighFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(MLAS_NEON64_INTRINSICS)
+    return vzip2q_f32(Vector1, Vector2);
+#elif defined(MLAS_NEON32_INTRINSICS)
+    float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
+    return zipped.val[1];
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_unpackhi_ps(Vector1, Vector2);
+#elif defined(MLAS_VSX_INTRINSICS)
+    return vec_mergel(Vector1, Vector2);
+#else
+    return MlasShuffleFloat32x4<2, 6, 3, 7>(Vector1, Vector2);
+#endif
+}
+
 MLAS_FORCEINLINE
 MLAS_FLOAT32X4
 MlasAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
@ -1036,6 +1303,70 @@ MlasDivideFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
 #endif
 }

+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(MLAS_NEON_INTRINSICS)
+    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
+#elif defined(MLAS_SSE2_INTRINSICS)
+    return _mm_cmpgt_ps(Vector1, Vector2);
+#else
+    return Vector1 > Vector2;
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(MLAS_SSE2_INTRINSICS)
+    return _mm_and_ps(Vector1, Vector2);
+#else
+    return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(MLAS_SSE2_INTRINSICS)
+    return _mm_or_ps(Vector1, Vector2);
+#else
+    return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector)
+{
+#if defined(MLAS_SSE2_INTRINSICS)
+    return _mm_andnot_ps(VectorNot, Vector);
+#else
+    return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector)));
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
+{
+#if defined(MLAS_SSE2_INTRINSICS)
+    return _mm_xor_ps(Vector1, Vector2);
+#else
+    return MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
+#endif
+}
+
+MLAS_FORCEINLINE
+MLAS_FLOAT32X4
+MlasBlendFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2, MLAS_FLOAT32X4 Selection)
+{
+    return MlasOrFloat32x4(MlasAndFloat32x4(Vector2, Selection), MlasAndNotFloat32x4(Selection, Vector1));
+}
+
 MLAS_FORCEINLINE
 MLAS_FLOAT32X4
 MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
@ -1047,7 +1378,7 @@ MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
 #elif defined(MLAS_VSX_INTRINSICS)
    return vec_sel(Vector2, Vector1, vec_cmpgt(Vector1, Vector2));
 #else
-#error Unsupported architecture.
+    return MlasBlendFloat32x4(Vector2, Vector1, Vector1 > Vector2);
 #endif
 }

@ -1062,7 +1393,7 @@ MlasMinimumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
 #elif defined(MLAS_VSX_INTRINSICS)
    return vec_sel(Vector2, Vector1, vec_cmpgt(Vector2, Vector1));
 #else
-#error Unsupported architecture.
+    return MlasBlendFloat32x4(Vector2, Vector1, Vector2 > Vector1);
 #endif
 }

@ -1079,86 +1410,6 @@ MlasClampFloat32x4(MLAS_FLOAT32X4 Value, float LowerRange, float UpperRange)
    return Value;
 }

-MLAS_FORCEINLINE
-MLAS_FLOAT32X4
-MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_cmpgt_ps(Vector1, Vector2);
-#elif defined(MLAS_VSX_INTRINSICS)
-    return MLAS_FLOAT32X4(vec_cmpgt(Vector1, Vector2));
-#else
-#error Unsupported architecture.
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_FLOAT32X4
-MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_and_ps(Vector1, Vector2);
-#else
-    return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) & MLAS_INT32X4(Vector2));
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_FLOAT32X4
-MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_or_ps(Vector1, Vector2);
-#else
-    return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) | MLAS_INT32X4(Vector2));
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_FLOAT32X4
-MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(vreinterpretq_u32_f32(VectorNot)), vreinterpretq_u32_f32(Vector)));
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_andnot_ps(VectorNot, Vector);
-#else
-    return MLAS_FLOAT32X4(~MLAS_INT32X4(VectorNot) & MLAS_INT32X4(Vector));
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_FLOAT32X4
-MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_xor_ps(Vector1, Vector2);
-#else
-    return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) ^ MLAS_INT32X4(Vector2));
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_FLOAT32X4
-MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vreinterpretq_f32_s32(Vector);
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_castsi128_ps(Vector);
-#else
-    return MLAS_FLOAT32X4(Vector);
-#endif
-}
-
 MLAS_FORCEINLINE
 float
 MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector)
@ -1173,16 +1424,14 @@ MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector)
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    VectorLow = vpadd_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
-#elif defined(MLAS_SSE2_INTRINSICS)
-    Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2)));
-    Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
-    return _mm_cvtss_f32(Vector);
 #elif defined(MLAS_VSX_INTRINSICS)
    Vector = MlasAddFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1)));
    Vector = MlasAddFloat32x4(Vector, vec_splat(Vector, 1));
    return Vector[0];
 #else
-#error Unsupported architecture.
+    Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector));
+    Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector));
+    return MlasExtractLaneFloat32x4<0>(Vector);
 #endif
 }

@ -1198,108 +1447,24 @@ MlasReduceMaximumFloat32x4(MLAS_FLOAT32X4 Vector)
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    VectorLow = vpmax_f32(VectorLow, VectorHigh);
    return vget_lane_f32(VectorLow, 0);
-#elif defined(MLAS_SSE2_INTRINSICS)
-    Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2)));
-    Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
-    return _mm_cvtss_f32(Vector);
 #elif defined(MLAS_VSX_INTRINSICS)
    Vector = MlasMaximumFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1)));
    Vector = MlasMaximumFloat32x4(Vector, vec_splat(Vector, 1));
    return Vector[0];
 #else
-#error Unsupported architecture.
+    Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector));
+    Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector));
+    return MlasExtractLaneFloat32x4<0>(Vector);
 #endif
 }

-MLAS_FORCEINLINE
-MLAS_INT32X4
-MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vaddq_s32(Vector1, Vector2);
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_add_epi32(Vector1, Vector2);
-#else
-    return Vector1 + Vector2;
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_INT32X4
-MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vsubq_s32(Vector1, Vector2);
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_sub_epi32(Vector1, Vector2);
-#else
-    return Vector1 - Vector2;
-#endif
-}
-
-template<unsigned ShiftCount>
-MLAS_FORCEINLINE
-MLAS_INT32X4
-MlasShiftLeftInt32x4(MLAS_INT32X4 Vector)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vshlq_n_s32(Vector, ShiftCount);
-#elif defined(MLAS_SSE2_INTRINSICS)
-    return _mm_slli_epi32(Vector, ShiftCount);
-#else
-    return Vector << ShiftCount;
-#endif
-}
-
-#if !defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS)
-
-MLAS_FORCEINLINE
-MLAS_INT32X4
-MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vmaxq_s32(Vector1, Vector2);
-#elif defined(MLAS_SSE41_INTRINSICS)
-    return _mm_max_epi32(Vector1, Vector2);
-#elif defined(MLAS_VSX_INTRINSICS)
-    return vec_vmaxsw(Vector1, Vector2);
-#else
-#error Unsupported architecture.
-#endif
-}
-
-MLAS_FORCEINLINE
-MLAS_INT32X4
-MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
-{
-#if defined(MLAS_NEON_INTRINSICS)
-    return vminq_s32(Vector1, Vector2);
-#elif defined(MLAS_SSE41_INTRINSICS)
-    return _mm_min_epi32(Vector1, Vector2);
-#elif defined(MLAS_VSX_INTRINSICS)
-    return vec_vminsw(Vector1, Vector2);
-#else
-#error Unsupported architecture.
-#endif
-}
-
-#endif
-
 // calc 2^int(N)
 MLAS_FORCEINLINE
 MLAS_FLOAT32X4
 MlasPowerOf2Float32x4(MLAS_FLOAT32X4 Vector)
 {
-#if defined(MLAS_NEON_INTRINSICS)
-    MLAS_INT32X4 emm0 = vaddq_s32(vcvtq_s32_f32(Vector), MlasBroadcastInt32x4(127));
-    return vreinterpretq_f32_s32(vshlq_n_s32(emm0, 23));
-#elif defined(MLAS_SSE2_INTRINSICS)
-    MLAS_INT32X4 emm0 = _mm_add_epi32(_mm_cvttps_epi32(Vector), MlasBroadcastInt32x4(127));
-    return _mm_castsi128_ps(_mm_slli_epi32(emm0, 23));
-#elif defined(MLAS_VSX_INTRINSICS)
-    MLAS_INT32X4 emm0 = vec_cts(Vector, 0) + MlasBroadcastInt32x4(127);
-    return MLAS_FLOAT32X4(vec_sl(emm0, MLAS_UINT32X4(MlasBroadcastInt32x4(23))));
-#endif
+    MLAS_INT32X4 emm0 = MlasAddInt32x4(MlasCastToInt32x4(Vector), MlasBroadcastInt32x4(127));
+    return MlasReinterpretAsFloat32x4(MlasShiftLeftInt32x4<23>(emm0));
 }

 //
--- a/onnxruntime/core/mlas/lib/pooling.cpp
+++ b/onnxruntime/core/mlas/lib/pooling.cpp
@ -80,7 +80,7 @@ struct MLAS_MAXIMUM_POOLING

    static float Reduce(float Reduction, float Value)
    {
-        return (std::max)(Reduction, Value);
+        return std::max(Reduction, Value);
    }

    static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
@ -293,8 +293,8 @@ Return Value:
            const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
            const int64_t iwEnd64 = iwStart64 + KernelWidth;

-            const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
-            const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
+            const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
+            const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));

            float m = PoolingType::InitialValue();

@ -370,16 +370,16 @@ Return Value:
            const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
            const int64_t ihEnd64 = ihStart64 + KernelHeight;

-            const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
-            const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
+            const size_t ihStart = size_t(std::max(ihStart64, int64_t(0)));
+            const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight)));

            for (size_t pw = 0; pw < OutputWidth; pw++) {

                const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
                const int64_t iwEnd64 = iwStart64 + KernelWidth;

-                const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
-                const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
+                const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
+                const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));

                float m = PoolingType::InitialValue();

@ -604,14 +604,12 @@ Return Value:
                        break;
                    }

-#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
-                    MlasStoreLaneFloat32x4<0>(Output, Reduction);
-                    MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
-#elif defined(MLAS_SSE2_INTRINSICS)
+#if defined(MLAS_SSE2_INTRINSICS)
                    Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
                    MlasStoreLowHalfFloat32x4(Output, Reduction);
 #else
-#error Unsupported architecture.
+                    MlasStoreLaneFloat32x4<0>(Output, Reduction);
+                    MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
 #endif

                    Output += 2;
@ -688,24 +686,24 @@ Return Value:
            const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth;
            const int64_t idEnd64 = idStart64 + KernelDepth;

-            const size_t idStart = size_t((std::max)(idStart64, int64_t(0)));
-            const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth)));
+            const size_t idStart = size_t(std::max(idStart64, int64_t(0)));
+            const size_t idEnd = size_t(std::min(idEnd64, int64_t(InputDepth)));

            for (size_t ph = 0; ph < OutputHeight; ph++) {

                const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
                const int64_t ihEnd64 = ihStart64 + KernelHeight;

-                const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
-                const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
+                const size_t ihStart = size_t(std::max(ihStart64, int64_t(0)));
+                const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight)));

                for (size_t pw = 0; pw < OutputWidth; pw++) {

                    const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
                    const int64_t iwEnd64 = iwStart64 + KernelWidth;

-                    const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
-                    const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
+                    const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
+                    const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));

                    float m = PoolingType::InitialValue();

@ -976,14 +974,12 @@ Return Value:
                            break;
                        }

-#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
-                        MlasStoreLaneFloat32x4<0>(Output, Reduction);
-                        MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
-#elif defined(MLAS_SSE2_INTRINSICS)
+#if defined(MLAS_SSE2_INTRINSICS)
                        Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
                        MlasStoreLowHalfFloat32x4(Output, Reduction);
 #else
-#error Unsupported architecture.
+                        MlasStoreLaneFloat32x4<0>(Output, Reduction);
+                        MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
 #endif

                        Output += 2;
--- a/onnxruntime/core/mlas/lib/qgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm.cpp
@ -181,48 +181,39 @@ Return Value:
    }
 }

-template<typename BType>
 void
 MlasGemmU8X8CopyPackBProcessSse(
    int16_t* D,
    __m128i BytesRow0,
    __m128i BytesRow1,
-    __m128i ZeroVector,
+    __m128i BitFlipVector,
    __m128i ColumnSums[2]
    )
 {
    __m128i BytesInterleaved = _mm_unpacklo_epi8(BytesRow0, BytesRow1);
-    __m128i WordsInterleaved[2];

-    //
-    // Zero or sign extend the bytes to words.
-    //
+    BytesInterleaved = _mm_xor_si128(BytesInterleaved, BitFlipVector);

-    if (std::is_same<BType, uint8_t>::value) {
-        WordsInterleaved[0] = _mm_unpacklo_epi8(BytesInterleaved, ZeroVector);
-        WordsInterleaved[1] = _mm_unpackhi_epi8(BytesInterleaved, ZeroVector);
-    } else {
-        WordsInterleaved[0] = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8);
-        WordsInterleaved[1] = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8);
-    }
+    __m128i WordsInterleaved0 = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8);
+    __m128i WordsInterleaved1 = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8);

-    ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved[0]);
-    ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved[1]);
+    ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved0);
+    ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved1);

-    _mm_storeu_si128((__m128i*)&D[0], WordsInterleaved[0]);
-    _mm_storeu_si128((__m128i*)&D[8], WordsInterleaved[1]);
+    _mm_storeu_si128((__m128i*)&D[0], WordsInterleaved0);
+    _mm_storeu_si128((__m128i*)&D[8], WordsInterleaved1);
 }

-template<typename BType>
 void
 MlasGemmU8X8CopyPackBSse(
    int16_t* D,
-    const BType* B,
+    const uint8_t* B,
    size_t ldb,
    size_t CountN,
    size_t CountK,
    int32_t* ColumnSumVector,
-    int16_t offa
+    int16_t offa,
+    bool BTypeIsSigned
    )
 /*++

@ -256,9 +247,8 @@ Return Value:

 --*/
 {
-    const __m128i ZeroVector = _mm_setzero_si128();
    const __m128i OffsetBroadcast = _mm_set1_epi16(offa);
-    BType PaddedMatrixBData[16] = { 0 };
+    const __m128i BitFlipVector = _mm_set1_epi32(BTypeIsSigned ? 0 : 0x80808080);

    //
    // Process 8 columns of matrix B in a loop.
@ -266,12 +256,12 @@ Return Value:

    while (CountN >= 8) {

-        const BType* b = B;
+        const uint8_t* b = B;
        size_t k = CountK;
        __m128i ColumnSums[2];

-        ColumnSums[0] = ZeroVector;
-        ColumnSums[1] = ZeroVector;
+        ColumnSums[0] = _mm_setzero_si128();
+        ColumnSums[1] = _mm_setzero_si128();

        //
        // Interleave rows of matrix B and write to the packed buffer.
@ -286,7 +276,7 @@ Return Value:
            __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]);
            __m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&b[ldb]);

-            MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums);
+            MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums);

            b += ldb * 2;
            D += 16;
@ -297,7 +287,7 @@ Return Value:

            __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]);

-            MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums);
+            MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums);

            D += 16;
        }
@ -325,12 +315,15 @@ Return Value:

    if (CountN > 0) {

-        const BType* b = B;
+        const uint8_t* b = B;
        size_t k = CountK;
        __m128i ColumnSums[2];
+        uint8_t PaddedMatrixBData[16];

-        ColumnSums[0] = ZeroVector;
-        ColumnSums[1] = ZeroVector;
+        _mm_storeu_si128((__m128i*)PaddedMatrixBData, BitFlipVector);
+
+        ColumnSums[0] = _mm_setzero_si128();
+        ColumnSums[1] = _mm_setzero_si128();

        //
        // Interleave rows of matrix B using an intermediate zero padded stack
@ -339,9 +332,9 @@ Return Value:

        while (k >= 2) {

-            const BType* bcopy = b;
-            BType* padded = PaddedMatrixBData;
-            BType* padded_end = padded + CountN;
+            const uint8_t* bcopy = b;
+            uint8_t* padded = PaddedMatrixBData;
+            uint8_t* padded_end = padded + CountN;

            do {
                padded[0] = bcopy[0];
@ -353,7 +346,7 @@ Return Value:
            __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]);
            __m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[8]);

-            MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums);
+            MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums);

            b += ldb * 2;
            D += 16;
@ -362,9 +355,9 @@ Return Value:

        if (k > 0) {

-            const BType* bcopy = b;
-            BType* padded = PaddedMatrixBData;
-            BType* padded_end = padded + CountN;
+            const uint8_t* bcopy = b;
+            uint8_t* padded = PaddedMatrixBData;
+            uint8_t* padded_end = padded + CountN;

            do {
                padded[0] = bcopy[0];
@ -374,7 +367,7 @@ Return Value:

            __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]);

-            MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums);
+            MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums);
        }

        //
@ -648,6 +641,10 @@ Return Value:
    size_t StrideN = MLAS_GEMM_U8X8_STRIDEN_SSE;
    size_t StrideK = MLAS_GEMM_U8X8_STRIDEK_SSE;

+    if (!WorkBlock->BTypeIsSigned) {
+        offb = int8_t(offb ^ 0x80);
+    }
+
    //
    // Step through each slice of matrix B along the K dimension.
    //
@ -656,7 +653,7 @@ Return Value:

    for (size_t k = 0; k < K; k += CountK) {

-        CountK = (std::min)(K - k, StrideK);
+        CountK = std::min(K - k, StrideK);

        //
        // Step through each slice of matrix B along the N dimension.
@ -666,7 +663,7 @@ Return Value:

        for (size_t n = 0; n < N; n += CountN) {

-            CountN = (std::min)(N - n, StrideN);
+            CountN = std::min(N - n, StrideN);

            //
            // Copy a panel of matrix B to a local packed buffer.
@ -674,13 +671,8 @@ Return Value:

            const uint8_t* b = B + n + k * ldb;

-            if (WorkBlock->BTypeIsSigned) {
-                MlasGemmU8X8CopyPackBSse(PanelB, (const int8_t*)b, ldb, CountN,
-                    CountK, ColumnSumVector, -int16_t(offa));
-            } else {
-                MlasGemmU8X8CopyPackBSse(PanelB, (const uint8_t*)b, ldb, CountN,
-                    CountK, ColumnSumVector, -int16_t(offa));
-            }
+            MlasGemmU8X8CopyPackBSse(PanelB, b, ldb, CountN, CountK,
+                ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);

            //
            // Step through each slice of matrix A along the M dimension.
@ -694,7 +686,7 @@ Return Value:

            for (size_t m = 0; m < M; m += CountM) {

-                CountM = (std::min)(M - m, StrideM);
+                CountM = std::min(M - m, StrideM);

                //
                // Copy a panel of matrix A to a local packed buffer.
@ -892,7 +884,7 @@ Return Value:

    for (size_t k = 0; k < K; k += CountK) {

-        CountK = (std::min)(K - k, StrideK);
+        CountK = std::min(K - k, StrideK);

        //
        // Step through each slice of matrix B along the N dimension.
@ -902,7 +894,7 @@ Return Value:

        for (size_t n = 0; n < N; n += CountN) {

-            CountN = (std::min)(N - n, StrideN);
+            CountN = std::min(N - n, StrideN);

            //
            // Copy a panel of matrix B to a local packed buffer.
@ -910,8 +902,8 @@ Return Value:

            const int8_t* b = (const int8_t*)B + n + k * ldb;

-            MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN,
-                CountK, ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
+            MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK,
+                ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);

            //
            // Step through each slice of matrix A along the M dimension.
@ -925,7 +917,7 @@ Return Value:

            for (size_t m = 0; m < M; m += CountM) {

-                CountM = (std::min)(M - m, StrideM);
+                CountM = std::min(M - m, StrideM);

                //
                // Copy a panel of matrix A to a local packed buffer.
@ -1038,7 +1030,7 @@ Return Value:

    for (size_t k = 0; k < K; k += CountK) {

-        CountK = (std::min)(K - k, StrideK);
+        CountK = std::min(K - k, StrideK);

        //
        // Step through each slice of matrix B along the N dimension.
@ -1048,16 +1040,16 @@ Return Value:

        for (size_t n = 0; n < N; n += CountN) {

-            CountN = (std::min)(N - n, StrideN);
+            CountN = std::min(N - n, StrideN);

            //
            // Copy a panel of matrix B to a local packed buffer.
            //

-            const uint8_t* b = (const uint8_t*)B + n + k * ldb;
+            const uint8_t* b = B + n + k * ldb;

-            MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, ColumnSumVector,
-                -int16_t(offa));
+            MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK,
+                ColumnSumVector, -int16_t(offa));

            //
            // Step through each slice of matrix A along the M dimension.
@ -1071,7 +1063,7 @@ Return Value:

            for (size_t m = 0; m < M; m += CountM) {

-                CountM = (std::min)(M - m, StrideM);
+                CountM = std::min(M - m, StrideM);

                //
                // Copy a panel of matrix A to a local packed buffer.
--- a/onnxruntime/core/mlas/lib/reorder.cpp
+++ b/onnxruntime/core/mlas/lib/reorder.cpp
@ -213,7 +213,7 @@ Return Value:

    for (size_t i = InputChannels; i > 0;) {

-        const size_t InputChannelsThisIteration = (std::min)(i, BlockSize);
+        const size_t InputChannelsThisIteration = std::min(i, BlockSize);
        i -= InputChannelsThisIteration;

        const float* s = S;
@ -308,7 +308,7 @@ Return Value:

        for (size_t o = OutputChannels; o > 0;) {

-            const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
+            const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
            const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
            o -= OutputChannelsThisIteration;

@ -415,7 +415,7 @@ Return Value:

            for (size_t o = OutputChannels; o > 0;) {

-                const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
+                const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
                const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
                o -= OutputChannelsThisIteration;

@ -523,7 +523,7 @@ Return Value:

    for (size_t o = OutputChannels; o > 0;) {

-        const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
+        const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
        const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
        o -= OutputChannelsThisIteration;

@ -538,7 +538,7 @@ Return Value:

        for (size_t i = InputChannels; i > 0;) {

-            const size_t InputChannelsThisIteration = (std::min)(i, BlockSize);
+            const size_t InputChannelsThisIteration = std::min(i, BlockSize);
            i -= InputChannelsThisIteration;

            //
@ -671,7 +671,7 @@ Return Value:

    for (size_t o = OutputChannels; o > 0;) {

-        const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
+        const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
        const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
        o -= OutputChannelsThisIteration;

--- a/onnxruntime/core/mlas/lib/sgemm.cpp
+++ b/onnxruntime/core/mlas/lib/sgemm.cpp
@ -422,29 +422,15 @@ Return Value:
        t1 = o0.val[1];
        t2 = o1.val[0];
        t3 = o1.val[1];
-#elif defined(MLAS_SSE2_INTRINSICS)
-        // N.B. The MSVC version of _MM_TRANSPOSE4_PS uses shufps which is
-        // slightly larger than the below sequence, so manually expand the
-        // matrix transpose.
-        __m128 z0 = _mm_unpacklo_ps(t0, t1);
-        __m128 z1 = _mm_unpackhi_ps(t0, t1);
-        __m128 z2 = _mm_unpacklo_ps(t2, t3);
-        __m128 z3 = _mm_unpackhi_ps(t2, t3);
-        t0 = _mm_movelh_ps(z0, z2);
-        t1 = _mm_movehl_ps(z2, z0);
-        t2 = _mm_movelh_ps(z1, z3);
-        t3 = _mm_movehl_ps(z3, z1);
-#elif defined(MLAS_VSX_INTRINSICS)
-        __vector float z0 = vec_mergeh(t0, t2);
-        __vector float z1 = vec_mergel(t0, t2);
-        __vector float z2 = vec_mergeh(t1, t3);
-        __vector float z3 = vec_mergel(t1, t3);
-        t0 = vec_mergeh(z0, z2);
-        t1 = vec_mergel(z0, z2);
-        t2 = vec_mergeh(z1, z3);
-        t3 = vec_mergel(z1, z3);
 #else
-#error Unsupported architecture.
+        MLAS_FLOAT32X4 z0 = MlasInterleaveLowFloat32x4(t0, t2);
+        MLAS_FLOAT32X4 z1 = MlasInterleaveHighFloat32x4(t0, t2);
+        MLAS_FLOAT32X4 z2 = MlasInterleaveLowFloat32x4(t1, t3);
+        MLAS_FLOAT32X4 z3 = MlasInterleaveHighFloat32x4(t1, t3);
+        t0 = MlasInterleaveLowFloat32x4(z0, z2);
+        t1 = MlasInterleaveHighFloat32x4(z0, z2);
+        t2 = MlasInterleaveLowFloat32x4(z1, z3);
+        t3 = MlasInterleaveHighFloat32x4(z1, z3);
 #endif

        MlasStoreAlignedFloat32x4(&D[0], t0);
@ -639,7 +625,14 @@ Return Value:
                MLAS_FLOAT32X4 t0 = MlasLoadFloat32x4(&b[0]);
                MLAS_FLOAT32X4 t1 = MlasLoadFloat32x4(&b[ldb]);

-#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
+#if defined(MLAS_SSE2_INTRINSICS)
+                __m128 v0 = _mm_unpacklo_ps(t0, t1);
+                __m128 v1 = _mm_unpackhi_ps(t0, t1);
+                _mm_storel_pi((__m64*)&d[0], v0);
+                _mm_storeh_pi((__m64*)&d[16], v0);
+                _mm_storel_pi((__m64*)&d[32], v1);
+                _mm_storeh_pi((__m64*)&d[48], v1);
+#else
                MlasStoreLaneFloat32x4<0>(&d[0], t0);
                MlasStoreLaneFloat32x4<0>(&d[1], t1);
                MlasStoreLaneFloat32x4<1>(&d[16], t0);
@ -648,15 +641,6 @@ Return Value:
                MlasStoreLaneFloat32x4<2>(&d[33], t1);
                MlasStoreLaneFloat32x4<3>(&d[48], t0);
                MlasStoreLaneFloat32x4<3>(&d[49], t1);
-#elif defined(MLAS_SSE2_INTRINSICS)
-                __m128 v0 = _mm_unpacklo_ps(t0, t1);
-                __m128 v1 = _mm_unpackhi_ps(t0, t1);
-                _mm_storel_pi((__m64*)&d[0], v0);
-                _mm_storeh_pi((__m64*)&d[16], v0);
-                _mm_storel_pi((__m64*)&d[32], v1);
-                _mm_storeh_pi((__m64*)&d[48], v1);
-#else
-#error Unsupported architecture.
 #endif

                d += 2;
--- a/onnxruntime/core/mlas/lib/snchwc.cpp
+++ b/onnxruntime/core/mlas/lib/snchwc.cpp
@ -539,7 +539,7 @@ struct MLAS_NCHWC_GROUPED_CONV_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM

    void ComputeFilterCount(void)
    {
-        FilterCount = (std::min)(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
+        FilterCount = std::min(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
    }

    void PrepareWork(int32_t Index)
@ -686,7 +686,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
            // Compute the number of output lines to process in this iteration.
            //

-            size_t WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
+            size_t WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);

            //
            // Walk over each input image organized as a set of NCHWc blocks.
@ -898,7 +898,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
            size_t WorkThisIteration;

            if (StrideHeight == 1 && StrideWidth == 1) {
-                WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
+                WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);
            } else {
                WorkThisIteration = 1;
            }
@ -923,7 +923,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM

                constexpr size_t MaximumInputChannelBatch = 128;

-                InputChannelBatch = (std::min)(InputChannels - ic, MaximumInputChannelBatch);
+                InputChannelBatch = std::min(InputChannels - ic, MaximumInputChannelBatch);

                unsigned KernelFlags = ComputeKernelFlags(ic, InputChannelBatch);

--- a/onnxruntime/core/mlas/lib/tanh.cpp
+++ b/onnxruntime/core/mlas/lib/tanh.cpp
@ -119,7 +119,7 @@ Return Value:

        float Value = *Input++;

-        Value = (std::min)(MlasTanhConstants.UpperRange, (std::max)(MlasTanhConstants.LowerRange, Value));
+        Value = std::min(MlasTanhConstants.UpperRange, std::max(MlasTanhConstants.LowerRange, Value));

        float ValueSquared = Value * Value;