diff --git a/onnxruntime/core/mlas/lib/activate.cpp b/onnxruntime/core/mlas/lib/activate.cpp index b0a2e331fc..8317ec1033 100644 --- a/onnxruntime/core/mlas/lib/activate.cpp +++ b/onnxruntime/core/mlas/lib/activate.cpp @@ -109,7 +109,7 @@ struct MLAS_ACTIVATION_FUNCTION #if defined(MLAS_SSE2_INTRINSICS) return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); #else - return (std::max)(Value, 0.0f); + return std::max(Value, 0.0f); #endif } }; @@ -140,12 +140,11 @@ struct MLAS_ACTIVATION_FUNCTION #elif defined(MLAS_AVX_INTRINSICS) return _mm_blendv_ps(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value)); #elif defined(MLAS_SSE2_INTRINSICS) - __m128 Selection = _mm_cmple_ps(ZeroFloat32x4, Value); - return _mm_or_ps(_mm_and_ps(Value, Selection), _mm_andnot_ps(Selection, ValueTimesAlpha)); + return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value)); #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value)); #else -#error Unsupported architecture. + return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value); #endif } @@ -186,8 +185,8 @@ struct MLAS_ACTIVATION_FUNCTION #if defined(MLAS_SSE2_INTRINSICS) return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); #else - Value = (std::max)(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast)); - Value = (std::min)(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast)); + Value = std::max(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast)); + Value = std::min(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast)); return Value; #endif diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index b81c4b07b8..a070127928 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -552,7 +552,7 @@ Return Value: while (N > 0) { - Maximum = (std::max)(Maximum, *Input); + Maximum = std::max(Maximum, *Input); Input += 1; N -= 1; diff --git a/onnxruntime/core/mlas/lib/erf.cpp b/onnxruntime/core/mlas/lib/erf.cpp index 34390f9582..ebd5a3dd54 100644 --- a/onnxruntime/core/mlas/lib/erf.cpp +++ b/onnxruntime/core/mlas/lib/erf.cpp @@ -191,7 +191,7 @@ Return Value: float r; if (AbsValue > MlasErfConstants.ErfSplitBoundary) { - AbsValue = (std::min)(MlasErfConstants.ErfUpperAbsRange, AbsValue); + AbsValue = std::min(MlasErfConstants.ErfUpperAbsRange, AbsValue); float r_big = MlasErfConstants.ErfBIG_P0; r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P1; r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P2; @@ -201,7 +201,7 @@ Return Value: r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P6_Minus_One; r_big = r_big * AbsValue + AbsValue; - r_big = (std::max)(-r_big, MlasErfConstants.Exp_LowerRange); + r_big = std::max(-r_big, MlasErfConstants.Exp_LowerRange); r = MlasErfConstants.Exp_Log2Reciprocal * r_big + MlasErfConstants.Exp_C; r -= MlasErfConstants.Exp_C; float fx = r * MlasErfConstants.Exp_log2_hi + r_big; diff --git a/onnxruntime/core/mlas/lib/logistic.cpp b/onnxruntime/core/mlas/lib/logistic.cpp index 9e657f1892..b9fe78c37f 100644 --- a/onnxruntime/core/mlas/lib/logistic.cpp +++ b/onnxruntime/core/mlas/lib/logistic.cpp @@ -121,7 +121,7 @@ Return Value: float Value = *Input++; - Value = (std::min)(MlasLogisticConstants.UpperRange, (std::max)(MlasLogisticConstants.LowerRange, Value)); + Value = std::min(MlasLogisticConstants.UpperRange, std::max(MlasLogisticConstants.LowerRange, Value)); float ValueSquared = Value * Value; diff --git a/onnxruntime/core/mlas/lib/mlasi.h 
b/onnxruntime/core/mlas/lib/mlasi.h index 6be271309b..6b610de268 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -25,6 +25,12 @@ Abstract: #include #if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#ifndef NOMINMAX +#define NOMINMAX +#endif #include #include #else @@ -753,8 +759,6 @@ MlasPartitionWork( #if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__)) #define MLAS_FMA3_INTRINSICS #endif -#else -#error Unsupported architecture. #endif #if defined(MLAS_NEON_INTRINSICS) @@ -785,6 +789,21 @@ MlasReinterpretAsInt32x4(MLAS_FLOAT32X4 Vector) #endif } +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasCastToInt32x4(MLAS_FLOAT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vcvtq_s32_f32(Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_cvttps_epi32(Vector); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_cts(Vector, 0); +#else + return MLAS_INT32X4{int32_t(Vector[0]), int32_t(Vector[1]), int32_t(Vector[2]), int32_t(Vector[3])}; +#endif +} + MLAS_FORCEINLINE MLAS_INT32X4 MlasBroadcastInt32x4(int32_t Value) @@ -798,6 +817,182 @@ MlasBroadcastInt32x4(int32_t Value) #endif } +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasLoadInt32x4(const int32_t* Buffer) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vld1q_s32(Buffer); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_loadu_si128((const __m128i*)Buffer); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_vsx_ld(0, Buffer); +#else + return *((MLAS_INT32X4*)Buffer); +#endif +} + +MLAS_FORCEINLINE +void +MlasStoreInt32x4(int32_t* Buffer, MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + vst1q_s32(Buffer, Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + _mm_storeu_si128((__m128i*)Buffer, Vector); +#elif defined(MLAS_VSX_INTRINSICS) + vec_vsx_st(Vector, 0, Buffer); +#else + *((MLAS_INT32X4*)Buffer) = Vector; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vaddq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_add_epi32(Vector1, Vector2); +#else + return Vector1 + Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vsubq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_sub_epi32(Vector1, Vector2); +#else + return Vector1 - Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasAndInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vandq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_and_si128(Vector1, Vector2); +#else + return Vector1 & Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasOrInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vorrq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_or_si128(Vector1, Vector2); +#else + return Vector1 | Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasAndNotInt32x4(MLAS_INT32X4 VectorNot, MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vandq_s32(vmvnq_s32(VectorNot), Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_andnot_si128(VectorNot, Vector); +#else + return (~VectorNot) & Vector; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasXorInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return veorq_s32(Vector1, Vector2); 
+#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_xor_si128(Vector1, Vector2); +#else + return Vector1 ^ Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasBlendInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2, MLAS_INT32X4 Selection) +{ + return MlasOrInt32x4(MlasAndInt32x4(Vector2, Selection), MlasAndNotInt32x4(Selection, Vector1)); +} + +template +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftLeftInt32x4(MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vshlq_n_s32(Vector, ShiftCount); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_slli_epi32(Vector, ShiftCount); +#else + return Vector << ShiftCount; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vmaxq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE41_INTRINSICS) + return _mm_max_epi32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2)); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_vmaxsw(Vector1, Vector2); +#else + return MlasBlendInt32x4(Vector2, Vector1, Vector1 > Vector2); +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vminq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE41_INTRINSICS) + return _mm_min_epi32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1)); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_vminsw(Vector1, Vector2); +#else + return MlasBlendInt32x4(Vector2, Vector1, Vector2 > Vector1); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vreinterpretq_f32_s32(Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_castsi128_ps(Vector); +#else + return MLAS_FLOAT32X4(Vector); +#endif +} + MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasBroadcastFloat32x4(float Value) @@ -847,6 +1042,8 @@ MlasLoadFloat32x4(const float* Buffer) return _mm_loadu_ps(Buffer); #elif defined(MLAS_VSX_INTRINSICS) return vec_vsx_ld(0, Buffer); +#else + return *((MLAS_FLOAT32X4*)Buffer); #endif } @@ -860,6 +1057,8 @@ MlasStoreFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector) _mm_storeu_ps(Buffer, Vector); #elif defined(MLAS_VSX_INTRINSICS) vec_vsx_st(Vector, 0, Buffer); +#else + *((MLAS_FLOAT32X4*)Buffer) = Vector; #endif } @@ -945,8 +1144,76 @@ MlasExtractLaneFloat32x4<0>(MLAS_FLOAT32X4 Vector) return _mm_cvtss_f32(Vector); } +template +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector) +{ + return _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Index3, Index2, Index1, Index0)); +} + #endif +#if !defined(MLAS_SSE2_INTRINSICS) && !defined(_MSC_VER) + +template +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(__clang__) + return __builtin_shufflevector(Vector1, Vector2, Index0, Index1, Index2, Index3); +#else + return __builtin_shuffle(Vector1, Vector2, MLAS_INT32X4{Index0, Index1, Index2, Index3}); +#endif +} + +template +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector) +{ + return MlasShuffleFloat32x4(Vector, Vector); +} + +#endif + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasInterleaveLowFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_NEON64_INTRINSICS) + return vzip1q_f32(Vector1, Vector2); +#elif 
defined(MLAS_NEON32_INTRINSICS) + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); + return zipped.val[0]; +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_unpacklo_ps(Vector1, Vector2); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_mergeh(Vector1, Vector2); +#else + return MlasShuffleFloat32x4<0, 4, 1, 5>(Vector1, Vector2); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasInterleaveHighFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_NEON64_INTRINSICS) + return vzip2q_f32(Vector1, Vector2); +#elif defined(MLAS_NEON32_INTRINSICS) + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); + return zipped.val[1]; +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_unpackhi_ps(Vector1, Vector2); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_mergel(Vector1, Vector2); +#else + return MlasShuffleFloat32x4<2, 6, 3, 7>(Vector1, Vector2); +#endif +} + MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) @@ -1036,6 +1303,70 @@ MlasDivideFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) #endif } +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2)); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_cmpgt_ps(Vector1, Vector2); +#else + return Vector1 > Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_and_ps(Vector1, Vector2); +#else + return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_or_ps(Vector1, Vector2); +#else + return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_andnot_ps(VectorNot, Vector); +#else + return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_xor_ps(Vector1, Vector2); +#else + return MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasBlendFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2, MLAS_FLOAT32X4 Selection) +{ + return MlasOrFloat32x4(MlasAndFloat32x4(Vector2, Selection), MlasAndNotFloat32x4(Selection, Vector1)); +} + MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) @@ -1047,7 +1378,7 @@ MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(Vector2, Vector1, vec_cmpgt(Vector1, Vector2)); #else -#error Unsupported architecture. 
+ return MlasBlendFloat32x4(Vector2, Vector1, Vector1 > Vector2); #endif } @@ -1062,7 +1393,7 @@ MlasMinimumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(Vector2, Vector1, vec_cmpgt(Vector2, Vector1)); #else -#error Unsupported architecture. + return MlasBlendFloat32x4(Vector2, Vector1, Vector2 > Vector1); #endif } @@ -1079,86 +1410,6 @@ MlasClampFloat32x4(MLAS_FLOAT32X4 Value, float LowerRange, float UpperRange) return Value; } -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2)); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_cmpgt_ps(Vector1, Vector2); -#elif defined(MLAS_VSX_INTRINSICS) - return MLAS_FLOAT32X4(vec_cmpgt(Vector1, Vector2)); -#else -#error Unsupported architecture. -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_and_ps(Vector1, Vector2); -#else - return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) & MLAS_INT32X4(Vector2)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_or_ps(Vector1, Vector2); -#else - return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) | MLAS_INT32X4(Vector2)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(vreinterpretq_u32_f32(VectorNot)), vreinterpretq_u32_f32(Vector))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_andnot_ps(VectorNot, Vector); -#else - return MLAS_FLOAT32X4(~MLAS_INT32X4(VectorNot) & MLAS_INT32X4(Vector)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_xor_ps(Vector1, Vector2); -#else - return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) ^ MLAS_INT32X4(Vector2)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_s32(Vector); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_castsi128_ps(Vector); -#else - return MLAS_FLOAT32X4(Vector); -#endif -} - MLAS_FORCEINLINE float MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector) @@ -1173,16 +1424,14 @@ MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector) VectorLow = vpadd_f32(VectorLow, VectorHigh); VectorLow = vpadd_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); -#elif defined(MLAS_SSE2_INTRINSICS) - Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2))); - Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtss_f32(Vector); #elif defined(MLAS_VSX_INTRINSICS) Vector = MlasAddFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1))); Vector = 
MlasAddFloat32x4(Vector, vec_splat(Vector, 1)); return Vector[0]; #else -#error Unsupported architecture. + Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector)); + Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector)); + return MlasExtractLaneFloat32x4<0>(Vector); #endif } @@ -1198,108 +1447,24 @@ MlasReduceMaximumFloat32x4(MLAS_FLOAT32X4 Vector) VectorLow = vpmax_f32(VectorLow, VectorHigh); VectorLow = vpmax_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); -#elif defined(MLAS_SSE2_INTRINSICS) - Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2))); - Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtss_f32(Vector); #elif defined(MLAS_VSX_INTRINSICS) Vector = MlasMaximumFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1))); Vector = MlasMaximumFloat32x4(Vector, vec_splat(Vector, 1)); return Vector[0]; #else -#error Unsupported architecture. + Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector)); + Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector)); + return MlasExtractLaneFloat32x4<0>(Vector); #endif } -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vaddq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_add_epi32(Vector1, Vector2); -#else - return Vector1 + Vector2; -#endif -} - -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vsubq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_sub_epi32(Vector1, Vector2); -#else - return Vector1 - Vector2; -#endif -} - -template -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasShiftLeftInt32x4(MLAS_INT32X4 Vector) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vshlq_n_s32(Vector, ShiftCount); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_slli_epi32(Vector, ShiftCount); -#else - return Vector << ShiftCount; -#endif -} - -#if !defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS) - -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vmaxq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE41_INTRINSICS) - return _mm_max_epi32(Vector1, Vector2); -#elif defined(MLAS_VSX_INTRINSICS) - return vec_vmaxsw(Vector1, Vector2); -#else -#error Unsupported architecture. -#endif -} - -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vminq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE41_INTRINSICS) - return _mm_min_epi32(Vector1, Vector2); -#elif defined(MLAS_VSX_INTRINSICS) - return vec_vminsw(Vector1, Vector2); -#else -#error Unsupported architecture. 
-#endif -} - -#endif - // calc 2^int(N) MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasPowerOf2Float32x4(MLAS_FLOAT32X4 Vector) { -#if defined(MLAS_NEON_INTRINSICS) - MLAS_INT32X4 emm0 = vaddq_s32(vcvtq_s32_f32(Vector), MlasBroadcastInt32x4(127)); - return vreinterpretq_f32_s32(vshlq_n_s32(emm0, 23)); -#elif defined(MLAS_SSE2_INTRINSICS) - MLAS_INT32X4 emm0 = _mm_add_epi32(_mm_cvttps_epi32(Vector), MlasBroadcastInt32x4(127)); - return _mm_castsi128_ps(_mm_slli_epi32(emm0, 23)); -#elif defined(MLAS_VSX_INTRINSICS) - MLAS_INT32X4 emm0 = vec_cts(Vector, 0) + MlasBroadcastInt32x4(127); - return MLAS_FLOAT32X4(vec_sl(emm0, MLAS_UINT32X4(MlasBroadcastInt32x4(23)))); -#endif + MLAS_INT32X4 emm0 = MlasAddInt32x4(MlasCastToInt32x4(Vector), MlasBroadcastInt32x4(127)); + return MlasReinterpretAsFloat32x4(MlasShiftLeftInt32x4<23>(emm0)); } // diff --git a/onnxruntime/core/mlas/lib/pooling.cpp b/onnxruntime/core/mlas/lib/pooling.cpp index 0399b56f76..805b6e243f 100644 --- a/onnxruntime/core/mlas/lib/pooling.cpp +++ b/onnxruntime/core/mlas/lib/pooling.cpp @@ -80,7 +80,7 @@ struct MLAS_MAXIMUM_POOLING static float Reduce(float Reduction, float Value) { - return (std::max)(Reduction, Value); + return std::max(Reduction, Value); } static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value) @@ -293,8 +293,8 @@ Return Value: const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth; const int64_t iwEnd64 = iwStart64 + KernelWidth; - const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0))); - const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth))); + const size_t iwStart = size_t(std::max(iwStart64, int64_t(0))); + const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth))); float m = PoolingType::InitialValue(); @@ -370,16 +370,16 @@ Return Value: const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight; const int64_t ihEnd64 = ihStart64 + KernelHeight; - const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0))); - const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight))); + const size_t ihStart = size_t(std::max(ihStart64, int64_t(0))); + const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight))); for (size_t pw = 0; pw < OutputWidth; pw++) { const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth; const int64_t iwEnd64 = iwStart64 + KernelWidth; - const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0))); - const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth))); + const size_t iwStart = size_t(std::max(iwStart64, int64_t(0))); + const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth))); float m = PoolingType::InitialValue(); @@ -604,14 +604,12 @@ Return Value: break; } -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS) - MlasStoreLaneFloat32x4<0>(Output, Reduction); - MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); -#elif defined(MLAS_SSE2_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0)); MlasStoreLowHalfFloat32x4(Output, Reduction); #else -#error Unsupported architecture. 
+ MlasStoreLaneFloat32x4<0>(Output, Reduction); + MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); #endif Output += 2; @@ -688,24 +686,24 @@ Return Value: const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth; const int64_t idEnd64 = idStart64 + KernelDepth; - const size_t idStart = size_t((std::max)(idStart64, int64_t(0))); - const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth))); + const size_t idStart = size_t(std::max(idStart64, int64_t(0))); + const size_t idEnd = size_t(std::min(idEnd64, int64_t(InputDepth))); for (size_t ph = 0; ph < OutputHeight; ph++) { const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight; const int64_t ihEnd64 = ihStart64 + KernelHeight; - const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0))); - const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight))); + const size_t ihStart = size_t(std::max(ihStart64, int64_t(0))); + const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight))); for (size_t pw = 0; pw < OutputWidth; pw++) { const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth; const int64_t iwEnd64 = iwStart64 + KernelWidth; - const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0))); - const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth))); + const size_t iwStart = size_t(std::max(iwStart64, int64_t(0))); + const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth))); float m = PoolingType::InitialValue(); @@ -976,14 +974,12 @@ Return Value: break; } -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS) - MlasStoreLaneFloat32x4<0>(Output, Reduction); - MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); -#elif defined(MLAS_SSE2_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0)); MlasStoreLowHalfFloat32x4(Output, Reduction); #else -#error Unsupported architecture. + MlasStoreLaneFloat32x4<0>(Output, Reduction); + MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); #endif Output += 2; diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index 9bede5864b..d90ff78739 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -181,48 +181,39 @@ Return Value: } } -template void MlasGemmU8X8CopyPackBProcessSse( int16_t* D, __m128i BytesRow0, __m128i BytesRow1, - __m128i ZeroVector, + __m128i BitFlipVector, __m128i ColumnSums[2] ) { __m128i BytesInterleaved = _mm_unpacklo_epi8(BytesRow0, BytesRow1); - __m128i WordsInterleaved[2]; - // - // Zero or sign extend the bytes to words. 
- // + BytesInterleaved = _mm_xor_si128(BytesInterleaved, BitFlipVector); - if (std::is_same::value) { - WordsInterleaved[0] = _mm_unpacklo_epi8(BytesInterleaved, ZeroVector); - WordsInterleaved[1] = _mm_unpackhi_epi8(BytesInterleaved, ZeroVector); - } else { - WordsInterleaved[0] = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8); - WordsInterleaved[1] = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8); - } + __m128i WordsInterleaved0 = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8); + __m128i WordsInterleaved1 = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8); - ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved[0]); - ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved[1]); + ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved0); + ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved1); - _mm_storeu_si128((__m128i*)&D[0], WordsInterleaved[0]); - _mm_storeu_si128((__m128i*)&D[8], WordsInterleaved[1]); + _mm_storeu_si128((__m128i*)&D[0], WordsInterleaved0); + _mm_storeu_si128((__m128i*)&D[8], WordsInterleaved1); } -template void MlasGemmU8X8CopyPackBSse( int16_t* D, - const BType* B, + const uint8_t* B, size_t ldb, size_t CountN, size_t CountK, int32_t* ColumnSumVector, - int16_t offa + int16_t offa, + bool BTypeIsSigned ) /*++ @@ -256,9 +247,8 @@ Return Value: --*/ { - const __m128i ZeroVector = _mm_setzero_si128(); const __m128i OffsetBroadcast = _mm_set1_epi16(offa); - BType PaddedMatrixBData[16] = { 0 }; + const __m128i BitFlipVector = _mm_set1_epi32(BTypeIsSigned ? 0 : 0x80808080); // // Process 8 columns of matrix B in a loop. @@ -266,12 +256,12 @@ Return Value: while (CountN >= 8) { - const BType* b = B; + const uint8_t* b = B; size_t k = CountK; __m128i ColumnSums[2]; - ColumnSums[0] = ZeroVector; - ColumnSums[1] = ZeroVector; + ColumnSums[0] = _mm_setzero_si128(); + ColumnSums[1] = _mm_setzero_si128(); // // Interleave rows of matrix B and write to the packed buffer. 
@@ -286,7 +276,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]); __m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&b[ldb]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums); b += ldb * 2; D += 16; @@ -297,7 +287,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums); D += 16; } @@ -325,12 +315,15 @@ Return Value: if (CountN > 0) { - const BType* b = B; + const uint8_t* b = B; size_t k = CountK; __m128i ColumnSums[2]; + uint8_t PaddedMatrixBData[16]; - ColumnSums[0] = ZeroVector; - ColumnSums[1] = ZeroVector; + _mm_storeu_si128((__m128i*)PaddedMatrixBData, BitFlipVector); + + ColumnSums[0] = _mm_setzero_si128(); + ColumnSums[1] = _mm_setzero_si128(); // // Interleave rows of matrix B using an intermediate zero padded stack @@ -339,9 +332,9 @@ Return Value: while (k >= 2) { - const BType* bcopy = b; - BType* padded = PaddedMatrixBData; - BType* padded_end = padded + CountN; + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; do { padded[0] = bcopy[0]; @@ -353,7 +346,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]); __m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[8]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums); b += ldb * 2; D += 16; @@ -362,9 +355,9 @@ Return Value: if (k > 0) { - const BType* bcopy = b; - BType* padded = PaddedMatrixBData; - BType* padded_end = padded + CountN; + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; do { padded[0] = bcopy[0]; @@ -374,7 +367,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums); } // @@ -648,6 +641,10 @@ Return Value: size_t StrideN = MLAS_GEMM_U8X8_STRIDEN_SSE; size_t StrideK = MLAS_GEMM_U8X8_STRIDEK_SSE; + if (!WorkBlock->BTypeIsSigned) { + offb = int8_t(offb ^ 0x80); + } + // // Step through each slice of matrix B along the K dimension. // @@ -656,7 +653,7 @@ Return Value: for (size_t k = 0; k < K; k += CountK) { - CountK = (std::min)(K - k, StrideK); + CountK = std::min(K - k, StrideK); // // Step through each slice of matrix B along the N dimension. @@ -666,7 +663,7 @@ Return Value: for (size_t n = 0; n < N; n += CountN) { - CountN = (std::min)(N - n, StrideN); + CountN = std::min(N - n, StrideN); // // Copy a panel of matrix B to a local packed buffer. @@ -674,13 +671,8 @@ Return Value: const uint8_t* b = B + n + k * ldb; - if (WorkBlock->BTypeIsSigned) { - MlasGemmU8X8CopyPackBSse(PanelB, (const int8_t*)b, ldb, CountN, - CountK, ColumnSumVector, -int16_t(offa)); - } else { - MlasGemmU8X8CopyPackBSse(PanelB, (const uint8_t*)b, ldb, CountN, - CountK, ColumnSumVector, -int16_t(offa)); - } + MlasGemmU8X8CopyPackBSse(PanelB, b, ldb, CountN, CountK, + ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned); // // Step through each slice of matrix A along the M dimension. 
@@ -694,7 +686,7 @@ Return Value: for (size_t m = 0; m < M; m += CountM) { - CountM = (std::min)(M - m, StrideM); + CountM = std::min(M - m, StrideM); // // Copy a panel of matrix A to a local packed buffer. @@ -892,7 +884,7 @@ Return Value: for (size_t k = 0; k < K; k += CountK) { - CountK = (std::min)(K - k, StrideK); + CountK = std::min(K - k, StrideK); // // Step through each slice of matrix B along the N dimension. @@ -902,7 +894,7 @@ Return Value: for (size_t n = 0; n < N; n += CountN) { - CountN = (std::min)(N - n, StrideN); + CountN = std::min(N - n, StrideN); // // Copy a panel of matrix B to a local packed buffer. @@ -910,8 +902,8 @@ Return Value: const int8_t* b = (const int8_t*)B + n + k * ldb; - MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, - CountK, ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned); + MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, + ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned); // // Step through each slice of matrix A along the M dimension. @@ -925,7 +917,7 @@ Return Value: for (size_t m = 0; m < M; m += CountM) { - CountM = (std::min)(M - m, StrideM); + CountM = std::min(M - m, StrideM); // // Copy a panel of matrix A to a local packed buffer. @@ -1038,7 +1030,7 @@ Return Value: for (size_t k = 0; k < K; k += CountK) { - CountK = (std::min)(K - k, StrideK); + CountK = std::min(K - k, StrideK); // // Step through each slice of matrix B along the N dimension. @@ -1048,16 +1040,16 @@ Return Value: for (size_t n = 0; n < N; n += CountN) { - CountN = (std::min)(N - n, StrideN); + CountN = std::min(N - n, StrideN); // // Copy a panel of matrix B to a local packed buffer. // - const uint8_t* b = (const uint8_t*)B + n + k * ldb; + const uint8_t* b = B + n + k * ldb; - MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, ColumnSumVector, - -int16_t(offa)); + MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, + ColumnSumVector, -int16_t(offa)); // // Step through each slice of matrix A along the M dimension. @@ -1071,7 +1063,7 @@ Return Value: for (size_t m = 0; m < M; m += CountM) { - CountM = (std::min)(M - m, StrideM); + CountM = std::min(M - m, StrideM); // // Copy a panel of matrix A to a local packed buffer. 
diff --git a/onnxruntime/core/mlas/lib/reorder.cpp b/onnxruntime/core/mlas/lib/reorder.cpp index 7687a207c7..821bb0c9a2 100644 --- a/onnxruntime/core/mlas/lib/reorder.cpp +++ b/onnxruntime/core/mlas/lib/reorder.cpp @@ -213,7 +213,7 @@ Return Value: for (size_t i = InputChannels; i > 0;) { - const size_t InputChannelsThisIteration = (std::min)(i, BlockSize); + const size_t InputChannelsThisIteration = std::min(i, BlockSize); i -= InputChannelsThisIteration; const float* s = S; @@ -308,7 +308,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; @@ -415,7 +415,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; @@ -523,7 +523,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; @@ -538,7 +538,7 @@ Return Value: for (size_t i = InputChannels; i > 0;) { - const size_t InputChannelsThisIteration = (std::min)(i, BlockSize); + const size_t InputChannelsThisIteration = std::min(i, BlockSize); i -= InputChannelsThisIteration; // @@ -671,7 +671,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 52816e22f7..5efdd36b1c 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -422,29 +422,15 @@ Return Value: t1 = o0.val[1]; t2 = o1.val[0]; t3 = o1.val[1]; -#elif defined(MLAS_SSE2_INTRINSICS) - // N.B. The MSVC version of _MM_TRANSPOSE4_PS uses shufps which is - // slightly larger than the below sequence, so manually expand the - // matrix transpose. - __m128 z0 = _mm_unpacklo_ps(t0, t1); - __m128 z1 = _mm_unpackhi_ps(t0, t1); - __m128 z2 = _mm_unpacklo_ps(t2, t3); - __m128 z3 = _mm_unpackhi_ps(t2, t3); - t0 = _mm_movelh_ps(z0, z2); - t1 = _mm_movehl_ps(z2, z0); - t2 = _mm_movelh_ps(z1, z3); - t3 = _mm_movehl_ps(z3, z1); -#elif defined(MLAS_VSX_INTRINSICS) - __vector float z0 = vec_mergeh(t0, t2); - __vector float z1 = vec_mergel(t0, t2); - __vector float z2 = vec_mergeh(t1, t3); - __vector float z3 = vec_mergel(t1, t3); - t0 = vec_mergeh(z0, z2); - t1 = vec_mergel(z0, z2); - t2 = vec_mergeh(z1, z3); - t3 = vec_mergel(z1, z3); #else -#error Unsupported architecture. 
+ MLAS_FLOAT32X4 z0 = MlasInterleaveLowFloat32x4(t0, t2); + MLAS_FLOAT32X4 z1 = MlasInterleaveHighFloat32x4(t0, t2); + MLAS_FLOAT32X4 z2 = MlasInterleaveLowFloat32x4(t1, t3); + MLAS_FLOAT32X4 z3 = MlasInterleaveHighFloat32x4(t1, t3); + t0 = MlasInterleaveLowFloat32x4(z0, z2); + t1 = MlasInterleaveHighFloat32x4(z0, z2); + t2 = MlasInterleaveLowFloat32x4(z1, z3); + t3 = MlasInterleaveHighFloat32x4(z1, z3); #endif MlasStoreAlignedFloat32x4(&D[0], t0); @@ -639,7 +625,14 @@ Return Value: MLAS_FLOAT32X4 t0 = MlasLoadFloat32x4(&b[0]); MLAS_FLOAT32X4 t1 = MlasLoadFloat32x4(&b[ldb]); -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) + __m128 v0 = _mm_unpacklo_ps(t0, t1); + __m128 v1 = _mm_unpackhi_ps(t0, t1); + _mm_storel_pi((__m64*)&d[0], v0); + _mm_storeh_pi((__m64*)&d[16], v0); + _mm_storel_pi((__m64*)&d[32], v1); + _mm_storeh_pi((__m64*)&d[48], v1); +#else MlasStoreLaneFloat32x4<0>(&d[0], t0); MlasStoreLaneFloat32x4<0>(&d[1], t1); MlasStoreLaneFloat32x4<1>(&d[16], t0); @@ -648,15 +641,6 @@ Return Value: MlasStoreLaneFloat32x4<2>(&d[33], t1); MlasStoreLaneFloat32x4<3>(&d[48], t0); MlasStoreLaneFloat32x4<3>(&d[49], t1); -#elif defined(MLAS_SSE2_INTRINSICS) - __m128 v0 = _mm_unpacklo_ps(t0, t1); - __m128 v1 = _mm_unpackhi_ps(t0, t1); - _mm_storel_pi((__m64*)&d[0], v0); - _mm_storeh_pi((__m64*)&d[16], v0); - _mm_storel_pi((__m64*)&d[32], v1); - _mm_storeh_pi((__m64*)&d[48], v1); -#else -#error Unsupported architecture. #endif d += 2; diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp index 325cdbd33d..3afcd7b451 100644 --- a/onnxruntime/core/mlas/lib/snchwc.cpp +++ b/onnxruntime/core/mlas/lib/snchwc.cpp @@ -539,7 +539,7 @@ struct MLAS_NCHWC_GROUPED_CONV_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM void ComputeFilterCount(void) { - FilterCount = (std::min)(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize); + FilterCount = std::min(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize); } void PrepareWork(int32_t Index) @@ -686,7 +686,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM // Compute the number of output lines to process in this iteration. // - size_t WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph); + size_t WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph); // // Walk over each input image organized as a set of NCHWc blocks. 
@@ -898,7 +898,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM size_t WorkThisIteration; if (StrideHeight == 1 && StrideWidth == 1) { - WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph); + WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph); } else { WorkThisIteration = 1; } @@ -923,7 +923,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM constexpr size_t MaximumInputChannelBatch = 128; - InputChannelBatch = (std::min)(InputChannels - ic, MaximumInputChannelBatch); + InputChannelBatch = std::min(InputChannels - ic, MaximumInputChannelBatch); unsigned KernelFlags = ComputeKernelFlags(ic, InputChannelBatch); diff --git a/onnxruntime/core/mlas/lib/tanh.cpp b/onnxruntime/core/mlas/lib/tanh.cpp index 2fbeaef3d9..8533d85f8e 100644 --- a/onnxruntime/core/mlas/lib/tanh.cpp +++ b/onnxruntime/core/mlas/lib/tanh.cpp @@ -119,7 +119,7 @@ Return Value: float Value = *Input++; - Value = (std::min)(MlasTanhConstants.UpperRange, (std::max)(MlasTanhConstants.LowerRange, Value)); + Value = std::min(MlasTanhConstants.UpperRange, std::max(MlasTanhConstants.LowerRange, Value)); float ValueSquared = Value * Value;
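
The pervasive `(std::max)` → `std::max` change throughout this diff relies on the NOMINMAX definition the mlasi.h hunk adds before the Windows headers. A minimal standalone sketch of that define-before-include ordering (no Windows header is actually included here; this only mirrors the pattern the hunk depends on):

#ifndef NOMINMAX
#define NOMINMAX    // keeps <windows.h> from defining function-like min()/max() macros
#endif
#include <algorithm>
#include <cassert>

int main()
{
    // With the macros suppressed, the unparenthesized calls compile on all
    // platforms, which is why the (std::max) workaround can be dropped.
    assert(std::max(3.0f, 0.0f) == 3.0f);
    assert(std::min(3.0f, 7.0f) == 3.0f);
    return 0;
}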
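
The new MlasBlendFloat32x4 / MlasBlendInt32x4 helpers and the SSE2 maximum/minimum fallbacks above all use the same bitwise-select idiom: (Vector2 & Selection) | (Vector1 & ~Selection) with an all-ones or all-zeros comparison mask. A scalar sketch of that idiom (the helper name here is local to the sketch, not an MLAS function):

#include <cassert>
#include <cstdint>
#include <cstring>

// Select between two floats with a 32-bit mask, lane-for-lane what the vector
// blend does: a "true" comparison lane is all ones and picks v2, a "false"
// lane is all zeros and keeps v1.
static float BlendScalar(float v1, float v2, uint32_t mask)
{
    uint32_t b1, b2;
    std::memcpy(&b1, &v1, sizeof(float));
    std::memcpy(&b2, &v2, sizeof(float));

    const uint32_t blended = (b2 & mask) | (b1 & ~mask);

    float result;
    std::memcpy(&result, &blended, sizeof(float));
    return result;
}

int main()
{
    assert(BlendScalar(-1.0f, 3.0f, 0xFFFFFFFFu) == 3.0f);   // mask true  -> v2
    assert(BlendScalar(-1.0f, 3.0f, 0x00000000u) == -1.0f);  // mask false -> v1
    return 0;
}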
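
The consolidated MlasPowerOf2Float32x4 builds 2^int(N) by adding the IEEE-754 bias (127) to the truncated integer and shifting it into the exponent field (bit 23). A scalar model of that construction, assuming the exponent stays in the normal range:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar equivalent of MlasCastToInt32x4 + add 127 + MlasShiftLeftInt32x4<23>:
// the resulting bit pattern has a zero mantissa and an exponent of n, i.e. 2^n.
static float PowerOf2(float value)
{
    const int32_t exponent = static_cast<int32_t>(value) + 127;  // truncate, then bias
    const uint32_t bits = static_cast<uint32_t>(exponent) << 23; // place in exponent field

    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}

int main()
{
    for (int n = -10; n <= 10; n++) {
        assert(PowerOf2(float(n)) == std::ldexp(1.0f, n));
    }
    return 0;
}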
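
In the qgemm.cpp packing change, XOR-ing unsigned B data with 0x80 maps it onto the signed int8 range so a single signed packing routine covers both BType cases, and the matching `offb ^ 0x80` adjustment keeps the zero-point-centered values identical. A small self-check of that identity (function name is illustrative only):

#include <cassert>
#include <cstdint>

// Reinterpreting (x ^ 0x80) as int8_t is the same as subtracting 128 from a
// uint8_t value, so flipping both the data and the zero point leaves
// (data - zero_point) unchanged.
static int8_t FlipToSigned(uint8_t x)
{
    return static_cast<int8_t>(x ^ 0x80);
}

int main()
{
    for (int value = 0; value < 256; value++) {
        for (int zero_point = 0; zero_point < 256; zero_point++) {
            const int centered_u8 = value - zero_point;
            const int centered_s8 = int(FlipToSigned(uint8_t(value))) -
                                    int(FlipToSigned(uint8_t(zero_point)));
            assert(centered_u8 == centered_s8);
        }
    }
    return 0;
}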
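
The sgemm.cpp transpose now goes through the new MlasInterleaveLowFloat32x4 / MlasInterleaveHighFloat32x4 helpers. A scalar model showing that two rounds of low/high interleaves produce the 4x4 transpose (the Vec4 type and helpers exist only for this sketch):

#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;

// "Low" interleave takes lanes {a0, b0, a1, b1}; "high" takes {a2, b2, a3, b3},
// matching _mm_unpacklo_ps / _mm_unpackhi_ps and the generic shuffle fallback.
static Vec4 InterleaveLow(const Vec4& a, const Vec4& b)  { return {a[0], b[0], a[1], b[1]}; }
static Vec4 InterleaveHigh(const Vec4& a, const Vec4& b) { return {a[2], b[2], a[3], b[3]}; }

int main()
{
    const Vec4 t0 = {0, 1, 2, 3};
    const Vec4 t1 = {4, 5, 6, 7};
    const Vec4 t2 = {8, 9, 10, 11};
    const Vec4 t3 = {12, 13, 14, 15};

    // First round pairs row 0 with row 2 and row 1 with row 3.
    const Vec4 z0 = InterleaveLow(t0, t2);   // {0, 8, 1, 9}
    const Vec4 z1 = InterleaveHigh(t0, t2);  // {2, 10, 3, 11}
    const Vec4 z2 = InterleaveLow(t1, t3);   // {4, 12, 5, 13}
    const Vec4 z3 = InterleaveHigh(t1, t3);  // {6, 14, 7, 15}

    // Second round yields the columns of the original 4x4 block.
    assert((InterleaveLow(z0, z2)  == Vec4{0, 4, 8, 12}));
    assert((InterleaveHigh(z0, z2) == Vec4{1, 5, 9, 13}));
    assert((InterleaveLow(z1, z3)  == Vec4{2, 6, 10, 14}));
    assert((InterleaveHigh(z1, z3) == Vec4{3, 7, 11, 15}));
    return 0;
}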