diff --git a/onnxruntime/core/mlas/lib/activate.cpp b/onnxruntime/core/mlas/lib/activate.cpp index b0a2e331fc..8317ec1033 100644 --- a/onnxruntime/core/mlas/lib/activate.cpp +++ b/onnxruntime/core/mlas/lib/activate.cpp @@ -109,7 +109,7 @@ struct MLAS_ACTIVATION_FUNCTION #if defined(MLAS_SSE2_INTRINSICS) return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); #else - return (std::max)(Value, 0.0f); + return std::max(Value, 0.0f); #endif } }; @@ -140,12 +140,11 @@ struct MLAS_ACTIVATION_FUNCTION #elif defined(MLAS_AVX_INTRINSICS) return _mm_blendv_ps(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value)); #elif defined(MLAS_SSE2_INTRINSICS) - __m128 Selection = _mm_cmple_ps(ZeroFloat32x4, Value); - return _mm_or_ps(_mm_and_ps(Value, Selection), _mm_andnot_ps(Selection, ValueTimesAlpha)); + return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value)); #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value)); #else -#error Unsupported architecture. + return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value); #endif } @@ -186,8 +185,8 @@ struct MLAS_ACTIVATION_FUNCTION #if defined(MLAS_SSE2_INTRINSICS) return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); #else - Value = (std::max)(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast)); - Value = (std::min)(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast)); + Value = std::max(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast)); + Value = std::min(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast)); return Value; #endif diff --git a/onnxruntime/core/mlas/lib/compute.cpp b/onnxruntime/core/mlas/lib/compute.cpp index b81c4b07b8..a070127928 100644 --- a/onnxruntime/core/mlas/lib/compute.cpp +++ b/onnxruntime/core/mlas/lib/compute.cpp @@ -552,7 +552,7 @@ Return Value: while (N > 0) { - Maximum = (std::max)(Maximum, *Input); + Maximum = std::max(Maximum, *Input); Input += 1; N -= 1; diff --git a/onnxruntime/core/mlas/lib/erf.cpp b/onnxruntime/core/mlas/lib/erf.cpp index 34390f9582..ebd5a3dd54 100644 --- a/onnxruntime/core/mlas/lib/erf.cpp +++ b/onnxruntime/core/mlas/lib/erf.cpp @@ -191,7 +191,7 @@ Return Value: float r; if (AbsValue > MlasErfConstants.ErfSplitBoundary) { - AbsValue = (std::min)(MlasErfConstants.ErfUpperAbsRange, AbsValue); + AbsValue = std::min(MlasErfConstants.ErfUpperAbsRange, AbsValue); float r_big = MlasErfConstants.ErfBIG_P0; r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P1; r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P2; @@ -201,7 +201,7 @@ Return Value: r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P6_Minus_One; r_big = r_big * AbsValue + AbsValue; - r_big = (std::max)(-r_big, MlasErfConstants.Exp_LowerRange); + r_big = std::max(-r_big, MlasErfConstants.Exp_LowerRange); r = MlasErfConstants.Exp_Log2Reciprocal * r_big + MlasErfConstants.Exp_C; r -= MlasErfConstants.Exp_C; float fx = r * MlasErfConstants.Exp_log2_hi + r_big; diff --git a/onnxruntime/core/mlas/lib/logistic.cpp b/onnxruntime/core/mlas/lib/logistic.cpp index 9e657f1892..b9fe78c37f 100644 --- a/onnxruntime/core/mlas/lib/logistic.cpp +++ b/onnxruntime/core/mlas/lib/logistic.cpp @@ -121,7 +121,7 @@ Return Value: float Value = *Input++; - Value = (std::min)(MlasLogisticConstants.UpperRange, (std::max)(MlasLogisticConstants.LowerRange, Value)); + Value = std::min(MlasLogisticConstants.UpperRange, std::max(MlasLogisticConstants.LowerRange, Value)); float ValueSquared = Value * Value; diff --git a/onnxruntime/core/mlas/lib/mlasi.h 
b/onnxruntime/core/mlas/lib/mlasi.h index 6be271309b..6b610de268 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -25,6 +25,12 @@ Abstract: #include #if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#ifndef NOMINMAX +#define NOMINMAX +#endif #include #include #else @@ -753,8 +759,6 @@ MlasPartitionWork( #if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__)) #define MLAS_FMA3_INTRINSICS #endif -#else -#error Unsupported architecture. #endif #if defined(MLAS_NEON_INTRINSICS) @@ -785,6 +789,21 @@ MlasReinterpretAsInt32x4(MLAS_FLOAT32X4 Vector) #endif } +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasCastToInt32x4(MLAS_FLOAT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vcvtq_s32_f32(Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_cvttps_epi32(Vector); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_cts(Vector, 0); +#else + return MLAS_INT32X4{int32_t(Vector[0]), int32_t(Vector[1]), int32_t(Vector[2]), int32_t(Vector[3])}; +#endif +} + MLAS_FORCEINLINE MLAS_INT32X4 MlasBroadcastInt32x4(int32_t Value) @@ -798,6 +817,182 @@ MlasBroadcastInt32x4(int32_t Value) #endif } +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasLoadInt32x4(const int32_t* Buffer) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vld1q_s32(Buffer); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_loadu_si128((const __m128i*)Buffer); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_vsx_ld(0, Buffer); +#else + return *((MLAS_INT32X4*)Buffer); +#endif +} + +MLAS_FORCEINLINE +void +MlasStoreInt32x4(int32_t* Buffer, MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + vst1q_s32(Buffer, Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + _mm_storeu_si128((__m128i*)Buffer, Vector); +#elif defined(MLAS_VSX_INTRINSICS) + vec_vsx_st(Vector, 0, Buffer); +#else + *((MLAS_INT32X4*)Buffer) = Vector; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vaddq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_add_epi32(Vector1, Vector2); +#else + return Vector1 + Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vsubq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_sub_epi32(Vector1, Vector2); +#else + return Vector1 - Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasAndInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vandq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_and_si128(Vector1, Vector2); +#else + return Vector1 & Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasOrInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vorrq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_or_si128(Vector1, Vector2); +#else + return Vector1 | Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasAndNotInt32x4(MLAS_INT32X4 VectorNot, MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vandq_s32(vmvnq_s32(VectorNot), Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_andnot_si128(VectorNot, Vector); +#else + return (~VectorNot) & Vector; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasXorInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return veorq_s32(Vector1, Vector2); 
+#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_xor_si128(Vector1, Vector2); +#else + return Vector1 ^ Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasBlendInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2, MLAS_INT32X4 Selection) +{ + return MlasOrInt32x4(MlasAndInt32x4(Vector2, Selection), MlasAndNotInt32x4(Selection, Vector1)); +} + +template +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasShiftLeftInt32x4(MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vshlq_n_s32(Vector, ShiftCount); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_slli_epi32(Vector, ShiftCount); +#else + return Vector << ShiftCount; +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vmaxq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE41_INTRINSICS) + return _mm_max_epi32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2)); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_vmaxsw(Vector1, Vector2); +#else + return MlasBlendInt32x4(Vector2, Vector1, Vector1 > Vector2); +#endif +} + +MLAS_FORCEINLINE +MLAS_INT32X4 +MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vminq_s32(Vector1, Vector2); +#elif defined(MLAS_SSE41_INTRINSICS) + return _mm_min_epi32(Vector1, Vector2); +#elif defined(MLAS_SSE2_INTRINSICS) + return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1)); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_vminsw(Vector1, Vector2); +#else + return MlasBlendInt32x4(Vector2, Vector1, Vector2 > Vector1); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vreinterpretq_f32_s32(Vector); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_castsi128_ps(Vector); +#else + return MLAS_FLOAT32X4(Vector); +#endif +} + MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasBroadcastFloat32x4(float Value) @@ -847,6 +1042,8 @@ MlasLoadFloat32x4(const float* Buffer) return _mm_loadu_ps(Buffer); #elif defined(MLAS_VSX_INTRINSICS) return vec_vsx_ld(0, Buffer); +#else + return *((MLAS_FLOAT32X4*)Buffer); #endif } @@ -860,6 +1057,8 @@ MlasStoreFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector) _mm_storeu_ps(Buffer, Vector); #elif defined(MLAS_VSX_INTRINSICS) vec_vsx_st(Vector, 0, Buffer); +#else + *((MLAS_FLOAT32X4*)Buffer) = Vector; #endif } @@ -945,8 +1144,76 @@ MlasExtractLaneFloat32x4<0>(MLAS_FLOAT32X4 Vector) return _mm_cvtss_f32(Vector); } +template +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector) +{ + return _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Index3, Index2, Index1, Index0)); +} + #endif +#if !defined(MLAS_SSE2_INTRINSICS) && !defined(_MSC_VER) + +template +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(__clang__) + return __builtin_shufflevector(Vector1, Vector2, Index0, Index1, Index2, Index3); +#else + return __builtin_shuffle(Vector1, Vector2, MLAS_INT32X4{Index0, Index1, Index2, Index3}); +#endif +} + +template +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector) +{ + return MlasShuffleFloat32x4(Vector, Vector); +} + +#endif + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasInterleaveLowFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_NEON64_INTRINSICS) + return vzip1q_f32(Vector1, Vector2); +#elif 
defined(MLAS_NEON32_INTRINSICS) + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); + return zipped.val[0]; +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_unpacklo_ps(Vector1, Vector2); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_mergeh(Vector1, Vector2); +#else + return MlasShuffleFloat32x4<0, 4, 1, 5>(Vector1, Vector2); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasInterleaveHighFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_NEON64_INTRINSICS) + return vzip2q_f32(Vector1, Vector2); +#elif defined(MLAS_NEON32_INTRINSICS) + float32x4x2_t zipped = vzipq_f32(Vector1, Vector2); + return zipped.val[1]; +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_unpackhi_ps(Vector1, Vector2); +#elif defined(MLAS_VSX_INTRINSICS) + return vec_mergel(Vector1, Vector2); +#else + return MlasShuffleFloat32x4<2, 6, 3, 7>(Vector1, Vector2); +#endif +} + MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) @@ -1036,6 +1303,70 @@ MlasDivideFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) #endif } +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_NEON_INTRINSICS) + return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2)); +#elif defined(MLAS_SSE2_INTRINSICS) + return _mm_cmpgt_ps(Vector1, Vector2); +#else + return Vector1 > Vector2; +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_and_ps(Vector1, Vector2); +#else + return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_or_ps(Vector1, Vector2); +#else + return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_andnot_ps(VectorNot, Vector); +#else + return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) +{ +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_xor_ps(Vector1, Vector2); +#else + return MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2))); +#endif +} + +MLAS_FORCEINLINE +MLAS_FLOAT32X4 +MlasBlendFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2, MLAS_FLOAT32X4 Selection) +{ + return MlasOrFloat32x4(MlasAndFloat32x4(Vector2, Selection), MlasAndNotFloat32x4(Selection, Vector1)); +} + MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) @@ -1047,7 +1378,7 @@ MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(Vector2, Vector1, vec_cmpgt(Vector1, Vector2)); #else -#error Unsupported architecture. 
+ return MlasBlendFloat32x4(Vector2, Vector1, Vector1 > Vector2); #endif } @@ -1062,7 +1393,7 @@ MlasMinimumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) #elif defined(MLAS_VSX_INTRINSICS) return vec_sel(Vector2, Vector1, vec_cmpgt(Vector2, Vector1)); #else -#error Unsupported architecture. + return MlasBlendFloat32x4(Vector2, Vector1, Vector2 > Vector1); #endif } @@ -1079,86 +1410,6 @@ MlasClampFloat32x4(MLAS_FLOAT32X4 Value, float LowerRange, float UpperRange) return Value; } -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2)); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_cmpgt_ps(Vector1, Vector2); -#elif defined(MLAS_VSX_INTRINSICS) - return MLAS_FLOAT32X4(vec_cmpgt(Vector1, Vector2)); -#else -#error Unsupported architecture. -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_and_ps(Vector1, Vector2); -#else - return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) & MLAS_INT32X4(Vector2)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_or_ps(Vector1, Vector2); -#else - return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) | MLAS_INT32X4(Vector2)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(vreinterpretq_u32_f32(VectorNot)), vreinterpretq_u32_f32(Vector))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_andnot_ps(VectorNot, Vector); -#else - return MLAS_FLOAT32X4(~MLAS_INT32X4(VectorNot) & MLAS_INT32X4(Vector)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2))); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_xor_ps(Vector1, Vector2); -#else - return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) ^ MLAS_INT32X4(Vector2)); -#endif -} - -MLAS_FORCEINLINE -MLAS_FLOAT32X4 -MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vreinterpretq_f32_s32(Vector); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_castsi128_ps(Vector); -#else - return MLAS_FLOAT32X4(Vector); -#endif -} - MLAS_FORCEINLINE float MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector) @@ -1173,16 +1424,14 @@ MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector) VectorLow = vpadd_f32(VectorLow, VectorHigh); VectorLow = vpadd_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); -#elif defined(MLAS_SSE2_INTRINSICS) - Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2))); - Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtss_f32(Vector); #elif defined(MLAS_VSX_INTRINSICS) Vector = MlasAddFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1))); Vector = 
MlasAddFloat32x4(Vector, vec_splat(Vector, 1)); return Vector[0]; #else -#error Unsupported architecture. + Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector)); + Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector)); + return MlasExtractLaneFloat32x4<0>(Vector); #endif } @@ -1198,108 +1447,24 @@ MlasReduceMaximumFloat32x4(MLAS_FLOAT32X4 Vector) VectorLow = vpmax_f32(VectorLow, VectorHigh); VectorLow = vpmax_f32(VectorLow, VectorHigh); return vget_lane_f32(VectorLow, 0); -#elif defined(MLAS_SSE2_INTRINSICS) - Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2))); - Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1))); - return _mm_cvtss_f32(Vector); #elif defined(MLAS_VSX_INTRINSICS) Vector = MlasMaximumFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1))); Vector = MlasMaximumFloat32x4(Vector, vec_splat(Vector, 1)); return Vector[0]; #else -#error Unsupported architecture. + Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector)); + Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector)); + return MlasExtractLaneFloat32x4<0>(Vector); #endif } -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vaddq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_add_epi32(Vector1, Vector2); -#else - return Vector1 + Vector2; -#endif -} - -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vsubq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_sub_epi32(Vector1, Vector2); -#else - return Vector1 - Vector2; -#endif -} - -template -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasShiftLeftInt32x4(MLAS_INT32X4 Vector) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vshlq_n_s32(Vector, ShiftCount); -#elif defined(MLAS_SSE2_INTRINSICS) - return _mm_slli_epi32(Vector, ShiftCount); -#else - return Vector << ShiftCount; -#endif -} - -#if !defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS) - -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vmaxq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE41_INTRINSICS) - return _mm_max_epi32(Vector1, Vector2); -#elif defined(MLAS_VSX_INTRINSICS) - return vec_vmaxsw(Vector1, Vector2); -#else -#error Unsupported architecture. -#endif -} - -MLAS_FORCEINLINE -MLAS_INT32X4 -MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2) -{ -#if defined(MLAS_NEON_INTRINSICS) - return vminq_s32(Vector1, Vector2); -#elif defined(MLAS_SSE41_INTRINSICS) - return _mm_min_epi32(Vector1, Vector2); -#elif defined(MLAS_VSX_INTRINSICS) - return vec_vminsw(Vector1, Vector2); -#else -#error Unsupported architecture. 
-#endif -} - -#endif - // calc 2^int(N) MLAS_FORCEINLINE MLAS_FLOAT32X4 MlasPowerOf2Float32x4(MLAS_FLOAT32X4 Vector) { -#if defined(MLAS_NEON_INTRINSICS) - MLAS_INT32X4 emm0 = vaddq_s32(vcvtq_s32_f32(Vector), MlasBroadcastInt32x4(127)); - return vreinterpretq_f32_s32(vshlq_n_s32(emm0, 23)); -#elif defined(MLAS_SSE2_INTRINSICS) - MLAS_INT32X4 emm0 = _mm_add_epi32(_mm_cvttps_epi32(Vector), MlasBroadcastInt32x4(127)); - return _mm_castsi128_ps(_mm_slli_epi32(emm0, 23)); -#elif defined(MLAS_VSX_INTRINSICS) - MLAS_INT32X4 emm0 = vec_cts(Vector, 0) + MlasBroadcastInt32x4(127); - return MLAS_FLOAT32X4(vec_sl(emm0, MLAS_UINT32X4(MlasBroadcastInt32x4(23)))); -#endif + MLAS_INT32X4 emm0 = MlasAddInt32x4(MlasCastToInt32x4(Vector), MlasBroadcastInt32x4(127)); + return MlasReinterpretAsFloat32x4(MlasShiftLeftInt32x4<23>(emm0)); } // diff --git a/onnxruntime/core/mlas/lib/pooling.cpp b/onnxruntime/core/mlas/lib/pooling.cpp index 0399b56f76..805b6e243f 100644 --- a/onnxruntime/core/mlas/lib/pooling.cpp +++ b/onnxruntime/core/mlas/lib/pooling.cpp @@ -80,7 +80,7 @@ struct MLAS_MAXIMUM_POOLING static float Reduce(float Reduction, float Value) { - return (std::max)(Reduction, Value); + return std::max(Reduction, Value); } static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value) @@ -293,8 +293,8 @@ Return Value: const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth; const int64_t iwEnd64 = iwStart64 + KernelWidth; - const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0))); - const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth))); + const size_t iwStart = size_t(std::max(iwStart64, int64_t(0))); + const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth))); float m = PoolingType::InitialValue(); @@ -370,16 +370,16 @@ Return Value: const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight; const int64_t ihEnd64 = ihStart64 + KernelHeight; - const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0))); - const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight))); + const size_t ihStart = size_t(std::max(ihStart64, int64_t(0))); + const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight))); for (size_t pw = 0; pw < OutputWidth; pw++) { const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth; const int64_t iwEnd64 = iwStart64 + KernelWidth; - const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0))); - const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth))); + const size_t iwStart = size_t(std::max(iwStart64, int64_t(0))); + const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth))); float m = PoolingType::InitialValue(); @@ -604,14 +604,12 @@ Return Value: break; } -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS) - MlasStoreLaneFloat32x4<0>(Output, Reduction); - MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); -#elif defined(MLAS_SSE2_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0)); MlasStoreLowHalfFloat32x4(Output, Reduction); #else -#error Unsupported architecture. 
+ MlasStoreLaneFloat32x4<0>(Output, Reduction); + MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); #endif Output += 2; @@ -688,24 +686,24 @@ Return Value: const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth; const int64_t idEnd64 = idStart64 + KernelDepth; - const size_t idStart = size_t((std::max)(idStart64, int64_t(0))); - const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth))); + const size_t idStart = size_t(std::max(idStart64, int64_t(0))); + const size_t idEnd = size_t(std::min(idEnd64, int64_t(InputDepth))); for (size_t ph = 0; ph < OutputHeight; ph++) { const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight; const int64_t ihEnd64 = ihStart64 + KernelHeight; - const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0))); - const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight))); + const size_t ihStart = size_t(std::max(ihStart64, int64_t(0))); + const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight))); for (size_t pw = 0; pw < OutputWidth; pw++) { const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth; const int64_t iwEnd64 = iwStart64 + KernelWidth; - const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0))); - const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth))); + const size_t iwStart = size_t(std::max(iwStart64, int64_t(0))); + const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth))); float m = PoolingType::InitialValue(); @@ -976,14 +974,12 @@ Return Value: break; } -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS) - MlasStoreLaneFloat32x4<0>(Output, Reduction); - MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); -#elif defined(MLAS_SSE2_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0)); MlasStoreLowHalfFloat32x4(Output, Reduction); #else -#error Unsupported architecture. + MlasStoreLaneFloat32x4<0>(Output, Reduction); + MlasStoreLaneFloat32x4<2>(Output + 1, Reduction); #endif Output += 2; diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index 9bede5864b..d90ff78739 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -181,48 +181,39 @@ Return Value: } } -template void MlasGemmU8X8CopyPackBProcessSse( int16_t* D, __m128i BytesRow0, __m128i BytesRow1, - __m128i ZeroVector, + __m128i BitFlipVector, __m128i ColumnSums[2] ) { __m128i BytesInterleaved = _mm_unpacklo_epi8(BytesRow0, BytesRow1); - __m128i WordsInterleaved[2]; - // - // Zero or sign extend the bytes to words. 
- // + BytesInterleaved = _mm_xor_si128(BytesInterleaved, BitFlipVector); - if (std::is_same::value) { - WordsInterleaved[0] = _mm_unpacklo_epi8(BytesInterleaved, ZeroVector); - WordsInterleaved[1] = _mm_unpackhi_epi8(BytesInterleaved, ZeroVector); - } else { - WordsInterleaved[0] = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8); - WordsInterleaved[1] = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8); - } + __m128i WordsInterleaved0 = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8); + __m128i WordsInterleaved1 = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8); - ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved[0]); - ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved[1]); + ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved0); + ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved1); - _mm_storeu_si128((__m128i*)&D[0], WordsInterleaved[0]); - _mm_storeu_si128((__m128i*)&D[8], WordsInterleaved[1]); + _mm_storeu_si128((__m128i*)&D[0], WordsInterleaved0); + _mm_storeu_si128((__m128i*)&D[8], WordsInterleaved1); } -template void MlasGemmU8X8CopyPackBSse( int16_t* D, - const BType* B, + const uint8_t* B, size_t ldb, size_t CountN, size_t CountK, int32_t* ColumnSumVector, - int16_t offa + int16_t offa, + bool BTypeIsSigned ) /*++ @@ -256,9 +247,8 @@ Return Value: --*/ { - const __m128i ZeroVector = _mm_setzero_si128(); const __m128i OffsetBroadcast = _mm_set1_epi16(offa); - BType PaddedMatrixBData[16] = { 0 }; + const __m128i BitFlipVector = _mm_set1_epi32(BTypeIsSigned ? 0 : 0x80808080); // // Process 8 columns of matrix B in a loop. @@ -266,12 +256,12 @@ Return Value: while (CountN >= 8) { - const BType* b = B; + const uint8_t* b = B; size_t k = CountK; __m128i ColumnSums[2]; - ColumnSums[0] = ZeroVector; - ColumnSums[1] = ZeroVector; + ColumnSums[0] = _mm_setzero_si128(); + ColumnSums[1] = _mm_setzero_si128(); // // Interleave rows of matrix B and write to the packed buffer. 
@@ -286,7 +276,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]); __m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&b[ldb]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums); b += ldb * 2; D += 16; @@ -297,7 +287,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums); D += 16; } @@ -325,12 +315,15 @@ Return Value: if (CountN > 0) { - const BType* b = B; + const uint8_t* b = B; size_t k = CountK; __m128i ColumnSums[2]; + uint8_t PaddedMatrixBData[16]; - ColumnSums[0] = ZeroVector; - ColumnSums[1] = ZeroVector; + _mm_storeu_si128((__m128i*)PaddedMatrixBData, BitFlipVector); + + ColumnSums[0] = _mm_setzero_si128(); + ColumnSums[1] = _mm_setzero_si128(); // // Interleave rows of matrix B using an intermediate zero padded stack @@ -339,9 +332,9 @@ Return Value: while (k >= 2) { - const BType* bcopy = b; - BType* padded = PaddedMatrixBData; - BType* padded_end = padded + CountN; + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; do { padded[0] = bcopy[0]; @@ -353,7 +346,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]); __m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[8]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums); b += ldb * 2; D += 16; @@ -362,9 +355,9 @@ Return Value: if (k > 0) { - const BType* bcopy = b; - BType* padded = PaddedMatrixBData; - BType* padded_end = padded + CountN; + const uint8_t* bcopy = b; + uint8_t* padded = PaddedMatrixBData; + uint8_t* padded_end = padded + CountN; do { padded[0] = bcopy[0]; @@ -374,7 +367,7 @@ Return Value: __m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]); - MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums); + MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums); } // @@ -648,6 +641,10 @@ Return Value: size_t StrideN = MLAS_GEMM_U8X8_STRIDEN_SSE; size_t StrideK = MLAS_GEMM_U8X8_STRIDEK_SSE; + if (!WorkBlock->BTypeIsSigned) { + offb = int8_t(offb ^ 0x80); + } + // // Step through each slice of matrix B along the K dimension. // @@ -656,7 +653,7 @@ Return Value: for (size_t k = 0; k < K; k += CountK) { - CountK = (std::min)(K - k, StrideK); + CountK = std::min(K - k, StrideK); // // Step through each slice of matrix B along the N dimension. @@ -666,7 +663,7 @@ Return Value: for (size_t n = 0; n < N; n += CountN) { - CountN = (std::min)(N - n, StrideN); + CountN = std::min(N - n, StrideN); // // Copy a panel of matrix B to a local packed buffer. @@ -674,13 +671,8 @@ Return Value: const uint8_t* b = B + n + k * ldb; - if (WorkBlock->BTypeIsSigned) { - MlasGemmU8X8CopyPackBSse(PanelB, (const int8_t*)b, ldb, CountN, - CountK, ColumnSumVector, -int16_t(offa)); - } else { - MlasGemmU8X8CopyPackBSse(PanelB, (const uint8_t*)b, ldb, CountN, - CountK, ColumnSumVector, -int16_t(offa)); - } + MlasGemmU8X8CopyPackBSse(PanelB, b, ldb, CountN, CountK, + ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned); // // Step through each slice of matrix A along the M dimension. 
@@ -694,7 +686,7 @@ Return Value: for (size_t m = 0; m < M; m += CountM) { - CountM = (std::min)(M - m, StrideM); + CountM = std::min(M - m, StrideM); // // Copy a panel of matrix A to a local packed buffer. @@ -892,7 +884,7 @@ Return Value: for (size_t k = 0; k < K; k += CountK) { - CountK = (std::min)(K - k, StrideK); + CountK = std::min(K - k, StrideK); // // Step through each slice of matrix B along the N dimension. @@ -902,7 +894,7 @@ Return Value: for (size_t n = 0; n < N; n += CountN) { - CountN = (std::min)(N - n, StrideN); + CountN = std::min(N - n, StrideN); // // Copy a panel of matrix B to a local packed buffer. @@ -910,8 +902,8 @@ Return Value: const int8_t* b = (const int8_t*)B + n + k * ldb; - MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, - CountK, ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned); + MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, + ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned); // // Step through each slice of matrix A along the M dimension. @@ -925,7 +917,7 @@ Return Value: for (size_t m = 0; m < M; m += CountM) { - CountM = (std::min)(M - m, StrideM); + CountM = std::min(M - m, StrideM); // // Copy a panel of matrix A to a local packed buffer. @@ -1038,7 +1030,7 @@ Return Value: for (size_t k = 0; k < K; k += CountK) { - CountK = (std::min)(K - k, StrideK); + CountK = std::min(K - k, StrideK); // // Step through each slice of matrix B along the N dimension. @@ -1048,16 +1040,16 @@ Return Value: for (size_t n = 0; n < N; n += CountN) { - CountN = (std::min)(N - n, StrideN); + CountN = std::min(N - n, StrideN); // // Copy a panel of matrix B to a local packed buffer. // - const uint8_t* b = (const uint8_t*)B + n + k * ldb; + const uint8_t* b = B + n + k * ldb; - MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, ColumnSumVector, - -int16_t(offa)); + MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, + ColumnSumVector, -int16_t(offa)); // // Step through each slice of matrix A along the M dimension. @@ -1071,7 +1063,7 @@ Return Value: for (size_t m = 0; m < M; m += CountM) { - CountM = (std::min)(M - m, StrideM); + CountM = std::min(M - m, StrideM); // // Copy a panel of matrix A to a local packed buffer. 
diff --git a/onnxruntime/core/mlas/lib/reorder.cpp b/onnxruntime/core/mlas/lib/reorder.cpp index 7687a207c7..821bb0c9a2 100644 --- a/onnxruntime/core/mlas/lib/reorder.cpp +++ b/onnxruntime/core/mlas/lib/reorder.cpp @@ -213,7 +213,7 @@ Return Value: for (size_t i = InputChannels; i > 0;) { - const size_t InputChannelsThisIteration = (std::min)(i, BlockSize); + const size_t InputChannelsThisIteration = std::min(i, BlockSize); i -= InputChannelsThisIteration; const float* s = S; @@ -308,7 +308,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; @@ -415,7 +415,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; @@ -523,7 +523,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; @@ -538,7 +538,7 @@ Return Value: for (size_t i = InputChannels; i > 0;) { - const size_t InputChannelsThisIteration = (std::min)(i, BlockSize); + const size_t InputChannelsThisIteration = std::min(i, BlockSize); i -= InputChannelsThisIteration; // @@ -671,7 +671,7 @@ Return Value: for (size_t o = OutputChannels; o > 0;) { - const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize); + const size_t OutputChannelsThisIteration = std::min(o, BlockSize); const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3); o -= OutputChannelsThisIteration; diff --git a/onnxruntime/core/mlas/lib/sgemm.cpp b/onnxruntime/core/mlas/lib/sgemm.cpp index 52816e22f7..5efdd36b1c 100644 --- a/onnxruntime/core/mlas/lib/sgemm.cpp +++ b/onnxruntime/core/mlas/lib/sgemm.cpp @@ -422,29 +422,15 @@ Return Value: t1 = o0.val[1]; t2 = o1.val[0]; t3 = o1.val[1]; -#elif defined(MLAS_SSE2_INTRINSICS) - // N.B. The MSVC version of _MM_TRANSPOSE4_PS uses shufps which is - // slightly larger than the below sequence, so manually expand the - // matrix transpose. - __m128 z0 = _mm_unpacklo_ps(t0, t1); - __m128 z1 = _mm_unpackhi_ps(t0, t1); - __m128 z2 = _mm_unpacklo_ps(t2, t3); - __m128 z3 = _mm_unpackhi_ps(t2, t3); - t0 = _mm_movelh_ps(z0, z2); - t1 = _mm_movehl_ps(z2, z0); - t2 = _mm_movelh_ps(z1, z3); - t3 = _mm_movehl_ps(z3, z1); -#elif defined(MLAS_VSX_INTRINSICS) - __vector float z0 = vec_mergeh(t0, t2); - __vector float z1 = vec_mergel(t0, t2); - __vector float z2 = vec_mergeh(t1, t3); - __vector float z3 = vec_mergel(t1, t3); - t0 = vec_mergeh(z0, z2); - t1 = vec_mergel(z0, z2); - t2 = vec_mergeh(z1, z3); - t3 = vec_mergel(z1, z3); #else -#error Unsupported architecture. 
+ MLAS_FLOAT32X4 z0 = MlasInterleaveLowFloat32x4(t0, t2); + MLAS_FLOAT32X4 z1 = MlasInterleaveHighFloat32x4(t0, t2); + MLAS_FLOAT32X4 z2 = MlasInterleaveLowFloat32x4(t1, t3); + MLAS_FLOAT32X4 z3 = MlasInterleaveHighFloat32x4(t1, t3); + t0 = MlasInterleaveLowFloat32x4(z0, z2); + t1 = MlasInterleaveHighFloat32x4(z0, z2); + t2 = MlasInterleaveLowFloat32x4(z1, z3); + t3 = MlasInterleaveHighFloat32x4(z1, z3); #endif MlasStoreAlignedFloat32x4(&D[0], t0); @@ -639,7 +625,14 @@ Return Value: MLAS_FLOAT32X4 t0 = MlasLoadFloat32x4(&b[0]); MLAS_FLOAT32X4 t1 = MlasLoadFloat32x4(&b[ldb]); -#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS) +#if defined(MLAS_SSE2_INTRINSICS) + __m128 v0 = _mm_unpacklo_ps(t0, t1); + __m128 v1 = _mm_unpackhi_ps(t0, t1); + _mm_storel_pi((__m64*)&d[0], v0); + _mm_storeh_pi((__m64*)&d[16], v0); + _mm_storel_pi((__m64*)&d[32], v1); + _mm_storeh_pi((__m64*)&d[48], v1); +#else MlasStoreLaneFloat32x4<0>(&d[0], t0); MlasStoreLaneFloat32x4<0>(&d[1], t1); MlasStoreLaneFloat32x4<1>(&d[16], t0); @@ -648,15 +641,6 @@ Return Value: MlasStoreLaneFloat32x4<2>(&d[33], t1); MlasStoreLaneFloat32x4<3>(&d[48], t0); MlasStoreLaneFloat32x4<3>(&d[49], t1); -#elif defined(MLAS_SSE2_INTRINSICS) - __m128 v0 = _mm_unpacklo_ps(t0, t1); - __m128 v1 = _mm_unpackhi_ps(t0, t1); - _mm_storel_pi((__m64*)&d[0], v0); - _mm_storeh_pi((__m64*)&d[16], v0); - _mm_storel_pi((__m64*)&d[32], v1); - _mm_storeh_pi((__m64*)&d[48], v1); -#else -#error Unsupported architecture. #endif d += 2; diff --git a/onnxruntime/core/mlas/lib/snchwc.cpp b/onnxruntime/core/mlas/lib/snchwc.cpp index 325cdbd33d..3afcd7b451 100644 --- a/onnxruntime/core/mlas/lib/snchwc.cpp +++ b/onnxruntime/core/mlas/lib/snchwc.cpp @@ -539,7 +539,7 @@ struct MLAS_NCHWC_GROUPED_CONV_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM void ComputeFilterCount(void) { - FilterCount = (std::min)(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize); + FilterCount = std::min(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize); } void PrepareWork(int32_t Index) @@ -686,7 +686,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM // Compute the number of output lines to process in this iteration. // - size_t WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph); + size_t WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph); // // Walk over each input image organized as a set of NCHWc blocks. 
@@ -898,7 +898,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM size_t WorkThisIteration; if (StrideHeight == 1 && StrideWidth == 1) { - WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph); + WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph); } else { WorkThisIteration = 1; } @@ -923,7 +923,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM constexpr size_t MaximumInputChannelBatch = 128; - InputChannelBatch = (std::min)(InputChannels - ic, MaximumInputChannelBatch); + InputChannelBatch = std::min(InputChannels - ic, MaximumInputChannelBatch); unsigned KernelFlags = ComputeKernelFlags(ic, InputChannelBatch); diff --git a/onnxruntime/core/mlas/lib/tanh.cpp b/onnxruntime/core/mlas/lib/tanh.cpp index 2fbeaef3d9..8533d85f8e 100644 --- a/onnxruntime/core/mlas/lib/tanh.cpp +++ b/onnxruntime/core/mlas/lib/tanh.cpp @@ -119,7 +119,7 @@ Return Value: float Value = *Input++; - Value = (std::min)(MlasTanhConstants.UpperRange, (std::max)(MlasTanhConstants.LowerRange, Value)); + Value = std::min(MlasTanhConstants.UpperRange, std::max(MlasTanhConstants.LowerRange, Value)); float ValueSquared = Value * Value;
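
The pervasive `(std::max)` → `std::max` change throughout this diff relies on the NOMINMAX definition the mlasi.h hunk adds before the Windows headers. A minimal standalone sketch of that define-before-include ordering (no Windows header is actually included here; this only mirrors the pattern the hunk depends on):

#ifndef NOMINMAX
#define NOMINMAX    // keeps <windows.h> from defining function-like min()/max() macros
#endif
#include <algorithm>
#include <cassert>

int main()
{
    // With the macros suppressed, the unparenthesized calls compile on all
    // platforms, which is why the (std::max) workaround can be dropped.
    assert(std::max(3.0f, 0.0f) == 3.0f);
    assert(std::min(3.0f, 7.0f) == 3.0f);
    return 0;
}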
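
The new MlasBlendFloat32x4 / MlasBlendInt32x4 helpers and the SSE2 maximum/minimum fallbacks above all use the same bitwise-select idiom: (Vector2 & Selection) | (Vector1 & ~Selection) with an all-ones or all-zeros comparison mask. A scalar sketch of that idiom (the helper name here is local to the sketch, not an MLAS function):

#include <cassert>
#include <cstdint>
#include <cstring>

// Select between two floats with a 32-bit mask, lane-for-lane what the vector
// blend does: a "true" comparison lane is all ones and picks v2, a "false"
// lane is all zeros and keeps v1.
static float BlendScalar(float v1, float v2, uint32_t mask)
{
    uint32_t b1, b2;
    std::memcpy(&b1, &v1, sizeof(float));
    std::memcpy(&b2, &v2, sizeof(float));

    const uint32_t blended = (b2 & mask) | (b1 & ~mask);

    float result;
    std::memcpy(&result, &blended, sizeof(float));
    return result;
}

int main()
{
    assert(BlendScalar(-1.0f, 3.0f, 0xFFFFFFFFu) == 3.0f);   // mask true  -> v2
    assert(BlendScalar(-1.0f, 3.0f, 0x00000000u) == -1.0f);  // mask false -> v1
    return 0;
}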
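
The consolidated MlasPowerOf2Float32x4 builds 2^int(N) by adding the IEEE-754 bias (127) to the truncated integer and shifting it into the exponent field (bit 23). A scalar model of that construction, assuming the exponent stays in the normal range:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar equivalent of MlasCastToInt32x4 + add 127 + MlasShiftLeftInt32x4<23>:
// the resulting bit pattern has a zero mantissa and an exponent of n, i.e. 2^n.
static float PowerOf2(float value)
{
    const int32_t exponent = static_cast<int32_t>(value) + 127;  // truncate, then bias
    const uint32_t bits = static_cast<uint32_t>(exponent) << 23; // place in exponent field

    float result;
    std::memcpy(&result, &bits, sizeof(result));
    return result;
}

int main()
{
    for (int n = -10; n <= 10; n++) {
        assert(PowerOf2(float(n)) == std::ldexp(1.0f, n));
    }
    return 0;
}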
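
In the qgemm.cpp packing change, XOR-ing unsigned B data with 0x80 maps it onto the signed int8 range so a single signed packing routine covers both BType cases, and the matching `offb ^ 0x80` adjustment keeps the zero-point-centered values identical. A small self-check of that identity (function name is illustrative only):

#include <cassert>
#include <cstdint>

// Reinterpreting (x ^ 0x80) as int8_t is the same as subtracting 128 from a
// uint8_t value, so flipping both the data and the zero point leaves
// (data - zero_point) unchanged.
static int8_t FlipToSigned(uint8_t x)
{
    return static_cast<int8_t>(x ^ 0x80);
}

int main()
{
    for (int value = 0; value < 256; value++) {
        for (int zero_point = 0; zero_point < 256; zero_point++) {
            const int centered_u8 = value - zero_point;
            const int centered_s8 = int(FlipToSigned(uint8_t(value))) -
                                    int(FlipToSigned(uint8_t(zero_point)));
            assert(centered_u8 == centered_s8);
        }
    }
    return 0;
}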
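
The sgemm.cpp transpose now goes through the new MlasInterleaveLowFloat32x4 / MlasInterleaveHighFloat32x4 helpers. A scalar model showing that two rounds of low/high interleaves produce the 4x4 transpose (the Vec4 type and helpers exist only for this sketch):

#include <array>
#include <cassert>

using Vec4 = std::array<float, 4>;

// "Low" interleave takes lanes {a0, b0, a1, b1}; "high" takes {a2, b2, a3, b3},
// matching _mm_unpacklo_ps / _mm_unpackhi_ps and the generic shuffle fallback.
static Vec4 InterleaveLow(const Vec4& a, const Vec4& b)  { return {a[0], b[0], a[1], b[1]}; }
static Vec4 InterleaveHigh(const Vec4& a, const Vec4& b) { return {a[2], b[2], a[3], b[3]}; }

int main()
{
    const Vec4 t0 = {0, 1, 2, 3};
    const Vec4 t1 = {4, 5, 6, 7};
    const Vec4 t2 = {8, 9, 10, 11};
    const Vec4 t3 = {12, 13, 14, 15};

    // First round pairs row 0 with row 2 and row 1 with row 3.
    const Vec4 z0 = InterleaveLow(t0, t2);   // {0, 8, 1, 9}
    const Vec4 z1 = InterleaveHigh(t0, t2);  // {2, 10, 3, 11}
    const Vec4 z2 = InterleaveLow(t1, t3);   // {4, 12, 5, 13}
    const Vec4 z3 = InterleaveHigh(t1, t3);  // {6, 14, 7, 15}

    // Second round yields the columns of the original 4x4 block.
    assert((InterleaveLow(z0, z2)  == Vec4{0, 4, 8, 12}));
    assert((InterleaveHigh(z0, z2) == Vec4{1, 5, 9, 13}));
    assert((InterleaveLow(z1, z3)  == Vec4{2, 6, 10, 14}));
    assert((InterleaveHigh(z1, z3) == Vec4{3, 7, 11, 15}));
    return 0;
}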