MLAS: more code cleanup (#4101)

Cleanup vector intrinsics, optimized SSE quantized GEMM.
This commit is contained in:
Tracy Sharpe 2020-06-01 21:19:42 -07:00 committed by GitHub
parent 08e5f89b37
commit 3f7b97a63d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 449 additions and 313 deletions

View file

@ -109,7 +109,7 @@ struct MLAS_ACTIVATION_FUNCTION<MlasReluActivation>
#if defined(MLAS_SSE2_INTRINSICS)
return _mm_cvtss_f32(Activate(_mm_set_ss(Value)));
#else
return (std::max)(Value, 0.0f);
return std::max(Value, 0.0f);
#endif
}
};
@ -140,12 +140,11 @@ struct MLAS_ACTIVATION_FUNCTION<MlasLeakyReluActivation>
#elif defined(MLAS_AVX_INTRINSICS)
return _mm_blendv_ps(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
#elif defined(MLAS_SSE2_INTRINSICS)
__m128 Selection = _mm_cmple_ps(ZeroFloat32x4, Value);
return _mm_or_ps(_mm_and_ps(Value, Selection), _mm_andnot_ps(Selection, ValueTimesAlpha));
return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
#elif defined(MLAS_VSX_INTRINSICS)
return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value));
#else
#error Unsupported architecture.
return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value);
#endif
}
@ -186,8 +185,8 @@ struct MLAS_ACTIVATION_FUNCTION<MlasClipActivation>
#if defined(MLAS_SSE2_INTRINSICS)
return _mm_cvtss_f32(Activate(_mm_set_ss(Value)));
#else
Value = (std::max)(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast));
Value = (std::min)(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast));
Value = std::max(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast));
Value = std::min(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast));
return Value;
#endif

View file

@ -552,7 +552,7 @@ Return Value:
while (N > 0) {
Maximum = (std::max)(Maximum, *Input);
Maximum = std::max(Maximum, *Input);
Input += 1;
N -= 1;

View file

@ -191,7 +191,7 @@ Return Value:
float r;
if (AbsValue > MlasErfConstants.ErfSplitBoundary) {
AbsValue = (std::min)(MlasErfConstants.ErfUpperAbsRange, AbsValue);
AbsValue = std::min(MlasErfConstants.ErfUpperAbsRange, AbsValue);
float r_big = MlasErfConstants.ErfBIG_P0;
r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P1;
r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P2;
@ -201,7 +201,7 @@ Return Value:
r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P6_Minus_One;
r_big = r_big * AbsValue + AbsValue;
r_big = (std::max)(-r_big, MlasErfConstants.Exp_LowerRange);
r_big = std::max(-r_big, MlasErfConstants.Exp_LowerRange);
r = MlasErfConstants.Exp_Log2Reciprocal * r_big + MlasErfConstants.Exp_C;
r -= MlasErfConstants.Exp_C;
float fx = r * MlasErfConstants.Exp_log2_hi + r_big;

View file

@ -121,7 +121,7 @@ Return Value:
float Value = *Input++;
Value = (std::min)(MlasLogisticConstants.UpperRange, (std::max)(MlasLogisticConstants.LowerRange, Value));
Value = std::min(MlasLogisticConstants.UpperRange, std::max(MlasLogisticConstants.LowerRange, Value));
float ValueSquared = Value * Value;

View file

@ -25,6 +25,12 @@ Abstract:
#include <type_traits>
#if defined(_WIN32)
#ifndef WIN32_LEAN_AND_MEAN
#define WIN32_LEAN_AND_MEAN
#endif
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <intrin.h>
#else
@ -753,8 +759,6 @@ MlasPartitionWork(
#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
#define MLAS_FMA3_INTRINSICS
#endif
#else
#error Unsupported architecture.
#endif
#if defined(MLAS_NEON_INTRINSICS)
@ -785,6 +789,21 @@ MlasReinterpretAsInt32x4(MLAS_FLOAT32X4 Vector)
#endif
}
//
// Converts each float lane to a signed 32-bit integer, truncating toward zero
// (vcvtq_s32_f32, _mm_cvttps_epi32 and vec_cts(.., 0) all truncate; the
// generic path uses C++ int32_t conversion, which also truncates).
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasCastToInt32x4(MLAS_FLOAT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vcvtq_s32_f32(Vector);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_cvttps_epi32(Vector);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_cts(Vector, 0);
#else
return MLAS_INT32X4{int32_t(Vector[0]), int32_t(Vector[1]), int32_t(Vector[2]), int32_t(Vector[3])};
#endif
}
MLAS_FORCEINLINE
MLAS_INT32X4
MlasBroadcastInt32x4(int32_t Value)
@ -798,6 +817,182 @@ MlasBroadcastInt32x4(int32_t Value)
#endif
}
//
// Loads four int32 lanes from Buffer. The NEON/SSE/VSX paths use unaligned
// load intrinsics; the generic fallback dereferences the buffer as a vector
// type (assumes the compiler tolerates the buffer's alignment — TODO confirm
// for targets that take this path).
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasLoadInt32x4(const int32_t* Buffer)
{
#if defined(MLAS_NEON_INTRINSICS)
return vld1q_s32(Buffer);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_loadu_si128((const __m128i*)Buffer);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_vsx_ld(0, Buffer);
#else
return *((MLAS_INT32X4*)Buffer);
#endif
}
//
// Stores four int32 lanes to Buffer (unaligned store on NEON/SSE/VSX; plain
// vector assignment on the generic path).
//
MLAS_FORCEINLINE
void
MlasStoreInt32x4(int32_t* Buffer, MLAS_INT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
vst1q_s32(Buffer, Vector);
#elif defined(MLAS_SSE2_INTRINSICS)
_mm_storeu_si128((__m128i*)Buffer, Vector);
#elif defined(MLAS_VSX_INTRINSICS)
vec_vsx_st(Vector, 0, Buffer);
#else
*((MLAS_INT32X4*)Buffer) = Vector;
#endif
}
//
// Lane-wise 32-bit integer addition (wraps on overflow per the intrinsics'
// modular arithmetic).
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vaddq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_add_epi32(Vector1, Vector2);
#else
// Generic path relies on GCC/clang vector-extension operators.
return Vector1 + Vector2;
#endif
}
//
// Lane-wise 32-bit integer subtraction (Vector1 - Vector2).
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vsubq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_sub_epi32(Vector1, Vector2);
#else
return Vector1 - Vector2;
#endif
}
//
// Bitwise AND of the two 128-bit vectors.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasAndInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vandq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_and_si128(Vector1, Vector2);
#else
return Vector1 & Vector2;
#endif
}
//
// Bitwise OR of the two 128-bit vectors.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasOrInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vorrq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_or_si128(Vector1, Vector2);
#else
return Vector1 | Vector2;
#endif
}
//
// Computes (~VectorNot) & Vector. Note the operand order matches SSE's
// _mm_andnot_si128: the FIRST argument is the one that is complemented.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasAndNotInt32x4(MLAS_INT32X4 VectorNot, MLAS_INT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vandq_s32(vmvnq_s32(VectorNot), Vector);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_andnot_si128(VectorNot, Vector);
#else
return (~VectorNot) & Vector;
#endif
}
//
// Bitwise XOR of the two 128-bit vectors.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasXorInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return veorq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_xor_si128(Vector1, Vector2);
#else
return Vector1 ^ Vector2;
#endif
}
//
// Bitwise select: for each bit, takes Vector2 where Selection is 1 and
// Vector1 where Selection is 0 ((Vector2 & Selection) | (Vector1 & ~Selection)).
// Selection is expected to be an all-ones/all-zeros per-lane mask as produced
// by the comparison helpers.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasBlendInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2, MLAS_INT32X4 Selection)
{
return MlasOrInt32x4(MlasAndInt32x4(Vector2, Selection), MlasAndNotInt32x4(Selection, Vector1));
}
//
// Shifts each 32-bit lane left by the compile-time constant ShiftCount
// (immediate-form shift on NEON/SSE).
//
template<unsigned ShiftCount>
MLAS_FORCEINLINE
MLAS_INT32X4
MlasShiftLeftInt32x4(MLAS_INT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vshlq_n_s32(Vector, ShiftCount);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_slli_epi32(Vector, ShiftCount);
#else
return Vector << ShiftCount;
#endif
}
//
// Lane-wise signed 32-bit maximum. SSE4.1 has a native instruction; plain
// SSE2 emulates it with a compare mask plus MlasBlendInt32x4.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vmaxq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE41_INTRINSICS)
return _mm_max_epi32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
// Mask is all-ones where Vector1 > Vector2, selecting Vector1 from the blend.
return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2));
#elif defined(MLAS_VSX_INTRINSICS)
return vec_vmaxsw(Vector1, Vector2);
#else
return MlasBlendInt32x4(Vector2, Vector1, Vector1 > Vector2);
#endif
}
//
// Lane-wise signed 32-bit minimum. Mirror of MlasMaximumInt32x4 with the
// comparison operands swapped on the emulated paths.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vminq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE41_INTRINSICS)
return _mm_min_epi32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1));
#elif defined(MLAS_VSX_INTRINSICS)
return vec_vminsw(Vector1, Vector2);
#else
return MlasBlendInt32x4(Vector2, Vector1, Vector2 > Vector1);
#endif
}
//
// Reinterprets the 128-bit integer vector as a float vector without changing
// any bits (bit-cast, not a value conversion).
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_s32(Vector);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_castsi128_ps(Vector);
#else
return MLAS_FLOAT32X4(Vector);
#endif
}
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasBroadcastFloat32x4(float Value)
@ -847,6 +1042,8 @@ MlasLoadFloat32x4(const float* Buffer)
return _mm_loadu_ps(Buffer);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_vsx_ld(0, Buffer);
#else
return *((MLAS_FLOAT32X4*)Buffer);
#endif
}
@ -860,6 +1057,8 @@ MlasStoreFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector)
_mm_storeu_ps(Buffer, Vector);
#elif defined(MLAS_VSX_INTRINSICS)
vec_vsx_st(Vector, 0, Buffer);
#else
*((MLAS_FLOAT32X4*)Buffer) = Vector;
#endif
}
@ -945,8 +1144,76 @@ MlasExtractLaneFloat32x4<0>(MLAS_FLOAT32X4 Vector)
return _mm_cvtss_f32(Vector);
}
//
// Permutes the four float lanes by compile-time indices (SSE2 path).
// Index0..Index3 select which source lane lands in output lanes 0..3;
// _MM_SHUFFLE takes its arguments high-lane-first, hence the reversed order.
//
template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector)
{
return _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Index3, Index2, Index1, Index0));
}
#endif
//
// Generic (non-SSE2, non-MSVC) shuffle helpers built on the compiler's vector
// shuffle builtins. The two-vector overload treats the pair as an 8-lane
// source (indices 0-3 from Vector1, 4-7 from Vector2); the one-vector
// overload shuffles a single vector by duplicating it as both sources.
//
#if !defined(MLAS_SSE2_INTRINSICS) && !defined(_MSC_VER)
template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(__clang__)
return __builtin_shufflevector(Vector1, Vector2, Index0, Index1, Index2, Index3);
#else
// GCC spells the same operation __builtin_shuffle with a mask vector.
return __builtin_shuffle(Vector1, Vector2, MLAS_INT32X4{Index0, Index1, Index2, Index3});
#endif
}
template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector)
{
return MlasShuffleFloat32x4<Index0, Index1, Index2, Index3>(Vector, Vector);
}
#endif
//
// Interleaves the low halves of the two vectors:
// result = { V1[0], V2[0], V1[1], V2[1] }.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasInterleaveLowFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON64_INTRINSICS)
return vzip1q_f32(Vector1, Vector2);
#elif defined(MLAS_NEON32_INTRINSICS)
// AArch32 NEON only has the paired zip; take the first (low) half.
float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
return zipped.val[0];
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_unpacklo_ps(Vector1, Vector2);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_mergeh(Vector1, Vector2);
#else
return MlasShuffleFloat32x4<0, 4, 1, 5>(Vector1, Vector2);
#endif
}
//
// Interleaves the high halves of the two vectors:
// result = { V1[2], V2[2], V1[3], V2[3] }.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasInterleaveHighFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON64_INTRINSICS)
return vzip2q_f32(Vector1, Vector2);
#elif defined(MLAS_NEON32_INTRINSICS)
// AArch32 NEON only has the paired zip; take the second (high) half.
float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
return zipped.val[1];
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_unpackhi_ps(Vector1, Vector2);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_mergel(Vector1, Vector2);
#else
return MlasShuffleFloat32x4<2, 6, 3, 7>(Vector1, Vector2);
#endif
}
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
@ -1036,6 +1303,70 @@ MlasDivideFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
#endif
}
//
// Lane-wise float compare: produces an all-ones mask (as float bits) where
// Vector1 > Vector2, all-zeros otherwise, suitable for the blend helpers.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_cmpgt_ps(Vector1, Vector2);
#else
return Vector1 > Vector2;
#endif
}
//
// Bitwise AND on float vectors (used for mask operations). Non-SSE targets
// round-trip through the integer vector type, which leaves the bits intact.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_SSE2_INTRINSICS)
return _mm_and_ps(Vector1, Vector2);
#else
return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
#endif
}
//
// Bitwise OR on float vectors via the integer reinterpret helpers.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_SSE2_INTRINSICS)
return _mm_or_ps(Vector1, Vector2);
#else
return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
#endif
}
//
// Computes (~VectorNot) & Vector on float bit patterns; the FIRST operand is
// complemented, matching _mm_andnot_ps.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector)
{
#if defined(MLAS_SSE2_INTRINSICS)
return _mm_andnot_ps(VectorNot, Vector);
#else
return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector)));
#endif
}
//
// Bitwise XOR on float vectors via the integer reinterpret helpers.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_SSE2_INTRINSICS)
return _mm_xor_ps(Vector1, Vector2);
#else
return MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
#endif
}
//
// Bitwise select on float vectors: takes Vector2 bits where Selection is 1
// and Vector1 bits where Selection is 0. Selection is expected to be a
// per-lane all-ones/all-zeros mask such as MlasGreaterThanFloat32x4 produces.
//
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasBlendFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2, MLAS_FLOAT32X4 Selection)
{
return MlasOrFloat32x4(MlasAndFloat32x4(Vector2, Selection), MlasAndNotFloat32x4(Selection, Vector1));
}
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
@ -1047,7 +1378,7 @@ MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
#elif defined(MLAS_VSX_INTRINSICS)
return vec_sel(Vector2, Vector1, vec_cmpgt(Vector1, Vector2));
#else
#error Unsupported architecture.
return MlasBlendFloat32x4(Vector2, Vector1, Vector1 > Vector2);
#endif
}
@ -1062,7 +1393,7 @@ MlasMinimumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
#elif defined(MLAS_VSX_INTRINSICS)
return vec_sel(Vector2, Vector1, vec_cmpgt(Vector2, Vector1));
#else
#error Unsupported architecture.
return MlasBlendFloat32x4(Vector2, Vector1, Vector2 > Vector1);
#endif
}
@ -1079,86 +1410,6 @@ MlasClampFloat32x4(MLAS_FLOAT32X4 Value, float LowerRange, float UpperRange)
return Value;
}
// NOTE(review): pre-refactor variant (removed side of this diff); unlike the
// replacement it hard-errors on unknown architectures instead of providing a
// generic vector-extension fallback.
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_cmpgt_ps(Vector1, Vector2);
#elif defined(MLAS_VSX_INTRINSICS)
return MLAS_FLOAT32X4(vec_cmpgt(Vector1, Vector2));
#else
#error Unsupported architecture.
#endif
}
// NOTE(review): pre-refactor variant (removed side of this diff); bitwise AND
// with a dedicated NEON path and raw integer-vector casts on the generic path.
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_and_ps(Vector1, Vector2);
#else
return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) & MLAS_INT32X4(Vector2));
#endif
}
// NOTE(review): pre-refactor variant (removed side of this diff); bitwise OR
// with a dedicated NEON path and raw integer-vector casts on the generic path.
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_or_ps(Vector1, Vector2);
#else
return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) | MLAS_INT32X4(Vector2));
#endif
}
// NOTE(review): pre-refactor variant (removed side of this diff);
// (~VectorNot) & Vector with the first operand complemented.
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(vreinterpretq_u32_f32(VectorNot)), vreinterpretq_u32_f32(Vector)));
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_andnot_ps(VectorNot, Vector);
#else
return MLAS_FLOAT32X4(~MLAS_INT32X4(VectorNot) & MLAS_INT32X4(Vector));
#endif
}
// NOTE(review): pre-refactor variant (removed side of this diff); bitwise XOR
// with a dedicated NEON path and raw integer-vector casts on the generic path.
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_xor_ps(Vector1, Vector2);
#else
return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) ^ MLAS_INT32X4(Vector2));
#endif
}
// NOTE(review): pre-refactor location of this bit-cast helper (removed side of
// this diff); the commit moves it earlier in the header so the float bitwise
// helpers can call it.
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vreinterpretq_f32_s32(Vector);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_castsi128_ps(Vector);
#else
return MLAS_FLOAT32X4(Vector);
#endif
}
MLAS_FORCEINLINE
float
MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector)
@ -1173,16 +1424,14 @@ MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector)
VectorLow = vpadd_f32(VectorLow, VectorHigh);
VectorLow = vpadd_f32(VectorLow, VectorHigh);
return vget_lane_f32(VectorLow, 0);
#elif defined(MLAS_SSE2_INTRINSICS)
Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2)));
Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(Vector);
#elif defined(MLAS_VSX_INTRINSICS)
Vector = MlasAddFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1)));
Vector = MlasAddFloat32x4(Vector, vec_splat(Vector, 1));
return Vector[0];
#else
#error Unsupported architecture.
Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector));
Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector));
return MlasExtractLaneFloat32x4<0>(Vector);
#endif
}
@ -1198,108 +1447,24 @@ MlasReduceMaximumFloat32x4(MLAS_FLOAT32X4 Vector)
VectorLow = vpmax_f32(VectorLow, VectorHigh);
VectorLow = vpmax_f32(VectorLow, VectorHigh);
return vget_lane_f32(VectorLow, 0);
#elif defined(MLAS_SSE2_INTRINSICS)
Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2)));
Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
return _mm_cvtss_f32(Vector);
#elif defined(MLAS_VSX_INTRINSICS)
Vector = MlasMaximumFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1)));
Vector = MlasMaximumFloat32x4(Vector, vec_splat(Vector, 1));
return Vector[0];
#else
#error Unsupported architecture.
Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector));
Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector));
return MlasExtractLaneFloat32x4<0>(Vector);
#endif
}
// NOTE(review): pre-refactor location of lane-wise int32 addition (removed
// side of this diff); the commit moves it earlier in the header unchanged.
MLAS_FORCEINLINE
MLAS_INT32X4
MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vaddq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_add_epi32(Vector1, Vector2);
#else
return Vector1 + Vector2;
#endif
}
// NOTE(review): pre-refactor location of lane-wise int32 subtraction (removed
// side of this diff); the commit moves it earlier in the header unchanged.
MLAS_FORCEINLINE
MLAS_INT32X4
MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vsubq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_sub_epi32(Vector1, Vector2);
#else
return Vector1 - Vector2;
#endif
}
// NOTE(review): pre-refactor location of the compile-time lane shift (removed
// side of this diff); the commit moves it earlier in the header unchanged.
template<unsigned ShiftCount>
MLAS_FORCEINLINE
MLAS_INT32X4
MlasShiftLeftInt32x4(MLAS_INT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
return vshlq_n_s32(Vector, ShiftCount);
#elif defined(MLAS_SSE2_INTRINSICS)
return _mm_slli_epi32(Vector, ShiftCount);
#else
return Vector << ShiftCount;
#endif
}
// NOTE(review): pre-refactor min/max pair (removed side of this diff). These
// were only compiled when SSE2-only emulation wasn't needed (no plain-SSE2
// path existed); the replacement versions add an SSE2 blend-based fallback
// and a generic fallback, so this guard disappears.
#if !defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS)
MLAS_FORCEINLINE
MLAS_INT32X4
MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vmaxq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE41_INTRINSICS)
return _mm_max_epi32(Vector1, Vector2);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_vmaxsw(Vector1, Vector2);
#else
#error Unsupported architecture.
#endif
}
MLAS_FORCEINLINE
MLAS_INT32X4
MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
{
#if defined(MLAS_NEON_INTRINSICS)
return vminq_s32(Vector1, Vector2);
#elif defined(MLAS_SSE41_INTRINSICS)
return _mm_min_epi32(Vector1, Vector2);
#elif defined(MLAS_VSX_INTRINSICS)
return vec_vminsw(Vector1, Vector2);
#else
#error Unsupported architecture.
#endif
}
#endif
// calc 2^int(N)
MLAS_FORCEINLINE
MLAS_FLOAT32X4
MlasPowerOf2Float32x4(MLAS_FLOAT32X4 Vector)
{
#if defined(MLAS_NEON_INTRINSICS)
MLAS_INT32X4 emm0 = vaddq_s32(vcvtq_s32_f32(Vector), MlasBroadcastInt32x4(127));
return vreinterpretq_f32_s32(vshlq_n_s32(emm0, 23));
#elif defined(MLAS_SSE2_INTRINSICS)
MLAS_INT32X4 emm0 = _mm_add_epi32(_mm_cvttps_epi32(Vector), MlasBroadcastInt32x4(127));
return _mm_castsi128_ps(_mm_slli_epi32(emm0, 23));
#elif defined(MLAS_VSX_INTRINSICS)
MLAS_INT32X4 emm0 = vec_cts(Vector, 0) + MlasBroadcastInt32x4(127);
return MLAS_FLOAT32X4(vec_sl(emm0, MLAS_UINT32X4(MlasBroadcastInt32x4(23))));
#endif
MLAS_INT32X4 emm0 = MlasAddInt32x4(MlasCastToInt32x4(Vector), MlasBroadcastInt32x4(127));
return MlasReinterpretAsFloat32x4(MlasShiftLeftInt32x4<23>(emm0));
}
//

View file

@ -80,7 +80,7 @@ struct MLAS_MAXIMUM_POOLING
static float Reduce(float Reduction, float Value)
{
return (std::max)(Reduction, Value);
return std::max(Reduction, Value);
}
static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
@ -293,8 +293,8 @@ Return Value:
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
const int64_t iwEnd64 = iwStart64 + KernelWidth;
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));
float m = PoolingType::InitialValue();
@ -370,16 +370,16 @@ Return Value:
const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
const int64_t ihEnd64 = ihStart64 + KernelHeight;
const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
const size_t ihStart = size_t(std::max(ihStart64, int64_t(0)));
const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight)));
for (size_t pw = 0; pw < OutputWidth; pw++) {
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
const int64_t iwEnd64 = iwStart64 + KernelWidth;
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));
float m = PoolingType::InitialValue();
@ -604,14 +604,12 @@ Return Value:
break;
}
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
MlasStoreLaneFloat32x4<0>(Output, Reduction);
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#elif defined(MLAS_SSE2_INTRINSICS)
#if defined(MLAS_SSE2_INTRINSICS)
Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
MlasStoreLowHalfFloat32x4(Output, Reduction);
#else
#error Unsupported architecture.
MlasStoreLaneFloat32x4<0>(Output, Reduction);
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#endif
Output += 2;
@ -688,24 +686,24 @@ Return Value:
const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth;
const int64_t idEnd64 = idStart64 + KernelDepth;
const size_t idStart = size_t((std::max)(idStart64, int64_t(0)));
const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth)));
const size_t idStart = size_t(std::max(idStart64, int64_t(0)));
const size_t idEnd = size_t(std::min(idEnd64, int64_t(InputDepth)));
for (size_t ph = 0; ph < OutputHeight; ph++) {
const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
const int64_t ihEnd64 = ihStart64 + KernelHeight;
const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
const size_t ihStart = size_t(std::max(ihStart64, int64_t(0)));
const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight)));
for (size_t pw = 0; pw < OutputWidth; pw++) {
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
const int64_t iwEnd64 = iwStart64 + KernelWidth;
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));
float m = PoolingType::InitialValue();
@ -976,14 +974,12 @@ Return Value:
break;
}
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
MlasStoreLaneFloat32x4<0>(Output, Reduction);
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#elif defined(MLAS_SSE2_INTRINSICS)
#if defined(MLAS_SSE2_INTRINSICS)
Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
MlasStoreLowHalfFloat32x4(Output, Reduction);
#else
#error Unsupported architecture.
MlasStoreLaneFloat32x4<0>(Output, Reduction);
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#endif
Output += 2;

View file

@ -181,48 +181,39 @@ Return Value:
}
}
template<typename BType>
void
MlasGemmU8X8CopyPackBProcessSse(
int16_t* D,
__m128i BytesRow0,
__m128i BytesRow1,
__m128i ZeroVector,
__m128i BitFlipVector,
__m128i ColumnSums[2]
)
{
__m128i BytesInterleaved = _mm_unpacklo_epi8(BytesRow0, BytesRow1);
__m128i WordsInterleaved[2];
//
// Zero or sign extend the bytes to words.
//
BytesInterleaved = _mm_xor_si128(BytesInterleaved, BitFlipVector);
if (std::is_same<BType, uint8_t>::value) {
WordsInterleaved[0] = _mm_unpacklo_epi8(BytesInterleaved, ZeroVector);
WordsInterleaved[1] = _mm_unpackhi_epi8(BytesInterleaved, ZeroVector);
} else {
WordsInterleaved[0] = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8);
WordsInterleaved[1] = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8);
}
__m128i WordsInterleaved0 = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8);
__m128i WordsInterleaved1 = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8);
ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved[0]);
ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved[1]);
ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved0);
ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved1);
_mm_storeu_si128((__m128i*)&D[0], WordsInterleaved[0]);
_mm_storeu_si128((__m128i*)&D[8], WordsInterleaved[1]);
_mm_storeu_si128((__m128i*)&D[0], WordsInterleaved0);
_mm_storeu_si128((__m128i*)&D[8], WordsInterleaved1);
}
template<typename BType>
void
MlasGemmU8X8CopyPackBSse(
int16_t* D,
const BType* B,
const uint8_t* B,
size_t ldb,
size_t CountN,
size_t CountK,
int32_t* ColumnSumVector,
int16_t offa
int16_t offa,
bool BTypeIsSigned
)
/*++
@ -256,9 +247,8 @@ Return Value:
--*/
{
const __m128i ZeroVector = _mm_setzero_si128();
const __m128i OffsetBroadcast = _mm_set1_epi16(offa);
BType PaddedMatrixBData[16] = { 0 };
const __m128i BitFlipVector = _mm_set1_epi32(BTypeIsSigned ? 0 : 0x80808080);
//
// Process 8 columns of matrix B in a loop.
@ -266,12 +256,12 @@ Return Value:
while (CountN >= 8) {
const BType* b = B;
const uint8_t* b = B;
size_t k = CountK;
__m128i ColumnSums[2];
ColumnSums[0] = ZeroVector;
ColumnSums[1] = ZeroVector;
ColumnSums[0] = _mm_setzero_si128();
ColumnSums[1] = _mm_setzero_si128();
//
// Interleave rows of matrix B and write to the packed buffer.
@ -286,7 +276,7 @@ Return Value:
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]);
__m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&b[ldb]);
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums);
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums);
b += ldb * 2;
D += 16;
@ -297,7 +287,7 @@ Return Value:
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]);
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums);
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums);
D += 16;
}
@ -325,12 +315,15 @@ Return Value:
if (CountN > 0) {
const BType* b = B;
const uint8_t* b = B;
size_t k = CountK;
__m128i ColumnSums[2];
uint8_t PaddedMatrixBData[16];
ColumnSums[0] = ZeroVector;
ColumnSums[1] = ZeroVector;
_mm_storeu_si128((__m128i*)PaddedMatrixBData, BitFlipVector);
ColumnSums[0] = _mm_setzero_si128();
ColumnSums[1] = _mm_setzero_si128();
//
// Interleave rows of matrix B using an intermediate zero padded stack
@ -339,9 +332,9 @@ Return Value:
while (k >= 2) {
const BType* bcopy = b;
BType* padded = PaddedMatrixBData;
BType* padded_end = padded + CountN;
const uint8_t* bcopy = b;
uint8_t* padded = PaddedMatrixBData;
uint8_t* padded_end = padded + CountN;
do {
padded[0] = bcopy[0];
@ -353,7 +346,7 @@ Return Value:
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]);
__m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[8]);
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums);
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums);
b += ldb * 2;
D += 16;
@ -362,9 +355,9 @@ Return Value:
if (k > 0) {
const BType* bcopy = b;
BType* padded = PaddedMatrixBData;
BType* padded_end = padded + CountN;
const uint8_t* bcopy = b;
uint8_t* padded = PaddedMatrixBData;
uint8_t* padded_end = padded + CountN;
do {
padded[0] = bcopy[0];
@ -374,7 +367,7 @@ Return Value:
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]);
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums);
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums);
}
//
@ -648,6 +641,10 @@ Return Value:
size_t StrideN = MLAS_GEMM_U8X8_STRIDEN_SSE;
size_t StrideK = MLAS_GEMM_U8X8_STRIDEK_SSE;
if (!WorkBlock->BTypeIsSigned) {
offb = int8_t(offb ^ 0x80);
}
//
// Step through each slice of matrix B along the K dimension.
//
@ -656,7 +653,7 @@ Return Value:
for (size_t k = 0; k < K; k += CountK) {
CountK = (std::min)(K - k, StrideK);
CountK = std::min(K - k, StrideK);
//
// Step through each slice of matrix B along the N dimension.
@ -666,7 +663,7 @@ Return Value:
for (size_t n = 0; n < N; n += CountN) {
CountN = (std::min)(N - n, StrideN);
CountN = std::min(N - n, StrideN);
//
// Copy a panel of matrix B to a local packed buffer.
@ -674,13 +671,8 @@ Return Value:
const uint8_t* b = B + n + k * ldb;
if (WorkBlock->BTypeIsSigned) {
MlasGemmU8X8CopyPackBSse(PanelB, (const int8_t*)b, ldb, CountN,
CountK, ColumnSumVector, -int16_t(offa));
} else {
MlasGemmU8X8CopyPackBSse(PanelB, (const uint8_t*)b, ldb, CountN,
CountK, ColumnSumVector, -int16_t(offa));
}
MlasGemmU8X8CopyPackBSse(PanelB, b, ldb, CountN, CountK,
ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
//
// Step through each slice of matrix A along the M dimension.
@ -694,7 +686,7 @@ Return Value:
for (size_t m = 0; m < M; m += CountM) {
CountM = (std::min)(M - m, StrideM);
CountM = std::min(M - m, StrideM);
//
// Copy a panel of matrix A to a local packed buffer.
@ -892,7 +884,7 @@ Return Value:
for (size_t k = 0; k < K; k += CountK) {
CountK = (std::min)(K - k, StrideK);
CountK = std::min(K - k, StrideK);
//
// Step through each slice of matrix B along the N dimension.
@ -902,7 +894,7 @@ Return Value:
for (size_t n = 0; n < N; n += CountN) {
CountN = (std::min)(N - n, StrideN);
CountN = std::min(N - n, StrideN);
//
// Copy a panel of matrix B to a local packed buffer.
@ -910,8 +902,8 @@ Return Value:
const int8_t* b = (const int8_t*)B + n + k * ldb;
MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN,
CountK, ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK,
ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
//
// Step through each slice of matrix A along the M dimension.
@ -925,7 +917,7 @@ Return Value:
for (size_t m = 0; m < M; m += CountM) {
CountM = (std::min)(M - m, StrideM);
CountM = std::min(M - m, StrideM);
//
// Copy a panel of matrix A to a local packed buffer.
@ -1038,7 +1030,7 @@ Return Value:
for (size_t k = 0; k < K; k += CountK) {
CountK = (std::min)(K - k, StrideK);
CountK = std::min(K - k, StrideK);
//
// Step through each slice of matrix B along the N dimension.
@ -1048,16 +1040,16 @@ Return Value:
for (size_t n = 0; n < N; n += CountN) {
CountN = (std::min)(N - n, StrideN);
CountN = std::min(N - n, StrideN);
//
// Copy a panel of matrix B to a local packed buffer.
//
const uint8_t* b = (const uint8_t*)B + n + k * ldb;
const uint8_t* b = B + n + k * ldb;
MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, ColumnSumVector,
-int16_t(offa));
MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK,
ColumnSumVector, -int16_t(offa));
//
// Step through each slice of matrix A along the M dimension.
@ -1071,7 +1063,7 @@ Return Value:
for (size_t m = 0; m < M; m += CountM) {
CountM = (std::min)(M - m, StrideM);
CountM = std::min(M - m, StrideM);
//
// Copy a panel of matrix A to a local packed buffer.

View file

@ -213,7 +213,7 @@ Return Value:
for (size_t i = InputChannels; i > 0;) {
const size_t InputChannelsThisIteration = (std::min)(i, BlockSize);
const size_t InputChannelsThisIteration = std::min(i, BlockSize);
i -= InputChannelsThisIteration;
const float* s = S;
@ -308,7 +308,7 @@ Return Value:
for (size_t o = OutputChannels; o > 0;) {
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
o -= OutputChannelsThisIteration;
@ -415,7 +415,7 @@ Return Value:
for (size_t o = OutputChannels; o > 0;) {
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
o -= OutputChannelsThisIteration;
@ -523,7 +523,7 @@ Return Value:
for (size_t o = OutputChannels; o > 0;) {
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
o -= OutputChannelsThisIteration;
@ -538,7 +538,7 @@ Return Value:
for (size_t i = InputChannels; i > 0;) {
const size_t InputChannelsThisIteration = (std::min)(i, BlockSize);
const size_t InputChannelsThisIteration = std::min(i, BlockSize);
i -= InputChannelsThisIteration;
//
@ -671,7 +671,7 @@ Return Value:
for (size_t o = OutputChannels; o > 0;) {
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
o -= OutputChannelsThisIteration;

View file

@ -422,29 +422,15 @@ Return Value:
t1 = o0.val[1];
t2 = o1.val[0];
t3 = o1.val[1];
#elif defined(MLAS_SSE2_INTRINSICS)
// N.B. The MSVC version of _MM_TRANSPOSE4_PS uses shufps which is
// slightly larger than the below sequence, so manually expand the
// matrix transpose.
__m128 z0 = _mm_unpacklo_ps(t0, t1);
__m128 z1 = _mm_unpackhi_ps(t0, t1);
__m128 z2 = _mm_unpacklo_ps(t2, t3);
__m128 z3 = _mm_unpackhi_ps(t2, t3);
t0 = _mm_movelh_ps(z0, z2);
t1 = _mm_movehl_ps(z2, z0);
t2 = _mm_movelh_ps(z1, z3);
t3 = _mm_movehl_ps(z3, z1);
#elif defined(MLAS_VSX_INTRINSICS)
__vector float z0 = vec_mergeh(t0, t2);
__vector float z1 = vec_mergel(t0, t2);
__vector float z2 = vec_mergeh(t1, t3);
__vector float z3 = vec_mergel(t1, t3);
t0 = vec_mergeh(z0, z2);
t1 = vec_mergel(z0, z2);
t2 = vec_mergeh(z1, z3);
t3 = vec_mergel(z1, z3);
#else
#error Unsupported architecture.
MLAS_FLOAT32X4 z0 = MlasInterleaveLowFloat32x4(t0, t2);
MLAS_FLOAT32X4 z1 = MlasInterleaveHighFloat32x4(t0, t2);
MLAS_FLOAT32X4 z2 = MlasInterleaveLowFloat32x4(t1, t3);
MLAS_FLOAT32X4 z3 = MlasInterleaveHighFloat32x4(t1, t3);
t0 = MlasInterleaveLowFloat32x4(z0, z2);
t1 = MlasInterleaveHighFloat32x4(z0, z2);
t2 = MlasInterleaveLowFloat32x4(z1, z3);
t3 = MlasInterleaveHighFloat32x4(z1, z3);
#endif
MlasStoreAlignedFloat32x4(&D[0], t0);
@ -639,7 +625,14 @@ Return Value:
MLAS_FLOAT32X4 t0 = MlasLoadFloat32x4(&b[0]);
MLAS_FLOAT32X4 t1 = MlasLoadFloat32x4(&b[ldb]);
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
#if defined(MLAS_SSE2_INTRINSICS)
__m128 v0 = _mm_unpacklo_ps(t0, t1);
__m128 v1 = _mm_unpackhi_ps(t0, t1);
_mm_storel_pi((__m64*)&d[0], v0);
_mm_storeh_pi((__m64*)&d[16], v0);
_mm_storel_pi((__m64*)&d[32], v1);
_mm_storeh_pi((__m64*)&d[48], v1);
#else
MlasStoreLaneFloat32x4<0>(&d[0], t0);
MlasStoreLaneFloat32x4<0>(&d[1], t1);
MlasStoreLaneFloat32x4<1>(&d[16], t0);
@ -648,15 +641,6 @@ Return Value:
MlasStoreLaneFloat32x4<2>(&d[33], t1);
MlasStoreLaneFloat32x4<3>(&d[48], t0);
MlasStoreLaneFloat32x4<3>(&d[49], t1);
#elif defined(MLAS_SSE2_INTRINSICS)
__m128 v0 = _mm_unpacklo_ps(t0, t1);
__m128 v1 = _mm_unpackhi_ps(t0, t1);
_mm_storel_pi((__m64*)&d[0], v0);
_mm_storeh_pi((__m64*)&d[16], v0);
_mm_storel_pi((__m64*)&d[32], v1);
_mm_storeh_pi((__m64*)&d[48], v1);
#else
#error Unsupported architecture.
#endif
d += 2;

View file

@ -539,7 +539,7 @@ struct MLAS_NCHWC_GROUPED_CONV_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
void ComputeFilterCount(void)
{
FilterCount = (std::min)(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
FilterCount = std::min(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
}
void PrepareWork(int32_t Index)
@ -686,7 +686,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
// Compute the number of output lines to process in this iteration.
//
size_t WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
size_t WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);
//
// Walk over each input image organized as a set of NCHWc blocks.
@ -898,7 +898,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
size_t WorkThisIteration;
if (StrideHeight == 1 && StrideWidth == 1) {
WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);
} else {
WorkThisIteration = 1;
}
@ -923,7 +923,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
constexpr size_t MaximumInputChannelBatch = 128;
InputChannelBatch = (std::min)(InputChannels - ic, MaximumInputChannelBatch);
InputChannelBatch = std::min(InputChannels - ic, MaximumInputChannelBatch);
unsigned KernelFlags = ComputeKernelFlags(ic, InputChannelBatch);

View file

@ -119,7 +119,7 @@ Return Value:
float Value = *Input++;
Value = (std::min)(MlasTanhConstants.UpperRange, (std::max)(MlasTanhConstants.LowerRange, Value));
Value = std::min(MlasTanhConstants.UpperRange, std::max(MlasTanhConstants.LowerRange, Value));
float ValueSquared = Value * Value;