mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-16 21:00:14 +00:00
MLAS: more code cleanup (#4101)
Cleanup vector intrinsics, optimized SSE quantized GEMM.
This commit is contained in:
parent
08e5f89b37
commit
3f7b97a63d
11 changed files with 449 additions and 313 deletions
|
|
@ -109,7 +109,7 @@ struct MLAS_ACTIVATION_FUNCTION<MlasReluActivation>
|
|||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_cvtss_f32(Activate(_mm_set_ss(Value)));
|
||||
#else
|
||||
return (std::max)(Value, 0.0f);
|
||||
return std::max(Value, 0.0f);
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
|
@ -140,12 +140,11 @@ struct MLAS_ACTIVATION_FUNCTION<MlasLeakyReluActivation>
|
|||
#elif defined(MLAS_AVX_INTRINSICS)
|
||||
return _mm_blendv_ps(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
__m128 Selection = _mm_cmple_ps(ZeroFloat32x4, Value);
|
||||
return _mm_or_ps(_mm_and_ps(Value, Selection), _mm_andnot_ps(Selection, ValueTimesAlpha));
|
||||
return MlasBlendFloat32x4(ValueTimesAlpha, Value, _mm_cmple_ps(ZeroFloat32x4, Value));
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_sel(ValueTimesAlpha, Value, vec_cmple(ZeroFloat32x4, Value));
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
return MlasBlendFloat32x4(ValueTimesAlpha, Value, ZeroFloat32x4 < Value);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -186,8 +185,8 @@ struct MLAS_ACTIVATION_FUNCTION<MlasClipActivation>
|
|||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_cvtss_f32(Activate(_mm_set_ss(Value)));
|
||||
#else
|
||||
Value = (std::max)(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast));
|
||||
Value = (std::min)(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast));
|
||||
Value = std::max(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast));
|
||||
Value = std::min(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast));
|
||||
|
||||
return Value;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -552,7 +552,7 @@ Return Value:
|
|||
|
||||
while (N > 0) {
|
||||
|
||||
Maximum = (std::max)(Maximum, *Input);
|
||||
Maximum = std::max(Maximum, *Input);
|
||||
|
||||
Input += 1;
|
||||
N -= 1;
|
||||
|
|
|
|||
|
|
@ -191,7 +191,7 @@ Return Value:
|
|||
|
||||
float r;
|
||||
if (AbsValue > MlasErfConstants.ErfSplitBoundary) {
|
||||
AbsValue = (std::min)(MlasErfConstants.ErfUpperAbsRange, AbsValue);
|
||||
AbsValue = std::min(MlasErfConstants.ErfUpperAbsRange, AbsValue);
|
||||
float r_big = MlasErfConstants.ErfBIG_P0;
|
||||
r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P1;
|
||||
r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P2;
|
||||
|
|
@ -201,7 +201,7 @@ Return Value:
|
|||
r_big = r_big * AbsValue + MlasErfConstants.ErfBIG_P6_Minus_One;
|
||||
r_big = r_big * AbsValue + AbsValue;
|
||||
|
||||
r_big = (std::max)(-r_big, MlasErfConstants.Exp_LowerRange);
|
||||
r_big = std::max(-r_big, MlasErfConstants.Exp_LowerRange);
|
||||
r = MlasErfConstants.Exp_Log2Reciprocal * r_big + MlasErfConstants.Exp_C;
|
||||
r -= MlasErfConstants.Exp_C;
|
||||
float fx = r * MlasErfConstants.Exp_log2_hi + r_big;
|
||||
|
|
|
|||
|
|
@ -121,7 +121,7 @@ Return Value:
|
|||
|
||||
float Value = *Input++;
|
||||
|
||||
Value = (std::min)(MlasLogisticConstants.UpperRange, (std::max)(MlasLogisticConstants.LowerRange, Value));
|
||||
Value = std::min(MlasLogisticConstants.UpperRange, std::max(MlasLogisticConstants.LowerRange, Value));
|
||||
|
||||
float ValueSquared = Value * Value;
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,12 @@ Abstract:
|
|||
#include <type_traits>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#ifndef WIN32_LEAN_AND_MEAN
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#endif
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <intrin.h>
|
||||
#else
|
||||
|
|
@ -753,8 +759,6 @@ MlasPartitionWork(
|
|||
#if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
|
||||
#define MLAS_FMA3_INTRINSICS
|
||||
#endif
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
#endif
|
||||
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
|
|
@ -785,6 +789,21 @@ MlasReinterpretAsInt32x4(MLAS_FLOAT32X4 Vector)
|
|||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasCastToInt32x4(MLAS_FLOAT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vcvtq_s32_f32(Vector);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_cvttps_epi32(Vector);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_cts(Vector, 0);
|
||||
#else
|
||||
return MLAS_INT32X4{int32_t(Vector[0]), int32_t(Vector[1]), int32_t(Vector[2]), int32_t(Vector[3])};
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasBroadcastInt32x4(int32_t Value)
|
||||
|
|
@ -798,6 +817,182 @@ MlasBroadcastInt32x4(int32_t Value)
|
|||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasLoadInt32x4(const int32_t* Buffer)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vld1q_s32(Buffer);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_loadu_si128((const __m128i*)Buffer);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_vsx_ld(0, Buffer);
|
||||
#else
|
||||
return *((MLAS_INT32X4*)Buffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
void
|
||||
MlasStoreInt32x4(int32_t* Buffer, MLAS_INT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
vst1q_s32(Buffer, Vector);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
_mm_storeu_si128((__m128i*)Buffer, Vector);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
vec_vsx_st(Vector, 0, Buffer);
|
||||
#else
|
||||
*((MLAS_INT32X4*)Buffer) = Vector;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vaddq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_add_epi32(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 + Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vsubq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_sub_epi32(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 - Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasAndInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vandq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_and_si128(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 & Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasOrInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vorrq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_or_si128(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 | Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasAndNotInt32x4(MLAS_INT32X4 VectorNot, MLAS_INT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vandq_s32(vmvnq_s32(VectorNot), Vector);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_andnot_si128(VectorNot, Vector);
|
||||
#else
|
||||
return (~VectorNot) & Vector;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasXorInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return veorq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_xor_si128(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 ^ Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasBlendInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2, MLAS_INT32X4 Selection)
|
||||
{
|
||||
return MlasOrInt32x4(MlasAndInt32x4(Vector2, Selection), MlasAndNotInt32x4(Selection, Vector1));
|
||||
}
|
||||
|
||||
template<unsigned ShiftCount>
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasShiftLeftInt32x4(MLAS_INT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vshlq_n_s32(Vector, ShiftCount);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_slli_epi32(Vector, ShiftCount);
|
||||
#else
|
||||
return Vector << ShiftCount;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vmaxq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE41_INTRINSICS)
|
||||
return _mm_max_epi32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector1, Vector2));
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_vmaxsw(Vector1, Vector2);
|
||||
#else
|
||||
return MlasBlendInt32x4(Vector2, Vector1, Vector1 > Vector2);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vminq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE41_INTRINSICS)
|
||||
return _mm_min_epi32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return MlasBlendInt32x4(Vector2, Vector1, _mm_cmpgt_epi32(Vector2, Vector1));
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_vminsw(Vector1, Vector2);
|
||||
#else
|
||||
return MlasBlendInt32x4(Vector2, Vector1, Vector2 > Vector1);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_s32(Vector);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_castsi128_ps(Vector);
|
||||
#else
|
||||
return MLAS_FLOAT32X4(Vector);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasBroadcastFloat32x4(float Value)
|
||||
|
|
@ -847,6 +1042,8 @@ MlasLoadFloat32x4(const float* Buffer)
|
|||
return _mm_loadu_ps(Buffer);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_vsx_ld(0, Buffer);
|
||||
#else
|
||||
return *((MLAS_FLOAT32X4*)Buffer);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -860,6 +1057,8 @@ MlasStoreFloat32x4(float* Buffer, MLAS_FLOAT32X4 Vector)
|
|||
_mm_storeu_ps(Buffer, Vector);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
vec_vsx_st(Vector, 0, Buffer);
|
||||
#else
|
||||
*((MLAS_FLOAT32X4*)Buffer) = Vector;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -945,8 +1144,76 @@ MlasExtractLaneFloat32x4<0>(MLAS_FLOAT32X4 Vector)
|
|||
return _mm_cvtss_f32(Vector);
|
||||
}
|
||||
|
||||
template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector)
|
||||
{
|
||||
return _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(Index3, Index2, Index1, Index0));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(MLAS_SSE2_INTRINSICS) && !defined(_MSC_VER)
|
||||
|
||||
template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(__clang__)
|
||||
return __builtin_shufflevector(Vector1, Vector2, Index0, Index1, Index2, Index3);
|
||||
#else
|
||||
return __builtin_shuffle(Vector1, Vector2, MLAS_INT32X4{Index0, Index1, Index2, Index3});
|
||||
#endif
|
||||
}
|
||||
|
||||
template<unsigned Index0, unsigned Index1, unsigned Index2, unsigned Index3>
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasShuffleFloat32x4(MLAS_FLOAT32X4 Vector)
|
||||
{
|
||||
return MlasShuffleFloat32x4<Index0, Index1, Index2, Index3>(Vector, Vector);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasInterleaveLowFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON64_INTRINSICS)
|
||||
return vzip1q_f32(Vector1, Vector2);
|
||||
#elif defined(MLAS_NEON32_INTRINSICS)
|
||||
float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
|
||||
return zipped.val[0];
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_unpacklo_ps(Vector1, Vector2);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_mergeh(Vector1, Vector2);
|
||||
#else
|
||||
return MlasShuffleFloat32x4<0, 4, 1, 5>(Vector1, Vector2);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasInterleaveHighFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON64_INTRINSICS)
|
||||
return vzip2q_f32(Vector1, Vector2);
|
||||
#elif defined(MLAS_NEON32_INTRINSICS)
|
||||
float32x4x2_t zipped = vzipq_f32(Vector1, Vector2);
|
||||
return zipped.val[1];
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_unpackhi_ps(Vector1, Vector2);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_mergel(Vector1, Vector2);
|
||||
#else
|
||||
return MlasShuffleFloat32x4<2, 6, 3, 7>(Vector1, Vector2);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasAddFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
|
|
@ -1036,6 +1303,70 @@ MlasDivideFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
|||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_cmpgt_ps(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 > Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_and_ps(Vector1, Vector2);
|
||||
#else
|
||||
return MlasReinterpretAsFloat32x4(MlasAndInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_or_ps(Vector1, Vector2);
|
||||
#else
|
||||
return MlasReinterpretAsFloat32x4(MlasOrInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_andnot_ps(VectorNot, Vector);
|
||||
#else
|
||||
return MlasReinterpretAsFloat32x4(MlasAndNotInt32x4(MlasReinterpretAsInt32x4(VectorNot), MlasReinterpretAsInt32x4(Vector)));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_xor_ps(Vector1, Vector2);
|
||||
#else
|
||||
return MlasReinterpretAsFloat32x4(MlasXorInt32x4(MlasReinterpretAsInt32x4(Vector1), MlasReinterpretAsInt32x4(Vector2)));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasBlendFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2, MLAS_FLOAT32X4 Selection)
|
||||
{
|
||||
return MlasOrFloat32x4(MlasAndFloat32x4(Vector2, Selection), MlasAndNotFloat32x4(Selection, Vector1));
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
|
|
@ -1047,7 +1378,7 @@ MlasMaximumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
|||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_sel(Vector2, Vector1, vec_cmpgt(Vector1, Vector2));
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
return MlasBlendFloat32x4(Vector2, Vector1, Vector1 > Vector2);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1062,7 +1393,7 @@ MlasMinimumFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
|||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_sel(Vector2, Vector1, vec_cmpgt(Vector2, Vector1));
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
return MlasBlendFloat32x4(Vector2, Vector1, Vector2 > Vector1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1079,86 +1410,6 @@ MlasClampFloat32x4(MLAS_FLOAT32X4 Value, float LowerRange, float UpperRange)
|
|||
return Value;
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasGreaterThanFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_u32(vcgtq_f32(Vector1, Vector2));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_cmpgt_ps(Vector1, Vector2);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return MLAS_FLOAT32X4(vec_cmpgt(Vector1, Vector2));
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasAndFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_and_ps(Vector1, Vector2);
|
||||
#else
|
||||
return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) & MLAS_INT32X4(Vector2));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasOrFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_or_ps(Vector1, Vector2);
|
||||
#else
|
||||
return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) | MLAS_INT32X4(Vector2));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasAndNotFloat32x4(MLAS_FLOAT32X4 VectorNot, MLAS_FLOAT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(vreinterpretq_u32_f32(VectorNot)), vreinterpretq_u32_f32(Vector)));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_andnot_ps(VectorNot, Vector);
|
||||
#else
|
||||
return MLAS_FLOAT32X4(~MLAS_INT32X4(VectorNot) & MLAS_INT32X4(Vector));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasXorFloat32x4(MLAS_FLOAT32X4 Vector1, MLAS_FLOAT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(Vector1), vreinterpretq_u32_f32(Vector2)));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_xor_ps(Vector1, Vector2);
|
||||
#else
|
||||
return MLAS_FLOAT32X4(MLAS_INT32X4(Vector1) ^ MLAS_INT32X4(Vector2));
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasReinterpretAsFloat32x4(MLAS_INT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vreinterpretq_f32_s32(Vector);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_castsi128_ps(Vector);
|
||||
#else
|
||||
return MLAS_FLOAT32X4(Vector);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
float
|
||||
MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector)
|
||||
|
|
@ -1173,16 +1424,14 @@ MlasReduceAddFloat32x4(MLAS_FLOAT32X4 Vector)
|
|||
VectorLow = vpadd_f32(VectorLow, VectorHigh);
|
||||
VectorLow = vpadd_f32(VectorLow, VectorHigh);
|
||||
return vget_lane_f32(VectorLow, 0);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
Vector = MlasAddFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
return _mm_cvtss_f32(Vector);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
Vector = MlasAddFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1)));
|
||||
Vector = MlasAddFloat32x4(Vector, vec_splat(Vector, 1));
|
||||
return Vector[0];
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector));
|
||||
Vector = MlasAddFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector));
|
||||
return MlasExtractLaneFloat32x4<0>(Vector);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
@ -1198,108 +1447,24 @@ MlasReduceMaximumFloat32x4(MLAS_FLOAT32X4 Vector)
|
|||
VectorLow = vpmax_f32(VectorLow, VectorHigh);
|
||||
VectorLow = vpmax_f32(VectorLow, VectorHigh);
|
||||
return vget_lane_f32(VectorLow, 0);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||
Vector = MlasMaximumFloat32x4(Vector, _mm_shuffle_ps(Vector, Vector, _MM_SHUFFLE(1, 1, 1, 1)));
|
||||
return _mm_cvtss_f32(Vector);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
Vector = MlasMaximumFloat32x4(Vector, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Vector, 1)));
|
||||
Vector = MlasMaximumFloat32x4(Vector, vec_splat(Vector, 1));
|
||||
return Vector[0];
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<2, 3, 2, 3>(Vector));
|
||||
Vector = MlasMaximumFloat32x4(Vector, MlasShuffleFloat32x4<1, 1, 1, 1>(Vector));
|
||||
return MlasExtractLaneFloat32x4<0>(Vector);
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasAddInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vaddq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_add_epi32(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 + Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasSubtractInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vsubq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_sub_epi32(Vector1, Vector2);
|
||||
#else
|
||||
return Vector1 - Vector2;
|
||||
#endif
|
||||
}
|
||||
|
||||
template<unsigned ShiftCount>
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasShiftLeftInt32x4(MLAS_INT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vshlq_n_s32(Vector, ShiftCount);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
return _mm_slli_epi32(Vector, ShiftCount);
|
||||
#else
|
||||
return Vector << ShiftCount;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS)
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasMaximumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vmaxq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE41_INTRINSICS)
|
||||
return _mm_max_epi32(Vector1, Vector2);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_vmaxsw(Vector1, Vector2);
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
#endif
|
||||
}
|
||||
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_INT32X4
|
||||
MlasMinimumInt32x4(MLAS_INT32X4 Vector1, MLAS_INT32X4 Vector2)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
return vminq_s32(Vector1, Vector2);
|
||||
#elif defined(MLAS_SSE41_INTRINSICS)
|
||||
return _mm_min_epi32(Vector1, Vector2);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
return vec_vminsw(Vector1, Vector2);
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// calc 2^int(N)
|
||||
MLAS_FORCEINLINE
|
||||
MLAS_FLOAT32X4
|
||||
MlasPowerOf2Float32x4(MLAS_FLOAT32X4 Vector)
|
||||
{
|
||||
#if defined(MLAS_NEON_INTRINSICS)
|
||||
MLAS_INT32X4 emm0 = vaddq_s32(vcvtq_s32_f32(Vector), MlasBroadcastInt32x4(127));
|
||||
return vreinterpretq_f32_s32(vshlq_n_s32(emm0, 23));
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
MLAS_INT32X4 emm0 = _mm_add_epi32(_mm_cvttps_epi32(Vector), MlasBroadcastInt32x4(127));
|
||||
return _mm_castsi128_ps(_mm_slli_epi32(emm0, 23));
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
MLAS_INT32X4 emm0 = vec_cts(Vector, 0) + MlasBroadcastInt32x4(127);
|
||||
return MLAS_FLOAT32X4(vec_sl(emm0, MLAS_UINT32X4(MlasBroadcastInt32x4(23))));
|
||||
#endif
|
||||
MLAS_INT32X4 emm0 = MlasAddInt32x4(MlasCastToInt32x4(Vector), MlasBroadcastInt32x4(127));
|
||||
return MlasReinterpretAsFloat32x4(MlasShiftLeftInt32x4<23>(emm0));
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
|||
|
|
@ -80,7 +80,7 @@ struct MLAS_MAXIMUM_POOLING
|
|||
|
||||
static float Reduce(float Reduction, float Value)
|
||||
{
|
||||
return (std::max)(Reduction, Value);
|
||||
return std::max(Reduction, Value);
|
||||
}
|
||||
|
||||
static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
|
||||
|
|
@ -293,8 +293,8 @@ Return Value:
|
|||
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
|
||||
const int64_t iwEnd64 = iwStart64 + KernelWidth;
|
||||
|
||||
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
|
||||
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
|
||||
const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
|
||||
const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));
|
||||
|
||||
float m = PoolingType::InitialValue();
|
||||
|
||||
|
|
@ -370,16 +370,16 @@ Return Value:
|
|||
const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
|
||||
const int64_t ihEnd64 = ihStart64 + KernelHeight;
|
||||
|
||||
const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
|
||||
const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
|
||||
const size_t ihStart = size_t(std::max(ihStart64, int64_t(0)));
|
||||
const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight)));
|
||||
|
||||
for (size_t pw = 0; pw < OutputWidth; pw++) {
|
||||
|
||||
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
|
||||
const int64_t iwEnd64 = iwStart64 + KernelWidth;
|
||||
|
||||
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
|
||||
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
|
||||
const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
|
||||
const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));
|
||||
|
||||
float m = PoolingType::InitialValue();
|
||||
|
||||
|
|
@ -604,14 +604,12 @@ Return Value:
|
|||
break;
|
||||
}
|
||||
|
||||
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
|
||||
MlasStoreLaneFloat32x4<0>(Output, Reduction);
|
||||
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
MlasStoreLowHalfFloat32x4(Output, Reduction);
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
MlasStoreLaneFloat32x4<0>(Output, Reduction);
|
||||
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
|
||||
#endif
|
||||
|
||||
Output += 2;
|
||||
|
|
@ -688,24 +686,24 @@ Return Value:
|
|||
const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth;
|
||||
const int64_t idEnd64 = idStart64 + KernelDepth;
|
||||
|
||||
const size_t idStart = size_t((std::max)(idStart64, int64_t(0)));
|
||||
const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth)));
|
||||
const size_t idStart = size_t(std::max(idStart64, int64_t(0)));
|
||||
const size_t idEnd = size_t(std::min(idEnd64, int64_t(InputDepth)));
|
||||
|
||||
for (size_t ph = 0; ph < OutputHeight; ph++) {
|
||||
|
||||
const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
|
||||
const int64_t ihEnd64 = ihStart64 + KernelHeight;
|
||||
|
||||
const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
|
||||
const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
|
||||
const size_t ihStart = size_t(std::max(ihStart64, int64_t(0)));
|
||||
const size_t ihEnd = size_t(std::min(ihEnd64, int64_t(InputHeight)));
|
||||
|
||||
for (size_t pw = 0; pw < OutputWidth; pw++) {
|
||||
|
||||
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
|
||||
const int64_t iwEnd64 = iwStart64 + KernelWidth;
|
||||
|
||||
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
|
||||
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
|
||||
const size_t iwStart = size_t(std::max(iwStart64, int64_t(0)));
|
||||
const size_t iwEnd = size_t(std::min(iwEnd64, int64_t(InputWidth)));
|
||||
|
||||
float m = PoolingType::InitialValue();
|
||||
|
||||
|
|
@ -976,14 +974,12 @@ Return Value:
|
|||
break;
|
||||
}
|
||||
|
||||
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
|
||||
MlasStoreLaneFloat32x4<0>(Output, Reduction);
|
||||
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
MlasStoreLowHalfFloat32x4(Output, Reduction);
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
MlasStoreLaneFloat32x4<0>(Output, Reduction);
|
||||
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
|
||||
#endif
|
||||
|
||||
Output += 2;
|
||||
|
|
|
|||
|
|
@ -181,48 +181,39 @@ Return Value:
|
|||
}
|
||||
}
|
||||
|
||||
template<typename BType>
|
||||
void
|
||||
MlasGemmU8X8CopyPackBProcessSse(
|
||||
int16_t* D,
|
||||
__m128i BytesRow0,
|
||||
__m128i BytesRow1,
|
||||
__m128i ZeroVector,
|
||||
__m128i BitFlipVector,
|
||||
__m128i ColumnSums[2]
|
||||
)
|
||||
{
|
||||
__m128i BytesInterleaved = _mm_unpacklo_epi8(BytesRow0, BytesRow1);
|
||||
__m128i WordsInterleaved[2];
|
||||
|
||||
//
|
||||
// Zero or sign extend the bytes to words.
|
||||
//
|
||||
BytesInterleaved = _mm_xor_si128(BytesInterleaved, BitFlipVector);
|
||||
|
||||
if (std::is_same<BType, uint8_t>::value) {
|
||||
WordsInterleaved[0] = _mm_unpacklo_epi8(BytesInterleaved, ZeroVector);
|
||||
WordsInterleaved[1] = _mm_unpackhi_epi8(BytesInterleaved, ZeroVector);
|
||||
} else {
|
||||
WordsInterleaved[0] = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8);
|
||||
WordsInterleaved[1] = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8);
|
||||
}
|
||||
__m128i WordsInterleaved0 = _mm_srai_epi16(_mm_unpacklo_epi8(BytesInterleaved, BytesInterleaved), 8);
|
||||
__m128i WordsInterleaved1 = _mm_srai_epi16(_mm_unpackhi_epi8(BytesInterleaved, BytesInterleaved), 8);
|
||||
|
||||
ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved[0]);
|
||||
ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved[1]);
|
||||
ColumnSums[0] = _mm_add_epi16(ColumnSums[0], WordsInterleaved0);
|
||||
ColumnSums[1] = _mm_add_epi16(ColumnSums[1], WordsInterleaved1);
|
||||
|
||||
_mm_storeu_si128((__m128i*)&D[0], WordsInterleaved[0]);
|
||||
_mm_storeu_si128((__m128i*)&D[8], WordsInterleaved[1]);
|
||||
_mm_storeu_si128((__m128i*)&D[0], WordsInterleaved0);
|
||||
_mm_storeu_si128((__m128i*)&D[8], WordsInterleaved1);
|
||||
}
|
||||
|
||||
template<typename BType>
|
||||
void
|
||||
MlasGemmU8X8CopyPackBSse(
|
||||
int16_t* D,
|
||||
const BType* B,
|
||||
const uint8_t* B,
|
||||
size_t ldb,
|
||||
size_t CountN,
|
||||
size_t CountK,
|
||||
int32_t* ColumnSumVector,
|
||||
int16_t offa
|
||||
int16_t offa,
|
||||
bool BTypeIsSigned
|
||||
)
|
||||
/*++
|
||||
|
||||
|
|
@ -256,9 +247,8 @@ Return Value:
|
|||
|
||||
--*/
|
||||
{
|
||||
const __m128i ZeroVector = _mm_setzero_si128();
|
||||
const __m128i OffsetBroadcast = _mm_set1_epi16(offa);
|
||||
BType PaddedMatrixBData[16] = { 0 };
|
||||
const __m128i BitFlipVector = _mm_set1_epi32(BTypeIsSigned ? 0 : 0x80808080);
|
||||
|
||||
//
|
||||
// Process 8 columns of matrix B in a loop.
|
||||
|
|
@ -266,12 +256,12 @@ Return Value:
|
|||
|
||||
while (CountN >= 8) {
|
||||
|
||||
const BType* b = B;
|
||||
const uint8_t* b = B;
|
||||
size_t k = CountK;
|
||||
__m128i ColumnSums[2];
|
||||
|
||||
ColumnSums[0] = ZeroVector;
|
||||
ColumnSums[1] = ZeroVector;
|
||||
ColumnSums[0] = _mm_setzero_si128();
|
||||
ColumnSums[1] = _mm_setzero_si128();
|
||||
|
||||
//
|
||||
// Interleave rows of matrix B and write to the packed buffer.
|
||||
|
|
@ -286,7 +276,7 @@ Return Value:
|
|||
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]);
|
||||
__m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&b[ldb]);
|
||||
|
||||
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums);
|
||||
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums);
|
||||
|
||||
b += ldb * 2;
|
||||
D += 16;
|
||||
|
|
@ -297,7 +287,7 @@ Return Value:
|
|||
|
||||
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&b[0]);
|
||||
|
||||
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums);
|
||||
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums);
|
||||
|
||||
D += 16;
|
||||
}
|
||||
|
|
@ -325,12 +315,15 @@ Return Value:
|
|||
|
||||
if (CountN > 0) {
|
||||
|
||||
const BType* b = B;
|
||||
const uint8_t* b = B;
|
||||
size_t k = CountK;
|
||||
__m128i ColumnSums[2];
|
||||
uint8_t PaddedMatrixBData[16];
|
||||
|
||||
ColumnSums[0] = ZeroVector;
|
||||
ColumnSums[1] = ZeroVector;
|
||||
_mm_storeu_si128((__m128i*)PaddedMatrixBData, BitFlipVector);
|
||||
|
||||
ColumnSums[0] = _mm_setzero_si128();
|
||||
ColumnSums[1] = _mm_setzero_si128();
|
||||
|
||||
//
|
||||
// Interleave rows of matrix B using an intermediate zero padded stack
|
||||
|
|
@ -339,9 +332,9 @@ Return Value:
|
|||
|
||||
while (k >= 2) {
|
||||
|
||||
const BType* bcopy = b;
|
||||
BType* padded = PaddedMatrixBData;
|
||||
BType* padded_end = padded + CountN;
|
||||
const uint8_t* bcopy = b;
|
||||
uint8_t* padded = PaddedMatrixBData;
|
||||
uint8_t* padded_end = padded + CountN;
|
||||
|
||||
do {
|
||||
padded[0] = bcopy[0];
|
||||
|
|
@ -353,7 +346,7 @@ Return Value:
|
|||
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]);
|
||||
__m128i BytesRow1 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[8]);
|
||||
|
||||
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, BytesRow1, ZeroVector, ColumnSums);
|
||||
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BytesRow1, BitFlipVector, ColumnSums);
|
||||
|
||||
b += ldb * 2;
|
||||
D += 16;
|
||||
|
|
@ -362,9 +355,9 @@ Return Value:
|
|||
|
||||
if (k > 0) {
|
||||
|
||||
const BType* bcopy = b;
|
||||
BType* padded = PaddedMatrixBData;
|
||||
BType* padded_end = padded + CountN;
|
||||
const uint8_t* bcopy = b;
|
||||
uint8_t* padded = PaddedMatrixBData;
|
||||
uint8_t* padded_end = padded + CountN;
|
||||
|
||||
do {
|
||||
padded[0] = bcopy[0];
|
||||
|
|
@ -374,7 +367,7 @@ Return Value:
|
|||
|
||||
__m128i BytesRow0 = _mm_loadl_epi64((__m128i*)&PaddedMatrixBData[0]);
|
||||
|
||||
MlasGemmU8X8CopyPackBProcessSse<BType>(D, BytesRow0, ZeroVector, ZeroVector, ColumnSums);
|
||||
MlasGemmU8X8CopyPackBProcessSse(D, BytesRow0, BitFlipVector, BitFlipVector, ColumnSums);
|
||||
}
|
||||
|
||||
//
|
||||
|
|
@ -648,6 +641,10 @@ Return Value:
|
|||
size_t StrideN = MLAS_GEMM_U8X8_STRIDEN_SSE;
|
||||
size_t StrideK = MLAS_GEMM_U8X8_STRIDEK_SSE;
|
||||
|
||||
if (!WorkBlock->BTypeIsSigned) {
|
||||
offb = int8_t(offb ^ 0x80);
|
||||
}
|
||||
|
||||
//
|
||||
// Step through each slice of matrix B along the K dimension.
|
||||
//
|
||||
|
|
@ -656,7 +653,7 @@ Return Value:
|
|||
|
||||
for (size_t k = 0; k < K; k += CountK) {
|
||||
|
||||
CountK = (std::min)(K - k, StrideK);
|
||||
CountK = std::min(K - k, StrideK);
|
||||
|
||||
//
|
||||
// Step through each slice of matrix B along the N dimension.
|
||||
|
|
@ -666,7 +663,7 @@ Return Value:
|
|||
|
||||
for (size_t n = 0; n < N; n += CountN) {
|
||||
|
||||
CountN = (std::min)(N - n, StrideN);
|
||||
CountN = std::min(N - n, StrideN);
|
||||
|
||||
//
|
||||
// Copy a panel of matrix B to a local packed buffer.
|
||||
|
|
@ -674,13 +671,8 @@ Return Value:
|
|||
|
||||
const uint8_t* b = B + n + k * ldb;
|
||||
|
||||
if (WorkBlock->BTypeIsSigned) {
|
||||
MlasGemmU8X8CopyPackBSse(PanelB, (const int8_t*)b, ldb, CountN,
|
||||
CountK, ColumnSumVector, -int16_t(offa));
|
||||
} else {
|
||||
MlasGemmU8X8CopyPackBSse(PanelB, (const uint8_t*)b, ldb, CountN,
|
||||
CountK, ColumnSumVector, -int16_t(offa));
|
||||
}
|
||||
MlasGemmU8X8CopyPackBSse(PanelB, b, ldb, CountN, CountK,
|
||||
ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
|
||||
|
||||
//
|
||||
// Step through each slice of matrix A along the M dimension.
|
||||
|
|
@ -694,7 +686,7 @@ Return Value:
|
|||
|
||||
for (size_t m = 0; m < M; m += CountM) {
|
||||
|
||||
CountM = (std::min)(M - m, StrideM);
|
||||
CountM = std::min(M - m, StrideM);
|
||||
|
||||
//
|
||||
// Copy a panel of matrix A to a local packed buffer.
|
||||
|
|
@ -892,7 +884,7 @@ Return Value:
|
|||
|
||||
for (size_t k = 0; k < K; k += CountK) {
|
||||
|
||||
CountK = (std::min)(K - k, StrideK);
|
||||
CountK = std::min(K - k, StrideK);
|
||||
|
||||
//
|
||||
// Step through each slice of matrix B along the N dimension.
|
||||
|
|
@ -902,7 +894,7 @@ Return Value:
|
|||
|
||||
for (size_t n = 0; n < N; n += CountN) {
|
||||
|
||||
CountN = (std::min)(N - n, StrideN);
|
||||
CountN = std::min(N - n, StrideN);
|
||||
|
||||
//
|
||||
// Copy a panel of matrix B to a local packed buffer.
|
||||
|
|
@ -910,8 +902,8 @@ Return Value:
|
|||
|
||||
const int8_t* b = (const int8_t*)B + n + k * ldb;
|
||||
|
||||
MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN,
|
||||
CountK, ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
|
||||
MlasGemmU8S8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK,
|
||||
ColumnSumVector, -int16_t(offa), WorkBlock->BTypeIsSigned);
|
||||
|
||||
//
|
||||
// Step through each slice of matrix A along the M dimension.
|
||||
|
|
@ -925,7 +917,7 @@ Return Value:
|
|||
|
||||
for (size_t m = 0; m < M; m += CountM) {
|
||||
|
||||
CountM = (std::min)(M - m, StrideM);
|
||||
CountM = std::min(M - m, StrideM);
|
||||
|
||||
//
|
||||
// Copy a panel of matrix A to a local packed buffer.
|
||||
|
|
@ -1038,7 +1030,7 @@ Return Value:
|
|||
|
||||
for (size_t k = 0; k < K; k += CountK) {
|
||||
|
||||
CountK = (std::min)(K - k, StrideK);
|
||||
CountK = std::min(K - k, StrideK);
|
||||
|
||||
//
|
||||
// Step through each slice of matrix B along the N dimension.
|
||||
|
|
@ -1048,16 +1040,16 @@ Return Value:
|
|||
|
||||
for (size_t n = 0; n < N; n += CountN) {
|
||||
|
||||
CountN = (std::min)(N - n, StrideN);
|
||||
CountN = std::min(N - n, StrideN);
|
||||
|
||||
//
|
||||
// Copy a panel of matrix B to a local packed buffer.
|
||||
//
|
||||
|
||||
const uint8_t* b = (const uint8_t*)B + n + k * ldb;
|
||||
const uint8_t* b = B + n + k * ldb;
|
||||
|
||||
MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK, ColumnSumVector,
|
||||
-int16_t(offa));
|
||||
MlasGemmU8U8CopyPackBAvx2(PanelB, b, ldb, CountN, CountK,
|
||||
ColumnSumVector, -int16_t(offa));
|
||||
|
||||
//
|
||||
// Step through each slice of matrix A along the M dimension.
|
||||
|
|
@ -1071,7 +1063,7 @@ Return Value:
|
|||
|
||||
for (size_t m = 0; m < M; m += CountM) {
|
||||
|
||||
CountM = (std::min)(M - m, StrideM);
|
||||
CountM = std::min(M - m, StrideM);
|
||||
|
||||
//
|
||||
// Copy a panel of matrix A to a local packed buffer.
|
||||
|
|
|
|||
|
|
@ -213,7 +213,7 @@ Return Value:
|
|||
|
||||
for (size_t i = InputChannels; i > 0;) {
|
||||
|
||||
const size_t InputChannelsThisIteration = (std::min)(i, BlockSize);
|
||||
const size_t InputChannelsThisIteration = std::min(i, BlockSize);
|
||||
i -= InputChannelsThisIteration;
|
||||
|
||||
const float* s = S;
|
||||
|
|
@ -308,7 +308,7 @@ Return Value:
|
|||
|
||||
for (size_t o = OutputChannels; o > 0;) {
|
||||
|
||||
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
|
||||
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
|
||||
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
|
||||
o -= OutputChannelsThisIteration;
|
||||
|
||||
|
|
@ -415,7 +415,7 @@ Return Value:
|
|||
|
||||
for (size_t o = OutputChannels; o > 0;) {
|
||||
|
||||
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
|
||||
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
|
||||
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
|
||||
o -= OutputChannelsThisIteration;
|
||||
|
||||
|
|
@ -523,7 +523,7 @@ Return Value:
|
|||
|
||||
for (size_t o = OutputChannels; o > 0;) {
|
||||
|
||||
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
|
||||
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
|
||||
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
|
||||
o -= OutputChannelsThisIteration;
|
||||
|
||||
|
|
@ -538,7 +538,7 @@ Return Value:
|
|||
|
||||
for (size_t i = InputChannels; i > 0;) {
|
||||
|
||||
const size_t InputChannelsThisIteration = (std::min)(i, BlockSize);
|
||||
const size_t InputChannelsThisIteration = std::min(i, BlockSize);
|
||||
i -= InputChannelsThisIteration;
|
||||
|
||||
//
|
||||
|
|
@ -671,7 +671,7 @@ Return Value:
|
|||
|
||||
for (size_t o = OutputChannels; o > 0;) {
|
||||
|
||||
const size_t OutputChannelsThisIteration = (std::min)(o, BlockSize);
|
||||
const size_t OutputChannelsThisIteration = std::min(o, BlockSize);
|
||||
const size_t AlignedOutputChannelsThisIteration = OutputChannelsThisIteration & (~3);
|
||||
o -= OutputChannelsThisIteration;
|
||||
|
||||
|
|
|
|||
|
|
@ -422,29 +422,15 @@ Return Value:
|
|||
t1 = o0.val[1];
|
||||
t2 = o1.val[0];
|
||||
t3 = o1.val[1];
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
// N.B. The MSVC version of _MM_TRANSPOSE4_PS uses shufps which is
|
||||
// slightly larger than the below sequence, so manually expand the
|
||||
// matrix transpose.
|
||||
__m128 z0 = _mm_unpacklo_ps(t0, t1);
|
||||
__m128 z1 = _mm_unpackhi_ps(t0, t1);
|
||||
__m128 z2 = _mm_unpacklo_ps(t2, t3);
|
||||
__m128 z3 = _mm_unpackhi_ps(t2, t3);
|
||||
t0 = _mm_movelh_ps(z0, z2);
|
||||
t1 = _mm_movehl_ps(z2, z0);
|
||||
t2 = _mm_movelh_ps(z1, z3);
|
||||
t3 = _mm_movehl_ps(z3, z1);
|
||||
#elif defined(MLAS_VSX_INTRINSICS)
|
||||
__vector float z0 = vec_mergeh(t0, t2);
|
||||
__vector float z1 = vec_mergel(t0, t2);
|
||||
__vector float z2 = vec_mergeh(t1, t3);
|
||||
__vector float z3 = vec_mergel(t1, t3);
|
||||
t0 = vec_mergeh(z0, z2);
|
||||
t1 = vec_mergel(z0, z2);
|
||||
t2 = vec_mergeh(z1, z3);
|
||||
t3 = vec_mergel(z1, z3);
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
MLAS_FLOAT32X4 z0 = MlasInterleaveLowFloat32x4(t0, t2);
|
||||
MLAS_FLOAT32X4 z1 = MlasInterleaveHighFloat32x4(t0, t2);
|
||||
MLAS_FLOAT32X4 z2 = MlasInterleaveLowFloat32x4(t1, t3);
|
||||
MLAS_FLOAT32X4 z3 = MlasInterleaveHighFloat32x4(t1, t3);
|
||||
t0 = MlasInterleaveLowFloat32x4(z0, z2);
|
||||
t1 = MlasInterleaveHighFloat32x4(z0, z2);
|
||||
t2 = MlasInterleaveLowFloat32x4(z1, z3);
|
||||
t3 = MlasInterleaveHighFloat32x4(z1, z3);
|
||||
#endif
|
||||
|
||||
MlasStoreAlignedFloat32x4(&D[0], t0);
|
||||
|
|
@ -639,7 +625,14 @@ Return Value:
|
|||
MLAS_FLOAT32X4 t0 = MlasLoadFloat32x4(&b[0]);
|
||||
MLAS_FLOAT32X4 t1 = MlasLoadFloat32x4(&b[ldb]);
|
||||
|
||||
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
|
||||
#if defined(MLAS_SSE2_INTRINSICS)
|
||||
__m128 v0 = _mm_unpacklo_ps(t0, t1);
|
||||
__m128 v1 = _mm_unpackhi_ps(t0, t1);
|
||||
_mm_storel_pi((__m64*)&d[0], v0);
|
||||
_mm_storeh_pi((__m64*)&d[16], v0);
|
||||
_mm_storel_pi((__m64*)&d[32], v1);
|
||||
_mm_storeh_pi((__m64*)&d[48], v1);
|
||||
#else
|
||||
MlasStoreLaneFloat32x4<0>(&d[0], t0);
|
||||
MlasStoreLaneFloat32x4<0>(&d[1], t1);
|
||||
MlasStoreLaneFloat32x4<1>(&d[16], t0);
|
||||
|
|
@ -648,15 +641,6 @@ Return Value:
|
|||
MlasStoreLaneFloat32x4<2>(&d[33], t1);
|
||||
MlasStoreLaneFloat32x4<3>(&d[48], t0);
|
||||
MlasStoreLaneFloat32x4<3>(&d[49], t1);
|
||||
#elif defined(MLAS_SSE2_INTRINSICS)
|
||||
__m128 v0 = _mm_unpacklo_ps(t0, t1);
|
||||
__m128 v1 = _mm_unpackhi_ps(t0, t1);
|
||||
_mm_storel_pi((__m64*)&d[0], v0);
|
||||
_mm_storeh_pi((__m64*)&d[16], v0);
|
||||
_mm_storel_pi((__m64*)&d[32], v1);
|
||||
_mm_storeh_pi((__m64*)&d[48], v1);
|
||||
#else
|
||||
#error Unsupported architecture.
|
||||
#endif
|
||||
|
||||
d += 2;
|
||||
|
|
|
|||
|
|
@ -539,7 +539,7 @@ struct MLAS_NCHWC_GROUPED_CONV_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
|
|||
|
||||
void ComputeFilterCount(void)
|
||||
{
|
||||
FilterCount = (std::min)(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
|
||||
FilterCount = std::min(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
|
||||
}
|
||||
|
||||
void PrepareWork(int32_t Index)
|
||||
|
|
@ -686,7 +686,7 @@ struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
|
|||
// Compute the number of output lines to process in this iteration.
|
||||
//
|
||||
|
||||
size_t WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
|
||||
size_t WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);
|
||||
|
||||
//
|
||||
// Walk over each input image organized as a set of NCHWc blocks.
|
||||
|
|
@ -898,7 +898,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
|
|||
size_t WorkThisIteration;
|
||||
|
||||
if (StrideHeight == 1 && StrideWidth == 1) {
|
||||
WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
|
||||
WorkThisIteration = std::min(WorkRemaining, OutputHeight - ph);
|
||||
} else {
|
||||
WorkThisIteration = 1;
|
||||
}
|
||||
|
|
@ -923,7 +923,7 @@ struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
|
|||
|
||||
constexpr size_t MaximumInputChannelBatch = 128;
|
||||
|
||||
InputChannelBatch = (std::min)(InputChannels - ic, MaximumInputChannelBatch);
|
||||
InputChannelBatch = std::min(InputChannels - ic, MaximumInputChannelBatch);
|
||||
|
||||
unsigned KernelFlags = ComputeKernelFlags(ic, InputChannelBatch);
|
||||
|
||||
|
|
|
|||
|
|
@ -119,7 +119,7 @@ Return Value:
|
|||
|
||||
float Value = *Input++;
|
||||
|
||||
Value = (std::min)(MlasTanhConstants.UpperRange, (std::max)(MlasTanhConstants.LowerRange, Value));
|
||||
Value = std::min(MlasTanhConstants.UpperRange, std::max(MlasTanhConstants.LowerRange, Value));
|
||||
|
||||
float ValueSquared = Value * Value;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue