onnxruntime/onnxruntime/core/mlas/lib/quantize.cpp
Chen Fu 8140e3fde5
Make requantize a qgemm post processor (#7850)
Description:
Change requantize interface so it can be processed block by block. This enable as to make requantize to be a post processor of QGEMM.

Motivation and Context

Previous changes show we improve performance by parallelize batch gemm. Unfortunately we could not parallelize the batch gemm in quantize_linear_matmul due to the requantize operation at the end of each gemm. By changing requantize to be a qgemm post processor, we now can parallelize the batch operation.

Co-authored-by: Chen Fu <fuchen@microsoft.com>
2021-05-27 15:05:04 -07:00

923 lines
25 KiB
C++

/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
quantize.cpp
Abstract:
This module implements routines to quantize buffers.
For quantization formula as specified in the ONNX operator documentation is:
Output = Saturate(RoundToEven(Input / Scale) + ZeroPoint)
--*/
#include "mlasi.h"
#if defined(MLAS_NEON64_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS)
//
// QuantizeLinear implementation using NEON or SSE2 intrinsics.
//
MLAS_FORCEINLINE
MLAS_INT32X4
MlasQuantizeLinearVector(
MLAS_FLOAT32X4 FloatVector,
MLAS_FLOAT32X4 ScaleVector,
MLAS_FLOAT32X4 MinimumValueVector,
MLAS_FLOAT32X4 MaximumValueVector,
MLAS_INT32X4 ZeroPointVector
)
{
//
// Scale the input vector and clamp the values to the minimum and maximum
// range (adjusted by the zero point value).
//
FloatVector = MlasDivideFloat32x4(FloatVector, ScaleVector);
#if defined(MLAS_NEON64_INTRINSICS)
// N.B. FMINNM and FMAXNM returns the numeric value if either of the values
// is a NaN.
FloatVector = vmaxnmq_f32(FloatVector, MinimumValueVector);
FloatVector = vminnmq_f32(FloatVector, MaximumValueVector);
#else
// N.B. MINPS and MAXPS returns the value from the second vector if the
// value from the first vector is a NaN.
FloatVector = _mm_max_ps(FloatVector, MinimumValueVector);
FloatVector = _mm_min_ps(FloatVector, MaximumValueVector);
#endif
//
// Convert the float values to integer using "round to nearest even" and
// then shift the output range using the zero point value.
//
#if defined(MLAS_NEON64_INTRINSICS)
auto IntegerVector = vcvtnq_s32_f32(FloatVector);
IntegerVector = vaddq_s32(IntegerVector, ZeroPointVector);
#else
// N.B. Assumes MXCSR has been configured with the default rounding mode of
// "round to nearest even".
auto IntegerVector = _mm_cvtps_epi32(FloatVector);
IntegerVector = _mm_add_epi32(IntegerVector, ZeroPointVector);
#endif
return IntegerVector;
}
template<typename OutputType>
MLAS_INT32X4
MlasQuantizeLinearPackBytes(
MLAS_INT32X4 IntegerVector
);
#if defined(MLAS_NEON64_INTRINSICS)
template<typename OutputType>
MLAS_INT32X4
MlasQuantizeLinearPackBytes(
MLAS_INT32X4 IntegerVector
)
{
//
// Swizzle the least significant byte from each int32_t element to the
// bottom four bytes of the vector register.
//
uint16x8_t WordVector = vreinterpretq_u16_s32(IntegerVector);
WordVector = vuzp1q_u16(WordVector, WordVector);
uint8x16_t ByteVector = vreinterpretq_u8_u16(WordVector);
ByteVector = vuzp1q_u8(ByteVector, ByteVector);
return vreinterpretq_s32_u8(ByteVector);
}
#else
template<>
MLAS_FORCEINLINE
MLAS_INT32X4
MlasQuantizeLinearPackBytes<uint8_t>(
MLAS_INT32X4 IntegerVector
)
{
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
return IntegerVector;
}
template<>
MLAS_FORCEINLINE
MLAS_INT32X4
MlasQuantizeLinearPackBytes<int8_t>(
MLAS_INT32X4 IntegerVector
)
{
IntegerVector = _mm_packs_epi16(IntegerVector, IntegerVector);
IntegerVector = _mm_packs_epi16(IntegerVector, IntegerVector);
return IntegerVector;
}
#endif
template<typename OutputType>
void
MLASCALL
MlasQuantizeLinearKernel(
const float* Input,
OutputType* Output,
size_t N,
float Scale,
OutputType ZeroPoint
)
/*++
Routine Description:
This routine quantizes the input buffer using the supplied quantization
parameters.
Arguments:
Input - Supplies the input buffer.
Output - Supplies the output buffer.
N - Supplies the number of elements to process.
Scale - Supplies the quantization scale.
ZeroPoint - Supplies the quantization zero point value.
Return Value:
None.
--*/
{
constexpr int32_t MinimumValue = std::numeric_limits<OutputType>::min();
constexpr int32_t MaximumValue = std::numeric_limits<OutputType>::max();
auto ScaleVector = MlasBroadcastFloat32x4(Scale);
auto MinimumValueVector = MlasBroadcastFloat32x4(float(MinimumValue - ZeroPoint));
auto MaximumValueVector = MlasBroadcastFloat32x4(float(MaximumValue - ZeroPoint));
auto ZeroPointVector = MlasBroadcastInt32x4(ZeroPoint);
while (N >= 4) {
auto FloatVector = MlasLoadFloat32x4(Input);
auto IntegerVector = MlasQuantizeLinearVector(FloatVector, ScaleVector,
MinimumValueVector, MaximumValueVector, ZeroPointVector);
IntegerVector = MlasQuantizeLinearPackBytes<OutputType>(IntegerVector);
#if defined(MLAS_NEON64_INTRINSICS)
vst1q_lane_s32((int32_t*)Output, IntegerVector, 0);
#else
*((int32_t*)Output) = _mm_cvtsi128_si32(IntegerVector);
#endif
Input += 4;
Output += 4;
N -= 4;
}
for (size_t n = 0; n < N; n++) {
#if defined(MLAS_NEON64_INTRINSICS)
auto FloatVector = vld1q_dup_f32(Input + n);
#else
auto FloatVector = _mm_load_ss(Input + n);
#endif
auto IntegerVector = MlasQuantizeLinearVector(FloatVector, ScaleVector,
MinimumValueVector, MaximumValueVector, ZeroPointVector);
#if defined(MLAS_NEON64_INTRINSICS)
vst1q_lane_u8((uint8_t*)Output + n, vreinterpretq_u8_s32(IntegerVector), 0);
#else
*((uint8_t*)Output + n) = (uint8_t)_mm_cvtsi128_si32(IntegerVector);
#endif
}
}
void
MLASCALL
MlasQuantizeLinearS8Kernel(
const float* Input,
int8_t* Output,
size_t N,
float Scale,
int8_t ZeroPoint
)
{
MlasQuantizeLinearKernel<int8_t>(Input, Output, N, Scale, ZeroPoint);
}
void
MLASCALL
MlasQuantizeLinearU8Kernel(
const float* Input,
uint8_t* Output,
size_t N,
float Scale,
uint8_t ZeroPoint
)
{
MlasQuantizeLinearKernel<uint8_t>(Input, Output, N, Scale, ZeroPoint);
}
template<>
void
MLASCALL
MlasQuantizeLinear<int8_t>(
const float* Input,
int8_t* Output,
size_t N,
float Scale,
int8_t ZeroPoint
)
{
#if defined(MLAS_TARGET_AMD64)
MlasPlatform.QuantizeLinearS8Kernel(
#else
MlasQuantizeLinearS8Kernel(
#endif
Input, Output, N, Scale, ZeroPoint);
}
template<>
void
MLASCALL
MlasQuantizeLinear<uint8_t>(
const float* Input,
uint8_t* Output,
size_t N,
float Scale,
uint8_t ZeroPoint
)
{
#if defined(MLAS_TARGET_AMD64)
MlasPlatform.QuantizeLinearU8Kernel(
#else
MlasQuantizeLinearU8Kernel(
#endif
Input, Output, N, Scale, ZeroPoint);
}
#else
//
// QuantizeLinear implementation using the C++ runtime.
//
template<typename OutputType>
void
MLASCALL
MlasQuantizeLinear(
const float* Input,
OutputType* Output,
size_t N,
float Scale,
OutputType ZeroPoint
)
/*++
Routine Description:
This routine quantizes the input buffer using the supplied quantization
parameters.
Arguments:
Input - Supplies the input buffer.
Output - Supplies the output buffer.
N - Supplies the number of elements to process.
Scale - Supplies the quantization scale.
ZeroPoint - Supplies the quantization zero point value.
Return Value:
None.
--*/
{
constexpr int32_t MinimumValue = std::numeric_limits<OutputType>::min();
constexpr int32_t MaximumValue = std::numeric_limits<OutputType>::max();
for (size_t n = 0; n < N; n++) {
float FloatValue = std::nearbyintf(Input[n] / Scale) + float(ZeroPoint);
FloatValue = std::max(FloatValue, float(MinimumValue));
FloatValue = std::min(FloatValue, float(MaximumValue));
Output[n] = (OutputType)(int32_t)FloatValue;
}
}
template
void
MLASCALL
MlasQuantizeLinear<int8_t>(
const float* Input,
int8_t* Output,
size_t N,
float Scale,
int8_t ZeroPoint
);
template
void
MLASCALL
MlasQuantizeLinear<uint8_t>(
const float* Input,
uint8_t* Output,
size_t N,
float Scale,
uint8_t ZeroPoint
);
#endif
#if defined(MLAS_SSE2_INTRINSICS)
void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
size_t InputLeadingDimension,
uint8_t* Output,
size_t OutputLeadingDimension,
const int32_t* Bias,
const float* Scale,
bool PerColumnScale,
uint8_t ZeroPoint,
size_t StartM,
size_t StartN,
size_t CountM,
size_t CountN
)
{
const __m128 PerMatrixScaleVector = PerColumnScale ? _mm_setzero_ps() : _mm_load1_ps(Scale);
const __m128 MinimumValueVector = _mm_set1_ps(float(0 - ZeroPoint));
const __m128 MaximumValueVector = _mm_set1_ps(float(255 - ZeroPoint));
const __m128i ZeroPointVector = _mm_set1_epi32(ZeroPoint);
if (nullptr != Bias) {
Bias += StartN;
}
if (PerColumnScale) {
Scale += StartN;
}
Input += StartM * InputLeadingDimension + StartN;
Output += StartM * OutputLeadingDimension + StartN;
//
// Step through each row of the output matrix.
//
while (CountM-- > 0) {
const int32_t* bias = Bias;
const float* scale = PerColumnScale ? Scale : nullptr;
size_t n = CountN;
auto* RowInput = Input;
auto* RowOutput = Output;
//
// Process 16 columns of the matrices at a time.
//
while (n >= 16) {
//
// Load the input data and optionally add the per-column bias.
//
__m128i IntegerVector0 = _mm_loadu_si128((const __m128i*)&RowInput[0]);
__m128i IntegerVector1 = _mm_loadu_si128((const __m128i*)&RowInput[4]);
__m128i IntegerVector2 = _mm_loadu_si128((const __m128i*)&RowInput[8]);
__m128i IntegerVector3 = _mm_loadu_si128((const __m128i*)&RowInput[12]);
RowInput += 16;
if (bias != nullptr) {
IntegerVector0 = _mm_add_epi32(IntegerVector0, _mm_loadu_si128((const __m128i *)&bias[0]));
IntegerVector1 = _mm_add_epi32(IntegerVector1, _mm_loadu_si128((const __m128i *)&bias[4]));
IntegerVector2 = _mm_add_epi32(IntegerVector2, _mm_loadu_si128((const __m128i *)&bias[8]));
IntegerVector3 = _mm_add_epi32(IntegerVector3, _mm_loadu_si128((const __m128i *)&bias[12]));
bias += 16;
}
//
// Convert to integer values to float and apply the per-tensor or
// per-column scaling.
//
__m128 FloatVector0 = _mm_cvtepi32_ps(IntegerVector0);
__m128 FloatVector1 = _mm_cvtepi32_ps(IntegerVector1);
__m128 FloatVector2 = _mm_cvtepi32_ps(IntegerVector2);
__m128 FloatVector3 = _mm_cvtepi32_ps(IntegerVector3);
if (scale != nullptr) {
FloatVector0 = _mm_mul_ps(FloatVector0, _mm_loadu_ps(&scale[0]));
FloatVector1 = _mm_mul_ps(FloatVector1, _mm_loadu_ps(&scale[4]));
FloatVector2 = _mm_mul_ps(FloatVector2, _mm_loadu_ps(&scale[8]));
FloatVector3 = _mm_mul_ps(FloatVector3, _mm_loadu_ps(&scale[12]));
scale += 16;
} else {
FloatVector0 = _mm_mul_ps(FloatVector0, PerMatrixScaleVector);
FloatVector1 = _mm_mul_ps(FloatVector1, PerMatrixScaleVector);
FloatVector2 = _mm_mul_ps(FloatVector2, PerMatrixScaleVector);
FloatVector3 = _mm_mul_ps(FloatVector3, PerMatrixScaleVector);
}
FloatVector0 = _mm_max_ps(FloatVector0, MinimumValueVector);
FloatVector1 = _mm_max_ps(FloatVector1, MinimumValueVector);
FloatVector2 = _mm_max_ps(FloatVector2, MinimumValueVector);
FloatVector3 = _mm_max_ps(FloatVector3, MinimumValueVector);
FloatVector0 = _mm_min_ps(FloatVector0, MaximumValueVector);
FloatVector1 = _mm_min_ps(FloatVector1, MaximumValueVector);
FloatVector2 = _mm_min_ps(FloatVector2, MaximumValueVector);
FloatVector3 = _mm_min_ps(FloatVector3, MaximumValueVector);
IntegerVector0 = _mm_cvtps_epi32(FloatVector0);
IntegerVector1 = _mm_cvtps_epi32(FloatVector1);
IntegerVector2 = _mm_cvtps_epi32(FloatVector2);
IntegerVector3 = _mm_cvtps_epi32(FloatVector3);
IntegerVector0 = _mm_add_epi32(IntegerVector0, ZeroPointVector);
IntegerVector1 = _mm_add_epi32(IntegerVector1, ZeroPointVector);
IntegerVector2 = _mm_add_epi32(IntegerVector2, ZeroPointVector);
IntegerVector3 = _mm_add_epi32(IntegerVector3, ZeroPointVector);
__m128i WordVector0 = _mm_packus_epi16(IntegerVector0, IntegerVector1);
__m128i WordVector1 = _mm_packus_epi16(IntegerVector2, IntegerVector3);
__m128i ByteVector = _mm_packus_epi16(WordVector0, WordVector1);
_mm_storeu_si128((__m128i*)RowOutput, ByteVector);
RowOutput += 16;
n -= 16;
}
//
// Process the remaining columns of the matrices.
//
while (n > 0) {
//
// Load the input data and optionally add the per-column bias.
//
__m128i IntegerVector;
if (n >= 4) {
IntegerVector = _mm_loadu_si128((const __m128i*)&RowInput[0]);
RowInput += 4;
if (bias != nullptr) {
IntegerVector = _mm_add_epi32(IntegerVector, _mm_loadu_si128((const __m128i*)&bias[0]));
bias += 4;
}
} else {
int32_t IntegerValue = *RowInput++;
if (bias != nullptr) {
IntegerValue += *bias++;
}
IntegerVector = _mm_cvtsi32_si128(IntegerValue);
}
//
// Convert to integer values to float and apply the per-tensor or
// per-column scaling.
//
__m128 FloatVector = _mm_cvtepi32_ps(IntegerVector);
__m128 ScaleVector;
if (scale != nullptr) {
if (n >= 4) {
ScaleVector = _mm_loadu_ps(scale);
scale += 4;
} else {
ScaleVector = _mm_load_ss(scale);
scale += 1;
}
} else {
ScaleVector = PerMatrixScaleVector;
}
FloatVector = _mm_mul_ps(FloatVector, ScaleVector);
FloatVector = _mm_max_ps(FloatVector, MinimumValueVector);
FloatVector = _mm_min_ps(FloatVector, MaximumValueVector);
IntegerVector = _mm_cvtps_epi32(FloatVector);
IntegerVector = _mm_add_epi32(IntegerVector, ZeroPointVector);
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
IntegerVector = _mm_packus_epi16(IntegerVector, IntegerVector);
uint32_t OutputValue = uint32_t(_mm_cvtsi128_si32(IntegerVector));
if (n >= 4) {
*reinterpret_cast<uint32_t*>(RowOutput) = OutputValue;
RowOutput += 4;
n -= 4;
} else {
*RowOutput = uint8_t(OutputValue);
RowOutput += 1;
n -= 1;
}
}
// Next Row
Input += InputLeadingDimension;
Output += OutputLeadingDimension;
}
}
#elif defined(MLAS_NEON64_INTRINSICS)
void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
size_t InputLeadingDimension,
uint8_t* Output,
size_t OutputLeadingDimension,
const int32_t* Bias,
const float* Scale,
bool PerColumnScale,
uint8_t ZeroPoint,
size_t StartM,
size_t StartN,
size_t CountM,
size_t CountN
)
{
const float32x4_t PerMatrixScaleVector = PerColumnScale ? vdupq_n_f32(0) : vld1q_dup_f32(Scale);
const int16x8_t ZeroPointVector = vdupq_n_s16(ZeroPoint);
if (nullptr != Bias) {
Bias += StartN;
}
if (PerColumnScale) {
Scale += StartN;
}
Input += StartM * InputLeadingDimension + StartN;
Output += StartM * OutputLeadingDimension + StartN;
//
// Step through each row of the output matrix.
//
while (CountM-- > 0) {
const int32_t* bias = Bias;
const float* scale = PerColumnScale ? Scale : nullptr;
size_t n = CountN;
auto* RowInput = Input;
auto* RowOutput = Output;
//
// Process 16 columns of the matrices at a time.
//
while (n >= 16) {
//
// Load the input data and optionally add the per-column bias.
//
int32x4x4_t IntegerVector;
IntegerVector.val[0] = vld1q_s32(&RowInput[0]);
IntegerVector.val[1] = vld1q_s32(&RowInput[4]);
IntegerVector.val[2] = vld1q_s32(&RowInput[8]);
IntegerVector.val[3] = vld1q_s32(&RowInput[12]);
RowInput += 16;
if (bias != nullptr) {
IntegerVector.val[0] = vaddq_s32(IntegerVector.val[0], vld1q_s32(&bias[0]));
IntegerVector.val[1] = vaddq_s32(IntegerVector.val[1], vld1q_s32(&bias[4]));
IntegerVector.val[2] = vaddq_s32(IntegerVector.val[2], vld1q_s32(&bias[8]));
IntegerVector.val[3] = vaddq_s32(IntegerVector.val[3], vld1q_s32(&bias[12]));
bias += 16;
}
//
// Convert to integer values to float and apply the per-tensor or
// per-column scaling.
//
float32x4x4_t FloatVector;
FloatVector.val[0] = vcvtq_f32_s32(IntegerVector.val[0]);
FloatVector.val[1] = vcvtq_f32_s32(IntegerVector.val[1]);
FloatVector.val[2] = vcvtq_f32_s32(IntegerVector.val[2]);
FloatVector.val[3] = vcvtq_f32_s32(IntegerVector.val[3]);
if (scale != nullptr) {
float32x4x4_t PerColumnScaleVector;
PerColumnScaleVector.val[0] = vld1q_f32(&scale[0]);
PerColumnScaleVector.val[1] = vld1q_f32(&scale[4]);
PerColumnScaleVector.val[2] = vld1q_f32(&scale[8]);
PerColumnScaleVector.val[3] = vld1q_f32(&scale[12]);
scale += 16;
FloatVector.val[0] = vmulq_f32(FloatVector.val[0], PerColumnScaleVector.val[0]);
FloatVector.val[1] = vmulq_f32(FloatVector.val[1], PerColumnScaleVector.val[1]);
FloatVector.val[2] = vmulq_f32(FloatVector.val[2], PerColumnScaleVector.val[2]);
FloatVector.val[3] = vmulq_f32(FloatVector.val[3], PerColumnScaleVector.val[3]);
} else {
FloatVector.val[0] = vmulq_f32(FloatVector.val[0], PerMatrixScaleVector);
FloatVector.val[1] = vmulq_f32(FloatVector.val[1], PerMatrixScaleVector);
FloatVector.val[2] = vmulq_f32(FloatVector.val[2], PerMatrixScaleVector);
FloatVector.val[3] = vmulq_f32(FloatVector.val[3], PerMatrixScaleVector);
}
//
// Convert the float values to integer using "round to nearest even".
// Results are saturated to the range of int32_t.
//
IntegerVector.val[0] = vcvtnq_s32_f32(FloatVector.val[0]);
IntegerVector.val[1] = vcvtnq_s32_f32(FloatVector.val[1]);
IntegerVector.val[2] = vcvtnq_s32_f32(FloatVector.val[2]);
IntegerVector.val[3] = vcvtnq_s32_f32(FloatVector.val[3]);
//
// Pack the integers with saturation to 16-bit values and shift by
// the zero point, then pack the integers again to unsigned bytes.
//
int16x8x2_t WordVector;
WordVector.val[0] = vqmovn_high_s32(vqmovn_s32(IntegerVector.val[0]), IntegerVector.val[1]);
WordVector.val[1] = vqmovn_high_s32(vqmovn_s32(IntegerVector.val[2]), IntegerVector.val[3]);
WordVector.val[0] = vqaddq_s16(WordVector.val[0], ZeroPointVector);
WordVector.val[1] = vqaddq_s16(WordVector.val[1], ZeroPointVector);
vst1q_u8(RowOutput, vqmovun_high_s16(vqmovun_s16(WordVector.val[0]), WordVector.val[1]));
RowOutput += 16;
n -= 16;
}
//
// Process the remaining columns of the matrices.
//
while (n > 0) {
//
// Load the input data and optionally add the per-column bias.
//
int32x4_t IntegerVector;
if (n >= 4) {
IntegerVector = vld1q_s32(&RowInput[0]);
RowInput += 4;
if (bias != nullptr) {
IntegerVector = vaddq_s32(IntegerVector, vld1q_s32(&bias[0]));
bias += 4;
}
} else {
IntegerVector = vld1q_dup_s32(RowInput);
RowInput += 1;
if (bias != nullptr) {
IntegerVector = vaddq_s32(IntegerVector, vld1q_dup_s32(bias));
bias += 1;
}
}
//
// Convert to integer values to float and apply the per-tensor or
// per-column scaling.
//
float32x4_t FloatVector = vcvtq_f32_s32(IntegerVector);
float32x4_t ScaleVector;
if (scale != nullptr) {
if (n >= 4) {
ScaleVector = vld1q_f32(scale);
scale += 4;
} else {
ScaleVector = vld1q_dup_f32(scale);
scale += 1;
}
} else {
ScaleVector = PerMatrixScaleVector;
}
FloatVector = vmulq_f32(FloatVector, ScaleVector);
//
// Convert the float values to integer using "round to nearest even".
// Results are saturated to the range of int32_t.
//
IntegerVector = vcvtnq_s32_f32(FloatVector);
//
// Pack the integers with saturation to 16-bit values and shift by
// the zero point, then pack the integers again to unsigned bytes.
//
int16x8_t WordVector = vcombine_s16(vqmovn_s32(IntegerVector), vdup_n_s16(0));
WordVector = vqaddq_s16(WordVector, ZeroPointVector);
uint8x16_t ByteVector = vcombine_u8(vqmovun_s16(WordVector), vdup_n_u8(0));
if (n >= 4) {
vst1q_lane_u32(reinterpret_cast<uint32_t*>(RowOutput),
vreinterpretq_u32_u8(ByteVector), 0);
RowOutput += 4;
n -= 4;
} else {
vst1q_lane_u8(RowOutput, ByteVector, 0);
RowOutput += 1;
n -= 1;
}
}
// Next Row
Input += InputLeadingDimension;
Output += OutputLeadingDimension;
}
}
#else
void
MLASCALL
MlasRequantizeOutput(
const int32_t* Input,
size_t InputLeadingDimension,
uint8_t* Output,
size_t OutputLeadingDimension,
const int32_t* Bias,
const float* Scale,
bool PerColumnScale,
uint8_t ZeroPoint,
size_t StartM,
size_t StartN,
size_t CountM,
size_t CountN
)
{
const float PerMatrixScaleValue = PerColumnScale ? 0.0f : *Scale;
const float MinimumValue = float(0 - ZeroPoint);
const float MaximumValue = float(255 - ZeroPoint);
if (nullptr != Bias) {
Bias += StartN;
}
if (PerColumnScale) {
Scale += StartN;
}
Input += StartM * InputLeadingDimension + StartN;
Output += StartM * OutputLeadingDimension + StartN;
//
// Step through each row of the output matrix.
//
while (CountM-- > 0) {
const int32_t* bias = Bias;
const float* scale = Scale;
size_t n = CountN;
auto* RowInput = Input;
auto* RowOutput = Output;
while (n > 0) {
int32_t IntegerValue = *RowInput++;
if (bias != nullptr) {
IntegerValue += *bias++;
}
float FloatValue = float(IntegerValue);
float ScaleValue = PerColumnScale ? *scale++ : PerMatrixScaleValue;
FloatValue *= ScaleValue;
FloatValue = std::max(FloatValue, MinimumValue);
FloatValue = std::min(FloatValue, MaximumValue);
//
// Use the fast rounding trick adapted from XNNPACK: bias the floating
// point value by the first floating point value that has no
// fractional bits. The add operation performs the "round to nearest
// even". Extract the mantissa bits from this floating point value to
// obtain the rounded integer value.
//
IntegerValue = int32_t(MlasBitsOfFp32(FloatValue + MLAS_ROUNDING_BIAS_MAGIC)) -
MLAS_ROUNDING_BIAS_MAGIC_BITS;
*RowOutput++ = uint8_t(IntegerValue + ZeroPoint);
n -= 1;
}
// Next Row
Input += InputLeadingDimension;
Output += OutputLeadingDimension;
}
}
#endif
void
MLASCALL
MlasFindMinMaxElement(
const float* Input,
float* Min,
float* Max,
size_t N
)
/*++
Routine Description:
This routine finds the minimum and maximum values of the supplied buffer.
Arguments:
Input - Supplies the input buffer.
Min - Returns the minimum value of the supplied buffer.
Max - Returns the maximum value of the supplied buffer.
N - Supplies the number of elements to process.
Return Value:
None.
--*/
{
#if defined(MLAS_TARGET_AMD64)
MlasPlatform.ReduceMinimumMaximumF32Kernel(Input, Min, Max, N);
#else
MlasReduceMinimumMaximumF32Kernel(Input, Min, Max, N);
#endif
}