onnxruntime/onnxruntime/core/mlas/lib/pooling.cpp
Tracy Sharpe bf1caba2b2
Port MLAS to Power architecture (#3703)
Updates to MLAS to support building for the Power architecture.
2020-04-25 19:31:55 -07:00

1358 lines
41 KiB
C++

/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
pooling.cpp
Abstract:
This module implements the pooling operation.
--*/
#include "mlasi.h"
//
// Define the parameters to execute segments of a pooling operation on worker
// threads.
//
struct MLAS_WORK_BLOCK {
MLAS_POOLING_KIND PoolingKind;
size_t InputShape[3];
size_t InputSize;
size_t OutputShape[3];
int64_t KernelShape[3];
int64_t Padding[6];
int64_t StrideShape[3];
};
//
// Define the prototype of the pooling kernel routine.
//
typedef
void
(MLAS_POOL_KERNEL_ROUTINE)(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
);
typedef MLAS_POOL_KERNEL_ROUTINE* PMLAS_POOL_KERNEL_ROUTINE;
//
// Define the number of elements to allocate on the stack for the reduction
// buffer in the vectorized kernels.
//
#define MLAS_POOL_REDUCTION_BUFFER_STACK 2048
//
// Define the number of reduction buffer elements reserved for over-reading
// an entire vector to avoid special handling at the right edge of the
// buffer.
//
#define MLAS_POOL_REDUCTION_BUFFER_PADDING ((sizeof(MLAS_FLOAT32X4) / sizeof(float)) - 1)
//
// Abstraction for maximum pooling.
//
struct MLAS_MAXIMUM_POOLING
{
static float InitialValue()
{
return std::numeric_limits<float>::lowest();
}
static MLAS_FLOAT32X4 InitialVector()
{
return MlasBroadcastFloat32x4(InitialValue());
}
static float Reduce(float Reduction, float Value)
{
return (std::max)(Reduction, Value);
}
static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
{
return MlasMaximumFloat32x4(Reduction, Value);
}
#if defined(MLAS_NEON64_INTRINSICS)
static float ReduceFloat32x4(MLAS_FLOAT32X4 Reduction)
{
return vmaxvq_f32(Reduction);
}
#elif defined(MLAS_NEON32_INTRINSICS)
static float32x2_t ReducePairwise(float32x2_t Vector0, float32x2_t Vector1)
{
return vpmax_f32(Vector0, Vector1);
}
#endif
static float AveragePool(float Reduction, float Size)
{
MLAS_UNREFERENCED_PARAMETER(Size);
return Reduction;
}
struct DividerVectorContext
{
void PrepareExcludePad(size_t PaddingLeftWidth, size_t InputWidth, size_t KernelWidth)
{
MLAS_UNREFERENCED_PARAMETER(PaddingLeftWidth);
MLAS_UNREFERENCED_PARAMETER(InputWidth);
MLAS_UNREFERENCED_PARAMETER(KernelWidth);
}
void PrepareIncludePad(size_t KernelSize)
{
MLAS_UNREFERENCED_PARAMETER(KernelSize);
}
void StartNextOutputRow(size_t InputRowsCount)
{
MLAS_UNREFERENCED_PARAMETER(InputRowsCount);
}
MLAS_FLOAT32X4 DivideExcludePad(MLAS_FLOAT32X4 Reduction)
{
return Reduction;
}
MLAS_FLOAT32X4 DivideIncludePad(MLAS_FLOAT32X4 Reduction)
{
return Reduction;
}
};
};
//
// Abstraction for average pooling.
//
MLAS_DECLSPEC_ALIGN(static const float MlasInitialReductionInputIndex[], sizeof(MLAS_FLOAT32X4)) = { 0.0f, 1.0f, 2.0f, 3.0f };
struct MLAS_AVERAGE_POOLING
{
static float InitialValue()
{
return 0.0f;
}
static MLAS_FLOAT32X4 InitialVector()
{
return MlasZeroFloat32x4();
}
static float Reduce(float Reduction, float Value)
{
return Reduction + Value;
}
static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
{
return MlasAddFloat32x4(Reduction, Value);
}
#if defined(MLAS_NEON64_INTRINSICS)
static float ReduceFloat32x4(MLAS_FLOAT32X4 Reduction)
{
Reduction = vpaddq_f32(Reduction, Reduction);
Reduction = vpaddq_f32(Reduction, Reduction);
return vgetq_lane_f32(Reduction, 0);
}
#elif defined(MLAS_NEON32_INTRINSICS)
static float32x2_t ReducePairwise(float32x2_t Vector0, float32x2_t Vector1)
{
return vpadd_f32(Vector0, Vector1);
}
#endif
static float AveragePool(float Reduction, float Size)
{
return Reduction / Size;
}
struct DividerVectorContext
{
MLAS_FLOAT32X4 KernelSizeBroadcast;
MLAS_FLOAT32X4 KernelWidthBroadcast;
MLAS_FLOAT32X4 PaddingLowerBound;
MLAS_FLOAT32X4 PaddingUpperBound;
MLAS_FLOAT32X4 ReductionInputIndex;
MLAS_FLOAT32X4 InputRowsBroadcast;
void PrepareExcludePad(size_t PaddingLeftWidth, size_t InputWidth, size_t KernelWidth)
{
KernelWidthBroadcast = MlasBroadcastFloat32x4(float(unsigned(KernelWidth)));
PaddingLowerBound = MlasBroadcastFloat32x4(float(unsigned(PaddingLeftWidth)));
PaddingUpperBound = MlasBroadcastFloat32x4(float(unsigned(PaddingLeftWidth + InputWidth)));
}
void PrepareIncludePad(size_t KernelSize)
{
KernelSizeBroadcast = MlasBroadcastFloat32x4(float(unsigned(KernelSize)));
}
void StartNextOutputRow(size_t InputRowsCount)
{
ReductionInputIndex = MlasLoadFloat32x4(MlasInitialReductionInputIndex);
InputRowsBroadcast = MlasBroadcastFloat32x4(float(unsigned(InputRowsCount)));
}
MLAS_FLOAT32X4 DivideExcludePad(MLAS_FLOAT32X4 Reduction)
{
MLAS_FLOAT32X4 Divisor;
//
// Compute the ending input index for each column and bound the index
// range by the padding indices, then compute the number of input
// column contributions from the delta.
//
MLAS_FLOAT32X4 ReductionInputEndingIndex =
MlasAddFloat32x4(ReductionInputIndex, KernelWidthBroadcast);
MLAS_FLOAT32X4 LowerInputIndex =
MlasMaximumFloat32x4(ReductionInputIndex, PaddingLowerBound);
MLAS_FLOAT32X4 UpperInputIndex =
MlasMinimumFloat32x4(ReductionInputEndingIndex, PaddingUpperBound);
MLAS_FLOAT32X4 InputIndexDelta =
MlasSubtractFloat32x4(UpperInputIndex, LowerInputIndex);
//
// Advance the input index vector for the next iteration.
//
ReductionInputIndex =
MlasAddFloat32x4(ReductionInputIndex, MlasBroadcastFloat32x4(4.0f));
//
// Compute the per-column number of input elements used for the sum.
//
// At the end of the input row, the index range computed above may be
// zero for unused trailing vector elements, so avoid any divide by zero
// penalty by enforcing a minimum of 1.0f.
//
Divisor = MlasMultiplyFloat32x4(InputIndexDelta, InputRowsBroadcast);
Divisor = MlasMaximumFloat32x4(Divisor, MlasBroadcastFloat32x4(1.0f));
return MlasDivideFloat32x4(Reduction, Divisor);
}
MLAS_FLOAT32X4 DivideIncludePad(MLAS_FLOAT32X4 Reduction)
{
return MlasDivideFloat32x4(Reduction, KernelSizeBroadcast);
}
};
};
template<typename PoolingType>
void
MlasPool1DKernel(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
)
/*++
Routine Description:
This routine implements the 1D pooling operation using generic constructs.
Arguments:
WorkBlock - Supplies the structure that contains the pooling parameters.
ChannelCount - Supplies the number of channels to process.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
Return Value:
None.
--*/
{
constexpr size_t WidthShapeIndex = 0;
const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;
const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];
const int64_t KernelWidth = WorkBlock->KernelShape[WidthShapeIndex];
const int64_t PaddingLeftWidth = WorkBlock->Padding[WidthShapeIndex];
const int64_t StrideWidth = WorkBlock->StrideShape[WidthShapeIndex];
for (size_t c = 0; c < ChannelCount; c++) {
for (size_t pw = 0; pw < OutputWidth; pw++) {
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
const int64_t iwEnd64 = iwStart64 + KernelWidth;
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
float m = PoolingType::InitialValue();
for (size_t iw = size_t(iwStart); iw < size_t(iwEnd); iw++) {
m = PoolingType::Reduce(m, Input[iw]);
}
if (PoolingKind == MlasAveragePoolingExcludePad) {
m = PoolingType::AveragePool(m, float(iwEnd - iwStart));
} else {
m = PoolingType::AveragePool(m, float(KernelWidth));
}
*Output++ = m;
}
Input += InputWidth;
}
}
template<typename PoolingType>
void
MlasPool2DKernel(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
)
/*++
Routine Description:
This routine implements the 2D pooling operation using generic constructs.
Arguments:
WorkBlock - Supplies the structure that contains the pooling parameters.
ChannelCount - Supplies the number of channels to process.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
Return Value:
None.
--*/
{
constexpr size_t HeightShapeIndex = 0;
constexpr size_t WidthShapeIndex = 1;
const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;
const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
const size_t InputSize = WorkBlock->InputSize;
const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];
const int64_t KernelHeight = WorkBlock->KernelShape[HeightShapeIndex];
const int64_t KernelWidth = WorkBlock->KernelShape[WidthShapeIndex];
const int64_t PaddingLeftHeight = WorkBlock->Padding[HeightShapeIndex];
const int64_t PaddingLeftWidth = WorkBlock->Padding[WidthShapeIndex];
const int64_t StrideHeight = WorkBlock->StrideShape[HeightShapeIndex];
const int64_t StrideWidth = WorkBlock->StrideShape[WidthShapeIndex];
for (size_t c = 0; c < ChannelCount; c++) {
for (size_t ph = 0; ph < OutputHeight; ph++) {
const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
const int64_t ihEnd64 = ihStart64 + KernelHeight;
const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
for (size_t pw = 0; pw < OutputWidth; pw++) {
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
const int64_t iwEnd64 = iwStart64 + KernelWidth;
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
float m = PoolingType::InitialValue();
for (size_t ih = ihStart; ih < ihEnd; ih++) {
for (size_t iw = iwStart; iw < iwEnd; iw++) {
m = PoolingType::Reduce(m, Input[ih * InputWidth + iw]);
}
}
if (PoolingKind == MlasAveragePoolingExcludePad) {
m = PoolingType::AveragePool(m, float((ihEnd - ihStart) * (iwEnd - iwStart)));
} else {
m = PoolingType::AveragePool(m, float(KernelHeight * KernelWidth));
}
*Output++ = m;
}
}
Input += InputSize;
}
}
template<typename PoolingType>
void
MlasPool2DVectorKernel(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
)
/*++
Routine Description:
This routine implements an optimized 2D pooling operation using vector
instructions.
Arguments:
WorkBlock - Supplies the structure that contains the pooling parameters.
ChannelCount - Supplies the number of channels to process.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
Return Value:
None.
--*/
{
constexpr size_t Dimensions = 2;
constexpr size_t HeightShapeIndex = 0;
constexpr size_t WidthShapeIndex = 1;
const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;
const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
const size_t InputSize = WorkBlock->InputSize;
const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];
const size_t KernelHeight = size_t(WorkBlock->KernelShape[HeightShapeIndex]);
const size_t KernelWidth = size_t(WorkBlock->KernelShape[WidthShapeIndex]);
const size_t PaddingLeftHeight = size_t(WorkBlock->Padding[HeightShapeIndex]);
const size_t PaddingLeftWidth = size_t(WorkBlock->Padding[WidthShapeIndex]);
const size_t PaddingRightWidth = size_t(WorkBlock->Padding[Dimensions + WidthShapeIndex]);
const size_t StrideHeight = size_t(WorkBlock->StrideShape[HeightShapeIndex]);
const size_t StrideWidth = size_t(WorkBlock->StrideShape[WidthShapeIndex]);
float ReductionBuffer[MLAS_POOL_REDUCTION_BUFFER_STACK];
//
// Fill the edges of the reduction buffer with the padding value.
//
float* FillReductionBuffer = ReductionBuffer;
float* FillReductionBufferEnd = FillReductionBuffer + PaddingLeftWidth;
while (FillReductionBuffer < FillReductionBufferEnd) {
*FillReductionBuffer++ = PoolingType::InitialValue();
}
FillReductionBuffer = FillReductionBuffer + InputWidth;
FillReductionBufferEnd = FillReductionBuffer + PaddingRightWidth + MLAS_POOL_REDUCTION_BUFFER_PADDING;
while (FillReductionBuffer < FillReductionBufferEnd) {
*FillReductionBuffer++ = PoolingType::InitialValue();
}
//
// Apply the pooling operation to each channel.
//
typename PoolingType::DividerVectorContext divider;
divider.PrepareExcludePad(PaddingLeftWidth, InputWidth, KernelWidth);
divider.PrepareIncludePad(KernelHeight * KernelWidth);
for (size_t c = 0; c < ChannelCount; c++) {
for (size_t ph = 0; ph < OutputHeight; ph++) {
size_t ihStart = ph * StrideHeight - PaddingLeftHeight;
size_t ihEnd = ihStart + KernelHeight;
if (ihStart >= InputHeight) {
ihStart = 0;
}
if (ihEnd > InputHeight) {
ihEnd = InputHeight;
}
divider.StartNextOutputRow(ihEnd - ihStart);
//
// Reduce the input across the kernel height and store in a local
// reduction buffer.
//
const float* InputRowStart = &Input[ihStart * InputWidth];
const size_t InputRowsCount = ihEnd - ihStart - 1;
size_t InputWidthRemaining = InputWidth;
float* ReductionOutput = &ReductionBuffer[PaddingLeftWidth];
while (InputWidthRemaining >= 4) {
const float* InputRow = InputRowStart;
size_t InputRowsRemaining = InputRowsCount;
MLAS_FLOAT32X4 Reduction = MlasLoadFloat32x4(InputRow);
while (InputRowsRemaining > 0) {
InputRow += InputWidth;
Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(InputRow));
InputRowsRemaining--;
}
MlasStoreFloat32x4(ReductionOutput, Reduction);
ReductionOutput += 4;
InputRowStart += 4;
InputWidthRemaining -= 4;
}
while (InputWidthRemaining > 0) {
const float* InputRow = InputRowStart;
size_t InputRowsRemaining = InputRowsCount;
float Reduction = *InputRow;
while (InputRowsRemaining > 0) {
InputRow += InputWidth;
Reduction = PoolingType::Reduce(Reduction, *InputRow);
InputRowsRemaining--;
}
*ReductionOutput++ = Reduction;
InputRowStart += 1;
InputWidthRemaining -= 1;
}
//
// Reduce the input across the kernel width and store to the output
// tensor.
//
size_t OutputWidthRemaining = OutputWidth;
const float* ReductionInputStart = ReductionBuffer;
do {
const float* ReductionInput = ReductionInputStart;
const float* ReductionInputEnd = ReductionInput + KernelWidth;
MLAS_FLOAT32X4 Reduction = MlasLoadFloat32x4(ReductionInput++);
while (ReductionInput < ReductionInputEnd) {
Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(ReductionInput++));
}
if (PoolingKind == MlasAveragePoolingExcludePad) {
Reduction = divider.DivideExcludePad(Reduction);
} else {
Reduction = divider.DivideIncludePad(Reduction);
}
if (StrideWidth == 1) {
if (OutputWidthRemaining < 4) {
if (OutputWidthRemaining >= 2) {
MlasStoreLowHalfFloat32x4(Output, Reduction);
if (OutputWidthRemaining > 2) {
MlasStoreLaneFloat32x4<2>(Output + 2, Reduction);
}
} else {
MlasStoreLaneFloat32x4<0>(Output, Reduction);
}
Output += OutputWidthRemaining;
break;
}
MlasStoreFloat32x4(Output, Reduction);
Output += 4;
OutputWidthRemaining -= 4;
} else {
if (OutputWidthRemaining == 1) {
MlasStoreLaneFloat32x4<0>(Output++, Reduction);
break;
}
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
MlasStoreLaneFloat32x4<0>(Output, Reduction);
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#elif defined(MLAS_SSE2_INTRINSICS)
Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
MlasStoreLowHalfFloat32x4(Output, Reduction);
#else
#error Unsupported architecture.
#endif
Output += 2;
OutputWidthRemaining -= 2;
}
ReductionInputStart += 4;
} while (OutputWidthRemaining > 0);
}
Input += InputSize;
}
}
template<typename PoolingType>
void
MlasPool3DKernel(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
)
/*++
Routine Description:
This routine implements the 3D pooling operation using generic constructs.
Arguments:
WorkBlock - Supplies the structure that contains the pooling parameters.
ChannelCount - Supplies the number of channels to process.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
Return Value:
None.
--*/
{
constexpr size_t DepthShapeIndex = 0;
constexpr size_t HeightShapeIndex = 1;
constexpr size_t WidthShapeIndex = 2;
const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;
const size_t InputDepth = WorkBlock->InputShape[DepthShapeIndex];
const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
const size_t InputSize = WorkBlock->InputSize;
const size_t OutputDepth = WorkBlock->OutputShape[DepthShapeIndex];
const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];
const int64_t KernelDepth = WorkBlock->KernelShape[DepthShapeIndex];
const int64_t KernelHeight = WorkBlock->KernelShape[HeightShapeIndex];
const int64_t KernelWidth = WorkBlock->KernelShape[WidthShapeIndex];
const int64_t PaddingLeftDepth = WorkBlock->Padding[DepthShapeIndex];
const int64_t PaddingLeftHeight = WorkBlock->Padding[HeightShapeIndex];
const int64_t PaddingLeftWidth = WorkBlock->Padding[WidthShapeIndex];
const int64_t StrideDepth = WorkBlock->StrideShape[DepthShapeIndex];
const int64_t StrideHeight = WorkBlock->StrideShape[HeightShapeIndex];
const int64_t StrideWidth = WorkBlock->StrideShape[WidthShapeIndex];
for (size_t c = 0; c < ChannelCount; c++) {
for (size_t pd = 0; pd < OutputDepth; pd++) {
const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth;
const int64_t idEnd64 = idStart64 + KernelDepth;
const size_t idStart = size_t((std::max)(idStart64, int64_t(0)));
const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth)));
for (size_t ph = 0; ph < OutputHeight; ph++) {
const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
const int64_t ihEnd64 = ihStart64 + KernelHeight;
const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));
for (size_t pw = 0; pw < OutputWidth; pw++) {
const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
const int64_t iwEnd64 = iwStart64 + KernelWidth;
const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));
float m = PoolingType::InitialValue();
for (size_t id = idStart; id < idEnd; id++) {
for (size_t ih = ihStart; ih < ihEnd; ih++) {
for (size_t iw = iwStart; iw < iwEnd; iw++) {
m = PoolingType::Reduce(m, Input[id * InputHeight * InputWidth + ih * InputWidth + iw]);
}
}
}
if (PoolingKind == MlasAveragePoolingExcludePad) {
m = PoolingType::AveragePool(m, float((idEnd - idStart) * (ihEnd - ihStart) * (iwEnd - iwStart)));
} else {
m = PoolingType::AveragePool(m, float(KernelDepth * KernelHeight * KernelWidth));
}
*Output++ = m;
}
}
}
Input += InputSize;
}
}
template<typename PoolingType>
void
MlasPool3DVectorKernel(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
)
/*++
Routine Description:
This routine implements an optimized 2D pooling operation using vector
instructions.
Arguments:
WorkBlock - Supplies the structure that contains the pooling parameters.
ChannelCount - Supplies the number of channels to process.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
Return Value:
None.
--*/
{
constexpr size_t Dimensions = 3;
constexpr size_t DepthShapeIndex = 0;
constexpr size_t HeightShapeIndex = 1;
constexpr size_t WidthShapeIndex = 2;
const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;
const size_t InputDepth = WorkBlock->InputShape[DepthShapeIndex];
const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
const size_t InputSize = WorkBlock->InputSize;
const size_t OutputDepth = WorkBlock->OutputShape[DepthShapeIndex];
const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];
const size_t KernelDepth = size_t(WorkBlock->KernelShape[DepthShapeIndex]);
const size_t KernelHeight = size_t(WorkBlock->KernelShape[HeightShapeIndex]);
const size_t KernelWidth = size_t(WorkBlock->KernelShape[WidthShapeIndex]);
const size_t PaddingLeftDepth = size_t(WorkBlock->Padding[DepthShapeIndex]);
const size_t PaddingLeftHeight = size_t(WorkBlock->Padding[HeightShapeIndex]);
const size_t PaddingLeftWidth = size_t(WorkBlock->Padding[WidthShapeIndex]);
const size_t PaddingRightWidth = size_t(WorkBlock->Padding[Dimensions + WidthShapeIndex]);
const size_t StrideDepth = size_t(WorkBlock->StrideShape[DepthShapeIndex]);
const size_t StrideHeight = size_t(WorkBlock->StrideShape[HeightShapeIndex]);
const size_t StrideWidth = size_t(WorkBlock->StrideShape[WidthShapeIndex]);
float ReductionBuffer[MLAS_POOL_REDUCTION_BUFFER_STACK];
//
// Fill the edges of the reduction buffer with the padding value.
//
float* FillReductionBuffer = ReductionBuffer;
float* FillReductionBufferEnd = FillReductionBuffer + PaddingLeftWidth;
while (FillReductionBuffer < FillReductionBufferEnd) {
*FillReductionBuffer++ = PoolingType::InitialValue();
}
FillReductionBuffer = FillReductionBuffer + InputWidth;
FillReductionBufferEnd = FillReductionBuffer + PaddingRightWidth + MLAS_POOL_REDUCTION_BUFFER_PADDING;
while (FillReductionBuffer < FillReductionBufferEnd) {
*FillReductionBuffer++ = PoolingType::InitialValue();
}
//
// Apply the pooling operation to each channel.
//
typename PoolingType::DividerVectorContext divider;
divider.PrepareExcludePad(PaddingLeftWidth, InputWidth, KernelWidth);
divider.PrepareIncludePad(KernelDepth * KernelHeight * KernelWidth);
for (size_t c = 0; c < ChannelCount; c++) {
for (size_t pd = 0; pd < OutputDepth; pd++) {
size_t idStart = pd * StrideDepth - PaddingLeftDepth;
size_t idEnd = idStart + KernelDepth;
if (idStart >= InputDepth) {
idStart = 0;
}
if (idEnd > InputDepth) {
idEnd = InputDepth;
}
for (size_t ph = 0; ph < OutputHeight; ph++) {
size_t ihStart = ph * StrideHeight - PaddingLeftHeight;
size_t ihEnd = ihStart + KernelHeight;
if (ihStart >= InputHeight) {
ihStart = 0;
}
if (ihEnd > InputHeight) {
ihEnd = InputHeight;
}
divider.StartNextOutputRow((idEnd - idStart) * (ihEnd - ihStart));
//
// Reduce the input across the kernel height and store in a local
// reduction buffer.
//
const float* InputRowStart = &Input[idStart * InputHeight * InputWidth + ihStart * InputWidth];
const size_t InputPlanesCount = idEnd - idStart;
const size_t InputRowsCount = ihEnd - ihStart;
size_t InputWidthRemaining = InputWidth;
float* ReductionOutput = &ReductionBuffer[PaddingLeftWidth];
const size_t InputAdvancePlane = (InputHeight - InputRowsCount) * InputWidth;
while (InputWidthRemaining >= 4) {
const float* InputRow = InputRowStart;
size_t InputPlanesRemaining = InputPlanesCount;
MLAS_FLOAT32X4 Reduction = PoolingType::InitialVector();
do {
size_t InputRowsRemaining = InputRowsCount;
do {
Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(InputRow));
InputRow += InputWidth;
InputRowsRemaining--;
} while (InputRowsRemaining > 0);
InputRow += InputAdvancePlane;
InputPlanesRemaining--;
} while (InputPlanesRemaining > 0);
MlasStoreFloat32x4(ReductionOutput, Reduction);
ReductionOutput += 4;
InputRowStart += 4;
InputWidthRemaining -= 4;
}
while (InputWidthRemaining > 0) {
const float* InputRow = InputRowStart;
size_t InputPlanesRemaining = InputPlanesCount;
float Reduction = PoolingType::InitialValue();
do {
size_t InputRowsRemaining = InputRowsCount;
do {
Reduction = PoolingType::Reduce(Reduction, *InputRow);
InputRow += InputWidth;
InputRowsRemaining--;
} while (InputRowsRemaining > 0);
InputRow += InputAdvancePlane;
InputPlanesRemaining--;
} while (InputPlanesRemaining > 0);
*ReductionOutput++ = Reduction;
InputRowStart += 1;
InputWidthRemaining -= 1;
}
//
// Reduce the input across the kernel width and store to the output
// tensor.
//
size_t OutputWidthRemaining = OutputWidth;
const float* ReductionInputStart = ReductionBuffer;
do {
const float* ReductionInput = ReductionInputStart;
const float* ReductionInputEnd = ReductionInput + KernelWidth;
MLAS_FLOAT32X4 Reduction = MlasLoadFloat32x4(ReductionInput++);
while (ReductionInput < ReductionInputEnd) {
Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(ReductionInput++));
}
if (PoolingKind == MlasAveragePoolingExcludePad) {
Reduction = divider.DivideExcludePad(Reduction);
} else {
Reduction = divider.DivideIncludePad(Reduction);
}
if (StrideWidth == 1) {
if (OutputWidthRemaining < 4) {
if (OutputWidthRemaining >= 2) {
MlasStoreLowHalfFloat32x4(Output, Reduction);
if (OutputWidthRemaining > 2) {
MlasStoreLaneFloat32x4<2>(Output + 2, Reduction);
}
} else {
MlasStoreLaneFloat32x4<0>(Output, Reduction);
}
Output += OutputWidthRemaining;
break;
}
MlasStoreFloat32x4(Output, Reduction);
Output += 4;
OutputWidthRemaining -= 4;
} else {
if (OutputWidthRemaining == 1) {
MlasStoreLaneFloat32x4<0>(Output++, Reduction);
break;
}
#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
MlasStoreLaneFloat32x4<0>(Output, Reduction);
MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#elif defined(MLAS_SSE2_INTRINSICS)
Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
MlasStoreLowHalfFloat32x4(Output, Reduction);
#else
#error Unsupported architecture.
#endif
Output += 2;
OutputWidthRemaining -= 2;
}
ReductionInputStart += 4;
} while (OutputWidthRemaining > 0);
}
}
Input += InputSize;
}
}
template<typename PoolingType>
void
MlasPoolGlobalKernel(
const MLAS_WORK_BLOCK* WorkBlock,
size_t ChannelCount,
const float* Input,
float* Output
)
/*++
Routine Description:
This routine implements a global pooling operation.
Arguments:
WorkBlock - Supplies the structure that contains the pooling parameters.
ChannelCount - Supplies the number of channels to process.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
Return Value:
None.
--*/
{
const size_t InputSize = WorkBlock->InputSize;
const float InputSizeFloat = float(InputSize);
//
// Apply the pooling operation to each channel.
//
for (size_t c = 0; c < ChannelCount; c++) {
size_t InputSizeRemaining = InputSize;
//
// Iterate over the input buffer a vector at a time.
//
MLAS_FLOAT32X4 Reduction = PoolingType::InitialVector();
while (InputSizeRemaining >= 4) {
Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(Input));
Input += 4;
InputSizeRemaining -= 4;
}
//
// Reduce the vector to a single float value.
//
#if defined(MLAS_NEON64_INTRINSICS)
float ReductionValue = PoolingType::ReduceFloat32x4(Reduction);
#elif defined(MLAS_NEON32_INTRINSICS)
float32x2_t ReductionLow = vget_low_f32(Reduction);
float32x2_t ReductionHigh = vget_high_f32(Reduction);
ReductionLow = PoolingType::ReducePairwise(ReductionLow, ReductionHigh);
ReductionLow = PoolingType::ReducePairwise(ReductionLow, ReductionHigh);
float ReductionValue = vget_lane_f32(ReductionLow, 0);
#elif defined(MLAS_SSE2_INTRINSICS)
Reduction = PoolingType::Reduce(Reduction, _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(3, 2, 3, 2)));
Reduction = PoolingType::Reduce(Reduction, _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(1, 1, 1, 1)));
float ReductionValue = _mm_cvtss_f32(Reduction);
#elif defined(MLAS_VSX_INTRINSICS)
Reduction = PoolingType::Reduce(Reduction, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Reduction, 1)));
Reduction = PoolingType::Reduce(Reduction, vec_splat(Reduction, 1));
float ReductionValue = Reduction[0];
#else
#error Unsupported architecture.
#endif
//
// Iterate over the remaining input buffer an element at a time.
//
while (InputSizeRemaining > 0) {
ReductionValue = PoolingType::Reduce(ReductionValue, *Input++);
InputSizeRemaining -= 1;
}
//
// Apply average pooling if necessary.
//
ReductionValue = PoolingType::AveragePool(ReductionValue, InputSizeFloat);
*Output++ = ReductionValue;
}
}
//
// Stores pointers to the pooling kernel routines.
//
static const PMLAS_POOL_KERNEL_ROUTINE MlasPoolGenericKernels[][3] =
{
{
MlasPool1DKernel<MLAS_MAXIMUM_POOLING>,
MlasPool2DKernel<MLAS_MAXIMUM_POOLING>,
MlasPool3DKernel<MLAS_MAXIMUM_POOLING>,
},
{
MlasPool1DKernel<MLAS_AVERAGE_POOLING>,
MlasPool2DKernel<MLAS_AVERAGE_POOLING>,
MlasPool3DKernel<MLAS_AVERAGE_POOLING>,
},
{
MlasPool1DKernel<MLAS_AVERAGE_POOLING>,
MlasPool2DKernel<MLAS_AVERAGE_POOLING>,
MlasPool3DKernel<MLAS_AVERAGE_POOLING>,
},
};
static const PMLAS_POOL_KERNEL_ROUTINE MlasPoolGlobalKernels[] =
{
MlasPoolGlobalKernel<MLAS_MAXIMUM_POOLING>,
MlasPoolGlobalKernel<MLAS_AVERAGE_POOLING>,
MlasPoolGlobalKernel<MLAS_AVERAGE_POOLING>,
};
static const PMLAS_POOL_KERNEL_ROUTINE MlasPoolVectorKernels[][2] =
{
{
MlasPool2DVectorKernel<MLAS_MAXIMUM_POOLING>,
MlasPool3DVectorKernel<MLAS_MAXIMUM_POOLING>,
},
{
MlasPool2DVectorKernel<MLAS_AVERAGE_POOLING>,
MlasPool3DVectorKernel<MLAS_AVERAGE_POOLING>,
},
{
MlasPool2DVectorKernel<MLAS_AVERAGE_POOLING>,
MlasPool3DVectorKernel<MLAS_AVERAGE_POOLING>,
},
};
void
MLASCALL
MlasPool(
MLAS_POOLING_KIND PoolingKind,
size_t Dimensions,
const int64_t* InputShape,
const int64_t* KernelShape,
const int64_t* Padding,
const int64_t* StrideShape,
const int64_t* OutputShape,
const float* Input,
float* Output,
MLAS_THREADPOOL* ThreadPool
)
/*++
Routine Description:
This routine implements the pooling operation.
Arguments:
PoolingKind - Supplies the kind of pooling operation to perform.
Dimensions - Supplies the number of dimensions.
InputShape - Supplies the shape of the input tensor.
KernelShape - Supplies the shape of the kernel transform.
Padding - Supplies the number of padding elements at the edge of the input
tensor.
StrideShape - Supplies the shape of the stride.
OutputShape - Supplies the shape of the output tensor.
Input - Supplies the input tensor.
Output - Supplies the output tensor.
ThreadPool - Supplies the thread pool object to use, else nullptr if the
base library threading support should be used.
Return Value:
None.
--*/
{
MLAS_WORK_BLOCK WorkBlock;
WorkBlock.PoolingKind = PoolingKind;
//
// Compute the total number of channels to process and advance the input
// and output shapes over the batch and channel counts.
//
//TODO: use a safeint here and make sure the result value can fit into int32_t
size_t TotalChannelCount = size_t(InputShape[0]) * size_t(InputShape[1]);
InputShape += 2;
OutputShape += 2;
//
// Save the pooling parameters.
//
size_t InputSize = 1;
size_t OutputSize = 1;
bool InputAndKernelShapeMatch = true;
bool AllStridesAreOne = true;
bool AllPaddingIsZero = true;
bool AllKernelsAreSmall = true;
for (size_t dim = 0; dim < Dimensions; dim++) {
WorkBlock.InputShape[dim] = size_t(InputShape[dim]);
WorkBlock.OutputShape[dim] = size_t(OutputShape[dim]);
if (KernelShape != nullptr) {
WorkBlock.KernelShape[dim] = KernelShape[dim];
} else {
WorkBlock.KernelShape[dim] = InputShape[dim];
}
if (Padding != nullptr) {
WorkBlock.Padding[dim] = Padding[dim];
WorkBlock.Padding[dim + Dimensions] = Padding[dim + Dimensions];
} else {
WorkBlock.Padding[dim] = 0;
WorkBlock.Padding[dim + Dimensions] = 0;
}
if (StrideShape != nullptr) {
WorkBlock.StrideShape[dim] = StrideShape[dim];
} else {
WorkBlock.StrideShape[dim] = 1;
}
InputSize *= WorkBlock.InputShape[dim];
OutputSize *= WorkBlock.OutputShape[dim];
InputAndKernelShapeMatch &= (WorkBlock.KernelShape[dim] == int64_t(WorkBlock.InputShape[dim]));
AllStridesAreOne &= (WorkBlock.StrideShape[dim] == 1);
AllPaddingIsZero &= (WorkBlock.Padding[dim] == 0 && WorkBlock.Padding[dim + Dimensions] == 0);
AllKernelsAreSmall &= (WorkBlock.KernelShape[dim] <= 32);
}
WorkBlock.InputSize = InputSize;
//
// Determine which pooling kernel routine to use.
//
// The vectorized kernels only support strides of 1 or 2. The kernel size
// should be kept low in order to keep the divisors for average pooling to
// be exactly representable as float. The input width plus padding must fit
// in the reduction buffer.
//
PMLAS_POOL_KERNEL_ROUTINE PoolKernelRoutine = MlasPoolGenericKernels[PoolingKind][Dimensions - 1];
if (InputAndKernelShapeMatch && AllStridesAreOne && AllPaddingIsZero) {
PoolKernelRoutine = MlasPoolGlobalKernels[PoolingKind];
} else if (Dimensions >= 2 && WorkBlock.StrideShape[Dimensions - 1] <= 2 && AllKernelsAreSmall) {
int64_t ReductionBufferRemaining = MLAS_POOL_REDUCTION_BUFFER_STACK - MLAS_POOL_REDUCTION_BUFFER_PADDING;
if (ReductionBufferRemaining >= WorkBlock.Padding[Dimensions - 1]) {
ReductionBufferRemaining -= WorkBlock.Padding[Dimensions - 1];
} else {
ReductionBufferRemaining = 0;
}
if (ReductionBufferRemaining >= WorkBlock.Padding[Dimensions * 2 - 1]) {
ReductionBufferRemaining -= WorkBlock.Padding[Dimensions * 2 - 1];
} else {
ReductionBufferRemaining = 0;
}
if (ReductionBufferRemaining >= int64_t(WorkBlock.InputShape[Dimensions - 1])) {
PoolKernelRoutine = MlasPoolVectorKernels[PoolingKind][Dimensions - 2];
}
}
#ifdef MLAS_NO_ONNXRUNTIME_THREADPOOL
MLAS_UNREFERENCED_PARAMETER(ThreadPool);
//
// Execute the pooling kernel routine.
//
#if defined(_OPENMP)
#pragma omp parallel for
for (int64_t c = 0; c < int64_t(TotalChannelCount); c++) {
PoolKernelRoutine(&WorkBlock, 1, Input + c * InputSize, Output + c * OutputSize);
}
#else
PoolKernelRoutine(&WorkBlock, TotalChannelCount, Input, Output);
#endif
#else
//
// Use an external thread pool if one is provided.
// TODO: change to use MlasExecuteThreaded
onnxruntime::concurrency::ThreadPool::TryBatchParallelFor(ThreadPool, static_cast<ptrdiff_t>(TotalChannelCount), [&](ptrdiff_t c) {
PoolKernelRoutine(&WorkBlock, 1, Input + c * InputSize, Output + c * OutputSize);
}, 0);
return;
#endif
}