onnxruntime/onnxruntime/core/mlas/lib/pooling.cpp

/*++

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

Module Name:

    pooling.cpp

Abstract:

    This module implements the pooling operation.

--*/

#include "mlasi.h"

//
// Define the parameters to execute segments of a pooling operation on worker
// threads.
//

struct MLAS_WORK_BLOCK {
    // Pooling variant. The kernels compare this against
    // MlasAveragePoolingExcludePad to choose the averaging divisor.
    MLAS_POOLING_KIND PoolingKind;
    // Spatial extents of one input channel. The 1D kernel reads index 0
    // (width), the 2D kernels read {height, width} and the 3D kernels read
    // {depth, height, width}.
    size_t InputShape[3];
    // Element count the 2D/3D kernels add to the input pointer to step from
    // one channel to the next.
    size_t InputSize;
    // Spatial extents of one output channel, indexed like InputShape.
    size_t OutputShape[3];
    // Pooling window extents, indexed like InputShape.
    int64_t KernelShape[3];
    // Padding amounts. Entries [0, Dimensions) hold the leading (left) padding
    // per dimension; the vector kernels read the trailing (right) padding from
    // index Dimensions + i.
    int64_t Padding[6];
    // Step between consecutive pooling windows, indexed like InputShape.
    int64_t StrideShape[3];
};

//
// Define the prototype of the pooling kernel routine.
//
// Every kernel in this file shares this signature: the work block describing
// the pooling parameters, the number of channels to process, and the input
// and output tensors. The kernels advance both pointers as they walk the
// channels.
//

typedef
void
(MLAS_POOL_KERNEL_ROUTINE)(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    );

typedef MLAS_POOL_KERNEL_ROUTINE* PMLAS_POOL_KERNEL_ROUTINE;

//
// Define the number of elements to allocate on the stack for the reduction
// buffer in the vectorized kernels.
//
// The 2D and 3D vector kernels declare a local float array of this size that
// holds one row laid out as: left padding, reduced input row, right padding,
// and the over-read padding defined below.
//

#define MLAS_POOL_REDUCTION_BUFFER_STACK    2048

//
// Define the number of reduction buffer elements reserved for over-reading
// an entire vector to avoid special handling at the right edge of the
// buffer.
//
// This is one less than the number of floats in a vector: a vector load that
// starts at the last valid element touches that many elements past it.
//

#define MLAS_POOL_REDUCTION_BUFFER_PADDING  ((sizeof(MLAS_FLOAT32X4) / sizeof(float)) - 1)

//
// Abstraction for maximum pooling.
//
// The pooling kernels are templated on a policy type. This policy supplies
// the identity value and the reduction operator for a maximum reduction, and
// turns every averaging hook into a no-op so that the same kernel bodies can
// be shared with average pooling.
//

struct MLAS_MAXIMUM_POOLING
{
    // Identity element for the maximum reduction: the lowest finite float.
    // Also used by the vector kernels as the fill value for padding columns.
    static float InitialValue()
    {
        return std::numeric_limits<float>::lowest();
    }

    // InitialValue() replicated across all vector lanes.
    static MLAS_FLOAT32X4 InitialVector()
    {
        return MlasBroadcastFloat32x4(InitialValue());
    }

    // Scalar reduction step: the larger of the running value and the new
    // element.
    static float Reduce(float Reduction, float Value)
    {
        return (std::max)(Reduction, Value);
    }

    // Vector reduction step: lane-wise maximum.
    static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
    {
        return MlasMaximumFloat32x4(Reduction, Value);
    }

#if defined(MLAS_NEON64_INTRINSICS)

    // Horizontal reduction: the maximum across the four lanes of the vector.
    static float ReduceFloat32x4(MLAS_FLOAT32X4 Reduction)
    {
        return vmaxvq_f32(Reduction);
    }

#elif defined(MLAS_NEON32_INTRINSICS)

    // Pairwise maximum of adjacent lanes of the two input vectors.
    static float32x2_t ReducePairwise(float32x2_t Vector0, float32x2_t Vector1)
    {
        return vpmax_f32(Vector0, Vector1);
    }

#endif

    // Maximum pooling has no averaging step, so the reduction is returned
    // unchanged and the window size is ignored.
    static float AveragePool(float Reduction, float Size)
    {
        MLAS_UNREFERENCED_PARAMETER(Size);

        return Reduction;
    }

    // Divider used by the vector kernels. For maximum pooling every method is
    // a no-op: the Prepare/Start hooks ignore their arguments and the Divide
    // hooks return the reduction untouched.
    struct DividerVectorContext
    {
        void PrepareExcludePad(size_t PaddingLeftWidth, size_t InputWidth, size_t KernelWidth)
        {
            MLAS_UNREFERENCED_PARAMETER(PaddingLeftWidth);
            MLAS_UNREFERENCED_PARAMETER(InputWidth);
            MLAS_UNREFERENCED_PARAMETER(KernelWidth);
        }

        void PrepareIncludePad(size_t KernelSize)
        {
            MLAS_UNREFERENCED_PARAMETER(KernelSize);
        }

        void StartNextOutputRow(size_t InputRowsCount)
        {
            MLAS_UNREFERENCED_PARAMETER(InputRowsCount);
        }

        MLAS_FLOAT32X4 DivideExcludePad(MLAS_FLOAT32X4 Reduction)
        {
            return Reduction;
        }

        MLAS_FLOAT32X4 DivideIncludePad(MLAS_FLOAT32X4 Reduction)
        {
            return Reduction;
        }
    };
};

//
// Abstraction for average pooling.
//
// This policy supplies the identity value and the reduction operator for a
// sum reduction, plus the divide step that turns the sum into an average.
//

//
// Per-lane starting column indices {0, 1, 2, 3}. The divider loads this table
// at the start of every output row to track which reduction buffer column
// each vector lane corresponds to.
//

MLAS_DECLSPEC_ALIGN(static const float MlasInitialReductionInputIndex[], sizeof(MLAS_FLOAT32X4)) = { 0.0f, 1.0f, 2.0f, 3.0f };

struct MLAS_AVERAGE_POOLING
{
    // Identity element for the sum reduction. Also used by the vector kernels
    // as the fill value for padding columns.
    static float InitialValue()
    {
        return 0.0f;
    }

    // A vector with every lane set to zero.
    static MLAS_FLOAT32X4 InitialVector()
    {
        return MlasZeroFloat32x4();
    }

    // Scalar reduction step: accumulate the new element into the running sum.
    static float Reduce(float Reduction, float Value)
    {
        return Reduction + Value;
    }

    // Vector reduction step: lane-wise addition.
    static MLAS_FLOAT32X4 Reduce(MLAS_FLOAT32X4 Reduction, MLAS_FLOAT32X4 Value)
    {
        return MlasAddFloat32x4(Reduction, Value);
    }

#if defined(MLAS_NEON64_INTRINSICS)

    // Horizontal reduction: two rounds of pairwise adds leave the total of
    // all four lanes in every lane; lane 0 is returned.
    static float ReduceFloat32x4(MLAS_FLOAT32X4 Reduction)
    {
        Reduction = vpaddq_f32(Reduction, Reduction);
        Reduction = vpaddq_f32(Reduction, Reduction);

        return vgetq_lane_f32(Reduction, 0);
    }

#elif defined(MLAS_NEON32_INTRINSICS)

    // Pairwise sum of adjacent lanes of the two input vectors.
    static float32x2_t ReducePairwise(float32x2_t Vector0, float32x2_t Vector1)
    {
        return vpadd_f32(Vector0, Vector1);
    }

#endif

    // Converts an accumulated sum into an average over Size elements. No
    // guard against Size == 0 is applied here.
    static float AveragePool(float Reduction, float Size)
    {
        return Reduction / Size;
    }

    // Divider used by the vector kernels. It holds the broadcast constants and
    // the per-row state needed to divide four column sums at a time.
    //
    // NOTE(review): the size_t arguments are narrowed through unsigned before
    // conversion to float; this presumably assumes the values fit in 32 bits
    // -- confirm against the callers.
    struct DividerVectorContext
    {
        // Total window element count; the divisor when padding is included.
        MLAS_FLOAT32X4 KernelSizeBroadcast;
        // Window width, used to derive each lane's ending column index.
        MLAS_FLOAT32X4 KernelWidthBroadcast;
        // First reduction buffer column holding real input (PaddingLeftWidth).
        MLAS_FLOAT32X4 PaddingLowerBound;
        // One past the last reduction buffer column holding real input
        // (PaddingLeftWidth + InputWidth).
        MLAS_FLOAT32X4 PaddingUpperBound;
        // Starting reduction buffer column of the window for each lane; reset
        // by StartNextOutputRow and advanced by DivideExcludePad.
        MLAS_FLOAT32X4 ReductionInputIndex;
        // Row count supplied to StartNextOutputRow for the current output row.
        MLAS_FLOAT32X4 InputRowsBroadcast;

        // Captures the horizontal geometry needed to count the non-padding
        // columns of each window. Called once before the channel loop.
        void PrepareExcludePad(size_t PaddingLeftWidth, size_t InputWidth, size_t KernelWidth)
        {
            KernelWidthBroadcast = MlasBroadcastFloat32x4(float(unsigned(KernelWidth)));
            PaddingLowerBound = MlasBroadcastFloat32x4(float(unsigned(PaddingLeftWidth)));
            PaddingUpperBound = MlasBroadcastFloat32x4(float(unsigned(PaddingLeftWidth + InputWidth)));
        }

        // Captures the full window size used when padding counts toward the
        // average. Called once before the channel loop.
        void PrepareIncludePad(size_t KernelSize)
        {
            KernelSizeBroadcast = MlasBroadcastFloat32x4(float(unsigned(KernelSize)));
        }

        // Resets the lane column indices to {0, 1, 2, 3} and records how many
        // input rows were summed into the reduction buffer for this output
        // row. The 3D kernel passes the product of planes and rows.
        void StartNextOutputRow(size_t InputRowsCount)
        {
            ReductionInputIndex = MlasLoadFloat32x4(MlasInitialReductionInputIndex);
            InputRowsBroadcast = MlasBroadcastFloat32x4(float(unsigned(InputRowsCount)));
        }

        // Divides four column sums by the number of non-padding elements in
        // each lane's window. This method is stateful: every call advances the
        // lane column indices by four, so it must be called exactly once per
        // vector produced by the kernel's width reduction loop.
        MLAS_FLOAT32X4 DivideExcludePad(MLAS_FLOAT32X4 Reduction)
        {
            MLAS_FLOAT32X4 Divisor;

            //
            // Compute the ending input index for each column and bound the index
            // range by the padding indices, then compute the number of input
            // column contributions from the delta.
            //

            MLAS_FLOAT32X4 ReductionInputEndingIndex =
                MlasAddFloat32x4(ReductionInputIndex, KernelWidthBroadcast);

            MLAS_FLOAT32X4 LowerInputIndex =
                MlasMaximumFloat32x4(ReductionInputIndex, PaddingLowerBound);
            MLAS_FLOAT32X4 UpperInputIndex =
                MlasMinimumFloat32x4(ReductionInputEndingIndex, PaddingUpperBound);

            MLAS_FLOAT32X4 InputIndexDelta =
                MlasSubtractFloat32x4(UpperInputIndex, LowerInputIndex);

            //
            // Advance the input index vector for the next iteration.
            //

            ReductionInputIndex =
                MlasAddFloat32x4(ReductionInputIndex, MlasBroadcastFloat32x4(4.0f));

            //
            // Compute the per-column number of input elements used for the sum.
            //
            // At the end of the input row, the index range computed above may be
            // zero for unused trailing vector elements, so avoid any divide by zero
            // penalty by enforcing a minimum of 1.0f.
            //

            Divisor = MlasMultiplyFloat32x4(InputIndexDelta, InputRowsBroadcast);
            Divisor = MlasMaximumFloat32x4(Divisor, MlasBroadcastFloat32x4(1.0f));

            return MlasDivideFloat32x4(Reduction, Divisor);
        }

        // Divides four column sums by the full window size, counting padding
        // elements toward the average.
        MLAS_FLOAT32X4 DivideIncludePad(MLAS_FLOAT32X4 Reduction)
        {
            return MlasDivideFloat32x4(Reduction, KernelSizeBroadcast);
        }
    };
};

template<typename PoolingType>
void
MlasPool1DKernel(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    )
/*++

Routine Description:

    This routine implements the 1D pooling operation using generic constructs.

Arguments:

    WorkBlock - Supplies the structure that contains the pooling parameters.

    ChannelCount - Supplies the number of channels to process.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

Return Value:

    None.

--*/
{
    constexpr size_t WidthShapeIndex = 0;

    const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;

    const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
    const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];

    const int64_t KernelWidth = WorkBlock->KernelShape[WidthShapeIndex];
    const int64_t PaddingLeftWidth = WorkBlock->Padding[WidthShapeIndex];
    const int64_t StrideWidth = WorkBlock->StrideShape[WidthShapeIndex];

    for (size_t c = 0; c < ChannelCount; c++) {

        for (size_t pw = 0; pw < OutputWidth; pw++) {

            const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
            const int64_t iwEnd64 = iwStart64 + KernelWidth;

            const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
            const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));

            float m = PoolingType::InitialValue();

            for (size_t iw = size_t(iwStart); iw < size_t(iwEnd); iw++) {
                m = PoolingType::Reduce(m, Input[iw]);
            }

            if (PoolingKind == MlasAveragePoolingExcludePad) {
                m = PoolingType::AveragePool(m, float(iwEnd - iwStart));
            } else {
                m = PoolingType::AveragePool(m, float(KernelWidth));
            }

            *Output++ = m;
        }

        Input += InputWidth;
    }
}

template<typename PoolingType>
void
MlasPool2DKernel(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    )
/*++

Routine Description:

    This routine implements the 2D pooling operation using generic constructs.

Arguments:

    WorkBlock - Supplies the structure that contains the pooling parameters.

    ChannelCount - Supplies the number of channels to process.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

Return Value:

    None.

--*/
{
    constexpr size_t HeightShapeIndex = 0;
    constexpr size_t WidthShapeIndex = 1;

    const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;

    const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
    const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
    const size_t InputSize = WorkBlock->InputSize;
    const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
    const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];

    const int64_t KernelHeight = WorkBlock->KernelShape[HeightShapeIndex];
    const int64_t KernelWidth = WorkBlock->KernelShape[WidthShapeIndex];
    const int64_t PaddingLeftHeight = WorkBlock->Padding[HeightShapeIndex];
    const int64_t PaddingLeftWidth = WorkBlock->Padding[WidthShapeIndex];
    const int64_t StrideHeight = WorkBlock->StrideShape[HeightShapeIndex];
    const int64_t StrideWidth = WorkBlock->StrideShape[WidthShapeIndex];

    for (size_t c = 0; c < ChannelCount; c++) {

        for (size_t ph = 0; ph < OutputHeight; ph++) {

            const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
            const int64_t ihEnd64 = ihStart64 + KernelHeight;

            const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
            const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));

            for (size_t pw = 0; pw < OutputWidth; pw++) {

                const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
                const int64_t iwEnd64 = iwStart64 + KernelWidth;

                const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
                const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));

                float m = PoolingType::InitialValue();

                for (size_t ih = ihStart; ih < ihEnd; ih++) {
                    for (size_t iw = iwStart; iw < iwEnd; iw++) {
                        m = PoolingType::Reduce(m, Input[ih * InputWidth + iw]);
                    }
                }

                if (PoolingKind == MlasAveragePoolingExcludePad) {
                    m = PoolingType::AveragePool(m, float((ihEnd - ihStart) * (iwEnd - iwStart)));
                } else {
                    m = PoolingType::AveragePool(m, float(KernelHeight * KernelWidth));
                }

                *Output++ = m;
            }
        }

        Input += InputSize;
    }
}

template<typename PoolingType>
void
MlasPool2DVectorKernel(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    )
/*++

Routine Description:

    This routine implements an optimized 2D pooling operation using vector
    instructions.

    Each output row is produced in two passes. The first pass reduces the
    input rows covered by the kernel height into a single row stored in a
    stack buffer that is bracketed by padding values. The second pass slides
    the kernel width across that buffer with overlapping vector loads so that
    four adjacent window reductions are computed at once.

    N.B. The reduction buffer has a fixed size and this routine does not
    check that the padded row fits; presumably the caller only selects this
    routine when it does -- confirm against the dispatch code.

Arguments:

    WorkBlock - Supplies the structure that contains the pooling parameters.

    ChannelCount - Supplies the number of channels to process.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

Return Value:

    None.

--*/
{
    constexpr size_t Dimensions = 2;

    constexpr size_t HeightShapeIndex = 0;
    constexpr size_t WidthShapeIndex = 1;

    const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;

    const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
    const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
    const size_t InputSize = WorkBlock->InputSize;
    const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
    const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];

    //
    // The signed work block parameters are converted to unsigned values; the
    // window arithmetic below relies on unsigned wraparound.
    //

    const size_t KernelHeight = size_t(WorkBlock->KernelShape[HeightShapeIndex]);
    const size_t KernelWidth = size_t(WorkBlock->KernelShape[WidthShapeIndex]);
    const size_t PaddingLeftHeight = size_t(WorkBlock->Padding[HeightShapeIndex]);
    const size_t PaddingLeftWidth = size_t(WorkBlock->Padding[WidthShapeIndex]);
    const size_t PaddingRightWidth = size_t(WorkBlock->Padding[Dimensions + WidthShapeIndex]);
    const size_t StrideHeight = size_t(WorkBlock->StrideShape[HeightShapeIndex]);
    const size_t StrideWidth = size_t(WorkBlock->StrideShape[WidthShapeIndex]);

    float ReductionBuffer[MLAS_POOL_REDUCTION_BUFFER_STACK];

    //
    // Fill the edges of the reduction buffer with the padding value.
    //
    // The edges are written once here; the per-row pass below only rewrites
    // the InputWidth elements between them. The right edge includes the extra
    // over-read elements so the final vector loads see defined values.
    //

    float* FillReductionBuffer = ReductionBuffer;
    float* FillReductionBufferEnd = FillReductionBuffer + PaddingLeftWidth;

    while (FillReductionBuffer < FillReductionBufferEnd) {
        *FillReductionBuffer++ = PoolingType::InitialValue();
    }

    FillReductionBuffer = FillReductionBuffer + InputWidth;
    FillReductionBufferEnd = FillReductionBuffer + PaddingRightWidth + MLAS_POOL_REDUCTION_BUFFER_PADDING;

    while (FillReductionBuffer < FillReductionBufferEnd) {
        *FillReductionBuffer++ = PoolingType::InitialValue();
    }

    //
    // Apply the pooling operation to each channel.
    //

    typename PoolingType::DividerVectorContext divider;
    divider.PrepareExcludePad(PaddingLeftWidth, InputWidth, KernelWidth);
    divider.PrepareIncludePad(KernelHeight * KernelWidth);

    for (size_t c = 0; c < ChannelCount; c++) {

        for (size_t ph = 0; ph < OutputHeight; ph++) {

            //
            // Compute the clipped range of input rows for this output row. A
            // window that starts in the top padding makes the subtraction
            // wrap to a very large value, which the first test below detects
            // and clamps to zero. The ending row is derived from the wrapped
            // start, so the addition wraps back to the true ending row.
            //

            size_t ihStart = ph * StrideHeight - PaddingLeftHeight;
            size_t ihEnd = ihStart + KernelHeight;

            if (ihStart >= InputHeight) {
                ihStart = 0;
            }

            if (ihEnd > InputHeight) {
                ihEnd = InputHeight;
            }

            divider.StartNextOutputRow(ihEnd - ihStart);

            //
            // Reduce the input across the kernel height and store in a local
            // reduction buffer.
            //
            // The first row seeds the reduction directly, so InputRowsCount
            // is the number of additional rows to fold in after it.
            //

            const float* InputRowStart = &Input[ihStart * InputWidth];
            const size_t InputRowsCount = ihEnd - ihStart - 1;
            size_t InputWidthRemaining = InputWidth;
            float* ReductionOutput = &ReductionBuffer[PaddingLeftWidth];

            while (InputWidthRemaining >= 4) {

                const float* InputRow = InputRowStart;
                size_t InputRowsRemaining = InputRowsCount;
                MLAS_FLOAT32X4 Reduction = MlasLoadFloat32x4(InputRow);

                while (InputRowsRemaining > 0) {
                    InputRow += InputWidth;
                    Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(InputRow));
                    InputRowsRemaining--;
                }

                MlasStoreFloat32x4(ReductionOutput, Reduction);
                ReductionOutput += 4;

                InputRowStart += 4;
                InputWidthRemaining -= 4;
            }

            //
            // Handle the remaining columns one element at a time so that the
            // input tensor is never read past the end of a row.
            //

            while (InputWidthRemaining > 0) {

                const float* InputRow = InputRowStart;
                size_t InputRowsRemaining = InputRowsCount;
                float Reduction = *InputRow;

                while (InputRowsRemaining > 0) {
                    InputRow += InputWidth;
                    Reduction = PoolingType::Reduce(Reduction, *InputRow);
                    InputRowsRemaining--;
                }

                *ReductionOutput++ = Reduction;

                InputRowStart += 1;
                InputWidthRemaining -= 1;
            }

            //
            // Reduce the input across the kernel width and store to the output
            // tensor.
            //
            // Each iteration issues KernelWidth vector loads that start one
            // element apart, so lane i accumulates the window that begins at
            // buffer column ReductionInputStart + i.
            //

            size_t OutputWidthRemaining = OutputWidth;
            const float* ReductionInputStart = ReductionBuffer;

            do {

                const float* ReductionInput = ReductionInputStart;
                const float* ReductionInputEnd = ReductionInput + KernelWidth;
                MLAS_FLOAT32X4 Reduction = MlasLoadFloat32x4(ReductionInput++);

                while (ReductionInput < ReductionInputEnd) {
                    Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(ReductionInput++));
                }

                if (PoolingKind == MlasAveragePoolingExcludePad) {
                    Reduction = divider.DivideExcludePad(Reduction);
                } else {
                    Reduction = divider.DivideIncludePad(Reduction);
                }

                if (StrideWidth == 1) {

                    //
                    // With a unit stride, all four lanes are outputs. Store
                    // only the lanes that remain at the end of the row.
                    //

                    if (OutputWidthRemaining < 4) {

                        if (OutputWidthRemaining >= 2) {

                            MlasStoreLowHalfFloat32x4(Output, Reduction);

                            if (OutputWidthRemaining > 2) {
                                MlasStoreLaneFloat32x4<2>(Output + 2, Reduction);
                            }

                        } else {
                            MlasStoreLaneFloat32x4<0>(Output, Reduction);
                        }

                        Output += OutputWidthRemaining;

                        break;
                    }

                    MlasStoreFloat32x4(Output, Reduction);

                    Output += 4;
                    OutputWidthRemaining -= 4;

                } else {

                    //
                    // Only lanes 0 and 2 are stored, which matches a stride of
                    // two; presumably the caller never selects this routine
                    // for a larger width stride -- confirm against the
                    // dispatch code.
                    //

                    if (OutputWidthRemaining == 1) {
                        MlasStoreLaneFloat32x4<0>(Output++, Reduction);
                        break;
                    }

#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
                    MlasStoreLaneFloat32x4<0>(Output, Reduction);
                    MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#elif defined(MLAS_SSE2_INTRINSICS)
                    Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
                    MlasStoreLowHalfFloat32x4(Output, Reduction);
#else
#error Unsupported architecture.
#endif

                    Output += 2;
                    OutputWidthRemaining -= 2;
                }

                ReductionInputStart += 4;

            } while (OutputWidthRemaining > 0);
        }

        Input += InputSize;
    }
}

template<typename PoolingType>
void
MlasPool3DKernel(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    )
/*++

Routine Description:

    This routine implements the 3D pooling operation using generic constructs.

Arguments:

    WorkBlock - Supplies the structure that contains the pooling parameters.

    ChannelCount - Supplies the number of channels to process.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

Return Value:

    None.

--*/
{
    constexpr size_t DepthShapeIndex = 0;
    constexpr size_t HeightShapeIndex = 1;
    constexpr size_t WidthShapeIndex = 2;

    const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;

    const size_t InputDepth = WorkBlock->InputShape[DepthShapeIndex];
    const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
    const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
    const size_t InputSize = WorkBlock->InputSize;
    const size_t OutputDepth = WorkBlock->OutputShape[DepthShapeIndex];
    const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
    const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];

    const int64_t KernelDepth = WorkBlock->KernelShape[DepthShapeIndex];
    const int64_t KernelHeight = WorkBlock->KernelShape[HeightShapeIndex];
    const int64_t KernelWidth = WorkBlock->KernelShape[WidthShapeIndex];
    const int64_t PaddingLeftDepth = WorkBlock->Padding[DepthShapeIndex];
    const int64_t PaddingLeftHeight = WorkBlock->Padding[HeightShapeIndex];
    const int64_t PaddingLeftWidth = WorkBlock->Padding[WidthShapeIndex];
    const int64_t StrideDepth = WorkBlock->StrideShape[DepthShapeIndex];
    const int64_t StrideHeight = WorkBlock->StrideShape[HeightShapeIndex];
    const int64_t StrideWidth = WorkBlock->StrideShape[WidthShapeIndex];

    for (size_t c = 0; c < ChannelCount; c++) {

        for (size_t pd = 0; pd < OutputDepth; pd++) {

            const int64_t idStart64 = pd * StrideDepth - PaddingLeftDepth;
            const int64_t idEnd64 = idStart64 + KernelDepth;

            const size_t idStart = size_t((std::max)(idStart64, int64_t(0)));
            const size_t idEnd = size_t((std::min)(idEnd64, int64_t(InputDepth)));

            for (size_t ph = 0; ph < OutputHeight; ph++) {

                const int64_t ihStart64 = ph * StrideHeight - PaddingLeftHeight;
                const int64_t ihEnd64 = ihStart64 + KernelHeight;

                const size_t ihStart = size_t((std::max)(ihStart64, int64_t(0)));
                const size_t ihEnd = size_t((std::min)(ihEnd64, int64_t(InputHeight)));

                for (size_t pw = 0; pw < OutputWidth; pw++) {

                    const int64_t iwStart64 = pw * StrideWidth - PaddingLeftWidth;
                    const int64_t iwEnd64 = iwStart64 + KernelWidth;

                    const size_t iwStart = size_t((std::max)(iwStart64, int64_t(0)));
                    const size_t iwEnd = size_t((std::min)(iwEnd64, int64_t(InputWidth)));

                    float m = PoolingType::InitialValue();

                    for (size_t id = idStart; id < idEnd; id++) {
                        for (size_t ih = ihStart; ih < ihEnd; ih++) {
                            for (size_t iw = iwStart; iw < iwEnd; iw++) {
                                m = PoolingType::Reduce(m, Input[id * InputHeight * InputWidth + ih * InputWidth + iw]);
                            }
                        }
                    }

                    if (PoolingKind == MlasAveragePoolingExcludePad) {
                        m = PoolingType::AveragePool(m, float((idEnd - idStart) * (ihEnd - ihStart) * (iwEnd - iwStart)));
                    } else {
                        m = PoolingType::AveragePool(m, float(KernelDepth * KernelHeight * KernelWidth));
                    }

                    *Output++ = m;
                }
            }
        }

        Input += InputSize;
    }
}

template<typename PoolingType>
void
MlasPool3DVectorKernel(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    )
/*++

Routine Description:

    This routine implements an optimized 3D pooling operation using vector
    instructions.

    Each output row is produced in two passes. The first pass reduces the
    input planes and rows covered by the kernel depth and height into a single
    row stored in a stack buffer that is bracketed by padding values. The
    second pass slides the kernel width across that buffer with overlapping
    vector loads so that four adjacent window reductions are computed at once.

    N.B. The reduction buffer has a fixed size and this routine does not
    check that the padded row fits; presumably the caller only selects this
    routine when it does -- confirm against the dispatch code.

Arguments:

    WorkBlock - Supplies the structure that contains the pooling parameters.

    ChannelCount - Supplies the number of channels to process.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

Return Value:

    None.

--*/
{
    constexpr size_t Dimensions = 3;

    constexpr size_t DepthShapeIndex = 0;
    constexpr size_t HeightShapeIndex = 1;
    constexpr size_t WidthShapeIndex = 2;

    const MLAS_POOLING_KIND PoolingKind = WorkBlock->PoolingKind;

    const size_t InputDepth = WorkBlock->InputShape[DepthShapeIndex];
    const size_t InputHeight = WorkBlock->InputShape[HeightShapeIndex];
    const size_t InputWidth = WorkBlock->InputShape[WidthShapeIndex];
    const size_t InputSize = WorkBlock->InputSize;
    const size_t OutputDepth = WorkBlock->OutputShape[DepthShapeIndex];
    const size_t OutputHeight = WorkBlock->OutputShape[HeightShapeIndex];
    const size_t OutputWidth = WorkBlock->OutputShape[WidthShapeIndex];

    //
    // The signed work block parameters are converted to unsigned values; the
    // window arithmetic below relies on unsigned wraparound.
    //

    const size_t KernelDepth = size_t(WorkBlock->KernelShape[DepthShapeIndex]);
    const size_t KernelHeight = size_t(WorkBlock->KernelShape[HeightShapeIndex]);
    const size_t KernelWidth = size_t(WorkBlock->KernelShape[WidthShapeIndex]);
    const size_t PaddingLeftDepth = size_t(WorkBlock->Padding[DepthShapeIndex]);
    const size_t PaddingLeftHeight = size_t(WorkBlock->Padding[HeightShapeIndex]);
    const size_t PaddingLeftWidth = size_t(WorkBlock->Padding[WidthShapeIndex]);
    const size_t PaddingRightWidth = size_t(WorkBlock->Padding[Dimensions + WidthShapeIndex]);
    const size_t StrideDepth = size_t(WorkBlock->StrideShape[DepthShapeIndex]);
    const size_t StrideHeight = size_t(WorkBlock->StrideShape[HeightShapeIndex]);
    const size_t StrideWidth = size_t(WorkBlock->StrideShape[WidthShapeIndex]);

    float ReductionBuffer[MLAS_POOL_REDUCTION_BUFFER_STACK];

    //
    // Fill the edges of the reduction buffer with the padding value.
    //
    // The edges are written once here; the per-row pass below only rewrites
    // the InputWidth elements between them. The right edge includes the extra
    // over-read elements so the final vector loads see defined values.
    //

    float* FillReductionBuffer = ReductionBuffer;
    float* FillReductionBufferEnd = FillReductionBuffer + PaddingLeftWidth;

    while (FillReductionBuffer < FillReductionBufferEnd) {
        *FillReductionBuffer++ = PoolingType::InitialValue();
    }

    FillReductionBuffer = FillReductionBuffer + InputWidth;
    FillReductionBufferEnd = FillReductionBuffer + PaddingRightWidth + MLAS_POOL_REDUCTION_BUFFER_PADDING;

    while (FillReductionBuffer < FillReductionBufferEnd) {
        *FillReductionBuffer++ = PoolingType::InitialValue();
    }

    //
    // Apply the pooling operation to each channel.
    //

    typename PoolingType::DividerVectorContext divider;
    divider.PrepareExcludePad(PaddingLeftWidth, InputWidth, KernelWidth);
    divider.PrepareIncludePad(KernelDepth * KernelHeight * KernelWidth);

    for (size_t c = 0; c < ChannelCount; c++) {

        for (size_t pd = 0; pd < OutputDepth; pd++) {

            //
            // Compute the clipped range of input planes for this output
            // plane. A window that starts in the leading padding makes the
            // subtraction wrap to a very large value, which the first test
            // below detects and clamps to zero. The ending plane is derived
            // from the wrapped start, so the addition wraps back to the true
            // ending plane.
            //

            size_t idStart = pd * StrideDepth - PaddingLeftDepth;
            size_t idEnd = idStart + KernelDepth;

            if (idStart >= InputDepth) {
                idStart = 0;
            }

            if (idEnd > InputDepth) {
                idEnd = InputDepth;
            }

            for (size_t ph = 0; ph < OutputHeight; ph++) {

                //
                // Compute the clipped range of input rows using the same
                // wraparound technique as for the planes above.
                //

                size_t ihStart = ph * StrideHeight - PaddingLeftHeight;
                size_t ihEnd = ihStart + KernelHeight;

                if (ihStart >= InputHeight) {
                    ihStart = 0;
                }

                if (ihEnd > InputHeight) {
                    ihEnd = InputHeight;
                }

                divider.StartNextOutputRow((idEnd - idStart) * (ihEnd - ihStart));

                //
                // Reduce the input across the kernel depth and height and
                // store in a local reduction buffer.
                //
                // After the rows of one plane have been walked, the input
                // pointer is advanced by InputAdvancePlane to reach the same
                // starting row of the next plane. The do/while loops below
                // execute at least once, so both counts are expected to be
                // nonzero.
                //

                const float* InputRowStart = &Input[idStart * InputHeight * InputWidth + ihStart * InputWidth];
                const size_t InputPlanesCount = idEnd - idStart;
                const size_t InputRowsCount = ihEnd - ihStart;
                size_t InputWidthRemaining = InputWidth;
                float* ReductionOutput = &ReductionBuffer[PaddingLeftWidth];
                const size_t InputAdvancePlane = (InputHeight - InputRowsCount) * InputWidth;

                while (InputWidthRemaining >= 4) {

                    const float* InputRow = InputRowStart;
                    size_t InputPlanesRemaining = InputPlanesCount;
                    MLAS_FLOAT32X4 Reduction = PoolingType::InitialVector();

                    do {

                        size_t InputRowsRemaining = InputRowsCount;

                        do {

                            Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(InputRow));
                            InputRow += InputWidth;
                            InputRowsRemaining--;

                        } while (InputRowsRemaining > 0);

                        InputRow += InputAdvancePlane;
                        InputPlanesRemaining--;

                    } while (InputPlanesRemaining > 0);

                    MlasStoreFloat32x4(ReductionOutput, Reduction);
                    ReductionOutput += 4;

                    InputRowStart += 4;
                    InputWidthRemaining -= 4;
                }

                //
                // Handle the remaining columns one element at a time so that
                // the input tensor is never read past the end of a row.
                //

                while (InputWidthRemaining > 0) {

                    const float* InputRow = InputRowStart;
                    size_t InputPlanesRemaining = InputPlanesCount;
                    float Reduction = PoolingType::InitialValue();

                    do {

                        size_t InputRowsRemaining = InputRowsCount;

                        do {

                            Reduction = PoolingType::Reduce(Reduction, *InputRow);
                            InputRow += InputWidth;
                            InputRowsRemaining--;

                        } while (InputRowsRemaining > 0);

                        InputRow += InputAdvancePlane;
                        InputPlanesRemaining--;

                    } while (InputPlanesRemaining > 0);

                    *ReductionOutput++ = Reduction;

                    InputRowStart += 1;
                    InputWidthRemaining -= 1;
                }

                //
                // Reduce the input across the kernel width and store to the output
                // tensor.
                //
                // Each iteration issues KernelWidth vector loads that start
                // one element apart, so lane i accumulates the window that
                // begins at buffer column ReductionInputStart + i.
                //

                size_t OutputWidthRemaining = OutputWidth;
                const float* ReductionInputStart = ReductionBuffer;

                do {

                    const float* ReductionInput = ReductionInputStart;
                    const float* ReductionInputEnd = ReductionInput + KernelWidth;
                    MLAS_FLOAT32X4 Reduction = MlasLoadFloat32x4(ReductionInput++);

                    while (ReductionInput < ReductionInputEnd) {
                        Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(ReductionInput++));
                    }

                    if (PoolingKind == MlasAveragePoolingExcludePad) {
                        Reduction = divider.DivideExcludePad(Reduction);
                    } else {
                        Reduction = divider.DivideIncludePad(Reduction);
                    }

                    if (StrideWidth == 1) {

                        //
                        // With a unit stride, all four lanes are outputs.
                        // Store only the lanes that remain at the end of the
                        // row.
                        //

                        if (OutputWidthRemaining < 4) {

                            if (OutputWidthRemaining >= 2) {

                                MlasStoreLowHalfFloat32x4(Output, Reduction);

                                if (OutputWidthRemaining > 2) {
                                    MlasStoreLaneFloat32x4<2>(Output + 2, Reduction);
                                }

                            } else {
                                MlasStoreLaneFloat32x4<0>(Output, Reduction);
                            }

                            Output += OutputWidthRemaining;

                            break;
                        }

                        MlasStoreFloat32x4(Output, Reduction);

                        Output += 4;
                        OutputWidthRemaining -= 4;

                    } else {

                        //
                        // Only lanes 0 and 2 are stored, which matches a
                        // stride of two; presumably the caller never selects
                        // this routine for a larger width stride -- confirm
                        // against the dispatch code.
                        //

                        if (OutputWidthRemaining == 1) {
                            MlasStoreLaneFloat32x4<0>(Output++, Reduction);
                            break;
                        }

#if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_VSX_INTRINSICS)
                        MlasStoreLaneFloat32x4<0>(Output, Reduction);
                        MlasStoreLaneFloat32x4<2>(Output + 1, Reduction);
#elif defined(MLAS_SSE2_INTRINSICS)
                        Reduction = _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(2, 0, 2, 0));
                        MlasStoreLowHalfFloat32x4(Output, Reduction);
#else
#error Unsupported architecture.
#endif

                        Output += 2;
                        OutputWidthRemaining -= 2;
                    }

                    ReductionInputStart += 4;

                } while (OutputWidthRemaining > 0);
            }
        }

        Input += InputSize;
    }
}

template<typename PoolingType>
void
MlasPoolGlobalKernel(
    const MLAS_WORK_BLOCK* WorkBlock,
    size_t ChannelCount,
    const float* Input,
    float* Output
    )
/*++

Routine Description:

    This routine implements a global pooling operation.

Arguments:

    WorkBlock - Supplies the structure that contains the pooling parameters.

    ChannelCount - Supplies the number of channels to process.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

Return Value:

    None.

--*/
{
    const size_t InputSize = WorkBlock->InputSize;
    const float InputSizeFloat = float(InputSize);

    //
    // Apply the pooling operation to each channel.
    //

    for (size_t c = 0; c < ChannelCount; c++) {

        size_t InputSizeRemaining = InputSize;

        //
        // Iterate over the input buffer a vector at a time.
        //

        MLAS_FLOAT32X4 Reduction = PoolingType::InitialVector();

        while (InputSizeRemaining >= 4) {
            Reduction = PoolingType::Reduce(Reduction, MlasLoadFloat32x4(Input));
            Input += 4;
            InputSizeRemaining -= 4;
        }

        //
        // Reduce the vector to a single float value.
        //

#if defined(MLAS_NEON64_INTRINSICS)

        float ReductionValue = PoolingType::ReduceFloat32x4(Reduction);

#elif defined(MLAS_NEON32_INTRINSICS)

        float32x2_t ReductionLow = vget_low_f32(Reduction);
        float32x2_t ReductionHigh = vget_high_f32(Reduction);

        ReductionLow = PoolingType::ReducePairwise(ReductionLow, ReductionHigh);
        ReductionLow = PoolingType::ReducePairwise(ReductionLow, ReductionHigh);

        float ReductionValue = vget_lane_f32(ReductionLow, 0);

#elif defined(MLAS_SSE2_INTRINSICS)

        Reduction = PoolingType::Reduce(Reduction, _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(3, 2, 3, 2)));
        Reduction = PoolingType::Reduce(Reduction, _mm_shuffle_ps(Reduction, Reduction, _MM_SHUFFLE(1, 1, 1, 1)));

        float ReductionValue = _mm_cvtss_f32(Reduction);

#elif defined(MLAS_VSX_INTRINSICS)

        Reduction = PoolingType::Reduce(Reduction, MLAS_FLOAT32X4(vec_splat((__vector int64_t)Reduction, 1)));
        Reduction = PoolingType::Reduce(Reduction, vec_splat(Reduction, 1));

        float ReductionValue = Reduction[0];

#else
#error Unsupported architecture.
#endif

        //
        // Iterate over the remaining input buffer an element at a time.
        //

        while (InputSizeRemaining > 0) {
            ReductionValue = PoolingType::Reduce(ReductionValue, *Input++);
            InputSizeRemaining -= 1;
        }

        //
        // Apply average pooling if necessary.
        //

        ReductionValue = PoolingType::AveragePool(ReductionValue, InputSizeFloat);

        *Output++ = ReductionValue;
    }
}

//
// Stores pointers to the pooling kernel routines.
//
// MlasPoolGenericKernels is the default dispatch table used by MlasPool. It
// is indexed as [PoolingKind][Dimensions - 1], so each row holds the 1D, 2D
// and 3D kernel for one pooling kind.
//
// The second and third rows both reference the average pooling kernels.
// NOTE(review): the rows presumably follow the MLAS_POOLING_KIND enumeration
// order (maximum, average excluding pad, average including pad), with the
// kernels telling the two average kinds apart through WorkBlock->PoolingKind;
// confirm against the enum declaration.
//

static const PMLAS_POOL_KERNEL_ROUTINE MlasPoolGenericKernels[][3] =
{
    {
        MlasPool1DKernel<MLAS_MAXIMUM_POOLING>,
        MlasPool2DKernel<MLAS_MAXIMUM_POOLING>,
        MlasPool3DKernel<MLAS_MAXIMUM_POOLING>,
    },
    {
        MlasPool1DKernel<MLAS_AVERAGE_POOLING>,
        MlasPool2DKernel<MLAS_AVERAGE_POOLING>,
        MlasPool3DKernel<MLAS_AVERAGE_POOLING>,
    },
    {
        MlasPool1DKernel<MLAS_AVERAGE_POOLING>,
        MlasPool2DKernel<MLAS_AVERAGE_POOLING>,
        MlasPool3DKernel<MLAS_AVERAGE_POOLING>,
    },
};

//
// Global pooling kernel routines, indexed by [PoolingKind]. MlasPool selects
// from this table when the kernel shape equals the input shape, every stride
// is one and all padding is zero. No dimension index is needed because the
// global kernel only reads the flattened per-channel input size. Both average
// pooling kinds share one routine; MlasPoolGlobalKernel does not read
// PoolingKind.
//

static const PMLAS_POOL_KERNEL_ROUTINE MlasPoolGlobalKernels[] =
{
    MlasPoolGlobalKernel<MLAS_MAXIMUM_POOLING>,
    MlasPoolGlobalKernel<MLAS_AVERAGE_POOLING>,
    MlasPoolGlobalKernel<MLAS_AVERAGE_POOLING>,
};

//
// Vectorized pooling kernel routines, indexed as
// [PoolingKind][Dimensions - 2]; there is no 1D entry. MlasPool selects from
// this table only when the operation has at least two dimensions, the stride
// of the last dimension is at most two, every kernel dimension is at most 32
// and the last input dimension plus its padding fits in the reduction buffer.
//

static const PMLAS_POOL_KERNEL_ROUTINE MlasPoolVectorKernels[][2] =
{
    {
        MlasPool2DVectorKernel<MLAS_MAXIMUM_POOLING>,
        MlasPool3DVectorKernel<MLAS_MAXIMUM_POOLING>,
    },
    {
        MlasPool2DVectorKernel<MLAS_AVERAGE_POOLING>,
        MlasPool3DVectorKernel<MLAS_AVERAGE_POOLING>,
    },
    {
        MlasPool2DVectorKernel<MLAS_AVERAGE_POOLING>,
        MlasPool3DVectorKernel<MLAS_AVERAGE_POOLING>,
    },
};

void
MLASCALL
MlasPool(
    MLAS_POOLING_KIND PoolingKind,
    size_t Dimensions,
    const int64_t* InputShape,
    const int64_t* KernelShape,
    const int64_t* Padding,
    const int64_t* StrideShape,
    const int64_t* OutputShape,
    const float* Input,
    float* Output,
    MLAS_THREADPOOL* ThreadPool
    )
/*++

Routine Description:

    This routine implements the pooling operation.

    The pooling parameters are captured in a work block, the most specialized
    kernel that supports them is selected (global, vectorized or generic) and
    the kernel is then run over every batch/channel slice of the input.

Arguments:

    PoolingKind - Supplies the kind of pooling operation to perform. The value
        is also used as the row index into the kernel dispatch tables.

    Dimensions - Supplies the number of spatial dimensions. Only one to three
        dimensions are supported; any other value makes this routine return
        without touching the output.

    InputShape - Supplies the shape of the input tensor. The first two entries
        are the batch and channel counts, followed by Dimensions spatial
        extents.

    KernelShape - Supplies the shape of the kernel transform (Dimensions
        entries), else nullptr to use the spatial input shape as the kernel
        shape.

    Padding - Supplies the number of padding elements at the edge of the input
        tensor as 2 * Dimensions entries (one entry per dimension in the first
        half and one in the second half, presumably the leading and trailing
        padding), else nullptr for no padding.

    StrideShape - Supplies the shape of the stride (Dimensions entries), else
        nullptr for a stride of one in every dimension.

    OutputShape - Supplies the shape of the output tensor, laid out like
        InputShape.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

    ThreadPool - Supplies the thread pool object to use, else nullptr if the
        base library threading support should be used.

Return Value:

    None.

--*/
{
    MLAS_WORK_BLOCK WorkBlock;

    //
    // The work block stores the per-dimension parameters in fixed size arrays
    // and the kernel tables are indexed by the dimension count, so a count
    // outside of the supported range would read and write out of bounds.
    // This routine has no way to report an error, so reject such requests
    // before anything is indexed; callers are expected to validate the
    // dimension count beforehand.
    //

    const size_t MaximumDimensions = sizeof(WorkBlock.InputShape) / sizeof(WorkBlock.InputShape[0]);

    if (Dimensions == 0 || Dimensions > MaximumDimensions) {
        return;
    }

    WorkBlock.PoolingKind = PoolingKind;

    //
    // Compute the total number of channels to process and advance the input
    // and output shapes over the batch and channel counts.
    //

    //TODO: use a safeint here and make sure the result value can fit into int32_t
    size_t TotalChannelCount = size_t(InputShape[0]) * size_t(InputShape[1]);

    InputShape += 2;
    OutputShape += 2;

    //
    // Save the pooling parameters.
    //

    size_t InputSize = 1;
    size_t OutputSize = 1;

    bool InputAndKernelShapeMatch = true;
    bool AllStridesAreOne = true;
    bool AllPaddingIsZero = true;
    bool AllKernelsAreSmall = true;

    for (size_t dim = 0; dim < Dimensions; dim++) {

        WorkBlock.InputShape[dim] = size_t(InputShape[dim]);
        WorkBlock.OutputShape[dim] = size_t(OutputShape[dim]);

        if (KernelShape != nullptr) {
            WorkBlock.KernelShape[dim] = KernelShape[dim];
        } else {
            WorkBlock.KernelShape[dim] = InputShape[dim];
        }

        if (Padding != nullptr) {
            WorkBlock.Padding[dim] = Padding[dim];
            WorkBlock.Padding[dim + Dimensions] = Padding[dim + Dimensions];
        } else {
            WorkBlock.Padding[dim] = 0;
            WorkBlock.Padding[dim + Dimensions] = 0;
        }

        if (StrideShape != nullptr) {
            WorkBlock.StrideShape[dim] = StrideShape[dim];
        } else {
            WorkBlock.StrideShape[dim] = 1;
        }

        InputSize *= WorkBlock.InputShape[dim];
        OutputSize *= WorkBlock.OutputShape[dim];

        InputAndKernelShapeMatch &= (WorkBlock.KernelShape[dim] == int64_t(WorkBlock.InputShape[dim]));
        AllStridesAreOne &= (WorkBlock.StrideShape[dim] == 1);
        AllPaddingIsZero &= (WorkBlock.Padding[dim] == 0 && WorkBlock.Padding[dim + Dimensions] == 0);
        AllKernelsAreSmall &= (WorkBlock.KernelShape[dim] <= 32);
    }

    WorkBlock.InputSize = InputSize;

    //
    // Determine which pooling kernel routine to use.
    //
    // The vectorized kernels only support strides of 1 or 2. The kernel size
    // should be kept low in order to keep the divisors for average pooling to
    // be exactly representable as float. The input width plus padding must fit
    // in the reduction buffer.
    //

    PMLAS_POOL_KERNEL_ROUTINE PoolKernelRoutine = MlasPoolGenericKernels[PoolingKind][Dimensions - 1];

    if (InputAndKernelShapeMatch && AllStridesAreOne && AllPaddingIsZero) {

        PoolKernelRoutine = MlasPoolGlobalKernels[PoolingKind];

    } else if (Dimensions >= 2 && WorkBlock.StrideShape[Dimensions - 1] <= 2 && AllKernelsAreSmall) {

        //
        // Subtract both paddings of the last dimension from the usable part
        // of the reduction buffer, clamping at zero, and use the vectorized
        // kernel only if the input width fits in what remains.
        //

        int64_t ReductionBufferRemaining = MLAS_POOL_REDUCTION_BUFFER_STACK - MLAS_POOL_REDUCTION_BUFFER_PADDING;

        if (ReductionBufferRemaining >= WorkBlock.Padding[Dimensions - 1]) {
            ReductionBufferRemaining -= WorkBlock.Padding[Dimensions - 1];
        } else {
            ReductionBufferRemaining = 0;
        }

        if (ReductionBufferRemaining >= WorkBlock.Padding[Dimensions * 2 - 1]) {
            ReductionBufferRemaining -= WorkBlock.Padding[Dimensions * 2 - 1];
        } else {
            ReductionBufferRemaining = 0;
        }

        if (ReductionBufferRemaining >= int64_t(WorkBlock.InputShape[Dimensions - 1])) {
            PoolKernelRoutine = MlasPoolVectorKernels[PoolingKind][Dimensions - 2];
        }
    }

#ifdef MLAS_NO_ONNXRUNTIME_THREADPOOL
    MLAS_UNREFERENCED_PARAMETER(ThreadPool);
    //
    // Execute the pooling kernel routine.
    //

#if defined(_OPENMP)

    //
    // Each iteration handles one channel; the channels are independent, so
    // the input and output pointers are offset by whole channel sizes.
    //

#pragma omp parallel for
    for (int64_t c = 0; c < int64_t(TotalChannelCount); c++) {
        PoolKernelRoutine(&WorkBlock, 1, Input + c * InputSize, Output + c * OutputSize);
    }

#else

    PoolKernelRoutine(&WorkBlock, TotalChannelCount, Input, Output);

#endif
#else
    //
    // Use an external thread pool if one is provided. Each work item handles
    // one channel.
    // TODO: change to use MlasExecuteThreaded
    //
    onnxruntime::concurrency::ThreadPool::TryBatchParallelFor(ThreadPool, static_cast<ptrdiff_t>(TotalChannelCount), [&](ptrdiff_t c) {
        PoolKernelRoutine(&WorkBlock, 1, Input + c * InputSize, Output + c * OutputSize);
    }, 0);
#endif
}