onnxruntime/onnxruntime/core/mlas/lib/snchwc.cpp

/*++

Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

Module Name:

    snchwc.cpp

Abstract:

    This module implements the single precision operations using the NCHWc
    blocking format.

--*/

#include "mlasi.h"

//
// Define the base thread context for NCHWc convolution or pooling operations.
//

struct MLAS_NCHWC_WORK_BLOCK
{
    // Passed to MlasPartitionWork together with the thread index when the
    // algorithms split their work (presumably the number of worker threads;
    // the code that sets it is not in this part of the file).
    int32_t tids;
    // Batch count, taken from element 0 of the input shape.
    size_t BatchCount;
    // Input channel count, taken from element 1 of the input shape.
    size_t InputChannels;
    // Spatial extents of the input (the shape after the batch and channel
    // counts). May be rewritten by the shape flattening done in
    // MlasNchwcPrepareWorkBlock.
    size_t InputShape[3];
    // Product of the original (unflattened) spatial input extents.
    size_t InputSize;
    // Output channel count, taken from element 1 of the output shape.
    size_t OutputChannels;
    // Spatial extents of the output; also subject to flattening.
    size_t OutputShape[3];
    // Product of the original (unflattened) spatial output extents.
    size_t OutputSize;
    // Kernel extents; defaults to the input extents when no kernel shape is
    // supplied.
    size_t KernelShape[3];
    // Dilation per dimension; defaults to one.
    size_t DilationShape[3];
    // Padding: entries [0, Dimensions) hold the leading padding and entries
    // [Dimensions, 2 * Dimensions) hold the trailing padding. Defaults to zero.
    size_t Padding[6];
    // Stride per dimension; defaults to one.
    size_t StrideShape[3];
    // Number of output elements, per dimension, that are affected by the
    // leading padding.
    size_t OutputCountLeftPad[3];
    // Number of output elements, per dimension, that follow the leading padded
    // elements and are not counted as trailing padded elements.
    size_t OutputCount[3];
    // Remaining output elements, per dimension, after the two counts above.
    size_t OutputCountRightPad[3];
};

//
// Define the worker thread context for a NCHWc convolution operation.
//

struct MLAS_NCHWC_CONV_WORK_BLOCK : MLAS_NCHWC_WORK_BLOCK
{
    // Source tensor read by the convolution kernels.
    const float* Input;
    // Filter tensor read by the convolution kernels.
    const float* Filter;
    // Optional bias vector; a null pointer disables the bias addition flag.
    const float* Bias;
    // Activation descriptor; must be non-null because the convolution
    // algorithms dereference it to read ActivationKind.
    const MLAS_ACTIVATION* Activation;
    // Destination tensor written by the convolution kernels.
    float* Output;
    // Number of convolution groups.
    size_t GroupCount;
    // When false, the kernels accumulate into the existing contents of Output
    // even for the first input channel (used for Conv/Sum fusion).
    bool ZeroMode;
};

//
// Define the worker thread context for a NCHWc pooling operation.
//

struct MLAS_NCHWC_POOL_WORK_BLOCK : MLAS_NCHWC_WORK_BLOCK
{
    // Source tensor for the pooling operation.
    const float* Input;
    // Destination tensor for the pooling operation.
    float* Output;
    // Selects the pooling variant. NOTE(review): the code that consumes this
    // field is outside of the portion of the file reviewed here.
    MLAS_POOLING_KIND PoolingKind;
};

//
// Define the convolution kernel flags.
//

// Set when the kernel should add to the existing output instead of
// overwriting it (not the first input channel batch, or ZeroMode is false).
#define MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT     0x00000001
// Set on the final input channel batch when a bias buffer was supplied.
#define MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION         0x00000002
// Set on the final input channel batch when the activation kind is ReLU.
#define MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION       0x00000004
// Set on the final input channel batch for any activation other than ReLU or
// identity; the algorithms test this flag to run MlasActivation themselves
// after the kernel returns.
#define MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION      0x00000008

size_t
MLASCALL
MlasNchwcGetBlockSize(
    void
    )
/*++

Routine Description:

    This routine reports the NCHWc block size used on the current platform.

Arguments:

    None.

Return Value:

    Returns the platform's NCHWc block size, or one when the platform has no
    NCHWc support.

    N.B. One (rather than zero) is the "not supported" value so that callers
    can divide by or take the modulus of the result without triggering
    compiler warnings in optimized builds.

--*/
{
    //
    // Start from the "not supported" value and override it on platforms that
    // publish a block size.
    //

    size_t BlockSize = 1;

#if defined(MLAS_TARGET_AMD64)
    BlockSize = MlasPlatform.NchwcBlockSize;
#endif

    return BlockSize;
}

void
MlasNchwcPrepareWorkBlock(
    MLAS_NCHWC_WORK_BLOCK* WorkBlock,
    size_t Dimensions,
    const int64_t* InputShape,
    const int64_t* KernelShape,
    const int64_t* DilationShape,
    const int64_t* Padding,
    const int64_t* StrideShape,
    const int64_t* OutputShape
    )
/*++

Routine Description:

    This routine fills in the shape derived fields of a work block that are
    common to the NCHWc convolution and pooling operations.

Arguments:

    WorkBlock - Supplies the structure that receives the computed parameters.

    Dimensions - Supplies the number of spatial dimensions.

    InputShape - Supplies the shape of the input tensor. The first two
        elements are the batch count and the channel count, followed by one
        extent per spatial dimension.

    KernelShape - Optionally supplies the kernel extents. If not supplied,
        the kernel spans the entire input along every dimension.

    DilationShape - Optionally supplies the dilations. Defaults to one.

    Padding - Optionally supplies the padding as Dimensions leading values
        followed by Dimensions trailing values. Defaults to zero.

    StrideShape - Optionally supplies the strides. Defaults to one.

    OutputShape - Supplies the shape of the output tensor, laid out in the
        same way as InputShape.

Return Value:

    None.

--*/
{
    //
    // The leading two shape elements are the batch and channel counts; the
    // spatial extents follow.
    //

    WorkBlock->BatchCount = size_t(InputShape[0]);
    WorkBlock->InputChannels = size_t(InputShape[1]);
    WorkBlock->OutputChannels = size_t(OutputShape[1]);

    const int64_t* SpatialInputShape = InputShape + 2;
    const int64_t* SpatialOutputShape = OutputShape + 2;

    //
    // Capture the per-dimension attributes, substituting defaults for any
    // attribute array that was not supplied. Along the way, track whether
    // the operation is a two dimensional one with unit dilation, unit stride
    // and no padding, which is a prerequisite for flattening below.
    //

    size_t TotalInputSize = 1;
    size_t TotalOutputSize = 1;
    bool Flattenable = (Dimensions == 2);

    for (size_t d = 0; d < Dimensions; d++) {

        const size_t InputExtent = size_t(SpatialInputShape[d]);
        const size_t OutputExtent = size_t(SpatialOutputShape[d]);

        WorkBlock->InputShape[d] = InputExtent;
        WorkBlock->OutputShape[d] = OutputExtent;

        TotalInputSize *= InputExtent;
        TotalOutputSize *= OutputExtent;

        WorkBlock->KernelShape[d] =
            (KernelShape != nullptr) ? size_t(KernelShape[d]) : InputExtent;

        WorkBlock->DilationShape[d] =
            (DilationShape != nullptr) ? size_t(DilationShape[d]) : size_t(1);

        WorkBlock->Padding[d] =
            (Padding != nullptr) ? size_t(Padding[d]) : size_t(0);
        WorkBlock->Padding[d + Dimensions] =
            (Padding != nullptr) ? size_t(Padding[d + Dimensions]) : size_t(0);

        WorkBlock->StrideShape[d] =
            (StrideShape != nullptr) ? size_t(StrideShape[d]) : size_t(1);

        if (WorkBlock->DilationShape[d] != 1 ||
            WorkBlock->Padding[d] != 0 ||
            WorkBlock->Padding[d + Dimensions] != 0 ||
            WorkBlock->StrideShape[d] != 1) {
            Flattenable = false;
        }
    }

    WorkBlock->InputSize = TotalInputSize;
    WorkBlock->OutputSize = TotalOutputSize;

    //
    // If the kernel also spans the full input width, then every output line
    // holds a single element ([N][1]). Reshape the operation to a single
    // line ([1][N]) by folding the height into the width and striding by the
    // original width.
    //
    // With the original shape, the NCHWc kernels would produce one output per
    // output line. After reshaping, the kernels can produce multiple outputs
    // per line, which typically performs better even though the reduced
    // output height may leave fewer lines to spread across threads.
    //

    if (Flattenable && (WorkBlock->InputShape[1] == WorkBlock->KernelShape[1])) {

        WorkBlock->StrideShape[1] = WorkBlock->InputShape[1];

        size_t* const FoldedShapes[] = {
            WorkBlock->InputShape,
            WorkBlock->OutputShape,
            WorkBlock->KernelShape,
        };

        for (size_t* Shape : FoldedShapes) {
            Shape[1] *= Shape[0];
            Shape[0] = 1;
        }
    }

    //
    // Split the output elements along each dimension into those touched by
    // the leading padding, those that follow, and the remainder.
    //

    for (size_t d = 0; d < Dimensions; d++) {

        const size_t Stride = WorkBlock->StrideShape[d];
        const size_t LeadingPad = WorkBlock->Padding[d];
        const size_t PaddedInput = WorkBlock->InputShape[d] + LeadingPad;
        const size_t KernelSpan =
            WorkBlock->DilationShape[d] * (WorkBlock->KernelShape[d] - 1) + 1;

        //
        // Count the outputs whose kernel span fits inside the input extended
        // by the leading padding only.
        //

        const size_t CountWithLeadingPad =
            (PaddedInput >= KernelSpan) ? (PaddedInput - KernelSpan) / Stride + 1 : 0;

        //
        // Count the outputs that start inside the leading padding, limited
        // to the count computed above.
        //

        const size_t UnclampedLeadingCount = (LeadingPad + Stride - 1) / Stride;
        const size_t LeadingCount = (UnclampedLeadingCount > CountWithLeadingPad) ?
            CountWithLeadingPad : UnclampedLeadingCount;

        WorkBlock->OutputCountLeftPad[d] = LeadingCount;
        WorkBlock->OutputCount[d] = CountWithLeadingPad - LeadingCount;
        WorkBlock->OutputCountRightPad[d] = WorkBlock->OutputShape[d] - CountWithLeadingPad;
    }
}

//
// Base implementation for neural network algorithms (convolution and pooling).
//

struct MLAS_NCHWC_NN_ALGORITHM
{
    static constexpr size_t HeightShapeIndex = 0;
    static constexpr size_t WidthShapeIndex = 1;

    const size_t BlockSize = MlasNchwcGetBlockSize();

    //
    // Capture these values from the work block for use as local constants.
    //

    const size_t BatchCount;
    const size_t InputChannels;
    const size_t OutputChannels;
    const size_t InputHeight;
    const size_t InputWidth;
    const size_t InputSize;
    const size_t OutputHeight;
    const size_t OutputWidth;
    const size_t OutputSize;
    const size_t KernelHeight;
    const size_t KernelWidth;
    const size_t KernelSize;
    const size_t DilationHeight;
    const size_t DilationWidth;
    const size_t PaddingLeftY;
    const size_t PaddingLeftX;
    const size_t StrideHeight;
    const size_t StrideWidth;
    const size_t OutputCountLeftPadY;
    const size_t OutputCountY;
    const size_t OutputCountLeftPadX;
    const size_t OutputCountX;
    const size_t OutputCountRightPadX;

    MLAS_NCHWC_NN_ALGORITHM(const MLAS_NCHWC_WORK_BLOCK* WorkBlock) :
        BatchCount(WorkBlock->BatchCount),
        InputChannels(WorkBlock->InputChannels),
        OutputChannels(WorkBlock->OutputChannels),
        InputHeight(WorkBlock->InputShape[HeightShapeIndex]),
        InputWidth(WorkBlock->InputShape[WidthShapeIndex]),
        InputSize(WorkBlock->InputSize),
        OutputHeight(WorkBlock->OutputShape[HeightShapeIndex]),
        OutputWidth(WorkBlock->OutputShape[WidthShapeIndex]),
        OutputSize(WorkBlock->OutputSize),
        KernelHeight(WorkBlock->KernelShape[HeightShapeIndex]),
        KernelWidth(WorkBlock->KernelShape[WidthShapeIndex]),
        KernelSize(KernelHeight * KernelWidth),
        DilationHeight(WorkBlock->DilationShape[HeightShapeIndex]),
        DilationWidth(WorkBlock->DilationShape[WidthShapeIndex]),
        PaddingLeftY(WorkBlock->Padding[HeightShapeIndex]),
        PaddingLeftX(WorkBlock->Padding[WidthShapeIndex]),
        StrideHeight(WorkBlock->StrideShape[HeightShapeIndex]),
        StrideWidth(WorkBlock->StrideShape[WidthShapeIndex]),
        OutputCountLeftPadY(WorkBlock->OutputCountLeftPad[HeightShapeIndex]),
        OutputCountY(WorkBlock->OutputCount[HeightShapeIndex]),
        OutputCountLeftPadX(WorkBlock->OutputCountLeftPad[WidthShapeIndex]),
        OutputCountX(WorkBlock->OutputCount[WidthShapeIndex]),
        OutputCountRightPadX(WorkBlock->OutputCountRightPad[WidthShapeIndex])
    {
    }
};

constexpr size_t MLAS_NCHWC_NN_ALGORITHM::HeightShapeIndex;
constexpr size_t MLAS_NCHWC_NN_ALGORITHM::WidthShapeIndex;

//
// Thread entry point shared by the NCHWc algorithms: converts the opaque
// context back to the work block pointer type that AlgorithmType stores in its
// WorkBlock member, builds a temporary algorithm object from it and runs that
// object's Execute routine for the supplied thread index.
//

template<typename AlgorithmType>
void
MlasNchwcThreaded(
    void* Context,
    int32_t Index
    )
{
    using WorkBlockPointer = decltype(AlgorithmType::WorkBlock);

    AlgorithmType Algorithm(static_cast<WorkBlockPointer>(Context));

    Algorithm.Execute(Index);
}

//
// Base implementation for convolution algorithms.
//

struct MLAS_NCHWC_CONV_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
{
    //
    // Convolution parameters captured from the work block for use as local
    // constants.
    //

    const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock;
    const size_t GroupCount;
    const MLAS_ACTIVATION* Activation;
    const MLAS_ACTIVATION_KIND ActivationKind;
    const bool ZeroMode;

    //
    // Working copies of the buffer pointers from the work block. The derived
    // algorithms advance these as they step through the convolution.
    //

    const float* Input;
    const float* Filter;
    const float* Bias;
    float* Output;

    MLAS_NCHWC_CONV_ALGORITHM(const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_NN_ALGORITHM(WorkBlock),
        WorkBlock(WorkBlock),
        GroupCount(WorkBlock->GroupCount),
        Activation(WorkBlock->Activation),
        ActivationKind(Activation->ActivationKind),
        ZeroMode(WorkBlock->ZeroMode),
        Input(WorkBlock->Input),
        Filter(WorkBlock->Filter),
        Bias(WorkBlock->Bias),
        Output(WorkBlock->Output)
    {
    }

    //
    // Builds the MLAS_CONV_KERNEL_FLAG_* mask for a kernel invocation that
    // covers input channels [ic, ic + ChannelCount).
    //

    unsigned
    ComputeKernelFlags(
        size_t ic,
        size_t ChannelCount
        )
    {
        //
        // The output is overwritten only by the first batch of input
        // channels, and only if the caller asked for a zero initialized
        // output. Otherwise the kernel accumulates into the output buffer
        // (later input channels, or Conv/Sum fusion).
        //

        const bool OverwriteOutput = (ic == 0) && ZeroMode;

        unsigned Flags = OverwriteOutput ? 0u : MLAS_CONV_KERNEL_FLAG_ACCUMULATE_OUTPUT;

        //
        // Bias and activation only apply once the final batch of input
        // channels has contributed to the output.
        //

        if (ic + ChannelCount != InputChannels) {
            return Flags;
        }

        if (Bias != nullptr) {
            Flags |= MLAS_CONV_KERNEL_FLAG_BIAS_ADDITION;
        }

        //
        // ReLU is flagged for the kernel. Any other non-identity activation
        // is flagged so that the caller runs it after the kernel returns.
        //

        if (ActivationKind == MlasReluActivation) {
            Flags |= MLAS_CONV_KERNEL_FLAG_RELU_ACTIVATION;
        } else if (ActivationKind != MlasIdentityActivation) {
            Flags |= MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION;
        }

        return Flags;
    }

    //
    // Computes the first input row and the number of kernel rows to apply
    // for output row ph. For output rows outside of the unpadded range, the
    // kernel rows that land outside of the input are dropped: leading ones
    // advance the first input row and the filter pointer, and each dropped
    // row reduces the effective kernel height.
    //

    void
    ComputeEffectiveKernel(
        size_t ph,
        size_t FilterStride,
        const float** filter,
        size_t* ih,
        size_t* EffectiveKernelHeight
        )
    {
        //
        // N.B. The arithmetic below is unsigned, so an input row that lands
        // in the top padding wraps to a large value and fails the bounds test
        // in the same way as a row past the bottom of the input.
        //

        size_t FirstInputRow = ph * StrideHeight - PaddingLeftY;
        size_t RowsInBounds = KernelHeight;
        const float* FirstFilterRow = *filter;

        const bool RowUsesPadding = (ph - OutputCountLeftPadY) >= OutputCountY;

        if (RowUsesPadding) {

            size_t InputRow = FirstInputRow;

            for (size_t Remaining = KernelHeight; Remaining > 0; Remaining--) {

                if (InputRow >= InputHeight) {

                    //
                    // Skip over this kernel row entirely if no in-bounds row
                    // has been seen yet.
                    //

                    if (InputRow == FirstInputRow) {
                        FirstInputRow += DilationHeight;
                        FirstFilterRow += FilterStride;
                    }

                    RowsInBounds -= 1;
                }

                InputRow += DilationHeight;
            }
        }

        *ih = FirstInputRow;
        *EffectiveKernelHeight = RowsInBounds;
        *filter = FirstFilterRow;
    }

    //
    // Applies the work block's activation in place to FilterCount rows of
    // BlockedOutputWidth floats starting at output.
    //

    void
    DoActivation(
        float* output,
        size_t FilterCount,
        size_t BlockedOutputWidth
        )
    {
        //
        // BlockedOutputWidth is the number of output elements that were
        // written. Pointwise convolution may write several logical rows at
        // once, so this can be larger than OutputWidth.
        //
        // Each filter block writes to its own NCHWc output plane, so the row
        // stride is the blocked output size rather than the output width
        // used by NCHW convolution.
        //

        const size_t OutputPlaneStride = BlockSize * OutputSize;

        MlasActivation(Activation, output, nullptr, FilterCount,
            BlockedOutputWidth, OutputPlaneStride);
    }
};

//
// Base implementation for grouped convolution algorithms.
//

struct MLAS_NCHWC_GROUPED_CONV_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
{
    //
    // Slice the convolution operation such that multiple filter blocks are
    // reused for a given set of input inside the kernel.
    //
    // A filter set covers up to FilterSetSize filter blocks, that is up to
    // BlockSize * FilterSetSize output channels.
    //

    static constexpr size_t FilterSetSize = 4;

    //
    // Number of filter sets needed to cover all output channels (rounded up).
    //

    const size_t FilterSetCount;

    //
    // Stores the current output line, filter cluster, and group that this thread
    // is operating on.
    //
    // WorkRemaining counts the units of work (output lines) still to be done
    // by this thread and FilterCount is the number of filter blocks in the
    // current filter set. These fields are not set by the constructor; they
    // are initialized by PrepareWork.
    //

    size_t ph;
    size_t FilterSet;
    size_t Group;
    size_t WorkRemaining;
    size_t FilterCount;

    MLAS_NCHWC_GROUPED_CONV_ALGORITHM(const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_CONV_ALGORITHM(WorkBlock),
        FilterSetCount((OutputChannels + (BlockSize * FilterSetSize) - 1) / (BlockSize * FilterSetSize))
    {
    }

    //
    // Computes the number of filter blocks in the current filter set: either
    // a full FilterSetSize or the filter blocks left over for the final set.
    //

    void ComputeFilterCount(void)
    {
        FilterCount = (std::min)(FilterSetSize, (OutputChannels / BlockSize) - FilterSet * FilterSetSize);
    }

    //
    // Positions this thread at the start of its share of the work. One unit
    // of work is one output line of one (batch, group, filter set)
    // combination.
    //

    void PrepareWork(int32_t Index)
    {
        const size_t TotalWork = BatchCount * GroupCount * FilterSetCount * OutputHeight;

        size_t WorkIndex;

        // NOTE(review): MlasPartitionWork is defined elsewhere; presumably it
        // returns the first work index and the amount of work for this thread.
        MlasPartitionWork(Index, WorkBlock->tids, TotalWork, &WorkIndex, &WorkRemaining);

        //
        // Extract the current batch, group, filter cluster, and output line
        // from the starting work index.
        //
        // The work index is decomposed with the output line varying fastest,
        // then the filter set, then the combined batch/group index.
        //

        ph = WorkIndex % OutputHeight;
        const size_t BatchGroupFilterSet = WorkIndex / OutputHeight;

        FilterSet = BatchGroupFilterSet % FilterSetCount;
        const size_t BatchGroup = BatchGroupFilterSet / FilterSetCount;

        Group = BatchGroup % GroupCount;

        //
        // Advance the convolution buffer pointers to the current position
        // computed above.
        //
        // Input and output advance by the combined batch/group index, while
        // filter and bias only depend on the group, as they are reused for
        // every batch.
        //

        Input += BatchGroup * InputChannels * InputSize;

        Output += BatchGroup * OutputChannels * OutputSize;
        Output += BlockSize * FilterSet * FilterSetSize * OutputSize;

        Filter += Group * OutputChannels * InputChannels * KernelSize;
        Filter += BlockSize * FilterSet * FilterSetSize * InputChannels * KernelSize;

        if (Bias != nullptr) {
            Bias += Group * OutputChannels;
            Bias += BlockSize * FilterSet * FilterSetSize;
        }

        //
        // Compute the number of filter set to use for the next iteration.
        //

        ComputeFilterCount();
    }

    //
    // Records that WorkThisIteration output lines have been completed and
    // steps to the next filter set, group, or batch as needed.
    //
    // N.B. The end of image test below is an equality test, so callers must
    // never complete more lines than remain in the current output image.
    //

    void CompleteWork(size_t WorkThisIteration)
    {
        //
        // Adjust the amount of work remaining and check if the end of an output
        // image has been reached.
        //

        WorkRemaining -= WorkThisIteration;

        if ((ph += WorkThisIteration) == OutputHeight) {

            //
            // Step the output, filter, and bias past the filter blocks that
            // were just processed.
            //

            size_t BlockedFilterCount = BlockSize * FilterCount;

            Output += BlockedFilterCount * OutputSize;
            Filter += BlockedFilterCount * InputChannels * KernelSize;

            if (Bias != nullptr) {
                Bias += BlockedFilterCount;
            }

            //
            // Advance the input if the all filter sets have been processed.
            //

            if (++FilterSet == FilterSetCount) {

                Input += InputChannels * InputSize;

                //
                // Reset filter and bias if all groups have been processed.
                //

                if (++Group == GroupCount) {

                    Filter = WorkBlock->Filter;
                    Bias = WorkBlock->Bias;

                    Group = 0;
                }

                FilterSet = 0;
            }

            //
            // Recompute the filter count now that FilterSet has changed.
            //

            ComputeFilterCount();

            ph = 0;
        }
    }
};

//
// Out-of-class definition for the static constant declared above.
//

constexpr size_t MLAS_NCHWC_GROUPED_CONV_ALGORITHM::FilterSetSize;

//
// Implementation of the direct convolution algorithm where the input buffer is
// in NCHWc format.
//

struct MLAS_NCHWC_CONV_NCHWC_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
{
    MLAS_NCHWC_CONV_NCHWC_ALGORITHM(const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_GROUPED_CONV_ALGORITHM(WorkBlock)
    {
    }

    //
    // Runs the share of the convolution assigned to thread Index. Each pass
    // of the outer loop handles a run of output lines for the current filter
    // set, walking the input channels one NCHWc block at a time.
    //

    void Execute(int32_t Index)
    {
        //
        // Setup the convolution state based on the thread index.
        //

        PrepareWork(Index);

        //
        // Loop until all of the work has been completed.
        //
        // The kernel takes its strides in bytes. All input offsets are scaled
        // by BlockSize as the input buffer is in NCHWc format.
        //

        const size_t StrideWidthBytes = BlockSize * StrideWidth * sizeof(float);
        const size_t DilationWidthBytes = BlockSize * DilationWidth * sizeof(float);
        const size_t FilterStrideBytes = BlockSize * InputChannels * KernelSize * sizeof(float);
        const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);
        const size_t InputWidthBytes = BlockSize * InputWidth * sizeof(float);
        const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
        // One dilated input row less the bytes covered by stepping across the
        // kernel width (presumably the adjustment the kernel applies to move
        // from the end of one kernel row to the start of the next).
        const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

        const size_t BlockedOutputWidth = BlockSize * OutputWidth;

        //
        // Use the kernel selected for the platform on AMD64, otherwise the
        // single generic kernel symbol.
        //

#if defined(MLAS_TARGET_AMD64)
        MLAS_CONV_FLOAT_KERNEL* Kernel = MlasPlatform.ConvNchwcFloatKernel;
#else
        MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwcFloatKernel;
#endif

        while (WorkRemaining > 0) {

            //
            // Compute the number of output lines to process in this iteration.
            //
            // The count is limited to the end of the current output image, as
            // required by CompleteWork.
            //

            size_t WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);

            //
            // Walk over each input image organized as a set of NCHWc blocks.
            //

            for (size_t ic = 0; ic < InputChannels; ic += BlockSize) {

                unsigned KernelFlags = ComputeKernelFlags(ic, BlockSize);

                //
                // Apply the convolution kernel to each row of the output batch.
                //

                const float* input = Input + ic * InputSize;
                float* output = Output + ph * BlockedOutputWidth;

                for (size_t work = 0; work < WorkThisIteration; work++) {

                    //
                    // Constrain the effective kernel parameters if the output row
                    // uses one or more input padding rows.
                    //
                    // The filter stride passed here is the number of floats
                    // skipped for each kernel row that is dropped.
                    //

                    const float* filter = Filter + BlockSize * ic * KernelSize;
                    size_t ih;
                    size_t EffectiveKernelHeight;

                    ComputeEffectiveKernel(ph + work, BlockSize * BlockSize * KernelWidth,
                        &filter, &ih, &EffectiveKernelHeight);

                    //
                    // Invoke the convolution kernel.
                    //
                    // N.B. The first argument is the input row offset back by
                    // the left padding, computed with unsigned arithmetic. The
                    // start of the actual input row and its width in bytes
                    // are also passed (presumably so that the kernel can
                    // bounds check padded columns; the kernel is not visible
                    // here).
                    //

                    Kernel(input + BlockSize * (ih * InputWidth - PaddingLeftX),
                        filter, output, StrideWidthBytes, DilationWidthBytes,
                        FilterCount, InputStrideBytes, FilterStrideBytes,
                        OutputStrideBytes, EffectiveKernelHeight, KernelWidth,
                        input + BlockSize * (ih * InputWidth), InputWidthBytes,
                        DilatedInputWidthBytes, OutputCountLeftPadX, OutputCountX,
                        OutputCountRightPadX, Bias, KernelFlags);

                    //
                    // Test for fused non-ReLU activation.
                    //
                    // This flag is only set for the final block of input
                    // channels, so the activation sees the completed output.
                    //

                    if ((KernelFlags & MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION) != 0) {
                        DoActivation(output, FilterCount, BlockedOutputWidth);
                    }

                    output += BlockedOutputWidth;
                }
            }

            //
            // Advance the convolution state based on the completed work.
            //

            CompleteWork(WorkThisIteration);
        }
    }
};

//
// Implementation of the direct convolution algorithm where the input buffer is
// in NCHW format.
//

struct MLAS_NCHWC_CONV_NCHW_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
{
    MLAS_NCHWC_CONV_NCHW_ALGORITHM(const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_GROUPED_CONV_ALGORITHM(WorkBlock)
    {
    }

    //
    // Runs the share of the convolution assigned to thread Index. Each pass
    // of the outer loop produces a single output line for the current filter
    // set, walking the input one channel at a time.
    //

    void Execute(int32_t Index)
    {
        //
        // Setup the convolution state based on the thread index.
        //

        PrepareWork(Index);

        //
        // Loop until all of the work has been completed.
        //
        // The kernel takes its strides in bytes. Unlike the NCHWc input
        // variant, the input side strides are not scaled by BlockSize; only
        // the filter and output strides are.
        //

        const size_t StrideWidthBytes = StrideWidth * sizeof(float);
        const size_t DilationWidthBytes = DilationWidth * sizeof(float);
        const size_t FilterStrideBytes = BlockSize * InputChannels * KernelSize * sizeof(float);
        const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);
        const size_t InputWidthBytes = InputWidth * sizeof(float);
        const size_t DilatedInputWidthBytes = DilationHeight * InputWidth * sizeof(float);
        const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

        const size_t BlockedOutputWidth = BlockSize * OutputWidth;

        //
        // Use the kernel selected for the platform on AMD64, otherwise the
        // single generic kernel symbol.
        //

#if defined(MLAS_TARGET_AMD64)
        MLAS_CONV_FLOAT_KERNEL* Kernel = MlasPlatform.ConvNchwFloatKernel;
#else
        MLAS_CONV_FLOAT_KERNEL* Kernel = MlasConvNchwFloatKernel;
#endif

        while (WorkRemaining > 0) {

            //
            // Constrain the effective kernel parameters if the output row uses
            // one or more input padding rows.
            //
            // The result depends only on the output row, so it is computed
            // once and shared by every input channel below.
            //

            const float* filter = Filter;
            size_t ih;
            size_t EffectiveKernelHeight;

            ComputeEffectiveKernel(ph, BlockSize * KernelWidth, &filter, &ih,
                &EffectiveKernelHeight);

            //
            // Apply the convolution kernel to each channel of the input tensor.
            //

            const float* input = Input;
            float* output = Output + BlockSize * ph * OutputWidth;

            for (size_t ic = 0; ic < InputChannels; ic += 1) {

                unsigned KernelFlags = ComputeKernelFlags(ic, 1);

                //
                // Invoke the convolution kernel.
                //
                // N.B. The first argument is the input row offset back by the
                // left padding, computed with unsigned arithmetic. The start
                // of the actual input row and its width in bytes are also
                // passed to the kernel.
                //

                Kernel(input + (ih * InputWidth - PaddingLeftX), filter, output,
                    StrideWidthBytes, DilationWidthBytes, FilterCount, InputStrideBytes,
                    FilterStrideBytes, OutputStrideBytes, EffectiveKernelHeight,
                    KernelWidth, input + (ih * InputWidth), InputWidthBytes,
                    DilatedInputWidthBytes, OutputCountLeftPadX, OutputCountX,
                    OutputCountRightPadX, Bias, KernelFlags);

                //
                // Test for fused non-ReLU activation.
                //
                // This flag is only set for the final input channel, so the
                // activation sees the completed output.
                //

                if ((KernelFlags & MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION) != 0) {
                    DoActivation(output, FilterCount, BlockedOutputWidth);
                }

                //
                // Step to the next input channel and the matching filter data.
                //

                input += InputSize;
                filter += BlockSize * KernelSize;
            }

            //
            // Advance the convolution state based on the completed work.
            //

            CompleteWork(1);
        }
    }
};

//
// Implementation of the pointwise convolution algorithm.
//
// Pointwise convolutions have a kernel size of one. To simplify this
// implementation, no input padding is allowed, which matches typical
// usage in models.
//

struct MLAS_NCHWC_CONV_POINTWISE_ALGORITHM : MLAS_NCHWC_GROUPED_CONV_ALGORITHM
{
    MLAS_NCHWC_CONV_POINTWISE_ALGORITHM(const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_GROUPED_CONV_ALGORITHM(WorkBlock)
    {
    }

    //
    // Runs the share of the convolution assigned to thread Index. Each pass
    // of the outer loop handles one or more output lines for the current
    // filter set, walking the input channels in batches.
    //

    void Execute(int32_t Index)
    {
        //
        // Setup the convolution state based on the thread index.
        //

        PrepareWork(Index);

        //
        // Loop until all of the work has been completed.
        //
        // The kernel takes its strides in bytes.
        //

        const size_t StrideWidthBytes = BlockSize * StrideWidth * sizeof(float);
        const size_t InputStrideBytes = BlockSize * InputSize * sizeof(float);
        const size_t FilterStrideBytes = BlockSize * InputChannels * sizeof(float);
        const size_t OutputStrideBytes = BlockSize * OutputSize * sizeof(float);

        //
        // Use the kernel selected for the platform on AMD64, otherwise the
        // single generic kernel symbol.
        //

#if defined(MLAS_TARGET_AMD64)
        MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasPlatform.ConvPointwiseFloatKernel;
#else
        MLAS_CONV_POINTWISE_FLOAT_KERNEL* Kernel = MlasConvPointwiseFloatKernel;
#endif

        while (WorkRemaining > 0) {

            //
            // Compute the number of output blocks that can be computed in this
            // iteration. Unstrided convolutions can treat the input and output
            // as a single line which in turn allows the kernel to use wider
            // multiply/accumulate loops. Otherwise, a strided convolution can
            // output a single line at a time.
            //

            size_t WorkThisIteration;

            if (StrideHeight == 1 && StrideWidth == 1) {
                WorkThisIteration = (std::min)(WorkRemaining, OutputHeight - ph);
            } else {
                WorkThisIteration = 1;
            }

            const size_t OutputThisIteration = WorkThisIteration * OutputWidth;

            //
            // Apply the convolution kernel to batches of the input tensor.
            //
            // Shrinking the batch size causes a slowdown from additional
            // flushing of intermediate results to the output tensor. Extending
            // the batch sizes causes a slowdown from processor cache thrashing.
            //
            // N.B. The input row is located from the output row and the
            // height stride alone; padding is not taken into account.
            //

            const float* input = Input + BlockSize * (ph * StrideHeight * InputWidth);
            const float* filter = Filter;
            float* output = Output + BlockSize * ph * OutputWidth;

            size_t InputChannelBatch;

            for (size_t ic = 0; ic < InputChannels; ic += InputChannelBatch) {

                constexpr size_t MaximumInputChannelBatch = 128;

                InputChannelBatch = (std::min)(InputChannels - ic, MaximumInputChannelBatch);

                unsigned KernelFlags = ComputeKernelFlags(ic, InputChannelBatch);

                //
                // Invoke the convolution kernel.
                //
                // The batch size is passed as a count of NCHWc blocks.
                //

                Kernel(input, filter, output, StrideWidthBytes, InputChannelBatch /
                    BlockSize, FilterCount, InputStrideBytes, FilterStrideBytes,
                    OutputStrideBytes, OutputThisIteration, Bias, KernelFlags);

                //
                // Test for fused non-ReLU activation.
                //
                // This flag is only set for the final batch of input
                // channels, so the activation sees the completed output.
                //

                if ((KernelFlags & MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION) != 0) {
                    DoActivation(output, FilterCount, BlockSize * OutputThisIteration);
                }

                //
                // Advance by the maximum batch size rather than the actual
                // batch size. Only the final batch can be shorter than the
                // maximum, and the loop exits after it, so the overshoot is
                // never used.
                //

                input += MaximumInputChannelBatch * InputSize;
                filter += BlockSize * MaximumInputChannelBatch;
            }

            //
            // Advance the convolution state based on the completed work.
            //

            CompleteWork(WorkThisIteration);
        }
    }
};

//
// Implementation of the depthwise separable convolution algorithm.
//
// Depthwise separable convolutions are a form of grouped convolution where
// the number of input and output channels per group are one.
//

struct MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM : MLAS_NCHWC_CONV_ALGORITHM
{
    MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM(const MLAS_NCHWC_CONV_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_CONV_ALGORITHM(WorkBlock)
    {
    }

    //
    // Runs the share of the convolution assigned to thread Index. One unit
    // of work is one output line of one (batch, group block) combination,
    // where a group block is a run of BlockSize groups.
    //

    void Execute(int32_t Index)
    {
        //
        // Number of NCHWc blocks needed to cover all of the groups (rounded
        // up).
        //

        const size_t GroupBlockCount = ((GroupCount + BlockSize - 1) / BlockSize);

        const size_t TotalWork = BatchCount * GroupBlockCount * OutputHeight;

        size_t WorkIndex;
        size_t WorkRemaining;

        // NOTE(review): MlasPartitionWork is defined elsewhere; presumably it
        // returns the first work index and the amount of work for this thread.
        MlasPartitionWork(Index, WorkBlock->tids, TotalWork, &WorkIndex, &WorkRemaining);

        //
        // Extract the current batch, group block, and output line from the
        // starting work index.
        //

        size_t ph = WorkIndex % OutputHeight;
        const size_t BatchGroup = WorkIndex / OutputHeight;

        size_t Group = BatchGroup % GroupBlockCount;

        //
        // Advance the convolution buffer pointers to the current position
        // computed above.
        //
        // The output advances directly by the work index, as every unit of
        // work produces one blocked output line and the lines are written
        // consecutively.
        //

        Input += BatchGroup * BlockSize * InputSize;
        Output += WorkIndex * BlockSize * OutputWidth;
        Filter += Group * BlockSize * KernelSize;

        if (Bias != nullptr) {
            Bias += BlockSize * Group;
        }

        //
        // Loop until all of the work has been completed.
        //
        // The kernel takes its strides in bytes.
        //

        const size_t StrideWidthBytes = BlockSize * StrideWidth * sizeof(float);
        const size_t DilationWidthBytes = BlockSize * DilationWidth * sizeof(float);
        const size_t InputWidthBytes = BlockSize * InputWidth * sizeof(float);
        const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
        const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

        const size_t BlockedOutputWidth = BlockSize * OutputWidth;

        //
        // Use the kernel selected for the platform on AMD64, otherwise the
        // single generic kernel symbol.
        //

#if defined(MLAS_TARGET_AMD64)
        MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasPlatform.ConvDepthwiseFloatKernel;
#else
        MLAS_CONV_DEPTHWISE_FLOAT_KERNEL* Kernel = MlasConvDepthwiseFloatKernel;
#endif

        //
        // The flags are the same for every kernel invocation: the whole
        // input channel range is passed at once, so the bias and activation
        // flags are always considered and the accumulate flag is set only
        // when ZeroMode is false.
        //

        unsigned KernelFlags = ComputeKernelFlags(0, InputChannels);

        while (WorkRemaining > 0) {

            //
            // Constrain the effective kernel parameters if the output row uses
            // one or more input padding rows.
            //

            const float* filter = Filter;
            size_t ih;
            size_t EffectiveKernelHeight;

            ComputeEffectiveKernel(ph, BlockSize * KernelWidth, &filter, &ih, &EffectiveKernelHeight);

            //
            // Invoke the convolution kernel.
            //
            // N.B. The first argument is the input row offset back by the
            // left padding, computed with unsigned arithmetic. The start of
            // the actual input row and its width in bytes are also passed to
            // the kernel.
            //

            Kernel(Input + BlockSize * (ih * InputWidth - PaddingLeftX), filter,
                Output, StrideWidthBytes, DilationWidthBytes, InputStrideBytes,
                EffectiveKernelHeight, KernelWidth, Input + BlockSize * (ih * InputWidth),
                InputWidthBytes, DilatedInputWidthBytes, OutputCountLeftPadX,
                OutputCountX, OutputCountRightPadX, Bias, KernelFlags);

            //
            // Test for fused non-ReLU activation.
            //
            // A single filter block was written, so the row count is one.
            //

            if ((KernelFlags & MLAS_CONV_KERNEL_FLAG_OTHER_ACTIVATION) != 0) {
                DoActivation(Output, 1, BlockedOutputWidth);
            }

            Output += BlockedOutputWidth;

            //
            // Adjust the amount of work remaining and check if the end of an
            // output image has been reached.
            //

            WorkRemaining -= 1;

            if (++ph == OutputHeight) {

                //
                // Step to the next group block of the input, filter, and
                // bias.
                //

                Input += BlockSize * InputSize;
                Filter += BlockSize * KernelSize;

                if (Bias != nullptr) {
                    Bias += BlockSize;
                }

                //
                // Reset the filter and bias once all group blocks have been
                // processed, as they are reused for the next batch.
                //

                if (++Group == GroupBlockCount) {

                    Filter = WorkBlock->Filter;
                    Bias = WorkBlock->Bias;

                    Group = 0;
                }

                ph = 0;
            }
        }
    }
};

//
// Implementation of the pooling algorithm.
//

struct MLAS_NCHWC_POOL_ALGORITHM : MLAS_NCHWC_NN_ALGORITHM
{
#if !defined(MLAS_TARGET_AMD64)
    static const PMLAS_POOL_FLOAT_KERNEL PoolKernels[];
#endif

    const MLAS_NCHWC_POOL_WORK_BLOCK* WorkBlock;

    MLAS_NCHWC_POOL_ALGORITHM(const MLAS_NCHWC_POOL_WORK_BLOCK* WorkBlock) :
        MLAS_NCHWC_NN_ALGORITHM(WorkBlock),
        WorkBlock(WorkBlock)
    {
    }

    void Execute(int32_t Index)
    {
        const size_t TotalWork =
            ((BatchCount * InputChannels + BlockSize - 1) / BlockSize) * OutputHeight;

        size_t WorkIndex;
        size_t WorkRemaining;

        MlasPartitionWork(Index, WorkBlock->tids, TotalWork, &WorkIndex, &WorkRemaining);

        size_t ph = WorkIndex % OutputHeight;
        const size_t BatchChannel = WorkIndex / OutputHeight;

        const float* Input = WorkBlock->Input + BatchChannel * BlockSize * InputSize;
        float* Output = WorkBlock->Output + WorkIndex * BlockSize * OutputWidth;

        //
        // Loop until all of the work has been completed.
        //

        const size_t StrideWidthBytes = BlockSize * StrideWidth * sizeof(float);
        const size_t DilationWidthBytes = BlockSize * DilationWidth * sizeof(float);
        const size_t InputWidthBytes = BlockSize * InputWidth * sizeof(float);
        const size_t DilatedInputWidthBytes = BlockSize * DilationHeight * InputWidth * sizeof(float);
        const size_t InputStrideBytes = DilatedInputWidthBytes - KernelWidth * DilationWidthBytes;

#if defined(MLAS_TARGET_AMD64)
        MLAS_POOL_FLOAT_KERNEL* Kernel = MlasPlatform.PoolFloatKernel[WorkBlock->PoolingKind];
#else
        MLAS_POOL_FLOAT_KERNEL* Kernel = PoolKernels[WorkBlock->PoolingKind];
#endif

        while (WorkRemaining > 0) {

            //
            // Compute the first input row and kernel height. If this output row
            // uses padding from one or more input padding rows, then adjust the
            // kernel parameters to keep within the input bounds.
            //

            size_t ih = ph * StrideHeight - PaddingLeftY;
            size_t EffectiveKernelHeight = KernelHeight;

            if ((ph - OutputCountLeftPadY) >= OutputCountY) {

                size_t ihStep = ih;

                for (size_t kh = 0; kh < KernelHeight; kh++) {

                    if (ihStep >= InputHeight) {

                        if (ihStep == ih) {
                            ih += DilationHeight;
                        }

                        EffectiveKernelHeight -= 1;
                    }

                    ihStep += DilationHeight;
                }
            }

            //
            // Invoke the pooling kernel.
            //

            Kernel(Input + BlockSize * (ih * InputWidth - PaddingLeftX), Output,
                StrideWidthBytes, DilationWidthBytes, InputStrideBytes,
                KernelSize, EffectiveKernelHeight, KernelWidth,
                Input + BlockSize * (ih * InputWidth), InputWidthBytes,
                DilatedInputWidthBytes, OutputCountLeftPadX, OutputCountX,
                OutputCountRightPadX);

            Output += BlockSize * OutputWidth;

            //
            // Adjust the amount of work remaining and check if the end of an output
            // image has been reached.
            //

            WorkRemaining -= 1;

            if (++ph == OutputHeight) {

                Input += BlockSize * InputSize;

                ph = 0;
            }
        }
    }
};

#if !defined(MLAS_TARGET_AMD64)

//
// Pooling kernels used when the target is not AMD64. The pooling algorithm's
// Execute routine selects an entry using the work block's PoolingKind value
// as the index.
//
// N.B. The entry order presumably mirrors the MLAS_POOLING_KIND enumeration
// (maximum, average excluding padding, average including padding); confirm
// against the enumeration before reordering or adding entries.
//

const PMLAS_POOL_FLOAT_KERNEL MLAS_NCHWC_POOL_ALGORITHM::PoolKernels[] =
{
    MlasPoolMaximumFloatKernel,
    MlasPoolAverageExcludePadFloatKernel,
    MlasPoolAverageIncludePadFloatKernel,
};

#endif

void
MLASCALL
MlasNchwcConv(
    size_t Dimensions,
    const int64_t* InputShape,
    const int64_t* KernelShape,
    const int64_t* DilationShape,
    const int64_t* Padding,
    const int64_t* StrideShape,
    const int64_t* OutputShape,
    size_t GroupCount,
    const float* Input,
    const float* Filter,
    const float* Bias,
    float* Output,
    const MLAS_ACTIVATION* Activation,
    bool ZeroMode,
    MLAS_THREADPOOL* ThreadPool
    )
/*++

Routine Description:

    This routine performs a convolution using the NCHWc blocking format. It
    packages the parameters into a work block, selects one of the pointwise,
    NCHWc, depthwise, or NCHW algorithms from the resulting shape, and runs
    the selected algorithm across the worker threads.

Arguments:

    Dimensions - Supplies the number of dimensions.

    InputShape - Supplies the shape of the input tensor.

    KernelShape - Supplies the shape of the kernel.

    DilationShape - Supplies the dilation to apply along each dimension.

    Padding - Supplies the count of padding elements at the edges of the
        input tensor.

    StrideShape - Supplies the stride to apply along each dimension.

    OutputShape - Supplies the shape of the output tensor.

    GroupCount - Supplies the number of channel groups. The input and output
        channel counts of the work block are divided by this value.

    Input - Supplies the input tensor.

    Filter - Supplies the filter tensor.

    Bias - Optionally supplies the bias vector.

    Output - Supplies the output tensor.

    Activation - Supplies the parameters of the activation that is applied to
        the convolution output.

    ZeroMode - Supplies true when the output tensor must first be zero
        initialized, or false when the convolution accumulates into the
        existing output. This flag is used to implement Conv/Sum fusion.

    ThreadPool - Supplies the thread pool object to use, else nullptr if the
        base library threading support should be used.

Return Value:

    None.

--*/
{
    MLAS_NCHWC_CONV_WORK_BLOCK ConvWorkBlock;

    //
    // Record the convolution specific parameters.
    //

    ConvWorkBlock.Input = Input;
    ConvWorkBlock.Filter = Filter;
    ConvWorkBlock.Bias = Bias;
    ConvWorkBlock.Output = Output;
    ConvWorkBlock.Activation = Activation;
    ConvWorkBlock.ZeroMode = ZeroMode;
    ConvWorkBlock.GroupCount = GroupCount;

    //
    // Record the generic shape parameters, then convert the channel counts
    // to per group counts.
    //

    MlasNchwcPrepareWorkBlock(&ConvWorkBlock, Dimensions, InputShape, KernelShape,
        DilationShape, Padding, StrideShape, OutputShape);

    ConvWorkBlock.InputChannels /= GroupCount;
    ConvWorkBlock.OutputChannels /= GroupCount;

    //
    // Select the convolution algorithm from the shape parameters.
    //
    // N.B. The filter tensor must already be reordered into the format that
    // the selected algorithm expects, so the caller has to apply the same
    // selection rules as the ones below.
    //

    PMLAS_THREADED_ROUTINE ThreadedRoutine;

    if (ConvWorkBlock.InputChannels < MlasNchwcGetBlockSize()) {

        const bool SingleChannelGroups =
            ConvWorkBlock.InputChannels == 1 && ConvWorkBlock.OutputChannels == 1;

        if (SingleChannelGroups) {
            ThreadedRoutine = MlasNchwcThreaded<MLAS_NCHWC_CONV_DEPTHWISE_ALGORITHM>;
        } else {
            ThreadedRoutine = MlasNchwcThreaded<MLAS_NCHWC_CONV_NCHW_ALGORITHM>;
        }

    } else {

        const bool UnitKernel =
            ConvWorkBlock.KernelShape[0] == 1 && ConvWorkBlock.KernelShape[1] == 1;

        const bool Unpadded =
            ConvWorkBlock.Padding[0] == 0 && ConvWorkBlock.Padding[1] == 0 &&
            ConvWorkBlock.Padding[2] == 0 && ConvWorkBlock.Padding[3] == 0;

        if (UnitKernel && Unpadded) {
            ThreadedRoutine = MlasNchwcThreaded<MLAS_NCHWC_CONV_POINTWISE_ALGORITHM>;
        } else {
            ThreadedRoutine = MlasNchwcThreaded<MLAS_NCHWC_CONV_NCHWC_ALGORITHM>;
        }
    }

    //
    // Run the selected algorithm across the worker threads.
    //

    ConvWorkBlock.tids = MlasGetMaximumThreadCount(ThreadPool);

    MlasExecuteThreaded(ThreadedRoutine, &ConvWorkBlock, ConvWorkBlock.tids, ThreadPool);
}

void
MLASCALL
MlasNchwcPool(
    MLAS_POOLING_KIND PoolingKind,
    size_t Dimensions,
    const int64_t* InputShape,
    const int64_t* KernelShape,
    const int64_t* DilationShape,
    const int64_t* Padding,
    const int64_t* StrideShape,
    const int64_t* OutputShape,
    const float* Input,
    float* Output,
    MLAS_THREADPOOL* ThreadPool
    )
/*++

Routine Description:

    This routine performs a pooling operation using the NCHWc blocking
    format. It packages the parameters into a work block and runs the pooling
    algorithm across the worker threads.

Arguments:

    PoolingKind - Supplies the kind of pooling operation to perform.

    Dimensions - Supplies the number of dimensions.

    InputShape - Supplies the shape of the input tensor.

    KernelShape - Supplies the shape of the pooling kernel.

    DilationShape - Supplies the dilation to apply along each dimension.

    Padding - Supplies the count of padding elements at the edges of the
        input tensor.

    StrideShape - Supplies the stride to apply along each dimension.

    OutputShape - Supplies the shape of the output tensor.

    Input - Supplies the input tensor.

    Output - Supplies the output tensor.

    ThreadPool - Supplies the thread pool object to use, else nullptr if the
        base library threading support should be used.

Return Value:

    None.

--*/
{
    MLAS_NCHWC_POOL_WORK_BLOCK PoolWorkBlock;

    //
    // Record the pooling specific parameters.
    //

    PoolWorkBlock.PoolingKind = PoolingKind;
    PoolWorkBlock.Input = Input;
    PoolWorkBlock.Output = Output;

    //
    // Record the generic shape parameters.
    //

    MlasNchwcPrepareWorkBlock(&PoolWorkBlock, Dimensions, InputShape, KernelShape,
        DilationShape, Padding, StrideShape, OutputShape);

    //
    // Run the pooling algorithm across the worker threads.
    //

    PoolWorkBlock.tids = MlasGetMaximumThreadCount(ThreadPool);

    MlasExecuteThreaded(MlasNchwcThreaded<MLAS_NCHWC_POOL_ALGORITHM>, &PoolWorkBlock,
        PoolWorkBlock.tids, ThreadPool);
}

#if !defined(MLAS_TARGET_AMD64)

//
// Convolution and pooling kernel stubs for architectures that do not yet have
// native support.
//

void
MLASCALL
MlasConvNchwFloatKernel(
    const float* Input,
    const float* Filter,
    float* Output,
    size_t StrideWidth,
    size_t DilationWidth,
    size_t FilterCount,
    size_t InputStride,
    size_t FilterStride,
    size_t OutputStride,
    size_t KernelHeight,
    size_t KernelWidth,
    const float* InputBase,
    size_t InputWidth,
    size_t DilatedInputWidth,
    size_t OutputCountLeftPad,
    size_t OutputCount,
    size_t OutputCountRightPad,
    const float* Bias,
    unsigned Flags
    )
/*++

Routine Description:

    This routine is the placeholder for the NCHW convolution kernel that is
    compiled when the target is not AMD64. It performs no computation and
    does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(InputBase);
    MLAS_UNREFERENCED_PARAMETER(Filter);
    MLAS_UNREFERENCED_PARAMETER(Bias);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride and width values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(DilationWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(FilterStride);
    MLAS_UNREFERENCED_PARAMETER(OutputStride);
    MLAS_UNREFERENCED_PARAMETER(InputWidth);
    MLAS_UNREFERENCED_PARAMETER(DilatedInputWidth);

    //
    // Filter and kernel extents.
    //

    MLAS_UNREFERENCED_PARAMETER(FilterCount);
    MLAS_UNREFERENCED_PARAMETER(KernelHeight);
    MLAS_UNREFERENCED_PARAMETER(KernelWidth);

    //
    // Output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(OutputCountLeftPad);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCountRightPad);

    //
    // Kernel flags.
    //

    MLAS_UNREFERENCED_PARAMETER(Flags);
}

void
MLASCALL
MlasConvNchwcFloatKernel(
    const float* Input,
    const float* Filter,
    float* Output,
    size_t StrideWidth,
    size_t DilationWidth,
    size_t FilterCount,
    size_t InputStride,
    size_t FilterStride,
    size_t OutputStride,
    size_t KernelHeight,
    size_t KernelWidth,
    const float* InputBase,
    size_t InputWidth,
    size_t DilatedInputWidth,
    size_t OutputCountLeftPad,
    size_t OutputCount,
    size_t OutputCountRightPad,
    const float* Bias,
    unsigned Flags
    )
/*++

Routine Description:

    This routine is the placeholder for the NCHWc convolution kernel that is
    compiled when the target is not AMD64. It performs no computation and
    does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(InputBase);
    MLAS_UNREFERENCED_PARAMETER(Filter);
    MLAS_UNREFERENCED_PARAMETER(Bias);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride and width values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(DilationWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(FilterStride);
    MLAS_UNREFERENCED_PARAMETER(OutputStride);
    MLAS_UNREFERENCED_PARAMETER(InputWidth);
    MLAS_UNREFERENCED_PARAMETER(DilatedInputWidth);

    //
    // Filter and kernel extents.
    //

    MLAS_UNREFERENCED_PARAMETER(FilterCount);
    MLAS_UNREFERENCED_PARAMETER(KernelHeight);
    MLAS_UNREFERENCED_PARAMETER(KernelWidth);

    //
    // Output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(OutputCountLeftPad);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCountRightPad);

    //
    // Kernel flags.
    //

    MLAS_UNREFERENCED_PARAMETER(Flags);
}

void
MLASCALL
MlasConvDepthwiseFloatKernel(
    const float* Input,
    const float* Filter,
    float* Output,
    size_t StrideWidth,
    size_t DilationWidth,
    size_t InputStride,
    size_t KernelHeight,
    size_t KernelWidth,
    const float* InputBase,
    size_t InputWidth,
    size_t DilatedInputWidth,
    size_t OutputCountLeftPad,
    size_t OutputCount,
    size_t OutputCountRightPad,
    const float* Bias,
    unsigned Flags
    )
/*++

Routine Description:

    This routine is the placeholder for the depthwise convolution kernel that
    is compiled when the target is not AMD64. It performs no computation and
    does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(InputBase);
    MLAS_UNREFERENCED_PARAMETER(Filter);
    MLAS_UNREFERENCED_PARAMETER(Bias);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride and width values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(DilationWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(InputWidth);
    MLAS_UNREFERENCED_PARAMETER(DilatedInputWidth);

    //
    // Kernel extents.
    //

    MLAS_UNREFERENCED_PARAMETER(KernelHeight);
    MLAS_UNREFERENCED_PARAMETER(KernelWidth);

    //
    // Output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(OutputCountLeftPad);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCountRightPad);

    //
    // Kernel flags.
    //

    MLAS_UNREFERENCED_PARAMETER(Flags);
}

void
MLASCALL
MlasConvPointwiseFloatKernel(
    const float* Input,
    const float* Filter,
    float* Output,
    size_t StrideWidth,
    size_t InputChannels,
    size_t FilterCount,
    size_t InputStride,
    size_t FilterStride,
    size_t OutputStride,
    size_t OutputCount,
    const float* Bias,
    unsigned Flags
    )
/*++

Routine Description:

    This routine is the placeholder for the pointwise convolution kernel that
    is compiled when the target is not AMD64. It performs no computation and
    does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(Filter);
    MLAS_UNREFERENCED_PARAMETER(Bias);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(FilterStride);
    MLAS_UNREFERENCED_PARAMETER(OutputStride);

    //
    // Channel, filter, and output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(InputChannels);
    MLAS_UNREFERENCED_PARAMETER(FilterCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);

    //
    // Kernel flags.
    //

    MLAS_UNREFERENCED_PARAMETER(Flags);
}

void
MLASCALL
MlasPoolMaximumFloatKernel(
    const float* Input,
    float* Output,
    size_t StrideWidth,
    size_t DilationWidth,
    size_t InputStride,
    size_t ActualKernelSize,
    size_t KernelHeight,
    size_t KernelWidth,
    const float* InputBase,
    size_t InputWidth,
    size_t DilatedInputWidth,
    size_t OutputCountLeftPad,
    size_t OutputCount,
    size_t OutputCountRightPad
    )
/*++

Routine Description:

    This routine is the placeholder for the maximum pooling kernel that is
    compiled when the target is not AMD64. It performs no computation and
    does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(InputBase);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride and width values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(DilationWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(InputWidth);
    MLAS_UNREFERENCED_PARAMETER(DilatedInputWidth);

    //
    // Kernel extents.
    //

    MLAS_UNREFERENCED_PARAMETER(ActualKernelSize);
    MLAS_UNREFERENCED_PARAMETER(KernelHeight);
    MLAS_UNREFERENCED_PARAMETER(KernelWidth);

    //
    // Output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(OutputCountLeftPad);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCountRightPad);
}

void
MLASCALL
MlasPoolAverageExcludePadFloatKernel(
    const float* Input,
    float* Output,
    size_t StrideWidth,
    size_t DilationWidth,
    size_t InputStride,
    size_t ActualKernelSize,
    size_t KernelHeight,
    size_t KernelWidth,
    const float* InputBase,
    size_t InputWidth,
    size_t DilatedInputWidth,
    size_t OutputCountLeftPad,
    size_t OutputCount,
    size_t OutputCountRightPad
    )
/*++

Routine Description:

    This routine is the placeholder for the average pooling (exclude pad)
    kernel that is compiled when the target is not AMD64. It performs no
    computation and does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(InputBase);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride and width values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(DilationWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(InputWidth);
    MLAS_UNREFERENCED_PARAMETER(DilatedInputWidth);

    //
    // Kernel extents.
    //

    MLAS_UNREFERENCED_PARAMETER(ActualKernelSize);
    MLAS_UNREFERENCED_PARAMETER(KernelHeight);
    MLAS_UNREFERENCED_PARAMETER(KernelWidth);

    //
    // Output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(OutputCountLeftPad);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCountRightPad);
}

void
MLASCALL
MlasPoolAverageIncludePadFloatKernel(
    const float* Input,
    float* Output,
    size_t StrideWidth,
    size_t DilationWidth,
    size_t InputStride,
    size_t ActualKernelSize,
    size_t KernelHeight,
    size_t KernelWidth,
    const float* InputBase,
    size_t InputWidth,
    size_t DilatedInputWidth,
    size_t OutputCountLeftPad,
    size_t OutputCount,
    size_t OutputCountRightPad
    )
/*++

Routine Description:

    This routine is the placeholder for the average pooling (include pad)
    kernel that is compiled when the target is not AMD64. It performs no
    computation and does not write to the output buffer.

Arguments:

    All arguments are ignored.

Return Value:

    None.

--*/
{
    //
    // Buffer pointers.
    //

    MLAS_UNREFERENCED_PARAMETER(Input);
    MLAS_UNREFERENCED_PARAMETER(InputBase);
    MLAS_UNREFERENCED_PARAMETER(Output);

    //
    // Stride and width values.
    //

    MLAS_UNREFERENCED_PARAMETER(StrideWidth);
    MLAS_UNREFERENCED_PARAMETER(DilationWidth);
    MLAS_UNREFERENCED_PARAMETER(InputStride);
    MLAS_UNREFERENCED_PARAMETER(InputWidth);
    MLAS_UNREFERENCED_PARAMETER(DilatedInputWidth);

    //
    // Kernel extents.
    //

    MLAS_UNREFERENCED_PARAMETER(ActualKernelSize);
    MLAS_UNREFERENCED_PARAMETER(KernelHeight);
    MLAS_UNREFERENCED_PARAMETER(KernelWidth);

    //
    // Output counts.
    //

    MLAS_UNREFERENCED_PARAMETER(OutputCountLeftPad);
    MLAS_UNREFERENCED_PARAMETER(OutputCount);
    MLAS_UNREFERENCED_PARAMETER(OutputCountRightPad);
}

#endif