### Description

Convolution for fp16 datatype. Use NHWC for computation. For NCHW input,
it rearranges the input tensor to NHWC format before computing the
result.

Support two optional fusion:

1. Activation
2. Add (not yet implemented)

### Motivation and Context

Accelerating fp16 inference
This commit is contained in:
Chen Fu 2023-03-21 10:32:43 -07:00 committed by GitHub
parent a3570eb5bf
commit 34175f0b7c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1924 additions and 0 deletions

View file

@ -21,6 +21,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
${MLAS_SRC_DIR}/sgemm.cpp
${MLAS_SRC_DIR}/halfgemm.cpp
${MLAS_SRC_DIR}/qgemm.cpp
${MLAS_SRC_DIR}/dwconv.cpp
${MLAS_SRC_DIR}/qdwconv.cpp
${MLAS_SRC_DIR}/convolve.cpp
${MLAS_SRC_DIR}/convsym.cpp
@ -326,6 +327,7 @@ else()
)
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
onnxruntime_add_static_library(onnxruntime_mlas_arm64 ${mlas_platform_srcs})

View file

@ -16,6 +16,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv);
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, FusedConv);
#endif
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling);
@ -198,6 +201,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv)>,
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, FusedConv)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling)>,

View file

@ -1603,3 +1603,41 @@ MlasHalfGemmConvertPackB(
size_t ldb,
void* PackedB
);
/**
* @brief Indirect Depthwise convolution for fp16
* @param Input Supplies the indirect buffer for NHWC input
* @param Filter Supplies the address for filter tensor
* @param Output Supplies the address for the result tensor
* @param Channels # of input channels
* @param OutputCount # of output pixels
* @param KernelSize # kernel size
* @return
*/
void
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
);
inline
void
MlasTranspose(
const MLAS_FP16* Input,
MLAS_FP16* Output,
size_t M,
size_t N
)
{
MlasTranspose(
reinterpret_cast<const uint16_t*>(Input),
reinterpret_cast<uint16_t*>(Output),
M, N);
}

View file

@ -0,0 +1,154 @@
/*++
Copyright (c) Microsoft Corporation. All rights reserved.
Licensed under the MIT License.
Module Name:
dwconv.cpp
Abstract:
This module implements the half precision floating point depthwise convolution routines.
--*/
#include "fp16_common.h"
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
MLAS_FORCEINLINE
void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
{
while (OutputCount > 0) {
size_t ChannelOffset = 0;
size_t c = Channels;
while (c >= 8) {
MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8();
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
MLAS_FLOAT16X8 InputVector = MlasLoadFloat16x8(&Input[k][ChannelOffset]);
MLAS_FLOAT16X8 FilterVector = MlasLoadFloat16x8(&Filter[ChannelKernelOffset]);
Accumulator = MlasMultiplyAddFloat16x8(InputVector, FilterVector, Accumulator);
ChannelKernelOffset += Channels;
}
MlasStoreFloat16x8(Output, Accumulator);
Output += 8;
ChannelOffset += 8;
c -= 8;
}
if (c >= 4) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
MLAS_FLOAT16X4 InputVector = MlasLoadFloat16x4(&Input[k][ChannelOffset]);
MLAS_FLOAT16X4 FilterVector = MlasLoadFloat16x4(&Filter[ChannelKernelOffset]);
Accumulator = MlasMultiplyAddFloat16x4(InputVector, FilterVector, Accumulator);
ChannelKernelOffset += Channels;
}
MlasStoreFloat16x4(Output, Accumulator);
Output += 4;
ChannelOffset += 4;
c -= 4;
}
if (c > 0) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
MLAS_FLOAT16X4 InputValue = MlasLoadFloat16x4(&Input[k][ChannelOffset]);
MLAS_FLOAT16X4 FilterValue = MlasLoadFloat16x4(&Filter[ChannelKernelOffset]);
Accumulator = MlasMultiplyAddFloat16x4(InputValue, FilterValue, Accumulator);
ChannelKernelOffset += Channels;
}
MlasStorePartialFloat16x4(Output, Accumulator, c);
Output += c;
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
}
Input += KernelSize;
OutputCount -= 1;
}
}
#else
MLAS_FORCEINLINE
void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
{
while (OutputCount > 0) {
for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) {
float Accumulator = 0.0f;
size_t ChannelKernelOffset = ChannelOffset;
for (size_t k = 0; k < KernelSize; k++) {
Accumulator += MLAS_Half2Float(Input[k][ChannelOffset]) * MLAS_Half2Float(Filter[ChannelKernelOffset]);
ChannelKernelOffset += Channels;
}
*Output++ = MLAS_Float2Half(Accumulator);
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
}
Input += KernelSize;
OutputCount -= 1;
}
}
#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
void
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
{
MlasConvDepthwiseKernel(
reinterpret_cast<const _mlas_fp16_* const*>(Input),
reinterpret_cast<const _mlas_fp16_*>(Filter),
reinterpret_cast<_mlas_fp16_*>(Output),
Channels,
OutputCount,
KernelSize,
PostProc);
}

View file

@ -52,6 +52,10 @@ MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasZeroFloat16x8(void) { return vreinterpretq_f16_f32(vdupq_n_f32(0.0f)); }
MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasZeroFloat16x4(void) { return vreinterpret_f16_f32(vdup_n_f32(0.0f)); }
MLAS_FORCEINLINE
MLAS_FLOAT16X8
MlasLoadFloat16x8(const _mlas_fp16_* Buffer) { return vreinterpretq_f16_u16(vld1q_u16(Buffer)); }

View file

@ -441,6 +441,9 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Av
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv);
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv);
#endif
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, SequenceLength);
@ -1474,6 +1477,9 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv)>,
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv)>,
#endif
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose)>,
BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, SequenceLength)>,

View file

@ -0,0 +1,602 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//
// This file contains implementation of a fp16 convolution operator.
//
#include "core/mlas/inc/mlas.h"
#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
#include "core/common/safeint.h"
#include "core/framework/float16.h"
#include "core/framework/op_kernel.h"
#include "core/providers/cpu/nn/conv_attributes.h"
#include "contrib_ops/cpu/fused_activation.h"
namespace onnxruntime {
using ConvPadVector = ConvAttributes::ConvPadVector;
/**
* @brief Convolution Operator for FP16 tensors
*
* With two optional fused operations:
*
* 1. Add
* It takes an extra (optional) input Sum, a tensor same shape as the output.
* Sum is added to the output tensor.
*
* 2. Activation
* It takes an operator attribute 'activation', which supplies the activation info.
*
* Add is performed BEFORE activation.
*
* The implementation runs faster with NHWC. By default, it converts NCHW to NHWC
* before processing, and convert the result back. It can take NHWC tensors directly.
* Use operator attribute 'channels_last' to specify that the data layout is NHWC.
*
*/
class FusedConvFp16 final : public OpKernel {
public:
FusedConvFp16(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info) {
ORT_ENFORCE(GetFusedActivationAttr(info, activation_).IsOK());
channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);
}
Status Compute(OpKernelContext* context) const override;
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) override;
Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
int input_idx,
/*out*/ bool& used_shared_buffers) override;
private:
/**
* @brief Reorder filter data to facilitate compute.
*
* Based on Conv operator spec, filters are organized as (M x C/group x kH x kW),
* where C is the number of input channels, and kH and kW are the height and width
* of the kernel, and M is the number of feature maps. We need to change it into
* (kH x kW x C/group) x M, forming a matrix of M columns, where each kernel is a
* single column in channel last format.
*
* @param input
* @param output
* @param output_channels number of feature maps
* @param input_channels
* @param kernel_size kH x kW
*/
static void ReorderFilter(const MLFloat16* input,
MLFloat16* output,
size_t output_channels,
size_t input_channels,
size_t kernel_size) {
for (size_t k = 0; k < kernel_size; k++) {
for (size_t ic = 0; ic < input_channels; ic++) {
for (size_t oc = 0; oc < output_channels; oc++) {
size_t index = (oc * input_channels * kernel_size) + (ic * kernel_size) + k;
*output++ = input[index];
}
}
}
}
MLAS_ACTIVATION activation_;
ConvAttributes conv_attrs_;
bool channels_last_{false};
TensorShape W_shape_;
BufferUniquePtr packed_W_buffer_;
size_t packed_W_size_{0};
bool is_W_packed_{false};
BufferUniquePtr reordered_W_buffer_;
};
Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) {
is_packed = false;
if (input_idx != 1) {
// Only pack filter tensor (aka weights)
return Status::OK();
}
const auto& shape = tensor.Shape().GetDims();
size_t rank = shape.size();
if (rank <= 2) {
return Status::OK();
}
const int64_t M = shape[0];
const int64_t C = shape[1];
// Verify that the total number of output channels is a multiple of the group count.
if (M % conv_attrs_.group != 0) {
return Status::OK();
}
// Note: The tensor has already been allocated with this tensor shape, so all
// shape indices are guaranteed to fit inside size_t.
const size_t output_channels = static_cast<size_t>(M);
const size_t group_input_channels = static_cast<size_t>(C);
const size_t kernel_size =
static_cast<size_t>(std::accumulate(shape.data() + 2, shape.data() + rank, 1LL, std::multiplies<int64_t>()));
const auto* Wdata = static_cast<const MLFloat16*>(tensor.DataRaw());
W_shape_ = shape;
const size_t group_count = static_cast<size_t>(conv_attrs_.group);
const size_t group_output_channels = output_channels / group_count;
const size_t kernel_dim = group_input_channels * kernel_size;
bool share_prepacked_weights = (prepacked_weights != nullptr);
// Don't pack the filter buffer if the MlasConvDepthwise path is used.
if (!(group_input_channels == 1 && group_output_channels == 1)) {
packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false);
if (packed_W_size_ != 0) {
size_t packed_W_data_size = SafeInt<size_t>(group_count) * packed_W_size_;
auto* packed_W = static_cast<MLFloat16*>(alloc->Alloc(packed_W_data_size));
// Initialize memory to 0 as there could be some padding associated with pre-packed
// buffer memory and we don not want it uninitialized and generate different hashes
// if and when we try to cache this pre-packed buffer for sharing between sessions.
memset((void*)packed_W, 0, packed_W_data_size);
packed_W_buffer_ = BufferUniquePtr(packed_W, BufferDeleter(alloc));
// Allocate a temporary buffer to hold the reordered oihw->hwio filter for
// a single group.
//
// Note: The size of this buffer is less than or equal to the size of the original
// weight tensor, so the allocation size is guaranteed to fit inside size_t.
auto* group_reordered_W = static_cast<MLFloat16*>(
alloc->Alloc(group_output_channels * kernel_dim * sizeof(MLFloat16)));
BufferUniquePtr group_reordered_W_buffer(group_reordered_W, BufferDeleter(alloc));
const size_t W_offset = group_output_channels * kernel_dim;
for (int64_t group_id = 0; group_id < conv_attrs_.group; ++group_id) {
ReorderFilter(Wdata, group_reordered_W, group_output_channels, group_input_channels, kernel_size);
MlasHalfGemmPackB(group_output_channels, kernel_dim, group_reordered_W, group_output_channels, packed_W);
packed_W += packed_W_size_;
Wdata += W_offset;
}
if (share_prepacked_weights) {
prepacked_weights->buffers_.push_back(std::move(packed_W_buffer_));
prepacked_weights->buffer_sizes_.push_back(packed_W_data_size);
}
is_W_packed_ = true;
is_packed = true;
return Status::OK();
}
}
if (share_prepacked_weights) {
prepacked_weights->buffers_.push_back(nullptr); // packed_W_buffer_ is nullptr
prepacked_weights->buffer_sizes_.push_back(0);
}
size_t reordered_w_data_size = SafeInt<size_t>(sizeof(MLFloat16)) * output_channels * kernel_dim;
auto* reordered_W = static_cast<MLFloat16*>(alloc->Alloc(reordered_w_data_size));
// Initialize memory to 0 as there could be some padding associated with pre-packed
// buffer memory and we don not want it uninitialized and generate different hashes
// if and when we try to cache this pre-packed buffer for sharing between sessions.
memset((void*)reordered_W, 0, reordered_w_data_size);
reordered_W_buffer_ = BufferUniquePtr(reordered_W, BufferDeleter(alloc));
ReorderFilter(Wdata, reordered_W, output_channels, group_input_channels, kernel_size);
if (share_prepacked_weights) {
prepacked_weights->buffers_.push_back(std::move(reordered_W_buffer_));
prepacked_weights->buffer_sizes_.push_back(reordered_w_data_size);
}
is_W_packed_ = true;
is_packed = true;
return Status::OK();
}
Status FusedConvFp16::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
int input_idx,
/*out*/ bool& used_shared_buffers) {
if (input_idx != 1) {
// only the filter tensor is packed
return Status::OK();
}
used_shared_buffers = true;
if (prepacked_buffers.size() == 1) { // This means that only packed_W_ exists
packed_W_buffer_ = std::move(prepacked_buffers[0]);
} else if (prepacked_buffers.size() == 2) { // This means that only reordered_W_ exists
// Enforce that the first "placeholder" buffer is nullptr
ORT_ENFORCE(prepacked_buffers[0].get() == nullptr);
reordered_W_buffer_ = std::move(prepacked_buffers[1]);
}
return Status::OK();
}
Status FusedConvFp16::Compute(OpKernelContext* context) const {
size_t num_inputs = OpKernel::Node().InputDefs().size();
const Tensor* X = context->Input<Tensor>(0);
const Tensor* W = is_W_packed_? nullptr : context->Input<Tensor>(1);
const auto& W_shape = W ? W->Shape() : W_shape_;
const Tensor* B = num_inputs >= 3 ? context->Input<Tensor>(2) : nullptr;
// TODO!!
// This tensor should be added to the result before activation is applied
// We need to augment the post processor to accept an addition operation.
// const Tensor* Sum = num_inputs >= 4 ? context->Input<Tensor>(3) : nullptr;
const int64_t N = X->Shape()[0];
const int64_t M = W_shape[0];
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W_shape, channels_last_));
TensorShapeVector kernel_shape;
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W_shape, kernel_shape));
const size_t kernel_rank = kernel_shape.size();
ConvPadVector pads(conv_attrs_.pads);
if (pads.empty()) {
pads.resize(kernel_rank * 2, 0);
}
TensorShapeVector dilations(conv_attrs_.dilations);
if (dilations.empty()) {
dilations.resize(kernel_rank, 1);
}
TensorShapeVector strides(conv_attrs_.strides);
if (strides.empty()) {
strides.resize(kernel_rank, 1);
}
const int64_t C = X->Shape()[channels_last_ ? 1 + kernel_rank : 1];
const size_t spatial_dim_start = channels_last_ ? 1 : 2;
const size_t spatial_dim_end = spatial_dim_start + kernel_rank;
TensorShapeVector Y_dims({N});
if (!channels_last_) {
Y_dims.push_back(M);
}
TensorShape input_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end);
ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
if (channels_last_) {
Y_dims.push_back(M);
}
Tensor* Y = context->Output(0, TensorShape(Y_dims));
TensorShape output_shape = Y->Shape().Slice(spatial_dim_start, spatial_dim_end);
// Bail out early if one of the dimensions is zero.
if (Y->Shape().Size() == 0) {
return Status::OK();
}
const int64_t input_image_size = input_shape.Size();
const int64_t output_image_size = output_shape.Size();
const int64_t kernel_size = TensorShape(kernel_shape).Size();
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
// Handle the case of a dynamic weight filter.
BufferUniquePtr reordered_W_buffer;
MLFloat16* reordered_W = nullptr;
if (!packed_W_buffer_) {
if (reordered_W_buffer_) {
// Weight was constant and reordered.
reordered_W = static_cast<MLFloat16*>(reordered_W_buffer_.get());
} else {
// Weight tensor was not constant or prepacking is disabled.
reordered_W = static_cast<MLFloat16*>(alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * W_shape.Size()));
reordered_W_buffer = BufferUniquePtr(reordered_W, BufferDeleter(alloc));
ReorderFilter(
static_cast<const MLFloat16*>(W->DataRaw()),
reordered_W,
static_cast<size_t>(M),
static_cast<size_t>(W_shape[1]),
static_cast<size_t>(kernel_size));
}
}
int64_t group_count = conv_attrs_.group;
int64_t group_input_channels = W_shape[1];
int64_t group_output_channels = M / group_count;
// Test for depthwise convolution.
const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
if (is_depthwise_conv) {
// Update the input and output channels to the number of groups in order to
// reuse as much of the below standard convolution path.
group_input_channels = group_count;
group_output_channels = group_count;
group_count = 1;
}
const int64_t X_offset = C * input_image_size;
const int64_t Y_offset = M * output_image_size;
const int64_t kernel_dim = group_input_channels * kernel_size;
const int64_t col_buffer_size = kernel_dim * output_image_size;
const auto* Xdata = X->Data<MLFloat16>();
const auto* Bdata = B != nullptr ? B->Data<MLFloat16>() : nullptr;
auto* Ydata = Y->MutableData<MLFloat16>();
BufferUniquePtr transpose_input_buffer;
BufferUniquePtr transpose_output_buffer;
// Allocate temporary buffers for transposing to channels last format.
if (!channels_last_) {
auto* transpose_input = alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * X_offset + MLAS_SYMM_QGEMM_BUF_OVERRUN);
transpose_input_buffer = BufferUniquePtr(transpose_input, BufferDeleter(alloc));
auto* transpose_output = alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * Y_offset);
transpose_output_buffer = BufferUniquePtr(transpose_output, BufferDeleter(alloc));
}
BufferUniquePtr col_buffer;
BufferUniquePtr indirection_buffer;
size_t ind_buf_length = 0;
std::vector<MLFloat16> padding_data;
bool use_indirection_buffer = false;
if (is_depthwise_conv) {
use_indirection_buffer = true;
} else if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
// if (is_symmetric_conv_) {
// use_indirection_buffer = true;
// } else {
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
int64_t group_col_buffer_size = (kernel_rank > 2) ? group_count * col_buffer_size : col_buffer_size;
group_col_buffer_size += MLAS_SYMM_QGEMM_BUF_OVERRUN;
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * group_col_buffer_size);
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
memset(col_data, 0, SafeInt<size_t>(sizeof(MLFloat16)) * group_col_buffer_size);
// }
}
// bool parallel_batch = is_symmetric_conv_ && channels_last_;
if (use_indirection_buffer) {
// Allocate indirection buffer pointers and prepare a padding vector for
// the im2col transform.
ind_buf_length = SafeInt<size_t>(sizeof(const MLFloat16*)) * kernel_size * output_image_size;
// if (parallel_batch)
// ind_buf_length *= SafeInt<size_t>(N); // ind buffer per each image in the batch
auto* indirection_data = alloc->Alloc(ind_buf_length);
indirection_buffer = BufferUniquePtr(indirection_data, BufferDeleter(alloc));
padding_data.resize(static_cast<size_t>(C), MLFloat16());
}
concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool();
/*************************************
* Thread partition idea: we are essentially partition a GEMM A[M,K] x B[K,N].
* Here B contains the conv filters, which are usually not big, so we assume
* it can be in cache entirely. Then we simply partition A horizontally into
* thin slices along M dimension. This would ensure that the slice of A fits
* into the cache and reduce the chance of kernel waiting for memory.
*
* The thickness of A slice should be multiple of kernel stride M. Since
* we have to choose from many different kernels, the logic of finding
* the stride M is hacky.
*/
// The following convoluted branches must match the kernel selection logic
// in conv_worker.
const int32_t stride_m = 6;
const int64_t task_count = (output_image_size + stride_m - 1) / stride_m;
for (int64_t image_id = 0; image_id < N; ++image_id) {
const auto* input_data = Xdata;
auto* output_data = Ydata;
if (!channels_last_) {
// Transpose the input from channels first (CHW) to channels last (HWC).
MlasTranspose(
Xdata,
static_cast<MLFloat16*>(transpose_input_buffer.get()),
static_cast<size_t>(C),
static_cast<size_t>(input_image_size));
input_data = static_cast<MLFloat16*>(transpose_input_buffer.get());
output_data = static_cast<MLFloat16*>(transpose_output_buffer.get());
}
// Threaded implementation of ND convolution is not yet supported, so
// prepare all im2col transformations here.
if (col_buffer && kernel_rank > 2) {
for (int64_t group_id = 0; group_id < group_count; ++group_id) {
math::Im2col<MLFloat16, StorageOrder::NHWC>()(
input_data + group_id * group_input_channels,
group_input_channels,
C,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_shape.data(),
strides.data(),
dilations.data(),
pads.data(),
static_cast<int64_t>(kernel_rank),
static_cast<MLFloat16*>(col_buffer.get()) + group_id * col_buffer_size,
MLFloat16());
}
}
auto conv_worker = [&](ptrdiff_t batch) {
int64_t output_start = (int64_t)batch * (int64_t)stride_m;
int64_t output_count = std::min((int64_t)stride_m, output_image_size - output_start);
MLFloat16 const** worker_indirection_buffer = nullptr;
if (indirection_buffer) {
worker_indirection_buffer = static_cast<MLFloat16 const**>(indirection_buffer.get()) + output_start * kernel_size;
math::Im2col<MLFloat16, StorageOrder::NHWC>()(
input_data,
C,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_shape.data(),
strides.data(),
dilations.data(),
pads.data(),
static_cast<ptrdiff_t>(kernel_rank),
output_start,
output_count,
worker_indirection_buffer,
padding_data.data());
}
auto* worker_output = output_data + output_start * M;
if (is_depthwise_conv) {
// TODO!! add Sum tensor to activation
MLAS_HALF_GEMM_ACTIVATION_PROCESSOR act(activation_);
MlasConvDepthwise(
worker_indirection_buffer,
reordered_W,
worker_output,
static_cast<size_t>(M),
static_cast<size_t>(output_count),
static_cast<size_t>(kernel_size),
&act);
} else {
for (int64_t group_id = 0; group_id < group_count; ++group_id) {
// Prepare the im2col transformation or use the input buffer directly for
// pointwise convolutions.
const auto* group_input_data = input_data + group_id * group_input_channels;
const MLFloat16* AData;
size_t lda;
if (col_buffer) {
auto* worker_col_buffer = static_cast<MLFloat16*>(col_buffer.get()) + output_start * kernel_dim;
if (kernel_rank == 2) {
math::Im2col<MLFloat16, StorageOrder::NHWC>()(
group_input_data,
group_input_channels,
C,
input_shape[0],
input_shape[1],
kernel_shape[0],
kernel_shape[1],
dilations[0],
dilations[1],
pads[0],
pads[1],
strides[0],
strides[1],
output_shape[1],
output_start,
output_count,
worker_col_buffer,
MLFloat16());
} else if (kernel_rank == 1) {
math::Im2col<MLFloat16, StorageOrder::NHWC>()(
group_input_data,
group_input_channels,
C,
1,
input_shape[0],
1,
kernel_shape[0],
1,
dilations[0],
0,
pads[0],
1,
strides[0],
output_shape[0],
output_start,
output_count,
worker_col_buffer,
MLFloat16());
} else {
// Use the im2col buffer prepared outside the thread, indexed by group.
worker_col_buffer += group_id * col_buffer_size;
}
AData = reinterpret_cast<const MLFloat16*>(worker_col_buffer);
lda = static_cast<size_t>(kernel_dim);
} else {
AData = reinterpret_cast<const MLFloat16*>(group_input_data + output_start * C);
lda = static_cast<size_t>(C);
}
// TODO!! add Sum tensor to activation
MLAS_HALF_GEMM_ACTIVATION_PROCESSOR act(activation_);
MLAS_HALF_GEMM_DATA_PARAMS gemm_params;
gemm_params.A = AData;
gemm_params.lda = lda;
if (packed_W_buffer_) {
gemm_params.B = static_cast<const MLFloat16*>(packed_W_buffer_.get()) + group_id * packed_W_size_,
gemm_params.ldb = 0;
} else {
gemm_params.B = reordered_W + group_id * group_output_channels,
gemm_params.ldb = static_cast<size_t>(M);
}
gemm_params.C = worker_output + group_id * group_output_channels;
gemm_params.ldc = static_cast<size_t>(M);
gemm_params.Bias = Bdata;
gemm_params.OutputProcessor = &act; // process fused activation and add
MlasHalfGemmBatch(
static_cast<size_t>(output_count),
static_cast<size_t>(group_output_channels),
static_cast<size_t>(kernel_dim),
1, &gemm_params, thread_pool);
}
}
};
concurrency::ThreadPool::TrySimpleParallelFor(thread_pool, onnxruntime::narrow<ptrdiff_t>(task_count), conv_worker);
if (!channels_last_) {
// Transpose the output from channels last (NHWC) to channels first (NCHW).
MlasTranspose(
output_data,
Ydata,
static_cast<size_t>(output_image_size),
static_cast<size_t>(M));
}
Xdata += X_offset;
Ydata += Y_offset;
}
return Status::OK();
}
ONNX_CPU_OPERATOR_TYPED_KERNEL(
Conv,
11,
MLFloat16,
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
FusedConvFp16);
#ifndef DISABLE_CONTRIB_OPS
namespace contrib {
ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
FusedConv,
1,
MLFloat16,
KernelDefBuilder()
.TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
FusedConvFp16);
} // namespace contrib
#endif
} // namespace onnxruntime
#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

View file

@ -17,6 +17,7 @@
#include "core/util/math_cpuonly.h"
#include "core/util/math.h"
#include "core/framework/float16.h"
#include <algorithm>
#include "core/common/narrow.h"
@ -654,6 +655,7 @@ void Im2col<T, StorageOrder::NHWC>::operator()(
template struct Im2col<int8_t, StorageOrder::NHWC>;
template struct Im2col<uint8_t, StorageOrder::NHWC>;
template struct Im2col<MLFloat16, StorageOrder::NHWC>;
template <>
void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64_t channels, int64_t height,

File diff suppressed because it is too large Load diff