FP16 conv (#15062)

### Description Convolution for fp16 datatype. Use NHWC for computation. For NCHW input, it rearranges the input tensor to NHWC format before computing the result. Support two optional fusion: 1. Activation 2. Add (not yet implemented) ### Motivation and Context Accelerating fp16 inference
2026-06-24 02:47:54 +00:00 · 2023-03-21 10:32:43 -07:00 · 2023-03-21 10:32:43 -07:00 · 34175f0b7c
commit 34175f0b7c
parent a3570eb5bf
9 changed files with 1924 additions and 0 deletions
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@ -21,6 +21,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
  ${MLAS_SRC_DIR}/sgemm.cpp
  ${MLAS_SRC_DIR}/halfgemm.cpp
  ${MLAS_SRC_DIR}/qgemm.cpp
+  ${MLAS_SRC_DIR}/dwconv.cpp
  ${MLAS_SRC_DIR}/qdwconv.cpp
  ${MLAS_SRC_DIR}/convolve.cpp
  ${MLAS_SRC_DIR}/convsym.cpp
@ -326,6 +327,7 @@ else()
        )
        set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
        set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
+        set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")

        if(ONNXRUNTIME_MLAS_MULTI_ARCH)
            onnxruntime_add_static_library(onnxruntime_mlas_arm64 ${mlas_platform_srcs})
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@ -16,6 +16,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv);
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, FusedConv);
+#endif
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling);
@ -198,6 +201,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv)>,
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, MLFloat16, FusedConv)>,
+#endif
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, Sampling)>,
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@ -1603,3 +1603,41 @@ MlasHalfGemmConvertPackB(
    size_t ldb,
    void* PackedB
    );
+
+/**
+ * @brief Indirect Depthwise convolution for fp16
+ * @param Input         Supplies the indirect buffer for NHWC input
+ * @param Filter        Supplies the address for filter tensor
+ * @param Output        Supplies the address for the result tensor
+ * @param Channels      # of input channels
+ * @param OutputCount   # of output pixels
+ * @param KernelSize    # kernel size
+ * @return 
+*/
+void
+MLASCALL
+MlasConvDepthwise(
+    const MLAS_FP16* const* Input,
+    const MLAS_FP16* Filter,
+    MLAS_FP16* Output,
+    size_t Channels,
+    size_t OutputCount,
+    size_t KernelSize,
+    MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
+    );
+
+
+inline
+void
+MlasTranspose(
+    const MLAS_FP16* Input,
+    MLAS_FP16* Output,
+    size_t M,
+    size_t N
+    )
+{
+    MlasTranspose(
+        reinterpret_cast<const uint16_t*>(Input),
+        reinterpret_cast<uint16_t*>(Output),
+        M, N);
+}
--- a/onnxruntime/core/mlas/lib/dwconv.cpp
+++ b/onnxruntime/core/mlas/lib/dwconv.cpp
@ -0,0 +1,154 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    dwconv.cpp
+
+Abstract:
+
+    This module implements the half precision floating point depthwise convolution routines.
+
+--*/
+
+
+#include "fp16_common.h"
+
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+
+MLAS_FORCEINLINE
+void
+MlasConvDepthwiseKernel(
+    const _mlas_fp16_* const* Input,
+    const _mlas_fp16_* Filter,
+    _mlas_fp16_* Output,
+    size_t Channels,
+    size_t OutputCount,
+    size_t KernelSize,
+    MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
+    )
+{
+    while (OutputCount > 0) {
+        size_t ChannelOffset = 0;
+        size_t c = Channels;
+
+        while (c >= 8) {
+            MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8();
+            size_t ChannelKernelOffset = ChannelOffset;
+
+            for (size_t k = 0; k < KernelSize; k++) {
+                MLAS_FLOAT16X8 InputVector = MlasLoadFloat16x8(&Input[k][ChannelOffset]);
+                MLAS_FLOAT16X8 FilterVector = MlasLoadFloat16x8(&Filter[ChannelKernelOffset]);
+
+                Accumulator = MlasMultiplyAddFloat16x8(InputVector, FilterVector, Accumulator);
+                ChannelKernelOffset += Channels;
+            }
+            MlasStoreFloat16x8(Output, Accumulator);
+            Output += 8;
+
+            ChannelOffset += 8;
+            c -= 8;
+        }
+
+        if (c >= 4) {
+            MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
+            size_t ChannelKernelOffset = ChannelOffset;
+
+            for (size_t k = 0; k < KernelSize; k++) {
+                MLAS_FLOAT16X4 InputVector = MlasLoadFloat16x4(&Input[k][ChannelOffset]);
+                MLAS_FLOAT16X4 FilterVector = MlasLoadFloat16x4(&Filter[ChannelKernelOffset]);
+
+                Accumulator = MlasMultiplyAddFloat16x4(InputVector, FilterVector, Accumulator);
+                ChannelKernelOffset += Channels;
+            }
+            MlasStoreFloat16x4(Output, Accumulator);
+            Output += 4;
+
+            ChannelOffset += 4;
+            c -= 4;
+        }
+
+        if (c > 0) {
+            MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
+            size_t ChannelKernelOffset = ChannelOffset;
+
+            for (size_t k = 0; k < KernelSize; k++) {
+                MLAS_FLOAT16X4 InputValue = MlasLoadFloat16x4(&Input[k][ChannelOffset]);
+                MLAS_FLOAT16X4 FilterValue = MlasLoadFloat16x4(&Filter[ChannelKernelOffset]);
+
+                Accumulator = MlasMultiplyAddFloat16x4(InputValue, FilterValue, Accumulator);
+                ChannelKernelOffset += Channels;
+            }
+            MlasStorePartialFloat16x4(Output, Accumulator, c);
+            Output += c;
+        }
+        if (PostProc) {
+            PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
+                              Channels);
+        }
+        Input += KernelSize;
+        OutputCount -= 1;
+    }
+}
+
+#else
+
+MLAS_FORCEINLINE
+void
+MlasConvDepthwiseKernel(
+    const _mlas_fp16_* const* Input,
+    const _mlas_fp16_* Filter,
+    _mlas_fp16_* Output,
+    size_t Channels,
+    size_t OutputCount,
+    size_t KernelSize,
+    MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
+    )
+{
+    while (OutputCount > 0) {
+        for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) {
+            float Accumulator = 0.0f;
+            size_t ChannelKernelOffset = ChannelOffset;
+
+            for (size_t k = 0; k < KernelSize; k++) {
+                Accumulator += MLAS_Half2Float(Input[k][ChannelOffset]) * MLAS_Half2Float(Filter[ChannelKernelOffset]);
+                ChannelKernelOffset += Channels;
+            }
+            *Output++ = MLAS_Float2Half(Accumulator);
+        }
+        if (PostProc) {
+            PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
+                              Channels);
+        }
+        Input += KernelSize;
+        OutputCount -= 1;
+    }
+}
+
+#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED
+
+
+void
+MLASCALL
+MlasConvDepthwise(
+    const MLAS_FP16* const* Input,
+    const MLAS_FP16* Filter,
+    MLAS_FP16* Output,
+    size_t Channels,
+    size_t OutputCount,
+    size_t KernelSize,
+    MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
+    )
+{
+    MlasConvDepthwiseKernel(
+        reinterpret_cast<const _mlas_fp16_* const*>(Input),
+        reinterpret_cast<const _mlas_fp16_*>(Filter),
+        reinterpret_cast<_mlas_fp16_*>(Output),
+        Channels,
+        OutputCount,
+        KernelSize,
+        PostProc);
+}
--- a/onnxruntime/core/mlas/lib/fp16_common.h
+++ b/onnxruntime/core/mlas/lib/fp16_common.h
@ -52,6 +52,10 @@ MLAS_FORCEINLINE
 MLAS_FLOAT16X8
 MlasZeroFloat16x8(void) { return vreinterpretq_f16_f32(vdupq_n_f32(0.0f)); }

+MLAS_FORCEINLINE
+MLAS_FLOAT16X4
+MlasZeroFloat16x4(void) { return vreinterpret_f16_f32(vdup_n_f32(0.0f)); }
+
 MLAS_FORCEINLINE
 MLAS_FLOAT16X8
 MlasLoadFloat16x8(const _mlas_fp16_* Buffer) { return vreinterpretq_f16_u16(vld1q_u16(Buffer)); }
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@ -441,6 +441,9 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Av
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv);
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv);
+#endif
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, SequenceLength);
@ -1474,6 +1477,9 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MaxUnpool)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 17, LpPool)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, Conv)>,
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+    BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, MLFloat16, Conv)>,
+#endif
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, ConvTranspose)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, 12, If)>,
    BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 11, SequenceLength)>,
--- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
+++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@ -0,0 +1,602 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+//
+// This file contains implementation of a fp16 convolution operator.
+//
+
+#include "core/mlas/inc/mlas.h"
+
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+
+#include "core/common/safeint.h"
+#include "core/framework/float16.h"
+#include "core/framework/op_kernel.h"
+#include "core/providers/cpu/nn/conv_attributes.h"
+
+#include "contrib_ops/cpu/fused_activation.h"
+
+namespace onnxruntime {
+
+using ConvPadVector = ConvAttributes::ConvPadVector;
+
+/**
+ * @brief Convolution Operator for FP16 tensors
+ *
+ * With two optional fused operations:
+ *
+ * 1. Add
+ * It takes an extra (optional) input Sum, a tensor same shape as the output.
+ * Sum is added to the output tensor.
+ *
+ * 2. Activation
+ * It takes an operator attribute 'activation', which supplies the activation info.
+ *
+ * Add is performed BEFORE activation.
+ *
+ * The implementation runs faster with NHWC. By default, it converts NCHW to NHWC
+ * before processing, and convert the result back. It can take NHWC tensors directly.
+ * Use operator attribute 'channels_last' to specify that the data layout is NHWC.
+ *
+*/
+class FusedConvFp16 final : public OpKernel {
+ public:
+  FusedConvFp16(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info) {
+    ORT_ENFORCE(GetFusedActivationAttr(info, activation_).IsOK());
+    channels_last_ = (info.GetAttrOrDefault<int64_t>("channels_last", static_cast<int64_t>(0)) != 0);
+  }
+
+  Status Compute(OpKernelContext* context) const override;
+
+  Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+      /*out*/ bool& is_packed, /*out*/ PrePackedWeights* prepacked_weights) override;
+
+  Status UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
+                                   int input_idx,
+                                   /*out*/ bool& used_shared_buffers) override;
+
+ private:
+
+  /**
+   * @brief Reorder filter data to facilitate compute.
+   *
+   *        Based on Conv operator spec, filters are organized as (M x C/group x kH x kW),
+   *        where C is the number of input channels, and kH and kW are the height and width
+   *        of the kernel, and M is the number of feature maps. We need to change it into
+   *        (kH x kW x C/group) x M, forming a matrix of M columns, where each kernel is a
+   *        single column in channel last format.
+   *
+   * @param input
+   * @param output
+   * @param output_channels  number of feature maps
+   * @param input_channels
+   * @param kernel_size      kH x kW
+  */
+  static void ReorderFilter(const MLFloat16* input,
+                            MLFloat16* output,
+                            size_t output_channels,
+                            size_t input_channels,
+                            size_t kernel_size) {
+    for (size_t k = 0; k < kernel_size; k++) {
+      for (size_t ic = 0; ic < input_channels; ic++) {
+        for (size_t oc = 0; oc < output_channels; oc++) {
+          size_t index = (oc * input_channels * kernel_size) + (ic * kernel_size) + k;
+          *output++ = input[index];
+        }
+      }
+    }
+  }
+
+  MLAS_ACTIVATION activation_;
+  ConvAttributes conv_attrs_;
+  bool channels_last_{false};
+  TensorShape W_shape_;
+  BufferUniquePtr packed_W_buffer_;
+  size_t packed_W_size_{0};
+  bool is_W_packed_{false};
+  BufferUniquePtr reordered_W_buffer_;
+};
+
+
+Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
+                              /*out*/ bool& is_packed,
+                              /*out*/ PrePackedWeights* prepacked_weights) {
+  is_packed = false;
+  if (input_idx != 1) {
+    // Only pack filter tensor (aka weights)
+    return Status::OK();
+  }
+
+  const auto& shape = tensor.Shape().GetDims();
+  size_t rank = shape.size();
+  if (rank <= 2) {
+    return Status::OK();
+  }
+
+  const int64_t M = shape[0];
+  const int64_t C = shape[1];
+
+  // Verify that the total number of output channels is a multiple of the group count.
+  if (M % conv_attrs_.group != 0) {
+    return Status::OK();
+  }
+
+  // Note: The tensor has already been allocated with this tensor shape, so all
+  // shape indices are guaranteed to fit inside size_t.
+  const size_t output_channels = static_cast<size_t>(M);
+  const size_t group_input_channels = static_cast<size_t>(C);
+  const size_t kernel_size =
+      static_cast<size_t>(std::accumulate(shape.data() + 2, shape.data() + rank, 1LL, std::multiplies<int64_t>()));
+
+  const auto* Wdata = static_cast<const MLFloat16*>(tensor.DataRaw());
+  W_shape_ = shape;
+
+  const size_t group_count = static_cast<size_t>(conv_attrs_.group);
+  const size_t group_output_channels = output_channels / group_count;
+  const size_t kernel_dim = group_input_channels * kernel_size;
+
+  bool share_prepacked_weights = (prepacked_weights != nullptr);
+
+  // Don't pack the filter buffer if the MlasConvDepthwise path is used.
+  if (!(group_input_channels == 1 && group_output_channels == 1)) {
+    packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false);
+    if (packed_W_size_ != 0) {
+      size_t packed_W_data_size = SafeInt<size_t>(group_count) * packed_W_size_;
+      auto* packed_W = static_cast<MLFloat16*>(alloc->Alloc(packed_W_data_size));
+
+      // Initialize memory to 0 as there could be some padding associated with pre-packed
+      // buffer memory and we don not want it uninitialized and generate different hashes
+      // if and when we try to cache this pre-packed buffer for sharing between sessions.
+      memset((void*)packed_W, 0, packed_W_data_size);
+
+      packed_W_buffer_ = BufferUniquePtr(packed_W, BufferDeleter(alloc));
+
+      // Allocate a temporary buffer to hold the reordered oihw->hwio filter for
+      // a single group.
+      //
+      // Note: The size of this buffer is less than or equal to the size of the original
+      // weight tensor, so the allocation size is guaranteed to fit inside size_t.
+      auto* group_reordered_W = static_cast<MLFloat16*>(
+          alloc->Alloc(group_output_channels * kernel_dim * sizeof(MLFloat16)));
+      BufferUniquePtr group_reordered_W_buffer(group_reordered_W, BufferDeleter(alloc));
+
+      const size_t W_offset = group_output_channels * kernel_dim;
+
+      for (int64_t group_id = 0; group_id < conv_attrs_.group; ++group_id) {
+        ReorderFilter(Wdata, group_reordered_W, group_output_channels, group_input_channels, kernel_size);
+        MlasHalfGemmPackB(group_output_channels, kernel_dim, group_reordered_W, group_output_channels, packed_W);
+        packed_W += packed_W_size_;
+        Wdata += W_offset;
+      }
+
+      if (share_prepacked_weights) {
+        prepacked_weights->buffers_.push_back(std::move(packed_W_buffer_));
+        prepacked_weights->buffer_sizes_.push_back(packed_W_data_size);
+      }
+
+      is_W_packed_ = true;
+      is_packed = true;
+      return Status::OK();
+    }
+  }
+
+  if (share_prepacked_weights) {
+    prepacked_weights->buffers_.push_back(nullptr);  // packed_W_buffer_ is nullptr
+    prepacked_weights->buffer_sizes_.push_back(0);
+  }
+
+  size_t reordered_w_data_size = SafeInt<size_t>(sizeof(MLFloat16)) * output_channels * kernel_dim;
+  auto* reordered_W = static_cast<MLFloat16*>(alloc->Alloc(reordered_w_data_size));
+
+  // Initialize memory to 0 as there could be some padding associated with pre-packed
+  // buffer memory and we don not want it uninitialized and generate different hashes
+  // if and when we try to cache this pre-packed buffer for sharing between sessions.
+  memset((void*)reordered_W, 0, reordered_w_data_size);
+
+  reordered_W_buffer_ = BufferUniquePtr(reordered_W, BufferDeleter(alloc));
+
+  ReorderFilter(Wdata, reordered_W, output_channels, group_input_channels, kernel_size);
+
+  if (share_prepacked_weights) {
+    prepacked_weights->buffers_.push_back(std::move(reordered_W_buffer_));
+    prepacked_weights->buffer_sizes_.push_back(reordered_w_data_size);
+  }
+
+  is_W_packed_ = true;
+  is_packed = true;
+  return Status::OK();
+}
+
+Status FusedConvFp16::UseSharedPrePackedBuffers(std::vector<BufferUniquePtr>& prepacked_buffers,
+                                                       int input_idx,
+                                                       /*out*/ bool& used_shared_buffers) {
+  if (input_idx != 1) {
+    // only the filter tensor is packed
+    return Status::OK();
+  }
+
+  used_shared_buffers = true;
+
+  if (prepacked_buffers.size() == 1) {  // This means that only packed_W_ exists
+    packed_W_buffer_ = std::move(prepacked_buffers[0]);
+  } else if (prepacked_buffers.size() == 2) {  // This means that only reordered_W_ exists
+    // Enforce that the first "placeholder" buffer is nullptr
+    ORT_ENFORCE(prepacked_buffers[0].get() == nullptr);
+    reordered_W_buffer_ = std::move(prepacked_buffers[1]);
+  }
+
+  return Status::OK();
+}
+
+
+Status FusedConvFp16::Compute(OpKernelContext* context) const {
+  size_t num_inputs = OpKernel::Node().InputDefs().size();
+  const Tensor* X = context->Input<Tensor>(0);
+  const Tensor* W = is_W_packed_? nullptr : context->Input<Tensor>(1);
+  const auto& W_shape = W ? W->Shape() : W_shape_;
+  const Tensor* B = num_inputs >= 3 ? context->Input<Tensor>(2) : nullptr;
+
+  // TODO!!
+  // This tensor should be added to the result before activation is applied
+  // We need to augment the post processor to accept an addition operation.
+  // const Tensor* Sum = num_inputs >= 4 ? context->Input<Tensor>(3) : nullptr;
+
+  const int64_t N = X->Shape()[0];
+  const int64_t M = W_shape[0];
+  ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W_shape, channels_last_));
+
+  TensorShapeVector kernel_shape;
+  ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W_shape, kernel_shape));
+  const size_t kernel_rank = kernel_shape.size();
+
+  ConvPadVector pads(conv_attrs_.pads);
+  if (pads.empty()) {
+    pads.resize(kernel_rank * 2, 0);
+  }
+  TensorShapeVector dilations(conv_attrs_.dilations);
+  if (dilations.empty()) {
+    dilations.resize(kernel_rank, 1);
+  }
+  TensorShapeVector strides(conv_attrs_.strides);
+  if (strides.empty()) {
+    strides.resize(kernel_rank, 1);
+  }
+
+  const int64_t C = X->Shape()[channels_last_ ? 1 + kernel_rank : 1];
+  const size_t spatial_dim_start = channels_last_ ? 1 : 2;
+  const size_t spatial_dim_end = spatial_dim_start + kernel_rank;
+
+  TensorShapeVector Y_dims({N});
+  if (!channels_last_) {
+    Y_dims.push_back(M);
+  }
+  TensorShape input_shape = X->Shape().Slice(spatial_dim_start, spatial_dim_end);
+  ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+  if (channels_last_) {
+    Y_dims.push_back(M);
+  }
+  Tensor* Y = context->Output(0, TensorShape(Y_dims));
+  TensorShape output_shape = Y->Shape().Slice(spatial_dim_start, spatial_dim_end);
+
+  // Bail out early if one of the dimensions is zero.
+  if (Y->Shape().Size() == 0) {
+    return Status::OK();
+  }
+
+  const int64_t input_image_size = input_shape.Size();
+  const int64_t output_image_size = output_shape.Size();
+  const int64_t kernel_size = TensorShape(kernel_shape).Size();
+
+  AllocatorPtr alloc;
+  ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
+
+  // Handle the case of a dynamic weight filter.
+  BufferUniquePtr reordered_W_buffer;
+  MLFloat16* reordered_W = nullptr;
+  if (!packed_W_buffer_) {
+    if (reordered_W_buffer_) {
+      // Weight was constant and reordered.
+      reordered_W = static_cast<MLFloat16*>(reordered_W_buffer_.get());
+    } else {
+      // Weight tensor was not constant or prepacking is disabled.
+      reordered_W = static_cast<MLFloat16*>(alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * W_shape.Size()));
+      reordered_W_buffer = BufferUniquePtr(reordered_W, BufferDeleter(alloc));
+      ReorderFilter(
+          static_cast<const MLFloat16*>(W->DataRaw()),
+          reordered_W,
+          static_cast<size_t>(M),
+          static_cast<size_t>(W_shape[1]),
+          static_cast<size_t>(kernel_size));
+    }
+  }
+
+  int64_t group_count = conv_attrs_.group;
+  int64_t group_input_channels = W_shape[1];
+  int64_t group_output_channels = M / group_count;
+
+  // Test for depthwise convolution.
+  const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
+  if (is_depthwise_conv) {
+    // Update the input and output channels to the number of groups in order to
+    // reuse as much of the below standard convolution path.
+    group_input_channels = group_count;
+    group_output_channels = group_count;
+    group_count = 1;
+  }
+
+  const int64_t X_offset = C * input_image_size;
+  const int64_t Y_offset = M * output_image_size;
+  const int64_t kernel_dim = group_input_channels * kernel_size;
+  const int64_t col_buffer_size = kernel_dim * output_image_size;
+
+  const auto* Xdata = X->Data<MLFloat16>();
+  const auto* Bdata = B != nullptr ? B->Data<MLFloat16>() : nullptr;
+  auto* Ydata = Y->MutableData<MLFloat16>();
+
+  BufferUniquePtr transpose_input_buffer;
+  BufferUniquePtr transpose_output_buffer;
+
+  // Allocate temporary buffers for transposing to channels last format.
+  if (!channels_last_) {
+    auto* transpose_input = alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * X_offset + MLAS_SYMM_QGEMM_BUF_OVERRUN);
+    transpose_input_buffer = BufferUniquePtr(transpose_input, BufferDeleter(alloc));
+    auto* transpose_output = alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * Y_offset);
+    transpose_output_buffer = BufferUniquePtr(transpose_output, BufferDeleter(alloc));
+  }
+
+  BufferUniquePtr col_buffer;
+  BufferUniquePtr indirection_buffer;
+  size_t ind_buf_length = 0;
+  std::vector<MLFloat16> padding_data;
+
+  bool use_indirection_buffer = false;
+  if (is_depthwise_conv) {
+    use_indirection_buffer = true;
+  } else if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
+//    if (is_symmetric_conv_) {
+//      use_indirection_buffer = true;
+//    } else {
+      // Pointwise convolutions can use the original input tensor in place,
+      // otherwise a temporary buffer is required for the im2col transform.
+      int64_t group_col_buffer_size = (kernel_rank > 2) ? group_count * col_buffer_size : col_buffer_size;
+      group_col_buffer_size += MLAS_SYMM_QGEMM_BUF_OVERRUN;
+      auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(MLFloat16)) * group_col_buffer_size);
+      col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
+      memset(col_data, 0, SafeInt<size_t>(sizeof(MLFloat16)) * group_col_buffer_size);
+//    }
+  }
+
+//  bool parallel_batch = is_symmetric_conv_ && channels_last_;
+
+  if (use_indirection_buffer) {
+    // Allocate indirection buffer pointers and prepare a padding vector for
+    // the im2col transform.
+    ind_buf_length = SafeInt<size_t>(sizeof(const MLFloat16*)) * kernel_size * output_image_size;
+//    if (parallel_batch)
+//      ind_buf_length *= SafeInt<size_t>(N);  // ind buffer per each image in the batch
+    auto* indirection_data = alloc->Alloc(ind_buf_length);
+    indirection_buffer = BufferUniquePtr(indirection_data, BufferDeleter(alloc));
+    padding_data.resize(static_cast<size_t>(C), MLFloat16());
+  }
+
+
+  concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool();
+
+  /*************************************
+   * Thread partition idea: we are essentially partition a GEMM A[M,K] x B[K,N].
+   * Here B contains the conv filters, which are usually not big, so we assume
+   * it can be in cache entirely. Then we simply partition A horizontally into
+   * thin slices along M dimension. This would ensure that the slice of A fits
+   * into the cache and reduce the chance of kernel waiting for memory.
+   *
+   * The thickness of A slice should be multiple of kernel stride M. Since
+   * we have to choose from many different kernels, the logic of finding
+   * the stride M is hacky.
+   */
+
+  // The following convoluted branches must match the kernel selection logic
+  // in conv_worker.
+
+  const int32_t stride_m = 6;
+  const int64_t task_count = (output_image_size + stride_m - 1) / stride_m;
+
+  for (int64_t image_id = 0; image_id < N; ++image_id) {
+    const auto* input_data = Xdata;
+    auto* output_data = Ydata;
+
+    if (!channels_last_) {
+      // Transpose the input from channels first (CHW) to channels last (HWC).
+      MlasTranspose(
+          Xdata,
+          static_cast<MLFloat16*>(transpose_input_buffer.get()),
+          static_cast<size_t>(C),
+          static_cast<size_t>(input_image_size));
+      input_data = static_cast<MLFloat16*>(transpose_input_buffer.get());
+      output_data = static_cast<MLFloat16*>(transpose_output_buffer.get());
+    }
+
+    // Threaded implementation of ND convolution is not yet supported, so
+    // prepare all im2col transformations here.
+    if (col_buffer && kernel_rank > 2) {
+      for (int64_t group_id = 0; group_id < group_count; ++group_id) {
+        math::Im2col<MLFloat16, StorageOrder::NHWC>()(
+            input_data + group_id * group_input_channels,
+            group_input_channels,
+            C,
+            input_shape.GetDims().data(),
+            output_shape.GetDims().data(),
+            kernel_shape.data(),
+            strides.data(),
+            dilations.data(),
+            pads.data(),
+            static_cast<int64_t>(kernel_rank),
+            static_cast<MLFloat16*>(col_buffer.get()) + group_id * col_buffer_size,
+            MLFloat16());
+      }
+    }
+
+    auto conv_worker = [&](ptrdiff_t batch) {
+      int64_t output_start = (int64_t)batch * (int64_t)stride_m;
+      int64_t output_count = std::min((int64_t)stride_m, output_image_size - output_start);
+
+      MLFloat16 const** worker_indirection_buffer = nullptr;
+      if (indirection_buffer) {
+        worker_indirection_buffer = static_cast<MLFloat16 const**>(indirection_buffer.get()) + output_start * kernel_size;
+        math::Im2col<MLFloat16, StorageOrder::NHWC>()(
+            input_data,
+            C,
+            input_shape.GetDims().data(),
+            output_shape.GetDims().data(),
+            kernel_shape.data(),
+            strides.data(),
+            dilations.data(),
+            pads.data(),
+            static_cast<ptrdiff_t>(kernel_rank),
+            output_start,
+            output_count,
+            worker_indirection_buffer,
+            padding_data.data());
+      }
+
+      auto* worker_output = output_data + output_start * M;
+
+      if (is_depthwise_conv) {
+        // TODO!! add Sum tensor to activation
+        MLAS_HALF_GEMM_ACTIVATION_PROCESSOR act(activation_);
+        MlasConvDepthwise(
+            worker_indirection_buffer,
+            reordered_W,
+            worker_output,
+            static_cast<size_t>(M),
+            static_cast<size_t>(output_count),
+            static_cast<size_t>(kernel_size),
+            &act);
+      } else {
+        for (int64_t group_id = 0; group_id < group_count; ++group_id) {
+          // Prepare the im2col transformation or use the input buffer directly for
+          // pointwise convolutions.
+          const auto* group_input_data = input_data + group_id * group_input_channels;
+          const MLFloat16* AData;
+          size_t lda;
+          if (col_buffer) {
+            auto* worker_col_buffer = static_cast<MLFloat16*>(col_buffer.get()) + output_start * kernel_dim;
+            if (kernel_rank == 2) {
+              math::Im2col<MLFloat16, StorageOrder::NHWC>()(
+                  group_input_data,
+                  group_input_channels,
+                  C,
+                  input_shape[0],
+                  input_shape[1],
+                  kernel_shape[0],
+                  kernel_shape[1],
+                  dilations[0],
+                  dilations[1],
+                  pads[0],
+                  pads[1],
+                  strides[0],
+                  strides[1],
+                  output_shape[1],
+                  output_start,
+                  output_count,
+                  worker_col_buffer,
+                  MLFloat16());
+            } else if (kernel_rank == 1) {
+              math::Im2col<MLFloat16, StorageOrder::NHWC>()(
+                  group_input_data,
+                  group_input_channels,
+                  C,
+                  1,
+                  input_shape[0],
+                  1,
+                  kernel_shape[0],
+                  1,
+                  dilations[0],
+                  0,
+                  pads[0],
+                  1,
+                  strides[0],
+                  output_shape[0],
+                  output_start,
+                  output_count,
+                  worker_col_buffer,
+                  MLFloat16());
+            } else {
+              // Use the im2col buffer prepared outside the thread, indexed by group.
+              worker_col_buffer += group_id * col_buffer_size;
+            }
+            AData = reinterpret_cast<const MLFloat16*>(worker_col_buffer);
+            lda = static_cast<size_t>(kernel_dim);
+          } else {
+            AData = reinterpret_cast<const MLFloat16*>(group_input_data + output_start * C);
+            lda = static_cast<size_t>(C);
+          }
+
+          // TODO!! add Sum tensor to activation
+          MLAS_HALF_GEMM_ACTIVATION_PROCESSOR act(activation_);
+          MLAS_HALF_GEMM_DATA_PARAMS gemm_params;
+          gemm_params.A = AData;
+          gemm_params.lda = lda;
+          if (packed_W_buffer_) {
+            gemm_params.B = static_cast<const MLFloat16*>(packed_W_buffer_.get()) + group_id * packed_W_size_,
+            gemm_params.ldb = 0;
+          } else {
+            gemm_params.B = reordered_W + group_id * group_output_channels,
+            gemm_params.ldb = static_cast<size_t>(M);
+          }
+          gemm_params.C = worker_output + group_id * group_output_channels;
+          gemm_params.ldc = static_cast<size_t>(M);
+          gemm_params.Bias = Bdata;
+          gemm_params.OutputProcessor = &act; // process fused activation and add
+
+          MlasHalfGemmBatch(
+              static_cast<size_t>(output_count),
+              static_cast<size_t>(group_output_channels),
+              static_cast<size_t>(kernel_dim),
+              1, &gemm_params, thread_pool);
+        }
+      }
+
+    };
+
+    concurrency::ThreadPool::TrySimpleParallelFor(thread_pool, onnxruntime::narrow<ptrdiff_t>(task_count), conv_worker);
+
+    if (!channels_last_) {
+      // Transpose the output from channels last (NHWC) to channels first (NCHW).
+      MlasTranspose(
+          output_data,
+          Ydata,
+          static_cast<size_t>(output_image_size),
+          static_cast<size_t>(M));
+    }
+
+    Xdata += X_offset;
+    Ydata += Y_offset;
+  }
+
+  return Status::OK();
+}
+
+
+ONNX_CPU_OPERATOR_TYPED_KERNEL(
+    Conv,
+    11,
+    MLFloat16,
+    KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
+    FusedConvFp16);
+
+
+#ifndef DISABLE_CONTRIB_OPS
+namespace contrib {
+    ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
+        FusedConv,
+        1,
+        MLFloat16,
+        KernelDefBuilder()
+            .TypeConstraint("T", DataTypeImpl::GetTensorType<MLFloat16>()),
+        FusedConvFp16);
+}  // namespace contrib
+#endif
+
+}  // namespace onnxruntime
+
+#endif  // MLAS_F16VEC_INTRINSICS_SUPPORTED
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@ -17,6 +17,7 @@

 #include "core/util/math_cpuonly.h"
 #include "core/util/math.h"
+#include "core/framework/float16.h"

 #include <algorithm>
 #include "core/common/narrow.h"
@ -654,6 +655,7 @@ void Im2col<T, StorageOrder::NHWC>::operator()(

 template struct Im2col<int8_t, StorageOrder::NHWC>;
 template struct Im2col<uint8_t, StorageOrder::NHWC>;
+template struct Im2col<MLFloat16, StorageOrder::NHWC>;

 template <>
 void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64_t channels, int64_t height,
--- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc