have Im2ColNd support all types and allow customized padding value. (#273)

* have Im2ColNd support all types and allow customized padding value. * only specialize the template in order NCHW. * fix build break. * fix build break
2026-07-17 18:40:28 +00:00 · 2019-01-03 19:24:06 -08:00 · 2019-01-03 19:24:06 -08:00 · 75934af896
commit 75934af896
parent 058803086d
4 changed files with 114 additions and 108 deletions
--- a/onnxruntime/core/providers/cpu/nn/conv.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv.cc
@ -20,15 +20,15 @@ Status Conv<float>::Compute(OpKernelContext* context) const {

  if (kernel_shape.size() + 2 != W->Shape().NumDimensions()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape num_dims is not compatible with W num_dims.",
-                                   " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
-                                   " W: ", W->Shape().ToString().c_str());
+                           " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
+                           " W: ", W->Shape().ToString().c_str());
  }

  for (size_t i = 0; i < kernel_shape.size(); ++i) {
    if (kernel_shape[i] != W->Shape()[i + 2]) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape is not compatible with W shape.",
-                                     " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
-                                     " W: ", W->Shape().ToString().c_str());
+                             " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
+                             " W: ", W->Shape().ToString().c_str());
    }
  }

@ -111,7 +111,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {

    for (int image_id = 0; image_id < N; ++image_id) {
      for (int group_id = 0; group_id < group_; ++group_id) {
-        math::Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>(
+        math::Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>()(
            Xdata + group_id * X_offset,
            image_shape.GetDims().data(),
            col_buffer_shape.data(),
--- a/onnxruntime/core/providers/cpu/nn/conv_impl.h
+++ b/onnxruntime/core/providers/cpu/nn/conv_impl.h
@ -57,15 +57,15 @@ Status Conv<T>::Compute(OpKernelContext* context) const {

  if (kernel_shape.size() + 2 != W->Shape().NumDimensions()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape num_dims is not compatible with W num_dims.",
-                                   " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
-                                   " W: ", W->Shape().ToString().c_str());
+                           " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
+                           " W: ", W->Shape().ToString().c_str());
  }

  for (size_t i = 0; i < kernel_shape.size(); ++i) {
    if (kernel_shape[i] != W->Shape()[i + 2]) {
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape is not compatible with W shape.",
-                                     " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
-                                     " W: ", W->Shape().ToString().c_str());
+                             " kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
+                             " W: ", W->Shape().ToString().c_str());
    }
  }

@ -135,7 +135,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
            col_buffer_data,
            &CPUMathUtil::Instance());
      } else {
-        math::Im2colNd<T, CPUMathUtil, StorageOrder::NCHW>(
+        math::Im2colNd<T, CPUMathUtil, StorageOrder::NCHW>()(
            Xdata + group_id * X_offset,
            image_shape.GetDims().data(),
            col_buffer_shape.data(),
--- a/onnxruntime/core/util/math.h
+++ b/onnxruntime/core/util/math.h
@ -327,20 +327,100 @@ void Axpby(
    Provider* provider);

 template <typename T, class Provider, int order>
-void Im2colNd(
-    const T* data_img,
-    const int64_t* im_shape,
-    const int64_t* col_shape,
-    const int64_t img_size,
-    const int64_t col_size,
-    const int64_t* kernel_shape,
-    const int64_t* stride,
-    const int64_t* dilation,
-    const int64_t* pad,
-    const int64_t N,
-    T* data_col,
-    Provider* provider,
-    bool accumulate_output = false);
+struct Im2colNd {
+  void operator()(
+      const T* data_img,
+      const int64_t* im_shape,
+      const int64_t* col_shape,
+      const int64_t img_size,
+      const int64_t col_size,
+      const int64_t* kernel_shape,
+      const int64_t* stride,
+      const int64_t* dilation,
+      const int64_t* pad,
+      const int64_t N,
+      T* data_col,
+      Provider* /*provider*/,
+      bool accumulate_output = false,
+      T padding_value = 0);
+};
+
+template <typename T, class Provider>
+struct Im2colNd<T, Provider, StorageOrder::NCHW> {
+  void operator()(
+      const T* data_img,
+      const int64_t* im_shape,
+      const int64_t* col_shape,
+      const int64_t /*img_size*/,
+      const int64_t /*col_size*/,
+      const int64_t* kernel_shape,
+      const int64_t* stride,
+      const int64_t* dilation,
+      const int64_t* pad,
+      const int64_t N,
+      T* data_col,
+      Provider* /*provider*/,
+      bool accumulate_output = false,
+      T padding_value = 0) {
+    int64_t kernel_size = 1;
+    for (int64_t i = 0; i < N; ++i) {
+      kernel_size *= kernel_shape[i];
+    }
+    const int64_t channels_col = col_shape[0];
+    std::vector<int64_t> d_offset(N, 0);
+    std::vector<int64_t> d_iter(N, 0);
+    for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
+      // Loop over spatial axes in reverse order to compute a per-axis offset.
+      int64_t offset = c_col;
+      for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
+        if (d_i < N - 1) {
+          offset /= kernel_shape[d_i + 1];
+        }
+        d_offset[d_i] = offset % kernel_shape[d_i];
+      }
+      for (bool incremented = true; incremented;) {
+        // Loop over spatial axes in forward order to compute the indices in the
+        // image and column, and whether the index lies in the padding.
+        int64_t index_col = c_col;
+        int64_t index_im = c_col / kernel_size;
+        bool is_padding = false;
+        for (int64_t d_i = 0; d_i < N; ++d_i) {
+          const int64_t d = d_iter[d_i];
+          const int64_t d_im =
+              d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
+          is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
+          index_col *= col_shape[d_i + 1];
+          index_col += d;
+          index_im *= im_shape[d_i + 1];
+          index_im += d_im;
+        }
+        if (!accumulate_output) {
+          if (is_padding) {
+            data_col[index_col] = padding_value;
+          } else {
+            data_col[index_col] = data_img[index_im];
+          }
+        } else if (!is_padding) {  // col2im
+          data_col[index_im] += data_img[index_col];
+        }
+        // Loop over spatial axes in reverse order to choose an index,
+        // like counting.
+        incremented = false;
+        for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
+          const int64_t d_max = col_shape[d_i + 1];
+          ORT_ENFORCE(d_iter[d_i] < d_max);
+          if (d_iter[d_i] == d_max - 1) {
+            d_iter[d_i] = 0;
+          } else {  // d_iter[d_i] < d_max - 1
+            ++d_iter[d_i];
+            incremented = true;
+            break;
+          }
+        }
+      }  // while(incremented) {
+    }    // for (int c = 0; c < channels_col; ++c) {
+  }
+};

 template <typename T, class Provider, int order>
 void Col2imNd(
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@ -475,15 +475,15 @@ void GemmBatched<float, CPUMathUtil>(
  }
 }

-// MKL will be implmenet as an execution provider
-////////////////////////////////////////////////////////////////////////////////
-// MKL VML alternatives.
-// Depending on whether we are using MKL, we will delegate the Caffe math
-// functions that are VML-related to either the VML call or the Eigen
-// implementation. If you are setting the flags (such as AVX) right for your CPU
-// architecture, usually Eigen will deliver a throughput as fast as the VML
-// functions.
-////////////////////////////////////////////////////////////////////////////////
+  // MKL will be implmenet as an execution provider
+  ////////////////////////////////////////////////////////////////////////////////
+  // MKL VML alternatives.
+  // Depending on whether we are using MKL, we will delegate the Caffe math
+  // functions that are VML-related to either the VML call or the Eigen
+  // implementation. If you are setting the flags (such as AVX) right for your CPU
+  // architecture, usually Eigen will deliver a throughput as fast as the VML
+  // functions.
+  ////////////////////////////////////////////////////////////////////////////////

 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr)                      \
  template <>                                                                  \
@ -859,80 +859,6 @@ void Select<float, CPUMathUtil>(
    y[i] = x[i * D + idx[i]];
  }
 }
-// Ported from caffe 1.
-template <>
-void Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>(
-    const float* data_img,
-    const int64_t* im_shape,
-    const int64_t* col_shape,
-    const int64_t /* img_size*/,
-    const int64_t /* col_size*/,
-    const int64_t* kernel_shape,
-    const int64_t* stride,
-    const int64_t* dilation,
-    const int64_t* pad,
-    const int64_t N,
-    float* data_col,
-    CPUMathUtil* /* context */,
-    bool accumulate_output) {
-  int64_t kernel_size = 1;
-  for (int64_t i = 0; i < N; ++i) {
-    kernel_size *= kernel_shape[i];
-  }
-  const int64_t channels_col = col_shape[0];
-  std::vector<int64_t> d_offset(N, 0);
-  std::vector<int64_t> d_iter(N, 0);
-  for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
-    // Loop over spatial axes in reverse order to compute a per-axis offset.
-    int64_t offset = c_col;
-    for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
-      if (d_i < N - 1) {
-        offset /= kernel_shape[d_i + 1];
-      }
-      d_offset[d_i] = offset % kernel_shape[d_i];
-    }
-    for (bool incremented = true; incremented;) {
-      // Loop over spatial axes in forward order to compute the indices in the
-      // image and column, and whether the index lies in the padding.
-      int64_t index_col = c_col;
-      int64_t index_im = c_col / kernel_size;
-      bool is_padding = false;
-      for (int64_t d_i = 0; d_i < N; ++d_i) {
-        const int64_t d = d_iter[d_i];
-        const int64_t d_im =
-            d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
-        is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
-        index_col *= col_shape[d_i + 1];
-        index_col += d;
-        index_im *= im_shape[d_i + 1];
-        index_im += d_im;
-      }
-      if (!accumulate_output) {
-        if (is_padding) {
-          data_col[index_col] = 0;
-        } else {
-          data_col[index_col] = data_img[index_im];
-        }
-      } else if (!is_padding) {  // col2im
-        data_col[index_im] += data_img[index_col];
-      }
-      // Loop over spatial axes in reverse order to choose an index,
-      // like counting.
-      incremented = false;
-      for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
-        const int64_t d_max = col_shape[d_i + 1];
-        ORT_ENFORCE(d_iter[d_i] < d_max);
-        if (d_iter[d_i] == d_max - 1) {
-          d_iter[d_i] = 0;
-        } else {  // d_iter[d_i] < d_max - 1
-          ++d_iter[d_i];
-          incremented = true;
-          break;
-        }
-      }
-    }  // while(incremented) {
-  }    // for (int c = 0; c < channels_col; ++c) {
-}

 template <>
 void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(
@ -949,7 +875,7 @@ void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(
    float* data_img,
    CPUMathUtil* context) {
  Set<float, CPUMathUtil>(img_size, 0, data_img, context);
-  Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>(
+  Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>()(
      data_col,
      img_shape,
      col_shape,