mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-29 23:06:41 +00:00
have Im2ColNd support all types and allow customized padding value. (#273)
* Have Im2ColNd support all types and allow a customized padding value.
* Only specialize the template for storage order NCHW.
* Fix build break.
* Fix build break.
This commit is contained in:
parent
058803086d
commit
75934af896
4 changed files with 114 additions and 108 deletions
|
|
@ -20,15 +20,15 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
|
|||
|
||||
if (kernel_shape.size() + 2 != W->Shape().NumDimensions()) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape num_dims is not compatible with W num_dims.",
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < kernel_shape.size(); ++i) {
|
||||
if (kernel_shape[i] != W->Shape()[i + 2]) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape is not compatible with W shape.",
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -111,7 +111,7 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
|
|||
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
for (int group_id = 0; group_id < group_; ++group_id) {
|
||||
math::Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>(
|
||||
math::Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>()(
|
||||
Xdata + group_id * X_offset,
|
||||
image_shape.GetDims().data(),
|
||||
col_buffer_shape.data(),
|
||||
|
|
|
|||
|
|
@ -57,15 +57,15 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
|
||||
if (kernel_shape.size() + 2 != W->Shape().NumDimensions()) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape num_dims is not compatible with W num_dims.",
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < kernel_shape.size(); ++i) {
|
||||
if (kernel_shape[i] != W->Shape()[i + 2]) {
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "kernel_shape is not compatible with W shape.",
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
" kernel_shape: ", TensorShape(kernel_shape).ToString().c_str(),
|
||||
" W: ", W->Shape().ToString().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -135,7 +135,7 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
col_buffer_data,
|
||||
&CPUMathUtil::Instance());
|
||||
} else {
|
||||
math::Im2colNd<T, CPUMathUtil, StorageOrder::NCHW>(
|
||||
math::Im2colNd<T, CPUMathUtil, StorageOrder::NCHW>()(
|
||||
Xdata + group_id * X_offset,
|
||||
image_shape.GetDims().data(),
|
||||
col_buffer_shape.data(),
|
||||
|
|
|
|||
|
|
@ -327,20 +327,100 @@ void Axpby(
|
|||
Provider* provider);
|
||||
|
||||
// Im2colNd primary template, parameterized on the element type T, a Provider
// class and an integer storage-order constant.
// NOTE(review): this span is taken from a rendered diff. The free-function
// declaration and the functor struct below declare the same name and look like
// the before/after forms of one change -- confirm against the real header.
template <typename T, class Provider, int order>
|
||||
// Function-template form: declaration only, no body in this span.
// accumulate_output defaults to false.
void Im2colNd(
|
||||
const T* data_img,
|
||||
const int64_t* im_shape,
|
||||
const int64_t* col_shape,
|
||||
const int64_t img_size,
|
||||
const int64_t col_size,
|
||||
const int64_t* kernel_shape,
|
||||
const int64_t* stride,
|
||||
const int64_t* dilation,
|
||||
const int64_t* pad,
|
||||
const int64_t N,
|
||||
T* data_col,
|
||||
Provider* provider,
|
||||
bool accumulate_output = false);
|
||||
// Functor form: callers construct a temporary and invoke it, e.g.
// Im2colNd<T, Provider, order>()(...). operator() is only declared here, not
// defined. Compared with the function form it takes one extra trailing
// argument, padding_value, which defaults to 0.
struct Im2colNd {
|
||||
void operator()(
|
||||
const T* data_img,
|
||||
const int64_t* im_shape,
|
||||
const int64_t* col_shape,
|
||||
const int64_t img_size,
|
||||
const int64_t col_size,
|
||||
const int64_t* kernel_shape,
|
||||
const int64_t* stride,
|
||||
const int64_t* dilation,
|
||||
const int64_t* pad,
|
||||
const int64_t N,
|
||||
T* data_col,
|
||||
Provider* /*provider*/,
|
||||
bool accumulate_output = false,
|
||||
T padding_value = 0);
|
||||
};
|
||||
|
||||
// Partial specialization of Im2colNd for StorageOrder::NCHW, for any element
// type T and any Provider.
//
// accumulate_output == false (im2col): every column element is written; it is
//   copied from data_img, or set to padding_value when the sampled position
//   lies outside the image.
// accumulate_output == true (col2im): the buffer roles are swapped. data_img is
//   read at the column index and added into data_col at the image index;
//   positions that fall in the padding are skipped.
//
// Parameters, as used by the body:
//   data_img          source buffer.
//   im_shape          entries [1..N] are the image spatial extents; entry 0 is
//                     not read.
//   col_shape         entry 0 is the number of column channels; entries [1..N]
//                     are the output spatial extents.
//   img_size/col_size unused.
//   kernel_shape, stride, dilation, pad
//                     per-axis arrays; only entries [0..N-1] are read.
//   N                 number of spatial axes.
//   data_col          destination buffer.
//   provider          unused.
//   accumulate_output selects the col2im accumulation path.
//   padding_value     value stored for padded positions on the im2col path.
template <typename T, class Provider>
|
||||
struct Im2colNd<T, Provider, StorageOrder::NCHW> {
|
||||
void operator()(
|
||||
const T* data_img,
|
||||
const int64_t* im_shape,
|
||||
const int64_t* col_shape,
|
||||
const int64_t /*img_size*/,
|
||||
const int64_t /*col_size*/,
|
||||
const int64_t* kernel_shape,
|
||||
const int64_t* stride,
|
||||
const int64_t* dilation,
|
||||
const int64_t* pad,
|
||||
const int64_t N,
|
||||
T* data_col,
|
||||
Provider* /*provider*/,
|
||||
bool accumulate_output = false,
|
||||
T padding_value = 0) {
|
||||
// Product of all kernel extents; used below to map a column channel back to
// its image channel (c_col / kernel_size).
int64_t kernel_size = 1;
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
kernel_size *= kernel_shape[i];
|
||||
}
|
||||
const int64_t channels_col = col_shape[0];
|
||||
// d_offset[i]: kernel tap position along axis i for the current column channel.
std::vector<int64_t> d_offset(N, 0);
|
||||
// d_iter[i]: current output position along axis i. The counter at the bottom
// of the inner loop resets it to all zeros after a full sweep, so the same
// vector is reused for every column channel.
std::vector<int64_t> d_iter(N, 0);
|
||||
for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
|
||||
// Loop over spatial axes in reverse order to compute a per-axis offset.
|
||||
int64_t offset = c_col;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
if (d_i < N - 1) {
|
||||
offset /= kernel_shape[d_i + 1];
|
||||
}
|
||||
d_offset[d_i] = offset % kernel_shape[d_i];
|
||||
}
|
||||
// One iteration per output position; ends when the counter below wraps.
for (bool incremented = true; incremented;) {
|
||||
// Loop over spatial axes in forward order to compute the indices in the
|
||||
// image and column, and whether the index lies in the padding.
|
||||
int64_t index_col = c_col;
|
||||
// Image channel that this column channel samples from.
int64_t index_im = c_col / kernel_size;
|
||||
bool is_padding = false;
|
||||
for (int64_t d_i = 0; d_i < N; ++d_i) {
|
||||
const int64_t d = d_iter[d_i];
|
||||
// Image coordinate along this axis for output position d and the current
// kernel tap.
const int64_t d_im =
|
||||
d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
|
||||
is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
|
||||
index_col *= col_shape[d_i + 1];
|
||||
index_col += d;
|
||||
index_im *= im_shape[d_i + 1];
|
||||
index_im += d_im;
|
||||
}
|
||||
if (!accumulate_output) {
|
||||
if (is_padding) {
|
||||
data_col[index_col] = padding_value;
|
||||
} else {
|
||||
data_col[index_col] = data_img[index_im];
|
||||
}
|
||||
} else if (!is_padding) { // col2im
|
||||
// Buffer roles are swapped on this path: read at the column index,
// accumulate at the image index.
data_col[index_im] += data_img[index_col];
|
||||
}
|
||||
// Loop over spatial axes in reverse order to choose an index,
|
||||
// like counting.
|
||||
incremented = false;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
const int64_t d_max = col_shape[d_i + 1];
|
||||
// NOTE(review): this check runs only after the element above was already
// accessed, so an output extent of 0 would be caught too late -- confirm
// callers never pass empty extents.
ORT_ENFORCE(d_iter[d_i] < d_max);
|
||||
if (d_iter[d_i] == d_max - 1) {
|
||||
d_iter[d_i] = 0;
|
||||
} else { // d_iter[d_i] < d_max - 1
|
||||
++d_iter[d_i];
|
||||
incremented = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // while(incremented) {
|
||||
} // for (int c = 0; c < channels_col; ++c) {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Provider, int order>
|
||||
void Col2imNd(
|
||||
|
|
|
|||
|
|
@ -475,15 +475,15 @@ void GemmBatched<float, CPUMathUtil>(
|
|||
}
|
||||
}
|
||||
|
||||
// MKL will be implemented as an execution provider
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// MKL VML alternatives.
|
||||
// Depending on whether we are using MKL, we will delegate the Caffe math
|
||||
// functions that are VML-related to either the VML call or the Eigen
|
||||
// implementation. If you are setting the flags (such as AVX) right for your CPU
|
||||
// architecture, usually Eigen will deliver a throughput as fast as the VML
|
||||
// functions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// MKL will be implemented as an execution provider
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// MKL VML alternatives.
|
||||
// Depending on whether we are using MKL, we will delegate the Caffe math
|
||||
// functions that are VML-related to either the VML call or the Eigen
|
||||
// implementation. If you are setting the flags (such as AVX) right for your CPU
|
||||
// architecture, usually Eigen will deliver a throughput as fast as the VML
|
||||
// functions.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \
|
||||
template <> \
|
||||
|
|
@ -859,80 +859,6 @@ void Select<float, CPUMathUtil>(
|
|||
y[i] = x[i * D + idx[i]];
|
||||
}
|
||||
}
|
||||
// Ported from caffe 1.
// Full specialization of the Im2colNd function template for float,
// CPUMathUtil and StorageOrder::NCHW.
// NOTE(review): in this rendered diff the block appears to be the pre-change
// implementation that the commit removes from this file -- confirm.
//
// accumulate_output == false (im2col): every column element is written; it is
//   copied from data_img, or set to 0 when the sampled position lies outside
//   the image.
// accumulate_output == true (col2im): data_img is read at the column index and
//   added into data_col at the image index; padded positions are skipped.
//
// im_shape[1..N] and col_shape[1..N] are the spatial extents, col_shape[0] is
// the number of column channels, and kernel_shape/stride/dilation/pad are read
// at entries [0..N-1]. img_size, col_size and the CPUMathUtil pointer are
// unused.
|
||||
template <>
|
||||
void Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>(
|
||||
const float* data_img,
|
||||
const int64_t* im_shape,
|
||||
const int64_t* col_shape,
|
||||
const int64_t /* img_size*/,
|
||||
const int64_t /* col_size*/,
|
||||
const int64_t* kernel_shape,
|
||||
const int64_t* stride,
|
||||
const int64_t* dilation,
|
||||
const int64_t* pad,
|
||||
const int64_t N,
|
||||
float* data_col,
|
||||
CPUMathUtil* /* context */,
|
||||
bool accumulate_output) {
|
||||
// Product of all kernel extents; used below to map a column channel back to
// its image channel (c_col / kernel_size).
int64_t kernel_size = 1;
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
kernel_size *= kernel_shape[i];
|
||||
}
|
||||
const int64_t channels_col = col_shape[0];
|
||||
// d_offset[i]: kernel tap position along axis i for the current column channel.
std::vector<int64_t> d_offset(N, 0);
|
||||
// d_iter[i]: current output position along axis i; reset to all zeros by the
// counter at the bottom of the inner loop after each full sweep.
std::vector<int64_t> d_iter(N, 0);
|
||||
for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
|
||||
// Loop over spatial axes in reverse order to compute a per-axis offset.
|
||||
int64_t offset = c_col;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
if (d_i < N - 1) {
|
||||
offset /= kernel_shape[d_i + 1];
|
||||
}
|
||||
d_offset[d_i] = offset % kernel_shape[d_i];
|
||||
}
|
||||
// One iteration per output position; ends when the counter below wraps.
for (bool incremented = true; incremented;) {
|
||||
// Loop over spatial axes in forward order to compute the indices in the
|
||||
// image and column, and whether the index lies in the padding.
|
||||
int64_t index_col = c_col;
|
||||
// Image channel that this column channel samples from.
int64_t index_im = c_col / kernel_size;
|
||||
bool is_padding = false;
|
||||
for (int64_t d_i = 0; d_i < N; ++d_i) {
|
||||
const int64_t d = d_iter[d_i];
|
||||
// Image coordinate along this axis for output position d and the current
// kernel tap.
const int64_t d_im =
|
||||
d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
|
||||
is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
|
||||
index_col *= col_shape[d_i + 1];
|
||||
index_col += d;
|
||||
index_im *= im_shape[d_i + 1];
|
||||
index_im += d_im;
|
||||
}
|
||||
if (!accumulate_output) {
|
||||
if (is_padding) {
|
||||
data_col[index_col] = 0;
|
||||
} else {
|
||||
data_col[index_col] = data_img[index_im];
|
||||
}
|
||||
} else if (!is_padding) { // col2im
|
||||
// Buffer roles are swapped on this path: read at the column index,
// accumulate at the image index.
data_col[index_im] += data_img[index_col];
|
||||
}
|
||||
// Loop over spatial axes in reverse order to choose an index,
|
||||
// like counting.
|
||||
incremented = false;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
const int64_t d_max = col_shape[d_i + 1];
|
||||
// NOTE(review): this check runs only after the element above was already
// accessed -- confirm callers never pass an output extent of 0.
ORT_ENFORCE(d_iter[d_i] < d_max);
|
||||
if (d_iter[d_i] == d_max - 1) {
|
||||
d_iter[d_i] = 0;
|
||||
} else { // d_iter[d_i] < d_max - 1
|
||||
++d_iter[d_i];
|
||||
incremented = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // while(incremented) {
|
||||
} // for (int c = 0; c < channels_col; ++c) {
|
||||
}
|
||||
|
||||
template <>
|
||||
void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(
|
||||
|
|
@ -949,7 +875,7 @@ void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(
|
|||
float* data_img,
|
||||
CPUMathUtil* context) {
|
||||
Set<float, CPUMathUtil>(img_size, 0, data_img, context);
|
||||
Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>(
|
||||
Im2colNd<float, CPUMathUtil, StorageOrder::NCHW>()(
|
||||
data_col,
|
||||
img_shape,
|
||||
col_shape,
|
||||
|
|
|
|||
Loading…
Reference in a new issue