Implement Im2colNd NHWC and related qlinearconv logic for u8s8. (#5612)

Implement Im2colNd NHWC and related qlinearconv logic for u8s8, and training.
This commit is contained in:
Zhang Lei 2020-10-30 15:28:30 -07:00 committed by GitHub
parent d7f3baed18
commit 17bce6f07e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 377 additions and 196 deletions

View file

@ -71,7 +71,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
const size_t kernel_rank = kernel_shape.size();
BufferUniquePtr col_buffer;
std::vector<int64_t> col_buffer_shape;
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
@ -81,13 +80,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(T)) * col_buffer_size);
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
if (kernel_rank != 2) {
const auto& output_dims = output_shape.GetDims();
col_buffer_shape.reserve(1 + output_dims.size());
col_buffer_shape.push_back(kernel_dim);
col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
}
}
T* col_buffer_data = static_cast<T*>(col_buffer.get());
@ -120,10 +112,9 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
} else {
math::Im2colNd<T, StorageOrder::NCHW>()(
Xdata + group_id * X_offset,
X->Shape().GetDims().data() + 1,
col_buffer_shape.data(),
C * input_image_size,
col_buffer_size,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
kernel_shape.data(),
strides.data(),
dilations.data(),
@ -251,19 +242,13 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc));
auto* col_buffer_data = static_cast<float*>(col_buffer.get());
TensorShape image_shape = X->Shape().Slice(1);
std::vector<int64_t> col_buffer_shape{kernel_dim};
col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(),
output_shape.GetDims().end());
for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
math::Im2colNd<float, StorageOrder::NCHW>()(
Xdata + group_id * X_offset,
image_shape.GetDims().data(),
col_buffer_shape.data(),
C * input_image_size,
col_buffer_size,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
kernel_shape.data(),
strides.data(),
dilations.data(),

View file

@ -92,7 +92,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
const size_t kernel_rank = kernel_shape.size();
BufferUniquePtr col_buffer;
std::vector<int64_t> col_buffer_shape;
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
@ -102,13 +101,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
if (kernel_rank != 2) {
const auto& output_dims = output_shape.GetDims();
col_buffer_shape.reserve(1 + output_dims.size());
col_buffer_shape.push_back(kernel_dim);
col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
}
}
auto* col_buffer_data = static_cast<uint8_t*>(col_buffer.get());
@ -143,10 +135,9 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
} else {
math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
Xdata,
X->Shape().GetDims().data() + 1,
col_buffer_shape.data(),
C * input_image_size,
col_buffer_size,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
kernel_shape.data(),
strides.data(),
dilations.data(),

View file

@ -74,9 +74,6 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
const T* filter_data = p.F->template Data<T>();
T* Ydata = p.Y->template MutableData<T>();
std::vector<int64_t> col_buffer_shape{kernel_dim};
col_buffer_shape.insert(col_buffer_shape.end(), p.input_shape.GetDims().begin(), p.input_shape.GetDims().end());
if (p.X->Shape().NumDimensions() == 4) {
for (auto image_id = 0; image_id < p.N; ++image_id) {
for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
@ -124,8 +121,7 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
Ydata += Y_offset * conv_transpose_attrs_.group;
}
} else {
TensorShape output_shape = p.Y->Shape().Slice(1);
output_shape[0] = output_shape[0] / conv_transpose_attrs_.group;
TensorShape output_shape = p.Y->Shape().Slice(2);
for (auto image_id = 0; image_id < p.N; ++image_id) {
for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
@ -147,9 +143,9 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
math::Col2imNd<T, CPUMathUtil, StorageOrder::NCHW>(
col_buffer_data,
output_shape.GetDims().data(),
col_buffer_shape.data(),
output_shape.Size(),
col_buffer_size,
p.input_shape.GetDims().data(),
kernel_dim,
Y_offset,
p.kernel_shape.data(),
p.strides.data(),
p.dilations.data(),

View file

@ -75,7 +75,6 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
const Tensor* B = context->Input<Tensor>(8);
const int64_t N = X->Shape()[0];
const int64_t C = X->Shape()[1];
const int64_t M = W->Shape()[0];
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
@ -125,20 +124,12 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
BufferUniquePtr col_buffer;
std::vector<int64_t> col_buffer_shape;
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
if (kernel_rank != 2) {
const auto& output_dims = output_shape.GetDims();
col_buffer_shape.reserve(1 + output_dims.size());
col_buffer_shape.push_back(kernel_dim);
col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
}
}
auto* col_buffer_data = static_cast<uint8_t*>(col_buffer.get());
@ -187,10 +178,9 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
} else {
math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
Xdata,
X->Shape().GetDims().data() + 1,
col_buffer_shape.data(),
C * input_image_size,
col_buffer_size,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
kernel_shape.data(),
strides.data(),
dilations.data(),
@ -450,7 +440,6 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W_shape, kernel_shape));
const size_t kernel_rank = kernel_shape.size();
ORT_ENFORCE(kernel_rank == 2, "QLinearConv : must be 2D convolution");
std::vector<int64_t> pads(conv_attrs_.pads);
if (pads.empty()) {
@ -544,9 +533,10 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
auto* transpose_output = static_cast<uint8_t*>(alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * Y_offset));
BufferUniquePtr transpose_output_buffer(transpose_output, BufferDeleter(alloc));
BufferUniquePtr col_buffer;
// Pointwise convolutions can use the original input tensor in place,
// otherwise a temporary buffer is required for the im2col transform.
BufferUniquePtr col_buffer;
if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
@ -582,6 +572,23 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
static_cast<size_t>(group_input_channels),
static_cast<size_t>(input_image_size));
if (kernel_rank != 2 && col_buffer_data != nullptr) {
// Run one full Im2colNd pass over the whole image in this case; parallelize it later if needed
math::Im2colNd<uint8_t, StorageOrder::NHWC>()(
transpose_input,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
kernel_shape.data(),
strides.data(),
dilations.data(),
pads.data(),
static_cast<int>(kernel_rank),
col_buffer_data,
false,
X_zero_point_value);
}
auto conv_worker = [&](ptrdiff_t batch) {
auto work = concurrency::ThreadPool::PartitionWork(batch, thread_count, static_cast<ptrdiff_t>(output_image_size));
int64_t output_start = static_cast<int64_t>(work.start);
@ -592,24 +599,26 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
uint8_t* worker_gemm_input;
if (col_buffer_data != nullptr) {
worker_gemm_input = col_buffer_data + output_start * kernel_dim;
math::Im2col<uint8_t, StorageOrder::NHWC>()(
transpose_input,
group_input_channels,
input_shape[0],
input_shape[1],
kernel_shape[0],
kernel_shape[1],
dilations[0],
dilations[1],
pads[0],
pads[1],
strides[0],
strides[1],
output_shape[1],
output_start,
output_count,
worker_gemm_input,
X_zero_point_value);
if (kernel_rank == 2) {
math::Im2col<uint8_t, StorageOrder::NHWC>()(
transpose_input,
group_input_channels,
input_shape[0],
input_shape[1],
kernel_shape[0],
kernel_shape[1],
dilations[0],
dilations[1],
pads[0],
pads[1],
strides[0],
strides[1],
output_shape[1],
output_start,
output_count,
worker_gemm_input,
X_zero_point_value);
}
} else {
worker_gemm_input = transpose_input + output_start * kernel_dim;
}

View file

@ -254,9 +254,8 @@ struct Im2colNd {
void operator()(
const T* data_img,
const int64_t* im_shape,
const int64_t* col_shape,
int64_t img_size,
int64_t col_size,
const int64_t* output_shape,
int64_t channels_col,
const int64_t* kernel_shape,
const int64_t* stride,
const int64_t* dilation,
@ -267,78 +266,13 @@ struct Im2colNd {
T padding_value = 0);
};
// Im2col / col2im for channel-first images with N spatial axes, driven by an
// explicit column-buffer shape.
//
//   data_img          source buffer (the image for im2col; the column buffer
//                     when accumulate_output is true)
//   im_shape          image extents; entry 0 is skipped here and the spatial
//                     extents are read from im_shape[1..N]
//   col_shape         col_shape[0] is the number of column rows, col_shape[1..N]
//                     are the per-axis output extents
//   img_size/col_size unused by this implementation (names are commented out)
//   kernel_shape, stride, dilation, pad
//                     per-axis values, indexed 0..N-1 (only the first N pad
//                     entries are read)
//   N                 number of spatial axes
//   data_col          destination buffer (the image when accumulate_output is
//                     true)
//   accumulate_output false: gather image values into columns (im2col);
//                     true: add column values back into the image (col2im)
//   padding_value     value written for positions that fall in the padding
template <typename T>
struct Im2colNd<T, StorageOrder::NCHW> {
void operator()(const T* data_img, const int64_t* im_shape, const int64_t* col_shape, int64_t /*img_size*/,
int64_t /*col_size*/, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
T padding_value = 0) {
// Number of kernel taps: the product of the N kernel extents.
int64_t kernel_size = 1;
for (int64_t i = 0; i < N; ++i) {
kernel_size *= kernel_shape[i];
}
int64_t channels_col = col_shape[0];
// d_offset: kernel tap position for the current column row.
// d_iter: current output position; it is back at all zeros whenever the inner loop finishes.
std::vector<int64_t> d_offset(N, 0);
std::vector<int64_t> d_iter(N, 0);
for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
// Loop over spatial axes in reverse order to compute a per-axis offset.
int64_t offset = c_col;
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
if (d_i < N - 1) {
offset /= kernel_shape[d_i + 1];
}
d_offset[d_i] = offset % kernel_shape[d_i];
}
for (bool incremented = true; incremented;) {
// Loop over spatial axes in forward order to compute the indices in the
// image and column, and whether the index lies in the padding.
int64_t index_col = c_col;
// c_col / kernel_size is the input channel this column row belongs to.
int64_t index_im = c_col / kernel_size;
bool is_padding = false;
for (int64_t d_i = 0; d_i < N; ++d_i) {
int64_t d = d_iter[d_i];
int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
index_col *= col_shape[d_i + 1];
index_col += d;
index_im *= im_shape[d_i + 1];
index_im += d_im;
}
if (!accumulate_output) {
if (is_padding) {
data_col[index_col] = padding_value;
} else {
data_col[index_col] = data_img[index_im];
}
} else if (!is_padding) { // col2im
// The buffer roles are swapped here: data_col is the image being accumulated into.
data_col[index_im] += data_img[index_col];
}
// Loop over spatial axes in reverse order to choose an index,
// like counting.
incremented = false;
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
int64_t d_max = col_shape[d_i + 1];
ORT_ENFORCE(d_iter[d_i] < d_max);
if (d_iter[d_i] == d_max - 1) {
d_iter[d_i] = 0;
} else { // d_iter[d_i] < d_max - 1
++d_iter[d_i];
incremented = true;
break;
}
}
} // while(incremented) {
} // for (int c = 0; c < channels_col; ++c) {
}
};
template <typename T, class Provider, int order>
void Col2imNd(
const T* data_col,
const int64_t* img_shape,
const int64_t* col_shape,
const int64_t* output_shape,
int64_t channels_col,
int64_t img_size,
int64_t col_size,
const int64_t* kernel_shape,
const int64_t* stride,
const int64_t* dilation,

View file

@ -210,7 +210,6 @@ template void Gemv<double, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, int M, int
SPECIALIZED_AXPY(float)
#undef SPECIALIZED_AXPY
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \
template <> \
void Funcname<T, CPUMathUtil>(int N, const T* x, T* y, CPUMathUtil*) { \
@ -420,6 +419,130 @@ void Im2col<T, StorageOrder::NHWC>::operator()(const T* data_im, int64_t channel
template struct Im2col<uint8_t, StorageOrder::NHWC>;
// Advances the N-entry multi-index `dims` to its successor, treating it like
// an odometer whose last axis varies fastest and whose per-axis limits are
// given by `shape`.
//
// Returns true when a successor exists. When the final position is passed,
// every entry of `dims` has been wrapped back to zero and false is returned.
// Each visited entry must already be strictly below its limit (enforced).
static inline bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) {
  for (int64_t d_i = N; d_i-- > 0;) {
    const int64_t d_max = shape[d_i];
    ORT_ENFORCE(dims[d_i] < d_max);
    if (dims[d_i] != d_max - 1) {
      // Room left on this axis: step it and stop carrying.
      ++dims[d_i];
      return true;
    }
    // Axis exhausted: wrap it and carry into the next slower axis.
    dims[d_i] = 0;
  }
  return false;
}
// Im2col / col2im for channel-first (NCHW) images with N spatial axes.
//
//   data_img          source buffer (the image for im2col; the column buffer
//                     when accumulate_output is true)
//   im_shape          the N spatial extents of the image
//   output_shape      the N spatial extents of the convolution output
//   channels_col      number of rows in the column buffer; row r belongs to
//                     input channel r / (product of kernel_shape)
//   kernel_shape, stride, dilation, pad
//                     per-axis values, indexed 0..N-1 (only the first N pad
//                     entries are read)
//   N                 number of spatial axes
//   data_col          destination buffer (the image when accumulate_output is
//                     true)
//   accumulate_output false: gather image values into columns (im2col);
//                     true: add column values back into the image (col2im)
//   padding_value     value stored for positions that fall in the padding
template <typename T>
struct Im2colNd<T, StorageOrder::NCHW> {
  void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col,
                  const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
                  const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
                  T padding_value = 0) {
    const int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies<int64_t>());

    std::vector<int64_t> tap_pos(N, 0);  // kernel tap selected by the current column row
    std::vector<int64_t> out_pos(N, 0);  // output position; all zeros again after each full sweep

    for (int64_t row = 0; row < channels_col; ++row) {
      // Peel the kernel tap coordinates off the row index, last axis first.
      int64_t remaining = row;
      for (int64_t axis = N - 1; axis >= 0; --axis) {
        tap_pos[axis] = remaining % kernel_shape[axis];
        remaining /= kernel_shape[axis];
      }

      do {
        // Fold the per-axis coordinates into flat offsets for both buffers and
        // note whether any axis lands outside the image.
        int64_t col_offset = row;
        int64_t img_offset = row / kernel_size;
        bool in_padding = false;
        for (int64_t axis = 0; axis < N; ++axis) {
          const int64_t out_coord = out_pos[axis];
          const int64_t img_coord = out_coord * stride[axis] - pad[axis] + tap_pos[axis] * dilation[axis];
          if (!is_a_ge_zero_and_a_lt_b(img_coord, im_shape[axis])) {
            in_padding = true;
          }
          col_offset = col_offset * output_shape[axis] + out_coord;
          img_offset = img_offset * im_shape[axis] + img_coord;
        }

        if (accumulate_output) {
          // col2im: the buffer roles are swapped, so data_col is the image.
          if (!in_padding) {
            data_col[img_offset] += data_img[col_offset];
          }
        } else if (in_padding) {
          data_col[col_offset] = padding_value;
        } else {
          data_col[col_offset] = data_img[img_offset];
        }
      } while (NextPosition(N, output_shape, out_pos.data()));
    }
  }
};
template struct Im2colNd<float, StorageOrder::NCHW>;
template struct Im2colNd<uint8_t, StorageOrder::NCHW>;
// Im2col / col2im for channel-last (NHWC) images with N spatial axes.
//
// The column buffer holds one row of channels_col values per output position.
// Within a row, each kernel tap contributes a contiguous run of
// input_channels values, where input_channels = channels_col / kernel size.
//
//   data_img          source buffer (the image for im2col; the column buffer
//                     when accumulate_output is true)
//   im_shape          the N spatial extents of the image (no batch/channel)
//   output_shape      the N spatial extents of the convolution output
//   channels_col      length of one column row; must be an exact multiple of
//                     the kernel size (enforced)
//   kernel_shape, stride, dilation, pad
//                     per-axis values, indexed 0..N-1 (only the first N pad
//                     entries are read)
//   N                 number of spatial axes
//   data_col          destination buffer (the image when accumulate_output is
//                     true)
//   accumulate_output false: gather image values into columns (im2col);
//                     true: add column values back into the image (col2im)
//   padding_value     value stored for taps that fall in the padding
template <typename T>
struct Im2colNd<T, StorageOrder::NHWC> {
  void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col,
                  const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
                  const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
                  T padding_value = 0) {
    int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies<int64_t>());
    int64_t input_channels = channels_col / kernel_size;
    ORT_ENFORCE(input_channels * kernel_size == channels_col, "Dimensions not match!");

    std::vector<int64_t> out_pos(N, 0);  // position in the output image
    std::vector<int64_t> tap_pos(N, 0);  // position inside the kernel window

    // Outer sweep: one column row per output position.
    for (int64_t row_start = 0;; row_start += channels_col) {
      // Inner sweep: one run of input_channels values per kernel tap.
      for (int64_t tap_start = 0;; tap_start += input_channels) {
        // Locate the image pixel this tap reads and check it against the bounds.
        int64_t pixel = 0;
        bool in_padding = false;
        for (int64_t axis = 0; axis < N; ++axis) {
          const int64_t img_coord = out_pos[axis] * stride[axis] - pad[axis] + tap_pos[axis] * dilation[axis];
          if (!is_a_ge_zero_and_a_lt_b(img_coord, im_shape[axis])) {
            in_padding = true;
          }
          pixel = pixel * im_shape[axis] + img_coord;
        }
        const int64_t img_offset = pixel * input_channels;
        const int64_t col_offset = row_start + tap_start;

        if (accumulate_output) {
          // col2im: the buffer roles are swapped, so data_col is the image.
          if (!in_padding) {
            const T* src = data_img + col_offset;
            T* dst = data_col + img_offset;
            for (int64_t c = 0; c < input_channels; ++c) {
              dst[c] += src[c];
            }
          }
        } else if (in_padding) {
          std::fill_n(data_col + col_offset, input_channels, padding_value);
        } else {
          std::copy_n(data_img + img_offset, input_channels, data_col + col_offset);
        }

        if (!NextPosition(N, kernel_shape, tap_pos.data())) {
          break;
        }
      }
      if (!NextPosition(N, output_shape, out_pos.data())) {
        break;
      }
    }
  }
};
template struct Im2colNd<uint8_t, StorageOrder::NHWC>;
template <>
void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64_t channels, int64_t height,
int64_t width, int64_t kernel_h, int64_t kernel_w,
@ -558,7 +681,7 @@ void Col2im<float, CPUMathUtil, StorageOrder::NHWC>(const float* data_col, int64
template <>
void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, const int64_t* img_shape,
const int64_t* col_shape, int64_t img_size, int64_t col_size,
const int64_t* output_shape, int64_t channels_col, int64_t img_size,
const int64_t* kernel_shape, const int64_t* stride,
const int64_t* dilation, const int64_t* pad, int64_t N,
float* data_img, CPUMathUtil* context) {
@ -566,9 +689,8 @@ void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, con
Im2colNd<float, StorageOrder::NCHW>()(
data_col,
img_shape,
col_shape,
img_size,
col_size,
output_shape,
channels_col,
kernel_shape,
stride,
dilation,

View file

@ -1,6 +1,8 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <algorithm>
#include "core/util/math.h"
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
#include "core/mlas/inc/mlas.h"
@ -334,6 +336,23 @@ class QLinearConvOpTester {
return static_cast<T>(RoundHalfToEven(f) + requantize_values.zero_point_);
}
// Steps the N-entry index `dims` to the next position within `shape`,
// counting with the last axis varying fastest. Returns false, leaving every
// entry wrapped back to zero, once the last position has been passed.
static bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) {
  for (int64_t d_i = N; d_i-- > 0;) {
    const int64_t d_max = shape[d_i];
    ORT_ENFORCE(dims[d_i] < d_max);
    if (dims[d_i] != d_max - 1) {
      ++dims[d_i];
      return true;
    }
    // This axis is at its last value: reset it and carry to the previous axis.
    dims[d_i] = 0;
  }
  return false;
}
void ComputeExpectedOutput(std::vector<T1>& Y_data, std::vector<int64_t>& Y_shape) {
ORT_ENFORCE(W_.shape_.size() > 2);
ORT_ENFORCE(X_.shape_.size() == W_.shape_.size());
@ -377,20 +396,10 @@ class QLinearConvOpTester {
const int64_t* output_shape = Y_shape.data() + 2;
Y_data.resize(ShapeSize(Y_shape));
const int64_t input_h = input_shape[0];
const int64_t input_w = input_shape[1];
const int64_t input_image_size = input_h * input_w;
const int64_t kernel_h = kernel_shape[0];
const int64_t kernel_w = kernel_shape[1];
const int64_t kernel_size = kernel_h * kernel_w;
const int64_t output_h = output_shape[0];
const int64_t output_w = output_shape[1];
const int64_t pad_t = pads[0];
const int64_t pad_l = pads[1];
const int64_t dilation_h = dilations[0];
const int64_t dilation_w = dilations[1];
const int64_t stride_h = strides[0];
const int64_t stride_w = strides[1];
const int64_t input_image_size = std::accumulate(
input_shape, input_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
const int64_t kernel_size = std::accumulate(
kernel_shape, kernel_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
const int32_t X_zero_point = X_.zero_point_;
const T1* Xdata = X_.data_.data();
@ -409,29 +418,34 @@ class QLinearConvOpTester {
float weight_scale = W_.scale_[(W_.scale_.size() == 1) ? 0 : channel_index];
float requantize_scale = (X_.scale_[0] * weight_scale) / output_scale_;
for (int64_t oh = 0; oh < output_h; oh++) {
for (int64_t ow = 0; ow < output_w; ow++) {
int32_t sum = bias;
const T1* input_image = Xdata;
const T2* weight_data = weight_row;
for (int64_t ic = 0; ic < group_input_channels; ic++) {
for (int64_t kh = 0; kh < kernel_h; kh++) {
int64_t ih = kh * dilation_h + oh * stride_h - pad_t;
for (int64_t kw = 0; kw < kernel_w; kw++) {
int64_t iw = kw * dilation_w + ow * stride_w - pad_l;
int32_t w_value = static_cast<int32_t>(*weight_data++);
if (static_cast<uint64_t>(ih) < static_cast<uint64_t>(input_h) &&
static_cast<uint64_t>(iw) < static_cast<uint64_t>(input_w)) {
int32_t x_value = static_cast<int32_t>(input_image[ih * input_w + iw]) - X_zero_point;
sum += x_value * w_value;
}
}
std::vector<int64_t> d_output(kernel_rank, 0);
std::vector<int64_t> d_kernel(kernel_rank, 0);
do {
int32_t sum = bias;
const T1* input_image = Xdata;
const T2* weight_data = weight_row;
for (int64_t ic = 0; ic < group_input_channels; ic++) {
do {
int64_t input_offset = 0;
bool is_padding = false;
for (size_t axis = 0; axis < kernel_rank; ++axis) {
int64_t input_dim = d_kernel[axis] * dilations[axis] + d_output[axis] * strides[axis] - pads[axis];
is_padding |= !math::is_a_ge_zero_and_a_lt_b(input_dim, input_shape[axis]);
input_offset *= input_shape[axis];
input_offset += input_dim;
}
input_image += input_image_size;
}
*Ydata++ = RequantizeOutput<T1>(sum, requantize_scale, requantize_values);
int32_t w_value = static_cast<int32_t>(*weight_data++);
if (!is_padding) {
int32_t x_value = static_cast<int32_t>(input_image[input_offset]) - X_zero_point;
sum += x_value * w_value;
}
} while (NextPosition(kernel_rank, kernel_shape, d_kernel.data()));
input_image += input_image_size;
}
}
*Ydata++ = RequantizeOutput<T1>(sum, requantize_scale, requantize_values);
} while (NextPosition(kernel_rank, output_shape, d_output.data()));
weight_row += group_input_channels * kernel_size;
}
@ -538,6 +552,16 @@ class QLinearConvOpTester {
}
};
// uint8_t input / int8_t weights on a rank-3 input (batch, channels and one
// spatial axis): input {3, 24, 15}, weights {32, 24, 3}, random bias and
// pads {1, 1}.
TEST(QLinearConvTest, Conv1D_U8S8) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({3, 24, 15}, .05f, 4);
test.GenerateRandomWeights({32, 24, 3}, .125f, 0);
test.GenerateRandomBias();
test.SetPads({1, 1});
test.SetOutputScaleAndZeroPoint(.55f, 54);
test.Run();
}
TEST(QLinearConvTest, Conv2D_U8S8) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4);
@ -548,6 +572,52 @@ TEST(QLinearConvTest, Conv2D_U8S8) {
test.Run();
}
// uint8_t input / int8_t weights on a rank-5 input (three spatial axes):
// input {2, 2, 15, 11, 6}, weights {5, 2, 3, 3, 3}, random bias and a pad of
// 1 in all six pad entries.
TEST(QLinearConvTest, Conv3D_U8S8) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4);
test.GenerateRandomWeights({5, 2, 3, 3, 3}, .125f, 0);
test.GenerateRandomBias();
test.SetPads({1, 1, 1, 1, 1, 1});
test.SetOutputScaleAndZeroPoint(.55f, 54);
test.Run();
}
// Rank-3 input with a kernel whose only spatial extent is 1
// (weights {32, 24, 1}); no pads, strides or dilations are set here.
// NOTE(review): presumably this takes the pointwise path that skips the
// im2col buffer -- confirm against the tester's default attributes.
TEST(QLinearConvTest, Conv1D_U8S8_Pointwise) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({3, 24, 15}, .05f, 4);
test.GenerateRandomWeights({32, 24, 1}, .125f, 0);
test.GenerateRandomBias();
test.SetOutputScaleAndZeroPoint(.55f, 54);
test.Run();
}
// Rank-4 input with a 1x1 kernel (weights {32, 24, 1, 1}); no pads, strides
// or dilations are set here.
// NOTE(review): presumably this takes the pointwise path that skips the
// im2col buffer -- confirm against the tester's default attributes.
TEST(QLinearConvTest, Conv2D_U8S8_Pointwise) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4);
test.GenerateRandomWeights({32, 24, 1, 1}, .125f, 0);
test.GenerateRandomBias();
test.SetOutputScaleAndZeroPoint(.55f, 54);
test.Run();
}
// Rank-5 input with a 1x1x1 kernel (weights {5, 2, 1, 1, 1}); no pads,
// strides or dilations are set here.
// NOTE(review): presumably this takes the pointwise path that skips the
// im2col buffer -- confirm against the tester's default attributes.
TEST(QLinearConvTest, Conv3D_U8S8_Pointwise) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4);
test.GenerateRandomWeights({5, 2, 1, 1, 1}, .125f, 0);
test.GenerateRandomBias();
test.SetOutputScaleAndZeroPoint(.55f, 54);
test.Run();
}
// Rank-3 input {1, 4, 19} with weights {6, 4, 3} and dilations {2}.
// No bias is generated and no pads are set in this case.
TEST(QLinearConvTest, Conv1D_U8S8_Dilations) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 4, 19}, .02f, 20);
test.GenerateRandomWeights({6, 4, 3}, .11f, 0);
test.SetDilations({2});
test.SetOutputScaleAndZeroPoint(.24f, 15);
test.Run();
}
TEST(QLinearConvTest, Conv2D_U8S8_Dilations) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 4, 19, 16}, .02f, 20);
@ -557,6 +627,24 @@ TEST(QLinearConvTest, Conv2D_U8S8_Dilations) {
test.Run();
}
// Rank-5 input {1, 2, 19, 16, 8} with a non-cubic kernel
// (weights {6, 2, 3, 2, 2}) and dilations {2, 2, 2}.
// No bias is generated and no pads are set in this case.
TEST(QLinearConvTest, Conv3D_U8S8_Dilations) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 2, 19, 16, 8}, .02f, 20);
test.GenerateRandomWeights({6, 2, 3, 2, 2}, .11f, 0);
test.SetDilations({2, 2, 2});
test.SetOutputScaleAndZeroPoint(.24f, 15);
test.Run();
}
// Rank-3 input {1, 7, 18} with weights {5, 7, 2} and strides {2}.
// No bias is generated and no pads are set in this case.
TEST(QLinearConvTest, Conv1D_U8S8_Strides) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 7, 18}, .04f, 16);
test.GenerateRandomWeights({5, 7, 2}, .14f, 0);
test.SetStrides({2});
test.SetOutputScaleAndZeroPoint(.31f, 30);
test.Run();
}
TEST(QLinearConvTest, Conv2D_U8S8_Strides) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 7, 18, 24}, .04f, 16);
@ -566,6 +654,26 @@ TEST(QLinearConvTest, Conv2D_U8S8_Strides) {
test.Run();
}
// Rank-5 input {1, 3, 18, 24, 18} with a non-cubic kernel
// (weights {2, 3, 2, 3, 2}) and strides {2, 2, 2}.
// No bias is generated and no pads are set in this case.
TEST(QLinearConvTest, Conv3D_U8S8_Strides) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 3, 18, 24, 18}, .04f, 16);
test.GenerateRandomWeights({2, 3, 2, 3, 2}, .14f, 0);
test.SetStrides({2, 2, 2});
test.SetOutputScaleAndZeroPoint(.31f, 30);
test.Run();
}
// Grouped convolution on a rank-3 input: input {1, 8, 13} has 8 channels,
// weights {12, 4, 3} carry 4 channels per filter, and the group count is 2.
// Also uses a random bias and pads {1, 1}.
TEST(QLinearConvTest, Conv1D_U8S8_Groups) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 8, 13}, .03f, 7);
test.GenerateRandomWeights({12, 4, 3}, .10f, 0);
test.GenerateRandomBias();
test.SetPads({1, 1});
test.SetGroups(2);
test.SetOutputScaleAndZeroPoint(.76f, 88);
test.Run();
}
TEST(QLinearConvTest, Conv2D_U8S8_Groups) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7);
@ -577,6 +685,17 @@ TEST(QLinearConvTest, Conv2D_U8S8_Groups) {
test.Run();
}
// Grouped convolution on a rank-5 input: input {1, 4, 13, 17, 13} has
// 4 channels, weights {6, 2, 3, 3, 3} carry 2 channels per filter, and the
// group count is 2. Also uses a random bias and a pad of 1 in all six pad
// entries.
TEST(QLinearConvTest, Conv3D_U8S8_Groups) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 4, 13, 17, 13}, .03f, 7);
test.GenerateRandomWeights({6, 2, 3, 3, 3}, .10f, 0);
test.GenerateRandomBias();
test.SetPads({1, 1, 1, 1, 1, 1});
test.SetGroups(2);
test.SetOutputScaleAndZeroPoint(.76f, 88);
test.Run();
}
TEST(QLinearConvTest, Conv2D_U8S8_Groups_PerChannel) {
QLinearConvOpTester<uint8_t, int8_t> test;
test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7);

View file

@ -703,6 +703,21 @@ TEST(GradientCheckerTest, ConvGrad) {
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
}
//conv3d
{
TensorShape x_shape({2, 1, 5, 5, 5});
TensorShape w_shape({1, 1, 3, 3, 3});
TensorShape b_shape({1});
TensorShape y_shape({2, 1, 5, 5, 5});
gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
{MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1})},
// TODO: ConvGrad does not handle the case where W does not have a gradient.
// The check for not has_gradient needs to be disabled for this test to pass.
false);
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
}
//conv_with_strides
{
TensorShape x_shape({2, 1, 7, 5});
@ -718,6 +733,22 @@ TEST(GradientCheckerTest, ConvGrad) {
false);
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
}
//conv3d_with_strides
{
TensorShape x_shape({2, 1, 7, 5, 5});
TensorShape w_shape({1, 1, 3, 3, 3});
TensorShape b_shape({1});
TensorShape y_shape({2, 1, 4, 3, 3});
gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
{MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}),
MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
// TODO: ConvGrad does not handle the case where W does not have a gradient.
// The check for not has_gradient needs to be disabled for this test to pass.
false);
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
}
}
static void TestConcatOpGrad(const std::string& op_type,

View file

@ -98,11 +98,6 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
&CPUMathUtil::Instance());
}
TensorShape image_shape = X->Shape().Slice(1);
std::vector<int64_t> col_buffer_shape{kernel_dim};
col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(),
output_shape.GetDims().end());
for (int image_id = 0; image_id < N; ++image_id) {
for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
if (Is2DKernel) {
@ -125,10 +120,9 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
} else {
math::Im2colNd<T, StorageOrder::NCHW>()(
Xdata + group_id * X_offset,
image_shape.GetDims().data(),
col_buffer_shape.data(),
C * input_image_size,
col_buffer_size,
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
kernel_shape.data(),
strides.data(),
dilations.data(),
@ -208,10 +202,10 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
} else {
math::Col2imNd<T, CPUMathUtil, StorageOrder::NCHW>(
col_buffer_data,
image_shape.GetDims().data(),
col_buffer_shape.data(),
input_shape.GetDims().data(),
output_shape.GetDims().data(),
kernel_dim,
C * input_image_size,
col_buffer_size,
kernel_shape.data(),
strides.data(),
dilations.data(),