mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-04 23:59:56 +00:00
Implement Im2colNd NHWC and related qlinearconv logic for u8s8. (#5612)
Implement Im2colNd NHWC and related qlinearconv logic for u8s8, and training.
This commit is contained in:
parent
d7f3baed18
commit
17bce6f07e
9 changed files with 377 additions and 196 deletions
|
|
@ -71,7 +71,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
const size_t kernel_rank = kernel_shape.size();
|
||||
|
||||
BufferUniquePtr col_buffer;
|
||||
std::vector<int64_t> col_buffer_shape;
|
||||
|
||||
// Pointwise convolutions can use the original input tensor in place,
|
||||
// otherwise a temporary buffer is required for the im2col transform.
|
||||
|
|
@ -81,13 +80,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
|
||||
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(T)) * col_buffer_size);
|
||||
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
|
||||
|
||||
if (kernel_rank != 2) {
|
||||
const auto& output_dims = output_shape.GetDims();
|
||||
col_buffer_shape.reserve(1 + output_dims.size());
|
||||
col_buffer_shape.push_back(kernel_dim);
|
||||
col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
|
||||
}
|
||||
}
|
||||
|
||||
T* col_buffer_data = static_cast<T*>(col_buffer.get());
|
||||
|
|
@ -120,10 +112,9 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
|
|||
} else {
|
||||
math::Im2colNd<T, StorageOrder::NCHW>()(
|
||||
Xdata + group_id * X_offset,
|
||||
X->Shape().GetDims().data() + 1,
|
||||
col_buffer_shape.data(),
|
||||
C * input_image_size,
|
||||
col_buffer_size,
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
|
|
@ -251,19 +242,13 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
|
|||
BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc));
|
||||
auto* col_buffer_data = static_cast<float*>(col_buffer.get());
|
||||
|
||||
TensorShape image_shape = X->Shape().Slice(1);
|
||||
std::vector<int64_t> col_buffer_shape{kernel_dim};
|
||||
col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(),
|
||||
output_shape.GetDims().end());
|
||||
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
|
||||
math::Im2colNd<float, StorageOrder::NCHW>()(
|
||||
Xdata + group_id * X_offset,
|
||||
image_shape.GetDims().data(),
|
||||
col_buffer_shape.data(),
|
||||
C * input_image_size,
|
||||
col_buffer_size,
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
|
|
|
|||
|
|
@ -92,7 +92,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
|
|||
const size_t kernel_rank = kernel_shape.size();
|
||||
|
||||
BufferUniquePtr col_buffer;
|
||||
std::vector<int64_t> col_buffer_shape;
|
||||
|
||||
// Pointwise convolutions can use the original input tensor in place,
|
||||
// otherwise a temporary buffer is required for the im2col transform.
|
||||
|
|
@ -102,13 +101,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
|
|||
|
||||
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
|
||||
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
|
||||
|
||||
if (kernel_rank != 2) {
|
||||
const auto& output_dims = output_shape.GetDims();
|
||||
col_buffer_shape.reserve(1 + output_dims.size());
|
||||
col_buffer_shape.push_back(kernel_dim);
|
||||
col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
|
||||
}
|
||||
}
|
||||
|
||||
auto* col_buffer_data = static_cast<uint8_t*>(col_buffer.get());
|
||||
|
|
@ -143,10 +135,9 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
|
|||
} else {
|
||||
math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
|
||||
Xdata,
|
||||
X->Shape().GetDims().data() + 1,
|
||||
col_buffer_shape.data(),
|
||||
C * input_image_size,
|
||||
col_buffer_size,
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
|
|
|
|||
|
|
@ -74,9 +74,6 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
|
|||
const T* filter_data = p.F->template Data<T>();
|
||||
T* Ydata = p.Y->template MutableData<T>();
|
||||
|
||||
std::vector<int64_t> col_buffer_shape{kernel_dim};
|
||||
col_buffer_shape.insert(col_buffer_shape.end(), p.input_shape.GetDims().begin(), p.input_shape.GetDims().end());
|
||||
|
||||
if (p.X->Shape().NumDimensions() == 4) {
|
||||
for (auto image_id = 0; image_id < p.N; ++image_id) {
|
||||
for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
|
||||
|
|
@ -124,8 +121,7 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
|
|||
Ydata += Y_offset * conv_transpose_attrs_.group;
|
||||
}
|
||||
} else {
|
||||
TensorShape output_shape = p.Y->Shape().Slice(1);
|
||||
output_shape[0] = output_shape[0] / conv_transpose_attrs_.group;
|
||||
TensorShape output_shape = p.Y->Shape().Slice(2);
|
||||
|
||||
for (auto image_id = 0; image_id < p.N; ++image_id) {
|
||||
for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
|
||||
|
|
@ -147,9 +143,9 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
|
|||
math::Col2imNd<T, CPUMathUtil, StorageOrder::NCHW>(
|
||||
col_buffer_data,
|
||||
output_shape.GetDims().data(),
|
||||
col_buffer_shape.data(),
|
||||
output_shape.Size(),
|
||||
col_buffer_size,
|
||||
p.input_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
Y_offset,
|
||||
p.kernel_shape.data(),
|
||||
p.strides.data(),
|
||||
p.dilations.data(),
|
||||
|
|
|
|||
|
|
@ -75,7 +75,6 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
|
|||
const Tensor* B = context->Input<Tensor>(8);
|
||||
|
||||
const int64_t N = X->Shape()[0];
|
||||
const int64_t C = X->Shape()[1];
|
||||
const int64_t M = W->Shape()[0];
|
||||
ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
|
||||
|
||||
|
|
@ -125,20 +124,12 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
|
|||
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
|
||||
|
||||
BufferUniquePtr col_buffer;
|
||||
std::vector<int64_t> col_buffer_shape;
|
||||
|
||||
// Pointwise convolutions can use the original input tensor in place,
|
||||
// otherwise a temporary buffer is required for the im2col transform.
|
||||
if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
|
||||
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
|
||||
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
|
||||
|
||||
if (kernel_rank != 2) {
|
||||
const auto& output_dims = output_shape.GetDims();
|
||||
col_buffer_shape.reserve(1 + output_dims.size());
|
||||
col_buffer_shape.push_back(kernel_dim);
|
||||
col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
|
||||
}
|
||||
}
|
||||
|
||||
auto* col_buffer_data = static_cast<uint8_t*>(col_buffer.get());
|
||||
|
|
@ -187,10 +178,9 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
|
|||
} else {
|
||||
math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
|
||||
Xdata,
|
||||
X->Shape().GetDims().data() + 1,
|
||||
col_buffer_shape.data(),
|
||||
C * input_image_size,
|
||||
col_buffer_size,
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
|
|
@ -450,7 +440,6 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
|
|||
ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W_shape, kernel_shape));
|
||||
|
||||
const size_t kernel_rank = kernel_shape.size();
|
||||
ORT_ENFORCE(kernel_rank == 2, "QLinearConv : must be 2D convolution");
|
||||
|
||||
std::vector<int64_t> pads(conv_attrs_.pads);
|
||||
if (pads.empty()) {
|
||||
|
|
@ -544,9 +533,10 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
|
|||
auto* transpose_output = static_cast<uint8_t*>(alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * Y_offset));
|
||||
BufferUniquePtr transpose_output_buffer(transpose_output, BufferDeleter(alloc));
|
||||
|
||||
BufferUniquePtr col_buffer;
|
||||
|
||||
// Pointwise convolutions can use the original input tensor in place,
|
||||
// otherwise a temporary buffer is required for the im2col transform.
|
||||
BufferUniquePtr col_buffer;
|
||||
if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
|
||||
auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
|
||||
col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
|
||||
|
|
@ -582,6 +572,23 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
|
|||
static_cast<size_t>(group_input_channels),
|
||||
static_cast<size_t>(input_image_size));
|
||||
|
||||
if (kernel_rank != 2 && col_buffer_data != nullptr) {
|
||||
// Try big Im2ColNd in this case, parallel it later if needed
|
||||
math::Im2colNd<uint8_t, StorageOrder::NHWC>()(
|
||||
transpose_input,
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
pads.data(),
|
||||
static_cast<int>(kernel_rank),
|
||||
col_buffer_data,
|
||||
false,
|
||||
X_zero_point_value);
|
||||
}
|
||||
|
||||
auto conv_worker = [&](ptrdiff_t batch) {
|
||||
auto work = concurrency::ThreadPool::PartitionWork(batch, thread_count, static_cast<ptrdiff_t>(output_image_size));
|
||||
int64_t output_start = static_cast<int64_t>(work.start);
|
||||
|
|
@ -592,24 +599,26 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
|
|||
uint8_t* worker_gemm_input;
|
||||
if (col_buffer_data != nullptr) {
|
||||
worker_gemm_input = col_buffer_data + output_start * kernel_dim;
|
||||
math::Im2col<uint8_t, StorageOrder::NHWC>()(
|
||||
transpose_input,
|
||||
group_input_channels,
|
||||
input_shape[0],
|
||||
input_shape[1],
|
||||
kernel_shape[0],
|
||||
kernel_shape[1],
|
||||
dilations[0],
|
||||
dilations[1],
|
||||
pads[0],
|
||||
pads[1],
|
||||
strides[0],
|
||||
strides[1],
|
||||
output_shape[1],
|
||||
output_start,
|
||||
output_count,
|
||||
worker_gemm_input,
|
||||
X_zero_point_value);
|
||||
if (kernel_rank == 2) {
|
||||
math::Im2col<uint8_t, StorageOrder::NHWC>()(
|
||||
transpose_input,
|
||||
group_input_channels,
|
||||
input_shape[0],
|
||||
input_shape[1],
|
||||
kernel_shape[0],
|
||||
kernel_shape[1],
|
||||
dilations[0],
|
||||
dilations[1],
|
||||
pads[0],
|
||||
pads[1],
|
||||
strides[0],
|
||||
strides[1],
|
||||
output_shape[1],
|
||||
output_start,
|
||||
output_count,
|
||||
worker_gemm_input,
|
||||
X_zero_point_value);
|
||||
}
|
||||
} else {
|
||||
worker_gemm_input = transpose_input + output_start * kernel_dim;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -254,9 +254,8 @@ struct Im2colNd {
|
|||
void operator()(
|
||||
const T* data_img,
|
||||
const int64_t* im_shape,
|
||||
const int64_t* col_shape,
|
||||
int64_t img_size,
|
||||
int64_t col_size,
|
||||
const int64_t* output_shape,
|
||||
int64_t channels_col,
|
||||
const int64_t* kernel_shape,
|
||||
const int64_t* stride,
|
||||
const int64_t* dilation,
|
||||
|
|
@ -267,78 +266,13 @@ struct Im2colNd {
|
|||
T padding_value = 0);
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct Im2colNd<T, StorageOrder::NCHW> {
|
||||
void operator()(const T* data_img, const int64_t* im_shape, const int64_t* col_shape, int64_t /*img_size*/,
|
||||
int64_t /*col_size*/, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
|
||||
const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
|
||||
T padding_value = 0) {
|
||||
int64_t kernel_size = 1;
|
||||
for (int64_t i = 0; i < N; ++i) {
|
||||
kernel_size *= kernel_shape[i];
|
||||
}
|
||||
int64_t channels_col = col_shape[0];
|
||||
std::vector<int64_t> d_offset(N, 0);
|
||||
std::vector<int64_t> d_iter(N, 0);
|
||||
for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
|
||||
// Loop over spatial axes in reverse order to compute a per-axis offset.
|
||||
int64_t offset = c_col;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
if (d_i < N - 1) {
|
||||
offset /= kernel_shape[d_i + 1];
|
||||
}
|
||||
d_offset[d_i] = offset % kernel_shape[d_i];
|
||||
}
|
||||
for (bool incremented = true; incremented;) {
|
||||
// Loop over spatial axes in forward order to compute the indices in the
|
||||
// image and column, and whether the index lies in the padding.
|
||||
int64_t index_col = c_col;
|
||||
int64_t index_im = c_col / kernel_size;
|
||||
bool is_padding = false;
|
||||
for (int64_t d_i = 0; d_i < N; ++d_i) {
|
||||
int64_t d = d_iter[d_i];
|
||||
int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
|
||||
is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
|
||||
index_col *= col_shape[d_i + 1];
|
||||
index_col += d;
|
||||
index_im *= im_shape[d_i + 1];
|
||||
index_im += d_im;
|
||||
}
|
||||
if (!accumulate_output) {
|
||||
if (is_padding) {
|
||||
data_col[index_col] = padding_value;
|
||||
} else {
|
||||
data_col[index_col] = data_img[index_im];
|
||||
}
|
||||
} else if (!is_padding) { // col2im
|
||||
data_col[index_im] += data_img[index_col];
|
||||
}
|
||||
// Loop over spatial axes in reverse order to choose an index,
|
||||
// like counting.
|
||||
incremented = false;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
int64_t d_max = col_shape[d_i + 1];
|
||||
ORT_ENFORCE(d_iter[d_i] < d_max);
|
||||
if (d_iter[d_i] == d_max - 1) {
|
||||
d_iter[d_i] = 0;
|
||||
} else { // d_iter[d_i] < d_max - 1
|
||||
++d_iter[d_i];
|
||||
incremented = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // while(incremented) {
|
||||
} // for (int c = 0; c < channels_col; ++c) {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class Provider, int order>
|
||||
void Col2imNd(
|
||||
const T* data_col,
|
||||
const int64_t* img_shape,
|
||||
const int64_t* col_shape,
|
||||
const int64_t* output_shape,
|
||||
int64_t channels_col,
|
||||
int64_t img_size,
|
||||
int64_t col_size,
|
||||
const int64_t* kernel_shape,
|
||||
const int64_t* stride,
|
||||
const int64_t* dilation,
|
||||
|
|
|
|||
|
|
@ -210,7 +210,6 @@ template void Gemv<double, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, int M, int
|
|||
SPECIALIZED_AXPY(float)
|
||||
#undef SPECIALIZED_AXPY
|
||||
|
||||
|
||||
#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \
|
||||
template <> \
|
||||
void Funcname<T, CPUMathUtil>(int N, const T* x, T* y, CPUMathUtil*) { \
|
||||
|
|
@ -420,6 +419,130 @@ void Im2col<T, StorageOrder::NHWC>::operator()(const T* data_im, int64_t channel
|
|||
|
||||
template struct Im2col<uint8_t, StorageOrder::NHWC>;
|
||||
|
||||
// Loop over spatial axes in reverse order to choose an index, like counting.
|
||||
static inline bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) {
|
||||
bool has_next_output = false;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
int64_t d_max = shape[d_i];
|
||||
ORT_ENFORCE(dims[d_i] < d_max);
|
||||
if (dims[d_i] == d_max - 1) {
|
||||
dims[d_i] = 0;
|
||||
} else { // dims[d_i] < d_max - 1
|
||||
++dims[d_i];
|
||||
has_next_output = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return has_next_output;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct Im2colNd<T, StorageOrder::NCHW> {
|
||||
void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col,
|
||||
const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
|
||||
const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
|
||||
T padding_value = 0) {
|
||||
int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies<int64_t>());
|
||||
std::vector<int64_t> d_offset(N, 0);
|
||||
std::vector<int64_t> d_iter(N, 0);
|
||||
for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
|
||||
// Loop over spatial axes in reverse order to compute a per-axis offset.
|
||||
int64_t offset = c_col;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
if (d_i < N - 1) {
|
||||
offset /= kernel_shape[d_i + 1];
|
||||
}
|
||||
d_offset[d_i] = offset % kernel_shape[d_i];
|
||||
}
|
||||
do {
|
||||
// Loop over spatial axes in forward order to compute the indices in the
|
||||
// image and column, and whether the index lies in the padding.
|
||||
int64_t index_col = c_col;
|
||||
int64_t index_im = c_col / kernel_size;
|
||||
bool is_padding = false;
|
||||
for (int64_t d_i = 0; d_i < N; ++d_i) {
|
||||
int64_t d = d_iter[d_i];
|
||||
int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
|
||||
is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]);
|
||||
index_col *= output_shape[d_i];
|
||||
index_col += d;
|
||||
index_im *= im_shape[d_i];
|
||||
index_im += d_im;
|
||||
}
|
||||
if (!accumulate_output) {
|
||||
if (is_padding) {
|
||||
data_col[index_col] = padding_value;
|
||||
} else {
|
||||
data_col[index_col] = data_img[index_im];
|
||||
}
|
||||
} else if (!is_padding) { // col2im
|
||||
data_col[index_im] += data_img[index_col];
|
||||
}
|
||||
} while (NextPosition(N, output_shape, d_iter.data()));
|
||||
} // for (int c = 0; c < channels_col; ++c) {
|
||||
}
|
||||
};
|
||||
|
||||
template struct Im2colNd<float, StorageOrder::NCHW>;
|
||||
template struct Im2colNd<uint8_t, StorageOrder::NCHW>;
|
||||
|
||||
template <typename T>
|
||||
struct Im2colNd<T, StorageOrder::NHWC> {
|
||||
void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col,
|
||||
const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
|
||||
const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
|
||||
T padding_value = 0) {
|
||||
int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies<int64_t>());
|
||||
int64_t input_channels = channels_col / kernel_size;
|
||||
ORT_ENFORCE(input_channels * kernel_size == channels_col, "Dimensions not match!");
|
||||
|
||||
// iterate dimensions on output image shape (without Batch and Channel)
|
||||
std::vector<int64_t> d_output(N, 0);
|
||||
// inner iterate dimensions on kernel shape (without output channel and input channel)
|
||||
std::vector<int64_t> d_kernel(N, 0);
|
||||
|
||||
// Loop over spatial axes along the output image shape
|
||||
int64_t outer_col_index = 0;
|
||||
do {
|
||||
// Loop over spatial axes in reverse order to choose an index on kernel dimensions
|
||||
int64_t inner_col_index = 0;
|
||||
do {
|
||||
// Loop over spatial axes in forward order to compute the indices in the image
|
||||
// and the inner col, and whether the index lies in the padding.
|
||||
int64_t index_im = 0;
|
||||
bool is_padding = false;
|
||||
for (int64_t d_i = 0; d_i < N; ++d_i) {
|
||||
int64_t d_im = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i];
|
||||
is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]);
|
||||
index_im *= im_shape[d_i];
|
||||
index_im += d_im;
|
||||
}
|
||||
index_im *= input_channels;
|
||||
auto index_col = outer_col_index + inner_col_index;
|
||||
|
||||
if (!accumulate_output) {
|
||||
if (is_padding) {
|
||||
std::fill_n(data_col + index_col, input_channels, padding_value);
|
||||
} else {
|
||||
std::copy_n(data_img + index_im, input_channels, data_col + index_col);
|
||||
}
|
||||
} else if (!is_padding) { // col2im
|
||||
const T* ptr_im = data_img + index_col;
|
||||
T* ptr_col = data_col + index_im;
|
||||
for (int64_t i = 0; i < input_channels; ++i) {
|
||||
*ptr_col++ += *ptr_im++;
|
||||
}
|
||||
}
|
||||
inner_col_index += input_channels;
|
||||
} while (NextPosition(N, kernel_shape, d_kernel.data()));
|
||||
|
||||
outer_col_index += channels_col;
|
||||
} while (NextPosition(N, output_shape, d_output.data()));
|
||||
}
|
||||
};
|
||||
|
||||
template struct Im2colNd<uint8_t, StorageOrder::NHWC>;
|
||||
|
||||
template <>
|
||||
void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64_t channels, int64_t height,
|
||||
int64_t width, int64_t kernel_h, int64_t kernel_w,
|
||||
|
|
@ -558,7 +681,7 @@ void Col2im<float, CPUMathUtil, StorageOrder::NHWC>(const float* data_col, int64
|
|||
|
||||
template <>
|
||||
void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, const int64_t* img_shape,
|
||||
const int64_t* col_shape, int64_t img_size, int64_t col_size,
|
||||
const int64_t* output_shape, int64_t channels_col, int64_t img_size,
|
||||
const int64_t* kernel_shape, const int64_t* stride,
|
||||
const int64_t* dilation, const int64_t* pad, int64_t N,
|
||||
float* data_img, CPUMathUtil* context) {
|
||||
|
|
@ -566,9 +689,8 @@ void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, con
|
|||
Im2colNd<float, StorageOrder::NCHW>()(
|
||||
data_col,
|
||||
img_shape,
|
||||
col_shape,
|
||||
img_size,
|
||||
col_size,
|
||||
output_shape,
|
||||
channels_col,
|
||||
kernel_shape,
|
||||
stride,
|
||||
dilation,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <algorithm>
|
||||
#include "core/util/math.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
#include "core/mlas/inc/mlas.h"
|
||||
|
|
@ -334,6 +336,23 @@ class QLinearConvOpTester {
|
|||
return static_cast<T>(RoundHalfToEven(f) + requantize_values.zero_point_);
|
||||
}
|
||||
|
||||
static bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) {
|
||||
// Loop over spatial axes in reverse order to choose an index, like counting.
|
||||
bool incremented = false;
|
||||
for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
|
||||
int64_t d_max = shape[d_i];
|
||||
ORT_ENFORCE(dims[d_i] < d_max);
|
||||
if (dims[d_i] == d_max - 1) {
|
||||
dims[d_i] = 0;
|
||||
} else { // dims[d_i] < d_max - 1
|
||||
++dims[d_i];
|
||||
incremented = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return incremented;
|
||||
}
|
||||
|
||||
void ComputeExpectedOutput(std::vector<T1>& Y_data, std::vector<int64_t>& Y_shape) {
|
||||
ORT_ENFORCE(W_.shape_.size() > 2);
|
||||
ORT_ENFORCE(X_.shape_.size() == W_.shape_.size());
|
||||
|
|
@ -377,20 +396,10 @@ class QLinearConvOpTester {
|
|||
const int64_t* output_shape = Y_shape.data() + 2;
|
||||
Y_data.resize(ShapeSize(Y_shape));
|
||||
|
||||
const int64_t input_h = input_shape[0];
|
||||
const int64_t input_w = input_shape[1];
|
||||
const int64_t input_image_size = input_h * input_w;
|
||||
const int64_t kernel_h = kernel_shape[0];
|
||||
const int64_t kernel_w = kernel_shape[1];
|
||||
const int64_t kernel_size = kernel_h * kernel_w;
|
||||
const int64_t output_h = output_shape[0];
|
||||
const int64_t output_w = output_shape[1];
|
||||
const int64_t pad_t = pads[0];
|
||||
const int64_t pad_l = pads[1];
|
||||
const int64_t dilation_h = dilations[0];
|
||||
const int64_t dilation_w = dilations[1];
|
||||
const int64_t stride_h = strides[0];
|
||||
const int64_t stride_w = strides[1];
|
||||
const int64_t input_image_size = std::accumulate(
|
||||
input_shape, input_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
|
||||
const int64_t kernel_size = std::accumulate(
|
||||
kernel_shape, kernel_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
|
||||
const int32_t X_zero_point = X_.zero_point_;
|
||||
|
||||
const T1* Xdata = X_.data_.data();
|
||||
|
|
@ -409,29 +418,34 @@ class QLinearConvOpTester {
|
|||
float weight_scale = W_.scale_[(W_.scale_.size() == 1) ? 0 : channel_index];
|
||||
float requantize_scale = (X_.scale_[0] * weight_scale) / output_scale_;
|
||||
|
||||
for (int64_t oh = 0; oh < output_h; oh++) {
|
||||
for (int64_t ow = 0; ow < output_w; ow++) {
|
||||
int32_t sum = bias;
|
||||
const T1* input_image = Xdata;
|
||||
const T2* weight_data = weight_row;
|
||||
for (int64_t ic = 0; ic < group_input_channels; ic++) {
|
||||
for (int64_t kh = 0; kh < kernel_h; kh++) {
|
||||
int64_t ih = kh * dilation_h + oh * stride_h - pad_t;
|
||||
for (int64_t kw = 0; kw < kernel_w; kw++) {
|
||||
int64_t iw = kw * dilation_w + ow * stride_w - pad_l;
|
||||
int32_t w_value = static_cast<int32_t>(*weight_data++);
|
||||
if (static_cast<uint64_t>(ih) < static_cast<uint64_t>(input_h) &&
|
||||
static_cast<uint64_t>(iw) < static_cast<uint64_t>(input_w)) {
|
||||
int32_t x_value = static_cast<int32_t>(input_image[ih * input_w + iw]) - X_zero_point;
|
||||
sum += x_value * w_value;
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> d_output(kernel_rank, 0);
|
||||
std::vector<int64_t> d_kernel(kernel_rank, 0);
|
||||
do {
|
||||
int32_t sum = bias;
|
||||
const T1* input_image = Xdata;
|
||||
const T2* weight_data = weight_row;
|
||||
for (int64_t ic = 0; ic < group_input_channels; ic++) {
|
||||
do {
|
||||
int64_t input_offset = 0;
|
||||
bool is_padding = false;
|
||||
for (size_t axis = 0; axis < kernel_rank; ++axis) {
|
||||
int64_t input_dim = d_kernel[axis] * dilations[axis] + d_output[axis] * strides[axis] - pads[axis];
|
||||
is_padding |= !math::is_a_ge_zero_and_a_lt_b(input_dim, input_shape[axis]);
|
||||
input_offset *= input_shape[axis];
|
||||
input_offset += input_dim;
|
||||
}
|
||||
input_image += input_image_size;
|
||||
}
|
||||
*Ydata++ = RequantizeOutput<T1>(sum, requantize_scale, requantize_values);
|
||||
int32_t w_value = static_cast<int32_t>(*weight_data++);
|
||||
if (!is_padding) {
|
||||
int32_t x_value = static_cast<int32_t>(input_image[input_offset]) - X_zero_point;
|
||||
sum += x_value * w_value;
|
||||
}
|
||||
} while (NextPosition(kernel_rank, kernel_shape, d_kernel.data()));
|
||||
|
||||
input_image += input_image_size;
|
||||
}
|
||||
}
|
||||
*Ydata++ = RequantizeOutput<T1>(sum, requantize_scale, requantize_values);
|
||||
|
||||
} while (NextPosition(kernel_rank, output_shape, d_output.data()));
|
||||
|
||||
weight_row += group_input_channels * kernel_size;
|
||||
}
|
||||
|
|
@ -538,6 +552,16 @@ class QLinearConvOpTester {
|
|||
}
|
||||
};
|
||||
|
||||
// 1-D QLinearConv with the <uint8_t, int8_t> tester: input {3, 24, 15}, weights
// {32, 24, 3}, random bias, pads {1, 1}.
// NOTE(review): the trailing scalars passed to GenerateRandomInput/Weights look like
// (scale, zero point) - confirm against the tester.
TEST(QLinearConvTest, Conv1D_U8S8) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({3, 24, 15}, .05f, 4);
|
||||
test.GenerateRandomWeights({32, 24, 3}, .125f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetPads({1, 1});
|
||||
test.SetOutputScaleAndZeroPoint(.55f, 54);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(QLinearConvTest, Conv2D_U8S8) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4);
|
||||
|
|
@ -548,6 +572,52 @@ TEST(QLinearConvTest, Conv2D_U8S8) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// 3-D QLinearConv with the <uint8_t, int8_t> tester: input {2, 2, 15, 11, 6}, weights
// {5, 2, 3, 3, 3}, random bias, pads of 1 on all six entries.
TEST(QLinearConvTest, Conv3D_U8S8) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4);
|
||||
test.GenerateRandomWeights({5, 2, 3, 3, 3}, .125f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetPads({1, 1, 1, 1, 1, 1});
|
||||
test.SetOutputScaleAndZeroPoint(.55f, 54);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 1-D pointwise case: weights {32, 24, 1} (kernel extent 1) over input {3, 24, 15},
// with random bias and no pads, strides or dilations set.
TEST(QLinearConvTest, Conv1D_U8S8_Pointwise) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({3, 24, 15}, .05f, 4);
|
||||
test.GenerateRandomWeights({32, 24, 1}, .125f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetOutputScaleAndZeroPoint(.55f, 54);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 2-D pointwise case: weights {32, 24, 1, 1} (1x1 kernel) over input {3, 24, 15, 11},
// with random bias and no pads, strides or dilations set.
TEST(QLinearConvTest, Conv2D_U8S8_Pointwise) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4);
|
||||
test.GenerateRandomWeights({32, 24, 1, 1}, .125f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetOutputScaleAndZeroPoint(.55f, 54);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 3-D pointwise case: weights {5, 2, 1, 1, 1} (1x1x1 kernel) over input
// {2, 2, 15, 11, 6}, with random bias and no pads, strides or dilations set.
TEST(QLinearConvTest, Conv3D_U8S8_Pointwise) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4);
|
||||
test.GenerateRandomWeights({5, 2, 1, 1, 1}, .125f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetOutputScaleAndZeroPoint(.55f, 54);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 1-D dilated case: input {1, 4, 19}, weights {6, 4, 3}, dilation {2}; no bias or
// pads are set.
TEST(QLinearConvTest, Conv1D_U8S8_Dilations) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 4, 19}, .02f, 20);
|
||||
test.GenerateRandomWeights({6, 4, 3}, .11f, 0);
|
||||
test.SetDilations({2});
|
||||
test.SetOutputScaleAndZeroPoint(.24f, 15);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(QLinearConvTest, Conv2D_U8S8_Dilations) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 4, 19, 16}, .02f, 20);
|
||||
|
|
@ -557,6 +627,24 @@ TEST(QLinearConvTest, Conv2D_U8S8_Dilations) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// 3-D dilated case: input {1, 2, 19, 16, 8}, weights {6, 2, 3, 2, 2}, dilations
// {2, 2, 2}; no bias or pads are set.
TEST(QLinearConvTest, Conv3D_U8S8_Dilations) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 2, 19, 16, 8}, .02f, 20);
|
||||
test.GenerateRandomWeights({6, 2, 3, 2, 2}, .11f, 0);
|
||||
test.SetDilations({2, 2, 2});
|
||||
test.SetOutputScaleAndZeroPoint(.24f, 15);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 1-D strided case: input {1, 7, 18}, weights {5, 7, 2}, stride {2}; no bias or pads
// are set.
TEST(QLinearConvTest, Conv1D_U8S8_Strides) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 7, 18}, .04f, 16);
|
||||
test.GenerateRandomWeights({5, 7, 2}, .14f, 0);
|
||||
test.SetStrides({2});
|
||||
test.SetOutputScaleAndZeroPoint(.31f, 30);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(QLinearConvTest, Conv2D_U8S8_Strides) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 7, 18, 24}, .04f, 16);
|
||||
|
|
@ -566,6 +654,26 @@ TEST(QLinearConvTest, Conv2D_U8S8_Strides) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// 3-D strided case: input {1, 3, 18, 24, 18}, weights {2, 3, 2, 3, 2}, strides
// {2, 2, 2}; no bias or pads are set.
TEST(QLinearConvTest, Conv3D_U8S8_Strides) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 3, 18, 24, 18}, .04f, 16);
|
||||
test.GenerateRandomWeights({2, 3, 2, 3, 2}, .14f, 0);
|
||||
test.SetStrides({2, 2, 2});
|
||||
test.SetOutputScaleAndZeroPoint(.31f, 30);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// 1-D grouped case: input {1, 8, 13}, weights {12, 4, 3}, random bias, pads {1, 1},
// and SetGroups(2).
TEST(QLinearConvTest, Conv1D_U8S8_Groups) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 8, 13}, .03f, 7);
|
||||
test.GenerateRandomWeights({12, 4, 3}, .10f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetPads({1, 1});
|
||||
test.SetGroups(2);
|
||||
test.SetOutputScaleAndZeroPoint(.76f, 88);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(QLinearConvTest, Conv2D_U8S8_Groups) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7);
|
||||
|
|
@ -577,6 +685,17 @@ TEST(QLinearConvTest, Conv2D_U8S8_Groups) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
// 3-D grouped case: input {1, 4, 13, 17, 13}, weights {6, 2, 3, 3, 3}, random bias,
// pads of 1 on all six entries, and SetGroups(2).
TEST(QLinearConvTest, Conv3D_U8S8_Groups) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 4, 13, 17, 13}, .03f, 7);
|
||||
test.GenerateRandomWeights({6, 2, 3, 3, 3}, .10f, 0);
|
||||
test.GenerateRandomBias();
|
||||
test.SetPads({1, 1, 1, 1, 1, 1});
|
||||
test.SetGroups(2);
|
||||
test.SetOutputScaleAndZeroPoint(.76f, 88);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
TEST(QLinearConvTest, Conv2D_U8S8_Groups_PerChannel) {
|
||||
QLinearConvOpTester<uint8_t, int8_t> test;
|
||||
test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7);
|
||||
|
|
|
|||
|
|
@ -703,6 +703,21 @@ TEST(GradientCheckerTest, ConvGrad) {
|
|||
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
|
||||
}
|
||||
|
||||
//conv3d
|
||||
{
|
||||
TensorShape x_shape({2, 1, 5, 5, 5});
|
||||
TensorShape w_shape({1, 1, 3, 3, 3});
|
||||
TensorShape b_shape({1});
|
||||
TensorShape y_shape({2, 1, 5, 5, 5});
|
||||
gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
|
||||
{MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
|
||||
MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1})},
|
||||
// TODO: ConvGrad does not handle the case where W does not have gradient.
|
||||
// Check for not has_gradient need to be disabled to pass this test.
|
||||
false);
|
||||
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
|
||||
}
|
||||
|
||||
//conv_with_strides
|
||||
{
|
||||
TensorShape x_shape({2, 1, 7, 5});
|
||||
|
|
@ -718,6 +733,22 @@ TEST(GradientCheckerTest, ConvGrad) {
|
|||
false);
|
||||
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
|
||||
}
|
||||
|
||||
//conv3d_with_strides
|
||||
{
|
||||
TensorShape x_shape({2, 1, 7, 5, 5});
|
||||
TensorShape w_shape({1, 1, 3, 3, 3});
|
||||
TensorShape b_shape({1});
|
||||
TensorShape y_shape({2, 1, 4, 3, 3});
|
||||
gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
|
||||
{MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
|
||||
MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}),
|
||||
MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
|
||||
// TODO: ConvGrad does not handle the case where W does not have gradient.
|
||||
// Check for not has_gradient need to be disabled to pass this test.
|
||||
false);
|
||||
EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
|
||||
}
|
||||
}
|
||||
|
||||
static void TestConcatOpGrad(const std::string& op_type,
|
||||
|
|
|
|||
|
|
@ -98,11 +98,6 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
|
|||
&CPUMathUtil::Instance());
|
||||
}
|
||||
|
||||
TensorShape image_shape = X->Shape().Slice(1);
|
||||
std::vector<int64_t> col_buffer_shape{kernel_dim};
|
||||
col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(),
|
||||
output_shape.GetDims().end());
|
||||
|
||||
for (int image_id = 0; image_id < N; ++image_id) {
|
||||
for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
|
||||
if (Is2DKernel) {
|
||||
|
|
@ -125,10 +120,9 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
|
|||
} else {
|
||||
math::Im2colNd<T, StorageOrder::NCHW>()(
|
||||
Xdata + group_id * X_offset,
|
||||
image_shape.GetDims().data(),
|
||||
col_buffer_shape.data(),
|
||||
C * input_image_size,
|
||||
col_buffer_size,
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
|
|
@ -208,10 +202,10 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
|
|||
} else {
|
||||
math::Col2imNd<T, CPUMathUtil, StorageOrder::NCHW>(
|
||||
col_buffer_data,
|
||||
image_shape.GetDims().data(),
|
||||
col_buffer_shape.data(),
|
||||
input_shape.GetDims().data(),
|
||||
output_shape.GetDims().data(),
|
||||
kernel_dim,
|
||||
C * input_image_size,
|
||||
col_buffer_size,
|
||||
kernel_shape.data(),
|
||||
strides.data(),
|
||||
dilations.data(),
|
||||
|
|
|
|||
Loading…
Reference in a new issue