diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index 4581d1f83a..e7d69e95a2 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -71,7 +71,6 @@ Status Conv::Compute(OpKernelContext* context) const { const size_t kernel_rank = kernel_shape.size(); BufferUniquePtr col_buffer; - std::vector col_buffer_shape; // Pointwise convolutions can use the original input tensor in place, // otherwise a temporary buffer is required for the im2col transform. @@ -81,13 +80,6 @@ Status Conv::Compute(OpKernelContext* context) const { auto* col_data = alloc->Alloc(SafeInt(sizeof(T)) * col_buffer_size); col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc)); - - if (kernel_rank != 2) { - const auto& output_dims = output_shape.GetDims(); - col_buffer_shape.reserve(1 + output_dims.size()); - col_buffer_shape.push_back(kernel_dim); - col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end()); - } } T* col_buffer_data = static_cast(col_buffer.get()); @@ -120,10 +112,9 @@ Status Conv::Compute(OpKernelContext* context) const { } else { math::Im2colNd()( Xdata + group_id * X_offset, - X->Shape().GetDims().data() + 1, - col_buffer_shape.data(), - C * input_image_size, - col_buffer_size, + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, kernel_shape.data(), strides.data(), dilations.data(), @@ -251,19 +242,13 @@ Status Conv::Compute(OpKernelContext* context) const { BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); auto* col_buffer_data = static_cast(col_buffer.get()); - TensorShape image_shape = X->Shape().Slice(1); - std::vector col_buffer_shape{kernel_dim}; - col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(), - output_shape.GetDims().end()); - for (int image_id = 0; image_id < N; ++image_id) { for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) { math::Im2colNd()( Xdata + group_id * X_offset, - image_shape.GetDims().data(), - col_buffer_shape.data(), - C * input_image_size, - col_buffer_size, + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, kernel_shape.data(), strides.data(), dilations.data(), diff --git a/onnxruntime/core/providers/cpu/nn/conv_integer.cc b/onnxruntime/core/providers/cpu/nn/conv_integer.cc index 10ef6a3133..850da62f46 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_integer.cc +++ b/onnxruntime/core/providers/cpu/nn/conv_integer.cc @@ -92,7 +92,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const { const size_t kernel_rank = kernel_shape.size(); BufferUniquePtr col_buffer; - std::vector col_buffer_shape; // Pointwise convolutions can use the original input tensor in place, // otherwise a temporary buffer is required for the im2col transform. @@ -102,13 +101,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const { auto* col_data = alloc->Alloc(SafeInt(sizeof(uint8_t)) * col_buffer_size); col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc)); - - if (kernel_rank != 2) { - const auto& output_dims = output_shape.GetDims(); - col_buffer_shape.reserve(1 + output_dims.size()); - col_buffer_shape.push_back(kernel_dim); - col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end()); - } } auto* col_buffer_data = static_cast(col_buffer.get()); @@ -143,10 +135,9 @@ Status ConvInteger::Compute(OpKernelContext* context) const { } else { math::Im2colNd()( Xdata, - X->Shape().GetDims().data() + 1, - col_buffer_shape.data(), - C * input_image_size, - col_buffer_size, + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, kernel_shape.data(), strides.data(), dilations.data(), diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc index 0e07642868..67f3cca4cb 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc @@ -74,9 +74,6 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ const T* filter_data = p.F->template Data(); T* Ydata = p.Y->template MutableData(); - std::vector col_buffer_shape{kernel_dim}; - col_buffer_shape.insert(col_buffer_shape.end(), p.input_shape.GetDims().begin(), p.input_shape.GetDims().end()); - if (p.X->Shape().NumDimensions() == 4) { for (auto image_id = 0; image_id < p.N; ++image_id) { for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) { @@ -124,8 +121,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ Ydata += Y_offset * conv_transpose_attrs_.group; } } else { - TensorShape output_shape = p.Y->Shape().Slice(1); - output_shape[0] = output_shape[0] / conv_transpose_attrs_.group; + TensorShape output_shape = p.Y->Shape().Slice(2); for (auto image_id = 0; image_id < p.N; ++image_id) { for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) { @@ -147,9 +143,9 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ math::Col2imNd( col_buffer_data, output_shape.GetDims().data(), - col_buffer_shape.data(), - output_shape.Size(), - col_buffer_size, + p.input_shape.GetDims().data(), + kernel_dim, + Y_offset, p.kernel_shape.data(), p.strides.data(), p.dilations.data(), diff --git a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc index 328233ad6b..1b09dc0905 100644 --- a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc +++ b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc @@ -75,7 +75,6 @@ Status QLinearConv::Compute(OpKernelContext* context) const { const Tensor* B = context->Input(8); const int64_t N = X->Shape()[0]; - const int64_t C = X->Shape()[1]; const int64_t M = W->Shape()[0]; ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W)); @@ -125,20 +124,12 @@ Status QLinearConv::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); BufferUniquePtr col_buffer; - std::vector col_buffer_shape; // Pointwise convolutions can use the original input tensor in place, // otherwise a temporary buffer is required for the im2col transform. if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) { auto* col_data = alloc->Alloc(SafeInt(sizeof(uint8_t)) * col_buffer_size); col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc)); - - if (kernel_rank != 2) { - const auto& output_dims = output_shape.GetDims(); - col_buffer_shape.reserve(1 + output_dims.size()); - col_buffer_shape.push_back(kernel_dim); - col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end()); - } } auto* col_buffer_data = static_cast(col_buffer.get()); @@ -187,10 +178,9 @@ Status QLinearConv::Compute(OpKernelContext* context) const { } else { math::Im2colNd()( Xdata, - X->Shape().GetDims().data() + 1, - col_buffer_shape.data(), - C * input_image_size, - col_buffer_size, + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, kernel_shape.data(), strides.data(), dilations.data(), @@ -450,7 +440,6 @@ Status QLinearConv::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W_shape, kernel_shape)); const size_t kernel_rank = kernel_shape.size(); - ORT_ENFORCE(kernel_rank == 2, "QLinearConv : must be 2D convolution"); std::vector pads(conv_attrs_.pads); if (pads.empty()) { @@ -544,9 +533,10 @@ Status QLinearConv::Compute(OpKernelContext* context) const { auto* transpose_output = static_cast(alloc->Alloc(SafeInt(sizeof(uint8_t)) * Y_offset)); BufferUniquePtr transpose_output_buffer(transpose_output, BufferDeleter(alloc)); + BufferUniquePtr col_buffer; + // Pointwise convolutions can use the original input tensor in place, // otherwise a temporary buffer is required for the im2col transform. - BufferUniquePtr col_buffer; if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) { auto* col_data = alloc->Alloc(SafeInt(sizeof(uint8_t)) * col_buffer_size); col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc)); @@ -582,6 +572,23 @@ Status QLinearConv::Compute(OpKernelContext* context) const { static_cast(group_input_channels), static_cast(input_image_size)); + if (kernel_rank != 2 && col_buffer_data != nullptr) { + // Try big Im2ColNd in this case, parallel it later if needed + math::Im2colNd()( + transpose_input, + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, + kernel_shape.data(), + strides.data(), + dilations.data(), + pads.data(), + static_cast(kernel_rank), + col_buffer_data, + false, + X_zero_point_value); + } + auto conv_worker = [&](ptrdiff_t batch) { auto work = concurrency::ThreadPool::PartitionWork(batch, thread_count, static_cast(output_image_size)); int64_t output_start = static_cast(work.start); @@ -592,24 +599,26 @@ Status QLinearConv::Compute(OpKernelContext* context) const { uint8_t* worker_gemm_input; if (col_buffer_data != nullptr) { worker_gemm_input = col_buffer_data + output_start * kernel_dim; - math::Im2col()( - transpose_input, - group_input_channels, - input_shape[0], - input_shape[1], - kernel_shape[0], - kernel_shape[1], - dilations[0], - dilations[1], - pads[0], - pads[1], - strides[0], - strides[1], - output_shape[1], - output_start, - output_count, - worker_gemm_input, - X_zero_point_value); + if (kernel_rank == 2) { + math::Im2col()( + transpose_input, + group_input_channels, + input_shape[0], + input_shape[1], + kernel_shape[0], + kernel_shape[1], + dilations[0], + dilations[1], + pads[0], + pads[1], + strides[0], + strides[1], + output_shape[1], + output_start, + output_count, + worker_gemm_input, + X_zero_point_value); + } } else { worker_gemm_input = transpose_input + output_start * kernel_dim; } diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h index 16138e680f..14755102cd 100644 --- a/onnxruntime/core/util/math.h +++ b/onnxruntime/core/util/math.h @@ -254,9 +254,8 @@ struct Im2colNd { void operator()( const T* data_img, const int64_t* im_shape, - const int64_t* col_shape, - int64_t img_size, - int64_t col_size, + const int64_t* output_shape, + int64_t channels_col, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation, @@ -267,78 +266,13 @@ struct Im2colNd { T padding_value = 0); }; -template -struct Im2colNd { - void operator()(const T* data_img, const int64_t* im_shape, const int64_t* col_shape, int64_t /*img_size*/, - int64_t /*col_size*/, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation, - const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false, - T padding_value = 0) { - int64_t kernel_size = 1; - for (int64_t i = 0; i < N; ++i) { - kernel_size *= kernel_shape[i]; - } - int64_t channels_col = col_shape[0]; - std::vector d_offset(N, 0); - std::vector d_iter(N, 0); - for (int64_t c_col = 0; c_col < channels_col; ++c_col) { - // Loop over spatial axes in reverse order to compute a per-axis offset. - int64_t offset = c_col; - for (int64_t d_i = N - 1; d_i >= 0; --d_i) { - if (d_i < N - 1) { - offset /= kernel_shape[d_i + 1]; - } - d_offset[d_i] = offset % kernel_shape[d_i]; - } - for (bool incremented = true; incremented;) { - // Loop over spatial axes in forward order to compute the indices in the - // image and column, and whether the index lies in the padding. - int64_t index_col = c_col; - int64_t index_im = c_col / kernel_size; - bool is_padding = false; - for (int64_t d_i = 0; d_i < N; ++d_i) { - int64_t d = d_iter[d_i]; - int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; - is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1]; - index_col *= col_shape[d_i + 1]; - index_col += d; - index_im *= im_shape[d_i + 1]; - index_im += d_im; - } - if (!accumulate_output) { - if (is_padding) { - data_col[index_col] = padding_value; - } else { - data_col[index_col] = data_img[index_im]; - } - } else if (!is_padding) { // col2im - data_col[index_im] += data_img[index_col]; - } - // Loop over spatial axes in reverse order to choose an index, - // like counting. - incremented = false; - for (int64_t d_i = N - 1; d_i >= 0; --d_i) { - int64_t d_max = col_shape[d_i + 1]; - ORT_ENFORCE(d_iter[d_i] < d_max); - if (d_iter[d_i] == d_max - 1) { - d_iter[d_i] = 0; - } else { // d_iter[d_i] < d_max - 1 - ++d_iter[d_i]; - incremented = true; - break; - } - } - } // while(incremented) { - } // for (int c = 0; c < channels_col; ++c) { - } -}; - template void Col2imNd( const T* data_col, const int64_t* img_shape, - const int64_t* col_shape, + const int64_t* output_shape, + int64_t channels_col, int64_t img_size, - int64_t col_size, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation, diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index b9455d6340..1b38f2512d 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -210,7 +210,6 @@ template void Gemv(const CBLAS_TRANSPOSE TransA, int M, int SPECIALIZED_AXPY(float) #undef SPECIALIZED_AXPY - #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr) \ template <> \ void Funcname(int N, const T* x, T* y, CPUMathUtil*) { \ @@ -420,6 +419,130 @@ void Im2col::operator()(const T* data_im, int64_t channel template struct Im2col; +// Loop over spatial axes in reverse order to choose an index, like counting. +static inline bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) { + bool has_next_output = false; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + int64_t d_max = shape[d_i]; + ORT_ENFORCE(dims[d_i] < d_max); + if (dims[d_i] == d_max - 1) { + dims[d_i] = 0; + } else { // dims[d_i] < d_max - 1 + ++dims[d_i]; + has_next_output = true; + break; + } + } + return has_next_output; +} + +template +struct Im2colNd { + void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col, + const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation, + const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false, + T padding_value = 0) { + int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies()); + std::vector d_offset(N, 0); + std::vector d_iter(N, 0); + for (int64_t c_col = 0; c_col < channels_col; ++c_col) { + // Loop over spatial axes in reverse order to compute a per-axis offset. + int64_t offset = c_col; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + if (d_i < N - 1) { + offset /= kernel_shape[d_i + 1]; + } + d_offset[d_i] = offset % kernel_shape[d_i]; + } + do { + // Loop over spatial axes in forward order to compute the indices in the + // image and column, and whether the index lies in the padding. + int64_t index_col = c_col; + int64_t index_im = c_col / kernel_size; + bool is_padding = false; + for (int64_t d_i = 0; d_i < N; ++d_i) { + int64_t d = d_iter[d_i]; + int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i]; + is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]); + index_col *= output_shape[d_i]; + index_col += d; + index_im *= im_shape[d_i]; + index_im += d_im; + } + if (!accumulate_output) { + if (is_padding) { + data_col[index_col] = padding_value; + } else { + data_col[index_col] = data_img[index_im]; + } + } else if (!is_padding) { // col2im + data_col[index_im] += data_img[index_col]; + } + } while (NextPosition(N, output_shape, d_iter.data())); + } // for (int c = 0; c < channels_col; ++c) { + } +}; + +template struct Im2colNd; +template struct Im2colNd; + +template +struct Im2colNd { + void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col, + const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation, + const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false, + T padding_value = 0) { + int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies()); + int64_t input_channels = channels_col / kernel_size; + ORT_ENFORCE(input_channels * kernel_size == channels_col, "Dimensions not match!"); + + // iterate dimensions on output image shape (without Batch and Channel) + std::vector d_output(N, 0); + // inner iterate dimensions on kernel shape (without output channel and input channel) + std::vector d_kernel(N, 0); + + // Loop over spatial axes along the output image shape + int64_t outer_col_index = 0; + do { + // Loop over spatial axes in reverse order to choose an index on kernel dimensions + int64_t inner_col_index = 0; + do { + // Loop over spatial axes in forward order to compute the indices in the image + // and the inner col, and whether the index lies in the padding. + int64_t index_im = 0; + bool is_padding = false; + for (int64_t d_i = 0; d_i < N; ++d_i) { + int64_t d_im = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i]; + is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]); + index_im *= im_shape[d_i]; + index_im += d_im; + } + index_im *= input_channels; + auto index_col = outer_col_index + inner_col_index; + + if (!accumulate_output) { + if (is_padding) { + std::fill_n(data_col + index_col, input_channels, padding_value); + } else { + std::copy_n(data_img + index_im, input_channels, data_col + index_col); + } + } else if (!is_padding) { // col2im + const T* ptr_im = data_img + index_col; + T* ptr_col = data_col + index_im; + for (int64_t i = 0; i < input_channels; ++i) { + *ptr_col++ += *ptr_im++; + } + } + inner_col_index += input_channels; + } while (NextPosition(N, kernel_shape, d_kernel.data())); + + outer_col_index += channels_col; + } while (NextPosition(N, output_shape, d_output.data())); + } +}; + +template struct Im2colNd; + template <> void Col2im(const float* data_col, int64_t channels, int64_t height, int64_t width, int64_t kernel_h, int64_t kernel_w, @@ -558,7 +681,7 @@ void Col2im(const float* data_col, int64 template <> void Col2imNd(const float* data_col, const int64_t* img_shape, - const int64_t* col_shape, int64_t img_size, int64_t col_size, + const int64_t* output_shape, int64_t channels_col, int64_t img_size, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation, const int64_t* pad, int64_t N, float* data_img, CPUMathUtil* context) { @@ -566,9 +689,8 @@ void Col2imNd(const float* data_col, con Im2colNd()( data_col, img_shape, - col_shape, - img_size, - col_size, + output_shape, + channels_col, kernel_shape, stride, dilation, diff --git a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc index a48e411a1b..9f6cc00b2e 100644 --- a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include +#include "core/util/math.h" #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "core/mlas/inc/mlas.h" @@ -334,6 +336,23 @@ class QLinearConvOpTester { return static_cast(RoundHalfToEven(f) + requantize_values.zero_point_); } + static bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) { + // Loop over spatial axes in reverse order to choose an index, like counting. + bool incremented = false; + for (int64_t d_i = N - 1; d_i >= 0; --d_i) { + int64_t d_max = shape[d_i]; + ORT_ENFORCE(dims[d_i] < d_max); + if (dims[d_i] == d_max - 1) { + dims[d_i] = 0; + } else { // dims[d_i] < d_max - 1 + ++dims[d_i]; + incremented = true; + break; + } + } + return incremented; + } + void ComputeExpectedOutput(std::vector& Y_data, std::vector& Y_shape) { ORT_ENFORCE(W_.shape_.size() > 2); ORT_ENFORCE(X_.shape_.size() == W_.shape_.size()); @@ -377,20 +396,10 @@ class QLinearConvOpTester { const int64_t* output_shape = Y_shape.data() + 2; Y_data.resize(ShapeSize(Y_shape)); - const int64_t input_h = input_shape[0]; - const int64_t input_w = input_shape[1]; - const int64_t input_image_size = input_h * input_w; - const int64_t kernel_h = kernel_shape[0]; - const int64_t kernel_w = kernel_shape[1]; - const int64_t kernel_size = kernel_h * kernel_w; - const int64_t output_h = output_shape[0]; - const int64_t output_w = output_shape[1]; - const int64_t pad_t = pads[0]; - const int64_t pad_l = pads[1]; - const int64_t dilation_h = dilations[0]; - const int64_t dilation_w = dilations[1]; - const int64_t stride_h = strides[0]; - const int64_t stride_w = strides[1]; + const int64_t input_image_size = std::accumulate( + input_shape, input_shape + kernel_rank, 1LL, std::multiplies()); + const int64_t kernel_size = std::accumulate( + kernel_shape, kernel_shape + kernel_rank, 1LL, std::multiplies()); const int32_t X_zero_point = X_.zero_point_; const T1* Xdata = X_.data_.data(); @@ -409,29 +418,34 @@ class QLinearConvOpTester { float weight_scale = W_.scale_[(W_.scale_.size() == 1) ? 0 : channel_index]; float requantize_scale = (X_.scale_[0] * weight_scale) / output_scale_; - for (int64_t oh = 0; oh < output_h; oh++) { - for (int64_t ow = 0; ow < output_w; ow++) { - int32_t sum = bias; - const T1* input_image = Xdata; - const T2* weight_data = weight_row; - for (int64_t ic = 0; ic < group_input_channels; ic++) { - for (int64_t kh = 0; kh < kernel_h; kh++) { - int64_t ih = kh * dilation_h + oh * stride_h - pad_t; - for (int64_t kw = 0; kw < kernel_w; kw++) { - int64_t iw = kw * dilation_w + ow * stride_w - pad_l; - int32_t w_value = static_cast(*weight_data++); - if (static_cast(ih) < static_cast(input_h) && - static_cast(iw) < static_cast(input_w)) { - int32_t x_value = static_cast(input_image[ih * input_w + iw]) - X_zero_point; - sum += x_value * w_value; - } - } + std::vector d_output(kernel_rank, 0); + std::vector d_kernel(kernel_rank, 0); + do { + int32_t sum = bias; + const T1* input_image = Xdata; + const T2* weight_data = weight_row; + for (int64_t ic = 0; ic < group_input_channels; ic++) { + do { + int64_t input_offset = 0; + bool is_padding = false; + for (size_t axis = 0; axis < kernel_rank; ++axis) { + int64_t input_dim = d_kernel[axis] * dilations[axis] + d_output[axis] * strides[axis] - pads[axis]; + is_padding |= !math::is_a_ge_zero_and_a_lt_b(input_dim, input_shape[axis]); + input_offset *= input_shape[axis]; + input_offset += input_dim; } - input_image += input_image_size; - } - *Ydata++ = RequantizeOutput(sum, requantize_scale, requantize_values); + int32_t w_value = static_cast(*weight_data++); + if (!is_padding) { + int32_t x_value = static_cast(input_image[input_offset]) - X_zero_point; + sum += x_value * w_value; + } + } while (NextPosition(kernel_rank, kernel_shape, d_kernel.data())); + + input_image += input_image_size; } - } + *Ydata++ = RequantizeOutput(sum, requantize_scale, requantize_values); + + } while (NextPosition(kernel_rank, output_shape, d_output.data())); weight_row += group_input_channels * kernel_size; } @@ -538,6 +552,16 @@ class QLinearConvOpTester { } }; +TEST(QLinearConvTest, Conv1D_U8S8) { + QLinearConvOpTester test; + test.GenerateRandomInput({3, 24, 15}, .05f, 4); + test.GenerateRandomWeights({32, 24, 3}, .125f, 0); + test.GenerateRandomBias(); + test.SetPads({1, 1}); + test.SetOutputScaleAndZeroPoint(.55f, 54); + test.Run(); +} + TEST(QLinearConvTest, Conv2D_U8S8) { QLinearConvOpTester test; test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4); @@ -548,6 +572,52 @@ TEST(QLinearConvTest, Conv2D_U8S8) { test.Run(); } +TEST(QLinearConvTest, Conv3D_U8S8) { + QLinearConvOpTester test; + test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4); + test.GenerateRandomWeights({5, 2, 3, 3, 3}, .125f, 0); + test.GenerateRandomBias(); + test.SetPads({1, 1, 1, 1, 1, 1}); + test.SetOutputScaleAndZeroPoint(.55f, 54); + test.Run(); +} + +TEST(QLinearConvTest, Conv1D_U8S8_Pointwise) { + QLinearConvOpTester test; + test.GenerateRandomInput({3, 24, 15}, .05f, 4); + test.GenerateRandomWeights({32, 24, 1}, .125f, 0); + test.GenerateRandomBias(); + test.SetOutputScaleAndZeroPoint(.55f, 54); + test.Run(); +} + +TEST(QLinearConvTest, Conv2D_U8S8_Pointwise) { + QLinearConvOpTester test; + test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4); + test.GenerateRandomWeights({32, 24, 1, 1}, .125f, 0); + test.GenerateRandomBias(); + test.SetOutputScaleAndZeroPoint(.55f, 54); + test.Run(); +} + +TEST(QLinearConvTest, Conv3D_U8S8_Pointwise) { + QLinearConvOpTester test; + test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4); + test.GenerateRandomWeights({5, 2, 1, 1, 1}, .125f, 0); + test.GenerateRandomBias(); + test.SetOutputScaleAndZeroPoint(.55f, 54); + test.Run(); +} + +TEST(QLinearConvTest, Conv1D_U8S8_Dilations) { + QLinearConvOpTester test; + test.GenerateRandomInput({1, 4, 19}, .02f, 20); + test.GenerateRandomWeights({6, 4, 3}, .11f, 0); + test.SetDilations({2}); + test.SetOutputScaleAndZeroPoint(.24f, 15); + test.Run(); +} + TEST(QLinearConvTest, Conv2D_U8S8_Dilations) { QLinearConvOpTester test; test.GenerateRandomInput({1, 4, 19, 16}, .02f, 20); @@ -557,6 +627,24 @@ TEST(QLinearConvTest, Conv2D_U8S8_Dilations) { test.Run(); } +TEST(QLinearConvTest, Conv3D_U8S8_Dilations) { + QLinearConvOpTester test; + test.GenerateRandomInput({1, 2, 19, 16, 8}, .02f, 20); + test.GenerateRandomWeights({6, 2, 3, 2, 2}, .11f, 0); + test.SetDilations({2, 2, 2}); + test.SetOutputScaleAndZeroPoint(.24f, 15); + test.Run(); +} + +TEST(QLinearConvTest, Conv1D_U8S8_Strides) { + QLinearConvOpTester test; + test.GenerateRandomInput({1, 7, 18}, .04f, 16); + test.GenerateRandomWeights({5, 7, 2}, .14f, 0); + test.SetStrides({2}); + test.SetOutputScaleAndZeroPoint(.31f, 30); + test.Run(); +} + TEST(QLinearConvTest, Conv2D_U8S8_Strides) { QLinearConvOpTester test; test.GenerateRandomInput({1, 7, 18, 24}, .04f, 16); @@ -566,6 +654,26 @@ TEST(QLinearConvTest, Conv2D_U8S8_Strides) { test.Run(); } +TEST(QLinearConvTest, Conv3D_U8S8_Strides) { + QLinearConvOpTester test; + test.GenerateRandomInput({1, 3, 18, 24, 18}, .04f, 16); + test.GenerateRandomWeights({2, 3, 2, 3, 2}, .14f, 0); + test.SetStrides({2, 2, 2}); + test.SetOutputScaleAndZeroPoint(.31f, 30); + test.Run(); +} + +TEST(QLinearConvTest, Conv1D_U8S8_Groups) { + QLinearConvOpTester test; + test.GenerateRandomInput({1, 8, 13}, .03f, 7); + test.GenerateRandomWeights({12, 4, 3}, .10f, 0); + test.GenerateRandomBias(); + test.SetPads({1, 1}); + test.SetGroups(2); + test.SetOutputScaleAndZeroPoint(.76f, 88); + test.Run(); +} + TEST(QLinearConvTest, Conv2D_U8S8_Groups) { QLinearConvOpTester test; test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7); @@ -577,6 +685,17 @@ TEST(QLinearConvTest, Conv2D_U8S8_Groups) { test.Run(); } +TEST(QLinearConvTest, Conv3D_U8S8_Groups) { + QLinearConvOpTester test; + test.GenerateRandomInput({1, 4, 13, 17, 13}, .03f, 7); + test.GenerateRandomWeights({6, 2, 3, 3, 3}, .10f, 0); + test.GenerateRandomBias(); + test.SetPads({1, 1, 1, 1, 1, 1}); + test.SetGroups(2); + test.SetOutputScaleAndZeroPoint(.76f, 88); + test.Run(); +} + TEST(QLinearConvTest, Conv2D_U8S8_Groups_PerChannel) { QLinearConvOpTester test; test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7); diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 1afe5e85c5..b8fa8a2a0e 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -703,6 +703,21 @@ TEST(GradientCheckerTest, ConvGrad) { EXPECT_IS_TINIER_THAN(max_error, error_tolerance); } + //conv3d + { + TensorShape x_shape({2, 1, 5, 5, 5}); + TensorShape w_shape({1, 1, 3, 3, 3}); + TensorShape b_shape({1}); + TensorShape y_shape({2, 1, 5, 5, 5}); + gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error, + {MakeAttribute("kernel_shape", std::vector{3, 3, 3}), + MakeAttribute("pads", std::vector{1, 1, 1, 1, 1, 1})}, + // TODO: ConvGrad does not handle the case where W does not have gradient. + // Check for not has_gradient need to be disabled to pass this test. + false); + EXPECT_IS_TINIER_THAN(max_error, error_tolerance); + } + //conv_with_strides { TensorShape x_shape({2, 1, 7, 5}); @@ -718,6 +733,22 @@ TEST(GradientCheckerTest, ConvGrad) { false); EXPECT_IS_TINIER_THAN(max_error, error_tolerance); } + + //conv3d_with_strides + { + TensorShape x_shape({2, 1, 7, 5, 5}); + TensorShape w_shape({1, 1, 3, 3, 3}); + TensorShape b_shape({1}); + TensorShape y_shape({2, 1, 4, 3, 3}); + gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error, + {MakeAttribute("kernel_shape", std::vector{3, 3, 3}), + MakeAttribute("pads", std::vector{1, 1, 1, 1, 1, 1}), + MakeAttribute("strides", std::vector{2, 2, 2})}, + // TODO: ConvGrad does not handle the case where W does not have gradient. + // Check for not has_gradient need to be disabled to pass this test. + false); + EXPECT_IS_TINIER_THAN(max_error, error_tolerance); + } } static void TestConcatOpGrad(const std::string& op_type, diff --git a/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc b/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc index 27a6729716..416b79d326 100644 --- a/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc +++ b/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc @@ -98,11 +98,6 @@ Status ConvGrad::Compute(OpKernelContext* context) const { &CPUMathUtil::Instance()); } - TensorShape image_shape = X->Shape().Slice(1); - std::vector col_buffer_shape{kernel_dim}; - col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(), - output_shape.GetDims().end()); - for (int image_id = 0; image_id < N; ++image_id) { for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) { if (Is2DKernel) { @@ -125,10 +120,9 @@ Status ConvGrad::Compute(OpKernelContext* context) const { } else { math::Im2colNd()( Xdata + group_id * X_offset, - image_shape.GetDims().data(), - col_buffer_shape.data(), - C * input_image_size, - col_buffer_size, + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, kernel_shape.data(), strides.data(), dilations.data(), @@ -208,10 +202,10 @@ Status ConvGrad::Compute(OpKernelContext* context) const { } else { math::Col2imNd( col_buffer_data, - image_shape.GetDims().data(), - col_buffer_shape.data(), + input_shape.GetDims().data(), + output_shape.GetDims().data(), + kernel_dim, C * input_image_size, - col_buffer_size, kernel_shape.data(), strides.data(), dilations.data(),