From 06cd81d79199ab826a239d894be38dc7b9830abb Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Fri, 2 Oct 2020 11:02:43 -0700 Subject: [PATCH] Support trilinear sampling in Resize CPU and CUDA kernels (#5300) --- .../core/providers/cpu/tensor/upsample.cc | 318 ++++++++++++++++-- .../core/providers/cpu/tensor/upsample.h | 17 +- .../core/providers/cuda/tensor/resize_impl.cu | 249 +++++++++++--- .../core/providers/cuda/tensor/resize_impl.h | 2 +- .../core/providers/cuda/tensor/upsample.cc | 2 +- .../providers/cpu/tensor/resize_op_test.cc | 83 ++++- 6 files changed, 580 insertions(+), 91 deletions(-) diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc index 5f443e87ae..0cf7b0d0cb 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.cc +++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc @@ -321,24 +321,46 @@ void UpsampleBilinear(int64_t batch_size, AllocatorPtr& alloc, GetOriginalCoordinateFunc get_original_coordinate) { std::vector y_original; - std::vector x_original; + y_original.reserve(output_height); + std::vector x_original; + x_original.reserve(output_width); + + // For each index in the output height and output width, cache its corresponding indices in the input + // while multiplying it with the input stride for that dimension (cache because we don't have to re-compute + // each time we come across the output width/ output height value while iterating the output image tensor SafeInt idx_buffer_size = SafeInt(2) * sizeof(int64_t) * (output_height + output_width); + + // For each index in the output height and output width, cache its corresponding "weights/scales" for its + // corresponding indices in the input which proportionately indicates how much they will influence the final + // pixel value in the output + // (cache because we don't have to re-compute each time we come across the output width/ output height value while iterating the output image tensor SafeInt scale_buffer_size = SafeInt(2) * sizeof(float_t) * (output_height + output_width); + + // Limit number of allocations to just 1 auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size); BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc)); + + // Get pointers to appropriate memory locations in the scratch buffer auto* idx_data = static_cast(idx_scale_data_buffer_holder.get()); + + // input_width is the stride for the height dimension int64_t* input_width_mul_y1 = idx_data; - int64_t* input_width_mul_y2 = idx_data + output_height; - int64_t* in_x1 = idx_data + 2 * output_height; - int64_t* in_x2 = idx_data + 2 * output_height + output_width; + int64_t* input_width_mul_y2 = input_width_mul_y1 + output_height; + + // stride for width is 1 (no multiplication needed) + int64_t* in_x1 = input_width_mul_y1 + 2 * output_height; + int64_t* in_x2 = in_x1 + output_width; auto* scale_data = reinterpret_cast(in_x2 + output_width); - float* dy1 = scale_data; - float* dy2 = scale_data + output_height; - float* dx1 = scale_data + 2 * output_height; - float* dx2 = scale_data + 2 * output_height + output_width; + float* dy1 = scale_data; + float* dy2 = dy1 + output_height; + + float* dx1 = dy1 + 2 * output_height; + float* dx2 = dx1 + output_width; + + // Start processing auto roi_y_start = roi.size() / 2 - 2; auto roi_y_end = roi.size() - 2; for (int64_t y = 0; y < output_height; ++y) { @@ -379,8 +401,8 @@ void UpsampleBilinear(int64_t batch_size, in_x1[x] = std::min(static_cast(in_x), input_width - 1); in_x2[x] = std::min(in_x1[x] + 1, input_width - 1); - dx1[x] = std::abs(in_x - in_x1[x]); - dx2[x] = std::abs(in_x - in_x2[x]); + dx1[x] = std::fabs(in_x - in_x1[x]); + dx2[x] = std::fabs(in_x - in_x2[x]); if (in_x1[x] == in_x2[x]) { dx1[x] = 0.5f; dx2[x] = 0.5f; @@ -400,6 +422,7 @@ void UpsampleBilinear(int64_t batch_size, continue; } + // subscript ordering in the variable - (xy) T X11 = Xdata[input_width_mul_y1[y] + in_x1[x]]; T X21 = Xdata[input_width_mul_y1[y] + in_x2[x]]; T X12 = Xdata[input_width_mul_y2[y] + in_x1[x]]; @@ -417,6 +440,198 @@ void UpsampleBilinear(int64_t batch_size, } } +// The following method supports a 5-D input in 'Linear mode' +// that amounts to 'Trilinear' Upsampling/Resizing in the sense that it assumes +// the scale values for the outermost 2 dimensions are 1. +// This is the common use-case where the 5-D input (batched multi-channel volumes) +// is usually of shape [N, C, D, H, W] and the scales are [1.0, 1.0, depth_scale, height_scale, width_scale] +template +void UpsampleTrilinear(int64_t batch_size, + int64_t num_channels, + int64_t input_depth, + int64_t input_height, + int64_t input_width, + int64_t output_depth, + int64_t output_height, + int64_t output_width, + float depth_scale, + float height_scale, + float width_scale, + const std::vector& roi, + bool use_extrapolation, + float extrapolation_value, + const T* Xdata, + T* Ydata, + AllocatorPtr& alloc, + GetOriginalCoordinateFunc get_original_coordinate) { + std::vector z_original; + z_original.reserve(output_depth); + + std::vector y_original; + y_original.reserve(output_height); + + std::vector x_original; + x_original.reserve(output_width); + + // For each index in the output height and output width, cache its corresponding indices in the input + // while multiplying it with the input stride for that dimension (cache because we don't have to re-compute + // each time we come across the output width/ output height value while iterating the output image tensor + SafeInt idx_buffer_size = SafeInt(2) * sizeof(int64_t) * + (output_depth + output_height + output_width); + + // For each index in the output height and output width, cache its corresponding "weights/scales" for its + // corresponding indices in the input which proportionately indicates how much they will influence the final + // pixel value in the output + // (cache because we don't have to re-compute each time we come across the output width/ output height value while iterating the output image tensor + SafeInt scale_buffer_size = SafeInt(2) * sizeof(float_t) * + (output_depth + output_height + output_width); + + // Limit number of allocations to just 1 + auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size); + BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc)); + + // Get pointers to appropriate memory locations in the scratch buffer + auto* idx_data = static_cast(idx_scale_data_buffer_holder.get()); + + // input_width * input_height is the stride for the depth dimension + int64_t* input_height_width_mul_z1 = idx_data; + int64_t* input_height_width_mul_z2 = input_height_width_mul_z1 + output_depth; + + // input_width is the stride for the height dimension + int64_t* input_width_mul_y1 = input_height_width_mul_z1 + 2 * output_depth; + int64_t* input_width_mul_y2 = input_width_mul_y1 + output_height; + + // stride for width is 1 (no multiplication needed) + int64_t* in_x1 = input_width_mul_y1 + 2 * output_height; + int64_t* in_x2 = in_x1 + output_width; + + auto* scale_data = reinterpret_cast(in_x2 + output_width); + + float* dz1 = scale_data; + float* dz2 = dz1 + output_depth; + + float* dy1 = dz1 + 2 * output_depth; + float* dy2 = dy1 + output_height; + + float* dx1 = dy1 + 2 * output_height; + float* dx2 = dx1 + output_width; + + // Start processing + auto roi_z_start = roi.size() / 2 - 3; + auto roi_z_end = roi.size() - 3; + for (int64_t z = 0; z < output_depth; ++z) { + float in_z = depth_scale == 1 ? static_cast(z) + : get_original_coordinate(static_cast(z), depth_scale, + static_cast(output_depth), static_cast(input_depth), + roi[roi_z_start], roi[roi_z_end]); + z_original.emplace_back(in_z); + in_z = std::max(0.0f, std::min(in_z, static_cast(input_depth - 1))); + + const int64_t in_z1 = std::min(static_cast(in_z), input_depth - 1); + const int64_t in_z2 = std::min(in_z1 + 1, input_depth - 1); + dz1[z] = std::fabs(in_z - in_z1); + dz2[z] = std::fabs(in_z - in_z2); + + if (in_z1 == in_z2) { + dz1[z] = 0.5f; + dz2[z] = 0.5f; + } + + input_height_width_mul_z1[z] = input_height * input_width * in_z1; + input_height_width_mul_z2[z] = input_height * input_width * in_z2; + } + + auto roi_y_start = roi.size() / 2 - 2; + auto roi_y_end = roi.size() - 2; + for (int64_t y = 0; y < output_height; ++y) { + float in_y = height_scale == 1 ? static_cast(y) + : get_original_coordinate(static_cast(y), height_scale, + static_cast(output_height), static_cast(input_height), + roi[roi_y_start], roi[roi_y_end]); + y_original.emplace_back(in_y); + in_y = std::max(0.0f, std::min(in_y, static_cast(input_height - 1))); + + const int64_t in_y1 = std::min(static_cast(in_y), input_height - 1); + const int64_t in_y2 = std::min(in_y1 + 1, input_height - 1); + dy1[y] = std::fabs(in_y - in_y1); + dy2[y] = std::fabs(in_y - in_y2); + + if (in_y1 == in_y2) { + dy1[y] = 0.5f; + dy2[y] = 0.5f; + } + + input_width_mul_y1[y] = input_width * in_y1; + input_width_mul_y2[y] = input_width * in_y2; + } + + auto roi_x_start = roi.size() / 2 - 1; + auto roi_x_end = roi.size() - 1; + for (int64_t x = 0; x < output_width; ++x) { + float in_x = width_scale == 1 ? static_cast(x) + : get_original_coordinate(static_cast(x), width_scale, + static_cast(output_width), static_cast(input_width), + roi[roi_x_start], roi[roi_x_end]); + x_original.emplace_back(in_x); + in_x = std::max(0.0f, std::min(in_x, static_cast(input_width - 1))); + + in_x1[x] = std::min(static_cast(in_x), input_width - 1); + in_x2[x] = std::min(in_x1[x] + 1, input_width - 1); + + dx1[x] = std::fabs(in_x - in_x1[x]); + dx2[x] = std::fabs(in_x - in_x2[x]); + if (in_x1[x] == in_x2[x]) { + dx1[x] = 0.5f; + dx2[x] = 0.5f; + } + } + + for (int64_t n = 0; n < batch_size; ++n) { + for (int64_t c = 0; c < num_channels; ++c) { + for (int64_t z = 0; z < output_depth; ++z) { + for (int64_t y = 0; y < output_height; ++y) { + for (int64_t x = 0; x < output_width; ++x) { + // when use_extrapolation is set and original index of x or y is out of the dim range + // then use extrapolation_value as the output value. + if (use_extrapolation && + ((z_original[z] < 0 || z_original[z] > static_cast(input_depth - 1)) || + (y_original[y] < 0 || y_original[y] > static_cast(input_height - 1)) || + (x_original[x] < 0 || x_original[x] > static_cast(input_width - 1)))) { + Ydata[output_width * output_height * z + output_width * y + x] = + static_cast(extrapolation_value); + continue; + } + + // subscript ordering in the variable - (xyz) + T X111 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y1[y] + in_x1[x]]; + T X211 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y1[y] + in_x2[x]]; + T X121 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y2[y] + in_x1[x]]; + T X221 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y2[y] + in_x2[x]]; + + T X112 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y1[y] + in_x1[x]]; + T X212 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y1[y] + in_x2[x]]; + T X122 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y2[y] + in_x1[x]]; + T X222 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y2[y] + in_x2[x]]; + + Ydata[output_width * output_height * z + output_width * y + x] = + static_cast(dx2[x] * dy2[y] * dz2[z] * X111 + + dx1[x] * dy2[y] * dz2[z] * X211 + + dx2[x] * dy1[y] * dz2[z] * X121 + + dx1[x] * dy1[y] * dz2[z] * X221 + + + dx2[x] * dy2[y] * dz1[z] * X112 + + dx1[x] * dy2[y] * dz1[z] * X212 + + dx2[x] * dy1[y] * dz1[z] * X122 + + dx1[x] * dy1[y] * dz1[z] * X222); + } + } + } + Xdata += input_depth * input_height * input_width; + Ydata += output_depth * output_width * output_height; + } + } +} + // Calculates cubic coeff based on Robert Keys approach // https://ieeexplore.ieee.org/document/1163711 std::array GetCubicCoeffs(float s, float cubic_coeff_a = -0.75) { @@ -491,7 +706,11 @@ void ResizeBiCubic( T* Ydata, GetOriginalCoordinateFunc get_original_coordinate) { std::vector y_original; + y_original.reserve(output_height); + std::vector x_original; + x_original.reserve(output_width); + std::unordered_map> cubic_coeffs; std::unordered_map> coeff_to_1Dinterpolation_map; auto roi_y_start = roi.size() / 2 - 2; @@ -660,33 +879,70 @@ Status Upsample::BaseCompute(OpKernelContext* context, scales, roi, is_resize_, use_extrapolation_, extrapolation_value_, use_nearest2x_optimization_, get_original_coordinate_, get_nearest_pixel_); case UpsampleMode::LINEAR: { - //The correct behavior of 'linear' mode for an N-D input is not clear right now, - //so only support 'bilinear' with 2-D or 4-D input tensor with outermost 2 scales as 1 in the 4-D case + // Supports 'bilinear' and 'trilinear' sampling only + + //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 + if (dims.size() == 2 || dims.size() == 4) { + bool is_2D = dims.size() == 2; + + const int64_t batch_size = is_2D ? 1 : dims[0]; + const int64_t num_channels = is_2D ? 1 : dims[1]; + const int64_t input_height = is_2D ? dims[0] : dims[2]; + const int64_t input_width = is_2D ? dims[1] : dims[3]; + + const int64_t output_height = is_2D ? output_dims[0] : output_dims[2]; + const int64_t output_width = is_2D ? output_dims[1] : output_dims[3]; + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); + UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width, + is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi, + use_extrapolation_, extrapolation_value_, X->template Data(), + Y->template MutableData(), alloc, get_original_coordinate_); + return Status::OK(); + } else if (dims.size() == 3 || dims.size() == 5) { + //'trilinear' == 3-D input or 5-D input with outermost 2 scales as 1 + bool is_3D = dims.size() == 3; + + const int64_t batch_size = is_3D ? 1 : dims[0]; + const int64_t num_channels = is_3D ? 1 : dims[1]; + const int64_t input_depth = is_3D ? dims[0] : dims[2]; + const int64_t input_height = is_3D ? dims[1] : dims[3]; + const int64_t input_width = is_3D ? dims[2] : dims[4]; + + const int64_t output_depth = is_3D ? output_dims[0] : output_dims[2]; + const int64_t output_height = is_3D ? output_dims[1] : output_dims[3]; + const int64_t output_width = is_3D ? output_dims[2] : output_dims[4]; + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); + UpsampleTrilinear(batch_size, num_channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + is_3D ? scales[0] : scales[2], is_3D ? scales[1] : scales[3], + is_3D ? scales[2] : scales[4], roi, use_extrapolation_, extrapolation_value_, + X->template Data(), Y->template MutableData(), alloc, get_original_coordinate_); + return Status::OK(); + } else { + // User shouldn't hit this as the check has been performed in ScalesValidation() + std::ostringstream oss; + oss << "'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') " + "or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values " + "being 1 in the "; + oss << (is_resize_ ? "Resize operator" : "Upsample operator"); + return Status(ONNXRUNTIME, FAIL, oss.str()); + } + } + case UpsampleMode::CUBIC: { + // Supports 'bicubic' sampling only + + // User shouldn't hit this as the check has been performed in ScalesValidation() if (dims.size() != 2 && dims.size() != 4) { std::ostringstream oss; - oss << "'Linear' mode only support 2-D inputs ('Bilinear') or 4-D inputs " + oss << "'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs " "with the corresponding outermost 2 scale values being 1 in the "; oss << (is_resize_ ? "Resize operator" : "Upsample operator"); return Status(ONNXRUNTIME, FAIL, oss.str()); } - - bool is_2D = dims.size() == 2; - const int64_t batch_size = is_2D ? 1 : dims[0]; - const int64_t num_channels = is_2D ? 1 : dims[1]; - const int64_t input_height = is_2D ? dims[0] : dims[2]; - const int64_t input_width = is_2D ? dims[1] : dims[3]; - const int64_t output_height = is_2D ? output_dims[0] : output_dims[2]; - const int64_t output_width = is_2D ? output_dims[1] : output_dims[3]; - - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); - UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width, - is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi, - use_extrapolation_, extrapolation_value_, X->template Data(), - Y->template MutableData(), alloc, get_original_coordinate_); - return Status::OK(); - } - case UpsampleMode::CUBIC: { bool is_2D = dims.size() == 2; const int64_t batch_size = is_2D ? 1 : dims[0]; const int64_t num_channels = is_2D ? 1 : dims[1]; diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.h b/onnxruntime/core/providers/cpu/tensor/upsample.h index 18c50d7208..a2b1cd7d7c 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsample.h +++ b/onnxruntime/core/providers/cpu/tensor/upsample.h @@ -276,15 +276,26 @@ class UpsampleBase { } } - if (UpsampleMode::LINEAR == mode || UpsampleMode::CUBIC == mode) { + if (UpsampleMode::LINEAR == mode) { + ORT_ENFORCE(scales.size() == 2 || + (scales.size() == 4 && scales[0] == 1 && scales[1] == 1) || + scales.size() == 3 || + (scales.size() == 5 && scales[0] == 1 && scales[1] == 1), + "'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') " + "or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values being 1 in the ", + is_resize_ ? "Resize operator" : "Upsample operator"); + } + + else if (UpsampleMode::CUBIC == mode) { ORT_ENFORCE(scales.size() == 2 || (scales.size() == 4 && scales[0] == 1 && scales[1] == 1), - "'Linear' mode and 'Cubic' mode only support 2-D inputs ('Bilinear', 'Bicubic') or 4-D inputs " + "'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs " "with the corresponding outermost 2 scale values being 1 in the ", is_resize_ ? "Resize operator" : "Upsample operator"); } } - void ParseScalesData(const Tensor* scale, std::vector& scales) const { + void + ParseScalesData(const Tensor* scale, std::vector& scales) const { const auto* scale_data = scale->template Data(); int64_t scales_size = scale->Shape().Size(); ORT_ENFORCE(scales_size > 0, "scales size should be greater than 0."); diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu index c3521d1644..8039e5ed5f 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu @@ -191,7 +191,7 @@ __global__ void _ResizeNearestMappingKernel( const TArray input_shape, const TArray output_shape, const TArray scales, - const TArray roi, + const TArray roi, const size_t total_dim_sum, bool extrapolation_enabled, CudaFunctionOriginalCoordinate transform_coordinate, @@ -276,7 +276,7 @@ __global__ void _ResizeNearestKernel( output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index]; } -struct BilinearMappingInfo { +struct LinearMappingInfo { int origin_; float weight_; int extrapolate_; @@ -291,7 +291,7 @@ __global__ void _ResizeBilinearCoordinateMapping( float roi_width_start, float roi_width_end, const size_t SumHW, bool extrapolation_enabled, CudaFunctionOriginalCoordinate transform_coordinate, - BilinearMappingInfo* dims_mapping) { + LinearMappingInfo* dims_mapping) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW); if (id < output_height) { // y = id float input_y = scale_height == 1 ? static_cast(id) : @@ -316,7 +316,7 @@ __global__ void _ResizeBilinearCoordinateMapping( } } -// The following method supports a N-D input in 'Linear mode'. Last two dimension is [H, W]. +// The following method supports a 2-D or 4-D input in 'Linear mode'. Last two dimension is [H, W]. // the scale values for the outer dimensions except last two are 1. template __global__ void _ResizeBilinearKernel( @@ -325,7 +325,7 @@ __global__ void _ResizeBilinearKernel( fast_divmod div_output_width, fast_divmod div_output_image, const T* input_data, T* output_data, const size_t N, const T extrapolation_value, - BilinearMappingInfo* dims_mapping) { + LinearMappingInfo* dims_mapping) { CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); int bxc, output_image_index; div_output_image.divmod(id, bxc, output_image_index); @@ -359,6 +359,124 @@ __global__ void _ResizeBilinearKernel( x11 * static_cast(y_offset_0 * x_offset_0); } +template +__global__ void _ResizeTrilinearCoordinateMapping( + int64_t input_depth, int64_t input_height, int64_t input_width, + int64_t output_depth, int64_t output_height, int64_t output_width, + float scale_depth, float scale_height, float scale_width, + float roi_depth_start, float roi_depth_end, + float roi_height_start, float roi_height_end, + float roi_width_start, float roi_width_end, + const size_t SumDHW, bool extrapolation_enabled, + CudaFunctionOriginalCoordinate transform_coordinate, + LinearMappingInfo* dims_mapping) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW); + if (id < output_depth) { // z = id + float input_z = scale_depth == 1 ? static_cast(id) : + transform_coordinate(static_cast(id), scale_depth, + static_cast(output_depth), static_cast(input_depth), + roi_depth_start, roi_depth_end); + dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast(input_depth - 1))); + input_z = max(0.0f, min(input_z, static_cast(input_depth - 1))); + int z_int = static_cast(input_z); + dims_mapping[id].origin_ = z_int; + dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int; + } else if (id >= output_depth && id < (output_depth + output_height)) { // y = id - output_depth + float input_y = scale_height == 1 ? static_cast(id - output_depth) : + transform_coordinate(static_cast(id - output_depth), scale_height, + static_cast(output_height), static_cast(input_height), + roi_height_start, roi_height_end); + + dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast(input_height - 1))); + input_y = max(0.0f, min(input_y, static_cast(input_height - 1))); + int y_int = static_cast(input_y); + dims_mapping[id].origin_ = y_int; + dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int; + } else { //x = id - output_depth - output_height + float input_x = scale_width == 1 ? static_cast(id - output_depth - output_height) : + transform_coordinate(static_cast(id - output_depth - output_height), scale_width, + static_cast(output_width), static_cast(input_width), + roi_width_start, roi_width_end); + dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast(input_width - 1))); + input_x = max(0.0f, min(input_x, static_cast(input_width - 1))); + int x_int = static_cast(input_x); + dims_mapping[id].origin_ = x_int; + dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int; + } +} + +// The following method supports a 3-D or 5-D input in 'Linear mode'. Last two dimension is [D, sH, W]. +// the scale values for the outer dimensions except last two are 1. +template +__global__ void _ResizeTrilinearKernel( + int64_t input_depth, int64_t input_height, int64_t input_width, + int64_t output_depth, int64_t output_height, int64_t output_width, + fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image, + const T* input_data, T* output_data, const size_t N, + const T extrapolation_value, + LinearMappingInfo* dims_mapping) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + int bxc, output_image_index; + div_output_image.divmod(id, bxc, output_image_index); + CUDA_LONG input_index = bxc * input_depth * input_height * input_width; + int output_z, output_y, output_x, temp; + + div_output_height.divmod(output_image_index, output_z, temp); + div_output_width.divmod(temp, output_y, output_x); + + if (dims_mapping[output_z].extrapolate_ || + dims_mapping[output_y + output_depth].extrapolate_ || + dims_mapping[output_x + output_depth + output_height].extrapolate_) { + output_data[id] = extrapolation_value; + return; + } + + float z_offset_0 = dims_mapping[output_z].weight_; + int z_int = dims_mapping[output_z].origin_; + + float y_offset_0 = dims_mapping[output_y + output_depth].weight_; + int y_int = dims_mapping[output_y + output_depth].origin_; + + float x_offset_0 = dims_mapping[output_x + output_depth + output_height].weight_; + int x_int = dims_mapping[output_x + output_depth + output_height].origin_; + + input_index += z_int * input_height * input_width + y_int * input_width + x_int; + + T x000 = input_data[input_index]; + + bool end_of_h = (y_int >= input_height - 1); + bool end_of_w = (x_int >= input_width - 1); + + T x100 = end_of_w ? x000 : input_data[input_index + 1]; + T x010 = end_of_h ? x000 : input_data[input_index + input_width]; + T x110 = end_of_w ? x010 : (end_of_h ? x100 : input_data[input_index + input_width + 1]); + + bool end_of_d = (z_int >= input_depth - 1); + if (!end_of_d) { + input_index = input_index + input_height * input_width; + } + + T x001 = end_of_d ? x000 : input_data[input_index]; + + T x101 = end_of_w ? x001 : input_data[input_index + 1]; + T x011 = end_of_h ? x001 : input_data[input_index + input_width]; + T x111 = end_of_w ? x011 : (end_of_h ? x101 : input_data[input_index + input_width + 1]); + + float z_offset_1 = 1.0f - z_offset_0; + float y_offset_1 = 1.0f - y_offset_0; + float x_offset_1 = 1.0f - x_offset_0; + output_data[id] = + x000 * static_cast(z_offset_1 * y_offset_1 * x_offset_1) + + x010 * static_cast(z_offset_1 * y_offset_0 * x_offset_1) + + x100 * static_cast(z_offset_1 * y_offset_1 * x_offset_0) + + x110 * static_cast(z_offset_1 * y_offset_0 * x_offset_0) + + + x001 * static_cast(z_offset_0 * y_offset_1 * x_offset_1) + + x011 * static_cast(z_offset_0 * y_offset_0 * x_offset_1) + + x101 * static_cast(z_offset_0 * y_offset_1 * x_offset_0) + + x111 * static_cast(z_offset_0 * y_offset_0 * x_offset_0); +} + template __device__ __forceinline__ float CubicInterpolationRowwise( const T* image, int x, int y, int input_height, int input_width, @@ -464,7 +582,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode, case UpsampleMode::NN: return sizeof(int64_t) * output_dims.size() + sizeof(NearestMappingInfo) * std::accumulate(output_dims.begin(), output_dims.end(), 0); case UpsampleMode::LINEAR: - return sizeof(BilinearMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0); + return sizeof(LinearMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0); case UpsampleMode::CUBIC: return sizeof(CubicMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0); } @@ -479,7 +597,7 @@ void ResizeNearestImpl( TArray& input_strides, TArray& output_div_pitches, TArray& scales_vals, - TArray& roi_vals, + TArray& roi_vals, const T* input_data, T* output_data, const size_t N, @@ -556,7 +674,7 @@ void ResizeImpl( TArray& input_strides, TArray& output_div_pitches, TArray& scales_vals, - TArray& roi_vals, + TArray& roi_vals, const T* input_data, T* output_data, const size_t N, @@ -587,45 +705,90 @@ void ResizeImpl( return; } - int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); - fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast(N)); + // We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0 + // We would have validated the outer scale values by the time execution reaches this + bool is_2D = (rank == 2 || rank == 4); + + // We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0 + // We would have validated the outer scale values by the time execution reaches this + bool is_3D = (rank == 3 || rank == 5); + + // Should not hit this as we have already validated input rank/scales and we provide verbose error messages + // to the user. + ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize"); + + int blocksPerGrid = static_cast(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + fast_divmod div_output_image; + if (is_2D) { + div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast(N)); + } else if (is_3D) { + div_output_image = (rank > 3) ? output_div_pitches[rank - 4] : fast_divmod(gsl::narrow_cast(N)); + } + + int64_t output_depth = is_3D ? output_shape[rank - 3] : 0; int64_t output_height = output_shape[rank - 2]; int64_t output_width = output_shape[rank - 1]; - int blocksPerDimsMappingGrid = (int)(ceil((output_height + output_width) / 32.0)); + int blocksPerDimsMappingGrid = + static_cast(ceil((output_depth + output_height + output_width) / 32.0)); + switch (upsample_mode) { case UpsampleMode::LINEAR: - _ResizeBilinearCoordinateMapping<<>>( - input_shape[rank - 2], input_shape[rank - 1], - output_height, output_width, - scales_vals[rank - 2], scales_vals[rank - 1], - roi_vals[rank - 2], roi_vals[rank - 2 + rank], - roi_vals[rank - 1], roi_vals[rank - 1 + rank], - output_height + output_width, extrapolation_enabled, transform_coordinate, - reinterpret_cast(dims_mapping)); - _ResizeBilinearKernel<<>>( - input_shape[rank - 2], input_shape[rank - 1], - output_height, output_width, - output_div_pitches[rank - 2], div_output_image, - input_data, output_data, N, extrapolation_value, - reinterpret_cast(dims_mapping)); - return; + if (is_2D) { + _ResizeBilinearCoordinateMapping<<>>( + input_shape[rank - 2], input_shape[rank - 1], + output_height, output_width, + scales_vals[rank - 2], scales_vals[rank - 1], + roi_vals[rank - 2], roi_vals[rank - 2 + rank], + roi_vals[rank - 1], roi_vals[rank - 1 + rank], + output_height + output_width, extrapolation_enabled, transform_coordinate, + reinterpret_cast(dims_mapping)); + _ResizeBilinearKernel<<>>( + input_shape[rank - 2], input_shape[rank - 1], + output_height, output_width, + output_div_pitches[rank - 2], div_output_image, + input_data, output_data, N, extrapolation_value, + reinterpret_cast(dims_mapping)); + return; + } else if (is_3D) { + _ResizeTrilinearCoordinateMapping<<>>( + input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1], + output_depth, output_height, output_width, + scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1], + roi_vals[rank - 3], roi_vals[rank - 3 + rank], + roi_vals[rank - 2], roi_vals[rank - 2 + rank], + roi_vals[rank - 1], roi_vals[rank - 1 + rank], + output_depth + output_height + output_width, extrapolation_enabled, transform_coordinate, + reinterpret_cast(dims_mapping)); + _ResizeTrilinearKernel<<>>( + input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1], + output_depth, output_height, output_width, + output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image, + input_data, output_data, N, extrapolation_value, + reinterpret_cast(dims_mapping)); + return; + } + + break; + case UpsampleMode::CUBIC: - _ResizeCubicCoordinateMapping<<>>( - input_shape[rank - 2], input_shape[rank - 1], - output_height, output_width, - scales_vals[rank - 2], scales_vals[rank - 1], - roi_vals[rank - 2], roi_vals[rank - 2 + rank], - roi_vals[rank - 1], roi_vals[rank - 1 + rank], - output_height + output_width, extrapolation_enabled, - cubic_coeff_a, exclude_outside, transform_coordinate, - reinterpret_cast(dims_mapping)); - _ResizeBiCubicKernel<<>>( - input_shape[rank - 2], input_shape[rank - 1], - output_height, output_width, - output_div_pitches[rank - 2], div_output_image, - input_data, output_data, N, extrapolation_value, - reinterpret_cast(dims_mapping)); - return; + if (is_2D) { + _ResizeCubicCoordinateMapping<<>>( + input_shape[rank - 2], input_shape[rank - 1], + output_height, output_width, + scales_vals[rank - 2], scales_vals[rank - 1], + roi_vals[rank - 2], roi_vals[rank - 2 + rank], + roi_vals[rank - 1], roi_vals[rank - 1 + rank], + output_height + output_width, extrapolation_enabled, + cubic_coeff_a, exclude_outside, transform_coordinate, + reinterpret_cast(dims_mapping)); + _ResizeBiCubicKernel<<>>( + input_shape[rank - 2], input_shape[rank - 1], + output_height, output_width, + output_div_pitches[rank - 2], div_output_image, + input_data, output_data, N, extrapolation_value, + reinterpret_cast(dims_mapping)); + return; + } } } @@ -638,7 +801,7 @@ void ResizeImpl( TArray& input_strides, \ TArray& output_div_pitches, \ TArray& scales_vals, \ - TArray& roi_vals, \ + TArray& roi_vals, \ const T* input_data, \ T* output_data, \ const size_t N, \ diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.h b/onnxruntime/core/providers/cuda/tensor/resize_impl.h index ce38ecfaeb..c82616d644 100644 --- a/onnxruntime/core/providers/cuda/tensor/resize_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.h @@ -23,7 +23,7 @@ void ResizeImpl( TArray& input_strides, TArray& output_div_pitches, TArray& scales_vals, - TArray& roi, + TArray& roi, const T* input_data, T* output_data, const size_t N, diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc index 9a3dbb3876..d0b249ae6b 100644 --- a/onnxruntime/core/providers/cuda/tensor/upsample.cc +++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc @@ -74,7 +74,7 @@ Status Upsample::BaseCompute(OpKernelContext* context, if (is_resize_) { TArray input_shape(X_dims); TArray output_shape(output_dims); - TArray roi_vals(roi); + TArray roi_vals(roi); TArray scales_vals(scales); size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims); diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 33a5d7f302..b19a5b3fc4 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -7,7 +7,7 @@ namespace onnxruntime { namespace test { -TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_tf_crop_and_resize) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { OpTester test("Resize", 11); std::vector roi{0.4f, 0.6f, 0.6f, 0.8f}; std::vector scales{}; @@ -65,7 +65,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapol test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) { OpTester test("Resize", 11); std::vector roi{}; std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; @@ -87,7 +87,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_align_corners) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { OpTester test("Resize", 11); std::vector roi{}; std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; @@ -110,7 +110,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_align_corners) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_pytorch_half_pixel) { OpTester test("Resize", 11); std::vector roi{}; std::vector scales{}; @@ -138,7 +138,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel) test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch } -TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) { +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) { OpTester test("Resize", 11); std::vector roi{}; std::vector scales{1.0f, 1.0f, 2.0f, 4.0f}; @@ -172,7 +172,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_align_corners) { +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) { OpTester test("Resize", 11); std::vector roi{}; std::vector scales{2.0f, 4.0f}; @@ -197,7 +197,66 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_align_corners) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartScalesNoOpTest) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) { + OpTester test("Resize", 11); + std::vector roi{}; + std::vector scales{}; + std::vector sizes{1, 3, 1}; + + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel"); + + const int64_t D = 2, H = 4, W = 4; + + std::vector X = { + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f, + + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f}; + + test.AddInput("X", {D, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {0}, scales); + test.AddInput("sizes", {3}, sizes); + + std::vector Y = {1.6666666f, 7.0f, 12.333333f}; + + test.AddOutput("Y", {sizes[0], sizes[1], sizes[2]}, Y); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch +} + +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_pytorch_half_pixel) { + OpTester test("Resize", 11); + std::vector roi{}; + std::vector scales{1.0f, 1.0f, 2.0f, 2.0f, 1.0f}; + + test.AddAttribute("mode", "linear"); + test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel"); + + const int64_t N = 1, C = 2, D = 2, H = 1, W = 2; + + std::vector X = { + 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}; + + test.AddInput("X", {N, C, D, H, W}, X); + test.AddInput("roi", {0}, roi); + test.AddInput("scales", {5}, scales); + + std::vector Y = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, + 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, + 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, + 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f}; + + test.AddOutput("Y", {1, 2, 4, 2, 2}, Y); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch +} + +TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) { OpTester test("Resize", 11); std::vector roi{}; std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; @@ -856,7 +915,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_Ver10) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { OpTester test("Resize", 10); std::vector scales{1.0f, 1.0f, 0.6f, 0.6f}; @@ -876,7 +935,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_Ver10) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_Ver10) { +TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { OpTester test("Resize", 10); std::vector scales{0.6f, 0.6f}; @@ -896,7 +955,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_Ver10) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_Ver10) { +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { OpTester test("Resize", 10); std::vector scales{1.0f, 1.0f, 2.0f, 4.0f}; test.AddAttribute("mode", "linear"); @@ -926,7 +985,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_Ver10) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_Ver10) { +TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { OpTester test("Resize", 10); std::vector scales{2.0f, 4.0f}; test.AddAttribute("mode", "linear"); @@ -948,7 +1007,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_Ver10) { test.Run(); } -TEST(ResizeOpTest, ResizeOpLineartScalesNoOpTest_Ver10) { +TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) { OpTester test("Resize", 10); std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; test.AddAttribute("mode", "linear");