Support trilinear sampling in Resize CPU and CUDA kernels (#5300)

This commit is contained in:
Hariharan Seshadri 2020-10-02 11:02:43 -07:00 committed by GitHub
parent e71668f92c
commit 06cd81d791
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 580 additions and 91 deletions

View file

@ -321,24 +321,46 @@ void UpsampleBilinear(int64_t batch_size,
AllocatorPtr& alloc,
GetOriginalCoordinateFunc get_original_coordinate) {
std::vector<float> y_original;
std::vector<float> x_original;
y_original.reserve(output_height);
std::vector<float> x_original;
x_original.reserve(output_width);
// For each index in the output height and output width, cache its corresponding indices in the input
// while multiplying it with the input stride for that dimension (cache because we don't have to re-compute
// each time we come across the output width/ output height value while iterating the output image tensor
SafeInt<size_t> idx_buffer_size = SafeInt<size_t>(2) * sizeof(int64_t) * (output_height + output_width);
// For each index in the output height and output width, cache its corresponding "weights/scales" for its
// corresponding indices in the input which proportionately indicates how much they will influence the final
// pixel value in the output
// (cache because we don't have to re-compute each time we come across the output width/ output height value while iterating the output image tensor
SafeInt<size_t> scale_buffer_size = SafeInt<size_t>(2) * sizeof(float_t) * (output_height + output_width);
// Limit number of allocations to just 1
auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size);
BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc));
// Get pointers to appropriate memory locations in the scratch buffer
auto* idx_data = static_cast<int64_t*>(idx_scale_data_buffer_holder.get());
// input_width is the stride for the height dimension
int64_t* input_width_mul_y1 = idx_data;
int64_t* input_width_mul_y2 = idx_data + output_height;
int64_t* in_x1 = idx_data + 2 * output_height;
int64_t* in_x2 = idx_data + 2 * output_height + output_width;
int64_t* input_width_mul_y2 = input_width_mul_y1 + output_height;
// stride for width is 1 (no multiplication needed)
int64_t* in_x1 = input_width_mul_y1 + 2 * output_height;
int64_t* in_x2 = in_x1 + output_width;
auto* scale_data = reinterpret_cast<float*>(in_x2 + output_width);
float* dy1 = scale_data;
float* dy2 = scale_data + output_height;
float* dx1 = scale_data + 2 * output_height;
float* dx2 = scale_data + 2 * output_height + output_width;
float* dy1 = scale_data;
float* dy2 = dy1 + output_height;
float* dx1 = dy1 + 2 * output_height;
float* dx2 = dx1 + output_width;
// Start processing
auto roi_y_start = roi.size() / 2 - 2;
auto roi_y_end = roi.size() - 2;
for (int64_t y = 0; y < output_height; ++y) {
@ -379,8 +401,8 @@ void UpsampleBilinear(int64_t batch_size,
in_x1[x] = std::min(static_cast<int64_t>(in_x), input_width - 1);
in_x2[x] = std::min(in_x1[x] + 1, input_width - 1);
dx1[x] = std::abs(in_x - in_x1[x]);
dx2[x] = std::abs(in_x - in_x2[x]);
dx1[x] = std::fabs(in_x - in_x1[x]);
dx2[x] = std::fabs(in_x - in_x2[x]);
if (in_x1[x] == in_x2[x]) {
dx1[x] = 0.5f;
dx2[x] = 0.5f;
@ -400,6 +422,7 @@ void UpsampleBilinear(int64_t batch_size,
continue;
}
// subscript ordering in the variable - (xy)
T X11 = Xdata[input_width_mul_y1[y] + in_x1[x]];
T X21 = Xdata[input_width_mul_y1[y] + in_x2[x]];
T X12 = Xdata[input_width_mul_y2[y] + in_x1[x]];
@ -417,6 +440,198 @@ void UpsampleBilinear(int64_t batch_size,
}
}
// The following method supports a 5-D input in 'Linear mode'
// that amounts to 'Trilinear' Upsampling/Resizing in the sense that it assumes
// the scale values for the outermost 2 dimensions are 1.
// This is the common use-case where the 5-D input (batched multi-channel volumes)
// is usually of shape [N, C, D, H, W] and the scales are [1.0, 1.0, depth_scale, height_scale, width_scale]
template <typename T>
void UpsampleTrilinear(int64_t batch_size,
int64_t num_channels,
int64_t input_depth,
int64_t input_height,
int64_t input_width,
int64_t output_depth,
int64_t output_height,
int64_t output_width,
float depth_scale,
float height_scale,
float width_scale,
const std::vector<float>& roi,
bool use_extrapolation,
float extrapolation_value,
const T* Xdata,
T* Ydata,
AllocatorPtr& alloc,
GetOriginalCoordinateFunc get_original_coordinate) {
std::vector<float> z_original;
z_original.reserve(output_depth);
std::vector<float> y_original;
y_original.reserve(output_height);
std::vector<float> x_original;
x_original.reserve(output_width);
// For each index in the output height and output width, cache its corresponding indices in the input
// while multiplying it with the input stride for that dimension (cache because we don't have to re-compute
// each time we come across the output width/ output height value while iterating the output image tensor
SafeInt<size_t> idx_buffer_size = SafeInt<size_t>(2) * sizeof(int64_t) *
(output_depth + output_height + output_width);
// For each index in the output height and output width, cache its corresponding "weights/scales" for its
// corresponding indices in the input which proportionately indicates how much they will influence the final
// pixel value in the output
// (cache because we don't have to re-compute each time we come across the output width/ output height value while iterating the output image tensor
SafeInt<size_t> scale_buffer_size = SafeInt<size_t>(2) * sizeof(float_t) *
(output_depth + output_height + output_width);
// Limit number of allocations to just 1
auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size);
BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc));
// Get pointers to appropriate memory locations in the scratch buffer
auto* idx_data = static_cast<int64_t*>(idx_scale_data_buffer_holder.get());
// input_width * input_height is the stride for the depth dimension
int64_t* input_height_width_mul_z1 = idx_data;
int64_t* input_height_width_mul_z2 = input_height_width_mul_z1 + output_depth;
// input_width is the stride for the height dimension
int64_t* input_width_mul_y1 = input_height_width_mul_z1 + 2 * output_depth;
int64_t* input_width_mul_y2 = input_width_mul_y1 + output_height;
// stride for width is 1 (no multiplication needed)
int64_t* in_x1 = input_width_mul_y1 + 2 * output_height;
int64_t* in_x2 = in_x1 + output_width;
auto* scale_data = reinterpret_cast<float*>(in_x2 + output_width);
float* dz1 = scale_data;
float* dz2 = dz1 + output_depth;
float* dy1 = dz1 + 2 * output_depth;
float* dy2 = dy1 + output_height;
float* dx1 = dy1 + 2 * output_height;
float* dx2 = dx1 + output_width;
// Start processing
auto roi_z_start = roi.size() / 2 - 3;
auto roi_z_end = roi.size() - 3;
for (int64_t z = 0; z < output_depth; ++z) {
float in_z = depth_scale == 1 ? static_cast<float>(z)
: get_original_coordinate(static_cast<float>(z), depth_scale,
static_cast<float>(output_depth), static_cast<float>(input_depth),
roi[roi_z_start], roi[roi_z_end]);
z_original.emplace_back(in_z);
in_z = std::max(0.0f, std::min(in_z, static_cast<float>(input_depth - 1)));
const int64_t in_z1 = std::min(static_cast<int64_t>(in_z), input_depth - 1);
const int64_t in_z2 = std::min(in_z1 + 1, input_depth - 1);
dz1[z] = std::fabs(in_z - in_z1);
dz2[z] = std::fabs(in_z - in_z2);
if (in_z1 == in_z2) {
dz1[z] = 0.5f;
dz2[z] = 0.5f;
}
input_height_width_mul_z1[z] = input_height * input_width * in_z1;
input_height_width_mul_z2[z] = input_height * input_width * in_z2;
}
auto roi_y_start = roi.size() / 2 - 2;
auto roi_y_end = roi.size() - 2;
for (int64_t y = 0; y < output_height; ++y) {
float in_y = height_scale == 1 ? static_cast<float>(y)
: get_original_coordinate(static_cast<float>(y), height_scale,
static_cast<float>(output_height), static_cast<float>(input_height),
roi[roi_y_start], roi[roi_y_end]);
y_original.emplace_back(in_y);
in_y = std::max(0.0f, std::min(in_y, static_cast<float>(input_height - 1)));
const int64_t in_y1 = std::min(static_cast<int64_t>(in_y), input_height - 1);
const int64_t in_y2 = std::min(in_y1 + 1, input_height - 1);
dy1[y] = std::fabs(in_y - in_y1);
dy2[y] = std::fabs(in_y - in_y2);
if (in_y1 == in_y2) {
dy1[y] = 0.5f;
dy2[y] = 0.5f;
}
input_width_mul_y1[y] = input_width * in_y1;
input_width_mul_y2[y] = input_width * in_y2;
}
auto roi_x_start = roi.size() / 2 - 1;
auto roi_x_end = roi.size() - 1;
for (int64_t x = 0; x < output_width; ++x) {
float in_x = width_scale == 1 ? static_cast<float>(x)
: get_original_coordinate(static_cast<float>(x), width_scale,
static_cast<float>(output_width), static_cast<float>(input_width),
roi[roi_x_start], roi[roi_x_end]);
x_original.emplace_back(in_x);
in_x = std::max(0.0f, std::min(in_x, static_cast<float>(input_width - 1)));
in_x1[x] = std::min(static_cast<int64_t>(in_x), input_width - 1);
in_x2[x] = std::min(in_x1[x] + 1, input_width - 1);
dx1[x] = std::fabs(in_x - in_x1[x]);
dx2[x] = std::fabs(in_x - in_x2[x]);
if (in_x1[x] == in_x2[x]) {
dx1[x] = 0.5f;
dx2[x] = 0.5f;
}
}
for (int64_t n = 0; n < batch_size; ++n) {
for (int64_t c = 0; c < num_channels; ++c) {
for (int64_t z = 0; z < output_depth; ++z) {
for (int64_t y = 0; y < output_height; ++y) {
for (int64_t x = 0; x < output_width; ++x) {
// when use_extrapolation is set and original index of x or y is out of the dim range
// then use extrapolation_value as the output value.
if (use_extrapolation &&
((z_original[z] < 0 || z_original[z] > static_cast<float>(input_depth - 1)) ||
(y_original[y] < 0 || y_original[y] > static_cast<float>(input_height - 1)) ||
(x_original[x] < 0 || x_original[x] > static_cast<float>(input_width - 1)))) {
Ydata[output_width * output_height * z + output_width * y + x] =
static_cast<T>(extrapolation_value);
continue;
}
// subscript ordering in the variable - (xyz)
T X111 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y1[y] + in_x1[x]];
T X211 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y1[y] + in_x2[x]];
T X121 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y2[y] + in_x1[x]];
T X221 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y2[y] + in_x2[x]];
T X112 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y1[y] + in_x1[x]];
T X212 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y1[y] + in_x2[x]];
T X122 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y2[y] + in_x1[x]];
T X222 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y2[y] + in_x2[x]];
Ydata[output_width * output_height * z + output_width * y + x] =
static_cast<T>(dx2[x] * dy2[y] * dz2[z] * X111 +
dx1[x] * dy2[y] * dz2[z] * X211 +
dx2[x] * dy1[y] * dz2[z] * X121 +
dx1[x] * dy1[y] * dz2[z] * X221 +
dx2[x] * dy2[y] * dz1[z] * X112 +
dx1[x] * dy2[y] * dz1[z] * X212 +
dx2[x] * dy1[y] * dz1[z] * X122 +
dx1[x] * dy1[y] * dz1[z] * X222);
}
}
}
Xdata += input_depth * input_height * input_width;
Ydata += output_depth * output_width * output_height;
}
}
}
// Calculates cubic coeff based on Robert Keys approach
// https://ieeexplore.ieee.org/document/1163711
std::array<float, CubicModeGridLength> GetCubicCoeffs(float s, float cubic_coeff_a = -0.75) {
@ -491,7 +706,11 @@ void ResizeBiCubic(
T* Ydata,
GetOriginalCoordinateFunc get_original_coordinate) {
std::vector<float> y_original;
y_original.reserve(output_height);
std::vector<float> x_original;
x_original.reserve(output_width);
std::unordered_map<float, std::array<float, CubicModeGridLength>> cubic_coeffs;
std::unordered_map<float, std::unordered_map<int64_t, float>> coeff_to_1Dinterpolation_map;
auto roi_y_start = roi.size() / 2 - 2;
@ -660,33 +879,70 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
scales, roi, is_resize_, use_extrapolation_, extrapolation_value_,
use_nearest2x_optimization_, get_original_coordinate_, get_nearest_pixel_);
case UpsampleMode::LINEAR: {
//The correct behavior of 'linear' mode for an N-D input is not clear right now,
//so only support 'bilinear' with 2-D or 4-D input tensor with outermost 2 scales as 1 in the 4-D case
// Supports 'bilinear' and 'trilinear' sampling only
//'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1
if (dims.size() == 2 || dims.size() == 4) {
bool is_2D = dims.size() == 2;
const int64_t batch_size = is_2D ? 1 : dims[0];
const int64_t num_channels = is_2D ? 1 : dims[1];
const int64_t input_height = is_2D ? dims[0] : dims[2];
const int64_t input_width = is_2D ? dims[1] : dims[3];
const int64_t output_height = is_2D ? output_dims[0] : output_dims[2];
const int64_t output_width = is_2D ? output_dims[1] : output_dims[3];
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width,
is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi,
use_extrapolation_, extrapolation_value_, X->template Data<T>(),
Y->template MutableData<T>(), alloc, get_original_coordinate_);
return Status::OK();
} else if (dims.size() == 3 || dims.size() == 5) {
//'trilinear' == 3-D input or 5-D input with outermost 2 scales as 1
bool is_3D = dims.size() == 3;
const int64_t batch_size = is_3D ? 1 : dims[0];
const int64_t num_channels = is_3D ? 1 : dims[1];
const int64_t input_depth = is_3D ? dims[0] : dims[2];
const int64_t input_height = is_3D ? dims[1] : dims[3];
const int64_t input_width = is_3D ? dims[2] : dims[4];
const int64_t output_depth = is_3D ? output_dims[0] : output_dims[2];
const int64_t output_height = is_3D ? output_dims[1] : output_dims[3];
const int64_t output_width = is_3D ? output_dims[2] : output_dims[4];
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
UpsampleTrilinear(batch_size, num_channels, input_depth, input_height, input_width,
output_depth, output_height, output_width,
is_3D ? scales[0] : scales[2], is_3D ? scales[1] : scales[3],
is_3D ? scales[2] : scales[4], roi, use_extrapolation_, extrapolation_value_,
X->template Data<T>(), Y->template MutableData<T>(), alloc, get_original_coordinate_);
return Status::OK();
} else {
// User shouldn't hit this as the check has been performed in ScalesValidation()
std::ostringstream oss;
oss << "'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') "
"or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values "
"being 1 in the ";
oss << (is_resize_ ? "Resize operator" : "Upsample operator");
return Status(ONNXRUNTIME, FAIL, oss.str());
}
}
case UpsampleMode::CUBIC: {
// Supports 'bicubic' sampling only
// User shouldn't hit this as the check has been performed in ScalesValidation()
if (dims.size() != 2 && dims.size() != 4) {
std::ostringstream oss;
oss << "'Linear' mode only support 2-D inputs ('Bilinear') or 4-D inputs "
oss << "'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs "
"with the corresponding outermost 2 scale values being 1 in the ";
oss << (is_resize_ ? "Resize operator" : "Upsample operator");
return Status(ONNXRUNTIME, FAIL, oss.str());
}
bool is_2D = dims.size() == 2;
const int64_t batch_size = is_2D ? 1 : dims[0];
const int64_t num_channels = is_2D ? 1 : dims[1];
const int64_t input_height = is_2D ? dims[0] : dims[2];
const int64_t input_width = is_2D ? dims[1] : dims[3];
const int64_t output_height = is_2D ? output_dims[0] : output_dims[2];
const int64_t output_width = is_2D ? output_dims[1] : output_dims[3];
AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width,
is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi,
use_extrapolation_, extrapolation_value_, X->template Data<T>(),
Y->template MutableData<T>(), alloc, get_original_coordinate_);
return Status::OK();
}
case UpsampleMode::CUBIC: {
bool is_2D = dims.size() == 2;
const int64_t batch_size = is_2D ? 1 : dims[0];
const int64_t num_channels = is_2D ? 1 : dims[1];

View file

@ -276,15 +276,26 @@ class UpsampleBase {
}
}
if (UpsampleMode::LINEAR == mode || UpsampleMode::CUBIC == mode) {
if (UpsampleMode::LINEAR == mode) {
ORT_ENFORCE(scales.size() == 2 ||
(scales.size() == 4 && scales[0] == 1 && scales[1] == 1) ||
scales.size() == 3 ||
(scales.size() == 5 && scales[0] == 1 && scales[1] == 1),
"'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') "
"or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values being 1 in the ",
is_resize_ ? "Resize operator" : "Upsample operator");
}
else if (UpsampleMode::CUBIC == mode) {
ORT_ENFORCE(scales.size() == 2 || (scales.size() == 4 && scales[0] == 1 && scales[1] == 1),
"'Linear' mode and 'Cubic' mode only support 2-D inputs ('Bilinear', 'Bicubic') or 4-D inputs "
"'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs "
"with the corresponding outermost 2 scale values being 1 in the ",
is_resize_ ? "Resize operator" : "Upsample operator");
}
}
void ParseScalesData(const Tensor* scale, std::vector<float>& scales) const {
void
ParseScalesData(const Tensor* scale, std::vector<float>& scales) const {
const auto* scale_data = scale->template Data<float>();
int64_t scales_size = scale->Shape().Size();
ORT_ENFORCE(scales_size > 0, "scales size should be greater than 0.");

View file

@ -191,7 +191,7 @@ __global__ void _ResizeNearestMappingKernel(
const TArray<int64_t> input_shape,
const TArray<int64_t> output_shape,
const TArray<float> scales,
const TArray<float> roi,
const TArray<float, 10> roi,
const size_t total_dim_sum,
bool extrapolation_enabled,
CudaFunctionOriginalCoordinate transform_coordinate,
@ -276,7 +276,7 @@ __global__ void _ResizeNearestKernel(
output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index];
}
struct BilinearMappingInfo {
struct LinearMappingInfo {
int origin_;
float weight_;
int extrapolate_;
@ -291,7 +291,7 @@ __global__ void _ResizeBilinearCoordinateMapping(
float roi_width_start, float roi_width_end,
const size_t SumHW, bool extrapolation_enabled,
CudaFunctionOriginalCoordinate transform_coordinate,
BilinearMappingInfo* dims_mapping) {
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
if (id < output_height) { // y = id
float input_y = scale_height == 1 ? static_cast<float>(id) :
@ -316,7 +316,7 @@ __global__ void _ResizeBilinearCoordinateMapping(
}
}
// The following method supports a N-D input in 'Linear mode'. Last two dimension is [H, W].
// The following method supports a 2-D or 4-D input in 'Linear mode'. Last two dimension is [H, W].
// the scale values for the outer dimensions except last two are 1.
template <typename T>
__global__ void _ResizeBilinearKernel(
@ -325,7 +325,7 @@ __global__ void _ResizeBilinearKernel(
fast_divmod div_output_width, fast_divmod div_output_image,
const T* input_data, T* output_data, const size_t N,
const T extrapolation_value,
BilinearMappingInfo* dims_mapping) {
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int bxc, output_image_index;
div_output_image.divmod(id, bxc, output_image_index);
@ -359,6 +359,124 @@ __global__ void _ResizeBilinearKernel(
x11 * static_cast<T>(y_offset_0 * x_offset_0);
}
template <typename T>
__global__ void _ResizeTrilinearCoordinateMapping(
int64_t input_depth, int64_t input_height, int64_t input_width,
int64_t output_depth, int64_t output_height, int64_t output_width,
float scale_depth, float scale_height, float scale_width,
float roi_depth_start, float roi_depth_end,
float roi_height_start, float roi_height_end,
float roi_width_start, float roi_width_end,
const size_t SumDHW, bool extrapolation_enabled,
CudaFunctionOriginalCoordinate transform_coordinate,
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW);
if (id < output_depth) { // z = id
float input_z = scale_depth == 1 ? static_cast<float>(id) :
transform_coordinate(static_cast<float>(id), scale_depth,
static_cast<float>(output_depth), static_cast<float>(input_depth),
roi_depth_start, roi_depth_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast<float>(input_depth - 1)));
input_z = max(0.0f, min(input_z, static_cast<float>(input_depth - 1)));
int z_int = static_cast<int>(input_z);
dims_mapping[id].origin_ = z_int;
dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int;
} else if (id >= output_depth && id < (output_depth + output_height)) { // y = id - output_depth
float input_y = scale_height == 1 ? static_cast<float>(id - output_depth) :
transform_coordinate(static_cast<float>(id - output_depth), scale_height,
static_cast<float>(output_height), static_cast<float>(input_height),
roi_height_start, roi_height_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
int y_int = static_cast<int>(input_y);
dims_mapping[id].origin_ = y_int;
dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
} else { //x = id - output_depth - output_height
float input_x = scale_width == 1 ? static_cast<float>(id - output_depth - output_height) :
transform_coordinate(static_cast<float>(id - output_depth - output_height), scale_width,
static_cast<float>(output_width), static_cast<float>(input_width),
roi_width_start, roi_width_end);
dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
int x_int = static_cast<int>(input_x);
dims_mapping[id].origin_ = x_int;
dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
}
}
// The following method supports a 3-D or 5-D input in 'Linear mode'. Last two dimension is [D, sH, W].
// the scale values for the outer dimensions except last two are 1.
template <typename T>
__global__ void _ResizeTrilinearKernel(
int64_t input_depth, int64_t input_height, int64_t input_width,
int64_t output_depth, int64_t output_height, int64_t output_width,
fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image,
const T* input_data, T* output_data, const size_t N,
const T extrapolation_value,
LinearMappingInfo* dims_mapping) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
int bxc, output_image_index;
div_output_image.divmod(id, bxc, output_image_index);
CUDA_LONG input_index = bxc * input_depth * input_height * input_width;
int output_z, output_y, output_x, temp;
div_output_height.divmod(output_image_index, output_z, temp);
div_output_width.divmod(temp, output_y, output_x);
if (dims_mapping[output_z].extrapolate_ ||
dims_mapping[output_y + output_depth].extrapolate_ ||
dims_mapping[output_x + output_depth + output_height].extrapolate_) {
output_data[id] = extrapolation_value;
return;
}
float z_offset_0 = dims_mapping[output_z].weight_;
int z_int = dims_mapping[output_z].origin_;
float y_offset_0 = dims_mapping[output_y + output_depth].weight_;
int y_int = dims_mapping[output_y + output_depth].origin_;
float x_offset_0 = dims_mapping[output_x + output_depth + output_height].weight_;
int x_int = dims_mapping[output_x + output_depth + output_height].origin_;
input_index += z_int * input_height * input_width + y_int * input_width + x_int;
T x000 = input_data[input_index];
bool end_of_h = (y_int >= input_height - 1);
bool end_of_w = (x_int >= input_width - 1);
T x100 = end_of_w ? x000 : input_data[input_index + 1];
T x010 = end_of_h ? x000 : input_data[input_index + input_width];
T x110 = end_of_w ? x010 : (end_of_h ? x100 : input_data[input_index + input_width + 1]);
bool end_of_d = (z_int >= input_depth - 1);
if (!end_of_d) {
input_index = input_index + input_height * input_width;
}
T x001 = end_of_d ? x000 : input_data[input_index];
T x101 = end_of_w ? x001 : input_data[input_index + 1];
T x011 = end_of_h ? x001 : input_data[input_index + input_width];
T x111 = end_of_w ? x011 : (end_of_h ? x101 : input_data[input_index + input_width + 1]);
float z_offset_1 = 1.0f - z_offset_0;
float y_offset_1 = 1.0f - y_offset_0;
float x_offset_1 = 1.0f - x_offset_0;
output_data[id] =
x000 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_1) +
x010 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_1) +
x100 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_0) +
x110 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_0) +
x001 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_1) +
x011 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_1) +
x101 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_0) +
x111 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_0);
}
template <typename T>
__device__ __forceinline__ float CubicInterpolationRowwise(
const T* image, int x, int y, int input_height, int input_width,
@ -464,7 +582,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
case UpsampleMode::NN:
return sizeof(int64_t) * output_dims.size() + sizeof(NearestMappingInfo) * std::accumulate(output_dims.begin(), output_dims.end(), 0);
case UpsampleMode::LINEAR:
return sizeof(BilinearMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0);
return sizeof(LinearMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0);
case UpsampleMode::CUBIC:
return sizeof(CubicMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0);
}
@ -479,7 +597,7 @@ void ResizeNearestImpl(
TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_div_pitches,
TArray<float>& scales_vals,
TArray<float>& roi_vals,
TArray<float, 10>& roi_vals,
const T* input_data,
T* output_data,
const size_t N,
@ -556,7 +674,7 @@ void ResizeImpl(
TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_div_pitches,
TArray<float>& scales_vals,
TArray<float>& roi_vals,
TArray<float, 10>& roi_vals,
const T* input_data,
T* output_data,
const size_t N,
@ -587,45 +705,90 @@ void ResizeImpl(
return;
}
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
// We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0
// We would have validated the outer scale values by the time execution reaches this
bool is_2D = (rank == 2 || rank == 4);
// We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0
// We would have validated the outer scale values by the time execution reaches this
bool is_3D = (rank == 3 || rank == 5);
// Should not hit this as we have already validated input rank/scales and we provide verbose error messages
// to the user.
ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize");
int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
fast_divmod div_output_image;
if (is_2D) {
div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
} else if (is_3D) {
div_output_image = (rank > 3) ? output_div_pitches[rank - 4] : fast_divmod(gsl::narrow_cast<int>(N));
}
int64_t output_depth = is_3D ? output_shape[rank - 3] : 0;
int64_t output_height = output_shape[rank - 2];
int64_t output_width = output_shape[rank - 1];
int blocksPerDimsMappingGrid = (int)(ceil((output_height + output_width) / 32.0));
int blocksPerDimsMappingGrid =
static_cast<int>(ceil((output_depth + output_height + output_width) / 32.0));
switch (upsample_mode) {
case UpsampleMode::LINEAR:
_ResizeBilinearCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_height + output_width, extrapolation_enabled, transform_coordinate,
reinterpret_cast<BilinearMappingInfo*>(dims_mapping));
_ResizeBilinearKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<BilinearMappingInfo*>(dims_mapping));
return;
if (is_2D) {
_ResizeBilinearCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_height + output_width, extrapolation_enabled, transform_coordinate,
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
_ResizeBilinearKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
return;
} else if (is_3D) {
_ResizeTrilinearCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1],
output_depth, output_height, output_width,
scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 3], roi_vals[rank - 3 + rank],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_depth + output_height + output_width, extrapolation_enabled, transform_coordinate,
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
_ResizeTrilinearKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
output_depth, output_height, output_width,
output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<LinearMappingInfo*>(dims_mapping));
return;
}
break;
case UpsampleMode::CUBIC:
_ResizeCubicCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_height + output_width, extrapolation_enabled,
cubic_coeff_a, exclude_outside, transform_coordinate,
reinterpret_cast<CubicMappingInfo*>(dims_mapping));
_ResizeBiCubicKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<CubicMappingInfo*>(dims_mapping));
return;
if (is_2D) {
_ResizeCubicCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
scales_vals[rank - 2], scales_vals[rank - 1],
roi_vals[rank - 2], roi_vals[rank - 2 + rank],
roi_vals[rank - 1], roi_vals[rank - 1 + rank],
output_height + output_width, extrapolation_enabled,
cubic_coeff_a, exclude_outside, transform_coordinate,
reinterpret_cast<CubicMappingInfo*>(dims_mapping));
_ResizeBiCubicKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
input_shape[rank - 2], input_shape[rank - 1],
output_height, output_width,
output_div_pitches[rank - 2], div_output_image,
input_data, output_data, N, extrapolation_value,
reinterpret_cast<CubicMappingInfo*>(dims_mapping));
return;
}
}
}
@ -638,7 +801,7 @@ void ResizeImpl(
TArray<int64_t>& input_strides, \
TArray<fast_divmod>& output_div_pitches, \
TArray<float>& scales_vals, \
TArray<float>& roi_vals, \
TArray<float, 10>& roi_vals, \
const T* input_data, \
T* output_data, \
const size_t N, \

View file

@ -23,7 +23,7 @@ void ResizeImpl(
TArray<int64_t>& input_strides,
TArray<fast_divmod>& output_div_pitches,
TArray<float>& scales_vals,
TArray<float>& roi,
TArray<float, 10>& roi,
const T* input_data,
T* output_data,
const size_t N,

View file

@ -74,7 +74,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
if (is_resize_) {
TArray<int64_t> input_shape(X_dims);
TArray<int64_t> output_shape(output_dims);
TArray<float> roi_vals(roi);
TArray<float, 10> roi_vals(roi);
TArray<float> scales_vals(scales);
size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims);

View file

@ -7,7 +7,7 @@
namespace onnxruntime {
namespace test {
TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_tf_crop_and_resize) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) {
OpTester test("Resize", 11);
std::vector<float> roi{0.4f, 0.6f, 0.6f, 0.8f};
std::vector<float> scales{};
@ -65,7 +65,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapol
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{1.0f, 1.0f, 0.6f, 0.6f};
@ -87,7 +87,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_align_corners) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{1.0f, 1.0f, 0.6f, 0.6f};
@ -110,7 +110,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_align_corners) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_pytorch_half_pixel) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{};
@ -138,7 +138,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel)
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch
}
TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) {
TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{1.0f, 1.0f, 2.0f, 4.0f};
@ -172,7 +172,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_align_corners) {
TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{2.0f, 4.0f};
@ -197,7 +197,66 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_align_corners) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartScalesNoOpTest) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{};
std::vector<int64_t> sizes{1, 3, 1};
test.AddAttribute("mode", "linear");
test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel");
const int64_t D = 2, H = 4, W = 4;
std::vector<float> X = {
1.0f, 2.0f, 3.0f, 4.0f,
5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f,
13.0f, 14.0f, 15.0f, 16.0f,
1.0f, 2.0f, 3.0f, 4.0f,
5.0f, 6.0f, 7.0f, 8.0f,
9.0f, 10.0f, 11.0f, 12.0f,
13.0f, 14.0f, 15.0f, 16.0f};
test.AddInput<float>("X", {D, H, W}, X);
test.AddInput<float>("roi", {0}, roi);
test.AddInput<float>("scales", {0}, scales);
test.AddInput<int64_t>("sizes", {3}, sizes);
std::vector<float> Y = {1.6666666f, 7.0f, 12.333333f};
test.AddOutput<float>("Y", {sizes[0], sizes[1], sizes[2]}, Y);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch
}
TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_pytorch_half_pixel) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{1.0f, 1.0f, 2.0f, 2.0f, 1.0f};
test.AddAttribute("mode", "linear");
test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel");
const int64_t N = 1, C = 2, D = 2, H = 1, W = 2;
std::vector<float> X = {
1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
test.AddInput<float>("X", {N, C, D, H, W}, X);
test.AddInput<float>("roi", {0}, roi);
test.AddInput<float>("scales", {5}, scales);
std::vector<float> Y = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f,
1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f,
1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f,
1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
test.AddOutput<float>("Y", {1, 2, 4, 2, 2}, Y);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch
}
TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) {
OpTester test("Resize", 11);
std::vector<float> roi{};
std::vector<float> scales{1.0f, 1.0f, 1.0f, 1.0f};
@ -856,7 +915,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_Ver10) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) {
OpTester test("Resize", 10);
std::vector<float> scales{1.0f, 1.0f, 0.6f, 0.6f};
@ -876,7 +935,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_Ver10) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_Ver10) {
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) {
OpTester test("Resize", 10);
std::vector<float> scales{0.6f, 0.6f};
@ -896,7 +955,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_Ver10) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_Ver10) {
TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) {
OpTester test("Resize", 10);
std::vector<float> scales{1.0f, 1.0f, 2.0f, 4.0f};
test.AddAttribute("mode", "linear");
@ -926,7 +985,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_Ver10) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_Ver10) {
TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) {
OpTester test("Resize", 10);
std::vector<float> scales{2.0f, 4.0f};
test.AddAttribute("mode", "linear");
@ -948,7 +1007,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_Ver10) {
test.Run();
}
TEST(ResizeOpTest, ResizeOpLineartScalesNoOpTest_Ver10) {
TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) {
OpTester test("Resize", 10);
std::vector<float> scales{1.0f, 1.0f, 1.0f, 1.0f};
test.AddAttribute("mode", "linear");