From 383b1e207c772f6afd0344e4a7dd73e68f63a5be Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri <shariharan91@gmail.com>
Date: Wed, 30 Sep 2020 15:33:33 -0700
Subject: [PATCH] Fix bug in the Resize operator kernels (#5303)

---
 .../core/providers/cpu/tensor/upsample.cc     | 47 ++++++++---
 .../core/providers/cuda/tensor/resize_impl.cu | 79 +++++++++++++------
 .../providers/cpu/tensor/resize_op_test.cc    | 33 +++++++-
 3 files changed, 117 insertions(+), 42 deletions(-)
diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc
index 4e9871af43..5f443e87ae 100644
--- a/onnxruntime/core/providers/cpu/tensor/upsample.cc
+++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc
@@ -114,6 +114,17 @@ Status UpsampleNearest(const T* input,
   auto CalculateInputMapping =
       [n_dim, &input_shape, &output_shape, &input_dim_factor, &scales, &roi, extrapolation_enabled, &get_original_coordinate, &get_nearest_pixel](
           std::vector<int64_t>& input_mapping, const int64_t axis) {
+        // When scale is 1.0, there is a one-to-one mapping between the dimension
+        // in the input and the output, so there is no need to apply the co-ordinate
+        // transformation, which should only be done when actual "resizing" is required.
+        if (scales[axis] == 1.0f) {
+          for (int64_t dim = 0; dim < output_shape[axis]; dim++) {
+            input_mapping[dim] = dim * input_dim_factor[axis];
+          }
+          return;
+        }
+
+        // scale != 1.0
         const int64_t input_size = input_dim_factor[0] * input_shape[0];
         for (int64_t dim = 0; dim < output_shape[axis]; dim++) {
           float original_dim = get_original_coordinate(static_cast<float>(dim), scales[axis], static_cast<float>(output_shape[axis]),
@@ -211,7 +222,7 @@ Status UpsampleNearest(const T* input,
         break;
       }
       output_dim_counter[dim_idx] = 0;
-      input_idx += input_mappings[dim_idx][0 /* output_dim_counter[dim_idx] */ ];
+      input_idx += input_mappings[dim_idx][0 /* output_dim_counter[dim_idx] */];
     }
   }
 
@@ -331,9 +342,11 @@ void UpsampleBilinear(int64_t batch_size,
   auto roi_y_start = roi.size() / 2 - 2;
   auto roi_y_end = roi.size() - 2;
   for (int64_t y = 0; y < output_height; ++y) {
-    float in_y = get_original_coordinate(static_cast<float>(y), height_scale,
-                                         static_cast<float>(output_height), static_cast<float>(input_height),
-                                         roi[roi_y_start], roi[roi_y_end]);
+    float in_y = height_scale == 1 ? static_cast<float>(y)
+                                   : get_original_coordinate(static_cast<float>(y), height_scale,
+                                                             static_cast<float>(output_height),
+                                                             static_cast<float>(input_height),
+                                                             roi[roi_y_start], roi[roi_y_end]);
     y_original.emplace_back(in_y);
     in_y = std::max(0.0f, std::min(in_y, static_cast<float>(input_height - 1)));
 
@@ -354,9 +367,12 @@ void UpsampleBilinear(int64_t batch_size,
   auto roi_x_start = roi.size() / 2 - 1;
   auto roi_x_end = roi.size() - 1;
   for (int64_t x = 0; x < output_width; ++x) {
-    float in_x = get_original_coordinate(static_cast<float>(x), width_scale,
-                                         static_cast<float>(output_width), static_cast<float>(input_width),
-                                         roi[roi_x_start], roi[roi_x_end]);
+    float in_x = width_scale == 1 ? static_cast<float>(x)
+                                  : get_original_coordinate(static_cast<float>(x),
+                                                            width_scale,
+                                                            static_cast<float>(output_width),
+                                                            static_cast<float>(input_width),
+                                                            roi[roi_x_start], roi[roi_x_end]);
     x_original.emplace_back(in_x);
     in_x = std::max(0.0f, std::min(in_x, static_cast<float>(input_width - 1)));
 
@@ -485,9 +501,11 @@ void ResizeBiCubic(
 
   // generate coefficients in y direction
   for (int64_t y = 0; y < output_height; ++y) {
-    float in_y = get_original_coordinate(static_cast<float>(y), height_scale,
-                                         static_cast<float>(output_height), static_cast<float>(input_height),
-                                         roi[roi_y_start], roi[roi_y_end]);
+    float in_y = height_scale == 1 ? static_cast<float>(y)
+                                   : get_original_coordinate(static_cast<float>(y), height_scale,
+                                                             static_cast<float>(output_height),
+                                                             static_cast<float>(input_height),
+                                                             roi[roi_y_start], roi[roi_y_end]);
     y_original.emplace_back(in_y);
     auto s = y_original[y] - std::floor(y_original[y]);
     if (cubic_coeffs.find(s) == cubic_coeffs.end()) {
@@ -498,9 +516,12 @@ void ResizeBiCubic(
 
   // generate coefficients in x direction
   for (int64_t x = 0; x < output_width; ++x) {
-    float in_x = get_original_coordinate(static_cast<float>(x), width_scale,
-                                         static_cast<float>(output_width), static_cast<float>(input_width),
-                                         roi[roi_x_start], roi[roi_x_end]);
+    float in_x = width_scale == 1 ? static_cast<float>(x)
+                                  : get_original_coordinate(static_cast<float>(x),
+                                                            width_scale,
+                                                            static_cast<float>(output_width),
+                                                            static_cast<float>(input_width),
+                                                            roi[roi_x_start], roi[roi_x_end]);
     x_original.emplace_back(in_x);
     auto s = x_original[x] - std::floor(x_original[x]);
     if (cubic_coeffs.find(s) == cubic_coeffs.end()) {
diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
index 5bb720aebf..c3521d1644 100644
--- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
@@ -149,21 +149,37 @@ __global__ void _ResizeNearestMappingKernel2D(
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, output_height + output_width);
   if (id >= 0 && id < output_height) {  // for Height
     int dim = id;
-    float orig_coord = transform_coordinate(static_cast<float>(dim), scales_height, static_cast<float>(output_height),
-                                            static_cast<float>(input_height), roi_start_height, roi_end_height);
-    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_height - 1)));
-    dim = calc_nearest_pixel(orig_coord, scales_height < 1);
-    if (dim >= input_height) dim = input_height - 1;
-    if (dim < 0) dim = 0;
+
+    // only apply co-ordinate transformation if scale != 1.0
+    if (scales_height == 1.0f) {
+        dims_mapping[id].extrapolate_ = 0;
+    } else {
+      float orig_coord = transform_coordinate(static_cast<float>(dim), scales_height, static_cast<float>(output_height),
+                                              static_cast<float>(input_height), roi_start_height, roi_end_height);
+      dims_mapping[id].extrapolate_ = static_cast<int>(
+          extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_height - 1)));
+      dim = calc_nearest_pixel(orig_coord, scales_height < 1);
+      if (dim >= input_height) dim = input_height - 1;
+      if (dim < 0) dim = 0;    
+    }
+
     dims_mapping[id].origin_ = dim;
   } else {
     int dim = id - output_height;
-    float orig_coord = transform_coordinate(static_cast<float>(dim), scales_width, static_cast<float>(output_width),
-                                            static_cast<float>(input_width), roi_start_width, roi_end_width);
-    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_width - 1)));
-    dim = calc_nearest_pixel(orig_coord, scales_width < 1);
-    if (dim >= input_width) dim = input_width - 1;
-    if (dim < 0) dim = 0;
+
+    // only apply co-ordinate transformation if scale != 1.0
+    if (scales_width == 1.0f) {
+      dims_mapping[id].extrapolate_ = 0;
+    } else {
+      float orig_coord = transform_coordinate(static_cast<float>(dim), scales_width, static_cast<float>(output_width),
+                                              static_cast<float>(input_width), roi_start_width, roi_end_width);
+      dims_mapping[id].extrapolate_ = static_cast<int>(
+          extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_width - 1)));
+      dim = calc_nearest_pixel(orig_coord, scales_width < 1);
+      if (dim >= input_width) dim = input_width - 1;
+      if (dim < 0) dim = 0; 
+    }
+
     dims_mapping[id].origin_ = dim;
     return;
   }
@@ -190,12 +206,19 @@ __global__ void _ResizeNearestMappingKernel(
     }
     if (id >= dim_sum && id < dim_sum + output_shape[axis]) {
       int dim = id - dim_sum;
-      float orig_coord = transform_coordinate(static_cast<float>(dim), scales[axis], static_cast<float>(output_shape[axis]),
-                                              static_cast<float>(input_shape[axis]), roi[axis], roi[axis + rank]);
-      dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_shape[axis] - 1)));
-      dim = calc_nearest_pixel(orig_coord, scales[axis] < 1);
-      if (dim >= input_shape[axis]) dim = input_shape[axis] - 1;
-      if (dim < 0) dim = 0;
+
+      // only apply co-ordinate transformation if scale != 1.0
+      if (scales[axis] == 1.0f) {
+        dims_mapping[id].extrapolate_ = 0;
+      } else {
+        float orig_coord = transform_coordinate(static_cast<float>(dim), scales[axis], static_cast<float>(output_shape[axis]),
+                                                static_cast<float>(input_shape[axis]), roi[axis], roi[axis + rank]);
+        dims_mapping[id].extrapolate_ = static_cast<int>(extrapolation_enabled && (orig_coord < 0.f || orig_coord > static_cast<float>(input_shape[axis] - 1)));
+        dim = calc_nearest_pixel(orig_coord, scales[axis] < 1);
+        if (dim >= input_shape[axis]) dim = input_shape[axis] - 1;
+        if (dim < 0) dim = 0;      
+      }
+
       dims_mapping[id].origin_ = dim;
       return;
     }
@@ -271,18 +294,20 @@ __global__ void _ResizeBilinearCoordinateMapping(
     BilinearMappingInfo* dims_mapping) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
   if (id < output_height) {  //  y = id
-    float input_y = transform_coordinate(static_cast<float>(id), scale_height,
-                                         static_cast<float>(output_height), static_cast<float>(input_height),
-                                         roi_height_start, roi_height_end);
+    float input_y = scale_height == 1 ? static_cast<float>(id) :
+                                        transform_coordinate(static_cast<float>(id), scale_height,
+                                        static_cast<float>(output_height), static_cast<float>(input_height),
+                                        roi_height_start, roi_height_end);
     dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
     input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
     int y_int = static_cast<int>(input_y);
     dims_mapping[id].origin_ = y_int;
     dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
   } else {  //x = id - output_height
-    float input_x = transform_coordinate(static_cast<float>(id - output_height), scale_width,
-                                         static_cast<float>(output_width), static_cast<float>(input_width),
-                                         roi_width_start, roi_width_end);
+    float input_x = scale_width == 1 ? static_cast<float>(id - output_height) :
+                                       transform_coordinate(static_cast<float>(id - output_height), scale_width,
+                                       static_cast<float>(output_width), static_cast<float>(input_width),
+                                       roi_width_start, roi_width_end);
     dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
     input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
     int x_int = static_cast<int>(input_x);
@@ -370,9 +395,11 @@ __global__ void _ResizeCubicCoordinateMapping(
   bool is_y_axis = (id < output_height);
   int max_input_coord = static_cast<int>(is_y_axis ? input_height : input_width);
 
-  float input_coordinat = transform_coordinate(
+  float scale = is_y_axis ? scale_height : scale_width;
+  float input_coordinat = scale == 1 ? (is_y_axis ? id : id - output_height) :
+      transform_coordinate(
       static_cast<float>(is_y_axis ? id : id - output_height),
-      (is_y_axis ? scale_height : scale_width),
+      scale,
       static_cast<float>(is_y_axis ? output_height : output_width),
       static_cast<float>(max_input_coord),
       (is_y_axis ? roi_height_start : roi_width_start),
diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
index 72f7b03074..33a5d7f302 100644
--- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
@@ -135,7 +135,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel)
   std::vector<float> Y = {1.6666666f, 7.0f, 12.333333f};
 
   test.AddOutput<float>("Y", {sizes[0], sizes[1]}, Y);
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
 }
 
 TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) {
@@ -438,7 +438,6 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample5dTest_WithSizes_CeilMode) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider});
 }
 
-
 TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) {
   OpTester test("Resize", 11);
 
@@ -473,6 +472,34 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) {
   test.Run();
 }
 
+TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) {
+  OpTester test("Resize", 11);
+
+  std::vector<float> roi{};  // empty roi tensor (fed below with shape {0})
+  // The outermost dim has a scale of 1.0, i.e. a one-to-one input/output mapping.
+  // This test ensures that the co-ordinate transformation is not applied to the
+  // outermost dim, as there is no "resizing" along it.
+  // If it were applied using the provided attributes, it would cause a result mismatch.
+  std::vector<float> scales{1.0f, 0.5f};  // outermost dim unchanged, innermost dim halved
+
+  test.AddAttribute("mode", "nearest");
+  test.AddAttribute("coordinate_transformation_mode", "tf_half_pixel_for_nn");
+  test.AddAttribute("nearest_mode", "ceil");
+
+  const int64_t C = 2, D = 3;  // input is a 2 x 3 tensor
+  std::vector<float> X = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+
+  test.AddInput<float>("X", {C, D}, X);
+  test.AddInput<float>("roi", {0}, roi);
+  test.AddInput<float>("scales", {2}, scales);
+
+  // Would produce {5.0f, 5.0f} if the co-ordinate transformation were applied
+  // to the outermost dim.
+  std::vector<float> Y = {2.0f, 5.0f};
+  test.AddOutput<float>("Y", {2, 1}, Y);  // expected output is a 2 x 1 tensor
+  test.Run();
+}
+
 // custom OpTester to make 'scales' or 'sizes' a constant initializer if needed
 class ResizeOpTester : public OpTester {
  public:
@@ -573,7 +600,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Nearest2xOptimization_Sizes) {
                             3.0f, 3.0f, 4.0f, 4.0f};
 
     test.AddOutput<float>("Y", {N, C, sizes[2], sizes[3]}, Y);
-    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: results mismatch
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
   };
 
   run_test(false);