From 06cd81d79199ab826a239d894be38dc7b9830abb Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri <shariharan91@gmail.com>
Date: Fri, 2 Oct 2020 11:02:43 -0700
Subject: [PATCH] Support trilinear sampling in Resize CPU and CUDA kernels
 (#5300)

---
 .../core/providers/cpu/tensor/upsample.cc     | 318 ++++++++++++++++--
 .../core/providers/cpu/tensor/upsample.h      |  17 +-
 .../core/providers/cuda/tensor/resize_impl.cu | 249 +++++++++++---
 .../core/providers/cuda/tensor/resize_impl.h  |   2 +-
 .../core/providers/cuda/tensor/upsample.cc    |   2 +-
 .../providers/cpu/tensor/resize_op_test.cc    |  83 ++++-
 6 files changed, 580 insertions(+), 91 deletions(-)
diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.cc b/onnxruntime/core/providers/cpu/tensor/upsample.cc
index 5f443e87ae..0cf7b0d0cb 100644
--- a/onnxruntime/core/providers/cpu/tensor/upsample.cc
+++ b/onnxruntime/core/providers/cpu/tensor/upsample.cc
@@ -321,24 +321,46 @@ void UpsampleBilinear(int64_t batch_size,
                       AllocatorPtr& alloc,
                       GetOriginalCoordinateFunc get_original_coordinate) {
   std::vector<float> y_original;
-  std::vector<float> x_original;
+  y_original.reserve(output_height);
 
+  std::vector<float> x_original;
+  x_original.reserve(output_width);
+
+  // For each index in the output height and output width, cache its corresponding indices in the input
+  // while multiplying it with the input stride for that dimension (cache because we don't have to re-compute
+  // each time we come across the output width/ output height value while iterating the output image tensor
   SafeInt<size_t> idx_buffer_size = SafeInt<size_t>(2) * sizeof(int64_t) * (output_height + output_width);
+
+  // For each index in the output height and output width, cache its corresponding "weights/scales" for its
+  // corresponding indices in the input which proportionately indicates how much they will influence the final
+  // pixel value in the output
+  // (cache because we don't have to re-compute each time we come across the output width/ output height value while iterating the output image tensor
   SafeInt<size_t> scale_buffer_size = SafeInt<size_t>(2) * sizeof(float_t) * (output_height + output_width);
+
+  // Limit number of allocations to just 1
   auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size);
   BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc));
+
+  // Get pointers to appropriate memory locations in the scratch buffer
   auto* idx_data = static_cast<int64_t*>(idx_scale_data_buffer_holder.get());
+
+  // input_width is the stride for the height dimension
   int64_t* input_width_mul_y1 = idx_data;
-  int64_t* input_width_mul_y2 = idx_data + output_height;
-  int64_t* in_x1 = idx_data + 2 * output_height;
-  int64_t* in_x2 = idx_data + 2 * output_height + output_width;
+  int64_t* input_width_mul_y2 = input_width_mul_y1 + output_height;
+
+  // stride for width is 1 (no multiplication needed)
+  int64_t* in_x1 = input_width_mul_y1 + 2 * output_height;
+  int64_t* in_x2 = in_x1 + output_width;
 
   auto* scale_data = reinterpret_cast<float*>(in_x2 + output_width);
-  float* dy1 = scale_data;
-  float* dy2 = scale_data + output_height;
-  float* dx1 = scale_data + 2 * output_height;
-  float* dx2 = scale_data + 2 * output_height + output_width;
 
+  float* dy1 = scale_data;
+  float* dy2 = dy1 + output_height;
+
+  float* dx1 = dy1 + 2 * output_height;
+  float* dx2 = dx1 + output_width;
+
+  // Start processing
   auto roi_y_start = roi.size() / 2 - 2;
   auto roi_y_end = roi.size() - 2;
   for (int64_t y = 0; y < output_height; ++y) {
@@ -379,8 +401,8 @@ void UpsampleBilinear(int64_t batch_size,
     in_x1[x] = std::min(static_cast<int64_t>(in_x), input_width - 1);
     in_x2[x] = std::min(in_x1[x] + 1, input_width - 1);
 
-    dx1[x] = std::abs(in_x - in_x1[x]);
-    dx2[x] = std::abs(in_x - in_x2[x]);
+    dx1[x] = std::fabs(in_x - in_x1[x]);
+    dx2[x] = std::fabs(in_x - in_x2[x]);
     if (in_x1[x] == in_x2[x]) {
       dx1[x] = 0.5f;
       dx2[x] = 0.5f;
@@ -400,6 +422,7 @@ void UpsampleBilinear(int64_t batch_size,
             continue;
           }
 
+          // subscript ordering in the variable - (xy)
           T X11 = Xdata[input_width_mul_y1[y] + in_x1[x]];
           T X21 = Xdata[input_width_mul_y1[y] + in_x2[x]];
           T X12 = Xdata[input_width_mul_y2[y] + in_x1[x]];
@@ -417,6 +440,198 @@ void UpsampleBilinear(int64_t batch_size,
   }
 }
 
+// The following method supports a 5-D input in 'Linear mode'
+// that amounts to 'Trilinear' Upsampling/Resizing in the sense that it assumes
+// the scale values for the outermost 2 dimensions are 1.
+// This is the common use-case where the 5-D input (batched multi-channel volumes)
+// is usually of shape [N, C, D, H, W] and the scales are [1.0, 1.0, depth_scale, height_scale, width_scale]
+template <typename T>
+void UpsampleTrilinear(int64_t batch_size,
+                       int64_t num_channels,
+                       int64_t input_depth,
+                       int64_t input_height,
+                       int64_t input_width,
+                       int64_t output_depth,
+                       int64_t output_height,
+                       int64_t output_width,
+                       float depth_scale,
+                       float height_scale,
+                       float width_scale,
+                       const std::vector<float>& roi,
+                       bool use_extrapolation,
+                       float extrapolation_value,
+                       const T* Xdata,
+                       T* Ydata,
+                       AllocatorPtr& alloc,
+                       GetOriginalCoordinateFunc get_original_coordinate) {
+  std::vector<float> z_original;
+  z_original.reserve(output_depth);
+
+  std::vector<float> y_original;
+  y_original.reserve(output_height);
+
+  std::vector<float> x_original;
+  x_original.reserve(output_width);
+
+  // For each index in the output height and output width, cache its corresponding indices in the input
+  // while multiplying it with the input stride for that dimension (cache because we don't have to re-compute
+  // each time we come across the output width/ output height value while iterating the output image tensor
+  SafeInt<size_t> idx_buffer_size = SafeInt<size_t>(2) * sizeof(int64_t) *
+                                    (output_depth + output_height + output_width);
+
+  // For each index in the output height and output width, cache its corresponding "weights/scales" for its
+  // corresponding indices in the input which proportionately indicates how much they will influence the final
+  // pixel value in the output
+  // (cache because we don't have to re-compute each time we come across the output width/ output height value while iterating the output image tensor
+  SafeInt<size_t> scale_buffer_size = SafeInt<size_t>(2) * sizeof(float_t) *
+                                      (output_depth + output_height + output_width);
+
+  // Limit number of allocations to just 1
+  auto inx_scale_data_buffer = alloc->Alloc(idx_buffer_size + scale_buffer_size);
+  BufferUniquePtr idx_scale_data_buffer_holder(inx_scale_data_buffer, BufferDeleter(alloc));
+
+  // Get pointers to appropriate memory locations in the scratch buffer
+  auto* idx_data = static_cast<int64_t*>(idx_scale_data_buffer_holder.get());
+
+  // input_width * input_height is the stride for the depth dimension
+  int64_t* input_height_width_mul_z1 = idx_data;
+  int64_t* input_height_width_mul_z2 = input_height_width_mul_z1 + output_depth;
+
+  // input_width is the stride for the height dimension
+  int64_t* input_width_mul_y1 = input_height_width_mul_z1 + 2 * output_depth;
+  int64_t* input_width_mul_y2 = input_width_mul_y1 + output_height;
+
+  // stride for width is 1 (no multiplication needed)
+  int64_t* in_x1 = input_width_mul_y1 + 2 * output_height;
+  int64_t* in_x2 = in_x1 + output_width;
+
+  auto* scale_data = reinterpret_cast<float*>(in_x2 + output_width);
+
+  float* dz1 = scale_data;
+  float* dz2 = dz1 + output_depth;
+
+  float* dy1 = dz1 + 2 * output_depth;
+  float* dy2 = dy1 + output_height;
+
+  float* dx1 = dy1 + 2 * output_height;
+  float* dx2 = dx1 + output_width;
+
+  // Start processing
+  auto roi_z_start = roi.size() / 2 - 3;
+  auto roi_z_end = roi.size() - 3;
+  for (int64_t z = 0; z < output_depth; ++z) {
+    float in_z = depth_scale == 1 ? static_cast<float>(z)
+                                  : get_original_coordinate(static_cast<float>(z), depth_scale,
+                                                            static_cast<float>(output_depth), static_cast<float>(input_depth),
+                                                            roi[roi_z_start], roi[roi_z_end]);
+    z_original.emplace_back(in_z);
+    in_z = std::max(0.0f, std::min(in_z, static_cast<float>(input_depth - 1)));
+
+    const int64_t in_z1 = std::min(static_cast<int64_t>(in_z), input_depth - 1);
+    const int64_t in_z2 = std::min(in_z1 + 1, input_depth - 1);
+    dz1[z] = std::fabs(in_z - in_z1);
+    dz2[z] = std::fabs(in_z - in_z2);
+
+    if (in_z1 == in_z2) {
+      dz1[z] = 0.5f;
+      dz2[z] = 0.5f;
+    }
+
+    input_height_width_mul_z1[z] = input_height * input_width * in_z1;
+    input_height_width_mul_z2[z] = input_height * input_width * in_z2;
+  }
+
+  auto roi_y_start = roi.size() / 2 - 2;
+  auto roi_y_end = roi.size() - 2;
+  for (int64_t y = 0; y < output_height; ++y) {
+    float in_y = height_scale == 1 ? static_cast<float>(y)
+                                   : get_original_coordinate(static_cast<float>(y), height_scale,
+                                                             static_cast<float>(output_height), static_cast<float>(input_height),
+                                                             roi[roi_y_start], roi[roi_y_end]);
+    y_original.emplace_back(in_y);
+    in_y = std::max(0.0f, std::min(in_y, static_cast<float>(input_height - 1)));
+
+    const int64_t in_y1 = std::min(static_cast<int64_t>(in_y), input_height - 1);
+    const int64_t in_y2 = std::min(in_y1 + 1, input_height - 1);
+    dy1[y] = std::fabs(in_y - in_y1);
+    dy2[y] = std::fabs(in_y - in_y2);
+
+    if (in_y1 == in_y2) {
+      dy1[y] = 0.5f;
+      dy2[y] = 0.5f;
+    }
+
+    input_width_mul_y1[y] = input_width * in_y1;
+    input_width_mul_y2[y] = input_width * in_y2;
+  }
+
+  auto roi_x_start = roi.size() / 2 - 1;
+  auto roi_x_end = roi.size() - 1;
+  for (int64_t x = 0; x < output_width; ++x) {
+    float in_x = width_scale == 1 ? static_cast<float>(x)
+                                  : get_original_coordinate(static_cast<float>(x), width_scale,
+                                                            static_cast<float>(output_width), static_cast<float>(input_width),
+                                                            roi[roi_x_start], roi[roi_x_end]);
+    x_original.emplace_back(in_x);
+    in_x = std::max(0.0f, std::min(in_x, static_cast<float>(input_width - 1)));
+
+    in_x1[x] = std::min(static_cast<int64_t>(in_x), input_width - 1);
+    in_x2[x] = std::min(in_x1[x] + 1, input_width - 1);
+
+    dx1[x] = std::fabs(in_x - in_x1[x]);
+    dx2[x] = std::fabs(in_x - in_x2[x]);
+    if (in_x1[x] == in_x2[x]) {
+      dx1[x] = 0.5f;
+      dx2[x] = 0.5f;
+    }
+  }
+
+  for (int64_t n = 0; n < batch_size; ++n) {
+    for (int64_t c = 0; c < num_channels; ++c) {
+      for (int64_t z = 0; z < output_depth; ++z) {
+        for (int64_t y = 0; y < output_height; ++y) {
+          for (int64_t x = 0; x < output_width; ++x) {
+            // when use_extrapolation is set and original index of x or y is out of the dim range
+            // then use extrapolation_value as the output value.
+            if (use_extrapolation &&
+                ((z_original[z] < 0 || z_original[z] > static_cast<float>(input_depth - 1)) ||
+                 (y_original[y] < 0 || y_original[y] > static_cast<float>(input_height - 1)) ||
+                 (x_original[x] < 0 || x_original[x] > static_cast<float>(input_width - 1)))) {
+              Ydata[output_width * output_height * z + output_width * y + x] =
+                  static_cast<T>(extrapolation_value);
+              continue;
+            }
+
+            // subscript ordering in the variable - (xyz)
+            T X111 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y1[y] + in_x1[x]];
+            T X211 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y1[y] + in_x2[x]];
+            T X121 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y2[y] + in_x1[x]];
+            T X221 = Xdata[input_height_width_mul_z1[z] + input_width_mul_y2[y] + in_x2[x]];
+
+            T X112 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y1[y] + in_x1[x]];
+            T X212 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y1[y] + in_x2[x]];
+            T X122 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y2[y] + in_x1[x]];
+            T X222 = Xdata[input_height_width_mul_z2[z] + input_width_mul_y2[y] + in_x2[x]];
+
+            Ydata[output_width * output_height * z + output_width * y + x] =
+                static_cast<T>(dx2[x] * dy2[y] * dz2[z] * X111 +
+                               dx1[x] * dy2[y] * dz2[z] * X211 +
+                               dx2[x] * dy1[y] * dz2[z] * X121 +
+                               dx1[x] * dy1[y] * dz2[z] * X221 +
+
+                               dx2[x] * dy2[y] * dz1[z] * X112 +
+                               dx1[x] * dy2[y] * dz1[z] * X212 +
+                               dx2[x] * dy1[y] * dz1[z] * X122 +
+                               dx1[x] * dy1[y] * dz1[z] * X222);
+          }
+        }
+      }
+      Xdata += input_depth * input_height * input_width;
+      Ydata += output_depth * output_width * output_height;
+    }
+  }
+}
+
 // Calculates cubic coeff based on Robert Keys approach
 // https://ieeexplore.ieee.org/document/1163711
 std::array<float, CubicModeGridLength> GetCubicCoeffs(float s, float cubic_coeff_a = -0.75) {
@@ -491,7 +706,11 @@ void ResizeBiCubic(
     T* Ydata,
     GetOriginalCoordinateFunc get_original_coordinate) {
   std::vector<float> y_original;
+  y_original.reserve(output_height);
+
   std::vector<float> x_original;
+  x_original.reserve(output_width);
+
   std::unordered_map<float, std::array<float, CubicModeGridLength>> cubic_coeffs;
   std::unordered_map<float, std::unordered_map<int64_t, float>> coeff_to_1Dinterpolation_map;
   auto roi_y_start = roi.size() / 2 - 2;
@@ -660,33 +879,70 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
                                 scales, roi, is_resize_, use_extrapolation_, extrapolation_value_,
                                 use_nearest2x_optimization_, get_original_coordinate_, get_nearest_pixel_);
     case UpsampleMode::LINEAR: {
-      //The correct behavior of 'linear' mode for an N-D input is not clear right now,
-      //so only support 'bilinear' with 2-D or 4-D input tensor with outermost 2 scales as 1 in the 4-D case
+      // Supports 'bilinear' and 'trilinear' sampling only
+
+      //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1
+      if (dims.size() == 2 || dims.size() == 4) {
+        bool is_2D = dims.size() == 2;
+
+        const int64_t batch_size = is_2D ? 1 : dims[0];
+        const int64_t num_channels = is_2D ? 1 : dims[1];
+        const int64_t input_height = is_2D ? dims[0] : dims[2];
+        const int64_t input_width = is_2D ? dims[1] : dims[3];
+
+        const int64_t output_height = is_2D ? output_dims[0] : output_dims[2];
+        const int64_t output_width = is_2D ? output_dims[1] : output_dims[3];
+
+        AllocatorPtr alloc;
+        ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
+        UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width,
+                         is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi,
+                         use_extrapolation_, extrapolation_value_, X->template Data<T>(),
+                         Y->template MutableData<T>(), alloc, get_original_coordinate_);
+        return Status::OK();
+      } else if (dims.size() == 3 || dims.size() == 5) {
+        //'trilinear' == 3-D input or 5-D input with outermost 2 scales as 1
+        bool is_3D = dims.size() == 3;
+
+        const int64_t batch_size = is_3D ? 1 : dims[0];
+        const int64_t num_channels = is_3D ? 1 : dims[1];
+        const int64_t input_depth = is_3D ? dims[0] : dims[2];
+        const int64_t input_height = is_3D ? dims[1] : dims[3];
+        const int64_t input_width = is_3D ? dims[2] : dims[4];
+
+        const int64_t output_depth = is_3D ? output_dims[0] : output_dims[2];
+        const int64_t output_height = is_3D ? output_dims[1] : output_dims[3];
+        const int64_t output_width = is_3D ? output_dims[2] : output_dims[4];
+
+        AllocatorPtr alloc;
+        ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
+        UpsampleTrilinear(batch_size, num_channels, input_depth, input_height, input_width,
+                          output_depth, output_height, output_width,
+                          is_3D ? scales[0] : scales[2], is_3D ? scales[1] : scales[3],
+                          is_3D ? scales[2] : scales[4], roi, use_extrapolation_, extrapolation_value_,
+                          X->template Data<T>(), Y->template MutableData<T>(), alloc, get_original_coordinate_);
+        return Status::OK();
+      } else {
+        // User shouldn't hit this as the check has been performed in ScalesValidation()
+        std::ostringstream oss;
+        oss << "'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') "
+               "or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values "
+               "being 1 in the ";
+        oss << (is_resize_ ? "Resize operator" : "Upsample operator");
+        return Status(ONNXRUNTIME, FAIL, oss.str());
+      }
+    }
+    case UpsampleMode::CUBIC: {
+      // Supports 'bicubic' sampling only
+
+      // User shouldn't hit this as the check has been performed in ScalesValidation()
       if (dims.size() != 2 && dims.size() != 4) {
         std::ostringstream oss;
-        oss << "'Linear' mode only support 2-D inputs ('Bilinear') or 4-D inputs "
+        oss << "'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs "
                "with the corresponding outermost 2 scale values being 1 in the ";
         oss << (is_resize_ ? "Resize operator" : "Upsample operator");
         return Status(ONNXRUNTIME, FAIL, oss.str());
       }
-
-      bool is_2D = dims.size() == 2;
-      const int64_t batch_size = is_2D ? 1 : dims[0];
-      const int64_t num_channels = is_2D ? 1 : dims[1];
-      const int64_t input_height = is_2D ? dims[0] : dims[2];
-      const int64_t input_width = is_2D ? dims[1] : dims[3];
-      const int64_t output_height = is_2D ? output_dims[0] : output_dims[2];
-      const int64_t output_width = is_2D ? output_dims[1] : output_dims[3];
-
-      AllocatorPtr alloc;
-      ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
-      UpsampleBilinear(batch_size, num_channels, input_height, input_width, output_height, output_width,
-                       is_2D ? scales[0] : scales[2], is_2D ? scales[1] : scales[3], roi,
-                       use_extrapolation_, extrapolation_value_, X->template Data<T>(),
-                       Y->template MutableData<T>(), alloc, get_original_coordinate_);
-      return Status::OK();
-    }
-    case UpsampleMode::CUBIC: {
       bool is_2D = dims.size() == 2;
       const int64_t batch_size = is_2D ? 1 : dims[0];
       const int64_t num_channels = is_2D ? 1 : dims[1];
diff --git a/onnxruntime/core/providers/cpu/tensor/upsample.h b/onnxruntime/core/providers/cpu/tensor/upsample.h
index 18c50d7208..a2b1cd7d7c 100644
--- a/onnxruntime/core/providers/cpu/tensor/upsample.h
+++ b/onnxruntime/core/providers/cpu/tensor/upsample.h
@@ -276,15 +276,26 @@ class UpsampleBase {
       }
     }
 
-    if (UpsampleMode::LINEAR == mode || UpsampleMode::CUBIC == mode) {
+    if (UpsampleMode::LINEAR == mode) {
+      ORT_ENFORCE(scales.size() == 2 ||
+                      (scales.size() == 4 && scales[0] == 1 && scales[1] == 1) ||
+                      scales.size() == 3 ||
+                      (scales.size() == 5 && scales[0] == 1 && scales[1] == 1),
+                  "'Linear' mode only support 2-D inputs or 3-D inputs ('Bilinear', 'Trilinear') "
+                  "or 4-D inputs or 5-D inputs with the corresponding outermost 2 scale values being 1 in the ",
+                  is_resize_ ? "Resize operator" : "Upsample operator");
+    }
+
+    else if (UpsampleMode::CUBIC == mode) {
       ORT_ENFORCE(scales.size() == 2 || (scales.size() == 4 && scales[0] == 1 && scales[1] == 1),
-                  "'Linear' mode and 'Cubic' mode only support 2-D inputs ('Bilinear', 'Bicubic') or 4-D inputs "
+                  "'Cubic' mode only support 2-D inputs ('Bicubic') or 4-D inputs "
                   "with the corresponding outermost 2 scale values being 1 in the ",
                   is_resize_ ? "Resize operator" : "Upsample operator");
     }
   }
 
-  void ParseScalesData(const Tensor* scale, std::vector<float>& scales) const {
+  void
+  ParseScalesData(const Tensor* scale, std::vector<float>& scales) const {
     const auto* scale_data = scale->template Data<float>();
     int64_t scales_size = scale->Shape().Size();
     ORT_ENFORCE(scales_size > 0, "scales size should be greater than 0.");
diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
index c3521d1644..8039e5ed5f 100644
--- a/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.cu
@@ -191,7 +191,7 @@ __global__ void _ResizeNearestMappingKernel(
     const TArray<int64_t> input_shape,
     const TArray<int64_t> output_shape,
     const TArray<float> scales,
-    const TArray<float> roi,
+    const TArray<float, 10> roi,
     const size_t total_dim_sum,
     bool extrapolation_enabled,
     CudaFunctionOriginalCoordinate transform_coordinate,
@@ -276,7 +276,7 @@ __global__ void _ResizeNearestKernel(
   output_data[id] = extrapolation_occured ? extrapolation_value : input_data[input_index];
 }
 
-struct BilinearMappingInfo {
+struct LinearMappingInfo {
   int origin_;
   float weight_;
   int extrapolate_;
@@ -291,7 +291,7 @@ __global__ void _ResizeBilinearCoordinateMapping(
     float roi_width_start, float roi_width_end,
     const size_t SumHW, bool extrapolation_enabled,
     CudaFunctionOriginalCoordinate transform_coordinate,
-    BilinearMappingInfo* dims_mapping) {
+    LinearMappingInfo* dims_mapping) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumHW);
   if (id < output_height) {  //  y = id
     float input_y = scale_height == 1 ? static_cast<float>(id) :
@@ -316,7 +316,7 @@ __global__ void _ResizeBilinearCoordinateMapping(
   }
 }
 
-// The following method supports a N-D input in 'Linear mode'. Last two dimension is [H, W].
+// The following method supports a 2-D or 4-D input in 'Linear mode'. Last two dimension is [H, W].
 // the scale values for the outer dimensions except last two are 1.
 template <typename T>
 __global__ void _ResizeBilinearKernel(
@@ -325,7 +325,7 @@ __global__ void _ResizeBilinearKernel(
     fast_divmod div_output_width, fast_divmod div_output_image,
     const T* input_data, T* output_data, const size_t N,
     const T extrapolation_value,
-    BilinearMappingInfo* dims_mapping) {
+    LinearMappingInfo* dims_mapping) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
   int bxc, output_image_index;
   div_output_image.divmod(id, bxc, output_image_index);
@@ -359,6 +359,124 @@ __global__ void _ResizeBilinearKernel(
       x11 * static_cast<T>(y_offset_0 * x_offset_0);
 }
 
+template <typename T>
+__global__ void _ResizeTrilinearCoordinateMapping(
+    int64_t input_depth, int64_t input_height, int64_t input_width,
+    int64_t output_depth, int64_t output_height, int64_t output_width,
+    float scale_depth, float scale_height, float scale_width,
+    float roi_depth_start, float roi_depth_end,
+    float roi_height_start, float roi_height_end,
+    float roi_width_start, float roi_width_end,
+    const size_t SumDHW, bool extrapolation_enabled,
+    CudaFunctionOriginalCoordinate transform_coordinate,
+    LinearMappingInfo* dims_mapping) {
+  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, SumDHW);
+  if (id < output_depth) {  //  z = id
+    float input_z = scale_depth == 1 ? static_cast<float>(id)  :
+                                       transform_coordinate(static_cast<float>(id), scale_depth,
+                                       static_cast<float>(output_depth), static_cast<float>(input_depth),
+                                       roi_depth_start, roi_depth_end);
+    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_z < 0 || input_z > static_cast<float>(input_depth - 1)));
+    input_z = max(0.0f, min(input_z, static_cast<float>(input_depth - 1)));
+    int z_int = static_cast<int>(input_z);
+    dims_mapping[id].origin_ = z_int;
+    dims_mapping[id].weight_ = (z_int >= input_depth - 1) ? 0.5f : input_z - z_int;
+  } else if (id >= output_depth && id < (output_depth + output_height)) {  //  y = id - output_depth
+    float input_y = scale_height == 1 ? static_cast<float>(id - output_depth) : 
+                                        transform_coordinate(static_cast<float>(id - output_depth), scale_height, 
+                                        static_cast<float>(output_height), static_cast<float>(input_height), 
+                                        roi_height_start, roi_height_end);
+
+    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_y < 0 || input_y > static_cast<float>(input_height - 1)));
+    input_y = max(0.0f, min(input_y, static_cast<float>(input_height - 1)));
+    int y_int = static_cast<int>(input_y);
+    dims_mapping[id].origin_ = y_int;
+    dims_mapping[id].weight_ = (y_int >= input_height - 1) ? 0.5f : input_y - y_int;
+  } else {  //x = id - output_depth - output_height
+    float input_x = scale_width == 1 ? static_cast<float>(id - output_depth - output_height) :
+                                       transform_coordinate(static_cast<float>(id - output_depth - output_height), scale_width,
+                                       static_cast<float>(output_width), static_cast<float>(input_width),
+                                       roi_width_start, roi_width_end);
+    dims_mapping[id].extrapolate_ = (int)(extrapolation_enabled && (input_x < 0 || input_x > static_cast<float>(input_width - 1)));
+    input_x = max(0.0f, min(input_x, static_cast<float>(input_width - 1)));
+    int x_int = static_cast<int>(input_x);
+    dims_mapping[id].origin_ = x_int;
+    dims_mapping[id].weight_ = (x_int >= input_width - 1) ? 0.5f : input_x - x_int;
+  }
+}
+
+// The following method supports a 3-D or 5-D input in 'Linear mode'. Last two dimension is [D, sH, W].
+// the scale values for the outer dimensions except last two are 1.
+template <typename T>
+__global__ void _ResizeTrilinearKernel(
+    int64_t input_depth, int64_t input_height, int64_t input_width,
+    int64_t output_depth, int64_t output_height, int64_t output_width,
+    fast_divmod div_output_height, fast_divmod div_output_width, fast_divmod div_output_image,
+    const T* input_data, T* output_data, const size_t N,
+    const T extrapolation_value,
+    LinearMappingInfo* dims_mapping) {
+  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
+  int bxc, output_image_index;
+  div_output_image.divmod(id, bxc, output_image_index);
+  CUDA_LONG input_index = bxc * input_depth * input_height * input_width;
+  int output_z, output_y, output_x, temp;
+  
+  div_output_height.divmod(output_image_index, output_z, temp);
+  div_output_width.divmod(temp, output_y, output_x);
+
+  if (dims_mapping[output_z].extrapolate_ || 
+      dims_mapping[output_y + output_depth].extrapolate_ || 
+      dims_mapping[output_x + output_depth + output_height].extrapolate_) {
+    output_data[id] = extrapolation_value;
+    return;
+  }
+
+  float z_offset_0 = dims_mapping[output_z].weight_;
+  int z_int = dims_mapping[output_z].origin_;
+
+  float y_offset_0 = dims_mapping[output_y + output_depth].weight_;
+  int y_int = dims_mapping[output_y + output_depth].origin_;
+
+  float x_offset_0 = dims_mapping[output_x + output_depth + output_height].weight_;
+  int x_int = dims_mapping[output_x + output_depth + output_height].origin_;
+  
+  input_index += z_int * input_height * input_width + y_int * input_width + x_int;
+
+  T x000 = input_data[input_index];
+
+  bool end_of_h = (y_int >= input_height - 1);
+  bool end_of_w = (x_int >= input_width - 1);
+
+  T x100 = end_of_w ? x000 : input_data[input_index + 1];
+  T x010 = end_of_h ? x000 : input_data[input_index + input_width];
+  T x110 = end_of_w ? x010 : (end_of_h ? x100 : input_data[input_index + input_width + 1]);
+
+  bool end_of_d = (z_int >= input_depth - 1);
+  if (!end_of_d) {
+    input_index = input_index + input_height * input_width;
+  }
+
+  T x001 = end_of_d ? x000 : input_data[input_index];
+
+  T x101 = end_of_w ? x001 : input_data[input_index + 1];
+  T x011 = end_of_h ? x001 : input_data[input_index + input_width];
+  T x111 = end_of_w ? x011 : (end_of_h ? x101 : input_data[input_index + input_width + 1]);
+
+  float z_offset_1 = 1.0f - z_offset_0;
+  float y_offset_1 = 1.0f - y_offset_0;
+  float x_offset_1 = 1.0f - x_offset_0;
+  output_data[id] =
+      x000 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_1) +
+      x010 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_1) +
+      x100 * static_cast<T>(z_offset_1 * y_offset_1 * x_offset_0) +
+      x110 * static_cast<T>(z_offset_1 * y_offset_0 * x_offset_0) +
+
+      x001 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_1) +
+      x011 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_1) +
+      x101 * static_cast<T>(z_offset_0 * y_offset_1 * x_offset_0) +
+      x111 * static_cast<T>(z_offset_0 * y_offset_0 * x_offset_0);
+}
+
 template <typename T>
 __device__ __forceinline__ float CubicInterpolationRowwise(
     const T* image, int x, int y, int input_height, int input_width,
@@ -464,7 +582,7 @@ size_t CalcResizeBufferSize(const onnxruntime::UpsampleMode upsample_mode,
     case UpsampleMode::NN:
       return sizeof(int64_t) * output_dims.size() + sizeof(NearestMappingInfo) * std::accumulate(output_dims.begin(), output_dims.end(), 0);
     case UpsampleMode::LINEAR:
-      return sizeof(BilinearMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0);
+      return sizeof(LinearMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0);
     case UpsampleMode::CUBIC:
       return sizeof(CubicMappingInfo) * std::accumulate(output_dims.rbegin(), output_dims.rbegin() + 2, 0);
   }
@@ -479,7 +597,7 @@ void ResizeNearestImpl(
     TArray<int64_t>& input_strides,
     TArray<fast_divmod>& output_div_pitches,
     TArray<float>& scales_vals,
-    TArray<float>& roi_vals,
+    TArray<float, 10>& roi_vals,
     const T* input_data,
     T* output_data,
     const size_t N,
@@ -556,7 +674,7 @@ void ResizeImpl(
     TArray<int64_t>& input_strides,
     TArray<fast_divmod>& output_div_pitches,
     TArray<float>& scales_vals,
-    TArray<float>& roi_vals,
+    TArray<float, 10>& roi_vals,
     const T* input_data,
     T* output_data,
     const size_t N,
@@ -587,45 +705,90 @@ void ResizeImpl(
     return;
   }
 
-  int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
-  fast_divmod div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
+  // We support a special case of bilinear or bicubic if the input data is 4D with the outer 2 scales being 1.0
+  // We would have validated the outer scale values by the time execution reaches this
+  bool is_2D = (rank == 2 || rank == 4);
+
+  // We support a special case of trilinear or tricubic if the input data is 5D with the outer 2 scales being 1.0
+  // We would have validated the outer scale values by the time execution reaches this
+  bool is_3D = (rank == 3 || rank == 5);
+
+  // Should not hit this as we have already validated input rank/scales and we provide verbose error messages
+  // to the user.
+  ORT_ENFORCE(is_2D || is_3D, "Only bilinear/trilinear and bicubic modes are supported in Resize");
+
+  int blocksPerGrid = static_cast<int>(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
+  fast_divmod div_output_image;
+  if (is_2D) {
+    div_output_image = (rank > 2) ? output_div_pitches[rank - 3] : fast_divmod(gsl::narrow_cast<int>(N));
+  } else if (is_3D) {
+    div_output_image = (rank > 3) ? output_div_pitches[rank - 4] : fast_divmod(gsl::narrow_cast<int>(N));
+  }
+
+  int64_t output_depth = is_3D ? output_shape[rank - 3] : 0;
   int64_t output_height = output_shape[rank - 2];
   int64_t output_width = output_shape[rank - 1];
-  int blocksPerDimsMappingGrid = (int)(ceil((output_height + output_width) / 32.0));
+  int blocksPerDimsMappingGrid =
+      static_cast<int>(ceil((output_depth + output_height + output_width) / 32.0));
+
   switch (upsample_mode) {
     case UpsampleMode::LINEAR:
-      _ResizeBilinearCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
-          input_shape[rank - 2], input_shape[rank - 1],
-          output_height, output_width,
-          scales_vals[rank - 2], scales_vals[rank - 1],
-          roi_vals[rank - 2], roi_vals[rank - 2 + rank],
-          roi_vals[rank - 1], roi_vals[rank - 1 + rank],
-          output_height + output_width, extrapolation_enabled, transform_coordinate,
-          reinterpret_cast<BilinearMappingInfo*>(dims_mapping));
-      _ResizeBilinearKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
-          input_shape[rank - 2], input_shape[rank - 1],
-          output_height, output_width,
-          output_div_pitches[rank - 2], div_output_image,
-          input_data, output_data, N, extrapolation_value,
-          reinterpret_cast<BilinearMappingInfo*>(dims_mapping));
-      return;
+      if (is_2D) {
+        _ResizeBilinearCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
+            input_shape[rank - 2], input_shape[rank - 1],
+            output_height, output_width,
+            scales_vals[rank - 2], scales_vals[rank - 1],
+            roi_vals[rank - 2], roi_vals[rank - 2 + rank],
+            roi_vals[rank - 1], roi_vals[rank - 1 + rank],
+            output_height + output_width, extrapolation_enabled, transform_coordinate,
+            reinterpret_cast<LinearMappingInfo*>(dims_mapping));
+        _ResizeBilinearKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+            input_shape[rank - 2], input_shape[rank - 1],
+            output_height, output_width,
+            output_div_pitches[rank - 2], div_output_image,
+            input_data, output_data, N, extrapolation_value,
+            reinterpret_cast<LinearMappingInfo*>(dims_mapping));
+        return;
+      } else if (is_3D) {
+        _ResizeTrilinearCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
+            input_shape[rank - 3] , input_shape[rank - 2], input_shape[rank - 1],
+            output_depth, output_height, output_width,
+            scales_vals[rank - 3], scales_vals[rank - 2], scales_vals[rank - 1],
+            roi_vals[rank - 3], roi_vals[rank - 3 + rank],
+            roi_vals[rank - 2], roi_vals[rank - 2 + rank],
+            roi_vals[rank - 1], roi_vals[rank - 1 + rank],
+            output_depth + output_height + output_width, extrapolation_enabled, transform_coordinate,
+            reinterpret_cast<LinearMappingInfo*>(dims_mapping));
+        _ResizeTrilinearKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+            input_shape[rank - 3], input_shape[rank - 2], input_shape[rank - 1],
+            output_depth, output_height, output_width,
+            output_div_pitches[rank - 3], output_div_pitches[rank - 2], div_output_image,
+            input_data, output_data, N, extrapolation_value,
+            reinterpret_cast<LinearMappingInfo*>(dims_mapping));
+        return;
+      }
+
+      break;
+
     case UpsampleMode::CUBIC:
-      _ResizeCubicCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
-          input_shape[rank - 2], input_shape[rank - 1],
-          output_height, output_width,
-          scales_vals[rank - 2], scales_vals[rank - 1],
-          roi_vals[rank - 2], roi_vals[rank - 2 + rank],
-          roi_vals[rank - 1], roi_vals[rank - 1 + rank],
-          output_height + output_width, extrapolation_enabled,
-          cubic_coeff_a, exclude_outside, transform_coordinate,
-          reinterpret_cast<CubicMappingInfo*>(dims_mapping));
-      _ResizeBiCubicKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
-          input_shape[rank - 2], input_shape[rank - 1],
-          output_height, output_width,
-          output_div_pitches[rank - 2], div_output_image,
-          input_data, output_data, N, extrapolation_value,
-          reinterpret_cast<CubicMappingInfo*>(dims_mapping));
-      return;
+      if (is_2D) {
+        _ResizeCubicCoordinateMapping<T><<<blocksPerDimsMappingGrid, 32, 0>>>(
+            input_shape[rank - 2], input_shape[rank - 1],
+            output_height, output_width,
+            scales_vals[rank - 2], scales_vals[rank - 1],
+            roi_vals[rank - 2], roi_vals[rank - 2 + rank],
+            roi_vals[rank - 1], roi_vals[rank - 1 + rank],
+            output_height + output_width, extrapolation_enabled,
+            cubic_coeff_a, exclude_outside, transform_coordinate,
+            reinterpret_cast<CubicMappingInfo*>(dims_mapping));
+        _ResizeBiCubicKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+            input_shape[rank - 2], input_shape[rank - 1],
+            output_height, output_width,
+            output_div_pitches[rank - 2], div_output_image,
+            input_data, output_data, N, extrapolation_value,
+            reinterpret_cast<CubicMappingInfo*>(dims_mapping));
+        return;
+      }
   }
 }
 
@@ -638,7 +801,7 @@ void ResizeImpl(
       TArray<int64_t>& input_strides,                               \
       TArray<fast_divmod>& output_div_pitches,                      \
       TArray<float>& scales_vals,                                   \
-      TArray<float>& roi_vals,                                      \
+      TArray<float, 10>& roi_vals,                                  \
       const T* input_data,                                          \
       T* output_data,                                               \
       const size_t N,                                               \
diff --git a/onnxruntime/core/providers/cuda/tensor/resize_impl.h b/onnxruntime/core/providers/cuda/tensor/resize_impl.h
index ce38ecfaeb..c82616d644 100644
--- a/onnxruntime/core/providers/cuda/tensor/resize_impl.h
+++ b/onnxruntime/core/providers/cuda/tensor/resize_impl.h
@@ -23,7 +23,7 @@ void ResizeImpl(
     TArray<int64_t>& input_strides,
     TArray<fast_divmod>& output_div_pitches,
     TArray<float>& scales_vals,
-    TArray<float>& roi,
+    TArray<float, 10>& roi,
     const T* input_data,
     T* output_data,
     const size_t N,
diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc
index 9a3dbb3876..d0b249ae6b 100644
--- a/onnxruntime/core/providers/cuda/tensor/upsample.cc
+++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc
@@ -74,7 +74,7 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
   if (is_resize_) {
     TArray<int64_t> input_shape(X_dims);
     TArray<int64_t> output_shape(output_dims);
-    TArray<float> roi_vals(roi);
+    TArray<float, 10> roi_vals(roi);
     TArray<float> scales_vals(scales);
 
     size_t temp_buffer_size = CalcResizeBufferSize(mode_, output_dims);
diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
index 33a5d7f302..b19a5b3fc4 100644
--- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc
@@ -7,7 +7,7 @@
 
 namespace onnxruntime {
 namespace test {
-TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_tf_crop_and_resize) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) {
   OpTester test("Resize", 11);
   std::vector<float> roi{0.4f, 0.6f, 0.6f, 0.8f};
   std::vector<float> scales{};
@@ -65,7 +65,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize_with_extrapol
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) {
   OpTester test("Resize", 11);
   std::vector<float> roi{};
   std::vector<float> scales{1.0f, 1.0f, 0.6f, 0.6f};
@@ -87,7 +87,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_align_corners) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) {
   OpTester test("Resize", 11);
   std::vector<float> roi{};
   std::vector<float> scales{1.0f, 1.0f, 0.6f, 0.6f};
@@ -110,7 +110,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_align_corners) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_pytorch_half_pixel) {
   OpTester test("Resize", 11);
   std::vector<float> roi{};
   std::vector<float> scales{};
@@ -138,7 +138,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_pytorch_half_pixel)
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
 }
 
-TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) {
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric) {
   OpTester test("Resize", 11);
   std::vector<float> roi{};
   std::vector<float> scales{1.0f, 1.0f, 2.0f, 4.0f};
@@ -172,7 +172,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_asymmetric) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_align_corners) {
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) {
   OpTester test("Resize", 11);
   std::vector<float> roi{};
   std::vector<float> scales{2.0f, 4.0f};
@@ -197,7 +197,66 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_align_corners) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartScalesNoOpTest) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) {
+  OpTester test("Resize", 11);
+  std::vector<float> roi{};
+  std::vector<float> scales{};
+  std::vector<int64_t> sizes{1, 3, 1};
+
+  test.AddAttribute("mode", "linear");
+  test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel");
+
+  const int64_t D = 2, H = 4, W = 4;
+
+  std::vector<float> X = {
+      1.0f, 2.0f, 3.0f, 4.0f,
+      5.0f, 6.0f, 7.0f, 8.0f,
+      9.0f, 10.0f, 11.0f, 12.0f,
+      13.0f, 14.0f, 15.0f, 16.0f,
+
+      1.0f, 2.0f, 3.0f, 4.0f,
+      5.0f, 6.0f, 7.0f, 8.0f,
+      9.0f, 10.0f, 11.0f, 12.0f,
+      13.0f, 14.0f, 15.0f, 16.0f};
+
+  test.AddInput<float>("X", {D, H, W}, X);
+  test.AddInput<float>("roi", {0}, roi);
+  test.AddInput<float>("scales", {0}, scales);
+  test.AddInput<int64_t>("sizes", {3}, sizes);
+
+  std::vector<float> Y = {1.6666666f, 7.0f, 12.333333f};
+
+  test.AddOutput<float>("Y", {sizes[0], sizes[1], sizes[2]}, Y);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
+}
+
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_5DTrilinear_pytorch_half_pixel) {
+  OpTester test("Resize", 11);
+  std::vector<float> roi{};
+  std::vector<float> scales{1.0f, 1.0f, 2.0f, 2.0f, 1.0f};
+
+  test.AddAttribute("mode", "linear");
+  test.AddAttribute("coordinate_transformation_mode", "pytorch_half_pixel");
+
+  const int64_t N = 1, C = 2, D = 2, H = 1, W = 2;
+
+  std::vector<float> X = {
+      1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
+
+  test.AddInput<float>("X", {N, C, D, H, W}, X);
+  test.AddInput<float>("roi", {0}, roi);
+  test.AddInput<float>("scales", {5}, scales);
+
+  std::vector<float> Y = {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f,
+                          1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f,
+                          1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f,
+                          1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f};
+
+  test.AddOutput<float>("Y", {1, 2, 4, 2, 2}, Y);
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // TensorRT: results mismatch
+}
+
+TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) {
   OpTester test("Resize", 11);
   std::vector<float> roi{};
   std::vector<float> scales{1.0f, 1.0f, 1.0f, 1.0f};
@@ -856,7 +915,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_Ver10) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) {
   OpTester test("Resize", 10);
   std::vector<float> scales{1.0f, 1.0f, 0.6f, 0.6f};
 
@@ -876,7 +935,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_4DBilinear_Ver10) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_Ver10) {
+TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) {
   OpTester test("Resize", 10);
   std::vector<float> scales{0.6f, 0.6f};
 
@@ -896,7 +955,7 @@ TEST(ResizeOpTest, ResizeOpLineartDownSampleTest_2DBilinear_Ver10) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_Ver10) {
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) {
   OpTester test("Resize", 10);
   std::vector<float> scales{1.0f, 1.0f, 2.0f, 4.0f};
   test.AddAttribute("mode", "linear");
@@ -926,7 +985,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_4DBilinear_Ver10) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_Ver10) {
+TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) {
   OpTester test("Resize", 10);
   std::vector<float> scales{2.0f, 4.0f};
   test.AddAttribute("mode", "linear");
@@ -948,7 +1007,7 @@ TEST(ResizeOpTest, ResizeOpLineartUpSampleTest_2DBilinear_Ver10) {
   test.Run();
 }
 
-TEST(ResizeOpTest, ResizeOpLineartScalesNoOpTest_Ver10) {
+TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) {
   OpTester test("Resize", 10);
   std::vector<float> scales{1.0f, 1.0f, 1.0f, 1.0f};
   test.AddAttribute("mode", "linear");