ResizeGrad CUDA/ROCM kernel implementation (#17772)

This commit is contained in:
Baiju Meswani 2023-10-20 11:39:57 -07:00 committed by GitHub
parent cc7e8cc21f
commit a43c57f59d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 605 additions and 25 deletions

View file

@ -230,7 +230,6 @@ class SymbolicShapeInference:
"upsample_nearest1d": self._infer_aten_upsample,
"upsample_nearest2d": self._infer_aten_upsample,
"upsample_nearest3d": self._infer_aten_upsample,
"upsample_bilinear2d": self._infer_aten_upsample,
}
self.run_ = True
self.suggested_merge_ = {}

View file

@ -2147,5 +2147,13 @@ IMPLEMENT_GRADIENT_BUILDER(GetScaledSumGradient) {
ORT_THROW("ScaledSum gradient builder does not support ", input_count, " inputs");
}
IMPLEMENT_GRADIENT_BUILDER(GetResizeGradient) {
return std::vector<NodeDef>{
NodeDef(OpDef{"ResizeGrad", kMSDomain, 1},
{GO(0), I(0), I(1), I(2)},
{GI(0)},
SrcNodeAttributes())};
}
} // namespace training
} // namespace onnxruntime

View file

@ -90,6 +90,7 @@ DECLARE_GRADIENT_BUILDER(GetGRUGradient)
DECLARE_GRADIENT_BUILDER(GetReciprocalGradient)
DECLARE_GRADIENT_BUILDER(GetLeakyReluGradient)
DECLARE_GRADIENT_BUILDER(GetConvTransposeGradient)
DECLARE_GRADIENT_BUILDER(GetResizeGradient)
DECLARE_GRADIENT_BUILDER(GetExternalGradient)

View file

@ -122,6 +122,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() {
REGISTER_GRADIENT_BUILDER("Reciprocal", GetReciprocalGradient);
REGISTER_GRADIENT_BUILDER("LeakyRelu", GetLeakyReluGradient);
REGISTER_GRADIENT_BUILDER("ConvTranspose", GetConvTransposeGradient);
REGISTER_GRADIENT_BUILDER("Resize", GetResizeGradient);
REGISTER_GRADIENT_BUILDER("ExternalGradient", GetExternalGradient);
};

View file

@ -5001,6 +5001,26 @@ Return true if all elements are true and false otherwise.
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.");
ONNX_CONTRIB_OPERATOR_SCHEMA(ResizeGrad)
.SetDomain(kMSDomain)
.SinceVersion(1)
.Input(0, "dY", "Gradient of output Y.", "T")
.Input(1, "X", "Input tensor to the Resize operator.", "T")
.Input(2, "roi", "The roi input to the Resize operator.", "T", OpSchema::Optional)
.Input(3, "scales", "The scales input to the Resize operator.", "tensor(float)", OpSchema::Optional)
.Output(0, "dX", "Gradient of the input X.", "T")
.AllowUncheckedAttributes()
.TypeConstraint(
"T",
{"tensor(float16)", "tensor(float)", "tensor(double)"},
"Constrain input and output types to float tensors.")
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
propagateElemTypeFromInputToOutput(ctx, 1, 0);
if (hasInputShape(ctx, 1)) {
propagateShapeFromInputToOutput(ctx, 1, 0);
}
});
}
} // namespace training

View file

@ -271,8 +271,3 @@ def upsample_nearest2d_gradient():
@register_gradient("org.pytorch.aten", "ATen", "upsample_nearest3d", "vec")
def upsample_nearest3d_gradient():
return _upsample_gradient("upsample_nearest3d_backward", 3)
@register_gradient("org.pytorch.aten", "ATen", "upsample_bilinear2d", "vec")
def upsample_bilinear2d_gradient():
return _upsample_gradient("upsample_bilinear2d_backward", 2)

View file

@ -808,16 +808,3 @@ def upsample_nearest2d(g, input, output_size, scale_factors):
@register_symbolic("upsample_nearest3d")
def upsample_nearest3d(g, input, output_size, scale_factors):
return _upsample_nearest(g, input, output_size, scale_factors, "upsample_nearest3d")
@register_symbolic("upsample_bilinear2d")
def upsample_bilinear2d(g, input, output_size, align_corners, scale_factors):
return g.op(
"org.pytorch.aten::ATen",
input,
output_size,
align_corners,
scale_factors,
operator_s="upsample_bilinear2d",
overload_name_s="vec",
)

View file

@ -3298,6 +3298,41 @@ TEST(GradientCheckerTest, ConvTransposeGrad) {
execution_providers.push_back(DefaultCudaExecutionProvider());
ConvTransposeGradientCheckerTest(&execution_providers);
}
// TODO: Enable test for ROCM
TEST(GradientCheckerTest, ResizeGrad) {
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
const std::vector<ONNX_NAMESPACE::AttributeProto> attributes = {
MakeAttribute("coordinate_transformation_mode", "half_pixel"),
MakeAttribute("cubic_coeff_a", -0.75f),
MakeAttribute("exclude_outside", static_cast<int64_t>(0)),
MakeAttribute("extrapolation_value", 0.0f),
MakeAttribute("mode", "linear"),
MakeAttribute("nearest_mode", "floor")};
float max_error;
GradientChecker<float, float, float> gradient_checker;
OpDef op_def{"Resize", kOnnxDomain, 18};
TensorInfo x_info({1, 2, 4, 4}, true);
TensorInfo roi_info({4}, false, nullptr, DataTypeImpl::GetTensorType<float>());
TensorInfo scales_info({4}, false, nullptr, DataTypeImpl::GetTensorType<float>());
TensorInfo y_info({1, 2, 8, 8}, true);
std::vector<std::vector<float>> x_datas = {{0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f,
0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f,
0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f,
0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f},
{1.0f, 1.0f, 1.0f, 1.0f},
{1.0f, 1.0f, 2.0f, 2.0f}};
ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, roi_info, scales_info},
{y_info}, &max_error, x_datas, attributes, true, false, &execution_providers));
EXPECT_IS_TINY(max_error);
}
#endif // USE_CUDA
} // namespace test

View file

@ -1773,13 +1773,17 @@ def test_aten_upsample_nearest(input_rank, use_factor):
_test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad)
def test_aten_upsample_bilinear():
@pytest.mark.parametrize("interpolate_size_scale", ({"size": (8, 12)}, {"scale_factor": 4.7}))
@pytest.mark.parametrize("align_corners", (True, False))
def test_resize_grad_correctness_bilinear_2d(interpolate_size_scale, align_corners):
class _NeuralNetUpsampleBilinear(torch.nn.Module):
def __init__(self):
super().__init__()
def forward(self, input):
return torch.nn.functional.interpolate(input, size=(8, 12), mode="bilinear")
return torch.nn.functional.interpolate(
input, align_corners=align_corners, mode="bilinear", **interpolate_size_scale
)
device = "cuda"
pt_model = _NeuralNetUpsampleBilinear().to(device)

View file

@ -0,0 +1,227 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "test/providers/compare_provider_test_utils.h"
#include "test/providers/provider_test_utils.h"
#include "test/util/include/default_providers.h"
namespace onnxruntime::test {
#if defined(USE_CUDA) || defined(USE_ROCM)
namespace {
void AddResizeGradAttributes(OpTester& test, const std::string& coordinate_transformation_mode) {
test.AddAttribute<std::string>("mode", "linear");
test.AddAttribute<std::string>("coordinate_transformation_mode", coordinate_transformation_mode);
}
} // namespace
TEST(ResizeGradTest, ResizeGradWithSizes) {
std::vector<std::unique_ptr<IExecutionProvider>> providers;
#ifdef USE_CUDA
providers.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
providers.emplace_back(DefaultRocmExecutionProvider());
#endif
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
AddResizeGradAttributes(test, "half_pixel");
std::vector<float> dY(128, 1.0f);
std::vector<int64_t> dY_shape = {1, 2, 8, 8};
std::vector<float> X(32, 1.0f);
std::vector<int64_t> X_shape = {1, 2, 4, 4};
std::vector<float> dX(32, 4.0f);
std::vector<int64_t> dX_shape = X_shape;
test.AddInput<float>("dY", dY_shape, dY);
test.AddInput<float>("X", X_shape, X);
test.AddOutput<float>("dX", dX_shape, dX);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
}
TEST(ResizeGradTest, ResizeGradWithSizesHalf) {
std::vector<std::unique_ptr<IExecutionProvider>> providers;
#ifdef USE_CUDA
providers.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
providers.emplace_back(DefaultRocmExecutionProvider());
#endif
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
AddResizeGradAttributes(test, "half_pixel");
std::vector<float> dY(128, 1.0f);
std::vector<MLFloat16> dY_half(dY.size());
ConvertFloatToMLFloat16(dY.data(), dY_half.data(), static_cast<int>(dY.size()));
std::vector<int64_t> dY_shape = {1, 2, 8, 8};
std::vector<float> X(32, 1.0f);
std::vector<MLFloat16> X_half(X.size());
ConvertFloatToMLFloat16(X.data(), X_half.data(), static_cast<int>(X.size()));
std::vector<int64_t> X_shape = {1, 2, 4, 4};
std::vector<float> dX(32, 4.0f);
std::vector<MLFloat16> dX_half(dX.size());
ConvertFloatToMLFloat16(dX.data(), dX_half.data(), static_cast<int>(dX.size()));
std::vector<int64_t> dX_shape = X_shape;
test.AddInput<MLFloat16>("dY", dY_shape, dY_half);
test.AddInput<MLFloat16>("X", X_shape, X_half);
test.AddOutput<MLFloat16>("dX", dX_shape, dX_half);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
}
TEST(ResizeGradTest, ResizeGradWithSizesAndAlignCorners) {
std::vector<std::unique_ptr<IExecutionProvider>> providers;
#ifdef USE_CUDA
providers.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
providers.emplace_back(DefaultRocmExecutionProvider());
#endif
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
AddResizeGradAttributes(test, "align_corners");
std::vector<float> dY(128, 1.0f);
std::vector<int64_t> dY_shape = {1, 2, 8, 8};
std::vector<float> X(32, 1.0f);
std::vector<int64_t> X_shape = {1, 2, 4, 4};
std::vector<float> dX({2.9388f, 3.9184f, 3.9184f, 2.9388f, 3.9184f, 5.2245f, 5.2245f, 3.9184f,
3.9184f, 5.2245f, 5.2245f, 3.9184f, 2.9388f, 3.9184f, 3.9184f, 2.9388f,
2.9388f, 3.9184f, 3.9184f, 2.9388f, 3.9184f, 5.2245f, 5.2245f, 3.9184f,
3.9184f, 5.2245f, 5.2245f, 3.9184f, 2.9388f, 3.9184f, 3.9184f, 2.9388f});
std::vector<int64_t> dX_shape = X_shape;
test.AddInput<float>("dY", dY_shape, dY);
test.AddInput<float>("X", X_shape, X);
test.AddOutput<float>("dX", dX_shape, dX);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
}
TEST(ResizeGradTest, ResizeGradWithScales) {
std::vector<std::unique_ptr<IExecutionProvider>> providers;
#ifdef USE_CUDA
providers.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
providers.emplace_back(DefaultRocmExecutionProvider());
#endif
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
AddResizeGradAttributes(test, "half_pixel");
std::vector<float> dY(72, 1.0f);
std::vector<int64_t> dY_shape = {1, 2, 6, 6};
std::vector<float> X(32, 1.0f);
std::vector<int64_t> X_shape = {1, 2, 4, 4};
std::vector<float> dX({2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f,
2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f});
std::vector<int64_t> dX_shape = X_shape;
test.AddInput<float>("dY", dY_shape, dY);
test.AddInput<float>("X", X_shape, X);
test.AddInput<float>("", {0}, {});
test.AddInput<float>("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f});
test.AddOutput<float>("dX", dX_shape, dX);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
}
TEST(ResizeGradTest, ResizeGradWithScalesHalf) {
std::vector<std::unique_ptr<IExecutionProvider>> providers;
#ifdef USE_CUDA
providers.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
providers.emplace_back(DefaultRocmExecutionProvider());
#endif
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
AddResizeGradAttributes(test, "half_pixel");
std::vector<float> dY(72, 1.0f);
std::vector<MLFloat16> dY_half(dY.size());
ConvertFloatToMLFloat16(dY.data(), dY_half.data(), static_cast<int>(dY.size()));
std::vector<int64_t> dY_shape = {1, 2, 6, 6};
std::vector<float> X(32, 1.0f);
std::vector<MLFloat16> X_half(X.size());
ConvertFloatToMLFloat16(X.data(), X_half.data(), static_cast<int>(X.size()));
std::vector<int64_t> X_shape = {1, 2, 4, 4};
std::vector<float> dX({2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f,
2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f});
std::vector<MLFloat16> dX_half(dX.size());
ConvertFloatToMLFloat16(dX.data(), dX_half.data(), static_cast<int>(dX.size()));
std::vector<int64_t> dX_shape = X_shape;
test.AddInput<MLFloat16>("dY", dY_shape, dY_half);
test.AddInput<MLFloat16>("X", X_shape, X_half);
test.AddInput<float>("", {0}, {});
test.AddInput<float>("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f});
test.AddOutput<MLFloat16>("dX", dX_shape, dX_half);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
}
TEST(ResizeGradTest, ResizeGradWithScalesAndAlignCorners) {
std::vector<std::unique_ptr<IExecutionProvider>> providers;
#ifdef USE_CUDA
providers.emplace_back(DefaultCudaExecutionProvider());
#elif USE_ROCM
providers.emplace_back(DefaultRocmExecutionProvider());
#endif
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
AddResizeGradAttributes(test, "align_corners");
std::vector<float> dY(72, 1.0f);
std::vector<int64_t> dY_shape = {1, 2, 6, 6};
std::vector<float> X(32, 1.0f);
std::vector<int64_t> X_shape = {1, 2, 4, 4};
std::vector<float> dX({1.9600f, 2.2400f, 2.2400f, 1.9600f, 2.2400f, 2.5600f, 2.5600f, 2.2400f,
2.2400f, 2.5600f, 2.5600f, 2.2400f, 1.9600f, 2.2400f, 2.2400f, 1.9600f,
1.9600f, 2.2400f, 2.2400f, 1.9600f, 2.2400f, 2.5600f, 2.5600f, 2.2400f,
2.2400f, 2.5600f, 2.5600f, 2.2400f, 1.9600f, 2.2400f, 2.2400f, 1.9600f});
std::vector<int64_t> dX_shape = X_shape;
test.AddInput<float>("dY", dY_shape, dY);
test.AddInput<float>("X", X_shape, X);
test.AddInput<float>("", {0}, {});
test.AddInput<float>("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f});
test.AddOutput<float>("dX", dX_shape, dX);
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
}
#endif // defined(USE_CUDA) || defined(USE_ROCM)
} // namespace onnxruntime::test

View file

@ -207,6 +207,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, ResizeGrad);
// the kernels within the following ifdef are not included in a build with
// --enable_training_ops but without --enable_training
@ -453,13 +456,14 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, InplaceClipGradNorm)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(
kCudaExecutionProvider, kMSDomain, 1, float, FakeQuant)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(
kCudaExecutionProvider, kMSDomain, 1, float, FakeQuantGrad)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FakeQuant)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FakeQuantGrad)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, ResizeGrad)>,
// the kernels within the following ifdef are not included in a build with
// --enable_training_ops but without --enable_training
#ifdef ENABLE_TRAINING

View file

@ -0,0 +1,81 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include <memory>
#include <utility>
#include "orttraining/training_ops/cuda/tensor/resize_grad.h"
#include "orttraining/training_ops/cuda/tensor/resize_grad_impl.h"
namespace onnxruntime::cuda {
#define REGISTER_RESIZEGRAD_KERNEL_TYPED(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
ResizeGrad, \
kMSDomain, \
1, \
T, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.InputMemoryType(OrtMemTypeCPUInput, 2) /* Keep roi on CPU */ \
.InputMemoryType(OrtMemTypeCPUInput, 3) /* Keep scales on CPU */ \
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
ResizeGrad<T>);
REGISTER_RESIZEGRAD_KERNEL_TYPED(MLFloat16)
REGISTER_RESIZEGRAD_KERNEL_TYPED(float)
REGISTER_RESIZEGRAD_KERNEL_TYPED(double)
template <typename T>
Status ResizeGrad<T>::ComputeInternal(OpKernelContext* context) const {
typedef typename ToCudaType<T>::MappedType CudaT;
const Tensor* dY = context->Input<Tensor>(0);
const Tensor* X = context->Input<Tensor>(1);
const Tensor* scales = context->Input<Tensor>(3);
ORT_ENFORCE(X->Shape().NumDimensions() == 4, "Expected input tensor to have 4 dimensions. Actual: ",
X->Shape().NumDimensions());
const auto get_scales_from_input = [](const Tensor* scales) {
if (nullptr == scales) {
return std::make_pair(std::optional<float>{}, std::optional<float>{});
}
ORT_ENFORCE(scales->Shape().Size() == 4, "There must be a scale for each dimension.");
const auto* scales_data = scales->Data<float>();
return std::make_pair(std::optional<float>{scales_data[2]}, std::optional<float>{scales_data[3]});
};
std::pair<std::optional<float>, std::optional<float>> scale_factors = get_scales_from_input(scales);
Tensor* dX = context->Output(0, X->Shape());
const int64_t batch_size = X->Shape()[0];
const int64_t num_channels = X->Shape()[1];
const int64_t output_height = dY->Shape()[2];
const int64_t output_width = dY->Shape()[3];
const int64_t input_height = X->Shape()[2];
const int64_t input_width = X->Shape()[3];
if (dX->Shape() == dY->Shape()) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dX->MutableDataRaw(), dY->DataRaw(), dY->SizeInBytes(), cudaMemcpyDeviceToDevice));
return Status::OK();
}
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream(context)));
const bool align_corners = coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS;
const CudaT* dy_data = reinterpret_cast<const CudaT*>(dY->Data<T>());
CudaT* dx_data = reinterpret_cast<CudaT*>(dX->MutableData<T>());
ResizeGradImpl(Stream(context), input_height, input_width, output_height,
output_width, batch_size, num_channels, align_corners,
scale_factors.first, scale_factors.second,
dy_data, dx_data);
return Status::OK();
}
} // namespace onnxruntime::cuda

View file

@ -0,0 +1,41 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <string>
#include "core/common/common.h"
#include "core/providers/cuda/cuda_kernel.h"
#include "core/providers/cpu/tensor/upsamplebase.h"
namespace onnxruntime::cuda {
template <typename T>
class ResizeGrad final : public UpsampleBase, public CudaKernel {
public:
ResizeGrad(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) {
ORT_ENFORCE(!antialias_, "Antialiasing is not supported in ResizeGrad yet.");
ORT_ENFORCE(axes_.empty(), "ReizeGrad does not support the `axes` attribute yet.");
std::string coordinate_transform_mode =
info.GetAttrOrDefault<std::string>("coordinate_transformation_mode", "half_pixel");
coordinate_transform_mode_ = StringToCoordinateTransformationMode(coordinate_transform_mode);
ORT_ENFORCE(coordinate_transform_mode_ == ResizeCoordinateTransformationMode::HALF_PIXEL ||
coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS,
"ReizeGrad only supports the `HALF_PIXEL` and `ALIGN_CORNERS` coordinate_transform_mode ",
coordinate_transform_mode, " is not supported yet.");
ORT_ENFORCE(keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH,
"ReizeGrad only supports the `STRETCH` policy.");
std::string mode;
ORT_ENFORCE(info.GetAttr<std::string>("mode", &mode).IsOK());
ORT_ENFORCE((UpsampleMode::LINEAR == mode_),
"ReizeGrad only supports the `LINEAR` mode. ", mode, " mode is not supported yet.");
}
Status ComputeInternal(OpKernelContext* context) const override;
};
} // namespace onnxruntime::cuda

View file

@ -0,0 +1,151 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
// Contents of this file are derived from the pytorch cuda implementation of
// the upsample_bilinear2d_backward implementation at:
// https://github.com/pytorch/pytorch/blob/ce50132748f652ed6079c3db8008a6817594dbae/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu
#include "orttraining/training_ops/cuda/tensor/resize_grad_impl.h"
#include "core/providers/cuda/cu_inc/common.cuh"
#include "core/providers/cuda/atomic/common.cuh"
namespace onnxruntime::cuda {
namespace {
constexpr int NumThreadsPerBlock = GridDim::maxThreadsPerBlock;
} // namespace
__device__ __forceinline__ size_t
idx(const size_t nc,
const size_t height,
const size_t width,
const size_t h,
const size_t w) {
return (nc * height + h) * width + w;
}
template <typename T>
__device__ __forceinline__ static T AreaPixelComputeSourceIndex(
T scale,
int dst_index,
bool align_corners,
bool cubic) {
if (align_corners) {
return scale * dst_index;
} else {
T src_idx = scale * (dst_index + static_cast<T>(0.5)) -
static_cast<T>(0.5);
return (!cubic && src_idx < static_cast<T>(0))
? static_cast<T>(0)
: src_idx;
}
}
template <typename T, typename AccT>
__global__ void UpsampleGrad(const int64_t nc, const int64_t input_height,
const int64_t input_width, const int64_t output_height,
const int64_t output_width, const AccT rheight,
const AccT rwidth, const bool align_corners,
const T* dY_data, T* dX_data) {
const size_t dy_numel = nc * output_width * output_height;
const size_t dx_numel = nc * input_width * input_height;
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x;
index < dy_numel;
index += blockDim.x * gridDim.x) {
size_t index_temp = index;
const int w2 = index_temp % output_width; // 0:width2-1
index_temp /= output_width;
const int h2 = index_temp % output_height; // 0:height2-1
const size_t nc = index_temp / output_height;
const AccT h1r = AreaPixelComputeSourceIndex<AccT>(
rheight, h2, align_corners, /*cubic=*/false);
const int h1 = h1r;
const int h1p = (h1 < input_height - 1) ? 1 : 0;
const AccT h1lambda = h1r - h1;
const AccT h0lambda = static_cast<AccT>(1) - h1lambda;
const AccT w1r = AreaPixelComputeSourceIndex<AccT>(
rwidth, w2, align_corners, /*cubic=*/false);
const int w1 = w1r;
const int w1p = (w1 < input_width - 1) ? 1 : 0;
const AccT w1lambda = w1r - w1;
const AccT w0lambda = static_cast<AccT>(1) - w1lambda;
const T d2val = dY_data[index];
AtomicAdd(
dX_data,
idx(nc, input_height, input_width, h1, w1),
dx_numel,
static_cast<T>(h0lambda * w0lambda) * d2val);
AtomicAdd(
dX_data,
idx(nc, input_height, input_width, h1, w1 + w1p),
dx_numel,
static_cast<T>(h0lambda * w1lambda) * d2val);
AtomicAdd(
dX_data,
idx(nc, input_height, input_width, h1 + h1p, w1),
dx_numel,
static_cast<T>(h1lambda * w0lambda) * d2val);
AtomicAdd(
dX_data,
idx(nc, input_height, input_width, h1 + h1p, w1 + w1p),
dx_numel,
static_cast<T>(h1lambda * w1lambda) * d2val);
}
}
template <typename T>
T AreaPixelComputeScale(int64_t input_size, int64_t output_size, bool align_corners,
const std::optional<float>& scale) {
if (align_corners) {
if (output_size <= 1) {
return T{0};
}
return static_cast<T>(input_size - 1) / static_cast<T>(output_size - 1);
} else {
if (scale.has_value()) {
return static_cast<T>(T{1.0} / *scale);
} else {
return static_cast<T>(input_size) / static_cast<T>(output_size);
}
}
}
template <typename T>
void ResizeGradImpl(cudaStream_t stream, int64_t input_height,
int64_t input_width, int64_t output_height,
int64_t output_width, int64_t batch_size,
int64_t channels, bool align_corners,
const std::optional<float>& scale_height,
const std::optional<float>& scale_width,
const T* dY_data, T* dX_data) {
float rheight = AreaPixelComputeScale<float>(input_height, output_height, align_corners, scale_height);
float rwidth = AreaPixelComputeScale<float>(input_width, output_width, align_corners, scale_width);
const size_t output_numel = batch_size * channels * output_height * output_width;
int blocks_per_grid = (int)(ceil(static_cast<float>(output_numel) / NumThreadsPerBlock));
UpsampleGrad<T><<<blocks_per_grid, NumThreadsPerBlock, 0, stream>>>(
batch_size * channels, input_height, input_width, output_height, output_width,
rheight, rwidth, align_corners, dY_data, dX_data);
}
#define SPECIALIZED_RESIZEGRAD_IMPL(T) \
template void ResizeGradImpl<T>(cudaStream_t stream, int64_t input_height, \
int64_t input_width, int64_t output_height, \
int64_t output_width, int64_t batch_size, \
int64_t channels, bool align_corners, \
const std::optional<float>& scale_height, \
const std::optional<float>& scale_width, \
const T* dY_data, T* dX_data);
SPECIALIZED_RESIZEGRAD_IMPL(half)
SPECIALIZED_RESIZEGRAD_IMPL(float)
SPECIALIZED_RESIZEGRAD_IMPL(double)
#undef SPECIALIZED_RESIZEGRAD_IMPL
} // namespace onnxruntime::cuda

View file

@ -0,0 +1,20 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include <optional>
namespace onnxruntime::cuda {
template <typename T>
void ResizeGradImpl(cudaStream_t stream, int64_t input_height,
int64_t input_width, int64_t output_height,
int64_t output_width, int64_t batch_size,
int64_t channels, bool align_corners,
const std::optional<float>& scale_height,
const std::optional<float>& scale_width,
const T* dY_data, T* dX_data);
} // namespace onnxruntime::cuda

View file

@ -187,6 +187,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2);
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad);
#if defined(ORT_USE_NCCL) || defined(USE_MPI)
// P2P communication operators.
@ -387,6 +390,9 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2)>,
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad)>,
// P2P communication operators.
#if defined(ORT_USE_NCCL) || defined(USE_MPI)