mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-05 04:17:53 +00:00
ResizeGrad CUDA/ROCM kernel implementation (#17772)
This commit is contained in:
parent
cc7e8cc21f
commit
a43c57f59d
16 changed files with 605 additions and 25 deletions
|
|
@ -230,7 +230,6 @@ class SymbolicShapeInference:
|
|||
"upsample_nearest1d": self._infer_aten_upsample,
|
||||
"upsample_nearest2d": self._infer_aten_upsample,
|
||||
"upsample_nearest3d": self._infer_aten_upsample,
|
||||
"upsample_bilinear2d": self._infer_aten_upsample,
|
||||
}
|
||||
self.run_ = True
|
||||
self.suggested_merge_ = {}
|
||||
|
|
|
|||
|
|
@ -2147,5 +2147,13 @@ IMPLEMENT_GRADIENT_BUILDER(GetScaledSumGradient) {
|
|||
ORT_THROW("ScaledSum gradient builder does not support ", input_count, " inputs");
|
||||
}
|
||||
|
||||
IMPLEMENT_GRADIENT_BUILDER(GetResizeGradient) {
|
||||
return std::vector<NodeDef>{
|
||||
NodeDef(OpDef{"ResizeGrad", kMSDomain, 1},
|
||||
{GO(0), I(0), I(1), I(2)},
|
||||
{GI(0)},
|
||||
SrcNodeAttributes())};
|
||||
}
|
||||
|
||||
} // namespace training
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -90,6 +90,7 @@ DECLARE_GRADIENT_BUILDER(GetGRUGradient)
|
|||
DECLARE_GRADIENT_BUILDER(GetReciprocalGradient)
|
||||
DECLARE_GRADIENT_BUILDER(GetLeakyReluGradient)
|
||||
DECLARE_GRADIENT_BUILDER(GetConvTransposeGradient)
|
||||
DECLARE_GRADIENT_BUILDER(GetResizeGradient)
|
||||
|
||||
DECLARE_GRADIENT_BUILDER(GetExternalGradient)
|
||||
|
||||
|
|
|
|||
|
|
@ -122,6 +122,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() {
|
|||
REGISTER_GRADIENT_BUILDER("Reciprocal", GetReciprocalGradient);
|
||||
REGISTER_GRADIENT_BUILDER("LeakyRelu", GetLeakyReluGradient);
|
||||
REGISTER_GRADIENT_BUILDER("ConvTranspose", GetConvTransposeGradient);
|
||||
REGISTER_GRADIENT_BUILDER("Resize", GetResizeGradient);
|
||||
|
||||
REGISTER_GRADIENT_BUILDER("ExternalGradient", GetExternalGradient);
|
||||
};
|
||||
|
|
|
|||
|
|
@ -5001,6 +5001,26 @@ Return true if all elements are true and false otherwise.
|
|||
"T",
|
||||
{"tensor(float16)", "tensor(float)", "tensor(double)"},
|
||||
"Constrain input and output types to float tensors.");
|
||||
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(ResizeGrad)
|
||||
.SetDomain(kMSDomain)
|
||||
.SinceVersion(1)
|
||||
.Input(0, "dY", "Gradient of output Y.", "T")
|
||||
.Input(1, "X", "Input tensor to the Resize operator.", "T")
|
||||
.Input(2, "roi", "The roi input to the Resize operator.", "T", OpSchema::Optional)
|
||||
.Input(3, "scales", "The scales input to the Resize operator.", "tensor(float)", OpSchema::Optional)
|
||||
.Output(0, "dX", "Gradient of the input X.", "T")
|
||||
.AllowUncheckedAttributes()
|
||||
.TypeConstraint(
|
||||
"T",
|
||||
{"tensor(float16)", "tensor(float)", "tensor(double)"},
|
||||
"Constrain input and output types to float tensors.")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromInputToOutput(ctx, 1, 0);
|
||||
if (hasInputShape(ctx, 1)) {
|
||||
propagateShapeFromInputToOutput(ctx, 1, 0);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace training
|
||||
|
|
|
|||
|
|
@ -271,8 +271,3 @@ def upsample_nearest2d_gradient():
|
|||
@register_gradient("org.pytorch.aten", "ATen", "upsample_nearest3d", "vec")
|
||||
def upsample_nearest3d_gradient():
|
||||
return _upsample_gradient("upsample_nearest3d_backward", 3)
|
||||
|
||||
|
||||
@register_gradient("org.pytorch.aten", "ATen", "upsample_bilinear2d", "vec")
|
||||
def upsample_bilinear2d_gradient():
|
||||
return _upsample_gradient("upsample_bilinear2d_backward", 2)
|
||||
|
|
|
|||
|
|
@ -808,16 +808,3 @@ def upsample_nearest2d(g, input, output_size, scale_factors):
|
|||
@register_symbolic("upsample_nearest3d")
|
||||
def upsample_nearest3d(g, input, output_size, scale_factors):
|
||||
return _upsample_nearest(g, input, output_size, scale_factors, "upsample_nearest3d")
|
||||
|
||||
|
||||
@register_symbolic("upsample_bilinear2d")
|
||||
def upsample_bilinear2d(g, input, output_size, align_corners, scale_factors):
|
||||
return g.op(
|
||||
"org.pytorch.aten::ATen",
|
||||
input,
|
||||
output_size,
|
||||
align_corners,
|
||||
scale_factors,
|
||||
operator_s="upsample_bilinear2d",
|
||||
overload_name_s="vec",
|
||||
)
|
||||
|
|
|
|||
|
|
@ -3298,6 +3298,41 @@ TEST(GradientCheckerTest, ConvTransposeGrad) {
|
|||
execution_providers.push_back(DefaultCudaExecutionProvider());
|
||||
ConvTransposeGradientCheckerTest(&execution_providers);
|
||||
}
|
||||
|
||||
// TODO: Enable test for ROCM
|
||||
TEST(GradientCheckerTest, ResizeGrad) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
|
||||
execution_providers.push_back(DefaultCudaExecutionProvider());
|
||||
const std::vector<ONNX_NAMESPACE::AttributeProto> attributes = {
|
||||
MakeAttribute("coordinate_transformation_mode", "half_pixel"),
|
||||
MakeAttribute("cubic_coeff_a", -0.75f),
|
||||
MakeAttribute("exclude_outside", static_cast<int64_t>(0)),
|
||||
MakeAttribute("extrapolation_value", 0.0f),
|
||||
MakeAttribute("mode", "linear"),
|
||||
MakeAttribute("nearest_mode", "floor")};
|
||||
|
||||
float max_error;
|
||||
GradientChecker<float, float, float> gradient_checker;
|
||||
OpDef op_def{"Resize", kOnnxDomain, 18};
|
||||
|
||||
TensorInfo x_info({1, 2, 4, 4}, true);
|
||||
TensorInfo roi_info({4}, false, nullptr, DataTypeImpl::GetTensorType<float>());
|
||||
TensorInfo scales_info({4}, false, nullptr, DataTypeImpl::GetTensorType<float>());
|
||||
|
||||
TensorInfo y_info({1, 2, 8, 8}, true);
|
||||
|
||||
std::vector<std::vector<float>> x_datas = {{0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f,
|
||||
0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f,
|
||||
0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f,
|
||||
0.2f, 0.4f, 0.6f, 0.8f, 0.2f, 0.4f, 0.6f, 0.8f},
|
||||
{1.0f, 1.0f, 1.0f, 1.0f},
|
||||
{1.0f, 1.0f, 2.0f, 2.0f}};
|
||||
|
||||
ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def, {x_info, roi_info, scales_info},
|
||||
{y_info}, &max_error, x_datas, attributes, true, false, &execution_providers));
|
||||
EXPECT_IS_TINY(max_error);
|
||||
}
|
||||
|
||||
#endif // USE_CUDA
|
||||
|
||||
} // namespace test
|
||||
|
|
|
|||
|
|
@ -1773,13 +1773,17 @@ def test_aten_upsample_nearest(input_rank, use_factor):
|
|||
_test_helpers.assert_values_are_close(ort_input.grad, pt_input.grad)
|
||||
|
||||
|
||||
def test_aten_upsample_bilinear():
|
||||
@pytest.mark.parametrize("interpolate_size_scale", ({"size": (8, 12)}, {"scale_factor": 4.7}))
|
||||
@pytest.mark.parametrize("align_corners", (True, False))
|
||||
def test_resize_grad_correctness_bilinear_2d(interpolate_size_scale, align_corners):
|
||||
class _NeuralNetUpsampleBilinear(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def forward(self, input):
|
||||
return torch.nn.functional.interpolate(input, size=(8, 12), mode="bilinear")
|
||||
return torch.nn.functional.interpolate(
|
||||
input, align_corners=align_corners, mode="bilinear", **interpolate_size_scale
|
||||
)
|
||||
|
||||
device = "cuda"
|
||||
pt_model = _NeuralNetUpsampleBilinear().to(device)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,227 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "test/providers/compare_provider_test_utils.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
#include "test/util/include/default_providers.h"
|
||||
|
||||
namespace onnxruntime::test {
|
||||
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM)
|
||||
|
||||
namespace {
|
||||
|
||||
void AddResizeGradAttributes(OpTester& test, const std::string& coordinate_transformation_mode) {
|
||||
test.AddAttribute<std::string>("mode", "linear");
|
||||
test.AddAttribute<std::string>("coordinate_transformation_mode", coordinate_transformation_mode);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
TEST(ResizeGradTest, ResizeGradWithSizes) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> providers;
|
||||
#ifdef USE_CUDA
|
||||
providers.emplace_back(DefaultCudaExecutionProvider());
|
||||
#elif USE_ROCM
|
||||
providers.emplace_back(DefaultRocmExecutionProvider());
|
||||
#endif
|
||||
|
||||
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
|
||||
|
||||
AddResizeGradAttributes(test, "half_pixel");
|
||||
|
||||
std::vector<float> dY(128, 1.0f);
|
||||
std::vector<int64_t> dY_shape = {1, 2, 8, 8};
|
||||
|
||||
std::vector<float> X(32, 1.0f);
|
||||
std::vector<int64_t> X_shape = {1, 2, 4, 4};
|
||||
|
||||
std::vector<float> dX(32, 4.0f);
|
||||
std::vector<int64_t> dX_shape = X_shape;
|
||||
|
||||
test.AddInput<float>("dY", dY_shape, dY);
|
||||
test.AddInput<float>("X", X_shape, X);
|
||||
|
||||
test.AddOutput<float>("dX", dX_shape, dX);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
|
||||
}
|
||||
|
||||
TEST(ResizeGradTest, ResizeGradWithSizesHalf) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> providers;
|
||||
#ifdef USE_CUDA
|
||||
providers.emplace_back(DefaultCudaExecutionProvider());
|
||||
#elif USE_ROCM
|
||||
providers.emplace_back(DefaultRocmExecutionProvider());
|
||||
#endif
|
||||
|
||||
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
|
||||
|
||||
AddResizeGradAttributes(test, "half_pixel");
|
||||
|
||||
std::vector<float> dY(128, 1.0f);
|
||||
std::vector<MLFloat16> dY_half(dY.size());
|
||||
ConvertFloatToMLFloat16(dY.data(), dY_half.data(), static_cast<int>(dY.size()));
|
||||
std::vector<int64_t> dY_shape = {1, 2, 8, 8};
|
||||
|
||||
std::vector<float> X(32, 1.0f);
|
||||
std::vector<MLFloat16> X_half(X.size());
|
||||
ConvertFloatToMLFloat16(X.data(), X_half.data(), static_cast<int>(X.size()));
|
||||
std::vector<int64_t> X_shape = {1, 2, 4, 4};
|
||||
|
||||
std::vector<float> dX(32, 4.0f);
|
||||
std::vector<MLFloat16> dX_half(dX.size());
|
||||
ConvertFloatToMLFloat16(dX.data(), dX_half.data(), static_cast<int>(dX.size()));
|
||||
std::vector<int64_t> dX_shape = X_shape;
|
||||
|
||||
test.AddInput<MLFloat16>("dY", dY_shape, dY_half);
|
||||
test.AddInput<MLFloat16>("X", X_shape, X_half);
|
||||
|
||||
test.AddOutput<MLFloat16>("dX", dX_shape, dX_half);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
|
||||
}
|
||||
|
||||
TEST(ResizeGradTest, ResizeGradWithSizesAndAlignCorners) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> providers;
|
||||
#ifdef USE_CUDA
|
||||
providers.emplace_back(DefaultCudaExecutionProvider());
|
||||
#elif USE_ROCM
|
||||
providers.emplace_back(DefaultRocmExecutionProvider());
|
||||
#endif
|
||||
|
||||
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
|
||||
|
||||
AddResizeGradAttributes(test, "align_corners");
|
||||
|
||||
std::vector<float> dY(128, 1.0f);
|
||||
std::vector<int64_t> dY_shape = {1, 2, 8, 8};
|
||||
|
||||
std::vector<float> X(32, 1.0f);
|
||||
std::vector<int64_t> X_shape = {1, 2, 4, 4};
|
||||
|
||||
std::vector<float> dX({2.9388f, 3.9184f, 3.9184f, 2.9388f, 3.9184f, 5.2245f, 5.2245f, 3.9184f,
|
||||
3.9184f, 5.2245f, 5.2245f, 3.9184f, 2.9388f, 3.9184f, 3.9184f, 2.9388f,
|
||||
2.9388f, 3.9184f, 3.9184f, 2.9388f, 3.9184f, 5.2245f, 5.2245f, 3.9184f,
|
||||
3.9184f, 5.2245f, 5.2245f, 3.9184f, 2.9388f, 3.9184f, 3.9184f, 2.9388f});
|
||||
std::vector<int64_t> dX_shape = X_shape;
|
||||
|
||||
test.AddInput<float>("dY", dY_shape, dY);
|
||||
test.AddInput<float>("X", X_shape, X);
|
||||
|
||||
test.AddOutput<float>("dX", dX_shape, dX);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
|
||||
}
|
||||
|
||||
TEST(ResizeGradTest, ResizeGradWithScales) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> providers;
|
||||
#ifdef USE_CUDA
|
||||
providers.emplace_back(DefaultCudaExecutionProvider());
|
||||
#elif USE_ROCM
|
||||
providers.emplace_back(DefaultRocmExecutionProvider());
|
||||
#endif
|
||||
|
||||
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
|
||||
|
||||
AddResizeGradAttributes(test, "half_pixel");
|
||||
|
||||
std::vector<float> dY(72, 1.0f);
|
||||
std::vector<int64_t> dY_shape = {1, 2, 6, 6};
|
||||
|
||||
std::vector<float> X(32, 1.0f);
|
||||
std::vector<int64_t> X_shape = {1, 2, 4, 4};
|
||||
|
||||
std::vector<float> dX({2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
|
||||
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f,
|
||||
2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
|
||||
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f});
|
||||
std::vector<int64_t> dX_shape = X_shape;
|
||||
|
||||
test.AddInput<float>("dY", dY_shape, dY);
|
||||
test.AddInput<float>("X", X_shape, X);
|
||||
test.AddInput<float>("", {0}, {});
|
||||
test.AddInput<float>("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f});
|
||||
|
||||
test.AddOutput<float>("dX", dX_shape, dX);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
|
||||
}
|
||||
|
||||
TEST(ResizeGradTest, ResizeGradWithScalesHalf) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> providers;
|
||||
#ifdef USE_CUDA
|
||||
providers.emplace_back(DefaultCudaExecutionProvider());
|
||||
#elif USE_ROCM
|
||||
providers.emplace_back(DefaultRocmExecutionProvider());
|
||||
#endif
|
||||
|
||||
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
|
||||
|
||||
AddResizeGradAttributes(test, "half_pixel");
|
||||
|
||||
std::vector<float> dY(72, 1.0f);
|
||||
std::vector<MLFloat16> dY_half(dY.size());
|
||||
ConvertFloatToMLFloat16(dY.data(), dY_half.data(), static_cast<int>(dY.size()));
|
||||
std::vector<int64_t> dY_shape = {1, 2, 6, 6};
|
||||
|
||||
std::vector<float> X(32, 1.0f);
|
||||
std::vector<MLFloat16> X_half(X.size());
|
||||
ConvertFloatToMLFloat16(X.data(), X_half.data(), static_cast<int>(X.size()));
|
||||
std::vector<int64_t> X_shape = {1, 2, 4, 4};
|
||||
|
||||
std::vector<float> dX({2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
|
||||
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f,
|
||||
2.7128f, 2.9550f, 2.7612f, 1.4533f, 2.9550f, 3.2189f, 3.0078f, 1.5830f,
|
||||
2.7612f, 3.0078f, 2.8106f, 1.4792f, 1.4533f, 1.5830f, 1.4792f, 0.7785f});
|
||||
std::vector<MLFloat16> dX_half(dX.size());
|
||||
ConvertFloatToMLFloat16(dX.data(), dX_half.data(), static_cast<int>(dX.size()));
|
||||
std::vector<int64_t> dX_shape = X_shape;
|
||||
|
||||
test.AddInput<MLFloat16>("dY", dY_shape, dY_half);
|
||||
test.AddInput<MLFloat16>("X", X_shape, X_half);
|
||||
test.AddInput<float>("", {0}, {});
|
||||
test.AddInput<float>("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f});
|
||||
|
||||
test.AddOutput<MLFloat16>("dX", dX_shape, dX_half);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
|
||||
}
|
||||
|
||||
TEST(ResizeGradTest, ResizeGradWithScalesAndAlignCorners) {
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> providers;
|
||||
#ifdef USE_CUDA
|
||||
providers.emplace_back(DefaultCudaExecutionProvider());
|
||||
#elif USE_ROCM
|
||||
providers.emplace_back(DefaultRocmExecutionProvider());
|
||||
#endif
|
||||
|
||||
OpTester test("ResizeGrad", 1, onnxruntime::kMSDomain);
|
||||
|
||||
AddResizeGradAttributes(test, "align_corners");
|
||||
|
||||
std::vector<float> dY(72, 1.0f);
|
||||
std::vector<int64_t> dY_shape = {1, 2, 6, 6};
|
||||
|
||||
std::vector<float> X(32, 1.0f);
|
||||
std::vector<int64_t> X_shape = {1, 2, 4, 4};
|
||||
|
||||
std::vector<float> dX({1.9600f, 2.2400f, 2.2400f, 1.9600f, 2.2400f, 2.5600f, 2.5600f, 2.2400f,
|
||||
2.2400f, 2.5600f, 2.5600f, 2.2400f, 1.9600f, 2.2400f, 2.2400f, 1.9600f,
|
||||
1.9600f, 2.2400f, 2.2400f, 1.9600f, 2.2400f, 2.5600f, 2.5600f, 2.2400f,
|
||||
2.2400f, 2.5600f, 2.5600f, 2.2400f, 1.9600f, 2.2400f, 2.2400f, 1.9600f});
|
||||
std::vector<int64_t> dX_shape = X_shape;
|
||||
|
||||
test.AddInput<float>("dY", dY_shape, dY);
|
||||
test.AddInput<float>("X", X_shape, X);
|
||||
test.AddInput<float>("", {0}, {});
|
||||
test.AddInput<float>("scales", {4}, {1.0f, 1.0f, 1.7f, 1.7f});
|
||||
|
||||
test.AddOutput<float>("dX", dX_shape, dX);
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &providers);
|
||||
}
|
||||
|
||||
#endif // defined(USE_CUDA) || defined(USE_ROCM)
|
||||
|
||||
} // namespace onnxruntime::test
|
||||
|
|
@ -207,6 +207,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
|
|||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, ResizeGrad);
|
||||
|
||||
// the kernels within the following ifdef are not included in a build with
|
||||
// --enable_training_ops but without --enable_training
|
||||
|
|
@ -453,13 +456,14 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {
|
|||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, InplaceClipGradNorm)>,
|
||||
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(
|
||||
kCudaExecutionProvider, kMSDomain, 1, float, FakeQuant)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(
|
||||
kCudaExecutionProvider, kMSDomain, 1, float, FakeQuantGrad)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FakeQuant)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, FakeQuantGrad)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, BatchScale)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, PadAndUnflatten)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, ScaledSum)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, float, ResizeGrad)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, double, ResizeGrad)>,
|
||||
// the kernels within the following ifdef are not included in a build with
|
||||
// --enable_training_ops but without --enable_training
|
||||
#ifdef ENABLE_TRAINING
|
||||
|
|
|
|||
|
|
@ -0,0 +1,81 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "orttraining/training_ops/cuda/tensor/resize_grad.h"
|
||||
#include "orttraining/training_ops/cuda/tensor/resize_grad_impl.h"
|
||||
|
||||
namespace onnxruntime::cuda {
|
||||
|
||||
#define REGISTER_RESIZEGRAD_KERNEL_TYPED(T) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
ResizeGrad, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T, \
|
||||
kCudaExecutionProvider, \
|
||||
(*KernelDefBuilder::Create()) \
|
||||
.InputMemoryType(OrtMemTypeCPUInput, 2) /* Keep roi on CPU */ \
|
||||
.InputMemoryType(OrtMemTypeCPUInput, 3) /* Keep scales on CPU */ \
|
||||
.TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
|
||||
ResizeGrad<T>);
|
||||
|
||||
REGISTER_RESIZEGRAD_KERNEL_TYPED(MLFloat16)
|
||||
REGISTER_RESIZEGRAD_KERNEL_TYPED(float)
|
||||
REGISTER_RESIZEGRAD_KERNEL_TYPED(double)
|
||||
|
||||
template <typename T>
|
||||
Status ResizeGrad<T>::ComputeInternal(OpKernelContext* context) const {
|
||||
typedef typename ToCudaType<T>::MappedType CudaT;
|
||||
|
||||
const Tensor* dY = context->Input<Tensor>(0);
|
||||
const Tensor* X = context->Input<Tensor>(1);
|
||||
const Tensor* scales = context->Input<Tensor>(3);
|
||||
|
||||
ORT_ENFORCE(X->Shape().NumDimensions() == 4, "Expected input tensor to have 4 dimensions. Actual: ",
|
||||
X->Shape().NumDimensions());
|
||||
|
||||
const auto get_scales_from_input = [](const Tensor* scales) {
|
||||
if (nullptr == scales) {
|
||||
return std::make_pair(std::optional<float>{}, std::optional<float>{});
|
||||
}
|
||||
|
||||
ORT_ENFORCE(scales->Shape().Size() == 4, "There must be a scale for each dimension.");
|
||||
|
||||
const auto* scales_data = scales->Data<float>();
|
||||
return std::make_pair(std::optional<float>{scales_data[2]}, std::optional<float>{scales_data[3]});
|
||||
};
|
||||
|
||||
std::pair<std::optional<float>, std::optional<float>> scale_factors = get_scales_from_input(scales);
|
||||
|
||||
Tensor* dX = context->Output(0, X->Shape());
|
||||
|
||||
const int64_t batch_size = X->Shape()[0];
|
||||
const int64_t num_channels = X->Shape()[1];
|
||||
const int64_t output_height = dY->Shape()[2];
|
||||
const int64_t output_width = dY->Shape()[3];
|
||||
const int64_t input_height = X->Shape()[2];
|
||||
const int64_t input_width = X->Shape()[3];
|
||||
|
||||
if (dX->Shape() == dY->Shape()) {
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dX->MutableDataRaw(), dY->DataRaw(), dY->SizeInBytes(), cudaMemcpyDeviceToDevice));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(dX->MutableDataRaw(), 0, dX->SizeInBytes(), Stream(context)));
|
||||
|
||||
const bool align_corners = coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS;
|
||||
const CudaT* dy_data = reinterpret_cast<const CudaT*>(dY->Data<T>());
|
||||
CudaT* dx_data = reinterpret_cast<CudaT*>(dX->MutableData<T>());
|
||||
|
||||
ResizeGradImpl(Stream(context), input_height, input_width, output_height,
|
||||
output_width, batch_size, num_channels, align_corners,
|
||||
scale_factors.first, scale_factors.second,
|
||||
dy_data, dx_data);
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace onnxruntime::cuda
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include "core/common/common.h"
|
||||
#include "core/providers/cuda/cuda_kernel.h"
|
||||
#include "core/providers/cpu/tensor/upsamplebase.h"
|
||||
|
||||
namespace onnxruntime::cuda {
|
||||
|
||||
template <typename T>
|
||||
class ResizeGrad final : public UpsampleBase, public CudaKernel {
|
||||
public:
|
||||
ResizeGrad(const OpKernelInfo& info) : UpsampleBase(info), CudaKernel(info) {
|
||||
ORT_ENFORCE(!antialias_, "Antialiasing is not supported in ResizeGrad yet.");
|
||||
|
||||
ORT_ENFORCE(axes_.empty(), "ReizeGrad does not support the `axes` attribute yet.");
|
||||
|
||||
std::string coordinate_transform_mode =
|
||||
info.GetAttrOrDefault<std::string>("coordinate_transformation_mode", "half_pixel");
|
||||
coordinate_transform_mode_ = StringToCoordinateTransformationMode(coordinate_transform_mode);
|
||||
ORT_ENFORCE(coordinate_transform_mode_ == ResizeCoordinateTransformationMode::HALF_PIXEL ||
|
||||
coordinate_transform_mode_ == ResizeCoordinateTransformationMode::ALIGN_CORNERS,
|
||||
"ReizeGrad only supports the `HALF_PIXEL` and `ALIGN_CORNERS` coordinate_transform_mode ",
|
||||
coordinate_transform_mode, " is not supported yet.");
|
||||
|
||||
ORT_ENFORCE(keep_aspect_ratio_policy_ == AspectRatioPolicy::STRETCH,
|
||||
"ReizeGrad only supports the `STRETCH` policy.");
|
||||
|
||||
std::string mode;
|
||||
ORT_ENFORCE(info.GetAttr<std::string>("mode", &mode).IsOK());
|
||||
ORT_ENFORCE((UpsampleMode::LINEAR == mode_),
|
||||
"ReizeGrad only supports the `LINEAR` mode. ", mode, " mode is not supported yet.");
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
};
|
||||
|
||||
} // namespace onnxruntime::cuda
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
// Contents of this file are derived from the pytorch cuda implementation of
|
||||
// the upsample_bilinear2d_backward implementation at:
|
||||
// https://github.com/pytorch/pytorch/blob/ce50132748f652ed6079c3db8008a6817594dbae/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu
|
||||
|
||||
#include "orttraining/training_ops/cuda/tensor/resize_grad_impl.h"
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "core/providers/cuda/atomic/common.cuh"
|
||||
|
||||
namespace onnxruntime::cuda {
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr int NumThreadsPerBlock = GridDim::maxThreadsPerBlock;
|
||||
|
||||
} // namespace
|
||||
|
||||
__device__ __forceinline__ size_t
|
||||
idx(const size_t nc,
|
||||
const size_t height,
|
||||
const size_t width,
|
||||
const size_t h,
|
||||
const size_t w) {
|
||||
return (nc * height + h) * width + w;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ static T AreaPixelComputeSourceIndex(
|
||||
T scale,
|
||||
int dst_index,
|
||||
bool align_corners,
|
||||
bool cubic) {
|
||||
if (align_corners) {
|
||||
return scale * dst_index;
|
||||
} else {
|
||||
T src_idx = scale * (dst_index + static_cast<T>(0.5)) -
|
||||
static_cast<T>(0.5);
|
||||
return (!cubic && src_idx < static_cast<T>(0))
|
||||
? static_cast<T>(0)
|
||||
: src_idx;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename AccT>
|
||||
__global__ void UpsampleGrad(const int64_t nc, const int64_t input_height,
|
||||
const int64_t input_width, const int64_t output_height,
|
||||
const int64_t output_width, const AccT rheight,
|
||||
const AccT rwidth, const bool align_corners,
|
||||
const T* dY_data, T* dX_data) {
|
||||
const size_t dy_numel = nc * output_width * output_height;
|
||||
const size_t dx_numel = nc * input_width * input_height;
|
||||
for (size_t index = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
index < dy_numel;
|
||||
index += blockDim.x * gridDim.x) {
|
||||
size_t index_temp = index;
|
||||
const int w2 = index_temp % output_width; // 0:width2-1
|
||||
index_temp /= output_width;
|
||||
const int h2 = index_temp % output_height; // 0:height2-1
|
||||
const size_t nc = index_temp / output_height;
|
||||
|
||||
const AccT h1r = AreaPixelComputeSourceIndex<AccT>(
|
||||
rheight, h2, align_corners, /*cubic=*/false);
|
||||
const int h1 = h1r;
|
||||
const int h1p = (h1 < input_height - 1) ? 1 : 0;
|
||||
const AccT h1lambda = h1r - h1;
|
||||
const AccT h0lambda = static_cast<AccT>(1) - h1lambda;
|
||||
|
||||
const AccT w1r = AreaPixelComputeSourceIndex<AccT>(
|
||||
rwidth, w2, align_corners, /*cubic=*/false);
|
||||
const int w1 = w1r;
|
||||
const int w1p = (w1 < input_width - 1) ? 1 : 0;
|
||||
const AccT w1lambda = w1r - w1;
|
||||
const AccT w0lambda = static_cast<AccT>(1) - w1lambda;
|
||||
|
||||
const T d2val = dY_data[index];
|
||||
AtomicAdd(
|
||||
dX_data,
|
||||
idx(nc, input_height, input_width, h1, w1),
|
||||
dx_numel,
|
||||
static_cast<T>(h0lambda * w0lambda) * d2val);
|
||||
AtomicAdd(
|
||||
dX_data,
|
||||
idx(nc, input_height, input_width, h1, w1 + w1p),
|
||||
dx_numel,
|
||||
static_cast<T>(h0lambda * w1lambda) * d2val);
|
||||
AtomicAdd(
|
||||
dX_data,
|
||||
idx(nc, input_height, input_width, h1 + h1p, w1),
|
||||
dx_numel,
|
||||
static_cast<T>(h1lambda * w0lambda) * d2val);
|
||||
AtomicAdd(
|
||||
dX_data,
|
||||
idx(nc, input_height, input_width, h1 + h1p, w1 + w1p),
|
||||
dx_numel,
|
||||
static_cast<T>(h1lambda * w1lambda) * d2val);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
T AreaPixelComputeScale(int64_t input_size, int64_t output_size, bool align_corners,
|
||||
const std::optional<float>& scale) {
|
||||
if (align_corners) {
|
||||
if (output_size <= 1) {
|
||||
return T{0};
|
||||
}
|
||||
return static_cast<T>(input_size - 1) / static_cast<T>(output_size - 1);
|
||||
} else {
|
||||
if (scale.has_value()) {
|
||||
return static_cast<T>(T{1.0} / *scale);
|
||||
} else {
|
||||
return static_cast<T>(input_size) / static_cast<T>(output_size);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ResizeGradImpl(cudaStream_t stream, int64_t input_height,
|
||||
int64_t input_width, int64_t output_height,
|
||||
int64_t output_width, int64_t batch_size,
|
||||
int64_t channels, bool align_corners,
|
||||
const std::optional<float>& scale_height,
|
||||
const std::optional<float>& scale_width,
|
||||
const T* dY_data, T* dX_data) {
|
||||
float rheight = AreaPixelComputeScale<float>(input_height, output_height, align_corners, scale_height);
|
||||
float rwidth = AreaPixelComputeScale<float>(input_width, output_width, align_corners, scale_width);
|
||||
|
||||
const size_t output_numel = batch_size * channels * output_height * output_width;
|
||||
int blocks_per_grid = (int)(ceil(static_cast<float>(output_numel) / NumThreadsPerBlock));
|
||||
UpsampleGrad<T><<<blocks_per_grid, NumThreadsPerBlock, 0, stream>>>(
|
||||
batch_size * channels, input_height, input_width, output_height, output_width,
|
||||
rheight, rwidth, align_corners, dY_data, dX_data);
|
||||
}
|
||||
|
||||
#define SPECIALIZED_RESIZEGRAD_IMPL(T) \
|
||||
template void ResizeGradImpl<T>(cudaStream_t stream, int64_t input_height, \
|
||||
int64_t input_width, int64_t output_height, \
|
||||
int64_t output_width, int64_t batch_size, \
|
||||
int64_t channels, bool align_corners, \
|
||||
const std::optional<float>& scale_height, \
|
||||
const std::optional<float>& scale_width, \
|
||||
const T* dY_data, T* dX_data);
|
||||
|
||||
SPECIALIZED_RESIZEGRAD_IMPL(half)
|
||||
SPECIALIZED_RESIZEGRAD_IMPL(float)
|
||||
SPECIALIZED_RESIZEGRAD_IMPL(double)
|
||||
|
||||
#undef SPECIALIZED_RESIZEGRAD_IMPL
|
||||
|
||||
} // namespace onnxruntime::cuda
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <optional>
|
||||
|
||||
namespace onnxruntime::cuda {
|
||||
|
||||
template <typename T>
|
||||
void ResizeGradImpl(cudaStream_t stream, int64_t input_height,
|
||||
int64_t input_width, int64_t output_height,
|
||||
int64_t output_width, int64_t batch_size,
|
||||
int64_t channels, bool align_corners,
|
||||
const std::optional<float>& scale_height,
|
||||
const std::optional<float>& scale_width,
|
||||
const T* dY_data, T* dX_data);
|
||||
|
||||
} // namespace onnxruntime::cuda
|
||||
|
|
@ -187,6 +187,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2);
|
||||
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad);
|
||||
|
||||
#if defined(ORT_USE_NCCL) || defined(USE_MPI)
|
||||
// P2P communication operators.
|
||||
|
|
@ -387,6 +390,9 @@ Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_BFloat16, ReduceAllL2)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16_BFloat16, ReduceAllL2)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, PadAndUnflatten)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, ResizeGrad)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, ResizeGrad)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, double, ResizeGrad)>,
|
||||
|
||||
// P2P communication operators.
|
||||
#if defined(ORT_USE_NCCL) || defined(USE_MPI)
|
||||
|
|
|
|||
Loading…
Reference in a new issue