From 17ccd6fa02877a1c8d3201344137b1ca105b681d Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Mon, 29 Aug 2022 10:47:19 -0700 Subject: [PATCH] Fix shape-related issues in FuseConv (#12410) * fix shape mismatch in FuseConv * remove zeroed bias * offset Z dim * append UT * add testing model * remove output * remove commented * fix comments * refactor output msg * narrowly restrict the use of cudnn...ActFwd * reset changes in cudnn_common * add test cases covering all path * move cases to conv test * remove extra space * fix build err Co-authored-by: Randy Shuai --- onnxruntime/contrib_ops/cuda/fused_conv.cc | 87 ++++++----- onnxruntime/core/providers/cuda/nn/conv.cc | 56 ++++--- onnxruntime/core/providers/cuda/nn/conv.h | 10 +- .../test/providers/cpu/nn/conv_op_test.cc | 139 ++++++++++++++++++ onnxruntime/test/testdata/fuse_conv_bias.onnx | 37 +++++ .../test/testdata/fuse_conv_bias_slice.onnx | 40 +++++ .../test/testdata/fuse_conv_no_bias.onnx | 32 ++++ 7 files changed, 340 insertions(+), 61 deletions(-) create mode 100644 onnxruntime/test/testdata/fuse_conv_bias.onnx create mode 100644 onnxruntime/test/testdata/fuse_conv_bias_slice.onnx create mode 100644 onnxruntime/test/testdata/fuse_conv_no_bias.onnx diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index 975a73d212..fa3b33166f 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -37,7 +37,7 @@ class FusedConv : public onnxruntime::cuda::Conv { Status ComputeInternal(OpKernelContext* context) const override { CUDNN_RETURN_IF_ERROR(status_); std::lock_guard lock(Base::s_.mutex); - ORT_RETURN_IF_ERROR(Base::UpdateState(context, true)); + ORT_RETURN_IF_ERROR(Base::UpdateState(context)); if (Base::s_.Y->Shape().Size() == 0) { return Status::OK(); } @@ -47,25 +47,27 @@ class FusedConv : public onnxruntime::cuda::Conv { const auto alpha = onnxruntime::cuda::Consts::One; const auto beta = onnxruntime::cuda::Consts::Zero; IAllocatorUniquePtr workspace = Base::GetWorkSpace(); - auto cudnn_status = cudnnConvolutionBiasActivationForward(Base::CudnnHandle(), - &alpha, - Base::s_.x_tensor, - Base::s_.x_data, - Base::s_.w_desc, - Base::s_.w_data, - Base::s_.conv_desc, - Base::s_.algo, - workspace.get(), - Base::s_.workspace_bytes, - has_z ? &alpha : &beta, - has_z ? Base::s_.z_tensor : Base::s_.y_tensor, - has_z ? Base::s_.z_data : Base::s_.y_data, - Base::s_.b_tensor, - has_b ? Base::s_.b_data : Base::s_.b_zero, - activation_desc_, - Base::s_.y_tensor, - Base::s_.y_data); - if (CUDNN_STATUS_SUCCESS != cudnn_status) { + + if (has_b && has_z && !Base::s_.post_slicing_required) { + CUDNN_RETURN_IF_ERROR(cudnnConvolutionBiasActivationForward(Base::CudnnHandle(), + &alpha, + Base::s_.x_tensor, + Base::s_.x_data, + Base::s_.w_desc, + Base::s_.w_data, + Base::s_.conv_desc, + Base::s_.algo, + workspace.get(), + Base::s_.workspace_bytes, + &alpha, + Base::s_.z_tensor, + Base::s_.z_data, + Base::s_.b_tensor, + Base::s_.b_data, + activation_desc_, + Base::s_.y_tensor, + Base::s_.y_data)); + } else { CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(Base::CudnnHandle(), &alpha, Base::s_.x_tensor, @@ -79,21 +81,38 @@ class FusedConv : public onnxruntime::cuda::Conv { &beta, Base::s_.y_tensor, Base::s_.y_data)); - if (has_b) { - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(Base::CudnnHandle(), &alpha, Base::s_.b_tensor, Base::s_.b_data, - &alpha, Base::s_.y_tensor, Base::s_.y_data)); + + if (Base::s_.post_slicing_required) { + ORT_RETURN_IF_ERROR(onnxruntime::cuda::SliceOutUnwantedOutputSection( + this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), + Base::s_.y_dims.GetDims(), Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size)); + + onnxruntime::cuda::CudnnTensor sliced_y_tensor; + ORT_RETURN_IF_ERROR(sliced_y_tensor.Set(Base::s_.y_dims.GetDims(), onnxruntime::cuda::CudnnTensor::GetDataType())); + + if (has_b) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(Base::CudnnHandle(), &alpha, Base::s_.b_tensor, Base::s_.b_data, + &alpha, sliced_y_tensor, Base::s_.Y->MutableDataRaw())); + } + if (has_z) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(Base::CudnnHandle(), &alpha, Base::s_.z_tensor, Base::s_.z_data, + &alpha, sliced_y_tensor, Base::s_.Y->MutableDataRaw())); + } + + CUDNN_RETURN_IF_ERROR(cudnnActivationForward(Base::CudnnHandle(), activation_desc_, &alpha, sliced_y_tensor, + Base::s_.y_data, &beta, sliced_y_tensor, Base::s_.y_data)); + } else { + if (has_b) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(Base::CudnnHandle(), &alpha, Base::s_.b_tensor, Base::s_.b_data, + &alpha, Base::s_.y_tensor, Base::s_.y_data)); + } + if (has_z) { + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(Base::CudnnHandle(), &alpha, Base::s_.z_tensor, Base::s_.z_data, + &alpha, Base::s_.y_tensor, Base::s_.y_data)); + } + CUDNN_RETURN_IF_ERROR(cudnnActivationForward(Base::CudnnHandle(), activation_desc_, &alpha, Base::s_.y_tensor, + Base::s_.y_data, &beta, Base::s_.y_tensor, Base::s_.y_data)); } - if (has_z) { - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(Base::CudnnHandle(), &alpha, Base::s_.z_tensor, Base::s_.z_data, - &alpha, Base::s_.y_tensor, Base::s_.y_data)); - } - CUDNN_RETURN_IF_ERROR(cudnnActivationForward(Base::CudnnHandle(), activation_desc_, &alpha, Base::s_.y_tensor, - Base::s_.y_data, &beta, Base::s_.y_tensor, Base::s_.y_data)); - } - if (Base::s_.post_slicing_required) { - ORT_RETURN_IF_ERROR(onnxruntime::cuda::SliceOutUnwantedOutputSection( - this->Stream(), Base::s_.y_data, Base::s_.y_dims_with_adjusted_pads, Base::s_.Y->MutableDataRaw(), - Base::s_.y_dims.GetDims(), Base::s_.slice_starts, Base::s_.slice_ends, Base::s_.slice_axes, Base::s_.element_size)); } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index fd0d15640f..8e2983dd40 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -87,7 +87,7 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream, } template -Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const { +Status Conv::UpdateState(OpKernelContext* context) const { //set X const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -109,8 +109,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const //set Z if (context->InputCount() >= 4) { const Tensor* Z = context->Input(3); - ORT_RETURN_IF_ERROR(s_.z_tensor.Set(Z->Shape().GetDims(), CudnnTensor::GetDataType())); - s_.z_data = reinterpret_cast(Z->Data()); + s_.z_data = reinterpret_cast(Z->template Data()); } else { s_.z_data = nullptr; } @@ -237,22 +236,43 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const if (context->InputCount() >= 3) { const Tensor* B = context->Input(2); const auto& b_shape = B->Shape(); - ORT_RETURN_IF_NOT(b_shape.NumDimensions() == 1, "bias should be 1D"); - TensorShapeVector b_dims(2 + kernel_shape.size(), 1); - b_dims[1] = b_shape[0]; - ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); - //s_.b_data = reinterpret_cast(B->Data()); - } else if (bias_expected) { - TensorShapeVector b_dims(2 + kernel_shape.size(), 1); - b_dims[1] = w_dims[0]; - auto malloc_size = b_dims[1] * sizeof(CudaT); - ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); - if (s_.b_zero) { - CUDA_CALL_THROW(cudaFree(s_.b_zero)); - s_.b_zero = nullptr; + if (b_shape.NumDimensions() == 1) { + TensorShapeVector b_dims(2 + kernel_shape.size(), 1); + b_dims[1] = b_shape[0]; + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType())); + } else { + const auto& y_rank = y_dims_cudnn.size(); + const auto& b_rank = b_shape.GetDims().size(); + ORT_RETURN_IF_NOT(b_rank <= y_rank, "rank of B is ", b_rank, ", which is bigger than the rank of Y - ", y_rank); + if (b_rank == y_rank) { + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_shape.GetDims(), CudnnTensor::GetDataType())); + } else { + TensorShapeVector b_extended_dims = b_shape.AsShapeVector(); + for (auto i = b_rank; i < y_rank; ++i) { + ORT_RETURN_IF_NOT(y_dims_cudnn[i] == 1, "dim ", i, " of Y is ", y_dims_cudnn[i], ", cannot apply it to that dim of B"); + b_extended_dims.push_back(1); + } + ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_extended_dims, CudnnTensor::GetDataType())); + } + } + } + + if (context->InputCount() >= 4) { + const Tensor* Z = context->Input(3); + const auto& z_shape = Z->Shape(); + const auto& z_rank = z_shape.GetDims().size(); + const auto& y_rank = y_dims_cudnn.size(); + ORT_RETURN_IF_NOT(z_rank <= y_rank, "rank of Z is ", z_rank, ", which is bigger than the rank of Y - ", y_rank); + if (z_rank == y_rank) { + ORT_RETURN_IF_ERROR(s_.z_tensor.Set(z_shape.GetDims(), CudnnTensor::GetDataType())); + } else { + TensorShapeVector z_extended_dims = z_shape.AsShapeVector(); + for (auto i = z_rank; i < y_rank; ++i) { + ORT_RETURN_IF_NOT(y_dims_cudnn[i] == 1, "dim ", i, " of Y is ", y_dims_cudnn[i], ", cannot apply it to that dim of Z"); + z_extended_dims.push_back(1); + } + ORT_RETURN_IF_ERROR(s_.z_tensor.Set(z_extended_dims, CudnnTensor::GetDataType())); } - CUDA_CALL_THROW(cudaMalloc(&s_.b_zero, malloc_size)); - CUDA_CALL_THROW(cudaMemsetAsync(s_.b_zero, 0, malloc_size, Stream())); } if (!s_.cached_benchmark_results.contains(x_dims_cudnn)) { diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index 135b189d4b..8227852552 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -141,7 +141,6 @@ struct CudnnConvState { const void* w_data = nullptr; CudnnTensor b_tensor; const void* b_data = nullptr; - void* b_zero = nullptr; CudnnTensor y_tensor; Tensor* Y = nullptr; void* y_data = nullptr; @@ -166,13 +165,6 @@ struct CudnnConvState { // note that conv objects are shared between execution frames, and a lock is needed to avoid multi-thread racing OrtMutex mutex; IAllocatorUniquePtr memory_for_cudnn_conv_results; - - ~CudnnConvState() { - if (b_zero) { - CUDA_CALL_THROW(cudaFree(b_zero)); - b_zero = nullptr; - } - } }; enum : size_t { @@ -197,7 +189,7 @@ class Conv : public CudaKernel { return GetScratchBuffer(s_.workspace_bytes); } - Status UpdateState(OpKernelContext* context, bool bias_expected = false) const; + Status UpdateState(OpKernelContext* context) const; ConvAttributes conv_attrs_; mutable CudnnConvState s_; constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 890bccd4f1..8cb79461b9 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -3,6 +3,9 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "core/session/inference_session.h" +#include "test/framework/test_utils.h" + using namespace std; namespace onnxruntime { namespace test { @@ -725,5 +728,141 @@ TEST(ConvTest, Conv_AutoPad_with_non_default_strides) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +#ifdef USE_CUDA +TEST(ConvTest, Fuse_Conv_Bias) { + auto model_uri = ORT_TSTR("testdata/fuse_conv_bias.onnx"); + SessionOptions so; + InferenceSession session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.Load(model_uri)); + ASSERT_TRUE(session.Initialize().IsOK()); + + NameMLValMap feeds; + OrtValue ml_value; + + size_t X_count = 1 * 3 * 32 * 32; + std::vector X_data(X_count, 1.f); + std::vector X_shape{1, 3, 32, 32}; + + size_t W_count = 1 * 3 * 5 * 32; + std::vector W_data(W_count, 2.f); + std::vector W_shape{1, 3, 5, 32}; + + size_t B_count = 1; + std::vector B_data(B_count, 5.f); + std::vector B_shape{1}; + + size_t Z_count = 1 * 1 * 28; + std::vector Z_data(Z_count, 1.f); + std::vector Z_shape{1, 1, 28}; + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), X_shape, X_data, &ml_value); + feeds.insert(std::make_pair("X", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), W_shape, W_data, &ml_value); + feeds.insert(std::make_pair("W", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), B_shape, B_data, &ml_value); + feeds.insert(std::make_pair("B", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), Z_shape, Z_data, &ml_value); + feeds.insert(std::make_pair("Z", ml_value)); + + std::vector output_names{"R"}; + std::vector fetches; + + onnxruntime::RunOptions run_options; + auto st = session.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(st.IsOK()) << st; + ASSERT_EQ(1u, fetches.size()); +} + +TEST(ConvTest, Fuse_Conv_Bias_Slice) { + auto model_uri = ORT_TSTR("testdata/fuse_conv_bias_slice.onnx"); + SessionOptions so; + InferenceSession session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.Load(model_uri)); + ASSERT_TRUE(session.Initialize().IsOK()); + + NameMLValMap feeds; + OrtValue ml_value; + + size_t X_count = 1 * 2 * 6 * 6; + std::vector X_data(X_count, 1.f); + std::vector X_shape{1, 2, 6, 6}; + + size_t W_count = 1 * 2 * 4 * 4; + std::vector W_data(W_count, 2.f); + std::vector W_shape{1, 2, 4, 4}; + + size_t B_count = 1; + std::vector B_data(B_count, 5.f); + std::vector B_shape{1}; + + size_t Z_count = 1 * 1 * 4 * 2; + std::vector Z_data(Z_count, 1.f); + std::vector Z_shape{1, 1, 4, 2}; + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), X_shape, X_data, &ml_value); + feeds.insert(std::make_pair("X", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), W_shape, W_data, &ml_value); + feeds.insert(std::make_pair("W", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), B_shape, B_data, &ml_value); + feeds.insert(std::make_pair("B", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), Z_shape, Z_data, &ml_value); + feeds.insert(std::make_pair("Z", ml_value)); + + std::vector output_names{"R"}; + std::vector fetches; + + onnxruntime::RunOptions run_options; + auto st = session.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(st.IsOK()) << st; + ASSERT_EQ(1u, fetches.size()); +} + +TEST(ConvTest, Fuse_Conv_No_Bias) { + auto model_uri = ORT_TSTR("testdata/fuse_conv_no_bias.onnx"); + SessionOptions so; + InferenceSession session{so, GetEnvironment()}; + ASSERT_STATUS_OK(session.Load(model_uri)); + ASSERT_TRUE(session.Initialize().IsOK()); + + NameMLValMap feeds; + OrtValue ml_value; + + size_t X_count = 1 * 3 * 32 * 32; + std::vector X_data(X_count, 1.f); + std::vector X_shape{1, 3, 32, 32}; + + size_t W_count = 1 * 3 * 5 * 32; + std::vector W_data(W_count, 2.f); + std::vector W_shape{1, 3, 5, 32}; + + size_t Z_count = 1 * 1 * 28; + std::vector Z_data(Z_count, 1.f); + std::vector Z_shape{1, 1, 28}; + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), X_shape, X_data, &ml_value); + feeds.insert(std::make_pair("X", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), W_shape, W_data, &ml_value); + feeds.insert(std::make_pair("W", ml_value)); + + CreateMLValue(TestCPUExecutionProvider()->GetAllocator(0, OrtMemTypeDefault), Z_shape, Z_data, &ml_value); + feeds.insert(std::make_pair("Z", ml_value)); + + std::vector output_names{"R"}; + std::vector fetches; + + onnxruntime::RunOptions run_options; + auto st = session.Run(run_options, feeds, output_names, &fetches); + ASSERT_TRUE(st.IsOK()) << st; + ASSERT_EQ(1u, fetches.size()); +} +#endif + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/testdata/fuse_conv_bias.onnx b/onnxruntime/test/testdata/fuse_conv_bias.onnx new file mode 100644 index 0000000000..e6cf15c34c --- /dev/null +++ b/onnxruntime/test/testdata/fuse_conv_bias.onnx @@ -0,0 +1,37 @@ +:Þ + +X +W +BY"Conv + +Y +ZC"Add + +CR"RelugraphZ +X + + + + + Z +W + + + + + Z +B + + +Z +Z + + + +b? +R: +84 + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿB \ No newline at end of file diff --git a/onnxruntime/test/testdata/fuse_conv_bias_slice.onnx b/onnxruntime/test/testdata/fuse_conv_bias_slice.onnx new file mode 100644 index 0000000000..c55e611c96 --- /dev/null +++ b/onnxruntime/test/testdata/fuse_conv_bias_slice.onnx @@ -0,0 +1,40 @@ +:‡ +7 +X +W +BY"Conv* +pads@@@@ * +strides@@  + +Y +ZC"Add + +CR"RelugraphZ +X + + + + +Z +W + + + + +Z +B + + +Z +Z + + + + +b? +R: +84 + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿB \ No newline at end of file diff --git a/onnxruntime/test/testdata/fuse_conv_no_bias.onnx b/onnxruntime/test/testdata/fuse_conv_no_bias.onnx new file mode 100644 index 0000000000..2094d75326 --- /dev/null +++ b/onnxruntime/test/testdata/fuse_conv_no_bias.onnx @@ -0,0 +1,32 @@ +:Ê + +X +WY"Conv + +Y +ZC"Add + +CR"RelugraphZ +X + + + + + Z +W + + + + + Z +Z + + + +b? +R: +84 + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿ + ÿÿÿÿÿÿÿÿÿB \ No newline at end of file