From 2a6c69de2b2f1fd94fd5162f08812d9ee8be2fda Mon Sep 17 00:00:00 2001 From: Hector Li Date: Tue, 2 Jul 2019 23:08:59 -0700 Subject: [PATCH] Implement the Concat CUDA kernel (#1333) * Improve CUDA kernel performance for Concat. Implement the kernel code instead of using cudaMemCpy in a loop. * Update the index lookup part for Concat & Split --- .../core/providers/cpu/tensor/concat.cc | 12 +-- .../core/providers/cpu/tensor/concat.h | 1 + .../core/providers/cuda/tensor/concat.cc | 56 +++++++--- .../core/providers/cuda/tensor/concat_impl.cu | 102 ++++++++++++++++++ .../core/providers/cuda/tensor/concat_impl.h | 24 +++++ .../core/providers/cuda/tensor/split.cc | 9 ++ .../core/providers/cuda/tensor/split_impl.cu | 24 ++--- .../core/providers/cuda/tensor/split_impl.h | 1 + .../providers/cpu/tensor/concat_op_test.cc | 24 ++--- 9 files changed, 203 insertions(+), 50 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/tensor/concat_impl.cu create mode 100644 onnxruntime/core/providers/cuda/tensor/concat_impl.h diff --git a/onnxruntime/core/providers/cpu/tensor/concat.cc b/onnxruntime/core/providers/cpu/tensor/concat.cc index b970270509..afca4d421e 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.cc +++ b/onnxruntime/core/providers/cpu/tensor/concat.cc @@ -21,7 +21,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep const size_t inputs_0_rank = inputs_0_dims.size(); ORT_RETURN_IF_NOT(inputs_0_rank > 0, "Cannot concatenate scalars"); - uint64_t axis = static_cast(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions())); + p.axis = static_cast(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions())); // cache num of elements in tensor for later use // as it's expensive to call Size() on TensorShape over and over @@ -39,7 +39,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep // Ensure all the other (non-concat) axes match for (size_t axis_index = 0; axis_index < inputs_0_rank; ++axis_index) { num_elements *= inputs_n_dims[axis_index]; - if (axis_index == axis) + if (axis_index == p.axis) continue; ORT_RETURN_IF_NOT(inputs_n_dims[axis_index] == inputs_0_dims[axis_index], "Non concat axis dimensions must match: Axis ", @@ -53,7 +53,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep size_t concat_axis_size = 0; for (int index = 0; index < input_count; index++) { tensor_pointer = ctx->Input(index); - concat_axis_size += tensor_pointer->Shape()[int(axis)]; + concat_axis_size += tensor_pointer->Shape()[int(p.axis)]; } // Calculate the shape of the output tensor @@ -64,7 +64,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep num_elements *= inputs_0_dims[dimension_index]; } tensor_num_elements[0] = num_elements; - dims[axis] = concat_axis_size; + dims[p.axis] = concat_axis_size; TensorShape output_shape(dims); auto& concat_result = *ctx->Output(0, output_shape); @@ -78,7 +78,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep // The output_axis_pitch is the number of elements to add to move to the next split axis in the output p.output_axis_pitch = 1; - for (size_t i = inputs_0_rank; i-- > axis;) p.output_axis_pitch *= dims[i]; + for (size_t i = inputs_0_rank; i-- > p.axis;) p.output_axis_pitch *= dims[i]; p.inputs.reserve(input_count); for (int input_index = 0; input_index < input_count; input_index++) { @@ -90,7 +90,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep // The input_axis_pitch is the number of elements to add to move to the next split axis in the input int64_t input_axis_pitch = 1; const auto& data_dims = data_n.Shape().GetDims(); - for (size_t i = inputs_0_rank; i-- > axis;) input_axis_pitch *= data_dims[i]; + for (size_t i = inputs_0_rank; i-- > p.axis;) input_axis_pitch *= data_dims[i]; p.inputs.push_back({&data_n, tensor_num_elements[input_index], input_axis_pitch}); } diff --git a/onnxruntime/core/providers/cpu/tensor/concat.h b/onnxruntime/core/providers/cpu/tensor/concat.h index f7df2e3d16..f5267ea3d5 100644 --- a/onnxruntime/core/providers/cpu/tensor/concat.h +++ b/onnxruntime/core/providers/cpu/tensor/concat.h @@ -28,6 +28,7 @@ class ConcatBase { int64_t output_num_elements; int64_t output_axis_pitch; Tensor* output_tensor; + uint64_t axis; }; Status PrepareForCompute(OpKernelContext* ctx, int input_count, Prepare& p) const; diff --git a/onnxruntime/core/providers/cuda/tensor/concat.cc b/onnxruntime/core/providers/cuda/tensor/concat.cc index a4aeb51785..c1f0b066af 100644 --- a/onnxruntime/core/providers/cuda/tensor/concat.cc +++ b/onnxruntime/core/providers/cuda/tensor/concat.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "concat.h" +#include "concat_impl.h" namespace onnxruntime { namespace cuda { @@ -24,25 +25,46 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const { if (p.output_num_elements == 0) return Status::OK(); - int64_t output_offset = 0; - auto element_bytes = p.output_tensor->DataType()->Size(); - for (int input_index = 0; input_index < input_count; input_index++) { - const auto& prep = p.inputs[input_index]; - // No data in this tensor - so skip it - if (prep.num_elements == 0) - continue; - // Copy the data across. For every 'input_axis_pitch' values copied, we move over by the 'output_axis_pitch' - CUDA_RETURN_IF_ERROR(cudaMemcpy2DAsync( - static_cast(p.output_tensor->MutableDataRaw()) + output_offset * element_bytes, - p.output_axis_pitch * element_bytes, - prep.tensor->DataRaw(), - prep.axis_pitch * element_bytes, - prep.axis_pitch * element_bytes, - prep.num_elements / prep.axis_pitch, - cudaMemcpyDeviceToDevice)); + int device_id = GetDeviceId(); + std::vector concat_sizes(input_count); - output_offset += prep.axis_pitch; + CudaAsyncBuffer input_ptr(this, device_id, input_count); + gsl::span input_ptr_cpuspan = input_ptr.CpuSpan(); + std::vector axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]); + int index = 0; + for (int i = 0; i < input_count; ++i) { + auto input = p.inputs[i]; + concat_sizes[i] = input.tensor->Shape()[p.axis]; + input_ptr_cpuspan[i] = input.tensor->DataRaw(); + for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) { + axis_dimension_input_output_mapping.at(index++) = i; + } } + std::vector concat_sizes_range(concat_sizes); + for (int i = 1; i < concat_sizes_range.size(); ++i) { + concat_sizes_range[i] += concat_sizes_range[i - 1]; + } + + CudaAsyncBuffer concat_sizes_gpu(this, device_id, concat_sizes); + CudaAsyncBuffer axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping); + CudaAsyncBuffer concat_sizes_range_gpu(this, device_id, concat_sizes_range); + concat_sizes_gpu.CopyToGpu(); + axis_dimension_input_output_mapping_gpu.CopyToGpu(); + concat_sizes_range_gpu.CopyToGpu(); + input_ptr.CopyToGpu(); + int block_size_inside_axis_dim = static_cast(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]); + int block_size_including_axis_dim = static_cast(p.output_axis_pitch); + auto element_bytes = p.output_tensor->DataType()->Size(); + ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes, + block_size_including_axis_dim, + block_size_inside_axis_dim, + concat_sizes_gpu.GpuPtr(), + concat_sizes_range_gpu.GpuPtr(), + axis_dimension_input_output_mapping_gpu.GpuPtr(), + input_count, + p.output_tensor->MutableDataRaw(), + input_ptr.GpuPtr(), + p.output_num_elements)); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.cu b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu new file mode 100644 index 0000000000..95569fc441 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.cu @@ -0,0 +1,102 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/cuda/cu_inc/common.cuh" +#include "core/providers/cuda/cuda_common.h" +#include "concat_impl.h" + +namespace onnxruntime { +namespace cuda { + +template +__global__ void _ConcatKernel(const fast_divmod block_size_including_axis_dim_div, + const fast_divmod block_size_inside_axis_dim_div, + const int64_t* concat_sizes, + const int64_t* concat_sizes_range, + const int64_t* axis_dimension_input_output_mapping, + const int num_inputs, + T* output_data, + const void** input_ptr, + const CUDA_LONG N) { + CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N); + CUDA_LONG input_pos = 0; + + int outter_block_index = 0; + int block_index = 0; + int offset = 0; + + block_size_including_axis_dim_div.divmod(id, outter_block_index, offset); + block_size_inside_axis_dim_div.divmod(offset, block_index, offset); + + int input_index = axis_dimension_input_output_mapping[block_index]; + int64_t range_left = (input_index == 0) ? 0 : concat_sizes_range[input_index - 1]; + int block_offset = block_index - range_left; + + input_pos = (outter_block_index * concat_sizes[input_index] + block_offset) * + block_size_inside_axis_dim_div.d_ + + offset; + + output_data[id] = reinterpret_cast(input_ptr[input_index])[input_pos]; +} + +Status ConcatImpl(const size_t element_bytes, + const int block_size_including_axis_dim, + const int block_size_inside_axis_dim, + const int64_t* concat_sizes, + const int64_t* concat_sizes_range, + const int64_t* axis_dimension_input_output_mapping, + const int num_inputs, + void* output_data, + const void** input_ptr, + const size_t N) { + int blocksPerGrid = (int)(ceil(static_cast(N) / GridDim::maxThreadsPerBlock)); + + fast_divmod block_size_including_axis_dim_div = fast_divmod(block_size_including_axis_dim); + fast_divmod block_size_inside_axis_dim_div = fast_divmod(block_size_inside_axis_dim); + + switch (element_bytes) { + case sizeof(int8_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + case sizeof(int16_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + case sizeof(int32_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + case sizeof(int64_t): + _ConcatKernel<<>>( + block_size_including_axis_dim_div, block_size_inside_axis_dim_div, + concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping, + num_inputs, + reinterpret_cast(output_data), + input_ptr, + (CUDA_LONG)N); + break; + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Concat operator"); + } + + return Status::OK(); +} + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/concat_impl.h b/onnxruntime/core/providers/cuda/tensor/concat_impl.h new file mode 100644 index 0000000000..eeddedf642 --- /dev/null +++ b/onnxruntime/core/providers/cuda/tensor/concat_impl.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include +#include "core/providers/cuda/shared_inc/cuda_utils.h" +#include "core/common/common.h" + +namespace onnxruntime { +namespace cuda { + +Status ConcatImpl(const size_t element_bytes, + const int block_size_including_axis_dim, + const int block_size_inside_axis_dim, + const int64_t* concat_sizes, + const int64_t* concat_sizes_range, + const int64_t* axis_dimension_input_output_mapping, + const int num_inputs, + void* output_data, + const void** input_ptr, + const size_t N); + +} // namespace cuda +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/split.cc b/onnxruntime/core/providers/cuda/tensor/split.cc index f6dd317c5b..4f325a54cc 100644 --- a/onnxruntime/core/providers/cuda/tensor/split.cc +++ b/onnxruntime/core/providers/cuda/tensor/split.cc @@ -44,6 +44,8 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { int device_id = GetDeviceId(); CudaAsyncBuffer output_ptr(this, device_id, num_outputs); gsl::span output_ptr_span = output_ptr.CpuSpan(); + std::vector axis_dimension_input_output_mapping(input_dims[axis]); + int index = 0; for (int i = 0; i < num_outputs; ++i) { // update size of dimension for axis we're splitting on auto split_size = gsl::narrow(split_sizes[i]); @@ -52,6 +54,9 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { Tensor* output = ctx->Output(i, TensorShape{output_dimensions}); auto output_data = output->MutableDataRaw(); output_ptr_span[i] = output_data; + for (int j = 0; j < split_size; ++j) { + axis_dimension_input_output_mapping.at(index++) = i; + } } output_ptr.CopyToGpu(); @@ -65,12 +70,16 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const { CudaAsyncBuffer split_sizes_range_gpu(this, device_id, split_sizes_range); split_sizes_range_gpu.CopyToGpu(); + CudaAsyncBuffer axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping); + axis_dimension_input_output_mapping_gpu.CopyToGpu(); + size_t element_size = input_tensor->DataType()->Size(); ORT_RETURN_IF_ERROR(SplitImpl(element_size, block_size_including_axis_dim, block_size_inside_axis_dim, split_sizes_gpu.GpuPtr(), split_sizes_range_gpu.GpuPtr(), + axis_dimension_input_output_mapping_gpu.GpuPtr(), num_outputs, input_data, output_ptr.GpuPtr(), diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.cu b/onnxruntime/core/providers/cuda/tensor/split_impl.cu index 82cbbd8003..0c97d140ca 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.cu +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.cu @@ -13,6 +13,7 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div const fast_divmod block_size_inside_axis_dim_div, const int64_t* split_sizes, const int64_t* split_sizes_range, + const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const T* input_data, void** output_ptr, @@ -24,20 +25,12 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div int block_index = 0; int offset = 0; - int output_index = 0; - int block_offset = 0; - block_size_including_axis_dim_div.divmod(id, outter_block_index, offset); block_size_inside_axis_dim_div.divmod(offset, block_index, offset); - for (int i = 0; i < num_outputs; ++i) { - int64_t range_left = (i == 0) ? 0 : split_sizes_range[i - 1]; - if ((range_left <= block_index) && (block_index < split_sizes_range[i])) { - output_index = i; - block_offset = block_index - range_left; - break; - } - } + int output_index = axis_dimension_input_output_mapping[block_index]; + int64_t range_left = (output_index == 0) ? 0 : split_sizes_range[output_index - 1]; + int block_offset = block_index - range_left; output_pos = (outter_block_index * split_sizes[output_index] + block_offset) * block_size_inside_axis_dim_div.d_ + @@ -51,6 +44,7 @@ Status SplitImpl(const size_t element_size, const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range, + const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data, void** output_ptr, @@ -64,7 +58,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int8_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); @@ -72,7 +66,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int16_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); @@ -80,7 +74,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int32_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); @@ -88,7 +82,7 @@ Status SplitImpl(const size_t element_size, case sizeof(int64_t): _SplitKernel<<>>( block_size_including_axis_dim_div, block_size_inside_axis_dim_div, - split_sizes, split_sizes_range, num_outputs, + split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs, reinterpret_cast::MappedType*>(input_data), output_ptr, (CUDA_LONG)N); diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.h b/onnxruntime/core/providers/cuda/tensor/split_impl.h index 0ad6c51b35..72eaa5a32c 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.h @@ -14,6 +14,7 @@ Status SplitImpl(const size_t element_size, const int block_size_inside_axis_dim, const int64_t* split_sizes, const int64_t* split_sizes_range, + const int64_t* axis_dimension_input_output_mapping, const int num_outputs, const void* input_data, void** output_ptr, diff --git a/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc b/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc index 086177dea6..a0acc3f592 100644 --- a/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/concat_op_test.cc @@ -10,7 +10,7 @@ namespace test { // Some of the tests can't run on TensorrtExecutionProvider because of unsupported data types or limits // in its parser: axis >=0 && axis < nbDims. Those Tests will fallback to other EPs -TEST(MathOpTest, Concat1D_string) { +TEST(ConcatOpTest, Concat1D_string) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -21,7 +21,7 @@ TEST(MathOpTest, Concat1D_string) { test.Run(); } -TEST(MathOpTest, Concat1D_int32) { +TEST(ConcatOpTest, Concat1D_int32) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -32,7 +32,7 @@ TEST(MathOpTest, Concat1D_int32) { test.Run(); } -TEST(MathOpTest, Concat1D_int32_negative_axis) { +TEST(ConcatOpTest, Concat1D_int32_negative_axis) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{-1}); @@ -43,7 +43,7 @@ TEST(MathOpTest, Concat1D_int32_negative_axis) { test.Run(); } -TEST(MathOpTest, Concat1D_1) { +TEST(ConcatOpTest, Concat1D_1) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -54,7 +54,7 @@ TEST(MathOpTest, Concat1D_1) { test.Run(); } -TEST(MathOpTest, Concat1D_2) { +TEST(ConcatOpTest, Concat1D_2) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -65,7 +65,7 @@ TEST(MathOpTest, Concat1D_2) { test.Run(); } -TEST(MathOpTest, Concat2D_1) { +TEST(ConcatOpTest, Concat2D_1) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -80,7 +80,7 @@ TEST(MathOpTest, Concat2D_1) { test.Run(); } -TEST(MathOpTest, Concat2D_2) { +TEST(ConcatOpTest, Concat2D_2) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); @@ -96,7 +96,7 @@ TEST(MathOpTest, Concat2D_2) { test.Run(); } -TEST(MathOpTest, Concat2D_3) { +TEST(ConcatOpTest, Concat2D_3) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); @@ -107,7 +107,7 @@ TEST(MathOpTest, Concat2D_3) { test.Run(); } -TEST(MathOpTest, Concat3D_1) { +TEST(ConcatOpTest, Concat3D_1) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{0}); @@ -139,7 +139,7 @@ TEST(MathOpTest, Concat3D_1) { test.Run(); } -TEST(MathOpTest, Concat3D_1_negative_axis) { +TEST(ConcatOpTest, Concat3D_1_negative_axis) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{-3}); @@ -171,7 +171,7 @@ TEST(MathOpTest, Concat3D_1_negative_axis) { test.Run(); } -TEST(MathOpTest, Concat3D_2) { +TEST(ConcatOpTest, Concat3D_2) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1}); @@ -203,7 +203,7 @@ TEST(MathOpTest, Concat3D_2) { test.Run(); } -TEST(MathOpTest, Concat3D_3) { +TEST(ConcatOpTest, Concat3D_3) { OpTester test("Concat"); test.AddAttribute("axis", int64_t{1});