Implement the Concat CUDA kernel (#1333)

* Improve CUDA kernel performance for Concat. Implement the kernel code instead of using cudaMemCpy in a loop.

* Update the index lookup part for Concat & Split
This commit is contained in:
Hector Li 2019-07-02 23:08:59 -07:00 committed by GitHub
parent 5e54bbffec
commit 2a6c69de2b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 203 additions and 50 deletions

View file

@ -21,7 +21,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
const size_t inputs_0_rank = inputs_0_dims.size();
ORT_RETURN_IF_NOT(inputs_0_rank > 0, "Cannot concatenate scalars");
uint64_t axis = static_cast<uint64_t>(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions()));
p.axis = static_cast<uint64_t>(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions()));
// cache num of elements in tensor for later use
// as it's expensive to call Size() on TensorShape over and over
@ -39,7 +39,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
// Ensure all the other (non-concat) axes match
for (size_t axis_index = 0; axis_index < inputs_0_rank; ++axis_index) {
num_elements *= inputs_n_dims[axis_index];
if (axis_index == axis)
if (axis_index == p.axis)
continue;
ORT_RETURN_IF_NOT(inputs_n_dims[axis_index] == inputs_0_dims[axis_index],
"Non concat axis dimensions must match: Axis ",
@ -53,7 +53,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
size_t concat_axis_size = 0;
for (int index = 0; index < input_count; index++) {
tensor_pointer = ctx->Input<Tensor>(index);
concat_axis_size += tensor_pointer->Shape()[int(axis)];
concat_axis_size += tensor_pointer->Shape()[int(p.axis)];
}
// Calculate the shape of the output tensor
@ -64,7 +64,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
num_elements *= inputs_0_dims[dimension_index];
}
tensor_num_elements[0] = num_elements;
dims[axis] = concat_axis_size;
dims[p.axis] = concat_axis_size;
TensorShape output_shape(dims);
auto& concat_result = *ctx->Output(0, output_shape);
@ -78,7 +78,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
// The output_axis_pitch is the number of elements to add to move to the next split axis in the output
p.output_axis_pitch = 1;
for (size_t i = inputs_0_rank; i-- > axis;) p.output_axis_pitch *= dims[i];
for (size_t i = inputs_0_rank; i-- > p.axis;) p.output_axis_pitch *= dims[i];
p.inputs.reserve(input_count);
for (int input_index = 0; input_index < input_count; input_index++) {
@ -90,7 +90,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
// The input_axis_pitch is the number of elements to add to move to the next split axis in the input
int64_t input_axis_pitch = 1;
const auto& data_dims = data_n.Shape().GetDims();
for (size_t i = inputs_0_rank; i-- > axis;) input_axis_pitch *= data_dims[i];
for (size_t i = inputs_0_rank; i-- > p.axis;) input_axis_pitch *= data_dims[i];
p.inputs.push_back({&data_n, tensor_num_elements[input_index], input_axis_pitch});
}

View file

@ -28,6 +28,7 @@ class ConcatBase {
int64_t output_num_elements;
int64_t output_axis_pitch;
Tensor* output_tensor;
uint64_t axis;
};
Status PrepareForCompute(OpKernelContext* ctx, int input_count, Prepare& p) const;

View file

@ -2,6 +2,7 @@
// Licensed under the MIT License.
#include "concat.h"
#include "concat_impl.h"
namespace onnxruntime {
namespace cuda {
@ -24,25 +25,46 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const {
if (p.output_num_elements == 0)
return Status::OK();
int64_t output_offset = 0;
auto element_bytes = p.output_tensor->DataType()->Size();
for (int input_index = 0; input_index < input_count; input_index++) {
const auto& prep = p.inputs[input_index];
// No data in this tensor - so skip it
if (prep.num_elements == 0)
continue;
// Copy the data across. For every 'input_axis_pitch' values copied, we move over by the 'output_axis_pitch'
CUDA_RETURN_IF_ERROR(cudaMemcpy2DAsync(
static_cast<uint8_t*>(p.output_tensor->MutableDataRaw()) + output_offset * element_bytes,
p.output_axis_pitch * element_bytes,
prep.tensor->DataRaw(),
prep.axis_pitch * element_bytes,
prep.axis_pitch * element_bytes,
prep.num_elements / prep.axis_pitch,
cudaMemcpyDeviceToDevice));
int device_id = GetDeviceId();
std::vector<int64_t> concat_sizes(input_count);
output_offset += prep.axis_pitch;
CudaAsyncBuffer<const void*> input_ptr(this, device_id, input_count);
gsl::span<const void*> input_ptr_cpuspan = input_ptr.CpuSpan();
std::vector<int64_t> axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]);
int index = 0;
for (int i = 0; i < input_count; ++i) {
auto input = p.inputs[i];
concat_sizes[i] = input.tensor->Shape()[p.axis];
input_ptr_cpuspan[i] = input.tensor->DataRaw();
for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) {
axis_dimension_input_output_mapping.at(index++) = i;
}
}
std::vector<int64_t> concat_sizes_range(concat_sizes);
for (int i = 1; i < concat_sizes_range.size(); ++i) {
concat_sizes_range[i] += concat_sizes_range[i - 1];
}
CudaAsyncBuffer<int64_t> concat_sizes_gpu(this, device_id, concat_sizes);
CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping);
CudaAsyncBuffer<int64_t> concat_sizes_range_gpu(this, device_id, concat_sizes_range);
concat_sizes_gpu.CopyToGpu();
axis_dimension_input_output_mapping_gpu.CopyToGpu();
concat_sizes_range_gpu.CopyToGpu();
input_ptr.CopyToGpu();
int block_size_inside_axis_dim = static_cast<int>(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]);
int block_size_including_axis_dim = static_cast<int>(p.output_axis_pitch);
auto element_bytes = p.output_tensor->DataType()->Size();
ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes,
block_size_including_axis_dim,
block_size_inside_axis_dim,
concat_sizes_gpu.GpuPtr(),
concat_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(),
input_count,
p.output_tensor->MutableDataRaw(),
input_ptr.GpuPtr(),
p.output_num_elements));
return Status::OK();
}

View file

@ -0,0 +1,102 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/cuda/cu_inc/common.cuh"
#include "core/providers/cuda/cuda_common.h"
#include "concat_impl.h"
namespace onnxruntime {
namespace cuda {
template <typename T>
__global__ void _ConcatKernel(const fast_divmod block_size_including_axis_dim_div,
const fast_divmod block_size_inside_axis_dim_div,
const int64_t* concat_sizes,
const int64_t* concat_sizes_range,
const int64_t* axis_dimension_input_output_mapping,
const int num_inputs,
T* output_data,
const void** input_ptr,
const CUDA_LONG N) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
CUDA_LONG input_pos = 0;
int outter_block_index = 0;
int block_index = 0;
int offset = 0;
block_size_including_axis_dim_div.divmod(id, outter_block_index, offset);
block_size_inside_axis_dim_div.divmod(offset, block_index, offset);
int input_index = axis_dimension_input_output_mapping[block_index];
int64_t range_left = (input_index == 0) ? 0 : concat_sizes_range[input_index - 1];
int block_offset = block_index - range_left;
input_pos = (outter_block_index * concat_sizes[input_index] + block_offset) *
block_size_inside_axis_dim_div.d_ +
offset;
output_data[id] = reinterpret_cast<const T*>(input_ptr[input_index])[input_pos];
}
Status ConcatImpl(const size_t element_bytes,
const int block_size_including_axis_dim,
const int block_size_inside_axis_dim,
const int64_t* concat_sizes,
const int64_t* concat_sizes_range,
const int64_t* axis_dimension_input_output_mapping,
const int num_inputs,
void* output_data,
const void** input_ptr,
const size_t N) {
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
fast_divmod block_size_including_axis_dim_div = fast_divmod(block_size_including_axis_dim);
fast_divmod block_size_inside_axis_dim_div = fast_divmod(block_size_inside_axis_dim);
switch (element_bytes) {
case sizeof(int8_t):
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
num_inputs,
reinterpret_cast<int8_t*>(output_data),
input_ptr,
(CUDA_LONG)N);
break;
case sizeof(int16_t):
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
num_inputs,
reinterpret_cast<int16_t*>(output_data),
input_ptr,
(CUDA_LONG)N);
break;
case sizeof(int32_t):
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
num_inputs,
reinterpret_cast<int32_t*>(output_data),
input_ptr,
(CUDA_LONG)N);
break;
case sizeof(int64_t):
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
num_inputs,
reinterpret_cast<int64_t*>(output_data),
input_ptr,
(CUDA_LONG)N);
break;
default:
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Concat operator");
}
return Status::OK();
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -0,0 +1,24 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <stdint.h>
#include "core/providers/cuda/shared_inc/cuda_utils.h"
#include "core/common/common.h"
namespace onnxruntime {
namespace cuda {
Status ConcatImpl(const size_t element_bytes,
const int block_size_including_axis_dim,
const int block_size_inside_axis_dim,
const int64_t* concat_sizes,
const int64_t* concat_sizes_range,
const int64_t* axis_dimension_input_output_mapping,
const int num_inputs,
void* output_data,
const void** input_ptr,
const size_t N);
} // namespace cuda
} // namespace onnxruntime

View file

@ -44,6 +44,8 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
int device_id = GetDeviceId();
CudaAsyncBuffer<void*> output_ptr(this, device_id, num_outputs);
gsl::span<void*> output_ptr_span = output_ptr.CpuSpan();
std::vector<int64_t> axis_dimension_input_output_mapping(input_dims[axis]);
int index = 0;
for (int i = 0; i < num_outputs; ++i) {
// update size of dimension for axis we're splitting on
auto split_size = gsl::narrow<int>(split_sizes[i]);
@ -52,6 +54,9 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
Tensor* output = ctx->Output(i, TensorShape{output_dimensions});
auto output_data = output->MutableDataRaw();
output_ptr_span[i] = output_data;
for (int j = 0; j < split_size; ++j) {
axis_dimension_input_output_mapping.at(index++) = i;
}
}
output_ptr.CopyToGpu();
@ -65,12 +70,16 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
CudaAsyncBuffer<int64_t> split_sizes_range_gpu(this, device_id, split_sizes_range);
split_sizes_range_gpu.CopyToGpu();
CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping);
axis_dimension_input_output_mapping_gpu.CopyToGpu();
size_t element_size = input_tensor->DataType()->Size();
ORT_RETURN_IF_ERROR(SplitImpl(element_size,
block_size_including_axis_dim,
block_size_inside_axis_dim,
split_sizes_gpu.GpuPtr(),
split_sizes_range_gpu.GpuPtr(),
axis_dimension_input_output_mapping_gpu.GpuPtr(),
num_outputs,
input_data,
output_ptr.GpuPtr(),

View file

@ -13,6 +13,7 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div
const fast_divmod block_size_inside_axis_dim_div,
const int64_t* split_sizes,
const int64_t* split_sizes_range,
const int64_t* axis_dimension_input_output_mapping,
const int num_outputs,
const T* input_data,
void** output_ptr,
@ -24,20 +25,12 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div
int block_index = 0;
int offset = 0;
int output_index = 0;
int block_offset = 0;
block_size_including_axis_dim_div.divmod(id, outter_block_index, offset);
block_size_inside_axis_dim_div.divmod(offset, block_index, offset);
for (int i = 0; i < num_outputs; ++i) {
int64_t range_left = (i == 0) ? 0 : split_sizes_range[i - 1];
if ((range_left <= block_index) && (block_index < split_sizes_range[i])) {
output_index = i;
block_offset = block_index - range_left;
break;
}
}
int output_index = axis_dimension_input_output_mapping[block_index];
int64_t range_left = (output_index == 0) ? 0 : split_sizes_range[output_index - 1];
int block_offset = block_index - range_left;
output_pos = (outter_block_index * split_sizes[output_index] + block_offset) *
block_size_inside_axis_dim_div.d_ +
@ -51,6 +44,7 @@ Status SplitImpl(const size_t element_size,
const int block_size_inside_axis_dim,
const int64_t* split_sizes,
const int64_t* split_sizes_range,
const int64_t* axis_dimension_input_output_mapping,
const int num_outputs,
const void* input_data,
void** output_ptr,
@ -64,7 +58,7 @@ Status SplitImpl(const size_t element_size,
case sizeof(int8_t):
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
split_sizes, split_sizes_range, num_outputs,
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
reinterpret_cast<const ToCudaType<int8_t>::MappedType*>(input_data),
output_ptr,
(CUDA_LONG)N);
@ -72,7 +66,7 @@ Status SplitImpl(const size_t element_size,
case sizeof(int16_t):
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
split_sizes, split_sizes_range, num_outputs,
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
reinterpret_cast<const ToCudaType<int16_t>::MappedType*>(input_data),
output_ptr,
(CUDA_LONG)N);
@ -80,7 +74,7 @@ Status SplitImpl(const size_t element_size,
case sizeof(int32_t):
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
split_sizes, split_sizes_range, num_outputs,
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
reinterpret_cast<const ToCudaType<int32_t>::MappedType*>(input_data),
output_ptr,
(CUDA_LONG)N);
@ -88,7 +82,7 @@ Status SplitImpl(const size_t element_size,
case sizeof(int64_t):
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
split_sizes, split_sizes_range, num_outputs,
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
reinterpret_cast<const ToCudaType<int64_t>::MappedType*>(input_data),
output_ptr,
(CUDA_LONG)N);

View file

@ -14,6 +14,7 @@ Status SplitImpl(const size_t element_size,
const int block_size_inside_axis_dim,
const int64_t* split_sizes,
const int64_t* split_sizes_range,
const int64_t* axis_dimension_input_output_mapping,
const int num_outputs,
const void* input_data,
void** output_ptr,

View file

@ -10,7 +10,7 @@ namespace test {
// Some of the tests can't run on TensorrtExecutionProvider because of unsupported data types or limits
// in its parser: axis >=0 && axis < nbDims. Those Tests will fallback to other EPs
TEST(MathOpTest, Concat1D_string) {
TEST(ConcatOpTest, Concat1D_string) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{0});
@ -21,7 +21,7 @@ TEST(MathOpTest, Concat1D_string) {
test.Run();
}
TEST(MathOpTest, Concat1D_int32) {
TEST(ConcatOpTest, Concat1D_int32) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{0});
@ -32,7 +32,7 @@ TEST(MathOpTest, Concat1D_int32) {
test.Run();
}
TEST(MathOpTest, Concat1D_int32_negative_axis) {
TEST(ConcatOpTest, Concat1D_int32_negative_axis) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{-1});
@ -43,7 +43,7 @@ TEST(MathOpTest, Concat1D_int32_negative_axis) {
test.Run();
}
TEST(MathOpTest, Concat1D_1) {
TEST(ConcatOpTest, Concat1D_1) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{0});
@ -54,7 +54,7 @@ TEST(MathOpTest, Concat1D_1) {
test.Run();
}
TEST(MathOpTest, Concat1D_2) {
TEST(ConcatOpTest, Concat1D_2) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{0});
@ -65,7 +65,7 @@ TEST(MathOpTest, Concat1D_2) {
test.Run();
}
TEST(MathOpTest, Concat2D_1) {
TEST(ConcatOpTest, Concat2D_1) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{0});
@ -80,7 +80,7 @@ TEST(MathOpTest, Concat2D_1) {
test.Run();
}
TEST(MathOpTest, Concat2D_2) {
TEST(ConcatOpTest, Concat2D_2) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{1});
@ -96,7 +96,7 @@ TEST(MathOpTest, Concat2D_2) {
test.Run();
}
TEST(MathOpTest, Concat2D_3) {
TEST(ConcatOpTest, Concat2D_3) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{1});
@ -107,7 +107,7 @@ TEST(MathOpTest, Concat2D_3) {
test.Run();
}
TEST(MathOpTest, Concat3D_1) {
TEST(ConcatOpTest, Concat3D_1) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{0});
@ -139,7 +139,7 @@ TEST(MathOpTest, Concat3D_1) {
test.Run();
}
TEST(MathOpTest, Concat3D_1_negative_axis) {
TEST(ConcatOpTest, Concat3D_1_negative_axis) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{-3});
@ -171,7 +171,7 @@ TEST(MathOpTest, Concat3D_1_negative_axis) {
test.Run();
}
TEST(MathOpTest, Concat3D_2) {
TEST(ConcatOpTest, Concat3D_2) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{1});
@ -203,7 +203,7 @@ TEST(MathOpTest, Concat3D_2) {
test.Run();
}
TEST(MathOpTest, Concat3D_3) {
TEST(ConcatOpTest, Concat3D_3) {
OpTester test("Concat");
test.AddAttribute("axis", int64_t{1});