mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-30 23:18:20 +00:00
Implement the Concat CUDA kernel (#1333)
* Improve CUDA kernel performance for Concat. Implement the kernel code instead of using cudaMemCpy in a loop. * Update the index lookup part for Concat & Split
This commit is contained in:
parent
5e54bbffec
commit
2a6c69de2b
9 changed files with 203 additions and 50 deletions
|
|
@ -21,7 +21,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
|
|||
const size_t inputs_0_rank = inputs_0_dims.size();
|
||||
ORT_RETURN_IF_NOT(inputs_0_rank > 0, "Cannot concatenate scalars");
|
||||
|
||||
uint64_t axis = static_cast<uint64_t>(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions()));
|
||||
p.axis = static_cast<uint64_t>(HandleNegativeAxis(axis_, inputs_0.Shape().NumDimensions()));
|
||||
|
||||
// cache num of elements in tensor for later use
|
||||
// as it's expensive to call Size() on TensorShape over and over
|
||||
|
|
@ -39,7 +39,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
|
|||
// Ensure all the other (non-concat) axes match
|
||||
for (size_t axis_index = 0; axis_index < inputs_0_rank; ++axis_index) {
|
||||
num_elements *= inputs_n_dims[axis_index];
|
||||
if (axis_index == axis)
|
||||
if (axis_index == p.axis)
|
||||
continue;
|
||||
ORT_RETURN_IF_NOT(inputs_n_dims[axis_index] == inputs_0_dims[axis_index],
|
||||
"Non concat axis dimensions must match: Axis ",
|
||||
|
|
@ -53,7 +53,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
|
|||
size_t concat_axis_size = 0;
|
||||
for (int index = 0; index < input_count; index++) {
|
||||
tensor_pointer = ctx->Input<Tensor>(index);
|
||||
concat_axis_size += tensor_pointer->Shape()[int(axis)];
|
||||
concat_axis_size += tensor_pointer->Shape()[int(p.axis)];
|
||||
}
|
||||
|
||||
// Calculate the shape of the output tensor
|
||||
|
|
@ -64,7 +64,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
|
|||
num_elements *= inputs_0_dims[dimension_index];
|
||||
}
|
||||
tensor_num_elements[0] = num_elements;
|
||||
dims[axis] = concat_axis_size;
|
||||
dims[p.axis] = concat_axis_size;
|
||||
TensorShape output_shape(dims);
|
||||
|
||||
auto& concat_result = *ctx->Output(0, output_shape);
|
||||
|
|
@ -78,7 +78,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
|
|||
|
||||
// The output_axis_pitch is the number of elements to add to move to the next split axis in the output
|
||||
p.output_axis_pitch = 1;
|
||||
for (size_t i = inputs_0_rank; i-- > axis;) p.output_axis_pitch *= dims[i];
|
||||
for (size_t i = inputs_0_rank; i-- > p.axis;) p.output_axis_pitch *= dims[i];
|
||||
|
||||
p.inputs.reserve(input_count);
|
||||
for (int input_index = 0; input_index < input_count; input_index++) {
|
||||
|
|
@ -90,7 +90,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
|
|||
// The input_axis_pitch is the number of elements to add to move to the next split axis in the input
|
||||
int64_t input_axis_pitch = 1;
|
||||
const auto& data_dims = data_n.Shape().GetDims();
|
||||
for (size_t i = inputs_0_rank; i-- > axis;) input_axis_pitch *= data_dims[i];
|
||||
for (size_t i = inputs_0_rank; i-- > p.axis;) input_axis_pitch *= data_dims[i];
|
||||
|
||||
p.inputs.push_back({&data_n, tensor_num_elements[input_index], input_axis_pitch});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ class ConcatBase {
|
|||
int64_t output_num_elements;
|
||||
int64_t output_axis_pitch;
|
||||
Tensor* output_tensor;
|
||||
uint64_t axis;
|
||||
};
|
||||
|
||||
Status PrepareForCompute(OpKernelContext* ctx, int input_count, Prepare& p) const;
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
// Licensed under the MIT License.
|
||||
|
||||
#include "concat.h"
|
||||
#include "concat_impl.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
|
@ -24,25 +25,46 @@ Status Concat::ComputeInternal(OpKernelContext* ctx) const {
|
|||
if (p.output_num_elements == 0)
|
||||
return Status::OK();
|
||||
|
||||
int64_t output_offset = 0;
|
||||
auto element_bytes = p.output_tensor->DataType()->Size();
|
||||
for (int input_index = 0; input_index < input_count; input_index++) {
|
||||
const auto& prep = p.inputs[input_index];
|
||||
// No data in this tensor - so skip it
|
||||
if (prep.num_elements == 0)
|
||||
continue;
|
||||
// Copy the data across. For every 'input_axis_pitch' values copied, we move over by the 'output_axis_pitch'
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpy2DAsync(
|
||||
static_cast<uint8_t*>(p.output_tensor->MutableDataRaw()) + output_offset * element_bytes,
|
||||
p.output_axis_pitch * element_bytes,
|
||||
prep.tensor->DataRaw(),
|
||||
prep.axis_pitch * element_bytes,
|
||||
prep.axis_pitch * element_bytes,
|
||||
prep.num_elements / prep.axis_pitch,
|
||||
cudaMemcpyDeviceToDevice));
|
||||
int device_id = GetDeviceId();
|
||||
std::vector<int64_t> concat_sizes(input_count);
|
||||
|
||||
output_offset += prep.axis_pitch;
|
||||
CudaAsyncBuffer<const void*> input_ptr(this, device_id, input_count);
|
||||
gsl::span<const void*> input_ptr_cpuspan = input_ptr.CpuSpan();
|
||||
std::vector<int64_t> axis_dimension_input_output_mapping(p.output_tensor->Shape()[p.axis]);
|
||||
int index = 0;
|
||||
for (int i = 0; i < input_count; ++i) {
|
||||
auto input = p.inputs[i];
|
||||
concat_sizes[i] = input.tensor->Shape()[p.axis];
|
||||
input_ptr_cpuspan[i] = input.tensor->DataRaw();
|
||||
for (int j = 0; j < input.tensor->Shape()[p.axis]; ++j) {
|
||||
axis_dimension_input_output_mapping.at(index++) = i;
|
||||
}
|
||||
}
|
||||
std::vector<int64_t> concat_sizes_range(concat_sizes);
|
||||
for (int i = 1; i < concat_sizes_range.size(); ++i) {
|
||||
concat_sizes_range[i] += concat_sizes_range[i - 1];
|
||||
}
|
||||
|
||||
CudaAsyncBuffer<int64_t> concat_sizes_gpu(this, device_id, concat_sizes);
|
||||
CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping);
|
||||
CudaAsyncBuffer<int64_t> concat_sizes_range_gpu(this, device_id, concat_sizes_range);
|
||||
concat_sizes_gpu.CopyToGpu();
|
||||
axis_dimension_input_output_mapping_gpu.CopyToGpu();
|
||||
concat_sizes_range_gpu.CopyToGpu();
|
||||
input_ptr.CopyToGpu();
|
||||
int block_size_inside_axis_dim = static_cast<int>(p.output_axis_pitch / p.output_tensor->Shape()[p.axis]);
|
||||
int block_size_including_axis_dim = static_cast<int>(p.output_axis_pitch);
|
||||
auto element_bytes = p.output_tensor->DataType()->Size();
|
||||
ORT_RETURN_IF_ERROR(ConcatImpl(element_bytes,
|
||||
block_size_including_axis_dim,
|
||||
block_size_inside_axis_dim,
|
||||
concat_sizes_gpu.GpuPtr(),
|
||||
concat_sizes_range_gpu.GpuPtr(),
|
||||
axis_dimension_input_output_mapping_gpu.GpuPtr(),
|
||||
input_count,
|
||||
p.output_tensor->MutableDataRaw(),
|
||||
input_ptr.GpuPtr(),
|
||||
p.output_num_elements));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
|
|
|||
102
onnxruntime/core/providers/cuda/tensor/concat_impl.cu
Normal file
102
onnxruntime/core/providers/cuda/tensor/concat_impl.cu
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "core/providers/cuda/cuda_common.h"
|
||||
#include "concat_impl.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
template <typename T>
|
||||
__global__ void _ConcatKernel(const fast_divmod block_size_including_axis_dim_div,
|
||||
const fast_divmod block_size_inside_axis_dim_div,
|
||||
const int64_t* concat_sizes,
|
||||
const int64_t* concat_sizes_range,
|
||||
const int64_t* axis_dimension_input_output_mapping,
|
||||
const int num_inputs,
|
||||
T* output_data,
|
||||
const void** input_ptr,
|
||||
const CUDA_LONG N) {
|
||||
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
|
||||
CUDA_LONG input_pos = 0;
|
||||
|
||||
int outter_block_index = 0;
|
||||
int block_index = 0;
|
||||
int offset = 0;
|
||||
|
||||
block_size_including_axis_dim_div.divmod(id, outter_block_index, offset);
|
||||
block_size_inside_axis_dim_div.divmod(offset, block_index, offset);
|
||||
|
||||
int input_index = axis_dimension_input_output_mapping[block_index];
|
||||
int64_t range_left = (input_index == 0) ? 0 : concat_sizes_range[input_index - 1];
|
||||
int block_offset = block_index - range_left;
|
||||
|
||||
input_pos = (outter_block_index * concat_sizes[input_index] + block_offset) *
|
||||
block_size_inside_axis_dim_div.d_ +
|
||||
offset;
|
||||
|
||||
output_data[id] = reinterpret_cast<const T*>(input_ptr[input_index])[input_pos];
|
||||
}
|
||||
|
||||
Status ConcatImpl(const size_t element_bytes,
|
||||
const int block_size_including_axis_dim,
|
||||
const int block_size_inside_axis_dim,
|
||||
const int64_t* concat_sizes,
|
||||
const int64_t* concat_sizes_range,
|
||||
const int64_t* axis_dimension_input_output_mapping,
|
||||
const int num_inputs,
|
||||
void* output_data,
|
||||
const void** input_ptr,
|
||||
const size_t N) {
|
||||
int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
|
||||
|
||||
fast_divmod block_size_including_axis_dim_div = fast_divmod(block_size_including_axis_dim);
|
||||
fast_divmod block_size_inside_axis_dim_div = fast_divmod(block_size_inside_axis_dim);
|
||||
|
||||
switch (element_bytes) {
|
||||
case sizeof(int8_t):
|
||||
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
|
||||
num_inputs,
|
||||
reinterpret_cast<int8_t*>(output_data),
|
||||
input_ptr,
|
||||
(CUDA_LONG)N);
|
||||
break;
|
||||
case sizeof(int16_t):
|
||||
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
|
||||
num_inputs,
|
||||
reinterpret_cast<int16_t*>(output_data),
|
||||
input_ptr,
|
||||
(CUDA_LONG)N);
|
||||
break;
|
||||
case sizeof(int32_t):
|
||||
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
|
||||
num_inputs,
|
||||
reinterpret_cast<int32_t*>(output_data),
|
||||
input_ptr,
|
||||
(CUDA_LONG)N);
|
||||
break;
|
||||
case sizeof(int64_t):
|
||||
_ConcatKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
concat_sizes, concat_sizes_range, axis_dimension_input_output_mapping,
|
||||
num_inputs,
|
||||
reinterpret_cast<int64_t*>(output_data),
|
||||
input_ptr,
|
||||
(CUDA_LONG)N);
|
||||
break;
|
||||
default:
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported for Concat operator");
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
24
onnxruntime/core/providers/cuda/tensor/concat_impl.h
Normal file
24
onnxruntime/core/providers/cuda/tensor/concat_impl.h
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "core/providers/cuda/shared_inc/cuda_utils.h"
|
||||
#include "core/common/common.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
Status ConcatImpl(const size_t element_bytes,
|
||||
const int block_size_including_axis_dim,
|
||||
const int block_size_inside_axis_dim,
|
||||
const int64_t* concat_sizes,
|
||||
const int64_t* concat_sizes_range,
|
||||
const int64_t* axis_dimension_input_output_mapping,
|
||||
const int num_inputs,
|
||||
void* output_data,
|
||||
const void** input_ptr,
|
||||
const size_t N);
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -44,6 +44,8 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
|
|||
int device_id = GetDeviceId();
|
||||
CudaAsyncBuffer<void*> output_ptr(this, device_id, num_outputs);
|
||||
gsl::span<void*> output_ptr_span = output_ptr.CpuSpan();
|
||||
std::vector<int64_t> axis_dimension_input_output_mapping(input_dims[axis]);
|
||||
int index = 0;
|
||||
for (int i = 0; i < num_outputs; ++i) {
|
||||
// update size of dimension for axis we're splitting on
|
||||
auto split_size = gsl::narrow<int>(split_sizes[i]);
|
||||
|
|
@ -52,6 +54,9 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
|
|||
Tensor* output = ctx->Output(i, TensorShape{output_dimensions});
|
||||
auto output_data = output->MutableDataRaw();
|
||||
output_ptr_span[i] = output_data;
|
||||
for (int j = 0; j < split_size; ++j) {
|
||||
axis_dimension_input_output_mapping.at(index++) = i;
|
||||
}
|
||||
}
|
||||
output_ptr.CopyToGpu();
|
||||
|
||||
|
|
@ -65,12 +70,16 @@ Status Split::ComputeInternal(OpKernelContext* ctx) const {
|
|||
CudaAsyncBuffer<int64_t> split_sizes_range_gpu(this, device_id, split_sizes_range);
|
||||
split_sizes_range_gpu.CopyToGpu();
|
||||
|
||||
CudaAsyncBuffer<int64_t> axis_dimension_input_output_mapping_gpu(this, device_id, axis_dimension_input_output_mapping);
|
||||
axis_dimension_input_output_mapping_gpu.CopyToGpu();
|
||||
|
||||
size_t element_size = input_tensor->DataType()->Size();
|
||||
ORT_RETURN_IF_ERROR(SplitImpl(element_size,
|
||||
block_size_including_axis_dim,
|
||||
block_size_inside_axis_dim,
|
||||
split_sizes_gpu.GpuPtr(),
|
||||
split_sizes_range_gpu.GpuPtr(),
|
||||
axis_dimension_input_output_mapping_gpu.GpuPtr(),
|
||||
num_outputs,
|
||||
input_data,
|
||||
output_ptr.GpuPtr(),
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div
|
|||
const fast_divmod block_size_inside_axis_dim_div,
|
||||
const int64_t* split_sizes,
|
||||
const int64_t* split_sizes_range,
|
||||
const int64_t* axis_dimension_input_output_mapping,
|
||||
const int num_outputs,
|
||||
const T* input_data,
|
||||
void** output_ptr,
|
||||
|
|
@ -24,20 +25,12 @@ __global__ void _SplitKernel(const fast_divmod block_size_including_axis_dim_div
|
|||
int block_index = 0;
|
||||
int offset = 0;
|
||||
|
||||
int output_index = 0;
|
||||
int block_offset = 0;
|
||||
|
||||
block_size_including_axis_dim_div.divmod(id, outter_block_index, offset);
|
||||
block_size_inside_axis_dim_div.divmod(offset, block_index, offset);
|
||||
|
||||
for (int i = 0; i < num_outputs; ++i) {
|
||||
int64_t range_left = (i == 0) ? 0 : split_sizes_range[i - 1];
|
||||
if ((range_left <= block_index) && (block_index < split_sizes_range[i])) {
|
||||
output_index = i;
|
||||
block_offset = block_index - range_left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
int output_index = axis_dimension_input_output_mapping[block_index];
|
||||
int64_t range_left = (output_index == 0) ? 0 : split_sizes_range[output_index - 1];
|
||||
int block_offset = block_index - range_left;
|
||||
|
||||
output_pos = (outter_block_index * split_sizes[output_index] + block_offset) *
|
||||
block_size_inside_axis_dim_div.d_ +
|
||||
|
|
@ -51,6 +44,7 @@ Status SplitImpl(const size_t element_size,
|
|||
const int block_size_inside_axis_dim,
|
||||
const int64_t* split_sizes,
|
||||
const int64_t* split_sizes_range,
|
||||
const int64_t* axis_dimension_input_output_mapping,
|
||||
const int num_outputs,
|
||||
const void* input_data,
|
||||
void** output_ptr,
|
||||
|
|
@ -64,7 +58,7 @@ Status SplitImpl(const size_t element_size,
|
|||
case sizeof(int8_t):
|
||||
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
split_sizes, split_sizes_range, num_outputs,
|
||||
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
|
||||
reinterpret_cast<const ToCudaType<int8_t>::MappedType*>(input_data),
|
||||
output_ptr,
|
||||
(CUDA_LONG)N);
|
||||
|
|
@ -72,7 +66,7 @@ Status SplitImpl(const size_t element_size,
|
|||
case sizeof(int16_t):
|
||||
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
split_sizes, split_sizes_range, num_outputs,
|
||||
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
|
||||
reinterpret_cast<const ToCudaType<int16_t>::MappedType*>(input_data),
|
||||
output_ptr,
|
||||
(CUDA_LONG)N);
|
||||
|
|
@ -80,7 +74,7 @@ Status SplitImpl(const size_t element_size,
|
|||
case sizeof(int32_t):
|
||||
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
split_sizes, split_sizes_range, num_outputs,
|
||||
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
|
||||
reinterpret_cast<const ToCudaType<int32_t>::MappedType*>(input_data),
|
||||
output_ptr,
|
||||
(CUDA_LONG)N);
|
||||
|
|
@ -88,7 +82,7 @@ Status SplitImpl(const size_t element_size,
|
|||
case sizeof(int64_t):
|
||||
_SplitKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
|
||||
block_size_including_axis_dim_div, block_size_inside_axis_dim_div,
|
||||
split_sizes, split_sizes_range, num_outputs,
|
||||
split_sizes, split_sizes_range, axis_dimension_input_output_mapping, num_outputs,
|
||||
reinterpret_cast<const ToCudaType<int64_t>::MappedType*>(input_data),
|
||||
output_ptr,
|
||||
(CUDA_LONG)N);
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ Status SplitImpl(const size_t element_size,
|
|||
const int block_size_inside_axis_dim,
|
||||
const int64_t* split_sizes,
|
||||
const int64_t* split_sizes_range,
|
||||
const int64_t* axis_dimension_input_output_mapping,
|
||||
const int num_outputs,
|
||||
const void* input_data,
|
||||
void** output_ptr,
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ namespace test {
|
|||
// Some of the tests can't run on TensorrtExecutionProvider because of unsupported data types or limits
|
||||
// in its parser: axis >=0 && axis < nbDims. Those Tests will fallback to other EPs
|
||||
|
||||
TEST(MathOpTest, Concat1D_string) {
|
||||
TEST(ConcatOpTest, Concat1D_string) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{0});
|
||||
|
||||
|
|
@ -21,7 +21,7 @@ TEST(MathOpTest, Concat1D_string) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat1D_int32) {
|
||||
TEST(ConcatOpTest, Concat1D_int32) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{0});
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ TEST(MathOpTest, Concat1D_int32) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat1D_int32_negative_axis) {
|
||||
TEST(ConcatOpTest, Concat1D_int32_negative_axis) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{-1});
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ TEST(MathOpTest, Concat1D_int32_negative_axis) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat1D_1) {
|
||||
TEST(ConcatOpTest, Concat1D_1) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{0});
|
||||
|
||||
|
|
@ -54,7 +54,7 @@ TEST(MathOpTest, Concat1D_1) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat1D_2) {
|
||||
TEST(ConcatOpTest, Concat1D_2) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{0});
|
||||
|
||||
|
|
@ -65,7 +65,7 @@ TEST(MathOpTest, Concat1D_2) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat2D_1) {
|
||||
TEST(ConcatOpTest, Concat2D_1) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{0});
|
||||
|
||||
|
|
@ -80,7 +80,7 @@ TEST(MathOpTest, Concat2D_1) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat2D_2) {
|
||||
TEST(ConcatOpTest, Concat2D_2) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{1});
|
||||
|
||||
|
|
@ -96,7 +96,7 @@ TEST(MathOpTest, Concat2D_2) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat2D_3) {
|
||||
TEST(ConcatOpTest, Concat2D_3) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{1});
|
||||
|
||||
|
|
@ -107,7 +107,7 @@ TEST(MathOpTest, Concat2D_3) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat3D_1) {
|
||||
TEST(ConcatOpTest, Concat3D_1) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{0});
|
||||
|
||||
|
|
@ -139,7 +139,7 @@ TEST(MathOpTest, Concat3D_1) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat3D_1_negative_axis) {
|
||||
TEST(ConcatOpTest, Concat3D_1_negative_axis) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{-3});
|
||||
|
||||
|
|
@ -171,7 +171,7 @@ TEST(MathOpTest, Concat3D_1_negative_axis) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat3D_2) {
|
||||
TEST(ConcatOpTest, Concat3D_2) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{1});
|
||||
|
||||
|
|
@ -203,7 +203,7 @@ TEST(MathOpTest, Concat3D_2) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(MathOpTest, Concat3D_3) {
|
||||
TEST(ConcatOpTest, Concat3D_3) {
|
||||
OpTester test("Concat");
|
||||
test.AddAttribute("axis", int64_t{1});
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue