diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.cc b/onnxruntime/core/providers/cpu/tensor/transpose.cc index df016c9f5a..16a516532b 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/cpu/tensor/transpose.cc @@ -15,7 +15,7 @@ namespace onnxruntime { // ComputeOffset: compute offset into a tensor. This is essentially the dot-product of // index and stride, restricted to the specified number of axes. -size_t ComputeOffset(const std::vector& index, const std::vector& stride, int64_t num_axes) { +static inline size_t ComputeOffset(const std::vector& index, const std::vector& stride, int64_t num_axes) { size_t offset = 0; for (int64_t j = 0; j < num_axes; ++j) { offset += index[j] * stride[j]; @@ -25,7 +25,7 @@ size_t ComputeOffset(const std::vector& index, const std::vector& index, const std::vector& upper_bound, int64_t num_axes) { +static inline void IncrementIndex(std::vector& index, const std::vector& upper_bound, int64_t num_axes) { for (int64_t k = num_axes - 1; k >= 0; --k) { index[k]++; if (index[k] < upper_bound[k]) break; @@ -33,13 +33,26 @@ void IncrementIndex(std::vector& index, const std::vector& upp } } +// DoTransposeSingleBlock: specialization of DoTranspose for the num_blocks=1 case. +// copies source tensor to target, transposing elements. +static inline void DoTransposeSingleBlock(size_t num_elts_in_block, const void* source, void* target, + size_t element_size) { + size_t blocksize = num_elts_in_block * element_size; + // copy + memcpy(target, source, blocksize); +} + +static inline void DoTransposeSingleBlock(size_t num_elts_in_block, const std::string* source, std::string* target) { + const std::string* end = source + num_elts_in_block; + std::copy(source, end, target); +} + // DoTranspose: copies source tensor to target, transposing elements. // The stride vector indicates the transposition. -template static void DoTransposeImpl(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, size_t num_elts_in_block, const std::vector& stride, - const T* source, T* target) { - size_t blocksize = num_elts_in_block * sizeof(float); + const uint8_t* source, uint8_t* target, size_t element_size) { + size_t blocksize = num_elts_in_block * element_size; // index used to iterate over target iteration-space std::vector target_index(num_axes, 0); for (size_t i = 0; i < num_blocks; ++i) { @@ -47,7 +60,25 @@ static void DoTransposeImpl(int64_t num_axes, const std::vector& target size_t source_offset = ComputeOffset(target_index, stride, num_axes); // copy - memcpy(target, source + source_offset, blocksize); + memcpy(target, source + source_offset * element_size, blocksize); + + // increment target_index: + IncrementIndex(target_index, target_dims, num_axes); + target += blocksize; + } +} + +static void DoTransposeImpl(int64_t num_axes, const std::vector& target_dims, + size_t num_blocks, size_t num_elts_in_block, const std::vector& stride, + const std::string* source, std::string* target) { + // index used to iterate over target iteration-space + std::vector target_index(num_axes, 0); + for (size_t i = 0; i < num_blocks; ++i) { + // convert target_index into an offset in source data + size_t source_offset = ComputeOffset(target_index, stride, num_axes); + + // copy + DoTransposeSingleBlock(num_elts_in_block, source + source_offset, target); // increment target_index: IncrementIndex(target_index, target_dims, num_axes); @@ -55,12 +86,80 @@ static void DoTransposeImpl(int64_t num_axes, const std::vector& target } } +template +inline void CopyPrim(uint8_t* target, const uint8_t* source) { + *reinterpret_cast(target) = *reinterpret_cast(source); +} + // DoTransposeEltWise: specialization of DoTranspose for the num_elts_in_block=1 case. // copies source tensor to target, transposing elements. // The stride vector indicates the transposition. -template static void DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, - const std::vector& stride, const T* source, T* target) { + const std::vector& stride, const uint8_t* source, uint8_t* target, + size_t element_size) { + // index used to iterate over target iteration-space + std::vector target_index(num_axes, 0); + + switch (element_size) { + case sizeof(uint64_t): + for (size_t i = 0; i < num_blocks; ++i) { + // convert target_index into an offset in source data + size_t source_offset = ComputeOffset(target_index, stride, num_axes); + + // copy + CopyPrim(target, source + (source_offset * element_size)); + + // increment target_index: + IncrementIndex(target_index, target_dims, num_axes); + target += element_size; + } + break; + case sizeof(uint32_t): + for (size_t i = 0; i < num_blocks; ++i) { + // convert target_index into an offset in source data + size_t source_offset = ComputeOffset(target_index, stride, num_axes); + + // copy + CopyPrim(target, source + (source_offset * element_size)); + + // increment target_index: + IncrementIndex(target_index, target_dims, num_axes); + target += element_size; + } + break; + case sizeof(uint16_t): + for (size_t i = 0; i < num_blocks; ++i) { + // convert target_index into an offset in source data + size_t source_offset = ComputeOffset(target_index, stride, num_axes); + + // copy + CopyPrim(target, source + (source_offset * element_size)); + + // increment target_index: + IncrementIndex(target_index, target_dims, num_axes); + target += element_size; + } + break; + case sizeof(uint8_t): + for (size_t i = 0; i < num_blocks; ++i) { + // convert target_index into an offset in source data + size_t source_offset = ComputeOffset(target_index, stride, num_axes); + + // copy + *target = *(source + (source_offset * element_size)); + + // increment target_index: + IncrementIndex(target_index, target_dims, num_axes); + target += element_size; + } + break; + default: + assert(false); + } +} + +static void DoTransposeEltWise(int64_t num_axes, const std::vector& target_dims, size_t num_blocks, + const std::vector& stride, const std::string* source, std::string* target) { // index used to iterate over target iteration-space std::vector target_index(num_axes, 0); for (size_t i = 0; i < num_blocks; ++i) { @@ -76,21 +175,14 @@ static void DoTransposeEltWise(int64_t num_axes, const std::vector& tar } } -// DoTransposeSingleBlock: specialization of DoTranspose for the num_blocks=1 case. -// copies source tensor to target, transposing elements. -template -static void DoTransposeSingleBlock(size_t num_elts_in_block, const T* source, T* target) { - size_t blocksize = num_elts_in_block * sizeof(T); - // copy - memcpy(target, source, blocksize); -} - -template -static Status DoTypedTranspose(const std::vector& permutations, const Tensor& input, Tensor& output) { +static Status DoUntypedTranspose(const std::vector& permutations, const Tensor& input, Tensor& output) { const auto& input_shape = input.Shape(); const auto& input_dims = input_shape.GetDims(); auto rank = input_shape.NumDimensions(); + const auto element_size = input.DataType()->Size(); + const bool is_string_type = input.DataType() == DataTypeImpl::GetType(); + std::vector stride(rank); for (int i = 0; i < rank; i++) { size_t inpdim = permutations[i]; @@ -118,17 +210,31 @@ static Status DoTypedTranspose(const std::vector& permutations, const T } } - const T* input_data = input.Data(); - T* output_data = output.MutableData(); - - if (1 == prefix_blocksize) - DoTransposeSingleBlock(suffix_blocksize, input_data, output_data); - else if (1 == suffix_blocksize) - DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, - input_data, output_data); - else - DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, - input_data, output_data); + if (is_string_type) { + const std::string* input_data = input.template Data(); + std::string* output_data = output.template MutableData(); + if (1 == prefix_blocksize) { + DoTransposeSingleBlock(suffix_blocksize, input_data, output_data); + } else if (1 == suffix_blocksize) { + DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, + input_data, output_data); + } else { + DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, + input_data, output_data); + } + } else { + const uint8_t* input_data = reinterpret_cast(input.DataRaw()); + uint8_t* output_data = reinterpret_cast(output.MutableDataRaw()); + if (1 == prefix_blocksize) { + DoTransposeSingleBlock(suffix_blocksize, input_data, output_data, element_size); + } else if (1 == suffix_blocksize) { + DoTransposeEltWise(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, stride, + input_data, output_data, element_size); + } else { + DoTransposeImpl(num_axes_in_prefix, output.Shape().GetDims(), prefix_blocksize, suffix_blocksize, stride, + input_data, output_data, element_size); + } + } return Status::OK(); } @@ -143,14 +249,13 @@ Status TransposeBase::DoTranspose(const std::vector& permutations, cons status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Mismatched data types between input and output Tensors. ", input_type, " != ", output_type); } else { - DispatchOnTensorTypeWithReturn(input_type, status, DoTypedTranspose, permutations, input, output); + status = DoUntypedTranspose(permutations, input, output); } return status; } -template <> -Status Transpose::Compute(OpKernelContext* ctx) const { +Status Transpose::Compute(OpKernelContext* ctx) const { // Get input and output: const Tensor* input_tensor_ptr = ctx->Input(0); ORT_ENFORCE(input_tensor_ptr != nullptr); @@ -167,7 +272,7 @@ Status Transpose::Compute(OpKernelContext* ctx) const { TensorShape output_shape{output_dims}; Tensor& Y = *ctx->Output(0, output_shape); - DoTypedTranspose(*p_perm, X, Y); + DoUntypedTranspose(*p_perm, X, Y); return Status::OK(); } @@ -175,7 +280,7 @@ Status Transpose::Compute(OpKernelContext* ctx) const { ONNX_CPU_OPERATOR_KERNEL( Transpose, 1, - KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), - Transpose); + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllTensorTypes()), + Transpose); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/tensor/transpose.h b/onnxruntime/core/providers/cpu/tensor/transpose.h index a583d61f58..8be880a712 100644 --- a/onnxruntime/core/providers/cpu/tensor/transpose.h +++ b/onnxruntime/core/providers/cpu/tensor/transpose.h @@ -65,7 +65,6 @@ class TransposeBase { std::vector perm_; }; -template class Transpose final : public OpKernel, public TransposeBase { public: Transpose(const OpKernelInfo& info) : OpKernel(info), TransposeBase(info) {} diff --git a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc index 995a2c8771..fb624ea478 100644 --- a/onnxruntime/test/providers/cpu/tensor/transpose_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/transpose_test.cc @@ -7,16 +7,17 @@ namespace onnxruntime { namespace test { +template void TransposeTest(std::vector& input_shape, - std::vector& input_vals, + std::vector& input_vals, std::vector* p_perm, std::vector expected_shape, - std::initializer_list& expected_vals) { + std::initializer_list& expected_vals) { OpTester test("Transpose"); if (nullptr != p_perm) test.AddAttribute("perm", *p_perm); - test.AddInput("X", input_shape, input_vals); - test.AddOutput("Y", expected_shape, expected_vals); + test.AddInput("X", input_shape, input_vals); + test.AddOutput("Y", expected_shape, expected_vals); test.Run(); } @@ -36,6 +37,21 @@ TEST(TransposeOpTest, TwoDimNoAttr) { TransposeTest(input_shape, input_vals, nullptr, expected_shape, expected_vals); } +TEST(TransposeOpTest, TwoDimNoAttrStr) { + std::vector input_shape({2, 3}); + std::vector input_vals = { + "1", "2", "3", + "4", "5", "6"}; + + std::vector expected_shape({3, 2}); + std::initializer_list expected_vals = { + "1", "4", + "2", "5", + "3", "6"}; + + TransposeTest(input_shape, input_vals, nullptr, expected_shape, expected_vals); +} + // Test 2 dimensional transpose, with permutation attribute specified TEST(TransposeOpTest, TwoDim) { std::vector input_shape({2, 3}); @@ -53,6 +69,22 @@ TEST(TransposeOpTest, TwoDim) { TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals); } +TEST(TransposeOpTest, TwoDimStr) { + std::vector input_shape({2, 3}); + std::vector input_vals = { + "1", "2", "3", + "4", "5", "6"}; + + std::vector perm = {1, 0}; + std::vector expected_shape({3, 2}); + std::initializer_list expected_vals = { + "1", "4", + "2", "5", + "3", "6"}; + + TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals); +} + // Test 3 dimensional transpose, with permutation attribute specified TEST(TransposeOpTest, ThreeDim) { std::vector input_shape({4, 2, 3}); @@ -105,5 +137,56 @@ TEST(TransposeOpTest, ThreeDim) { TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals); } +TEST(TransposeOpTest, ThreeDimStr) { + std::vector input_shape({4, 2, 3}); + std::vector input_vals = { + "1", "2", "3", + "4", "5", "6", + + "1", "2", "3", + "4", "5", "6", + + "1", "2", "3", + "4", "5", "6", + + "1", "2", "3", + "4", "5", "6"}; + + std::vector perm = {0, 2, 1}; + std::vector expected_shape({4, 3, 2}); + std::initializer_list expected_vals = { + "1", + "4", + "2", + "5", + "3", + "6", + + "1", + "4", + "2", + "5", + "3", + "6", + + "1", + "4", + "2", + "5", + "3", + "6", + + "1", + "4", + "2", + "5", + "3", + "6" + + }; + + TransposeTest(input_shape, input_vals, &perm, expected_shape, expected_vals); +} + } // namespace test } // namespace onnxruntime