From 4d71958ccff308de8f26eba8a7f5c72a895c1ab3 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 7 Apr 2020 15:22:04 -0700 Subject: [PATCH] Use IMMA for int8 matmul to leverage Turing Tensor Core (#3413) Use IMMA for int8 matmul to leverage Turing Tensor Core Format files under onnxruntime/core/providers/cude --- cmake/CMakeLists.txt | 5 + onnxruntime/core/providers/cuda/cuda_common.h | 9 +- .../providers/cuda/cuda_execution_provider.cc | 12 +- .../providers/cuda/cuda_execution_provider.h | 15 ++ onnxruntime/core/providers/cuda/cuda_pch.h | 5 + .../core/providers/cuda/cudnn_common.cc | 4 +- .../cuda/generator/constant_of_shape.cc | 10 +- .../cuda/generator/constant_of_shape.h | 2 +- .../providers/cuda/generator/range_impl.h | 1 - .../core/providers/cuda/gpu_data_transfer.cc | 6 +- onnxruntime/core/providers/cuda/igemm.cc | 177 ++++++++++++++++++ onnxruntime/core/providers/cuda/igemm.h | 31 +++ .../cuda/math/binary_elementwise_ops.h | 6 +- .../cuda/math/binary_elementwise_ops_impl.h | 22 +-- .../providers/cuda/math/matmul_integer.cc | 23 +++ onnxruntime/core/providers/cuda/math/topk.cc | 4 +- onnxruntime/core/providers/cuda/math/topk.h | 2 +- onnxruntime/core/providers/cuda/nn/conv.h | 8 +- onnxruntime/core/providers/cuda/nn/pool.cc | 6 +- .../cuda/object_detection/roialign.cc | 9 +- .../cuda/object_detection/roialign.h | 4 +- .../providers/cuda/reduction/reduction_ops.cc | 1 - .../core/providers/cuda/rnn/cudnn_rnn_base.h | 2 +- onnxruntime/core/providers/cuda/rnn/lstm.h | 1 - .../core/providers/cuda/rnn/rnn_impl.h | 2 +- .../providers/cuda/shared_inc/cuda_utils.h | 2 +- .../providers/cuda/shared_inc/fpgeneric.h | 2 +- .../core/providers/cuda/tensor/expand.cc | 13 +- .../core/providers/cuda/tensor/expand_impl.h | 1 - .../core/providers/cuda/tensor/eye_like.cc | 32 ++-- .../providers/cuda/tensor/eye_like_impl.h | 10 +- .../core/providers/cuda/tensor/identity_op.cc | 4 +- .../core/providers/cuda/tensor/nonzero_impl.h | 11 +- .../core/providers/cuda/tensor/nonzero_op.h | 4 +- .../providers/cuda/tensor/reverse_sequence.cc | 2 +- .../providers/cuda/tensor/scatter_elements.h | 1 - .../cuda/tensor/scatter_elements_impl.h | 1 - .../core/providers/cuda/tensor/shape_op.cc | 2 +- .../core/providers/cuda/tensor/slice.h | 2 +- .../core/providers/cuda/tensor/split_impl.h | 2 +- .../core/providers/cuda/tensor/tile.cc | 20 +- .../core/providers/cuda/tensor/where.cc | 8 +- 42 files changed, 371 insertions(+), 113 deletions(-) create mode 100644 onnxruntime/core/providers/cuda/igemm.cc create mode 100644 onnxruntime/core/providers/cuda/igemm.h mode change 100755 => 100644 onnxruntime/core/providers/cuda/tensor/scatter_elements.h mode change 100755 => 100644 onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index fdfd64cb83..c72ef5338b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -757,6 +757,11 @@ if (onnxruntime_USE_CUDA) file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) set(ONNXRUNTIME_CUDA_LIBRARIES ${CUDA_LIBRARIES}) list(APPEND ONNXRUNTIME_CUDA_LIBRARIES cublas cudnn curand) + if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL "10.1.0") + list(APPEND ONNXRUNTIME_CUDA_LIBRARIES cublasLt) + else() + message(WARNING "cublasLT is not supported in CUDA with version lower than 10.1.") + endif() if (WIN32) link_directories(${onnxruntime_CUDNN_HOME}/lib/x64) diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h index 0274273190..0fa130328a 100644 --- a/onnxruntime/core/providers/cuda/cuda_common.h +++ b/onnxruntime/core/providers/cuda/cuda_common.h @@ -159,13 +159,20 @@ class CudaKernel : public OpKernel { return provider_->PerThreadCublasHandle(); } +#if CUDA_VERSION >= 10010 + inline cublasLtHandle_t CublasLtHandle() const { + return provider_->PerThreadCublasLtHandle(); + } +#endif + inline cudnnHandle_t CudnnHandle() const { return provider_->PerThreadCudnnHandle(); } + inline curandGenerator_t CurandGenerator() const { return provider_->PerThreadCurandGenerator(); } - + template inline const T* GetConstOnes(size_t count) const { return provider_->template GetConstOnes(count); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 514d6d50d6..6e1b5b6eac 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -50,6 +50,9 @@ thread_local std::unique_ptr CUDAExe CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit) { CUDA_CALL_THROW(cudaSetDevice(device_id)); CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); + #if CUDA_VERSION >= 10010 + CUBLAS_CALL_THROW(cublasLtCreate(&cublasLt_handle_)); + #endif CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_)); CURAND_CALL_THROW(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); @@ -69,6 +72,14 @@ CUDAExecutionProvider::PerThreadContext::~PerThreadContext() { LOGS_DEFAULT(ERROR) << "cublasDestroy threw:" << ex.what(); } +#if CUDA_VERSION >= 10010 + try { + CUBLAS_CALL(cublasLtDestroy(cublasLt_handle_)); + } catch (const std::exception& ex) { + LOGS_DEFAULT(ERROR) << "cublasLtDestroy threw:" << ex.what(); + } +#endif + try { CUDNN_CALL(cudnnDestroy(cudnn_handle_)); } catch (const std::exception& ex) { @@ -743,7 +754,6 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, int8_t, ReduceMin); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 12, uint8_t, ReduceMin); - static void RegisterCudaKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 3f119fa379..c4501ae23c 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -44,6 +44,12 @@ class CUDAExecutionProvider : public IExecutionProvider { return GetPerThreadContext().CublasHandle(); } +#if CUDA_VERSION >= 10010 + cublasLtHandle_t PerThreadCublasLtHandle() { + return GetPerThreadContext().CublasLtHandle(); + } +#endif + cudnnHandle_t PerThreadCudnnHandle() { return GetPerThreadContext().CudnnHandle(); } @@ -95,6 +101,12 @@ class CUDAExecutionProvider : public IExecutionProvider { return cublas_handle_; } +#if CUDA_VERSION >= 10010 + cublasLtHandle_t CublasLtHandle() const { + return cublasLt_handle_; + } +#endif + cudnnHandle_t CudnnHandle() const { return cudnn_handle_; } @@ -135,6 +147,9 @@ class CUDAExecutionProvider : public IExecutionProvider { private: cublasHandle_t cublas_handle_ = nullptr; +#if CUDA_VERSION >= 10010 + cublasLtHandle_t cublasLt_handle_ = nullptr; +#endif cudnnHandle_t cudnn_handle_ = nullptr; curandGenerator_t curand_generator_ = nullptr; diff --git a/onnxruntime/core/providers/cuda/cuda_pch.h b/onnxruntime/core/providers/cuda/cuda_pch.h index 3235505060..0c16cc0d17 100644 --- a/onnxruntime/core/providers/cuda/cuda_pch.h +++ b/onnxruntime/core/providers/cuda/cuda_pch.h @@ -14,6 +14,11 @@ #include #include +// support of cublasLt starts 10.1 +#if CUDA_VERSION >= 10010 +#include +#endif + #ifdef USE_NCCL #include #endif diff --git a/onnxruntime/core/providers/cuda/cudnn_common.cc b/onnxruntime/core/providers/cuda/cudnn_common.cc index d0d50718ec..de8669cc6b 100644 --- a/onnxruntime/core/providers/cuda/cudnn_common.cc +++ b/onnxruntime/core/providers/cuda/cudnn_common.cc @@ -115,12 +115,12 @@ Status CudnnFilterDescriptor::Set(const std::vector& filter_dims, cudnn template cudnnDataType_t CudnnTensor::GetDataType() { ORT_THROW("cuDNN engine currently supports only single/double/half/int8/uint8 precision data types. Got:", - typeid(ElemType).name()); + typeid(ElemType).name()); // Not reachable but GCC complains return 0; } -template<> +template <> cudnnDataType_t CudnnTensor::GetDataType() { return CUDNN_DATA_FLOAT; } diff --git a/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc b/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc index f598a0337b..52d3450230 100644 --- a/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc +++ b/onnxruntime/core/providers/cuda/generator/constant_of_shape.cc @@ -29,11 +29,11 @@ Status ConstantOfShape::ComputeInternal(OpKernelContext* ctx) const { const void* value_ptr = GetValuePtr(); const auto element_size = output_tensor->DataType()->Size(); -#define CASE(TYPE) \ - case sizeof(TYPE): \ - if (size > 0) { \ - cuda::Fill(reinterpret_cast(output_data), *(reinterpret_cast(value_ptr)), size); \ - } \ +#define CASE(TYPE) \ + case sizeof(TYPE): \ + if (size > 0) { \ + cuda::Fill(reinterpret_cast(output_data), *(reinterpret_cast(value_ptr)), size); \ + } \ break; switch (element_size) { diff --git a/onnxruntime/core/providers/cuda/generator/constant_of_shape.h b/onnxruntime/core/providers/cuda/generator/constant_of_shape.h index caa41aedb0..7443a1753e 100644 --- a/onnxruntime/core/providers/cuda/generator/constant_of_shape.h +++ b/onnxruntime/core/providers/cuda/generator/constant_of_shape.h @@ -15,7 +15,7 @@ namespace cuda { class ConstantOfShape final : public ConstantOfShapeBase, public CudaKernel { public: - explicit ConstantOfShape(const OpKernelInfo& info) : ConstantOfShapeBase(info), CudaKernel(info) {}; + explicit ConstantOfShape(const OpKernelInfo& info) : ConstantOfShapeBase(info), CudaKernel(info){}; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ConstantOfShape); diff --git a/onnxruntime/core/providers/cuda/generator/range_impl.h b/onnxruntime/core/providers/cuda/generator/range_impl.h index 684978d544..9ab5521022 100644 --- a/onnxruntime/core/providers/cuda/generator/range_impl.h +++ b/onnxruntime/core/providers/cuda/generator/range_impl.h @@ -7,7 +7,6 @@ namespace onnxruntime { namespace cuda { - template bool RangeImpl(const T start, const T delta, const int count, T* output); diff --git a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc index 8fae7ae8b0..28430bd078 100644 --- a/onnxruntime/core/providers/cuda/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/cuda/gpu_data_transfer.cc @@ -18,8 +18,10 @@ GPUDataTransfer::~GPUDataTransfer() { } bool GPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return src_device.Type() == OrtDevice::GPU || src_device.MemType() == OrtDevice::MemType::CUDA_PINNED - || dst_device.Type() == OrtDevice::GPU || dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED; + return src_device.Type() == OrtDevice::GPU || + src_device.MemType() == OrtDevice::MemType::CUDA_PINNED || + dst_device.Type() == OrtDevice::GPU || + dst_device.MemType() == OrtDevice::MemType::CUDA_PINNED; } common::Status GPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst, int exec_queue_id) const { diff --git a/onnxruntime/core/providers/cuda/igemm.cc b/onnxruntime/core/providers/cuda/igemm.cc new file mode 100644 index 0000000000..68bc9f4b19 --- /dev/null +++ b/onnxruntime/core/providers/cuda/igemm.cc @@ -0,0 +1,177 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "igemm.h" + +#include "core/providers/cuda/cuda_common.h" +#include "core/providers/cuda/shared_inc/cuda_call.h" + +namespace onnxruntime { +namespace cuda { + +#if CUDA_VERSION >= 10010 +void LtIgemmTensor(int m, + int n, + int k, + int32_t alpha_matmul, + int32_t beta_matmul, + const int8_t* a, + int lda, + const int8_t* b, + int ldb, + int32_t* c, + int ldc, + const CudaKernel* cuda_kernel, + cublasLtHandle_t lt_handle) { + // Create descriptors for the original matrices + cublasLtMatrixLayout_t a_desc = nullptr; + cublasLtMatrixLayout_t b_desc = nullptr; + cublasLtMatrixLayout_t c_desc = nullptr; + CUBLAS_CALL_THROW(cublasLtMatrixLayoutCreate(&a_desc, CUDA_R_8I, m, k, lda)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutCreate(&b_desc, CUDA_R_8I, n, k, ldb)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutCreate(&c_desc, CUDA_R_32I, m, n, ldc)); + + // Set A and C row major order. + // No need for B because B need to be transposed + cublasLtOrder_t row_order = CUBLASLT_ORDER_ROW; + CUBLAS_CALL_THROW(cublasLtMatrixLayoutSetAttribute(a_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &row_order, sizeof(row_order))); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutSetAttribute(c_desc, CUBLASLT_MATRIX_LAYOUT_ORDER, &row_order, sizeof(row_order))); + + // The tensor operations IGEMM kernels require specialized memory order of data. + // Matrix A and Matrix C need to be in CUBLASLT_ORDER_COL32 order + // And Matric B needs to be in CUBLASLT_ORDER_COL4_4R2_8C order + + cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32; + cublasLtOrder_t order_COL4_4R2_8C = CUBLASLT_ORDER_COL4_4R2_8C; + + // For CUBLASLT_ORDER_COL32 order, Data is ordered in column-major ordered tiles of 32 columns. + // The leading dimension is the stride (in elements) to the beginning of next group of 32-columns. + + // For CUBLASLT_ORDER_COL4_4R2_8C, Data is ordered in column-major ordered tiles of composite tiles + // with total 32 columns and 8 rows. + // A tile is composed of interleaved inner tiles of 4 columns within 4 even or odd rows in an alternating pattern. + // The leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile + // for the next 32-wide group of columns. + int lda_transform = 32 * m; + int ldb_transform = 32 * roundoff(n, 8); + int ldc_transform = 32 * m; + + // Allocate memory for transform + IAllocatorUniquePtr a_transform = cuda_kernel->GetScratchBuffer(roundoff(k, 32) / 32 * lda_transform); + IAllocatorUniquePtr b_transform = cuda_kernel->GetScratchBuffer(roundoff(k, 32) / 32 * ldb_transform); + IAllocatorUniquePtr c_transform = cuda_kernel->GetScratchBuffer(roundoff(k, 32) / 32 * ldc_transform); + + // Create descriptors for the transformed matrices + cublasLtMatrixLayout_t a_transform_desc = nullptr; + cublasLtMatrixLayout_t b_transform_desc = nullptr; + cublasLtMatrixLayout_t c_transform_desc = nullptr; + CUBLAS_CALL_THROW(cublasLtMatrixLayoutCreate(&a_transform_desc, CUDA_R_8I, m, k, lda_transform)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutCreate(&b_transform_desc, CUDA_R_8I, n, k, ldb_transform)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutCreate(&c_transform_desc, CUDA_R_32I, m, n, ldc_transform)); + + CUBLAS_CALL_THROW(cublasLtMatrixLayoutSetAttribute(a_transform_desc, + CUBLASLT_MATRIX_LAYOUT_ORDER, + &order_COL32, + sizeof(order_COL32))); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutSetAttribute(b_transform_desc, + CUBLASLT_MATRIX_LAYOUT_ORDER, + &order_COL4_4R2_8C, + sizeof(order_COL4_4R2_8C))); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutSetAttribute(c_transform_desc, + CUBLASLT_MATRIX_LAYOUT_ORDER, + &order_COL32, + sizeof(order_COL32))); + + cublasLtMatrixTransformDesc_t transform_desc = nullptr; + CUBLAS_CALL_THROW(cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F)); + + float alpha_transform = 1.0f; + float beta_transform = 0.0f; + CUBLAS_CALL_THROW(cublasLtMatrixTransform(lt_handle, + transform_desc, + &alpha_transform, + a, + a_desc, + &beta_transform, + nullptr, + nullptr, + a_transform.get(), + a_transform_desc, + 0)); + + CUBLAS_CALL_THROW(cublasLtMatrixTransform(lt_handle, + transform_desc, + &alpha_transform, + b, + b_desc, + &beta_transform, + nullptr, + nullptr, + b_transform.get(), + b_transform_desc, + 0)); + + if (beta_matmul == 1) { + CUBLAS_CALL_THROW(cublasLtMatrixTransform(lt_handle, + transform_desc, + &alpha_transform, + c, + c_desc, + &beta_transform, + nullptr, + nullptr, + c_transform.get(), + c_transform_desc, + 0)); + } + + // Tensor op igemm kernels only support NT gemm + cublasLtMatmulDesc_t matmul_desc = nullptr; + cublasOperation_t op_trans = CUBLAS_OP_T; + CUBLAS_CALL_THROW(cublasLtMatmulDescCreate(&matmul_desc, CUDA_R_32I)); + CUBLAS_CALL_THROW(cublasLtMatmulDescSetAttribute(matmul_desc, + CUBLASLT_MATMUL_DESC_TRANSB, + &op_trans, + sizeof(op_trans))); + + CUBLAS_CALL_THROW(cublasLtMatmul(lt_handle, + matmul_desc, + &alpha_matmul, + a_transform.get(), + a_transform_desc, + b_transform.get(), + b_transform_desc, + &beta_matmul, + c_transform.get(), + c_transform_desc, + c_transform.get(), + c_transform_desc, + nullptr, + nullptr, + 0, + 0)); + + CUBLAS_CALL_THROW(cublasLtMatrixTransform(lt_handle, + transform_desc, + &alpha_transform, + c_transform.get(), + c_transform_desc, + &beta_transform, + nullptr, + nullptr, + c, + c_desc, + 0)); + + CUBLAS_CALL_THROW(cublasLtMatrixLayoutDestroy(c_transform_desc)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutDestroy(b_transform_desc)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutDestroy(a_transform_desc)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutDestroy(c_desc)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutDestroy(b_desc)); + CUBLAS_CALL_THROW(cublasLtMatrixLayoutDestroy(a_desc)); + CUBLAS_CALL_THROW(cublasLtMatmulDescDestroy(matmul_desc)); + CUBLAS_CALL_THROW(cublasLtMatrixTransformDescDestroy(transform_desc)); +} +#endif +} +} diff --git a/onnxruntime/core/providers/cuda/igemm.h b/onnxruntime/core/providers/cuda/igemm.h new file mode 100644 index 0000000000..4910e36c30 --- /dev/null +++ b/onnxruntime/core/providers/cuda/igemm.h @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/cuda/cuda_common.h" + +namespace onnxruntime { +namespace cuda { + +inline int roundoff(int v, int d) { + return (v + d - 1) / d * d; +} + +#if CUDA_VERSION >= 10010 +void LtIgemmTensor(int m, + int n, + int k, + int32_t alpha_matmul, + int32_t beta_matmul, + const int8_t* a, + int lda, + const int8_t* b, + int ldb, + int32_t* c, + int ldc, + const CudaKernel* cuda_kernel, + cublasLtHandle_t lt_handle); +#endif +} +} \ No newline at end of file diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h index f9ff9e4cfb..452eec6cb2 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops.h @@ -14,7 +14,7 @@ struct BinaryElementwisePreparation { const Tensor* lhs_tensor = nullptr; const Tensor* rhs_tensor = nullptr; Tensor* output_tensor = nullptr; - int32_t output_rank_or_simple_broadcast = 0; // for no_broadcast|left_scalar|right_scalar cases, output_rank uses SimpleBroadcast enums + int32_t output_rank_or_simple_broadcast = 0; // for no_broadcast|left_scalar|right_scalar cases, output_rank uses SimpleBroadcast enums TArray lhs_padded_strides; TArray rhs_padded_strides; @@ -42,8 +42,8 @@ struct BinaryElementwisePreparation { // early return if one operand is scalar if (lhs_shape.Size() == 1 || rhs_shape.Size() == 1) { output_rank_or_simple_broadcast = static_cast(lhs_shape.Size() == 1 - ? SimpleBroadcast::LeftScalar - : SimpleBroadcast::RightScalar); + ? SimpleBroadcast::LeftScalar + : SimpleBroadcast::RightScalar); return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h index ac717524b1..15a6ecd697 100644 --- a/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h +++ b/onnxruntime/core/providers/cuda/math/binary_elementwise_ops_impl.h @@ -34,18 +34,18 @@ namespace cuda { // NOTE that cu files are compiled with nvcc and should not refer to any onnxruntime headers // so struct BinaryElementwisePreparation cannot be used here -#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ - template \ - void Impl_##name( \ - int32_t output_rank_or_simple_broadcast, \ - const TArray* lhs_padded_strides, \ - const T* lhs_data, \ - const TArray* rhs_padded_strides, \ - const T* rhs_data, \ +#define BINARY_ELEMENTWISE_IMPL_DECLARATION(name) \ + template \ + void Impl_##name( \ + int32_t output_rank_or_simple_broadcast, \ + const TArray* lhs_padded_strides, \ + const T* lhs_data, \ + const TArray* rhs_padded_strides, \ + const T* rhs_data, \ const TArray* fdm_output_strides, \ - const fast_divmod& fdm_H, \ - const fast_divmod& fdm_C, \ - T* output_data, \ + const fast_divmod& fdm_H, \ + const fast_divmod& fdm_C, \ + T* output_data, \ size_t count) #define BINARY_OP_NAME_EXPR(name, expr) BINARY_ELEMENTWISE_IMPL_DECLARATION(name); diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cc b/onnxruntime/core/providers/cuda/math/matmul_integer.cc index e4547da039..61164a1ddb 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cc +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cc @@ -6,6 +6,7 @@ #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/cuda/shared_inc/fpgeneric.h" #include "core/providers/cuda/cuda_allocator.h" +#include "core/providers/cuda/igemm.h" #include "core/providers/common.h" namespace onnxruntime { @@ -106,6 +107,28 @@ Status MatMulInteger::ComputeInternal(OpKernelContext* ctx) cons beta = 1; } +#if CUDA_VERSION >= 10010 + if (DeviceProp::GetDeviceProps().major >= 7 && DeviceProp::GetDeviceProps().minor >= 5) { + for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { + LtIgemmTensor( + static_cast(helper.M()), + static_cast(helper.N()), + static_cast(helper.K()), + alpha, + beta, + a_ptr + helper.LeftOffsets()[batch], + static_cast(helper.K()), + b_ptr + helper.RightOffsets()[batch], + static_cast(helper.N()), + output_ptr + helper.OutputOffsets()[batch], + static_cast(helper.N()), + this, + Base::CublasLtHandle()); + } + return Status::OK(); + } +#endif + // pad A and B to make their leading dimension be multiples of 32 // because cublasGemmEx requires: // 1. leading dimension is multiples of 4 diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc index de78cd6833..a04bcd9b93 100644 --- a/onnxruntime/core/providers/cuda/math/topk.cc +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -10,7 +10,7 @@ namespace cuda { ONNX_OPERATOR_VERSIONED_KERNEL_EX( TopK, kOnnxDomain, - 1,9, + 1, 9, kCudaExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), TopK); @@ -18,7 +18,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( ONNX_OPERATOR_VERSIONED_KERNEL_EX( TopK, kOnnxDomain, - 10,10, + 10, 10, kCudaExecutionProvider, KernelDefBuilder().InputMemoryType(1).TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), TopK); diff --git a/onnxruntime/core/providers/cuda/math/topk.h b/onnxruntime/core/providers/cuda/math/topk.h index 944c95e369..caf6d02851 100644 --- a/onnxruntime/core/providers/cuda/math/topk.h +++ b/onnxruntime/core/providers/cuda/math/topk.h @@ -7,7 +7,7 @@ namespace onnxruntime { namespace cuda { -template +template class TopK final : public CudaKernel { public: TopK(const OpKernelInfo&); diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index ca3266c811..4bb529d58d 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -88,7 +88,7 @@ class lru_unordered_map { lru_list_.clear(); } -private: + private: using list_type = std::list; using iterator_type = typename list_type::iterator; struct value_type { @@ -126,12 +126,12 @@ struct CudnnConvState { CudnnConvolutionDescriptor conv_desc; struct PerfResultParams { - decltype(AlgoPerfType().algo) algo; - decltype(AlgoPerfType().memory) memory; + decltype(AlgoPerfType().algo) algo; + decltype(AlgoPerfType().memory) memory; decltype(AlgoPerfType().mathType) mathType; }; - lru_unordered_map, PerfResultParams, vector_hash> cached_benchmark_results { MAX_CACHED_ALGO_PERF_RESULTS }; + lru_unordered_map, PerfResultParams, vector_hash> cached_benchmark_results{MAX_CACHED_ALGO_PERF_RESULTS}; // note that conv objects are shared between execution frames, and a lock is needed to avoid multi-thread racing OrtMutex mutex; diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc b/onnxruntime/core/providers/cuda/nn/pool.cc index 367930d9b3..2b5830c5f3 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -32,7 +32,6 @@ namespace cuda { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()).TypeConstraint("I", DataTypeImpl::GetTensorType()), \ Pool); - POOLING_KERNEL_VERSIONED(AveragePool, float, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, double, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, MLFloat16, AveragePool, 7, 9) @@ -64,7 +63,6 @@ POOLING_KERNEL(MaxPool, MLFloat16, MaxPool<8>, 12) POOLING_KERNEL(MaxPool, int8_t, MaxPool<8>, 12) POOLING_KERNEL(MaxPool, uint8_t, MaxPool<8>, 12) - POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1) POOLING_KERNEL(GlobalMaxPool, double, MaxPool<1>, 1) POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1) @@ -167,8 +165,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const { cudnnPoolingMode_t mode = CUDNN_POOLING_MAX; if (PoolType::type == onnxruntime::PoolType::kAveragePool) { - mode = pool_attrs_.count_include_pad ? CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING - : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + mode = pool_attrs_.count_include_pad ? CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; } CudnnPoolingDescriptor pooling_desc; ORT_RETURN_IF_ERROR(pooling_desc.Set(mode, kernel_shape, pads, strides)); diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign.cc b/onnxruntime/core/providers/cuda/object_detection/roialign.cc index 5ca757382f..1d33617673 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cuda/object_detection/roialign.cc @@ -1,5 +1,5 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. #include "roialign.h" #include "roialign_impl.h" @@ -15,7 +15,7 @@ namespace cuda { T, \ kCudaExecutionProvider, \ KernelDefBuilder() \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T2", DataTypeImpl::GetTensorType()), \ RoiAlign); @@ -58,8 +58,7 @@ Status RoiAlign::ComputeInternal(OpKernelContext* context) const { num_roi_cols, reinterpret_cast::MappedType*>(Y.template MutableData()), this->mode_ == RoiAlignMode::avg, - batch_indices_ptr->template Data() - ); + batch_indices_ptr->template Data()); } return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/object_detection/roialign.h b/onnxruntime/core/providers/cuda/object_detection/roialign.h index fdd0f95ccf..4ef23f0380 100644 --- a/onnxruntime/core/providers/cuda/object_detection/roialign.h +++ b/onnxruntime/core/providers/cuda/object_detection/roialign.h @@ -1,5 +1,5 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. #pragma once diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index f080672777..728bab9a8a 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -59,7 +59,6 @@ namespace cuda { KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), \ name); - // CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so // it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor. // An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h index 41a6c94c8c..c0b5cb8a15 100644 --- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h +++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h @@ -91,7 +91,7 @@ class CudnnRnnBase : public CudaKernel { rnn_mode_ = CUDNN_LSTM; weight_cached_ = false; w_data_cache_ = nullptr; - + size_t state_size; cudnn_dropout_desc_.CreateDescriptorIfNeeded(); cudnn_dropout_desc_.GetCudnnDropoutStatesSize(CudnnHandle(), state_size); diff --git a/onnxruntime/core/providers/cuda/rnn/lstm.h b/onnxruntime/core/providers/cuda/rnn/lstm.h index 3ed12cfa7f..b8f8465962 100644 --- a/onnxruntime/core/providers/cuda/rnn/lstm.h +++ b/onnxruntime/core/providers/cuda/rnn/lstm.h @@ -10,7 +10,6 @@ namespace cuda { template class LSTM final : public CudnnRnnBase { - public: LSTM(const OpKernelInfo& info) : CudnnRnnBase(info) { CudnnRnnBase::SetRNNMode(CUDNN_LSTM); diff --git a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h index 78ceabf23b..1c4fbc5a24 100644 --- a/onnxruntime/core/providers/cuda/rnn/rnn_impl.h +++ b/onnxruntime/core/providers/cuda/rnn/rnn_impl.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace cuda { -template +template void ReverseBySequence(const int32_t seq_length, const int32_t batch_size, const int32_t input_or_hidden_size, diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index fa3955ce85..f499a5d1c3 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -48,7 +48,7 @@ struct TArray { ORT_ENFORCE(size <= capacity, "TArray size was set to ", size, ", exeeding the capacity limit of ", capacity); } - TArray(const std::vector& vec) : TArray(static_cast(vec.size())) { + TArray(const std::vector& vec) : TArray(static_cast(vec.size())) { memcpy(data_, vec.data(), vec.size() * sizeof(T)); } diff --git a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h index 0ff4447927..d56d4273f1 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h +++ b/onnxruntime/core/providers/cuda/shared_inc/fpgeneric.h @@ -79,7 +79,7 @@ inline cublasStatus_t cublasGemmStridedBatchedHelper(cublasHandle_t handle, const double* beta, double* C, int ldc, long long int strideC, - int batch_count){ + int batch_count) { return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batch_count); } diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc index 2b7ffd463f..ea92651db5 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand.cc +++ b/onnxruntime/core/providers/cuda/tensor/expand.cc @@ -22,8 +22,7 @@ static void CalcEffectiveDims(vector& x_dims, vector& y_dims) if (xdim == ydim || xdim == 1) { x_reverse.push_back(xdim); y_reverse.push_back(ydim); - } - else { // xdim < ydim && xdim > 1, split + } else { // xdim < ydim && xdim > 1, split ydim /= xdim; x_reverse.push_back(xdim); y_reverse.push_back(xdim); @@ -44,18 +43,15 @@ static void CalcEffectiveDims(vector& x_dims, vector& y_dims) } if (x_dims.back() == 1) { y_dims.back() *= y_reverse[i]; - } - else { + } else { x_dims.push_back(1); y_dims.push_back(y_reverse[i]); } - } - else { // x_reverse[i] == y_reverse[i] + } else { // x_reverse[i] == y_reverse[i] if (x_dims.back() == y_dims.back()) { x_dims.back() *= x_reverse[i]; y_dims.back() *= y_reverse[i]; - } - else { + } else { x_dims.push_back(x_reverse[i]); y_dims.push_back(y_reverse[i]); } @@ -107,7 +103,6 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const { input_strides); } - ONNX_OPERATOR_KERNEL_EX( Expand, kOnnxDomain, diff --git a/onnxruntime/core/providers/cuda/tensor/expand_impl.h b/onnxruntime/core/providers/cuda/tensor/expand_impl.h index 27d5d69d9c..77813a8b28 100644 --- a/onnxruntime/core/providers/cuda/tensor/expand_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/expand_impl.h @@ -20,6 +20,5 @@ Status ExpandImpl( const TArray& output_strides, const TArray& input_strides); - } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like.cc b/onnxruntime/core/providers/cuda/tensor/eye_like.cc index cdbbeb4fac..78c51d03b6 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like.cc +++ b/onnxruntime/core/providers/cuda/tensor/eye_like.cc @@ -21,25 +21,23 @@ ONNX_OPERATOR_KERNEL_EX( DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType() - }) - .TypeConstraint("T2", - std::vector{ - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType() - }), + DataTypeImpl::GetTensorType()}) + .TypeConstraint("T2", + std::vector{ + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), EyeLike); -#define TYPED_FUNCTION_CALL(T) \ - EyeLikeImpl::MappedType>( \ - offset, \ - dim1 + 1, \ - reinterpret_cast::MappedType *>(T2->template MutableData()), \ - diag_count); \ - break; +#define TYPED_FUNCTION_CALL(T) \ + EyeLikeImpl::MappedType>( \ + offset, \ + dim1 + 1, \ + reinterpret_cast::MappedType*>(T2->template MutableData()), \ + diag_count); \ + break; Status EyeLike::ComputeInternal(OpKernelContext* context) const { const auto* T1 = context->Input(0); diff --git a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h index f95ca63782..daf0d2b686 100644 --- a/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/eye_like_impl.h @@ -12,11 +12,11 @@ namespace cuda { template void EyeLikeImpl( - size_t offset, // offset of first element in diagnal - size_t stripe, // stripe, here it's width + 1 - T* output_data, // output buffer - size_t diag_count // total number of elements in diagnal -); + size_t offset, // offset of first element in diagnal + size_t stripe, // stripe, here it's width + 1 + T* output_data, // output buffer + size_t diag_count // total number of elements in diagnal + ); } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/tensor/identity_op.cc b/onnxruntime/core/providers/cuda/tensor/identity_op.cc index 890bdf5cac..3d5f456774 100644 --- a/onnxruntime/core/providers/cuda/tensor/identity_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/identity_op.cc @@ -11,8 +11,8 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 7, 9, kCudaExecutionProvider, KernelDefBuilder() - .TypeConstraint("T", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), + .TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}) .Alias(0, 0), IdentityOp); diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h index 7d55e83133..5288a64d18 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.h @@ -10,21 +10,20 @@ namespace cuda { int NonZeroCalcBlockCount(int64_t x_size); -cudaError_t NonZeroCalcPrefixSumTempStorageBytes(int* prefix_counts, int number_of_blocks, size_t& ); +cudaError_t NonZeroCalcPrefixSumTempStorageBytes(int* prefix_counts, int number_of_blocks, size_t&); cudaError_t NonZeroInclusivePrefixSum(void* d_temp_storage, size_t temp_storage_bytes, int* prefix_counts, int number_of_blocks); -// count nonzero elements in each block into counts_in_blocks, +// count nonzero elements in each block into counts_in_blocks, // the counts_in_blocks buffer is pre-allocated on gpu first. -template +template cudaError_t NonZeroCountEachBlock(const InputT* x, int64_t x_size, int* counts_in_blocks); // output nonzero positions using input x and prefix_counts for each blocks -template +template cudaError_t NonZeroOutputPositions( - const InputT *x, int64_t x_size, int x_rank, const TArray& x_strides, + const InputT* x, int64_t x_size, int x_rank, const TArray& x_strides, const int* prefix_counts, int nonzero_elements, int64_t* results); } // namespace cuda } // namespace onnxruntime - diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_op.h b/onnxruntime/core/providers/cuda/tensor/nonzero_op.h index 1091e6fd9e..e19d432e54 100644 --- a/onnxruntime/core/providers/cuda/tensor/nonzero_op.h +++ b/onnxruntime/core/providers/cuda/tensor/nonzero_op.h @@ -10,8 +10,8 @@ namespace onnxruntime { namespace cuda { template -class NonZero final: public CudaKernel { -public: +class NonZero final : public CudaKernel { + public: NonZero(const OpKernelInfo& info) : CudaKernel(info) {} Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc b/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc index f51d99c549..1a1f9afc24 100644 --- a/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc +++ b/onnxruntime/core/providers/cuda/tensor/reverse_sequence.cc @@ -20,7 +20,7 @@ ONNX_OPERATOR_KERNEL_EX( ReverseSequenceOp); #define ReverseSequenceCallCudaImplTypeAs(T, TEqual) \ - if (X.IsDataType()) { \ + if (X.IsDataType()) { \ CUDA_RETURN_IF_ERROR(ReverseSequenceCudaImpl( \ reinterpret_cast::MappedType*>(X.template Data()), \ seq_lengths.Data(), \ diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements.h b/onnxruntime/core/providers/cuda/tensor/scatter_elements.h old mode 100755 new mode 100644 index f70bf6b778..3cd55b8c9c --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements.h +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements.h @@ -24,4 +24,3 @@ class ScatterElements final : public CudaKernel { } // namespace cuda } // namespace onnxruntime - diff --git a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h old mode 100755 new mode 100644 index 2f08d542e0..5eea6ab808 --- a/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/scatter_elements_impl.h @@ -26,4 +26,3 @@ Status ScatterElementsImpl( } // namespace cuda } // namespace onnxruntime - diff --git a/onnxruntime/core/providers/cuda/tensor/shape_op.cc b/onnxruntime/core/providers/cuda/tensor/shape_op.cc index d1969785b4..be96ef1040 100644 --- a/onnxruntime/core/providers/cuda/tensor/shape_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/shape_op.cc @@ -14,7 +14,7 @@ ONNX_OPERATOR_KERNEL_EX( kCudaExecutionProvider, KernelDefBuilder() .OutputMemoryType(0) - .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()) .TypeConstraint("T1", DataTypeImpl::GetTensorType()), Shape); diff --git a/onnxruntime/core/providers/cuda/tensor/slice.h b/onnxruntime/core/providers/cuda/tensor/slice.h index e4c301ad41..c83b17c8fa 100644 --- a/onnxruntime/core/providers/cuda/tensor/slice.h +++ b/onnxruntime/core/providers/cuda/tensor/slice.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace cuda { -template +template class Slice final : public CudaKernel, public SliceBase { public: Slice(const OpKernelInfo& info) : CudaKernel(info), SliceBase(info, dynamic) {} diff --git a/onnxruntime/core/providers/cuda/tensor/split_impl.h b/onnxruntime/core/providers/cuda/tensor/split_impl.h index fa07a68fb5..4638aca421 100644 --- a/onnxruntime/core/providers/cuda/tensor/split_impl.h +++ b/onnxruntime/core/providers/cuda/tensor/split_impl.h @@ -9,7 +9,7 @@ namespace onnxruntime { namespace cuda { -Status SplitImpl(const size_t element_size, +Status SplitImpl(const size_t element_size, const int block_size_including_axis_dim, const int block_size_inside_axis_dim, const int64_t* split_sizes, diff --git a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc index a66a11bb43..5031e57bba 100644 --- a/onnxruntime/core/providers/cuda/tensor/tile.cc +++ b/onnxruntime/core/providers/cuda/tensor/tile.cc @@ -8,16 +8,16 @@ using namespace onnxruntime::common; namespace onnxruntime { namespace cuda { -#define REGISTER_KERNEL_TYPED(T) \ - ONNX_OPERATOR_TYPED_KERNEL_EX( \ - Tile, \ - kOnnxDomain, \ - 6, \ - T, \ - kCudaExecutionProvider, \ - KernelDefBuilder() \ - .InputMemoryType(1) \ - .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + Tile, \ + kOnnxDomain, \ + 6, \ + T, \ + kCudaExecutionProvider, \ + KernelDefBuilder() \ + .InputMemoryType(1) \ + .TypeConstraint("T", DataTypeImpl::GetTensorType()) \ .TypeConstraint("T1", DataTypeImpl::GetTensorType()), \ Tile); diff --git a/onnxruntime/core/providers/cuda/tensor/where.cc b/onnxruntime/core/providers/cuda/tensor/where.cc index 3fc0b9272e..19f889594f 100644 --- a/onnxruntime/core/providers/cuda/tensor/where.cc +++ b/onnxruntime/core/providers/cuda/tensor/where.cc @@ -68,10 +68,10 @@ struct TernaryElementwisePreparation { const Tensor* a_tensor = nullptr; const Tensor* b_tensor = nullptr; const Tensor* c_tensor = nullptr; - size_t output_rank_or_simple_broadcast = 0; // for no_broadcast cases, output_rank uses SimpleBroadcast enums - TArray a_padded_strides; // for a shape == output shape, this is nullptr - TArray b_padded_strides; // for b shape == output shape, this is nullptr - TArray c_padded_strides; // for c shape == output shape, this is nullptr + size_t output_rank_or_simple_broadcast = 0; // for no_broadcast cases, output_rank uses SimpleBroadcast enums + TArray a_padded_strides; // for a shape == output shape, this is nullptr + TArray b_padded_strides; // for b shape == output shape, this is nullptr + TArray c_padded_strides; // for c shape == output shape, this is nullptr TArray fdm_output_strides; TernaryElementwisePreparation(const Tensor* a, const Tensor* b, const Tensor* c)