From 3de1fc096d2fd8c770c96f7ed4cdc6fa9aa55f9b Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Tue, 10 Mar 2020 13:56:23 -0700 Subject: [PATCH] Move zero point inputs of MatmulInteger to CPU memory (#3159) --- .../providers/cuda/math/matmul_integer.cc | 16 ++++++---- .../providers/cuda/math/matmul_integer.cu | 32 ++++++++----------- .../providers/cuda/math/matmul_integer.cuh | 8 ++--- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cc b/onnxruntime/core/providers/cuda/math/matmul_integer.cc index 39f1e10747..c27a949ed2 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cc +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cc @@ -18,6 +18,8 @@ ONNX_OPERATOR_TYPED_KERNEL_EX( int8_t, kCudaExecutionProvider, KernelDefBuilder() + .InputMemoryType(2) + .InputMemoryType(3) .TypeConstraint("T1", DataTypeImpl::GetTensorType()) .TypeConstraint("T2", DataTypeImpl::GetTensorType()) .TypeConstraint("T3", DataTypeImpl::GetTensorType()), @@ -58,19 +60,19 @@ Status MatMulInteger::ComputeInternal(OpKernelContext* ctx) cons int32_t* output_ptr = Y->template MutableData(); // validate zero points - const int8_t* a_offset = nullptr; - const int8_t* b_offset = nullptr; + int8_t a_offset = 0; + int8_t b_offset = 0; if (has_a_zero_point_) { auto a_zero_point = ctx->Input(2); ORT_ENFORCE(IsScalarOr1ElementVector(a_zero_point), "MatmulInteger : input1 zero point must be a scalar or 1D tensor of size 1"); - a_offset = a_zero_point->template Data(); + a_offset = *(a_zero_point->template Data()); } if (has_b_zero_point_) { auto b_zero_point = ctx->Input(3); ORT_ENFORCE(IsScalarOr1ElementVector(b_zero_point), "MatmulInteger : input2 zero point must be a scalar or 1D tensor of size 1"); - b_offset = b_zero_point->template Data(); + b_offset = *(b_zero_point->template Data()); } // offset output c[i,j] to @@ -81,20 +83,20 @@ Status MatMulInteger::ComputeInternal(OpKernelContext* ctx) cons // ReduceColSumOnMatrixB computes the a_offset * (b[0,j] + b[1,j] ... + b[k,j]) part // OffsetOutput computes gets the final result IAllocatorUniquePtr a_row_buf; - if (has_b_zero_point_) { + if (b_offset != 0) { a_row_buf = GetScratchBuffer(helper.OutputShape().Size() / helper.N()); ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(a_ptr, a_row_buf.get(), b_offset, helper)); } IAllocatorUniquePtr b_col_buf; - if (has_a_zero_point_) { + if (a_offset != 0) { b_col_buf = GetScratchBuffer(helper.OutputShape().Size() / helper.M()); ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(b_ptr, b_col_buf.get(), a_offset, helper)); } int alpha = 1; int beta = 0; - if (has_a_zero_point_ || has_b_zero_point_) { + if (a_offset != 0 || b_offset != 0) { OffsetOutput(a_row_buf.get(), b_col_buf.get(), output_ptr, diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cu b/onnxruntime/core/providers/cuda/math/matmul_integer.cu index 75f301f5fa..7f7bf1c41e 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cu +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cu @@ -10,7 +10,7 @@ namespace onnxruntime { namespace cuda { template -__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t* offset, int32_t K) { +__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t offset, int32_t K) { int32_t thread_data = 0; const int8_t* row_ptr = matrix + blockIdx.x * K; for (int i = threadIdx.x; i < K; i += TPB) { @@ -22,11 +22,11 @@ __global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_s int32_t sum = BlockReduce(temp_storage).Sum(thread_data); if (threadIdx.x == 0) { - row_sum[blockIdx.x] = (*offset) * sum; + row_sum[blockIdx.x] = offset * sum; } } -Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t* offset, const MatMulComputeHelper& helper) { +Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { ReduceRowSumOnMatrixAKernel(GridDim::maxThreadsPerBlock)> <<(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.LeftOffsets()[batch], @@ -35,11 +35,11 @@ Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_ static_cast(helper.K())); } - return CUDA_CALL( cudaPeekAtLastError() ) ? Status::OK() : Status( common::ONNXRUNTIME, common::FAIL ); + return CUDA_CALL(cudaPeekAtLastError()) ? Status::OK() : Status(common::ONNXRUNTIME, common::FAIL); } template -__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t* offset, int32_t row, int32_t col) { +__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t offset, int32_t row, int32_t col) { int32_t thread_data = 0; const int8_t* col_ptr = matrix + blockIdx.x; for (int i = threadIdx.x; i < row; i += TPB) { @@ -51,11 +51,11 @@ __global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_s int32_t sum = BlockReduce(temp_storage).Sum(thread_data); if (threadIdx.x == 0) { - col_sum[blockIdx.x] = (*offset) * sum; + col_sum[blockIdx.x] = offset * sum; } } -Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t* offset, const MatMulComputeHelper& helper) { +Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { ReduceColSumOnMatrixBKernel(GridDim::maxThreadsPerBlock)> <<(helper.N()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.RightOffsets()[batch], @@ -65,18 +65,16 @@ Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_ static_cast(helper.N())); } - return CUDA_CALL( cudaPeekAtLastError() ) ? Status::OK() : Status( common::ONNXRUNTIME, common::FAIL ); + return CUDA_CALL(cudaPeekAtLastError()) ? Status::OK() : Status(common::ONNXRUNTIME, common::FAIL); } __global__ void ComputeOffsetOfMatrixAB(const int32_t* row_sum, const int32_t* col_sum, int32_t* output, - const int8_t* a_offset, - const int8_t* b_offset, - int32_t K, + int32_t K_A_B, int32_t N) { for (int32_t i = threadIdx.x; i < N; i += blockDim.x) { - *(output + blockIdx.x * N + i) = K * (*a_offset) * (*b_offset) - row_sum[blockIdx.x] - col_sum[i]; + *(output + blockIdx.x * N + i) = K_A_B - row_sum[blockIdx.x] - col_sum[i]; } } @@ -99,8 +97,8 @@ __global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum, Status OffsetOutput(const int32_t* row_sum, const int32_t* col_sum, int32_t* output, - const int8_t* a_offset, - const int8_t* b_offset, + const int8_t a_offset, + const int8_t b_offset, const MatMulComputeHelper& helper) { if (a_offset && b_offset) { for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) { @@ -108,9 +106,7 @@ Status OffsetOutput(const int32_t* row_sum, row_sum + batch * helper.M(), col_sum + batch * helper.N(), output + helper.OutputOffsets()[batch], - a_offset, - b_offset, - static_cast(helper.K()), + static_cast(helper.K()) * a_offset * b_offset, static_cast(helper.N())); } } else if (a_offset) { @@ -129,7 +125,7 @@ Status OffsetOutput(const int32_t* row_sum, } } - return CUDA_CALL( cudaPeekAtLastError() ) ? Status::OK() : Status( common::ONNXRUNTIME, common::FAIL ); + return CUDA_CALL(cudaPeekAtLastError()) ? Status::OK() : Status(common::ONNXRUNTIME, common::FAIL); } __global__ void PadMatrixInLeadingDimensionKernel(const int8_t* src, int8_t* dst, int col_src, int col_dst) { diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh index e61f521c86..7be00db358 100644 --- a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh +++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh @@ -11,13 +11,13 @@ namespace onnxruntime { namespace cuda { -Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t* offset, const MatMulComputeHelper& helper); -Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t* offset, const MatMulComputeHelper& helper); +Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper); +Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper); Status OffsetOutput(const int32_t* row_sum, const int32_t* col_sum, int32_t* output, - const int8_t* a_offset, - const int8_t* b_offset, + const int8_t a_offset, + const int8_t b_offset, const MatMulComputeHelper& helper); Status PadMatrixInLeadingDimension(const int8_t* src, int8_t* dst, int64_t row, int64_t col, int64_t pad_size);