From 3de1fc096d2fd8c770c96f7ed4cdc6fa9aa55f9b Mon Sep 17 00:00:00 2001
From: Yufeng Li <liyufeng1987@gmail.com>
Date: Tue, 10 Mar 2020 13:56:23 -0700
Subject: [PATCH] Move zero point inputs of MatmulInteger to CPU memory (#3159)

---
 .../providers/cuda/math/matmul_integer.cc     | 16 ++++++----
 .../providers/cuda/math/matmul_integer.cu     | 32 ++++++++-----------
 .../providers/cuda/math/matmul_integer.cuh    |  8 ++---
 3 files changed, 27 insertions(+), 29 deletions(-)
diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cc b/onnxruntime/core/providers/cuda/math/matmul_integer.cc
index 39f1e10747..c27a949ed2 100644
--- a/onnxruntime/core/providers/cuda/math/matmul_integer.cc
+++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cc
@@ -18,6 +18,8 @@ ONNX_OPERATOR_TYPED_KERNEL_EX(
     int8_t,
     kCudaExecutionProvider,
     KernelDefBuilder()
+        .InputMemoryType<OrtMemTypeCPUInput>(2)
+        .InputMemoryType<OrtMemTypeCPUInput>(3)
         .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())
         .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())
         .TypeConstraint("T3", DataTypeImpl::GetTensorType<int32_t>()),
@@ -58,19 +60,19 @@ Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) cons
   int32_t* output_ptr = Y->template MutableData<int32_t>();
 
   // validate zero points
-  const int8_t* a_offset = nullptr;
-  const int8_t* b_offset = nullptr;
+  int8_t a_offset = 0;
+  int8_t b_offset = 0;
   if (has_a_zero_point_) {
     auto a_zero_point = ctx->Input<Tensor>(2);
     ORT_ENFORCE(IsScalarOr1ElementVector(a_zero_point),
                 "MatmulInteger : input1 zero point must be a scalar or 1D tensor of size 1");
-    a_offset = a_zero_point->template Data<int8_t>();
+    a_offset = *(a_zero_point->template Data<int8_t>());
   }
   if (has_b_zero_point_) {
     auto b_zero_point = ctx->Input<Tensor>(3);
     ORT_ENFORCE(IsScalarOr1ElementVector(b_zero_point),
                 "MatmulInteger : input2 zero point must be a scalar or 1D tensor of size 1");
-    b_offset = b_zero_point->template Data<int8_t>();
+    b_offset = *(b_zero_point->template Data<int8_t>());
   }
 
   // offset output c[i,j] to
@@ -81,20 +83,20 @@ Status MatMulInteger<int8_t, int8_t>::ComputeInternal(OpKernelContext* ctx) cons
   // ReduceColSumOnMatrixB computes the a_offset * (b[0,j] + b[1,j] ... + b[k,j]) part
   // OffsetOutput computes gets the final result
   IAllocatorUniquePtr<int32_t> a_row_buf;
-  if (has_b_zero_point_) {
+  if (b_offset != 0) {
     a_row_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.N());
     ORT_RETURN_IF_ERROR(ReduceRowSumOnMatrixA(a_ptr, a_row_buf.get(), b_offset, helper));
   }
 
   IAllocatorUniquePtr<int32_t> b_col_buf;
-  if (has_a_zero_point_) {
+  if (a_offset != 0) {
     b_col_buf = GetScratchBuffer<int32_t>(helper.OutputShape().Size() / helper.M());
     ORT_RETURN_IF_ERROR(ReduceColSumOnMatrixB(b_ptr, b_col_buf.get(), a_offset, helper));
   }
 
   int alpha = 1;
   int beta = 0;
-  if (has_a_zero_point_ || has_b_zero_point_) {
+  if (a_offset != 0 || b_offset != 0) {
     OffsetOutput(a_row_buf.get(),
                  b_col_buf.get(),
                  output_ptr,
diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cu b/onnxruntime/core/providers/cuda/math/matmul_integer.cu
index 75f301f5fa..7f7bf1c41e 100644
--- a/onnxruntime/core/providers/cuda/math/matmul_integer.cu
+++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cu
@@ -10,7 +10,7 @@ namespace onnxruntime {
 namespace cuda {
 
 template <int TPB>
-__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t* offset, int32_t K) {
+__global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_sum, const int8_t offset, int32_t K) {
   int32_t thread_data = 0;
   const int8_t* row_ptr = matrix + blockIdx.x * K;
   for (int i = threadIdx.x; i < K; i += TPB) {
@@ -22,11 +22,11 @@ __global__ void ReduceRowSumOnMatrixAKernel(const int8_t* matrix, int32_t* row_s
   int32_t sum = BlockReduce(temp_storage).Sum(thread_data);
 
   if (threadIdx.x == 0) {
-    row_sum[blockIdx.x] = (*offset) * sum;
+    row_sum[blockIdx.x] = offset * sum;
   }
 }
 
-Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t* offset, const MatMulComputeHelper& helper) {
+Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper) {
   for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
     ReduceRowSumOnMatrixAKernel<static_cast<int>(GridDim::maxThreadsPerBlock)>
         <<<static_cast<int>(helper.M()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.LeftOffsets()[batch],
@@ -35,11 +35,11 @@ Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_
                                                                            static_cast<int>(helper.K()));
   }
 
-  return CUDA_CALL( cudaPeekAtLastError() ) ? Status::OK() : Status( common::ONNXRUNTIME, common::FAIL );
+  return CUDA_CALL(cudaPeekAtLastError()) ? Status::OK() : Status(common::ONNXRUNTIME, common::FAIL);
 }
 
 template <int TPB>
-__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t* offset, int32_t row, int32_t col) {
+__global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_sum, const int8_t offset, int32_t row, int32_t col) {
   int32_t thread_data = 0;
   const int8_t* col_ptr = matrix + blockIdx.x;
   for (int i = threadIdx.x; i < row; i += TPB) {
@@ -51,11 +51,11 @@ __global__ void ReduceColSumOnMatrixBKernel(const int8_t* matrix, int32_t* col_s
   int32_t sum = BlockReduce(temp_storage).Sum(thread_data);
 
   if (threadIdx.x == 0) {
-    col_sum[blockIdx.x] = (*offset) * sum;
+    col_sum[blockIdx.x] = offset * sum;
   }
 }
 
-Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t* offset, const MatMulComputeHelper& helper) {
+Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper) {
   for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
     ReduceColSumOnMatrixBKernel<static_cast<int>(GridDim::maxThreadsPerBlock)>
         <<<static_cast<int>(helper.N()), GridDim::maxThreadsPerBlock, 0>>>(matrix + helper.RightOffsets()[batch],
@@ -65,18 +65,16 @@ Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_
                                                                            static_cast<int32_t>(helper.N()));
   }
 
-  return CUDA_CALL( cudaPeekAtLastError() ) ? Status::OK() : Status( common::ONNXRUNTIME, common::FAIL );
+  return CUDA_CALL(cudaPeekAtLastError()) ? Status::OK() : Status(common::ONNXRUNTIME, common::FAIL);
 }
 
 __global__ void ComputeOffsetOfMatrixAB(const int32_t* row_sum,
                                         const int32_t* col_sum,
                                         int32_t* output,
-                                        const int8_t* a_offset,
-                                        const int8_t* b_offset,
-                                        int32_t K,
+                                        int32_t K_A_B,
                                         int32_t N) {
   for (int32_t i = threadIdx.x; i < N; i += blockDim.x) {
-    *(output + blockIdx.x * N + i) = K * (*a_offset) * (*b_offset) - row_sum[blockIdx.x] - col_sum[i];
+    *(output + blockIdx.x * N + i) = K_A_B - row_sum[blockIdx.x] - col_sum[i];
   }
 }
 
@@ -99,8 +97,8 @@ __global__ void ComputeOffsetOfMatrixB(const int32_t* row_sum,
 Status OffsetOutput(const int32_t* row_sum,
                     const int32_t* col_sum,
                     int32_t* output,
-                    const int8_t* a_offset,
-                    const int8_t* b_offset,
+                    const int8_t a_offset,
+                    const int8_t b_offset,
                     const MatMulComputeHelper& helper) {
   if (a_offset && b_offset) {
     for (size_t batch = 0; batch < helper.OutputOffsets().size(); batch++) {
@@ -108,9 +106,7 @@ Status OffsetOutput(const int32_t* row_sum,
           row_sum + batch * helper.M(),
           col_sum + batch * helper.N(),
           output + helper.OutputOffsets()[batch],
-          a_offset,
-          b_offset,
-          static_cast<int32_t>(helper.K()),
+          static_cast<int32_t>(helper.K()) * a_offset * b_offset,
           static_cast<int32_t>(helper.N()));
     }
   } else if (a_offset) {
@@ -129,7 +125,7 @@ Status OffsetOutput(const int32_t* row_sum,
     }
   }
 
-  return CUDA_CALL( cudaPeekAtLastError() ) ? Status::OK() : Status( common::ONNXRUNTIME, common::FAIL );
+  return CUDA_CALL(cudaPeekAtLastError()) ? Status::OK() : Status(common::ONNXRUNTIME, common::FAIL);
 }
 
 __global__ void PadMatrixInLeadingDimensionKernel(const int8_t* src, int8_t* dst, int col_src, int col_dst) {
diff --git a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh
index e61f521c86..7be00db358 100644
--- a/onnxruntime/core/providers/cuda/math/matmul_integer.cuh
+++ b/onnxruntime/core/providers/cuda/math/matmul_integer.cuh
@@ -11,13 +11,13 @@
 namespace onnxruntime {
 namespace cuda {
 
-Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t* offset, const MatMulComputeHelper& helper);
-Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t* offset, const MatMulComputeHelper& helper);
+Status ReduceRowSumOnMatrixA(const int8_t* matrix, int32_t* row_sum, const int8_t offset, const MatMulComputeHelper& helper);
+Status ReduceColSumOnMatrixB(const int8_t* matrix, int32_t* col_sum, const int8_t offset, const MatMulComputeHelper& helper);
 Status OffsetOutput(const int32_t* row_sum,
                     const int32_t* col_sum,
                     int32_t* output,
-                    const int8_t* a_offset,
-                    const int8_t* b_offset,
+                    const int8_t a_offset,
+                    const int8_t b_offset,
                     const MatMulComputeHelper& helper);
 
 Status PadMatrixInLeadingDimension(const int8_t* src, int8_t* dst, int64_t row, int64_t col, int64_t pad_size);