From 4db01309cb5fcf48d7e15986e0c6ffe44fcac55a Mon Sep 17 00:00:00 2001
From: Scott McKay <skottmckay@gmail.com>
Date: Wed, 25 Mar 2020 11:49:44 +1000
Subject: [PATCH] Use GEMM for SVMRegressor. (#3305)

---
 .../core/providers/cpu/ml/svmclassifier.h     | 117 ++++++++++++++----
 .../core/providers/cpu/ml/svmregressor.cc     |  70 +++++++----
 .../core/providers/cpu/ml/svmregressor.h      |   1 +
 .../providers/cpu/ml/svmregressor_test.cc     |  42 ++++++-
 4 files changed, 179 insertions(+), 51 deletions(-)
diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.h b/onnxruntime/core/providers/cpu/ml/svmclassifier.h
index 060f84cacf..15794a7f47 100644
--- a/onnxruntime/core/providers/cpu/ml/svmclassifier.h
+++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.h
@@ -7,6 +7,7 @@
 #include "core/framework/op_kernel.h"
 #include "core/util/math_cpuonly.h"
 #include "ml_common.h"
+#include "core/providers/cpu/math/gemm.h"
 
 namespace onnxruntime {
 namespace ml {
@@ -15,7 +16,8 @@ namespace ml {
 template <typename T>
 class SVMCommon {
  protected:
-  SVMCommon(const OpKernelInfo& info) : kernel_type_(MakeKernel(info.GetAttrOrDefault<std::string>("kernel_type", "LINEAR"))) {
+  SVMCommon(const OpKernelInfo& info)
+      : kernel_type_(MakeKernel(info.GetAttrOrDefault<std::string>("kernel_type", "LINEAR"))) {
     std::vector<float> kernel_params;
     ORT_ENFORCE(info.GetAttrs<float>("kernel_params", kernel_params).IsOK());
 
@@ -29,34 +31,98 @@ class SVMCommon {
   void set_kernel_type(KERNEL new_kernel_type) { kernel_type_ = new_kernel_type; }
   KERNEL get_kernel_type() const { return kernel_type_; }
 
+  void batched_kernel_dot(const gsl::span<const T> a, const gsl::span<const T> b,
+                          int64_t m, int64_t n, int64_t k,
+                          float scalar_C,
+                          const gsl::span<T> out,
+                          concurrency::ThreadPool* threadpool) const {
+    assert(a.size() == size_t(m * k) && b.size() == size_t(k * n) && out.size() == size_t(m * n));
+
+    if (kernel_type_ == KERNEL::RBF) {
+      T* cur_out = out.data();
+      const T* cur_batch = a.data();
+
+      // each batch has 'k' features
+      for (int64_t batch = 0; batch < m; ++batch) {
+        const T* cur_support_vector = b.data();
+
+        // broadcast the support vectors against the k features in each batch. output is one value per support vector
+        for (int64_t support_vector = 0; support_vector < n; ++support_vector) {
+          T sum = 0.f;
+          const T* cur_input = cur_batch;
+
+          for (int64_t feature = 0; feature < k; ++feature) {
+            T val = *cur_input++ - *cur_support_vector++;
+            sum += val * val;
+          }
+
+          *cur_out++ = std::exp(-gamma_ * sum);
+        }
+
+        cur_batch += k;  // move to start of next batch
+      }
+    } else {
+      float alpha = 1.f;
+      float beta = 1.f;
+      static const TensorShape shape_C({1});
+      float c = scalar_C;  // scalar_C is used for LINEAR in the GEMM
+
+      if (kernel_type_ != KERNEL::LINEAR) {
+        // kernel_type_ == POLY or SIGMOID
+        alpha = gamma_;
+        c = coef0_;
+      }
+
+      onnxruntime::Gemm<T>::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
+                                        m, n, k,
+                                        alpha, a.data(), b.data(), beta,
+                                        c != 0.f ? &c : nullptr, &shape_C,
+                                        out.data(),
+                                        threadpool);
+
+      if (kernel_type_ == KERNEL::POLY) {
+        auto map_out = EigenVectorArrayMap<T>(out.data(), out.size());
+        if (degree_ == 2)
+          map_out = map_out.square();
+        else if (degree_ == 3)
+          map_out = map_out.cube();
+        else
+          map_out = map_out.pow(degree_);
+
+      } else if (kernel_type_ == KERNEL::SIGMOID) {
+        MlasComputeTanh(out.data(), out.data(), out.size());
+      }
+    }
+  }
+
   float kernel_dot(const T* A, int64_t a, const std::vector<float>& B, int64_t b, int64_t len, KERNEL k) const {
     double sum = 0;
     const T* pA = A + a;
     const float* pB = B.data() + b;
-    switch(k) {
-        case KERNEL::POLY:
-            for (int64_t i = len; i > 0; --i)
-                sum += *pA++ * *pB++;
-            sum = gamma_ * sum + coef0_;
-            sum = std::pow(sum, degree_);
-            break;
-        case KERNEL::SIGMOID:
-            for (int64_t i = len; i > 0; --i)
-                sum += *pA++ * *pB++;
-            sum = gamma_ * sum + coef0_;
-            sum = std::tanh(sum);
-            break;
-        case KERNEL::RBF:
-            for (int64_t i = len; i > 0; --i) {
-                double val = *pA++ - *pB++;
-                sum += val * val;
-            }
-            sum = std::exp(-gamma_ * sum);
-            break;
-        case KERNEL::LINEAR:
-            for (int64_t i = len; i > 0; --i)
-                sum += *pA++ * *pB++;
-            break;
+    switch (k) {
+      case KERNEL::POLY:
+        for (int64_t i = len; i > 0; --i)
+          sum += *pA++ * *pB++;
+        sum = gamma_ * sum + coef0_;
+        sum = std::pow(sum, degree_);
+        break;
+      case KERNEL::SIGMOID:
+        for (int64_t i = len; i > 0; --i)
+          sum += *pA++ * *pB++;
+        sum = gamma_ * sum + coef0_;
+        sum = std::tanh(sum);
+        break;
+      case KERNEL::RBF:
+        for (int64_t i = len; i > 0; --i) {
+          double val = *pA++ - *pB++;
+          sum += val * val;
+        }
+        sum = std::exp(-gamma_ * sum);
+        break;
+      case KERNEL::LINEAR:
+        for (int64_t i = len; i > 0; --i)
+          sum += *pA++ * *pB++;
+        break;
     }
     return (float)sum;
   }
@@ -70,6 +136,7 @@ class SVMCommon {
 
 template <typename T>
 class SVMClassifier final : public OpKernel, private SVMCommon<T> {
+  using SVMCommon<T>::batched_kernel_dot;
   using SVMCommon<T>::kernel_dot;
   using SVMCommon<T>::set_kernel_type;
   using SVMCommon<T>::get_kernel_type;
diff --git a/onnxruntime/core/providers/cpu/ml/svmregressor.cc b/onnxruntime/core/providers/cpu/ml/svmregressor.cc
index cfdd8a1298..b235b78570 100644
--- a/onnxruntime/core/providers/cpu/ml/svmregressor.cc
+++ b/onnxruntime/core/providers/cpu/ml/svmregressor.cc
@@ -40,33 +40,61 @@ template <typename T>
 Status SVMRegressor<T>::Compute(OpKernelContext* ctx) const {
   const auto* X = ctx->Input<Tensor>(0);
 
-  int64_t stride = X->Shape().NumDimensions() == 1 ? X->Shape()[0] : X->Shape()[1];
-  int64_t N = X->Shape().NumDimensions() == 1 ? 1 : X->Shape()[0];
+  int64_t num_features = X->Shape().NumDimensions() == 1 ? X->Shape()[0] : X->Shape()[1];
+  int64_t num_batches = X->Shape().NumDimensions() == 1 ? 1 : X->Shape()[0];
+  ORT_ENFORCE(num_features == feature_count_);
 
-  Tensor* Y = ctx->Output(0, TensorShape({N, 1}));  // this op outputs for one target only
-  const auto* x_data = X->template Data<T>();
-  int64_t current_weight_0;
-  float sum;
+  // X: [num_batches, feature_count_] where features could be coefficients or support vectors
+  // coefficients_: [vector_count_]
+  // support_vectors_ : [vector_count_, feature_count_]
 
-  for (int64_t n = 0; n < N; n++) {  //for each example
-    current_weight_0 = n * stride;
-    sum = 0.f;
-    if (mode_ == SVM_TYPE::SVM_SVC) {
-      for (int64_t j = 0; j < vector_count_; j++) {
-        sum += kernel_dot(x_data, current_weight_0, support_vectors_, feature_count_ * j,
-                          feature_count_, get_kernel_type()) * coefficients_[j];
-      }
-      sum += rho_[0];
-    } else if (mode_ == SVM_TYPE::SVM_LINEAR) {  //liblinear
-      sum = kernel_dot(x_data, current_weight_0, coefficients_, 0,
-                       feature_count_, get_kernel_type());
-      sum += rho_[0];
+  // Y: [num_batches, 1]
+  Tensor* Y = ctx->Output(0, TensorShape({num_batches, 1}));  // this op outputs for one target only
+  const auto x_data = X->template DataAsSpan<T>();
+  auto out = Y->MutableDataAsSpan<T>();
+
+  concurrency::ThreadPool* threadpool = ctx->GetOperatorThreadPool();
+
+  if (mode_ == SVM_TYPE::SVM_SVC) {
+    AllocatorPtr allocator;
+    auto status = ctx->GetTempSpaceAllocator(&allocator);
+    ORT_RETURN_IF_ERROR(status);
+
+    auto tmp_data = IAllocator::MakeUniquePtr<T>(allocator, num_batches * vector_count_);
+    auto tmp_data_span = gsl::make_span<T>(tmp_data.get(), num_batches * vector_count_);
+
+    // combine the input data with the support vectors and apply the kernel type
+    // output is {num_batches, vector_count_}
+    batched_kernel_dot(x_data, support_vectors_, num_batches, vector_count_, feature_count_, 0.f, tmp_data_span,
+                       threadpool);
+
+    static const TensorShape rho_shape({1});
+
+    // combine with coefficients and add rho_[0]
+    onnxruntime::Gemm<T>::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
+                                      num_batches, 1, vector_count_,
+                                      1.f, tmp_data_span.data(), coefficients_.data(), 1.f,
+                                      rho_.data(), &rho_shape,
+                                      out.data(),
+                                      threadpool);
+  } else if (mode_ == SVM_TYPE::SVM_LINEAR) {
+    // combine the coefficients with the input data and apply the kernel type
+    batched_kernel_dot(x_data, coefficients_, num_batches, 1, feature_count_, rho_[0], out, threadpool);
+  } else {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unexpected mode:", static_cast<int>(mode_));
+  }
+
+  if (one_class_) {
+    float* y = out.data();
+    float* y_end = y + out.size();
+
+    while (y < y_end) {
+      *y = (*y > 0.f ? 1.f : -1.f);
+      ++y;
     }
-    Y->template MutableData<float>()[n] = one_class_ ? (sum > 0 ? 1.f : -1.f) : sum;
   }
 
   return Status::OK();
 }
-
 }  // namespace ml
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/ml/svmregressor.h b/onnxruntime/core/providers/cpu/ml/svmregressor.h
index 37d5a81bc1..4813b6ec08 100644
--- a/onnxruntime/core/providers/cpu/ml/svmregressor.h
+++ b/onnxruntime/core/providers/cpu/ml/svmregressor.h
@@ -14,6 +14,7 @@ namespace ml {
 
 template <typename T>
 class SVMRegressor final : public OpKernel, private SVMCommon<T> {
+  using SVMCommon<T>::batched_kernel_dot;
   using SVMCommon<T>::kernel_dot;
   using SVMCommon<T>::set_kernel_type;
   using SVMCommon<T>::get_kernel_type;
diff --git a/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc b/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc
index 3df0041761..53d0d601b1 100644
--- a/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc
+++ b/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc
@@ -60,14 +60,46 @@ TEST(MLOpTest, SVMRegressorNuSVC) {
 TEST(MLOpTest, SVMRegressorNuSVCPolyKernel) {
   OpTester test("SVMRegressor", 1, onnxruntime::kMLDomain);
 
-  std::vector<float> dual_coefficients = {-2.74322388e+01f, 5.81893108e+01f, -1.00000000e+02f, 6.91693781e+01f, 7.62161261e-02f, -2.66618042e-03f};
-  std::vector<float> support_vectors = {0.f, 0.5f, 32.f, 1.f, 1.5f, 1.f, 2.f, 2.9f, -32.f, 3.f, 13.3f, -11.f, 12.f, 12.9f, -312.f, 43.f, 413.3f, -114.f};
+  std::vector<float> dual_coefficients = {-2.74322388e+01f, 5.81893108e+01f, -1.00000000e+02f,
+                                          6.91693781e+01f, 7.62161261e-02f, -2.66618042e-03f};
+  std::vector<float> support_vectors = {0.f, 0.5f, 32.f,
+                                        1.f, 1.5f, 1.f,
+                                        2.f, 2.9f, -32.f,
+                                        3.f, 13.3f, -11.f,
+                                        12.f, 12.9f, -312.f,
+                                        43.f, 413.3f, -114.f};
   std::vector<float> rho = {1.5004596f};
   std::vector<float> kernel_params = {0.001f, 0.f, 3.f};  //gamma, coef0, degree
 
-  //three estimates, for 3 points each, so 9 predictions
-  std::vector<float> X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f};
-  std::vector<float> predictions = {1.50041863e+00f, 3.49624795e-01f, 2.75850969e+00f, -2.28659294e+02f, -2.28659294e+02f, -6.09640826e+05f, -2.28659294e+02f, 3.89055773e+00f};
+  // 8 batches with 3 features in each
+  std::vector<float> X = {1.f, 0.0f, 0.4f,
+                          3.0f, 44.0f, -3.f,
+                          12.0f, 12.9f, 112.f,
+                          23.0f, 11.3f, -222.f,
+                          23.0f, 11.3f, -222.f,
+                          23.0f, 3311.3f, -222.f,
+                          23.0f, 11.3f, -222.f,
+                          43.0f, 413.3f, -114.f};
+
+  /*
+  # batched_dot_product calculations
+  first = np.matmul(X, np.transpose(support_vectors))
+  first *= gamma
+  first += coef0
+  POLY_dot_product = np.power(first, degree)
+
+  # second GEMM call in SVMRegressor code
+  predictions = np.matmul(POLY_dot_product, np.transpose(coefficients))
+  predictions += rho
+  */
+  std::vector<float> predictions = {1.50041862e+00f,
+                                    3.49624789e-01f,
+                                    -1.36680453e+02f,
+                                    -2.28659315e+02f,
+                                    -2.28659315e+02f,
+                                    -6.09640827e+05f,
+                                    -2.28659315e+02f,
+                                    3.89055458e+00f};
 
   test.AddAttribute("kernel_type", std::string("POLY"));
   test.AddAttribute("coefficients", dual_coefficients);