Use GEMM for SVMRegressor. (#3305)

2026-07-18 18:52:16 +00:00 · 2020-03-25 11:49:44 +10:00 · 2020-03-25 11:49:44 +10:00 · 4db01309cb
commit 4db01309cb
parent 19edad132c
4 changed files with 179 additions and 51 deletions
--- a/onnxruntime/core/providers/cpu/ml/svmclassifier.h
+++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.h
@ -7,6 +7,7 @@
 #include "core/framework/op_kernel.h"
 #include "core/util/math_cpuonly.h"
 #include "ml_common.h"
+#include "core/providers/cpu/math/gemm.h"

 namespace onnxruntime {
 namespace ml {
@ -15,7 +16,8 @@ namespace ml {
 template <typename T>
 class SVMCommon {
 protected:
-  SVMCommon(const OpKernelInfo& info) : kernel_type_(MakeKernel(info.GetAttrOrDefault<std::string>("kernel_type", "LINEAR"))) {
+  SVMCommon(const OpKernelInfo& info)
+      : kernel_type_(MakeKernel(info.GetAttrOrDefault<std::string>("kernel_type", "LINEAR"))) {
    std::vector<float> kernel_params;
    ORT_ENFORCE(info.GetAttrs<float>("kernel_params", kernel_params).IsOK());

@ -29,34 +31,98 @@ class SVMCommon {
  void set_kernel_type(KERNEL new_kernel_type) { kernel_type_ = new_kernel_type; }
  KERNEL get_kernel_type() const { return kernel_type_; }

+  void batched_kernel_dot(const gsl::span<const T> a, const gsl::span<const T> b,
+                          int64_t m, int64_t n, int64_t k,
+                          float scalar_C,
+                          const gsl::span<T> out,
+                          concurrency::ThreadPool* threadpool) const {
+    assert(a.size() == size_t(m * k) && b.size() == size_t(k * n) && out.size() == size_t(m * n));
+
+    if (kernel_type_ == KERNEL::RBF) {
+      T* cur_out = out.data();
+      const T* cur_batch = a.data();
+
+      // each batch has 'k' features
+      for (int64_t batch = 0; batch < m; ++batch) {
+        const T* cur_support_vector = b.data();
+
+        // broadcast the support vectors against the k features in each batch. output is one value per support vector
+        for (int64_t support_vector = 0; support_vector < n; ++support_vector) {
+          T sum = 0.f;
+          const T* cur_input = cur_batch;
+
+          for (int64_t feature = 0; feature < k; ++feature) {
+            T val = *cur_input++ - *cur_support_vector++;
+            sum += val * val;
+          }
+
+          *cur_out++ = std::exp(-gamma_ * sum);
+        }
+
+        cur_batch += k;  // move to start of next batch
+      }
+    } else {
+      float alpha = 1.f;
+      float beta = 1.f;
+      static const TensorShape shape_C({1});
+      float c = scalar_C;  // scalar_C is used for LINEAR in the GEMM
+
+      if (kernel_type_ != KERNEL::LINEAR) {
+        // kernel_type_ == POLY or SIGMOID
+        alpha = gamma_;
+        c = coef0_;
+      }
+
+      onnxruntime::Gemm<T>::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
+                                        m, n, k,
+                                        alpha, a.data(), b.data(), beta,
+                                        c != 0.f ? &c : nullptr, &shape_C,
+                                        out.data(),
+                                        threadpool);
+
+      if (kernel_type_ == KERNEL::POLY) {
+        auto map_out = EigenVectorArrayMap<T>(out.data(), out.size());
+        if (degree_ == 2)
+          map_out = map_out.square();
+        else if (degree_ == 3)
+          map_out = map_out.cube();
+        else
+          map_out = map_out.pow(degree_);
+
+      } else if (kernel_type_ == KERNEL::SIGMOID) {
+        MlasComputeTanh(out.data(), out.data(), out.size());
+      }
+    }
+  }
+
  float kernel_dot(const T* A, int64_t a, const std::vector<float>& B, int64_t b, int64_t len, KERNEL k) const {
    double sum = 0;
    const T* pA = A + a;
    const float* pB = B.data() + b;
-    switch(k) {
-        case KERNEL::POLY:
-            for (int64_t i = len; i > 0; --i)
-                sum += *pA++ * *pB++;
-            sum = gamma_ * sum + coef0_;
-            sum = std::pow(sum, degree_);
-            break;
-        case KERNEL::SIGMOID:
-            for (int64_t i = len; i > 0; --i)
-                sum += *pA++ * *pB++;
-            sum = gamma_ * sum + coef0_;
-            sum = std::tanh(sum);
-            break;
-        case KERNEL::RBF:
-            for (int64_t i = len; i > 0; --i) {
-                double val = *pA++ - *pB++;
-                sum += val * val;
-            }
-            sum = std::exp(-gamma_ * sum);
-            break;
-        case KERNEL::LINEAR:
-            for (int64_t i = len; i > 0; --i)
-                sum += *pA++ * *pB++;
-            break;
+    switch (k) {
+      case KERNEL::POLY:
+        for (int64_t i = len; i > 0; --i)
+          sum += *pA++ * *pB++;
+        sum = gamma_ * sum + coef0_;
+        sum = std::pow(sum, degree_);
+        break;
+      case KERNEL::SIGMOID:
+        for (int64_t i = len; i > 0; --i)
+          sum += *pA++ * *pB++;
+        sum = gamma_ * sum + coef0_;
+        sum = std::tanh(sum);
+        break;
+      case KERNEL::RBF:
+        for (int64_t i = len; i > 0; --i) {
+          double val = *pA++ - *pB++;
+          sum += val * val;
+        }
+        sum = std::exp(-gamma_ * sum);
+        break;
+      case KERNEL::LINEAR:
+        for (int64_t i = len; i > 0; --i)
+          sum += *pA++ * *pB++;
+        break;
    }
    return (float)sum;
  }
@ -70,6 +136,7 @@ class SVMCommon {

 template <typename T>
 class SVMClassifier final : public OpKernel, private SVMCommon<T> {
+  using SVMCommon<T>::batched_kernel_dot;
  using SVMCommon<T>::kernel_dot;
  using SVMCommon<T>::set_kernel_type;
  using SVMCommon<T>::get_kernel_type;
--- a/onnxruntime/core/providers/cpu/ml/svmregressor.cc
+++ b/onnxruntime/core/providers/cpu/ml/svmregressor.cc
@ -40,33 +40,61 @@ template <typename T>
 Status SVMRegressor<T>::Compute(OpKernelContext* ctx) const {
  const auto* X = ctx->Input<Tensor>(0);

-  int64_t stride = X->Shape().NumDimensions() == 1 ? X->Shape()[0] : X->Shape()[1];
-  int64_t N = X->Shape().NumDimensions() == 1 ? 1 : X->Shape()[0];
+  int64_t num_features = X->Shape().NumDimensions() == 1 ? X->Shape()[0] : X->Shape()[1];
+  int64_t num_batches = X->Shape().NumDimensions() == 1 ? 1 : X->Shape()[0];
+  ORT_ENFORCE(num_features == feature_count_);

-  Tensor* Y = ctx->Output(0, TensorShape({N, 1}));  // this op outputs for one target only
-  const auto* x_data = X->template Data<T>();
-  int64_t current_weight_0;
-  float sum;
+  // X: [num_batches, feature_count_] where features could be coefficients or support vectors
+  // coefficients_: [vector_count_]
+  // support_vectors_ : [vector_count_, feature_count_]

-  for (int64_t n = 0; n < N; n++) {  //for each example
-    current_weight_0 = n * stride;
-    sum = 0.f;
-    if (mode_ == SVM_TYPE::SVM_SVC) {
-      for (int64_t j = 0; j < vector_count_; j++) {
-        sum += kernel_dot(x_data, current_weight_0, support_vectors_, feature_count_ * j,
-                          feature_count_, get_kernel_type()) * coefficients_[j];
-      }
-      sum += rho_[0];
-    } else if (mode_ == SVM_TYPE::SVM_LINEAR) {  //liblinear
-      sum = kernel_dot(x_data, current_weight_0, coefficients_, 0,
-                       feature_count_, get_kernel_type());
-      sum += rho_[0];
+  // Y: [num_batches, 1]
+  Tensor* Y = ctx->Output(0, TensorShape({num_batches, 1}));  // this op outputs for one target only
+  const auto x_data = X->template DataAsSpan<T>();
+  auto out = Y->MutableDataAsSpan<T>();
+
+  concurrency::ThreadPool* threadpool = ctx->GetOperatorThreadPool();
+
+  if (mode_ == SVM_TYPE::SVM_SVC) {
+    AllocatorPtr allocator;
+    auto status = ctx->GetTempSpaceAllocator(&allocator);
+    ORT_RETURN_IF_ERROR(status);
+
+    auto tmp_data = IAllocator::MakeUniquePtr<T>(allocator, num_batches * vector_count_);
+    auto tmp_data_span = gsl::make_span<T>(tmp_data.get(), num_batches * vector_count_);
+
+    // combine the input data with the support vectors and apply the kernel type
+    // output is {num_batches, vector_count_}
+    batched_kernel_dot(x_data, support_vectors_, num_batches, vector_count_, feature_count_, 0.f, tmp_data_span,
+                       threadpool);
+
+    static const TensorShape rho_shape({1});
+
+    // combine with coefficients and add rho_[0]
+    onnxruntime::Gemm<T>::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans,
+                                      num_batches, 1, vector_count_,
+                                      1.f, tmp_data_span.data(), coefficients_.data(), 1.f,
+                                      rho_.data(), &rho_shape,
+                                      out.data(),
+                                      threadpool);
+  } else if (mode_ == SVM_TYPE::SVM_LINEAR) {
+    // combine the coefficients with the input data and apply the kernel type
+    batched_kernel_dot(x_data, coefficients_, num_batches, 1, feature_count_, rho_[0], out, threadpool);
+  } else {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unexpected mode:", static_cast<int>(mode_));
+  }
+
+  if (one_class_) {
+    float* y = out.data();
+    float* y_end = y + out.size();
+
+    while (y < y_end) {
+      *y = (*y > 0.f ? 1.f : -1.f);
+      ++y;
    }
-    Y->template MutableData<float>()[n] = one_class_ ? (sum > 0 ? 1.f : -1.f) : sum;
  }

  return Status::OK();
 }
-
 }  // namespace ml
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/cpu/ml/svmregressor.h
+++ b/onnxruntime/core/providers/cpu/ml/svmregressor.h
@ -14,6 +14,7 @@ namespace ml {

 template <typename T>
 class SVMRegressor final : public OpKernel, private SVMCommon<T> {
+  using SVMCommon<T>::batched_kernel_dot;
  using SVMCommon<T>::kernel_dot;
  using SVMCommon<T>::set_kernel_type;
  using SVMCommon<T>::get_kernel_type;
--- a/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc
+++ b/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc
@ -60,14 +60,46 @@ TEST(MLOpTest, SVMRegressorNuSVC) {
 TEST(MLOpTest, SVMRegressorNuSVCPolyKernel) {
  OpTester test("SVMRegressor", 1, onnxruntime::kMLDomain);

-  std::vector<float> dual_coefficients = {-2.74322388e+01f, 5.81893108e+01f, -1.00000000e+02f, 6.91693781e+01f, 7.62161261e-02f, -2.66618042e-03f};
-  std::vector<float> support_vectors = {0.f, 0.5f, 32.f, 1.f, 1.5f, 1.f, 2.f, 2.9f, -32.f, 3.f, 13.3f, -11.f, 12.f, 12.9f, -312.f, 43.f, 413.3f, -114.f};
+  std::vector<float> dual_coefficients = {-2.74322388e+01f, 5.81893108e+01f, -1.00000000e+02f,
+                                          6.91693781e+01f, 7.62161261e-02f, -2.66618042e-03f};
+  std::vector<float> support_vectors = {0.f, 0.5f, 32.f,
+                                        1.f, 1.5f, 1.f,
+                                        2.f, 2.9f, -32.f,
+                                        3.f, 13.3f, -11.f,
+                                        12.f, 12.9f, -312.f,
+                                        43.f, 413.3f, -114.f};
  std::vector<float> rho = {1.5004596f};
  std::vector<float> kernel_params = {0.001f, 0.f, 3.f};  //gamma, coef0, degree

-  //three estimates, for 3 points each, so 9 predictions
-  std::vector<float> X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f};
-  std::vector<float> predictions = {1.50041863e+00f, 3.49624795e-01f, 2.75850969e+00f, -2.28659294e+02f, -2.28659294e+02f, -6.09640826e+05f, -2.28659294e+02f, 3.89055773e+00f};
+  // 8 batches with 3 features in each
+  std::vector<float> X = {1.f, 0.0f, 0.4f,
+                          3.0f, 44.0f, -3.f,
+                          12.0f, 12.9f, 112.f,
+                          23.0f, 11.3f, -222.f,
+                          23.0f, 11.3f, -222.f,
+                          23.0f, 3311.3f, -222.f,
+                          23.0f, 11.3f, -222.f,
+                          43.0f, 413.3f, -114.f};
+
+  /*
+  # batched_dot_product calculations
+  first = np.matmul(X, np.transpose(support_vectors))
+  first *= gamma
+  first += coef0
+  POLY_dot_product = np.power(first, degree)
+
+  # second GEMM call in SVMRegressor code
+  predictions = np.matmul(POLY_dot_product, np.transpose(coefficients))
+  predictions += rho
+  */
+  std::vector<float> predictions = {1.50041862e+00f,
+                                    3.49624789e-01f,
+                                    -1.36680453e+02f,
+                                    -2.28659315e+02f,
+                                    -2.28659315e+02f,
+                                    -6.09640827e+05f,
+                                    -2.28659315e+02f,
+                                    3.89055458e+00f};

  test.AddAttribute("kernel_type", std::string("POLY"));
  test.AddAttribute("coefficients", dual_coefficients);