From 4db01309cb5fcf48d7e15986e0c6ffe44fcac55a Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 25 Mar 2020 11:49:44 +1000 Subject: [PATCH] Use GEMM for SVMRegressor. (#3305) --- .../core/providers/cpu/ml/svmclassifier.h | 117 ++++++++++++++---- .../core/providers/cpu/ml/svmregressor.cc | 70 +++++++---- .../core/providers/cpu/ml/svmregressor.h | 1 + .../providers/cpu/ml/svmregressor_test.cc | 42 ++++++- 4 files changed, 179 insertions(+), 51 deletions(-) diff --git a/onnxruntime/core/providers/cpu/ml/svmclassifier.h b/onnxruntime/core/providers/cpu/ml/svmclassifier.h index 060f84cacf..15794a7f47 100644 --- a/onnxruntime/core/providers/cpu/ml/svmclassifier.h +++ b/onnxruntime/core/providers/cpu/ml/svmclassifier.h @@ -7,6 +7,7 @@ #include "core/framework/op_kernel.h" #include "core/util/math_cpuonly.h" #include "ml_common.h" +#include "core/providers/cpu/math/gemm.h" namespace onnxruntime { namespace ml { @@ -15,7 +16,8 @@ namespace ml { template class SVMCommon { protected: - SVMCommon(const OpKernelInfo& info) : kernel_type_(MakeKernel(info.GetAttrOrDefault("kernel_type", "LINEAR"))) { + SVMCommon(const OpKernelInfo& info) + : kernel_type_(MakeKernel(info.GetAttrOrDefault("kernel_type", "LINEAR"))) { std::vector kernel_params; ORT_ENFORCE(info.GetAttrs("kernel_params", kernel_params).IsOK()); @@ -29,34 +31,98 @@ class SVMCommon { void set_kernel_type(KERNEL new_kernel_type) { kernel_type_ = new_kernel_type; } KERNEL get_kernel_type() const { return kernel_type_; } + void batched_kernel_dot(const gsl::span a, const gsl::span b, + int64_t m, int64_t n, int64_t k, + float scalar_C, + const gsl::span out, + concurrency::ThreadPool* threadpool) const { + assert(a.size() == size_t(m * k) && b.size() == size_t(k * n) && out.size() == size_t(m * n)); + + if (kernel_type_ == KERNEL::RBF) { + T* cur_out = out.data(); + const T* cur_batch = a.data(); + + // each batch has 'k' features + for (int64_t batch = 0; batch < m; ++batch) { + const T* cur_support_vector = b.data(); + + // broadcast the support vectors against the k features in each batch. output is one value per support vector + for (int64_t support_vector = 0; support_vector < n; ++support_vector) { + T sum = 0.f; + const T* cur_input = cur_batch; + + for (int64_t feature = 0; feature < k; ++feature) { + T val = *cur_input++ - *cur_support_vector++; + sum += val * val; + } + + *cur_out++ = std::exp(-gamma_ * sum); + } + + cur_batch += k; // move to start of next batch + } + } else { + float alpha = 1.f; + float beta = 1.f; + static const TensorShape shape_C({1}); + float c = scalar_C; // scalar_C is used for LINEAR in the GEMM + + if (kernel_type_ != KERNEL::LINEAR) { + // kernel_type_ == POLY or SIGMOID + alpha = gamma_; + c = coef0_; + } + + onnxruntime::Gemm::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans, + m, n, k, + alpha, a.data(), b.data(), beta, + c != 0.f ? &c : nullptr, &shape_C, + out.data(), + threadpool); + + if (kernel_type_ == KERNEL::POLY) { + auto map_out = EigenVectorArrayMap(out.data(), out.size()); + if (degree_ == 2) + map_out = map_out.square(); + else if (degree_ == 3) + map_out = map_out.cube(); + else + map_out = map_out.pow(degree_); + + } else if (kernel_type_ == KERNEL::SIGMOID) { + MlasComputeTanh(out.data(), out.data(), out.size()); + } + } + } + float kernel_dot(const T* A, int64_t a, const std::vector& B, int64_t b, int64_t len, KERNEL k) const { double sum = 0; const T* pA = A + a; const float* pB = B.data() + b; - switch(k) { - case KERNEL::POLY: - for (int64_t i = len; i > 0; --i) - sum += *pA++ * *pB++; - sum = gamma_ * sum + coef0_; - sum = std::pow(sum, degree_); - break; - case KERNEL::SIGMOID: - for (int64_t i = len; i > 0; --i) - sum += *pA++ * *pB++; - sum = gamma_ * sum + coef0_; - sum = std::tanh(sum); - break; - case KERNEL::RBF: - for (int64_t i = len; i > 0; --i) { - double val = *pA++ - *pB++; - sum += val * val; - } - sum = std::exp(-gamma_ * sum); - break; - case KERNEL::LINEAR: - for (int64_t i = len; i > 0; --i) - sum += *pA++ * *pB++; - break; + switch (k) { + case KERNEL::POLY: + for (int64_t i = len; i > 0; --i) + sum += *pA++ * *pB++; + sum = gamma_ * sum + coef0_; + sum = std::pow(sum, degree_); + break; + case KERNEL::SIGMOID: + for (int64_t i = len; i > 0; --i) + sum += *pA++ * *pB++; + sum = gamma_ * sum + coef0_; + sum = std::tanh(sum); + break; + case KERNEL::RBF: + for (int64_t i = len; i > 0; --i) { + double val = *pA++ - *pB++; + sum += val * val; + } + sum = std::exp(-gamma_ * sum); + break; + case KERNEL::LINEAR: + for (int64_t i = len; i > 0; --i) + sum += *pA++ * *pB++; + break; } return (float)sum; } @@ -70,6 +136,7 @@ class SVMCommon { template class SVMClassifier final : public OpKernel, private SVMCommon { + using SVMCommon::batched_kernel_dot; using SVMCommon::kernel_dot; using SVMCommon::set_kernel_type; using SVMCommon::get_kernel_type; diff --git a/onnxruntime/core/providers/cpu/ml/svmregressor.cc b/onnxruntime/core/providers/cpu/ml/svmregressor.cc index cfdd8a1298..b235b78570 100644 --- a/onnxruntime/core/providers/cpu/ml/svmregressor.cc +++ b/onnxruntime/core/providers/cpu/ml/svmregressor.cc @@ -40,33 +40,61 @@ template Status SVMRegressor::Compute(OpKernelContext* ctx) const { const auto* X = ctx->Input(0); - int64_t stride = X->Shape().NumDimensions() == 1 ? X->Shape()[0] : X->Shape()[1]; - int64_t N = X->Shape().NumDimensions() == 1 ? 1 : X->Shape()[0]; + int64_t num_features = X->Shape().NumDimensions() == 1 ? X->Shape()[0] : X->Shape()[1]; + int64_t num_batches = X->Shape().NumDimensions() == 1 ? 1 : X->Shape()[0]; + ORT_ENFORCE(num_features == feature_count_); - Tensor* Y = ctx->Output(0, TensorShape({N, 1})); // this op outputs for one target only - const auto* x_data = X->template Data(); - int64_t current_weight_0; - float sum; + // X: [num_batches, feature_count_] where features could be coefficients or support vectors + // coefficients_: [vector_count_] + // support_vectors_ : [vector_count_, feature_count_] - for (int64_t n = 0; n < N; n++) { //for each example - current_weight_0 = n * stride; - sum = 0.f; - if (mode_ == SVM_TYPE::SVM_SVC) { - for (int64_t j = 0; j < vector_count_; j++) { - sum += kernel_dot(x_data, current_weight_0, support_vectors_, feature_count_ * j, - feature_count_, get_kernel_type()) * coefficients_[j]; - } - sum += rho_[0]; - } else if (mode_ == SVM_TYPE::SVM_LINEAR) { //liblinear - sum = kernel_dot(x_data, current_weight_0, coefficients_, 0, - feature_count_, get_kernel_type()); - sum += rho_[0]; + // Y: [num_batches, 1] + Tensor* Y = ctx->Output(0, TensorShape({num_batches, 1})); // this op outputs for one target only + const auto x_data = X->template DataAsSpan(); + auto out = Y->MutableDataAsSpan(); + + concurrency::ThreadPool* threadpool = ctx->GetOperatorThreadPool(); + + if (mode_ == SVM_TYPE::SVM_SVC) { + AllocatorPtr allocator; + auto status = ctx->GetTempSpaceAllocator(&allocator); + ORT_RETURN_IF_ERROR(status); + + auto tmp_data = IAllocator::MakeUniquePtr(allocator, num_batches * vector_count_); + auto tmp_data_span = gsl::make_span(tmp_data.get(), num_batches * vector_count_); + + // combine the input data with the support vectors and apply the kernel type + // output is {num_batches, vector_count_} + batched_kernel_dot(x_data, support_vectors_, num_batches, vector_count_, feature_count_, 0.f, tmp_data_span, + threadpool); + + static const TensorShape rho_shape({1}); + + // combine with coefficients and add rho_[0] + onnxruntime::Gemm::ComputeGemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans, + num_batches, 1, vector_count_, + 1.f, tmp_data_span.data(), coefficients_.data(), 1.f, + rho_.data(), &rho_shape, + out.data(), + threadpool); + } else if (mode_ == SVM_TYPE::SVM_LINEAR) { + // combine the coefficients with the input data and apply the kernel type + batched_kernel_dot(x_data, coefficients_, num_batches, 1, feature_count_, rho_[0], out, threadpool); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unexpected mode:", static_cast(mode_)); + } + + if (one_class_) { + float* y = out.data(); + float* y_end = y + out.size(); + + while (y < y_end) { + *y = (*y > 0.f ? 1.f : -1.f); + ++y; } - Y->template MutableData()[n] = one_class_ ? (sum > 0 ? 1.f : -1.f) : sum; } return Status::OK(); } - } // namespace ml } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/ml/svmregressor.h b/onnxruntime/core/providers/cpu/ml/svmregressor.h index 37d5a81bc1..4813b6ec08 100644 --- a/onnxruntime/core/providers/cpu/ml/svmregressor.h +++ b/onnxruntime/core/providers/cpu/ml/svmregressor.h @@ -14,6 +14,7 @@ namespace ml { template class SVMRegressor final : public OpKernel, private SVMCommon { + using SVMCommon::batched_kernel_dot; using SVMCommon::kernel_dot; using SVMCommon::set_kernel_type; using SVMCommon::get_kernel_type; diff --git a/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc b/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc index 3df0041761..53d0d601b1 100644 --- a/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc +++ b/onnxruntime/test/providers/cpu/ml/svmregressor_test.cc @@ -60,14 +60,46 @@ TEST(MLOpTest, SVMRegressorNuSVC) { TEST(MLOpTest, SVMRegressorNuSVCPolyKernel) { OpTester test("SVMRegressor", 1, onnxruntime::kMLDomain); - std::vector dual_coefficients = {-2.74322388e+01f, 5.81893108e+01f, -1.00000000e+02f, 6.91693781e+01f, 7.62161261e-02f, -2.66618042e-03f}; - std::vector support_vectors = {0.f, 0.5f, 32.f, 1.f, 1.5f, 1.f, 2.f, 2.9f, -32.f, 3.f, 13.3f, -11.f, 12.f, 12.9f, -312.f, 43.f, 413.3f, -114.f}; + std::vector dual_coefficients = {-2.74322388e+01f, 5.81893108e+01f, -1.00000000e+02f, + 6.91693781e+01f, 7.62161261e-02f, -2.66618042e-03f}; + std::vector support_vectors = {0.f, 0.5f, 32.f, + 1.f, 1.5f, 1.f, + 2.f, 2.9f, -32.f, + 3.f, 13.3f, -11.f, + 12.f, 12.9f, -312.f, + 43.f, 413.3f, -114.f}; std::vector rho = {1.5004596f}; std::vector kernel_params = {0.001f, 0.f, 3.f}; //gamma, coef0, degree - //three estimates, for 3 points each, so 9 predictions - std::vector X = {1.f, 0.0f, 0.4f, 3.0f, 44.0f, -3.f, 12.0f, 12.9f, -312.f, 23.0f, 11.3f, -222.f, 23.0f, 11.3f, -222.f, 23.0f, 3311.3f, -222.f, 23.0f, 11.3f, -222.f, 43.0f, 413.3f, -114.f}; - std::vector predictions = {1.50041863e+00f, 3.49624795e-01f, 2.75850969e+00f, -2.28659294e+02f, -2.28659294e+02f, -6.09640826e+05f, -2.28659294e+02f, 3.89055773e+00f}; + // 8 batches with 3 features in each + std::vector X = {1.f, 0.0f, 0.4f, + 3.0f, 44.0f, -3.f, + 12.0f, 12.9f, 112.f, + 23.0f, 11.3f, -222.f, + 23.0f, 11.3f, -222.f, + 23.0f, 3311.3f, -222.f, + 23.0f, 11.3f, -222.f, + 43.0f, 413.3f, -114.f}; + + /* + # batched_dot_product calculations + first = np.matmul(X, np.transpose(support_vectors)) + first *= gamma + first += coef0 + POLY_dot_product = np.power(first, degree) + + # second GEMM call in SVMRegressor code + predictions = np.matmul(POLY_dot_product, np.transpose(coefficients)) + predictions += rho + */ + std::vector predictions = {1.50041862e+00f, + 3.49624789e-01f, + -1.36680453e+02f, + -2.28659315e+02f, + -2.28659315e+02f, + -6.09640827e+05f, + -2.28659315e+02f, + 3.89055458e+00f}; test.AddAttribute("kernel_type", std::string("POLY")); test.AddAttribute("coefficients", dual_coefficients);