From 40d80cde8f26204ba6d43ab9f8ba34ebcfc73ca8 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Thu, 9 Apr 2020 14:05:25 +1000 Subject: [PATCH] Rework CDist (#3393) * Make CDist faster via Eigen squaredNorma and GEMM. * Add call to abs() as the GEMM output may differ slightly due to floating point accuracy and result in a negative distance which returns NaN if sqrt() is applied to it. * Update math::Gemm to use the type for alpha and beta instead of hardcoding to float. Matches the GemmEx definition. * Provide Eigen based replication of the GEMM call on x86 if T=double. * Make test model data deterministic. * Do the GEMM first so we can avoid potentially subtracting two numbers that are very close to each other. --- onnxruntime/contrib_ops/cpu/cdist.cc | 117 +++++++++++++++- onnxruntime/contrib_ops/cpu/cdist.h | 131 +----------------- onnxruntime/core/util/math.h | 4 +- onnxruntime/core/util/math_cpu.cc | 25 ++++ onnxruntime/test/contrib_ops/cdist_op_test.cc | 73 ++++++++++ onnxruntime/test/onnx/gen_test_models.py | 6 +- 6 files changed, 227 insertions(+), 129 deletions(-) create mode 100644 onnxruntime/test/contrib_ops/cdist_op_test.cc diff --git a/onnxruntime/contrib_ops/cpu/cdist.cc b/onnxruntime/contrib_ops/cpu/cdist.cc index d97f32846e..f6768bc731 100644 --- a/onnxruntime/contrib_ops/cpu/cdist.cc +++ b/onnxruntime/contrib_ops/cpu/cdist.cc @@ -2,6 +2,11 @@ // Licensed under the MIT License. #include "cdist.h" +#include "core/common/common.h" +#include "core/framework/op_kernel.h" +#include "core/util/math.h" +#include "core/util/math_cpuonly.h" +#include "core/mlas/inc/mlas.h" namespace onnxruntime { namespace contrib { @@ -12,5 +17,115 @@ namespace contrib { DEFINE_KERNEL(float); DEFINE_KERNEL(double); +template +static void CalculateSqeuclidean(const Tensor& a, const Tensor& b, Tensor& c, concurrency::ThreadPool* threadpool) { + // input shapes have already been validated + const auto& shape_a = a.Shape().GetDims(); // {m, k} + const auto& shape_b = b.Shape().GetDims(); // {n, k} + int64_t m = shape_a[0]; + int64_t n = shape_b[0]; + int64_t k = shape_a[1]; + + // https://github.com/droyed/eucl_dist/wiki/Main-Article + // dist(Xi,Yj) = sum_k(Xik**2) + sum_k(Yjk**2) - 2*sum_k(Xik*Yjk) + + const auto* a_data = a.Data(); + const auto* b_data = b.Data(); + auto* c_data = c.MutableData(); + + // ReduceSumSquare for A + std::vector a_ss; + a_ss.resize(m); + const auto* cur_a = a_data; + for (int64_t i = 0; i < m; ++i) { + a_ss[i] = ConstEigenVectorMap(cur_a, k).squaredNorm(); + cur_a += k; + } + + // ReduceSumSquare for B + std::vector b_ss; + b_ss.resize(n); + const auto* cur_b = b_data; + for (int64_t i = 0; i < n; ++i) { + b_ss[i] = ConstEigenVectorMap(cur_b, k).squaredNorm(); + cur_b += k; + } + + // NOTE: We want to avoid subtracting two numbers that are very close to each other as that can lead to + // 'catastrophic cancellation'. (sum_k(Xik**2) + sum_k(Yjk**2)) would be close to 2*sum_k(Xik*Yjk) if the values + // in Xij and Yjk are very similar, so subtracting can be problematic. + // Due to that we calculate -2*sum_k(Xik*Yjk) using GEMM, add sum_k(Xik**2) next, and add sum_k(Yjk**2) last. + +// use MLAS on 64-bit (no 32-bit dgemm), or MKL on 32-bit or 64-bit +#if defined(_M_AMD64) || defined(__x86_64__) || defined(USE_MKLML_FOR_BLAS) + // Use GEMM of A and B^T with -2 as alpha to calculate -2*sum_k(Xik*Yjk) + math::Gemm(CBLAS_TRANSPOSE::CblasNoTrans, CBLAS_TRANSPOSE::CblasTrans, + m, n, k, + static_cast(-2.), a_data, b_data, static_cast(0.), + c_data, + threadpool); +#else + // the performance of this isn't great as the eigen matmul is single threaded by default + // if you're on x86 and care about performance try MKL first. if there's a good enough argument for optimising this + // we can look into it in the future. + ORT_UNUSED_PARAMETER(threadpool); + + // https://eigen.tuxfamily.org/dox/TopicWritingEfficientProductExpression.html + auto out_map = EigenMatrixMapRowMajor(c_data, m, n); + out_map.noalias() = static_cast(-2.) * + (ConstEigenMatrixMapRowMajor(a_data, m, k) * + ConstEigenMatrixMapRowMajor(b_data, n, k).transpose()); +#endif + + // add a_ss and b_ss, with broadcast + // output shape is {m, n} + auto* cur_out = c_data; + for (int64_t i = 0; i < m; ++i) { + T a_val = a_ss[i]; + for (int64_t j = 0; j < n; ++j) { + *cur_out = (*cur_out + a_val) + b_ss[j]; + ++cur_out; + } + } +} + +template +common::Status CDist::Compute(OpKernelContext* context) const { + concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); + + assert(context->InputCount() == 2); + const Tensor* A = context->Input(0); + const Tensor* B = context->Input(1); + const TensorShape& shape_a = A->Shape(); + const TensorShape& shape_b = B->Shape(); + if (shape_a.NumDimensions() != 2 || shape_a[1] <= 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "The first input of CDist kernel has wrong shape: ", shape_a); + } + + if (shape_b.NumDimensions() != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "The second input of CDist kernel has wrong shape: ", shape_b); + } + if (shape_a[1] != shape_b[1]) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input shape dimensions mismatch:", shape_a, " and ", shape_b); + } + + TensorShape output_shape = {shape_a[0], shape_b[0]}; + Tensor* C = context->Output(0, output_shape); + T* output = C->MutableData(); + + CalculateSqeuclidean(*A, *B, *C, tp); + auto map_out = EigenVectorArrayMap(output, output_shape.Size()); + + // because we use GEMM in CalculateSqeuclidean there's a slight chance a number extremely close to zero + // could be negative, so we need to run abs() to avoid NaN's in the results. + if (mode_ == Mode::EUCLIDEAN) { + map_out = map_out.abs().sqrt(); // do both abs and sqrt in one call so Eigen has a chance to combine + } else if (mode_ == Mode::SQEUCLIDEAN) { + map_out = map_out.abs(); + } + + return Status::OK(); +} + } // namespace contrib -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/cdist.h b/onnxruntime/contrib_ops/cpu/cdist.h index c7f0489c4a..9ccba3aee7 100644 --- a/onnxruntime/contrib_ops/cpu/cdist.h +++ b/onnxruntime/contrib_ops/cpu/cdist.h @@ -4,152 +4,33 @@ #pragma once #include "core/common/common.h" -#include "core/util/distance.h" #include "core/framework/op_kernel.h" -#include "core/framework/op_kernel_context_internal.h" -#include "core/util/math_cpuonly.h" -#include "assert.h" -#ifndef _OPENMP -#include "core/util/eigen_common_wrapper.h" -#endif namespace onnxruntime { namespace contrib { -// https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html -//\param a: matrix with shape of[ma,n] -//\param b: matrix with shape of[mb,n] -//\param dest: matrix with shape of [ma,mb] -template -void cdist_single_threaded(const T* a, const T* b, T* dest, size_t ma, size_t mb, size_t n) { - ElemFunc f; - for (size_t i = 0; i != ma; ++i) { - // i-th row of matrix A - const T* a1 = a + n * i; - for (size_t j = 0; j != mb; ++j) { - // j-th row of matrix B - const T* b1 = b + n * j; - *dest++ = f(a1, b1, n); - } - } -} - -template -class CDistOneBlock { - public: - const T* a; - const T* b; - T* dest; - size_t mb; - size_t n; - - CDistOneBlock(const T* a1, const T* b1, T* dest1, size_t mb1, size_t n1) - : a(a1), b(b1), dest(dest1), mb(mb1), n(n1) {} - - void operator()(Eigen::Index start, Eigen::Index end) { - Eigen::Index mb_local = mb; - Eigen::Index i = start / mb_local; - Eigen::Index j = start - i * mb_local; - assert(i * mb_local + j == start); - Eigen::Index i_end = end / mb_local; - Eigen::Index j_end = end - i_end * mb_local; - assert(i_end * mb_local + j_end == end); - - T* dest_local = dest + start; - ElemFunc f; - const T* a1 = a + n * i; - for (; i != i_end; ++i) { - a1 = a + n * i; - for (; j != mb_local; ++j) { - const T* b1 = b + n * j; - *dest_local++ = f(a1, b1, n); - } - j = 0; - } - a1 = a + n * i; - for (j = 0; j != j_end; ++j) { - const T* b1 = b + n * j; - *dest_local++ = f(a1, b1, n); - } - assert(dest_local == dest + end); - } -}; - -template -void cdist(const T* a, const T* b, T* dest, size_t ma, size_t mb, size_t n, concurrency::ThreadPool* tp) { -#ifndef _OPENMP - if (tp == nullptr) { -#else - (void)tp; -#endif - return cdist_single_threaded(a, b, dest, ma, mb, n); -#ifndef _OPENMP - } - tp->ParallelFor(ma * mb, static_cast(3 * n), CDistOneBlock(a, b, dest, mb, n)); -#endif -} - template class CDist final : public OpKernel { private: typedef void (*DistFunc)(const T* a, const T* b, T* dest, size_t ma, size_t mb, size_t n, concurrency::ThreadPool* tp); - enum { EUCLIDEAN, - SQEUCLIDEAN } mode_; + enum class Mode { EUCLIDEAN, + SQEUCLIDEAN } mode_; public: CDist(const OpKernelInfo& info) : OpKernel(info) { std::string metric; ORT_ENFORCE(info.GetAttr("metric", &metric).IsOK()); if (metric.compare("sqeuclidean") == 0) - mode_ = SQEUCLIDEAN; + mode_ = Mode::SQEUCLIDEAN; else if (metric.compare("euclidean") == 0) { - mode_ = EUCLIDEAN; + mode_ = Mode::EUCLIDEAN; } else ORT_NOT_IMPLEMENTED(); } - common::Status Compute(OpKernelContext* context) const override { - concurrency::ThreadPool* tp = context->GetOperatorThreadPool(); - - assert(context->InputCount() == 2); - const Tensor* A = context->Input(0); - const Tensor* B = context->Input(1); - const TensorShape& shape_a = A->Shape(); - const TensorShape& shape_b = B->Shape(); - if (shape_a.NumDimensions() != 2 || shape_a[1] <= 0) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "The first input of CDist kernel has wrong shape: ", shape_a); - } - - if (shape_b.NumDimensions() != 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "The second input of CDist kernel has wrong shape: ", shape_b); - } - if (shape_a[1] != shape_b[1]) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Input shape dimensions mismatch:", shape_a, " and ", shape_b); - } - - TensorShape output_shape = {shape_a[0], shape_b[0]}; - Tensor* C = context->Output(0, output_shape); - T* output = C->MutableData(); - switch (mode_) { - case EUCLIDEAN: - if (shape_a[1] >= 8) - cdist >(A->Data(), B->Data(), output, shape_a[0], shape_b[0], shape_a[1], tp); - else // for smaller vector size, a raw loop is better - cdist >(A->Data(), B->Data(), output, shape_a[0], shape_b[0], shape_a[1], tp); - break; - case SQEUCLIDEAN: - if (shape_a[1] >= 8) - cdist >(A->Data(), B->Data(), output, shape_a[0], shape_b[0], shape_a[1], - tp); - else // for smaller vector size, a raw loop is better - cdist >(A->Data(), B->Data(), output, shape_a[0], shape_b[0], shape_a[1], tp); - break; - default: - return Status(ONNXRUNTIME, NOT_IMPLEMENTED); - } - return Status::OK(); - } + common::Status Compute(OpKernelContext* context) const override; }; + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h index abdcd4f25a..4fcf08372a 100644 --- a/onnxruntime/core/util/math.h +++ b/onnxruntime/core/util/math.h @@ -88,10 +88,10 @@ void Gemm( int64_t M, int64_t N, int64_t K, - float alpha, + T alpha, const T* A, const T* B, - float beta, + T beta, T* C, Provider*); diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc index 4619760255..8930aafbf4 100644 --- a/onnxruntime/core/util/math_cpu.cc +++ b/onnxruntime/core/util/math_cpu.cc @@ -84,6 +84,17 @@ void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE MlasGemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, threadpool); } +#if defined(_M_AMD64) || defined(__x86_64__) +template <> +void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, + const int64_t N, const int64_t K, double alpha, const double* A, const double* B, double beta, + double* C, ThreadPool* threadpool) { + int lda = static_cast((TransA == CblasNoTrans) ? K : M); + int ldb = static_cast((TransB == CblasNoTrans) ? N : K); + MlasGemm(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N, threadpool); +} +#endif + template <> void MatMul(int M, int N, int K, const float* A, const float* B, float* C, ThreadPool* threadpool) { MlasGemm(CblasNoTrans, CblasNoTrans, M, N, K, 1.f, A, K, B, N, 0.f, C, N, threadpool); @@ -160,6 +171,20 @@ void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE beta, C, gsl::narrow_cast(N)); } +template <> +void Gemm(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int64_t M, + const int64_t N, const int64_t K, double alpha, const double* A, const double* B, double beta, + double* C, ThreadPool* /*context*/) { + int lda = gsl::narrow_cast((TransA == CblasNoTrans) ? K : M); + int ldb = gsl::narrow_cast((TransB == CblasNoTrans) ? N : K); + cblas_dgemm(CblasRowMajor, TransA, TransB, + gsl::narrow_cast(M), + gsl::narrow_cast(N), + gsl::narrow_cast(K), + alpha, A, lda, B, ldb, + beta, C, gsl::narrow_cast(N)); +} + template <> void MatMul(int M, int N, int K, const float* A, const float* B, float* C, ThreadPool*) { cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, A, K, B, N, 0, C, N); diff --git a/onnxruntime/test/contrib_ops/cdist_op_test.cc b/onnxruntime/test/contrib_ops/cdist_op_test.cc new file mode 100644 index 0000000000..72626aa057 --- /dev/null +++ b/onnxruntime/test/contrib_ops/cdist_op_test.cc @@ -0,0 +1,73 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { + +TEST(CDistOpTest, Euclidean) { + OpTester test("CDist", 1, onnxruntime::kMSDomain); + test.AddAttribute("metric", "euclidean"); + + test.AddInput("A", {4, 2}, + {-1.0856307f, 0.99734545f, + 0.2829785f, -1.5062947f, + -0.5786002f, 1.6514366f, + -2.4266791f, -0.42891264f}); + test.AddInput("B", {3, 2}, + {1.2659363f, -0.8667404f, + -0.6788862f, -0.09470897f, + 1.4913896f, -0.638902f}); + + test.AddOutput("y", {4, 3}, + {3.0007803f, 1.1653428f, 3.0525956f, + 1.1727045f, 1.7081447f, 1.4874904f, + 3.1214628f, 1.749023f, 3.0871522f, + 3.718481f, 1.7794584f, 3.923692f}); + test.Run(); +} + +TEST(CDistOpTest, Sqeuclidean) { + OpTester test("CDist", 1, onnxruntime::kMSDomain); + test.AddAttribute("metric", "sqeuclidean"); + + test.AddInput("A", {4, 2}, + {-1.0856307f, 0.99734545f, + 0.2829785f, -1.5062947f, + -0.5786002f, 1.6514366f, + -2.4266791f, -0.42891264f}); + test.AddInput("B", {3, 2}, + {1.2659363f, -0.8667404f, + -0.6788862f, -0.09470897f, + 1.4913896f, -0.638902f}); + + test.AddOutput("y", {4, 3}, + {9.004683f, 1.3580238f, 9.318338f, + 1.3752356f, 2.917758f, 2.2126276f, + 9.74353f, 3.0590816f, 9.530509f, + 13.827101f, 3.1664724f, 15.395359f}); + test.Run(); +} + +TEST(CDistOpTest, DoubleEuclidean) { + OpTester test("CDist", 1, onnxruntime::kMSDomain); + test.AddAttribute("metric", "euclidean"); + + test.AddInput("A", {2, 3}, + {0.17251948, 1.6354825, 0.0373364, + -0.8841497, -1.1431923, -0.621366}); + test.AddInput("B", {3, 3}, + {-1.3486496, -0.81973106, -0.1342539, + 1.5996001, -0.28360364, -0.5063398, + 0.06890842, 1.4522595, -1.6390957}); + + test.AddOutput("y", {2, 3}, + {2.8933496, 2.4525568, 1.6895947, + 0.7467701, 2.6308053, 2.9462626}); + test.Run(); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/onnx/gen_test_models.py b/onnxruntime/test/onnx/gen_test_models.py index 40a9dd1e6c..1ddf6fddd9 100644 --- a/onnxruntime/test/onnx/gen_test_models.py +++ b/onnxruntime/test/onnx/gen_test_models.py @@ -5,6 +5,7 @@ import onnx import numpy as np import os import argparse +from datetime import date from onnx import numpy_helper from onnx import helper from onnx import utils @@ -20,7 +21,7 @@ def parse_arguments(): def write_config(model_dir): with open(os.path.join(model_dir, "config.txt"), "w") as f: - f.write("per_sample_tolerance:1e-6\n") + f.write("per_sample_tolerance:1e-3\n") f.write("relative_per_sample_tolerance:1e-6\n") @@ -182,6 +183,9 @@ def test_cdist(output_dir): args = parse_arguments() os.makedirs(args.output_dir, exist_ok=True) +# make test values deterministic but variable +today = date.today() +np.random.seed(today.year + today.month + today.day) test_abs(args.output_dir) test_size(args.output_dir) test_reducesum(args.output_dir)