From e7bdfa00db1d0e9909f1d0d5e159173e3fbd608f Mon Sep 17 00:00:00 2001
From: Lei Zhang
Date: Tue, 27 Nov 2018 11:04:42 -0800
Subject: [PATCH 1/3] Optimize softmax cpu by parallel using openmp.

Split the N rows into kGROUP (8) contiguous groups and run the per-row
max, max-subtraction and exponentiation for each group through the new
SoftmaxCore helper inside an OpenMP parallel-for. The row-sum Gemv stays
in SoftmaxCPU after the loop, and the final division loop is also marked
as an OpenMP parallel-for.

---
 .../core/providers/cpu/math/softmax_shared.cc | 41 +++++++++++++++----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
index 32df249f36..762ee5bdfc 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
@@ -38,6 +38,29 @@
 
 namespace onnxruntime {
 
+common::Status SoftmaxCore(const int n,
+                    const int d,
+                    const float* Xdata,
+                    float* Ydata,
+                    float* scale,
+                    const float* sum_multiplier,
+                    bool logarithmic,
+                    float* rowmax) {
+
+  const int nd = n * d;
+
+  math::RowwiseMax(n, d, Xdata, rowmax, nullptr);
+
+  // Put the intermediate result X - max(X) into Y by first copying X to Y, and then subtracting max from each entry
+  gsl::copy(gsl::make_span(Xdata, nd), gsl::make_span(Ydata, nd));
+
+  math::Gemm(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, nullptr);
+  // Exponentiation
+  math::Exp(nd, Ydata, Ydata, nullptr);
+
+  return Status::OK();
+}
+
 common::Status SoftmaxCPU(const int64_t N,
                           const int64_t D,
                           const float* Xdata,
@@ -59,19 +82,23 @@ common::Status SoftmaxCPU(const int64_t N,
   const int d = gsl::narrow_cast<int>(D);
   const int nd = gsl::narrow_cast<int>(N * D);
 
-  math::RowwiseMax(n, d, Xdata, rowmax, nullptr);
+  static const int kGROUP = 8;
+  int g = (n + (kGROUP-1)) / kGROUP;
 
-  // Put the intermediate result X - max(X) into Y by first copying X to Y, and then subtracting max from each entry
-  gsl::copy(gsl::make_span(Xdata, nd), gsl::make_span(Ydata, nd));
+  #pragma omp parallel for
+  for (int i = 0; i < kGROUP; ++i) {
+    int s = g * i;
+    if (s < n) {
+      int c = (n - s >= g)?g : (n-s);
+      SoftmaxCore(c, d, Xdata + (s*d), Ydata + (s*d), scale + s, sum_multiplier, logarithmic, rowmax+s);
+    }
+  }
 
-  math::Gemm(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, nullptr);
-
-  // Exponentiation
-  math::Exp(nd, Ydata, Ydata, nullptr);
   math::Gemv(CblasNoTrans, n, d, 1, Ydata, sum_multiplier, 0, scale, nullptr);
 
   // Do division
   if (!logarithmic) {
+    #pragma omp parallel for
     for (int i = 0; i < N; ++i) {
       for (int j = 0; j < D; ++j) {
         Ydata[i * D + j] /= scale[i];
From c530064ebe912621ac5627bc486c62356fbd1eeb Mon Sep 17 00:00:00 2001
From: Lei Zhang
Date: Wed, 28 Nov 2018 11:15:31 -0800
Subject: [PATCH 2/3] Better openmp parallel group count calculation in Softmax parallel running.

Replace the fixed group count of 8 with GetParallelGroupCount(n, d): the
smaller of the OpenMP thread count and n, further capped at
ceil(n * d / 2048) groups so that small inputs are not split. It returns
1 when built without OpenMP. Also drop the unused scale and logarithmic
parameters from SoftmaxCore.

The thread count is read with omp_get_max_threads(). The helper is
called before the parallel-for is entered, and omp_get_num_threads()
returns 1 when called from sequential code, which would pin the loop to
a single group.

---
 .../core/providers/cpu/math/softmax_shared.cc | 55 +++++++++++++------
 1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
index 762ee5bdfc..77f6cf69a0 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
@@ -36,31 +36,51 @@
 #include "gsl/gsl_algorithm"
 #include "gsl/gsl_util"
 
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#ifdef __GNUC__
+#define UNUSED __attribute__((unused))
+#else
+#define UNUSED
+#endif
+
 namespace onnxruntime {
 
 common::Status SoftmaxCore(const int n,
-                    const int d,
-                    const float* Xdata,
-                    float* Ydata,
-                    float* scale,
-                    const float* sum_multiplier,
-                    bool logarithmic,
-                    float* rowmax) {
-
+                           const int d,
+                           const float* Xdata,
+                           float* Ydata,
+                           const float* sum_multiplier,
+                           float* rowmax) {
   const int nd = n * d;
 
   math::RowwiseMax(n, d, Xdata, rowmax, nullptr);
-
   // Put the intermediate result X - max(X) into Y by first copying X to Y, and then subtracting max from each entry
   gsl::copy(gsl::make_span(Xdata, nd), gsl::make_span(Ydata, nd));
-
   math::Gemm(CblasNoTrans, CblasNoTrans, n, d, 1, -1, rowmax, sum_multiplier, 1, Ydata, nullptr);
   // Exponentiation
   math::Exp(nd, Ydata, Ydata, nullptr);
-
   return Status::OK();
 }
 
+static int GetParallelGroupCount(int UNUSED n, int UNUSED d) {
+#if defined(_OPENMP)
+  int omp_num_threads = omp_get_max_threads();  // omp_get_num_threads() is 1 outside a parallel region
+  int group_count = std::min(omp_num_threads, n);
+  if (group_count <= 1) return 1;
+
+  // 2048 * sizeof(float) is the size of 2 cache pages
+  static const int min_elements_per_group = 2048;
+  int max_groups = gsl::narrow_cast<int>((int64_t{n} * d + min_elements_per_group-1) / min_elements_per_group);
+
+  return std::min(group_count, max_groups);
+#else
+  return 1;
+#endif
+}
+
 common::Status SoftmaxCPU(const int64_t N,
                           const int64_t D,
                           const float* Xdata,
@@ -80,17 +100,16 @@ common::Status SoftmaxCPU(const int64_t N,
 
   const int n = gsl::narrow_cast<int>(N);
   const int d = gsl::narrow_cast<int>(D);
-  const int nd = gsl::narrow_cast<int>(N * D);
 
-  static const int kGROUP = 8;
-  int g = (n + (kGROUP-1)) / kGROUP;
+  int parallel_group_count = GetParallelGroupCount(n, d);
+  int n_per_group = (n + (parallel_group_count-1)) / parallel_group_count;
 
   #pragma omp parallel for
-  for (int i = 0; i < kGROUP; ++i) {
-    int s = g * i;
+  for (int i = 0; i < parallel_group_count; ++i) {
+    int s = n_per_group * i;
     if (s < n) {
-      int c = (n - s >= g)?g : (n-s);
-      SoftmaxCore(c, d, Xdata + (s*d), Ydata + (s*d), scale + s, sum_multiplier, logarithmic, rowmax+s);
+      int c = (n - s >= n_per_group) ? n_per_group : (n-s);
+      SoftmaxCore(c, d, Xdata + (s*d), Ydata + (s*d), sum_multiplier, rowmax+s);
     }
   }
 
From 6b00e6bb4df23e77da73b72c67ff98ba605712f1 Mon Sep 17 00:00:00 2001
From: Lei Zhang
Date: Wed, 28 Nov 2018 11:30:57 -0800
Subject: [PATCH 3/3] Simpler unused parameter in #if defined() switch.

Remove the GCC-only UNUSED attribute macro. The non-OpenMP branch of
GetParallelGroupCount now casts n and d to void instead, so the
parameters count as used on every compiler.

---
 onnxruntime/core/providers/cpu/math/softmax_shared.cc | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/math/softmax_shared.cc b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
index 77f6cf69a0..18f077d6c1 100644
--- a/onnxruntime/core/providers/cpu/math/softmax_shared.cc
+++ b/onnxruntime/core/providers/cpu/math/softmax_shared.cc
@@ -40,12 +40,6 @@
 #include <omp.h>
 #endif
 
-#ifdef __GNUC__
-#define UNUSED __attribute__((unused))
-#else
-#define UNUSED
-#endif
-
 namespace onnxruntime {
 
 common::Status SoftmaxCore(const int n,
@@ -65,7 +59,7 @@ common::Status SoftmaxCore(const int n,
   return Status::OK();
 }
 
-static int GetParallelGroupCount(int UNUSED n, int UNUSED d) {
+static int GetParallelGroupCount(int n, int d) {
 #if defined(_OPENMP)
   int omp_num_threads = omp_get_max_threads();  // omp_get_num_threads() is 1 outside a parallel region
   int group_count = std::min(omp_num_threads, n);
@@ -77,6 +71,8 @@ static int GetParallelGroupCount(int UNUSED n, int UNUSED d) {
 
   return std::min(group_count, max_groups);
 #else
+  (void)n;
+  (void)d;
   return 1;
 #endif
 }