From dcefea2706fb35ece5e49fc138d952a2acd15824 Mon Sep 17 00:00:00 2001 From: efiks <5167930+efiks@users.noreply.github.com> Date: Thu, 10 Nov 2022 06:11:05 +0000 Subject: [PATCH] [caffe2][tourch] Optimize BatchBoxCox (#87585) Differential Revision: D40215424 Pull Request resolved: https://github.com/pytorch/pytorch/pull/87585 Approved by: https://github.com/hyuen --- caffe2/perfkernels/batch_box_cox_avx2.cc | 121 +++++++++++++++++++---- caffe2/perfkernels/lstm_unit_cpu-impl.h | 22 +---- caffe2/perfkernels/vectorizer.h | 28 ++++++ 3 files changed, 133 insertions(+), 38 deletions(-) create mode 100644 caffe2/perfkernels/vectorizer.h diff --git a/caffe2/perfkernels/batch_box_cox_avx2.cc b/caffe2/perfkernels/batch_box_cox_avx2.cc index cf0801b4733..8b93293646d 100644 --- a/caffe2/perfkernels/batch_box_cox_avx2.cc +++ b/caffe2/perfkernels/batch_box_cox_avx2.cc @@ -3,6 +3,35 @@ #include #include +#include "vectorizer.h" + +#ifndef VECTORIZED_KERNEL +#define CPU_CAPABILITY_AVX2 +#include + +namespace at::vec { + +template +Vectorized max(const Vectorized& a, const Vectorized& b); + +// Implements the vectorized version of std::max() operation, +// which DOESNOT propagates NaN for second argument +template <> +Vectorized max(const Vectorized& a, const Vectorized& b) { + // std::max(NaN, nonNan) -> NaN + return _mm256_max_pd(b, a); +} + + +template <> +Vectorized max(const Vectorized& a, const Vectorized& b) { + // std::max(NaN, nonNan) -> NaN + return _mm256_max_ps(b, a); +} + +} +#endif + #include #include #include @@ -65,6 +94,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn) DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn) #undef DELEGATE_SIMPLE_UNARY_FUNCTION +#ifndef VECTORIZED_KERNEL template void box_cox_zero_lambda( size_t D, @@ -72,36 +102,93 @@ void box_cox_zero_lambda( const T* const lambda2_data, T k_eps, T* const output_data) { - Add(D, self_data, lambda2_data, output_data); - for (const auto j : c10::irange(D)) { - output_data[j] = std::max(output_data[j], k_eps); + int j = 0; + using Vec = at::vec::Vectorized; + constexpr int64_t VLEN = Vec::size(); + auto k_eps_vec = Vec(k_eps); + for(; j + VLEN < D; j += VLEN) { + auto data = Vec::loadu(self_data + j); + auto lambda2 = Vec::loadu(lambda2_data + j); + auto sum = data + lambda2; + auto max = at::vec::max(sum, k_eps_vec); + auto res = max.log(); + res.store(output_data + j); + } + for ( ;j < D; ++j) { + auto sum = self_data[j] + lambda2_data[j]; + auto max = std::max(sum, k_eps); + output_data[j] = std::log(max); } - - Ln(D, output_data, output_data); } template void box_cox_nonzero_lambda( + int64_t D, + const T* data_ptr, + const T* lambda1_ptr, + const T* lambda2_ptr, + T k_eps, + T* out) { + + int j = 0; + using Vec = at::vec::Vectorized; + constexpr int64_t VLEN = Vec::size(); + auto k_eps_vec = Vec(k_eps); + for(; j + VLEN < D; j += VLEN) { + auto data = Vec::loadu(data_ptr + j); + auto lambda2 = Vec::loadu(lambda2_ptr + j); + auto sum = data + lambda2; + auto max = at::vec::max(sum, k_eps_vec); + auto lambda1 = Vec::loadu(lambda1_ptr + j); + auto lambda_over_1 = lambda1.reciprocal(); + auto pow = max.pow(lambda1); + auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1); + res.store(out + j); + } + for ( ;j < D; ++j) { + auto sum = data_ptr[j] + lambda2_ptr[j]; + auto max = std::max(sum, k_eps); + auto lambda_over_1 = 1 / lambda1_ptr[j]; + auto pow = std::pow(max, lambda1_ptr[j]); + out[j] = pow * lambda_over_1 - lambda_over_1; + } +} +#else +template +void box_cox_zero_lambda( size_t D, const T* const self_data, - const T* const lambda1_data, const T* const lambda2_data, T k_eps, T* const output_data) { - Add(D, self_data, lambda2_data, output_data); - for (const auto j : c10::irange(D)) { - output_data[j] = std::max(output_data[j], k_eps); + VECTOR_LOOP for (auto j=0 ;j < D; ++j) { + auto sum = self_data[j] + lambda2_data[j]; + auto max = std::max(sum, k_eps); + output_data[j] = std::log(max); } - - // output = output ^ lambda1 - Pow(D, output_data, lambda1_data, output_data); - // output = (output - 1)/ lambda1 - for (const auto j : c10::irange(D)) { - output_data[j] -= 1.0; - } - Div(D, output_data, lambda1_data, output_data); } +template +void box_cox_nonzero_lambda( + int64_t D, + const T* data_ptr, + const T* lambda1_ptr, + const T* lambda2_ptr, + T k_eps, + T* out) { + + VECTOR_LOOP for (auto j=0 ;j < D; ++j) { + FAST_MATH + auto sum = data_ptr[j] + lambda2_ptr[j]; + auto max = std::max(sum, k_eps); + auto lambda_over_1 = 1 / lambda1_ptr[j]; + auto pow = std::pow(max, lambda1_ptr[j]); + out[j] = pow * lambda_over_1 - lambda_over_1; + } +} + +#endif + template void box_cox_mixed_lambda( const T* const self_data, diff --git a/caffe2/perfkernels/lstm_unit_cpu-impl.h b/caffe2/perfkernels/lstm_unit_cpu-impl.h index 5e76e1aa39f..239d2807f77 100644 --- a/caffe2/perfkernels/lstm_unit_cpu-impl.h +++ b/caffe2/perfkernels/lstm_unit_cpu-impl.h @@ -5,27 +5,7 @@ #include "c10/util/irange.h" #include "caffe2/utils/conversions.h" -#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG) -#if defined(__clang__) && (__clang_major__ > 7) -#define IS_SANITIZER \ - ((__has_feature(address_sanitizer) == 1) || \ - (__has_feature(memory_sanitizer) == 1) || \ - (__has_feature(thread_sanitizer) == 1) || \ - (__has_feature(undefined_sanitizer) == 1)) - -#if IS_SANITIZER == 0 -#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)") -#endif -#elif defined(_OPENMP) && (_OPENMP >= 201511) -// Support with OpenMP4.5 and above -#define VECTOR_LOOP _Pragma("omp for simd") -#endif -#endif - -#ifndef VECTOR_LOOP -// Not supported -#define VECTOR_LOOP -#endif +#include "vectorizer.h" namespace caffe2 { namespace perfkernels { diff --git a/caffe2/perfkernels/vectorizer.h b/caffe2/perfkernels/vectorizer.h new file mode 100644 index 00000000000..be4e6bbc280 --- /dev/null +++ b/caffe2/perfkernels/vectorizer.h @@ -0,0 +1,28 @@ +#pragma once + +#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG) +#if defined(__clang__) && (__clang_major__ > 7) +#define IS_SANITIZER \ + ((__has_feature(address_sanitizer) == 1) || \ + (__has_feature(memory_sanitizer) == 1) || \ + (__has_feature(thread_sanitizer) == 1) || \ + (__has_feature(undefined_sanitizer) == 1)) + +#if IS_SANITIZER == 0 +#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)") +#define FAST_MATH _Pragma("clang fp contract(fast)") +#define VECTORIZED_KERNEL 1 +#endif +#elif defined(_OPENMP) && (_OPENMP >= 201511) +// Support with OpenMP4.5 and above +#define VECTOR_LOOP _Pragma("omp for simd") +#define VECTORIZED_KERNEL 1 +#define FAST_MATH +#endif +#endif + +#ifndef VECTOR_LOOP +// Not supported +#define VECTOR_LOOP +#define FAST_MATH +#endif