From dcefea2706fb35ece5e49fc138d952a2acd15824 Mon Sep 17 00:00:00 2001
From: efiks <5167930+efiks@users.noreply.github.com>
Date: Thu, 10 Nov 2022 06:11:05 +0000
Subject: [PATCH] [caffe2][tourch] Optimize BatchBoxCox (#87585)

Differential Revision: D40215424

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87585
Approved by: https://github.com/hyuen
---
 caffe2/perfkernels/batch_box_cox_avx2.cc | 121 +++++++++++++++++++----
 caffe2/perfkernels/lstm_unit_cpu-impl.h  |  22 +----
 caffe2/perfkernels/vectorizer.h          |  28 ++++++
 3 files changed, 133 insertions(+), 38 deletions(-)
 create mode 100644 caffe2/perfkernels/vectorizer.h
diff --git a/caffe2/perfkernels/batch_box_cox_avx2.cc b/caffe2/perfkernels/batch_box_cox_avx2.cc
index cf0801b4733..8b93293646d 100644
--- a/caffe2/perfkernels/batch_box_cox_avx2.cc
+++ b/caffe2/perfkernels/batch_box_cox_avx2.cc
@@ -3,6 +3,35 @@
 #include <caffe2/perfkernels/common.h>
 #include <folly/SingletonThreadLocal.h>
 
+#include "vectorizer.h"
+
+#ifndef VECTORIZED_KERNEL
+#define CPU_CAPABILITY_AVX2
+#include <ATen/cpu/vec/vec.h>
+
+namespace at::vec {
+
+template <typename scalar_t>
+Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b);
+
+// Implements the vectorized version of std::max() operation,
+// which DOESNOT propagates NaN for second argument
+template <>
+Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) {
+  // std::max(NaN, nonNan) -> NaN
+  return _mm256_max_pd(b, a);
+}
+
+
+template <>
+Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) {
+  // std::max(NaN, nonNan) -> NaN
+  return _mm256_max_ps(b, a);
+}
+
+}
+#endif
+
 #include <cstdint>
 #include <cmath>
 #include <vector>
@@ -65,6 +94,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn)
 DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn)
 #undef DELEGATE_SIMPLE_UNARY_FUNCTION
 
+#ifndef VECTORIZED_KERNEL
 template <typename T>
 void box_cox_zero_lambda(
     size_t D,
@@ -72,36 +102,93 @@ void box_cox_zero_lambda(
     const T* const lambda2_data,
     T k_eps,
     T* const output_data) {
-  Add(D, self_data, lambda2_data, output_data);
-  for (const auto j : c10::irange(D)) {
-    output_data[j] = std::max(output_data[j], k_eps);
+  int j = 0;
+  using Vec = at::vec::Vectorized<T>;
+  constexpr int64_t VLEN = Vec::size();
+  auto k_eps_vec = Vec(k_eps);
+  for(; j + VLEN < D; j += VLEN) {
+    auto data = Vec::loadu(self_data + j);
+    auto lambda2 = Vec::loadu(lambda2_data + j);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto res = max.log();
+    res.store(output_data + j);
+  }
+  for ( ;j < D; ++j) {
+    auto sum = self_data[j] + lambda2_data[j];
+    auto max = std::max(sum, k_eps);
+    output_data[j] = std::log(max);
   }
-
-  Ln(D, output_data, output_data);
 }
 
 template <typename T>
 void box_cox_nonzero_lambda(
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T k_eps,
+    T* out) {
+
+  int j = 0;
+  using Vec = at::vec::Vectorized<T>;
+  constexpr int64_t VLEN = Vec::size();
+  auto k_eps_vec = Vec(k_eps);
+  for(; j + VLEN < D; j += VLEN) {
+    auto data = Vec::loadu(data_ptr + j);
+    auto lambda2 = Vec::loadu(lambda2_ptr + j);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j);
+    auto lambda_over_1 = lambda1.reciprocal();
+    auto pow = max.pow(lambda1);
+    auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
+    res.store(out + j);
+  }
+  for ( ;j < D; ++j) {
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda_over_1 = 1 / lambda1_ptr[j];
+    auto pow = std::pow(max, lambda1_ptr[j]);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
+  }
+}
+#else
+template <typename T>
+void box_cox_zero_lambda(
     size_t D,
     const T* const self_data,
-    const T* const lambda1_data,
     const T* const lambda2_data,
     T k_eps,
     T* const output_data) {
-  Add(D, self_data, lambda2_data, output_data);
-  for (const auto j : c10::irange(D)) {
-    output_data[j] = std::max(output_data[j], k_eps);
+  VECTOR_LOOP for (auto j=0 ;j < D; ++j) {
+    auto sum = self_data[j] + lambda2_data[j];
+    auto max = std::max(sum, k_eps);
+    output_data[j] = std::log(max);
   }
-
-  // output = output ^ lambda1
-  Pow(D, output_data, lambda1_data, output_data);
-  // output = (output  - 1)/ lambda1
-  for (const auto j : c10::irange(D)) {
-    output_data[j] -= 1.0;
-  }
-  Div(D, output_data, lambda1_data, output_data);
 }
 
+template <typename T>
+void box_cox_nonzero_lambda(
+    int64_t D,
+    const T* data_ptr,
+    const T* lambda1_ptr,
+    const T* lambda2_ptr,
+    T k_eps,
+    T* out) {
+
+  VECTOR_LOOP for (auto j=0 ;j < D; ++j) {
+    FAST_MATH
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda_over_1 = 1 / lambda1_ptr[j];
+    auto pow = std::pow(max, lambda1_ptr[j]);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
+  }
+}
+
+#endif
+
 template <typename T>
 void box_cox_mixed_lambda(
     const T* const self_data,
diff --git a/caffe2/perfkernels/lstm_unit_cpu-impl.h b/caffe2/perfkernels/lstm_unit_cpu-impl.h
index 5e76e1aa39f..239d2807f77 100644
--- a/caffe2/perfkernels/lstm_unit_cpu-impl.h
+++ b/caffe2/perfkernels/lstm_unit_cpu-impl.h
@@ -5,27 +5,7 @@
 #include "c10/util/irange.h"
 #include "caffe2/utils/conversions.h"
 
-#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
-#if defined(__clang__) && (__clang_major__ > 7)
-#define IS_SANITIZER                          \
-  ((__has_feature(address_sanitizer) == 1) || \
-   (__has_feature(memory_sanitizer) == 1) ||  \
-   (__has_feature(thread_sanitizer) == 1) ||  \
-   (__has_feature(undefined_sanitizer) == 1))
-
-#if IS_SANITIZER == 0
-#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)")
-#endif
-#elif defined(_OPENMP) && (_OPENMP >= 201511)
-// Support with OpenMP4.5 and above
-#define VECTOR_LOOP _Pragma("omp for simd")
-#endif
-#endif
-
-#ifndef VECTOR_LOOP
-// Not supported
-#define VECTOR_LOOP
-#endif
+#include "vectorizer.h"
 
 namespace caffe2 {
 namespace perfkernels {
diff --git a/caffe2/perfkernels/vectorizer.h b/caffe2/perfkernels/vectorizer.h
new file mode 100644
index 00000000000..be4e6bbc280
--- /dev/null
+++ b/caffe2/perfkernels/vectorizer.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
+#if defined(__clang__) && (__clang_major__ > 7)
+#define IS_SANITIZER                          \
+  ((__has_feature(address_sanitizer) == 1) || \
+   (__has_feature(memory_sanitizer) == 1) ||  \
+   (__has_feature(thread_sanitizer) == 1) ||  \
+   (__has_feature(undefined_sanitizer) == 1))
+
+#if IS_SANITIZER == 0
+#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)")
+#define FAST_MATH _Pragma("clang fp contract(fast)")
+#define VECTORIZED_KERNEL 1
+#endif
+#elif defined(_OPENMP) && (_OPENMP >= 201511)
+// Support with OpenMP4.5 and above
+#define VECTOR_LOOP _Pragma("omp for simd")
+#define VECTORIZED_KERNEL 1
+#define FAST_MATH
+#endif
+#endif
+
+#ifndef VECTOR_LOOP
+// Not supported
+#define VECTOR_LOOP
+#define FAST_MATH
+#endif