[caffe2][torch] Optimize BatchBoxCox (#87585)

Differential Revision: D40215424

Pull Request resolved: https://github.com/pytorch/pytorch/pull/87585
Approved by: https://github.com/hyuen
This commit is contained in:
efiks 2022-11-10 06:11:05 +00:00 committed by PyTorch MergeBot
parent e87c79ca0c
commit dcefea2706
3 changed files with 133 additions and 38 deletions

View file

@@ -3,6 +3,35 @@
#include <caffe2/perfkernels/common.h>
#include <folly/SingletonThreadLocal.h>
#include "vectorizer.h"
#ifndef VECTORIZED_KERNEL
// Hand-written AVX2 path: force ATen to expose the AVX2 Vectorized<T>
// specializations in this translation unit.
#define CPU_CAPABILITY_AVX2
#include <ATen/cpu/vec/vec.h>
namespace at::vec {
// Generic declaration; only the double/float specializations below are defined.
template <typename scalar_t>
Vectorized<scalar_t> max(const Vectorized<scalar_t>& a, const Vectorized<scalar_t>& b);
// Implements the vectorized version of the std::max() operation,
// which does NOT propagate a NaN in the second argument
// (matching std::max(a, b): a NaN in `a` is returned, a NaN in `b` is not).
template <>
Vectorized<double> max(const Vectorized<double>& a, const Vectorized<double>& b) {
// std::max(NaN, nonNan) -> NaN
// VMAXPD returns its SECOND operand when either operand is NaN, so the
// arguments are deliberately swapped to reproduce std::max's NaN behavior.
return _mm256_max_pd(b, a);
}
template <>
Vectorized<float> max(const Vectorized<float>& a, const Vectorized<float>& b) {
// std::max(NaN, nonNan) -> NaN
// VMAXPS returns its SECOND operand when either operand is NaN, so the
// arguments are deliberately swapped to reproduce std::max's NaN behavior.
return _mm256_max_ps(b, a);
}
} // namespace at::vec
#endif
#include <cstdint>
#include <cmath>
#include <vector>
@@ -65,6 +94,7 @@ DELEGATE_SIMPLE_UNARY_FUNCTION(float, Ln, vsLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Ln, vdLn)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION
#ifndef VECTORIZED_KERNEL
template <typename T>
void box_cox_zero_lambda(
size_t D,
@ -72,36 +102,93 @@ void box_cox_zero_lambda(
const T* const lambda2_data,
T k_eps,
T* const output_data) {
Add(D, self_data, lambda2_data, output_data);
for (const auto j : c10::irange(D)) {
output_data[j] = std::max(output_data[j], k_eps);
int j = 0;
using Vec = at::vec::Vectorized<T>;
constexpr int64_t VLEN = Vec::size();
auto k_eps_vec = Vec(k_eps);
for(; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(self_data + j);
auto lambda2 = Vec::loadu(lambda2_data + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto res = max.log();
res.store(output_data + j);
}
for ( ;j < D; ++j) {
auto sum = self_data[j] + lambda2_data[j];
auto max = std::max(sum, k_eps);
output_data[j] = std::log(max);
}
Ln(D, output_data, output_data);
}
// BoxCox transform for lambda1 != 0:
//   out[j] = (max(x[j] + lambda2[j], k_eps)^lambda1[j] - 1) / lambda1[j]
// computed as pow * (1/lambda1) - (1/lambda1) so the division becomes a
// reciprocal plus a fused multiply-subtract. Explicitly vectorized (AVX2)
// main loop with a scalar tail.
//
// D           - number of elements
// data_ptr    - input values (length D)
// lambda1_ptr - per-element exponent, assumed nonzero (length D)
// lambda2_ptr - per-element shift (length D)
// k_eps       - lower clamp applied before pow
// out         - destination buffer (length D); may not alias the inputs
template <typename T>
void box_cox_nonzero_lambda(
    int64_t D,
    const T* data_ptr,
    const T* lambda1_ptr,
    const T* lambda2_ptr,
    T k_eps,
    T* out) {
  using Vec = at::vec::Vectorized<T>;
  constexpr int64_t VLEN = Vec::size();
  const auto k_eps_vec = Vec(k_eps);
  // int64_t, not int: `int j` would overflow (UB) for D > INT_MAX.
  int64_t j = 0;
  // `<=` keeps a D that is an exact multiple of VLEN on the vector path.
  for (; j + VLEN <= D; j += VLEN) {
    auto data = Vec::loadu(data_ptr + j);
    auto lambda2 = Vec::loadu(lambda2_ptr + j);
    auto sum = data + lambda2;
    // Mimics std::max's NaN semantics (see the max specialization above).
    auto max = at::vec::max(sum, k_eps_vec);
    auto lambda1 = Vec::loadu(lambda1_ptr + j);
    auto inv_lambda1 = lambda1.reciprocal();
    auto pow = max.pow(lambda1);
    // pow * (1/lambda1) - (1/lambda1) in a single fused op.
    auto res = at::vec::fmsub(pow, inv_lambda1, inv_lambda1);
    res.store(out + j);
  }
  // Scalar tail for the remaining (D % VLEN) elements.
  for (; j < D; ++j) {
    auto sum = data_ptr[j] + lambda2_ptr[j];
    auto max = std::max(sum, k_eps);
    auto inv_lambda1 = 1 / lambda1_ptr[j];
    auto pow = std::pow(max, lambda1_ptr[j]);
    out[j] = pow * inv_lambda1 - inv_lambda1;
  }
}
#else
template <typename T>
void box_cox_zero_lambda(
size_t D,
const T* const self_data,
const T* const lambda1_data,
const T* const lambda2_data,
T k_eps,
T* const output_data) {
Add(D, self_data, lambda2_data, output_data);
for (const auto j : c10::irange(D)) {
output_data[j] = std::max(output_data[j], k_eps);
VECTOR_LOOP for (auto j=0 ;j < D; ++j) {
auto sum = self_data[j] + lambda2_data[j];
auto max = std::max(sum, k_eps);
output_data[j] = std::log(max);
}
// output = output ^ lambda1
Pow(D, output_data, lambda1_data, output_data);
// output = (output - 1)/ lambda1
for (const auto j : c10::irange(D)) {
output_data[j] -= 1.0;
}
Div(D, output_data, lambda1_data, output_data);
}
// BoxCox transform for lambda1 != 0:
//   out[j] = (max(x[j] + lambda2[j], k_eps)^lambda1[j] - 1) / lambda1[j]
// Compiler-auto-vectorized variant: VECTOR_LOOP requests vectorization and
// FAST_MATH permits FP contraction (both from vectorizer.h; they expand to
// nothing on unsupported toolchains).
//
// lambda1_ptr elements are assumed nonzero.
template <typename T>
void box_cox_nonzero_lambda(
    int64_t D,
    const T* data_ptr,
    const T* lambda1_ptr,
    const T* lambda2_ptr,
    T k_eps,
    T* out) {
  // int64_t j (not `auto j = 0`, which deduces int): matches D's type and
  // avoids overflow (UB) for D > INT_MAX.
  VECTOR_LOOP for (int64_t j = 0; j < D; ++j) {
    FAST_MATH
    auto sum = data_ptr[j] + lambda2_ptr[j];
    auto max = std::max(sum, k_eps);
    // Reciprocal instead of division so the compiler can contract
    // pow * inv - inv into a fused multiply-subtract.
    auto inv_lambda1 = 1 / lambda1_ptr[j];
    auto pow = std::pow(max, lambda1_ptr[j]);
    out[j] = pow * inv_lambda1 - inv_lambda1;
  }
}
#endif
template <typename T>
void box_cox_mixed_lambda(
const T* const self_data,

View file

@@ -5,27 +5,7 @@
#include "c10/util/irange.h"
#include "caffe2/utils/conversions.h"
// Vectorization pragma support (VECTOR_LOOP / FAST_MATH / VECTORIZED_KERNEL)
// is centralized in vectorizer.h. Keeping a second copy of the detection
// logic here is harmful: if this file defines VECTOR_LOOP first,
// vectorizer.h's `#ifndef VECTOR_LOOP` fallback is skipped and FAST_MATH /
// VECTORIZED_KERNEL may be left undefined.
#include "vectorizer.h"
namespace caffe2 {
namespace perfkernels {

View file

@@ -0,0 +1,28 @@
#pragma once

// Decides whether perfkernels may use compiler auto-vectorization pragmas.
// Defines:
//   VECTOR_LOOP       - pragma placed before a loop to request vectorization
//   FAST_MATH         - pragma permitting FP contraction inside a loop body
//   VECTORIZED_KERNEL - defined to 1 only when the pragmas are available
// Pragmas are disabled in debug builds and under sanitizers.
#if (ENABLE_VECTORIZATION > 0) && !defined(_DEBUG) && !defined(DEBUG)
#if defined(__clang__) && (__clang_major__ > 7)
// NOTE: __has_feature() evaluates unknown names to 0, so the feature names
// below must match clang's exactly. In particular UBSan is spelled
// "undefined_behavior_sanitizer"; the misspelling "undefined_sanitizer"
// would silently leave vectorization pragmas enabled in UBSan builds.
#define IS_SANITIZER                          \
  ((__has_feature(address_sanitizer) == 1) || \
   (__has_feature(memory_sanitizer) == 1) ||  \
   (__has_feature(thread_sanitizer) == 1) ||  \
   (__has_feature(undefined_behavior_sanitizer) == 1))

#if IS_SANITIZER == 0
#define VECTOR_LOOP _Pragma("clang loop vectorize(enable)")
#define FAST_MATH _Pragma("clang fp contract(fast)")
#define VECTORIZED_KERNEL 1
#endif
#elif defined(_OPENMP) && (_OPENMP >= 201511)
// Supported with OpenMP 4.5 and above.
#define VECTOR_LOOP _Pragma("omp for simd")
#define VECTORIZED_KERNEL 1
#define FAST_MATH
#endif
#endif

#ifndef VECTOR_LOOP
// Vectorization pragmas not supported; expand to nothing so call sites
// compile unchanged. (VECTORIZED_KERNEL stays undefined on purpose: callers
// use it to select hand-written SIMD fallbacks.)
#define VECTOR_LOOP
#define FAST_MATH
#endif