pytorch/caffe2/utils/math/elementwise.cu

#include "caffe2/utils/math/elementwise.h"

#include <thrust/functional.h>

#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math/half_utils.h"
#include "caffe2/utils/math/utils.h"

namespace caffe2 {
namespace math {

namespace {

#define DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(T, Func, DeviceFunc) \
  __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) {     \
    const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;    \
    if (i < N) {                                                        \
      Y[i] = DeviceFunc(X[i]);                                          \
    }                                                                   \
  }
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Exp, expf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Log, logf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cos, cosf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Acos, acosf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sin, sinf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Asin, asinf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tan, tanf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Atan, atanf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sinh, sinhf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cosh, coshf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tanh, tanhf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Abs, fabsf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqr, utils::Square<float>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqrt, sqrtf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Rsqrt, rsqrtf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cbrt, cbrtf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Erf, erff)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Erf, erf)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
    std::int32_t,
    Cube,
    utils::Cube<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
    std::int64_t,
    Cube,
    utils::Cube<std::int64_t>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cube, utils::Cube<float>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Cube, utils::Cube<double>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(bool, Not, utils::Not<bool>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
    std::int32_t,
    Neg,
    utils::Negate<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
    std::int64_t,
    Neg,
    utils::Negate<std::int64_t>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Neg, utils::Negate<float>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Neg, utils::Negate<double>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
    std::int32_t,
    Sign,
    utils::Sign<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
    std::int64_t,
    Sign,
    utils::Sign<std::int64_t>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sign, utils::Sign<float>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Sign, utils::Sign<double>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Inv, utils::Inv<float>)
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Inv, utils::Inv<double>)
#undef DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION

template <typename T>
__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
  const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (i < N) {
#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
    c10::cuda::compat::sincos(__ldg(X + i), S + i, C + i);
#else
    c10::cuda::compat::sincos(X[i], S + i, C + i);
#endif
  }
}

template <typename T, class Func>
__global__ void SimpleBinaryCUDAKernel(
    const int N,
    const Func func,
    const T* A,
    const T* B,
    T* C) {
  const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (i < N) {
    C[i] = func(A[i], B[i]);
  }
}

template <typename T, class Comp>
__global__ void SimpleCompareCUDAKernel(
    const int N,
    const Comp comp,
    const T* A,
    const T* B,
    bool* C) {
  const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
  if (i < N) {
    C[i] = comp(A[i], B[i]);
  }
}

} // namespace

#define DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func)           \
  template <>                                                \
  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>(              \
      const int N, const T* X, T* Y, CUDAContext* context) { \
    if (N > 0) {                                             \
      const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS);        \
      Func##CUDAKernel<<<                                    \
          M,                                                 \
          CAFFE_CUDA_NUM_THREADS,                            \
          0,                                                 \
          context->cuda_stream()>>>(N, X, Y);                \
    }                                                        \
  }
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Cube)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Cube)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv)
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv)
#undef DEFINE_SIMPLE_CUDA_UNARY_FUNCTION

#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T)                             \
  template <>                                                         \
  CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>(                     \
      const int N, const T* X, T* S, T* C, CUDAContext* context) {    \
    if (N > 0) {                                                      \
      const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS);                 \
      SinCosCUDAKernel<T>                                             \
          <<<K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
              N, X, S, C);                                            \
    }                                                                 \
  }
CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
#undef CAFFE2_SPECIALIZED_CUDA_SINCOS

#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc)        \
  template <>                                                            \
  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>(                          \
      const int N, const T* A, const T* B, T* C, CUDAContext* context) { \
    if (N > 0) {                                                         \
      const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS);                    \
      SimpleBinaryCUDAKernel<<<                                          \
          M,                                                             \
          CAFFE_CUDA_NUM_THREADS,                                        \
          0,                                                             \
          context->cuda_stream()>>>(N, DeviceFunc, A, B, C);             \
    }                                                                    \
  }
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Add,
    thrust::plus<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Add,
    thrust::plus<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, thrust::plus<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Add, thrust::plus<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Add, utils::HalfAddFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Sub,
    thrust::minus<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Sub,
    thrust::minus<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, thrust::minus<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Sub, thrust::minus<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Sub, utils::HalfSubFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Mul,
    thrust::multiplies<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Mul,
    thrust::multiplies<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, thrust::multiplies<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Mul, thrust::multiplies<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Mul, utils::HalfMulFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    Div,
    thrust::divides<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    Div,
    thrust::divides<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, thrust::divides<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Div, thrust::divides<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Div, utils::HalfDivFunctor())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Min, thrust::minimum<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Min, thrust::minimum<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Max, thrust::maximum<float>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Max, thrust::maximum<double>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, And, thrust::logical_and<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Or, thrust::logical_or<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Xor, thrust::bit_xor<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseAnd, thrust::bit_and<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    BitwiseAnd,
    thrust::bit_and<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    BitwiseAnd,
    thrust::bit_and<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseOr, thrust::bit_or<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    BitwiseOr,
    thrust::bit_or<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    BitwiseOr,
    thrust::bit_or<std::int64_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseXor, thrust::bit_xor<bool>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int32_t,
    BitwiseXor,
    thrust::bit_xor<std::int32_t>())
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    std::int64_t,
    BitwiseXor,
    thrust::bit_xor<std::int64_t>())
#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION

#define DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(T, Func, DeviceComp)          \
  template <>                                                               \
  CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>(                             \
      const int N, const T* A, const T* B, bool* C, CUDAContext* context) { \
    if (N > 0) {                                                            \
      const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS);                       \
      SimpleCompareCUDAKernel<<<                                            \
          M,                                                                \
          CAFFE_CUDA_NUM_THREADS,                                           \
          0,                                                                \
          context->cuda_stream()>>>(N, DeviceComp, A, B, C);                \
    }                                                                       \
  }
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, EQ, thrust::equal_to<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    EQ,
    thrust::equal_to<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    EQ,
    thrust::equal_to<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, EQ, thrust::equal_to<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, EQ, thrust::equal_to<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, NE, thrust::not_equal_to<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    NE,
    thrust::not_equal_to<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    NE,
    thrust::not_equal_to<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, NE, thrust::not_equal_to<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    double,
    NE,
    thrust::not_equal_to<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LT, thrust::less<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    LT,
    thrust::less<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    LT,
    thrust::less<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LT, thrust::less<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LT, thrust::less<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LE, thrust::less_equal<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    LE,
    thrust::less_equal<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    LE,
    thrust::less_equal<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LE, thrust::less_equal<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LE, thrust::less_equal<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GT, thrust::greater<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    GT,
    thrust::greater<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    GT,
    thrust::greater<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GT, thrust::greater<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, GT, thrust::greater<double>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GE, thrust::greater_equal<bool>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int32_t,
    GE,
    thrust::greater_equal<std::int32_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    std::int64_t,
    GE,
    thrust::greater_equal<std::int64_t>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GE, thrust::greater_equal<float>())
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
    double,
    GE,
    thrust::greater_equal<double>())
#undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION

} // namespace math
} // namespace caffe2