pytorch/caffe2/utils/math_gpu.cu

// Implements the math functions for GPU.

#include "caffe2/utils/math.h"

#include <limits>
#include <numeric>
#include <vector>

#include <cub/block/block_reduce.cuh>
#include <cub/cub.cuh>

#include <thrust/functional.h>

#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math_utils.h"

#if THRUST_VERSION >= 100800
#define THRUST_SUPPORTS_PER_THREAD
#endif // THRUST_VERSION >= 100800

namespace caffe2 {
namespace math {

namespace {

#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr)        \
  template <typename T>                                               \
  struct Func##Functor {                                              \
    inline __host__ __device__ T                                      \
    operator()(const T& lhs, const T& rhs) const {                    \
      return lhs expr rhs;                                            \
    }                                                                 \
  };                                                                  \
  template <>                                                         \
  struct Func##Functor<float16> {                                     \
    inline __host__ __device__ float16                                \
    operator()(const float16& lhs, const float16& rhs) const {        \
      return convert::To<float, float16>(convert::To<float16, float>( \
          lhs) expr convert::To<float16, float>(rhs));                \
    }                                                                 \
  };
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +)
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -)
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Mul, *)
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /)
#undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR

template <typename T>
__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
  CUDA_1D_KERNEL_LOOP(i, N) {
#if __CUDA_ARCH__ >= 350
    sincos(__ldg(X + i), S + i, C + i);
#else
    sincos(X[i], S + i, C + i);
#endif
  }
}

template <typename TIn, typename TOut, class BinaryOperator>
__global__ void SimpleBinaryOpCUDAKernel(
    const int N,
    const BinaryOperator op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    C[i] = op(A[i], B[i]);
  }
}

template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
__global__ void RowwiseBinaryOpCUDAKenel(
    const int size,
    const FixedDivisor<int> cols,
    const BinaryOperator op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  CUDA_1D_KERNEL_LOOP(C_index, size) {
    const int j = cols.Mod(C_index);
    const int A_index = broadcast_1st ? j : C_index;
    const int B_index = broadcast_1st ? C_index : j;
    C[C_index] = op(A[A_index], B[B_index]);
  }
}

template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
__global__ void ColwiseBinaryOpCUDAKenel(
    const int size,
    const FixedDivisor<int> cols,
    const BinaryOperator op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  CUDA_1D_KERNEL_LOOP(C_index, size) {
    const int i = cols.Div(C_index);
    const int A_index = broadcast_1st ? i : C_index;
    const int B_index = broadcast_1st ? C_index : i;
    C[C_index] = op(A[A_index], B[B_index]);
  }
}

template <typename TIn, typename TOut, class BinaryOperator, int D>
__global__ void BroadcastBinaryOpCUDAKernel(
    const int size,
    const SimpleArray<int, D> A_strides,
    const SimpleArray<int, D> B_strides,
    const SimpleArray<FixedDivisor<int>, D> C_dims,
    const BinaryOperator op,
    const TIn* A,
    const TIn* B,
    TOut* C) {
  CUDA_1D_KERNEL_LOOP(C_index, size) {
    int A_index = 0;
    int B_index = 0;
    int C_index_val = C_index;
#pragma unroll
    for (int i = D - 1; i >= 0; --i) {
      int d;
      C_dims.data[i].DivMod(C_index_val, &C_index_val, &d);
      A_index += d * A_strides.data[i];
      B_index += d * B_strides.data[i];
    }
    C[C_index] = op(A[A_index], B[B_index]);
  }
}

template <typename TIn, typename TOut, class BinaryOperator>
void BinaryOpWith2DBroadcasting(
    const int ndim,
    const int* dims,
    const int pivot,
    const bool rowwise_broadcast,
    const bool broadcast_1st,
    const BinaryOperator& op,
    const TIn* A,
    const TIn* B,
    TOut* C,
    CUDAContext* context) {
  const int rows =
      std::accumulate(dims, dims + pivot, 1, std::multiplies<int>());
  const int cols =
      std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies<int>());
  if (rows == 0 || cols == 0) {
    return;
  }
  const int size = rows * cols;
  const FixedDivisor<int> cols_div(cols);
  if (rowwise_broadcast) {
    if (broadcast_1st) {
      RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, true>
          <<<CAFFE_GET_BLOCKS(size),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
    } else {
      RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
          <<<CAFFE_GET_BLOCKS(size),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
    }
  } else {
    if (broadcast_1st) {
      ColwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, true>
          <<<CAFFE_GET_BLOCKS(size),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
    } else {
      ColwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
          <<<CAFFE_GET_BLOCKS(size),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
    }
  }
}

template <typename TIn, typename TOut, class BinaryOperator, int D>
void BroadcastBinaryOpImpl(
    const int* A_dims,
    const int* B_dims,
    const int* C_dims,
    const BinaryOperator& op,
    const TIn* A,
    const TIn* B,
    TOut* C,
    CUDAContext* context) {
  SimpleArray<int, D> A_strides_array;
  SimpleArray<int, D> B_strides_array;
  SimpleArray<FixedDivisor<int>, D> C_dims_array;
  int A_stride = 1;
  int B_stride = 1;
  for (int i = D - 1; i >= 0; --i) {
    if (C_dims[i] == 0) {
      return;
    }
    A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride;
    B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride;
    A_stride *= A_dims[i];
    B_stride *= B_dims[i];
    C_dims_array.data[i] = FixedDivisor<int>(C_dims[i]);
  }
  const int size =
      std::accumulate(C_dims, C_dims + D, 1, std::multiplies<int>());
  BroadcastBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, D>
      <<<CAFFE_GET_BLOCKS(size),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          size, A_strides_array, B_strides_array, C_dims_array, op, A, B, C);
}

template <typename TIn, typename TOut, class BinaryOperator>
void BroadcastBinaryOp(
    const int A_ndim,
    const int* A_dims,
    const int B_ndim,
    const int* B_dims,
    const BinaryOperator& op,
    const TIn* A,
    const TIn* B,
    TOut* C,
    CUDAContext* context) {
  const int ndim = std::max(A_ndim, B_ndim);
  std::vector<int> A_dims_array(ndim);
  std::vector<int> B_dims_array(ndim);
  std::vector<int> C_dims_array(ndim);
  utils::ComputeBroadcastBinaryOpDims(
      A_ndim,
      A_dims,
      B_ndim,
      B_dims,
      A_dims_array.data(),
      B_dims_array.data(),
      C_dims_array.data());
  if (A_dims_array == B_dims_array) {
    const int size = std::accumulate(
        C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies<int>());
    SimpleBinaryOpCUDAKernel<TIn, TOut, BinaryOperator>
        <<<CAFFE_GET_BLOCKS(size),
           CAFFE_CUDA_NUM_THREADS,
           0,
           context->cuda_stream()>>>(size, op, A, B, C);
    return;
  }
  int pivot;
  bool broadcast_1st;
  if (utils::IsRowwiseBroadcastBinaryOp(
          ndim,
          A_dims_array.data(),
          B_dims_array.data(),
          &pivot,
          &broadcast_1st)) {
    BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>(
        ndim,
        C_dims_array.data(),
        pivot,
        true,
        broadcast_1st,
        op,
        A,
        B,
        C,
        context);
    return;
  }
  if (utils::IsColwiseBroadcastBinaryOp(
          ndim,
          A_dims_array.data(),
          B_dims_array.data(),
          &pivot,
          &broadcast_1st)) {
    BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>(
        ndim,
        C_dims_array.data(),
        pivot,
        false,
        broadcast_1st,
        op,
        A,
        B,
        C,
        context);
    return;
  }
  DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(
      ndim,
      BroadcastBinaryOpImpl,
      TIn,
      TOut,
      BinaryOperator,
      A_dims_array.data(),
      B_dims_array.data(),
      C_dims_array.data(),
      op,
      A,
      B,
      C,
      context);
}

} // namespace

#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, op)            \
  __global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \
    CUDA_1D_KERNEL_LOOP(i, N) {                                     \
      Y[i] = op(X[i]);                                              \
    }                                                               \
  }                                                                 \
  template <>                                                       \
  void Func<T, CUDAContext>(                                        \
      const int N, const T* x, T* y, CUDAContext* context) {        \
    Func##CUDAKernel<<<                                             \
        CAFFE_GET_BLOCKS(N),                                        \
        CAFFE_CUDA_NUM_THREADS,                                     \
        0,                                                          \
        context->cuda_stream()>>>(N, x, y);                         \
  }

DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, utils::Square<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt, rsqrtf)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt, cbrtf)

DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube, utils::Cube<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube, utils::Cube<double>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int32_t,
    Cube,
    utils::Cube<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int64_t,
    Cube,
    utils::Cube<std::int64_t>)

DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, utils::Not)

DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, utils::Negate<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, utils::Negate<double>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int32_t,
    Neg,
    utils::Negate<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int64_t,
    Neg,
    utils::Negate<std::int64_t>)

DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, utils::Sign<float>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, utils::Sign<double>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int32_t,
    Sign,
    utils::Sign<std::int32_t>)
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
    std::int64_t,
    Sign,
    utils::Sign<std::int64_t>)

#undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION

#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T)                            \
  template <>                                                        \
  void SinCos<T, CUDAContext>(                                       \
      const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \
    SinCosCUDAKernel<<<                                              \
        CAFFE_GET_BLOCKS(N),                                         \
        CAFFE_CUDA_NUM_THREADS,                                      \
        0,                                                           \
        context->cuda_stream()>>>(N, x, ys, yc);                     \
  }
CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
#undef CAFFE2_SPECIALIZED_CUDA_SINCOS

#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
  template <>                                                     \
  void Func<TIn, CUDAContext>(                                    \
      const int N,                                                \
      const TIn* A,                                               \
      const TIn* B,                                               \
      TOut* C,                                                    \
      CUDAContext* context) {                                     \
    SimpleBinaryOpCUDAKernel<TIn, TOut, Op<TIn>>                  \
        <<<CAFFE_GET_BLOCKS(N),                                   \
           CAFFE_CUDA_NUM_THREADS,                                \
           0,                                                     \
           context->cuda_stream()>>>(N, Op<TIn>(), A, B, C);      \
  }

#define DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(Func, Op)                \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, bool, Func, Op)        \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, bool, Func, Op)       \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)

DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LT, thrust::less)
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)

#undef DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION

#define DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Func, Op)                         \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, float, Func, Op)               \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, double, Func, Op)             \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, float16, Func, Op)

DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Add, AddFunctor)
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Div, DivFunctor)

#undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTION

DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)

#define DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(Func, Op)                 \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)                 \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
  DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)

DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)

#undef DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION

DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
    float,
    float,
    ElemwiseMax,
    thrust::maximum);

#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION

#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op)   \
  template <>                                                             \
  void Rowwise##Func<TIn, CUDAContext, true>(                             \
      const int rows,                                                     \
      const int cols,                                                     \
      const TIn* A,                                                       \
      const TIn* B,                                                       \
      TOut* C,                                                            \
      CUDAContext* context) {                                             \
    if (rows == 0 || cols == 0) {                                         \
      return;                                                             \
    }                                                                     \
    const int size = rows * cols;                                         \
    const FixedDivisor<int> cols_div(cols);                               \
    RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true>                    \
        <<<CAFFE_GET_BLOCKS(size),                                        \
           CAFFE_CUDA_NUM_THREADS,                                        \
           0,                                                             \
           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
  }                                                                       \
  template <>                                                             \
  void Rowwise##Func<TIn, CUDAContext, false>(                            \
      const int rows,                                                     \
      const int cols,                                                     \
      const TIn* A,                                                       \
      const TIn* B,                                                       \
      TOut* C,                                                            \
      CUDAContext* context) {                                             \
    if (rows == 0 || cols == 0) {                                         \
      return;                                                             \
    }                                                                     \
    const int size = rows * cols;                                         \
    const FixedDivisor<int> cols_div(cols);                               \
    RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false>                   \
        <<<CAFFE_GET_BLOCKS(size),                                        \
           CAFFE_CUDA_NUM_THREADS,                                        \
           0,                                                             \
           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
  }                                                                       \
  template <>                                                             \
  void Colwise##Func<TIn, CUDAContext, true>(                             \
      const int rows,                                                     \
      const int cols,                                                     \
      const TIn* A,                                                       \
      const TIn* B,                                                       \
      TOut* C,                                                            \
      CUDAContext* context) {                                             \
    if (rows == 0 || cols == 0) {                                         \
      return;                                                             \
    }                                                                     \
    const int size = rows * cols;                                         \
    const FixedDivisor<int> cols_div(cols);                               \
    ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true>                    \
        <<<CAFFE_GET_BLOCKS(size),                                        \
           CAFFE_CUDA_NUM_THREADS,                                        \
           0,                                                             \
           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
  }                                                                       \
  template <>                                                             \
  void Colwise##Func<TIn, CUDAContext, false>(                            \
      const int rows,                                                     \
      const int cols,                                                     \
      const TIn* A,                                                       \
      const TIn* B,                                                       \
      TOut* C,                                                            \
      CUDAContext* context) {                                             \
    if (rows == 0 || cols == 0) {                                         \
      return;                                                             \
    }                                                                     \
    const int size = rows * cols;                                         \
    const FixedDivisor<int> cols_div(cols);                               \
    ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false>                   \
        <<<CAFFE_GET_BLOCKS(size),                                        \
           CAFFE_CUDA_NUM_THREADS,                                        \
           0,                                                             \
           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
  }

#define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op)                \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op)        \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op)       \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)

DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less)
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)

#undef DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION

#define DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op)             \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(                          \
      std::int32_t, std::int32_t, Func, Op)                            \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(                          \
      std::int64_t, std::int64_t, Func, Op)                            \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op)   \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op)

DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor)
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor)

#undef DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION

DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)

#define DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(                      \
      std::int32_t, std::int32_t, Func, Op)                        \
  DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(                      \
      std::int64_t, std::int64_t, Func, Op)

DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)

#undef DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION

#undef DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION

#define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op)  \
  template <>                                                         \
  void Func<TIn, CUDAContext>(                                        \
      const int A_ndim,                                               \
      const int* A_dims,                                              \
      const int B_ndim,                                               \
      const int* B_dims,                                              \
      const TIn* A,                                                   \
      const TIn* B,                                                   \
      TOut* C,                                                        \
      CUDAContext* context) {                                         \
    BroadcastBinaryOp<TIn, TOut, Op<TIn>>(                            \
        A_ndim, A_dims, B_ndim, B_dims, Op<TIn>(), A, B, C, context); \
  }

#define DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op)                \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op)        \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op)       \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)

DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less)
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)

#undef DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION

#define DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op)             \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(                          \
      std::int32_t, std::int32_t, Func, Op)                         \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(                          \
      std::int64_t, std::int64_t, Func, Op)                         \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op)   \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op)

DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor)
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor)

#undef DEFINE_BROADCAST_CUDA_BINARY_FUNCTION

DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)

#define DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(                      \
      std::int32_t, std::int32_t, Func, Op)                     \
  DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)

DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)

#undef DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION

#undef DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION

#define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func)                  \
  template <>                                                           \
  void Funcname<T, CUDAContext>(                                        \
      const int N,                                                      \
      const T* src,                                                     \
      T* dst,                                                           \
      Tensor<CUDAContext>* scratch_ptr,                                 \
      CUDAContext* context) {                                           \
    size_t memRequired = 0;                                             \
    cub::DeviceReduce::func(                                            \
        nullptr, memRequired, src, dst, N, context->cuda_stream());     \
    auto buffer_size =                                                  \
        static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T)); \
    scratch_ptr->Resize(std::vector<TIndex>{buffer_size});              \
    cub::DeviceReduce::func(                                            \
        static_cast<void*>(scratch_ptr->mutable_data<T>()),             \
        memRequired,                                                    \
        src,                                                            \
        dst,                                                            \
        N,                                                              \
        context->cuda_stream());                                        \
  }

DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min)
DELEGATE_REDUCTION_FUNCTION(float, ReduceMax, Max)
DELEGATE_REDUCTION_FUNCTION(int32_t, ReduceMax, Max)
DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max)

#undef DELEGATE_REDUCTION_FUNCTION

// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
template <>
void Gemm<float, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const float* B,
    const float beta,
    float* C,
    CUDAContext* context,
    TensorProto::DataType math_type) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_ENFORCE(cublasSgemm(
      context->cublas_handle(),
      cuTransB,
      cuTransA,
      N,
      M,
      K,
      &alpha,
      B,
      ldb,
      A,
      lda,
      &beta,
      C,
      N));
}

template <>
void Gemm<float16, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float16* A,
    const float16* B,
    const float beta,
    float16* C,
    CUDAContext* context,
    TensorProto::DataType math_type) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  if (math_type == TensorProto_DataType_FLOAT) {
    CUBLAS_CHECK(cublasSgemmEx(
        context->cublas_handle(),
        cuTransB,
        cuTransA,
        N,
        M,
        K,
        &alpha,
        B,
        CUDA_R_16F,
        ldb,
        A,
        CUDA_R_16F,
        lda,
        &beta,
        C,
        CUDA_R_16F,
        N));

  } else if (math_type == TensorProto_DataType_FLOAT16) {
    // convert alpha, beta from float -> __half
    auto alpha_fp16 = convert::floatToHalf(alpha);
    auto beta_fp16 = convert::floatToHalf(beta);

    // call cublasHgemm
    CUBLAS_CHECK(cublasHgemm(
        context->cublas_handle(),
        cuTransB,
        cuTransA,
        N,
        M,
        K,
        &alpha_fp16,
        (const __half*)B,
        ldb,
        (const __half*)A,
        lda,
        &beta_fp16,
        (__half*)C,
        N));
  } else {
    // fail
    CAFFE_THROW("Unsupported math type");
  }
}

template <>
void BiasCHW<float, CUDAContext>(
    const float* bias,
    const float* bias_multiplier,
    const int bias_channels,
    const int image_size,
    float* image,
    CUDAContext* context) {
  Gemm<float, CUDAContext>(
      CblasNoTrans,
      CblasNoTrans,
      bias_channels,
      image_size,
      1,
      1,
      bias,
      bias_multiplier,
      1,
      image,
      context);
}

template <>
void GemmBatched<float, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const float* B,
    const float beta,
    float* C,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch,
    TensorProto::DataType math_type) {
  const int a_stride = M * K;
  const int b_stride = K * N;
  const int c_stride = M * N;
#if __CUDACC_VER_MAJOR__ < 8
  // loop over matrices in the batch
  for (int i = 0; i < batch_size; ++i) {
    math::Gemm<float, CUDAContext>(
        TransA,
        TransB,
        M,
        N,
        K,
        alpha,
        A + a_stride * i,
        B + b_stride * i,
        beta,
        C + c_stride * i,
        context);
  }
#else
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  const int lda = (TransA == CblasNoTrans) ? K : M;
  const int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_ENFORCE(cublasSgemmStridedBatched(
      context->cublas_handle(),
      cuTransB,
      cuTransA,
      N,
      M,
      K,
      &alpha,
      B,
      ldb,
      b_stride,
      A,
      lda,
      a_stride,
      &beta,
      C,
      N,
      c_stride,
      batch_size));
#endif
}

namespace {

__global__ void FloatToHalfKernel(const int N, const float* X, half* Y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    Y[i] = __float2half(X[i]);
  }
}

__global__ void HalfToFloatKernel(const int N, const half* X, float* Y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    Y[i] = __half2float(X[i]);
  }
}

}; // namespace

template <>
void GemmBatched<float16, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float16* A,
    const float16* B,
    const float beta,
    float16* C,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch,
    TensorProto::DataType math_type) {
  const int a_stride = M * K;
  const int b_stride = K * N;
  const int c_stride = M * N;
#if __CUDACC_VER_MAJOR__ < 8
  // loop over matrices in the batch
  for (int i = 0; i < batch_size; ++i) {
    math::Gemm<float16, CUDAContext>(
        TransA,
        TransB,
        M,
        N,
        K,
        alpha,
        A + a_stride * i,
        B + b_stride * i,
        beta,
        C + c_stride * i,
        context);
  }
#else
  // 3 options:
  // 1) scratch != null = cast to fp32, SgemmStridedBatched, cast result to fp16
  // 2) math_type == FLOAT, scratch == nullptr = looped SgemmEx
  // 3) math_type == FLOAT16, scratch == nullptr = batched Hgemm

  if (scratch != nullptr) {
    const int A_size = a_stride * batch_size;
    const int B_size = b_stride * batch_size;
    // cast, cublasSgemmStridedBatched, cast
    size_t in_elems = A_size + B_size;
    size_t out_elems = c_stride * batch_size;

    scratch->Resize(in_elems + out_elems);
    float* scratch_ptr = scratch->mutable_data<float>();

    float* A_fp32 = scratch_ptr;
    float* B_fp32 = scratch_ptr + A_size;
    float* C_fp32 = scratch_ptr + A_size + B_size;

    // cast A, B into fp32
    HalfToFloatKernel<<<
        CAFFE_GET_BLOCKS(A_size),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context->cuda_stream()>>>(A_size, (half*)A, A_fp32);
    HalfToFloatKernel<<<
        CAFFE_GET_BLOCKS(B_size),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context->cuda_stream()>>>(B_size, (half*)B, B_fp32);

    // run fp32 batched Gemm
    GemmBatched<float, CUDAContext>(
        TransA,
        TransB,
        batch_size,
        M,
        N,
        K,
        alpha,
        A_fp32,
        B_fp32,
        beta,
        C_fp32,
        context);

    // cast result back to fp16
    FloatToHalfKernel<<<
        CAFFE_GET_BLOCKS(batch_size * M * N),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context->cuda_stream()>>>(batch_size * M * N, C_fp32, (half*)C);
  } else {
    if (math_type == TensorProto_DataType_FLOAT) {
      // loop over matrices in the batch
      for (int i = 0; i < batch_size; ++i) {
        math::Gemm<float16, CUDAContext>(
            TransA,
            TransB,
            M,
            N,
            K,
            alpha,
            A + a_stride * i,
            B + b_stride * i,
            beta,
            C + c_stride * i,
            context);
      }
    } else if (math_type == TensorProto_DataType_FLOAT16) {
      // Note that cublas follows fortran order, so the order is different from
      // the cblas convention.
      const int lda = (TransA == CblasNoTrans) ? K : M;
      const int ldb = (TransB == CblasNoTrans) ? N : K;
      cublasOperation_t cuTransA =
          (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
      cublasOperation_t cuTransB =
          (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

      // convert alpha, beta from float -> __half
      auto alpha_fp16 = convert::floatToHalf(alpha);
      auto beta_fp16 = convert::floatToHalf(beta);
      CUBLAS_ENFORCE(cublasHgemmStridedBatched(
          context->cublas_handle(),
          cuTransB,
          cuTransA,
          N,
          M,
          K,
          &alpha_fp16,
          (const __half*)B,
          ldb,
          b_stride,
          (const __half*)A,
          lda,
          a_stride,
          &beta_fp16,
          (__half*)C,
          N,
          c_stride,
          batch_size));
    }
  }
#endif
}

#if CUDA_VERSION >= 9000

// No change, but required. Defer to default CUDA engine
template <>
void Gemm<float, CUDAContext, TensorCoreEngine>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const float* B,
    const float beta,
    float* C,
    CUDAContext* context,
    TensorProto::DataType math_type) {
  return Gemm<float, CUDAContext>(
      TransA, TransB, M, N, K, alpha, A, B, beta, C, context, math_type);
}

template <>
void Gemm<float16, CUDAContext, TensorCoreEngine>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float16* A,
    const float16* B,
    const float beta,
    float16* C,
    CUDAContext* context,
    TensorProto::DataType math_type) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

  // enable TensorCore for this call on this handle
  if (TensorCoreAvailable()) {
    CUBLAS_ENFORCE(
        cublasSetMathMode(context->cublas_handle(), CUBLAS_TENSOR_OP_MATH));
  }

  CUBLAS_CHECK(cublasGemmEx(
      context->cublas_handle(),
      cuTransB,
      cuTransA,
      N,
      M,
      K,
      &alpha,
      B,
      CUDA_R_16F,
      ldb,
      A,
      CUDA_R_16F,
      lda,
      &beta,
      C,
      CUDA_R_16F,
      N,
      CUDA_R_32F,
      CUBLAS_GEMM_DFALT_TENSOR_OP));

  // Now disable TensorCore math for subsequent calls to this handle
  if (TensorCoreAvailable()) {
    CUBLAS_ENFORCE(
        cublasSetMathMode(context->cublas_handle(), CUBLAS_DEFAULT_MATH));
  }
}

template <>
void GemmBatched<float, CUDAContext, TensorCoreEngine>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const float* B,
    const float beta,
    float* C,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch,
    TensorProto::DataType math_type) {
  return GemmBatched<float, CUDAContext, DefaultEngine>(
      TransA,
      TransB,
      batch_size,
      M,
      N,
      K,
      alpha,
      A,
      B,
      beta,
      C,
      context,
      scratch,
      math_type);
}

template <>
void GemmBatched<float16, CUDAContext, TensorCoreEngine>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int batch_size,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float16* A,
    const float16* B,
    const float beta,
    float16* C,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch,
    TensorProto::DataType math_type) {
  return GemmBatched<float16, CUDAContext, DefaultEngine>(
      TransA,
      TransB,
      batch_size,
      M,
      N,
      K,
      alpha,
      A,
      B,
      beta,
      C,
      context,
      scratch,
      math_type);
}

#endif // CUDA_VERSION >= 9000

template <>
void GemmEx<float, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const int lda,
    const float* B,
    const int ldb,
    const float beta,
    float* C,
    const int ldc,
    CUDAContext* context) {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBLAS_ENFORCE(cublasSgemm(
      context->cublas_handle(),
      cuTransB,
      cuTransA,
      N,
      M,
      K,
      &alpha,
      B,
      ldb,
      A,
      lda,
      &beta,
      C,
      ldc));
}

template <>
void Gemv<float, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const int M,
    const int N,
    const float alpha,
    const float* A,
    const float* x,
    const float beta,
    float* y,
    CUDAContext* context,
    TensorProto::DataType math_type) {
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  CUBLAS_ENFORCE(cublasSgemv(
      context->cublas_handle(),
      cuTransA,
      N,
      M,
      &alpha,
      A,
      N,
      x,
      1,
      &beta,
      y,
      1));
}

// Batched Add variants
namespace {

template <typename T>
__global__ void AddStripedBatchKernel(
    const int N,
    const T* first,
    T* Y,
    const int stripe,
    const int batch) {
  for (int j = 0; j < batch; j++) {
    const T* x = first + j * stripe;
    CUDA_1D_KERNEL_LOOP(i, N) {
      float tmpY = convert::To<T, float>(Y[i]);
      tmpY += convert::To<T, float>(x[i]);
      Y[i] = convert::To<float, T>(tmpY);
    }
  }
}
} // namespace

#define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T)              \
  template <>                                                     \
  void AddStripedBatch<T, CUDAContext>(                           \
      const int N,                                                \
      const T* first,                                             \
      T* Y,                                                       \
      const int stripe,                                           \
      const int batch,                                            \
      CUDAContext* context) {                                     \
    AddStripedBatchKernel<T>                                      \
        <<<CAFFE_GET_BLOCKS(N),                                   \
           CAFFE_CUDA_NUM_THREADS,                                \
           0,                                                     \
           context->cuda_stream()>>>(N, first, Y, stripe, batch); \
  }

CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float);
CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16);
#undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH

template <>
void Gemv<float16, CUDAContext>(
    const CBLAS_TRANSPOSE TransA,
    const int M,
    const int N,
    const float alpha,
    const float16* A,
    const float16* x,
    const float beta,
    float16* y,
    CUDAContext* context,
    TensorProto::DataType math_type) {
  cublasOperation_t cuTransA =
      (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;

  // sort out what we need to call cublasSgemmEx / cublasHgemm
  int m = (cuTransA == CUBLAS_OP_N) ? N : M;
  int k = (cuTransA == CUBLAS_OP_N) ? M : N;
  int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
  int LDC = m;

  if (math_type == TensorProto_DataType_FLOAT) {
    CUBLAS_CHECK(cublasSgemmEx(
        context->cublas_handle(),
        cuTransA,
        CUBLAS_OP_N,
        m,
        1,
        k,
        &alpha,
        A,
        CUDA_R_16F,
        LDA,
        x,
        CUDA_R_16F,
        k,
        &beta,
        y,
        CUDA_R_16F,
        LDC));
  } else if (math_type == TensorProto_DataType_FLOAT16) {
    auto alpha_fp16 = convert::floatToHalf(alpha);
    auto beta_fp16 = convert::floatToHalf(beta);

    CUBLAS_CHECK(cublasHgemm(
        context->cublas_handle(),
        cuTransA,
        CUBLAS_OP_N,
        m,
        1,
        k,
        &alpha_fp16,
        (const __half*)A,
        LDA,
        (const __half*)x,
        k,
        &beta_fp16,
        (__half*)y,
        LDC));
  } else {
    // fail
    CAFFE_THROW("Unsupported math type");
  }
}

namespace {
template <typename T>
__global__ void SetKernel(const int N, const T alpha, T* Y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    Y[i] = alpha;
  }
}
} // namespace

#define CAFFE2_SPECIALIZED_CUDA_SET(T)                             \
  template <>                                                      \
  void Set<T, CUDAContext>(                                        \
      const size_t N, const T alpha, T* Y, CUDAContext* context) { \
    SetKernel<<<                                                   \
        CAFFE_GET_BLOCKS(N),                                       \
        CAFFE_CUDA_NUM_THREADS,                                    \
        0,                                                         \
        context->cuda_stream()>>>(N, alpha, Y);                    \
  }

CAFFE2_SPECIALIZED_CUDA_SET(float);
CAFFE2_SPECIALIZED_CUDA_SET(double);
CAFFE2_SPECIALIZED_CUDA_SET(bool);
CAFFE2_SPECIALIZED_CUDA_SET(int8_t);
CAFFE2_SPECIALIZED_CUDA_SET(int16_t);
CAFFE2_SPECIALIZED_CUDA_SET(float16);
CAFFE2_SPECIALIZED_CUDA_SET(int);
CAFFE2_SPECIALIZED_CUDA_SET(int64_t);
CAFFE2_SPECIALIZED_CUDA_SET(char);
CAFFE2_SPECIALIZED_CUDA_SET(uint8_t);
CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
#undef CAFFE2_SPECIALIZED_CUDA_SET

namespace {
template <typename T>
__global__ void
UniformShift(const size_t N, const float min, const float max, T* x) {
  float scale = max - min;
  CUDA_1D_KERNEL_LOOP(i, N) {
    x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min);
  }
}

__global__ void
UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) {
  int* x_int = reinterpret_cast<int*>(x);
  int range = (max - min + 1);
  CUDA_1D_KERNEL_LOOP(i, N) {
    x_int[i] = min + static_cast<int>(x[i] % range);
  }
}
} // namespace

template <>
void RandUniform<float, CUDAContext>(
    const size_t n,
    const float min,
    const float max,
    float* r,
    CUDAContext* context) {
  CURAND_ENFORCE(curandGenerateUniform(context->curand_generator(), r, n));
  UniformShift<float>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, min, max, r);
}

template <>
void RandUniform<double, CUDAContext>(
    const size_t n,
    const double min,
    const double max,
    double* r,
    CUDAContext* context) {
  CURAND_ENFORCE(
      curandGenerateUniformDouble(context->curand_generator(), r, n));
  UniformShift<double>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, min, max, r);
}

template <>
void RandUniform<int, CUDAContext>(
    const size_t n,
    const int min,
    const int max,
    int* r,
    CUDAContext* context) {
  CURAND_ENFORCE(curandGenerate(
      context->curand_generator(), reinterpret_cast<unsigned int*>(r), n));
  UniformIntFit<<<
      CAFFE_GET_BLOCKS(n),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      n, min, max, reinterpret_cast<unsigned int*>(r));
}

template <typename T>
size_t HandleOddLengthRandGaussian(
    const size_t n,
    const T mean,
    const T std,
    T* r,
    CUDAContext* context) {
  if (n % 2 == 1) {
    std::default_random_engine generator;
    std::normal_distribution<T> distribution(mean, std);
    const T random_value = distribution(generator);
    math::Set<T, CUDAContext>(1, random_value, r + (n - 1), context);
    return n - 1;
  }
  return n;
}

template <>
void RandGaussian<float, CUDAContext>(
    const size_t n,
    const float mean,
    const float std,
    float* r,
    CUDAContext* context) {
  // If n is odd, we add a random Gaussian value at the end manually
  // and generate n-1 random values using curandGenerateNormal.
  // curandGenerateNormal requires n to be even.
  const size_t even_n =
      HandleOddLengthRandGaussian<float>(n, mean, std, r, context);
  CURAND_ENFORCE(
      curandGenerateNormal(context->curand_generator(), r, even_n, mean, std));
}

template <>
void RandGaussian<double, CUDAContext>(
    const size_t n,
    const double mean,
    const double std,
    double* r,
    CUDAContext* context) {
  const size_t even_n =
      HandleOddLengthRandGaussian<double>(n, mean, std, r, context);
  CURAND_ENFORCE(curandGenerateNormalDouble(
      context->curand_generator(), r, even_n, mean, std));
}

template <>
void Dot<float, CUDAContext>(
    const int n,
    const float* a,
    const float* b,
    float* y,
    CUDAContext* context) {
  float result;
  CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result));
  context->Copy<float, CPUContext, CUDAContext>(1, &result, y);
}

template <>
void Dot<float16, CUDAContext>(
    const int n,
    const float16* a,
    const float16* b,
    float16* y,
    CUDAContext* context) {
  float16 result;
  // execute with 32-bit math
  CUBLAS_CHECK(cublasDotEx(
      context->cublas_handle(),
      n,
      a,
      CUDA_R_16F,
      1,
      b,
      CUDA_R_16F,
      1,
      &result,
      CUDA_R_16F,
      CUDA_R_32F));
  context->Copy<float16, CPUContext, CUDAContext>(1, &result, y);
}

// A previous version of caffe2 used Thrust but it turns out that thrust
// reduction has an implicit scratch space allocation and deallocation, which
// may interfere with NCCL and create a deadlock. Hence we are using a custom
// reduction here.
#define SUM_KERNEL_NTHREADS 128
template <typename T>
__global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
  const int idx = threadIdx.x;
  __shared__ float reduction_buffer[SUM_KERNEL_NTHREADS];

  reduction_buffer[idx] = 0;

  // A multilevel reduction.
  // N -> 128
  if (!square) {
    for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
      reduction_buffer[idx] += convert::To<T, float>(X[i]);
    }
  } else {
    for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
      float Xi = convert::To<T, float>(X[i]);
      reduction_buffer[idx] += Xi * Xi;
    }
  }
  __syncthreads();
  // 128 -> 32
  if (idx < 32) {
    reduction_buffer[idx] += reduction_buffer[idx + 32] +
        reduction_buffer[idx + 64] + reduction_buffer[idx + 96];
  }
  __syncthreads();
  // 32 -> 1
  if (idx == 0) {
    float tmp = 0;
    for (int i = 0; i < 32; ++i) {
      tmp += reduction_buffer[i];
    }
    *Y = convert::To<float, T>(tmp);
  }
}

// According to the benchmarks script
// caffe2/caffe2/experiments/python/device_reduce_sum_bench.py,
// device reduce is slower for N <= 10000.
#define DEVICE_REDUCE_SIZE_THRESHOLD 10000

namespace {

template <typename T>
__global__ void SumConvertKernel(float* sum, T* dest) {
  *dest = convert::To<float, T>(*sum);
}

template <typename T, typename IterT>
void SumGenericIter(
    const int N,
    IterT it,
    T*& dest,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch_ptr) {
  size_t memRequired = 0;
  cub::DeviceReduce::Sum(
      nullptr, memRequired, it, dest, N, context->cuda_stream());
  auto buffer_size =
      static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T));
  if (!dest) {
    // allocate one more T at the end of scratch for dest
    scratch_ptr->Resize(std::vector<TIndex>{buffer_size + 1});
    dest = scratch_ptr->template mutable_data<T>() + buffer_size;
  } else {
    scratch_ptr->Resize(std::vector<TIndex>{buffer_size});
  }
  cub::DeviceReduce::Sum(
      static_cast<void*>(scratch_ptr->template mutable_data<T>()),
      memRequired,
      it,
      dest,
      N,
      context->cuda_stream());
}
} // namespace

template <>
void Sum<float, CUDAContext>(
    const int N,
    const float* x,
    float* y,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch_ptr) {
  if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
    SumGenericIter<float>(N, x, y, context, scratch_ptr);
  } else {
    SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(
        N, x, y, false);
  }
}

template <>
void Sum<int32_t, CUDAContext>(
    const int N,
    const int32_t* x,
    int32_t* y,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch_ptr) {
  if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
    SumGenericIter<int32_t>(N, x, y, context, scratch_ptr);
  } else {
    SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(
        N, x, y, false);
  }
}

namespace {
template <typename T>
struct FloatTransform {
  inline __host__ __device__ float operator()(const T v) const {
    return convert::To<T, float>(v);
  }
};
} // namespace

#define CAFFE2_MATH_SUM_FUNC(T)                                           \
  template <>                                                             \
  void Sum<T, CUDAContext>(                                               \
      const int N,                                                        \
      const T* x,                                                         \
      T* y,                                                               \
      CUDAContext* context,                                               \
      Tensor<CUDAContext>* scratch_ptr) {                                 \
    if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {                \
      FloatTransform<T> transform;                                        \
      cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \
          x, transform);                                                  \
      float* sum = nullptr;                                               \
      SumGenericIter<float>(N, it, sum, context, scratch_ptr);            \
      SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y);      \
    } else {                                                              \
      SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(   \
          N, x, y, false);                                                \
    }                                                                     \
  }

CAFFE2_MATH_SUM_FUNC(float16)
#undef CAFFE2_MATH_SUM_FUNC

namespace {
template <typename T>
struct SqrTransform {
  inline __host__ __device__ T operator()(const T v) const {
    return v * v;
  }
};
} //  namespace

template <>
void SumSqr<float, CUDAContext>(
    const int N,
    const float* x,
    float* y,
    CUDAContext* context,
    Tensor<CUDAContext>* scratch_ptr) {
  if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
    SqrTransform<float> transform;
    cub::TransformInputIterator<float, SqrTransform<float>, const float*> it(
        x, transform);
    SumGenericIter<float>(N, it, y, context, scratch_ptr);
  } else {
    SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(
        N, x, y, true);
  }
}

#define CAFFE2_MATH_SUMSQR_FUNC(T)                                      \
  template <>                                                           \
  void SumSqr<T, CUDAContext>(                                          \
      const int N,                                                      \
      const T* x,                                                       \
      T* y,                                                             \
      CUDAContext* context,                                             \
      Tensor<CUDAContext>* scratch_ptr) {                               \
    if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {              \
      FloatTransform<T> float_transform;                                \
      cub::TransformInputIterator<float, FloatTransform<T>, const T*>   \
          float_it(x, float_transform);                                 \
      SqrTransform<float> sqr_transform;                                \
      cub::TransformInputIterator<                                      \
          float,                                                        \
          SqrTransform<float>,                                          \
          decltype(float_it)>                                           \
          it(float_it, sqr_transform);                                  \
      float* sum = nullptr;                                             \
      SumGenericIter<float>(N, it, sum, context, scratch_ptr);          \
      SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y);    \
    } else {                                                            \
      SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \
          N, x, y, true);                                               \
    }                                                                   \
  }

CAFFE2_MATH_SUMSQR_FUNC(float16)
#undef CAFFE2_MATH_SUMSQR_FUNC
#undef DEVICE_REDUCE_SIZE_THRESHOLD

namespace {
template <typename T>
__global__ void
SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    y[i] = x[i * D + idx[i]];
  }
}
} // namespace

template <>
void Select<float, CUDAContext>(
    const int N,
    const int D,
    const float* x,
    const int* idx,
    float* y,
    CUDAContext* context) {
  SelectKernel<float>
      <<<CAFFE_GET_BLOCKS(N),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(N, D, x, idx, y);
}

template <>
void Select<float16, CUDAContext>(
    const int N,
    const int D,
    const float16* x,
    const int* idx,
    float16* y,
    CUDAContext* context) {
  SelectKernel<float16>
      <<<CAFFE_GET_BLOCKS(N),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(N, D, x, idx, y);
}

namespace {
template <typename T>
__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    // y[i] = convert::To<float,T>(convert::To<T, float>(x[i]) * alpha);
    y[i] = convert::Get<T>(convert::Get<float>(x[i]) * alpha);
  }
}

template <typename T>
__global__ void
ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    y[i] = x[i] * (*alpha);
  }
}

template <typename T>
__global__ void PowKernel(const int n, const T* x, const T exponent, T* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    y[i] = powf(x[i], exponent);
  }
}

// fp16 specialization
template <>
__global__ void ScaleKernelDeviceAlpha(
    const int n,
    const float* alpha,
    const float16* x,
    float16* y) {
  CUDA_1D_KERNEL_LOOP(i, n) {
    y[i] = convert::To<float, float16>(
        convert::To<float16, float>(x[i]) * (*alpha));
  }
}

} // namespace

template <>
void Powx<float, CUDAContext>(
    const int N,
    const float* a,
    const float b,
    float* y,
    CUDAContext* context) {
  PowKernel<<<
      CAFFE_GET_BLOCKS(N),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(N, a, b, y);
}

template <>
void Scale<float, CUDAContext>(
    const int n,
    const float alpha,
    const float* x,
    float* y,
    CUDAContext* context) {
  ScaleKernel<float>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, alpha, x, y);
}

template <>
void Scale<float16, CUDAContext>(
    const int n,
    const float alpha,
    const float16* x,
    float16* y,
    CUDAContext* context) {
  ScaleKernel<float16>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, alpha, x, y);
}

template <>
void Scale<float, CUDAContext>(
    const int n,
    const float* alpha,
    const float* x,
    float* y,
    CUDAContext* context) {
  ScaleKernelDeviceAlpha<float>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, alpha, x, y);
}

template <>
void Scale<float16, CUDAContext>(
    const int n,
    const float* alpha,
    const float16* x,
    float16* y,
    CUDAContext* context) {
  ScaleKernelDeviceAlpha<float16>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, alpha, x, y);
}

template <>
void Axpy<float, CUDAContext>(
    const int N,
    const float alpha,
    const float* X,
    float* Y,
    CUDAContext* context) {
  CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1));
}

template <>
void Axpy<double, CUDAContext>(
    const int N,
    const float alpha,
    const double* X,
    double* Y,
    CUDAContext* context) {
  double alpha_d{alpha};
  CUBLAS_ENFORCE(
      cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1));
}

template <>
void Axpy<float16, CUDAContext>(
    const int N,
    const float alpha,
    const float16* X,
    float16* Y,
    CUDAContext* context) {
  CUBLAS_CHECK(cublasAxpyEx(
      context->cublas_handle(),
      N,
      &alpha,
      CUDA_R_16F,
      X,
      CUDA_R_16F,
      1,
      Y,
      CUDA_R_16F,
      1,
      CUDA_R_32F));
}

namespace {
template <typename T>
__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    y[index] = convert::Get<T>(
        convert::Get<float>(x[index]) * (*a) + convert::Get<float>(y[index]));
  }
}
} // namespace

template <>
void Axpy<float, CUDAContext>(
    const int n,
    const float* alpha,
    const float* X,
    float* Y,
    CUDAContext* context) {
  AxpyKernel<float>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, alpha, X, Y);
}

template <>
void Axpy<float16, CUDAContext>(
    const int n,
    const float* alpha,
    const float16* X,
    float16* Y,
    CUDAContext* context) {
  AxpyKernel<float16>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, alpha, X, Y);
}

namespace {
template <typename T>
__global__ void
AxpbyKernel(const int n, const T a, const T* x, const T b, T* y) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    y[index] = x[index] * a + y[index] * b;
  }
}
} // namespace

template <>
void Axpby<float, CUDAContext>(
    const int n,
    const float a,
    const float* x,
    const float b,
    float* y,
    CUDAContext* context) {
  AxpbyKernel<float>
      <<<CAFFE_GET_BLOCKS(n),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(n, a, x, b, y);
}

namespace {

template <typename T>
__global__ void Im2ColNCHWCUDAKernel(
    const int n,
    const int input_h,
    const int input_w,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int stride_h,
    const int stride_w,
    const int output_h,
    const int output_w,
    const T* img_data,
    T* col_data) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int w_out = index % output_w;
    const int h_index = index / output_w;
    const int h_out = h_index % output_h;
    const int channel_in = h_index / output_h;
    const int channel_out = channel_in * kernel_h * kernel_w;
    const int h_in = h_out * stride_h - pad_t;
    const int w_in = w_out * stride_w - pad_l;
    const int output_size = output_h * output_w;
    T* col_data_ptr =
        col_data + (channel_out * output_h + h_out) * output_w + w_out;
    const T* img_data_ptr =
        img_data + (channel_in * input_h + h_in) * input_w + w_in;
    int dh = 0;
    for (int i = 0; i < kernel_h; ++i) {
      int dw = 0;
      for (int j = 0; j < kernel_w; ++j) {
        const int h = h_in + dh;
        const int w = w_in + dw;
#if __CUDA_ARCH__ >= 350
        *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
            ? __ldg(img_data_ptr + dh * input_w + dw)
            : 0;
#else
        *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
            ? img_data_ptr[dh * input_w + dw]
            : 0;
#endif
        col_data_ptr += output_size;
        dw += dilation_w;
      }
      dh += dilation_h;
    }
  }
}

template <typename T>
__global__ void Im2ColNHWCCUDAKernel(
    const int n,
    const int input_h,
    const int input_w,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int stride_h,
    const int stride_w,
    const int output_w,
    const int channels,
    const T* img_data,
    T* col_data) {
  CUDA_1D_KERNEL_LOOP(index, n) {
    const int channel_in = index % channels;
    const int w_out = index / channels % output_w;
    const int h_out = index / channels / output_w;
    const int h_in = h_out * stride_h - pad_t;
    const int w_in = w_out * stride_w - pad_l;
    T* col_data_ptr = col_data +
        (h_out * output_w + w_out) * channels * kernel_h * kernel_w +
        channel_in;
    int dh = 0;
    for (int i = 0; i < kernel_h; ++i) {
      int dw = 0;
      for (int j = 0; j < kernel_w; ++j) {
        const int h = h_in + dh;
        const int w = w_in + dw;
#if __CUDA_ARCH__ >= 350
        *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
            ? __ldg(img_data + (h * input_w + w) * channels + channel_in)
            : 0;
#else
        *col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
            ? img_data[(h * input_w + w) * channels + channel_in]
            : 0;
#endif
        col_data_ptr += channels;
        dw += dilation_w;
      }
      dh += dilation_h;
    }
  }
}

template <typename T>
__global__ void Col2ImNCHWCUDAKernel(
    const int n,
    const int input_h,
    const int input_w,
    const int patch_h,
    const int patch_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int stride_h,
    const int stride_w,
    const int output_h,
    const int output_w,
    const T* col_data,
    T* img_data) {
  const int dpatch_h = dilation_h * (patch_h - 1) + 1;
  const int dpatch_w = dilation_w * (patch_w - 1) + 1;

  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = 0;
    const int w = index % input_w + pad_l;
    const int h = index / input_w % input_h + pad_t;
    const int c = index / (input_h * input_w);

    // compute the start and end of the output
    const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1;
    const int w_col_end = min(w / stride_w + 1, output_w);
    const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1;
    const int h_col_end = min(h / stride_h + 1, output_h);

    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        int h_k = (h - h_col * stride_h);
        int w_k = (w - w_col * stride_w);
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          const int col_data_index =
              (((c * patch_h + h_k) * patch_w + w_k) * output_h + h_col) *
                  output_w +
              w_col;
#if __CUDA_ARCH__ >= 350
          val += __ldg(col_data + col_data_index);
#else
          val += col_data[col_data_index];
#endif
        }
      }
    }
    img_data[index] = val;
  }
}

template <typename T>
__global__ void Col2ImNHWCCUDAKernel(
    const int n,
    const int input_w,
    const int channels,
    const int patch_h,
    const int patch_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int stride_h,
    const int stride_w,
    const int output_h,
    const int output_w,
    const T* col_data,
    T* img_data) {
  const int dpatch_h = dilation_h * (patch_h - 1) + 1;
  const int dpatch_w = dilation_w * (patch_w - 1) + 1;

  CUDA_1D_KERNEL_LOOP(index, n) {
    T val = 0;
    const int c = index % channels;
    const int w = index / channels % input_w + pad_l;
    const int h = index / channels / input_w + pad_t;
    // compute the start and end of the output
    const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1;
    const int w_col_end = min(w / stride_w + 1, output_w);
    const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1;
    const int h_col_end = min(h / stride_h + 1, output_h);
    const int channels_col = patch_h * patch_w * channels;

    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
        int h_k = h - h_col * stride_h;
        int w_k = w - w_col * stride_w;
        if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
          h_k /= dilation_h;
          w_k /= dilation_w;
          const int c_col = (h_k * patch_w + w_k) * channels + c;
#if __CUDA_ARCH__ >= 350
          val += __ldg(
              col_data + (h_col * output_w + w_col) * channels_col + c_col);
#else
          val += col_data[(h_col * output_w + w_col) * channels_col + c_col];
#endif
        }
      }
    }
    img_data[index] = val;
  }
}

template <typename T, int N, bool kCol2Im>
__global__ void Im2ColNdNCHWCUDAKernel(
    const int outer_size,
    const int inner_size,
    const int kernel_size,
    SimpleArray<int, N + 1> img_shape,
    SimpleArray<int, N + 1> col_shape,
    SimpleArray<int, N> kernel_shape,
    SimpleArray<int, N> stride,
    SimpleArray<int, N> dilation,
    SimpleArray<int, N> pad,
    const T* X_data,
    T* Y_data) {
  int d_offset[N];
  int d_iter[N];
  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
    int offset_i = i;
#pragma unroll
    for (int d_i = N - 1; d_i >= 0; --d_i) {
      d_offset[d_i] = offset_i % kernel_shape.data[d_i];
      offset_i /= kernel_shape.data[d_i];
    }
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
      int offset_j = j;
#pragma unroll
      for (int d_i = N - 1; d_i >= 0; --d_i) {
        d_iter[d_i] = offset_j % col_shape.data[d_i + 1];
        offset_j /= col_shape.data[d_i + 1];
      }
      const int col_index = i * inner_size + j;
      int img_index = i / kernel_size;
      bool is_padding = false;
#pragma unroll
      for (int d_i = 0; d_i < N; ++d_i) {
        const int d_img = d_iter[d_i] * stride.data[d_i] - pad.data[d_i] +
            d_offset[d_i] * dilation.data[d_i];
        is_padding |= d_img < 0 || d_img >= img_shape.data[d_i + 1];
        img_index = img_index * img_shape.data[d_i + 1] + d_img;
      }
#if __CUDA_ARCH__ >= 350
      if (!kCol2Im) {
        Y_data[col_index] = is_padding ? 0 : __ldg(X_data + img_index);
      } else if (!is_padding) {
        atomicAdd(Y_data + img_index, __ldg(X_data + col_index));
      }
#else
      if (!kCol2Im) {
        Y_data[col_index] = is_padding ? 0 : X_data[img_index];
      } else if (!is_padding) {
        atomicAdd(Y_data + img_index, X_data[col_index]);
      }
#endif
    }
  }
}

template <typename T, int N>
void Im2ColNdNCHWCUDAImpl(
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* img_data,
    float* col_data,
    CUDAContext* context) {
  const int outer_size = col_shape[0];
  const int inner_size = col_size / outer_size;
  const int kernel_size = std::accumulate(
      kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
  SimpleArray<int, N + 1> img_shape_array;
  SimpleArray<int, N + 1> col_shape_array;
  SimpleArray<int, N> kernel_shape_array;
  SimpleArray<int, N> stride_array;
  SimpleArray<int, N> dilation_array;
  SimpleArray<int, N> pad_array;
  std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int));
  std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int));
  std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int));
  std::memcpy(stride_array.data, stride, N * sizeof(int));
  std::memcpy(dilation_array.data, dilation, N * sizeof(int));
  std::memcpy(pad_array.data, pad, N * sizeof(int));
  Im2ColNdNCHWCUDAKernel<T, N, false>
      <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          outer_size,
          inner_size,
          kernel_size,
          img_shape_array,
          col_shape_array,
          kernel_shape_array,
          stride_array,
          dilation_array,
          pad_array,
          img_data,
          col_data);
}

template <typename T, int N>
void Col2ImNdNCHWCUDAImpl(
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* col_data,
    float* img_data,
    CUDAContext* context) {
  const int outer_size = col_shape[0];
  const int inner_size = col_size / outer_size;
  const int kernel_size = std::accumulate(
      kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
  SimpleArray<int, N + 1> img_shape_array;
  SimpleArray<int, N + 1> col_shape_array;
  SimpleArray<int, N> kernel_shape_array;
  SimpleArray<int, N> stride_array;
  SimpleArray<int, N> dilation_array;
  SimpleArray<int, N> pad_array;
  std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int));
  std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int));
  std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int));
  std::memcpy(stride_array.data, stride, N * sizeof(int));
  std::memcpy(dilation_array.data, dilation, N * sizeof(int));
  std::memcpy(pad_array.data, pad, N * sizeof(int));
  Set<T, CUDAContext>(img_size, 0, img_data, context);
  Im2ColNdNCHWCUDAKernel<T, N, true>
      <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          outer_size,
          inner_size,
          kernel_size,
          img_shape_array,
          col_shape_array,
          kernel_shape_array,
          stride_array,
          dilation_array,
          pad_array,
          col_data,
          img_data);
}

} // namespace

template <>
void Im2Col<float, CUDAContext, StorageOrder::NCHW>(
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* img_data,
    float* col_data,
    CUDAContext* context) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
  const int num_kernels = channels * output_h * output_w;
  Im2ColNCHWCUDAKernel<float>
      <<<CAFFE_GET_BLOCKS(num_kernels),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          num_kernels,
          height,
          width,
          kernel_h,
          kernel_w,
          dilation_h,
          dilation_w,
          pad_t,
          pad_l,
          stride_h,
          stride_w,
          output_h,
          output_w,
          img_data,
          col_data);
}

template <>
void Im2Col<float, CUDAContext, StorageOrder::NHWC>(
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* img_data,
    float* col_data,
    CUDAContext* context) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
  const int num_kernels = output_h * output_w * channels;
  Im2ColNHWCCUDAKernel<float>
      <<<CAFFE_GET_BLOCKS(num_kernels),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          num_kernels,
          height,
          width,
          kernel_h,
          kernel_w,
          dilation_h,
          dilation_w,
          pad_t,
          pad_l,
          stride_h,
          stride_w,
          output_w,
          channels,
          img_data,
          col_data);
}

template <>
void Col2Im<float, CUDAContext, StorageOrder::NCHW>(
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* col_data,
    float* img_data,
    CUDAContext* context) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
  const int num_kernels = channels * height * width;
  Col2ImNCHWCUDAKernel<float>
      <<<CAFFE_GET_BLOCKS(num_kernels),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          num_kernels,
          height,
          width,
          kernel_h,
          kernel_w,
          dilation_h,
          dilation_w,
          pad_t,
          pad_l,
          stride_h,
          stride_w,
          output_h,
          output_w,
          col_data,
          img_data);
}

template <>
void Col2Im<float, CUDAContext, StorageOrder::NHWC>(
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    const float* col_data,
    float* img_data,
    CUDAContext* context) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
  const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
  const int num_kernels = height * width * channels;
  Col2ImNHWCCUDAKernel<float>
      <<<CAFFE_GET_BLOCKS(num_kernels),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          num_kernels,
          width,
          channels,
          kernel_h,
          kernel_w,
          dilation_h,
          dilation_w,
          pad_t,
          pad_l,
          stride_h,
          stride_w,
          output_h,
          output_w,
          col_data,
          img_data);
}

template <>
void Im2ColNd<float, CUDAContext, StorageOrder::NCHW>(
    const int N,
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* img_data,
    float* col_data,
    CUDAContext* context) {
  DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(
      N,
      Im2ColNdNCHWCUDAImpl,
      float,
      img_size,
      col_size,
      img_shape,
      col_shape,
      kernel_shape,
      stride,
      dilation,
      pad,
      img_data,
      col_data,
      context);
}

template <>
void Col2ImNd<float, CUDAContext, StorageOrder::NCHW>(
    const int N,
    const int img_size,
    const int col_size,
    const int* img_shape,
    const int* col_shape,
    const int* kernel_shape,
    const int* stride,
    const int* dilation,
    const int* pad,
    const float* col_data,
    float* img_data,
    CUDAContext* context) {
  DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(
      N,
      Col2ImNdNCHWCUDAImpl,
      float,
      img_size,
      col_size,
      img_shape,
      col_shape,
      kernel_shape,
      stride,
      dilation,
      pad,
      col_data,
      img_data,
      context);
}

template <>
void CopyMatrix<CUDAContext>(
    const size_t itemsize,
    const int M,
    const int N,
    const void* A,
    const int lda,
    void* B,
    const int ldb,
    CUDAContext* context,
    TypeMeta::TypedCopy copy) {
  CAFFE_ENFORCE(!copy, "Copy constructor is not supported in CUDA context");
  cudaMemcpy2DAsync(
      B,
      ldb * itemsize,
      A,
      lda * itemsize,
      N * itemsize,
      M,
      cudaMemcpyDeviceToDevice,
      context->cuda_stream());
}

template <>
void CopyVector<float, CUDAContext>(
    const int N,
    const float* src,
    float* dst,
    CUDAContext* context) {
  if (src != dst && N > 0) {
    cudaMemcpyAsync(
        dst,
        src,
        sizeof(float) * N,
        cudaMemcpyDeviceToDevice,
        context->cuda_stream());
  }
}

namespace {

template <typename T>
using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>;

template <typename T, class Reducer>
__global__ void RowwiseReduceKernel(
    const int rows,
    const int cols,
    const Reducer reducer,
    const T init,
    const T* X,
    T* Y) {
  __shared__ typename BlockReduce<T>::TempStorage temp_storage;
  for (int i = blockIdx.x; i < rows; i += gridDim.x) {
    T val = init;
    for (int j = threadIdx.x; j < cols; j += blockDim.x) {
      val = reducer(X[i * cols + j], val);
    }
    val = BlockReduce<T>(temp_storage).Reduce(val, reducer);
    if (threadIdx.x == 0) {
      Y[i] = val;
    }
    __syncthreads();
  }
}

template <typename T, class Reducer>
__global__ void ColwiseReduceKernel(
    const int rows,
    const int cols,
    const Reducer reducer,
    const T init,
    const T* X,
    T* Y) {
  __shared__ typename BlockReduce<T>::TempStorage temp_storage;
  for (int i = blockIdx.x; i < cols; i += gridDim.x) {
    T val = init;
    for (int j = threadIdx.x; j < rows; j += blockDim.x) {
      val = reducer(X[j * cols + i], val);
    }
    val = BlockReduce<T>(temp_storage).Reduce(val, reducer);
    if (threadIdx.x == 0) {
      Y[i] = val;
    }
    __syncthreads();
  }
}

} // namespace

#define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T)                            \
  template <>                                                             \
  void RowwiseMax<T, CUDAContext>(                                        \
      const int N, const int D, const T* x, T* y, CUDAContext* context) { \
    RowwiseReduceKernel<<<                                                \
        std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),                            \
        CAFFE_CUDA_NUM_THREADS,                                           \
        0,                                                                \
        context->cuda_stream()>>>(                                        \
        N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y);        \
  }
CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float)
#undef CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX

#define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T)                            \
  template <>                                                             \
  void ColwiseMax<T, CUDAContext>(                                        \
      const int N, const int D, const T* x, T* y, CUDAContext* context) { \
    ColwiseReduceKernel<<<                                                \
        std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS),                            \
        CAFFE_CUDA_NUM_THREADS,                                           \
        0,                                                                \
        context->cuda_stream()>>>(                                        \
        N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y);        \
  }
CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(float)
#undef CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX

namespace {
__global__ void
maximum_kernel(const int N, const float alpha, const float* x, float* y) {
  CUDA_1D_KERNEL_LOOP(i, N) {
    y[i] = fmaxf(x[i], alpha);
  }
}
} // namespace

template <>
void Maximum(
    const int N,
    const float alpha,
    const float* x,
    float* y,
    CUDAContext* context) {
  maximum_kernel<<<
      std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(N, alpha, x, y);
}

namespace {

template <typename T, class Reducer, int D>
__global__ void ReduceTensorCUDAKernel(
    const int outer_size,
    const int inner_size,
    SimpleArray<int, D> X_strides,
    SimpleArray<FixedDivisor<int>, D> Y_dims,
    const Reducer reducer,
    const T init,
    const T* X,
    T* Y) {
  __shared__ typename BlockReduce<T>::TempStorage temp_storage;
  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
    T val = init;
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
      int X_index = 0;
      int Y_index = i * inner_size + j;
#pragma unroll
      for (int d = D - 1; d >= 0; --d) {
        int r;
        Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
        X_index += r * X_strides.data[d];
      }
#if __CUDA_ARCH__ >= 350
      val = reducer(val, __ldg(X + X_index));
#else
      val = reducer(val, X[X_index]);
#endif
    }
    val = BlockReduce<T>(temp_storage).Reduce(val, reducer);
    if (threadIdx.x == 0) {
      Y[i] = val;
    }
    __syncthreads();
  }
}

template <typename T, class Reducer, int D>
void ReduceTensorCUDAImpl(
    const int outer_size,
    const int inner_size,
    const int* dims,
    const int* axes,
    const Reducer& reducer,
    const T& init,
    const T* X,
    T* Y,
    CUDAContext* context) {
  SimpleArray<int, D> X_strides;
  SimpleArray<FixedDivisor<int>, D> Y_dims;
  utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
  for (int i = 0; i < D; ++i) {
    Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
  }
  ReduceTensorCUDAKernel<T, Reducer, D>
      <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          outer_size, inner_size, X_strides, Y_dims, reducer, init, X, Y);
}

template <typename T, class Reducer>
void ReduceTensorCUDA(
    const int num_dims,
    const int* dims,
    const int num_axes,
    const int* axes,
    const Reducer& reducer,
    const T& init,
    const T* X,
    T* Y,
    CUDAContext* context) {
  CAFFE_ENFORCE_LE(num_axes, num_dims);
  if (X == Y) {
    return;
  }
  std::vector<int> transpose_axes(num_dims);
  utils::ComputeTransposeAxesForReduceOp(
      num_dims, num_axes, axes, transpose_axes.data());
  const int pivot = num_dims - num_axes;
  int outer_size = 1;
  for (int i = 0; i < pivot; ++i) {
    outer_size *= dims[transpose_axes[i]];
  }
  int inner_size = 1;
  for (int i = pivot; i < num_dims; ++i) {
    inner_size *= dims[transpose_axes[i]];
  }
  if (outer_size > 0 && inner_size > 0) {
    if (transpose_axes[pivot] == pivot) {
      RowwiseReduceKernel<T>
          <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context->cuda_stream()>>>(
              outer_size, inner_size, reducer, init, X, Y);
      return;
    }
    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
        num_dims,
        ReduceTensorCUDAImpl,
        T,
        Reducer,
        outer_size,
        inner_size,
        dims,
        transpose_axes.data(),
        reducer,
        init,
        X,
        Y,
        context);
  } else if (outer_size > 0) {
    math::Set<T, CUDAContext>(outer_size, init, Y, context);
  }
}

template <typename T>
void ReduceMeanCUDAImpl(
    const int num_dims,
    const int* dims,
    const int num_axes,
    const int* axes,
    const T* X,
    T* Y,
    CUDAContext* context) {
  ReduceTensorCUDA(
      num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context);
  const int X_size =
      std::accumulate(dims, dims + num_dims, 1, std::multiplies<int>());
  int scale = 1;
  for (int i = 0; i < num_axes; ++i) {
    scale *= dims[axes[i]];
  }
  const int Y_size = X_size / scale;
  Scale<T, CUDAContext>(
      Y_size, 1.0f / static_cast<float>(scale), Y, Y, context);
}

} // namespace

#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \
  template <>                                 \
  void ReduceMin<T, CUDAContext>(             \
      const int num_dims,                     \
      const int* dims,                        \
      const int num_axes,                     \
      const int* axes,                        \
      const T* X,                             \
      T* Y,                                   \
      CUDAContext* context) {                 \
    ReduceTensorCUDA(                         \
        num_dims,                             \
        dims,                                 \
        num_axes,                             \
        axes,                                 \
        cub::Min(),                           \
        std::numeric_limits<T>::max(),        \
        X,                                    \
        Y,                                    \
        context);                             \
  }
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(std::int64_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(float)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN

#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \
  template <>                                 \
  void ReduceMax<T, CUDAContext>(             \
      const int num_dims,                     \
      const int* dims,                        \
      const int num_axes,                     \
      const int* axes,                        \
      const T* X,                             \
      T* Y,                                   \
      CUDAContext* context) {                 \
    ReduceTensorCUDA(                         \
        num_dims,                             \
        dims,                                 \
        num_axes,                             \
        axes,                                 \
        cub::Max(),                           \
        std::numeric_limits<T>::lowest(),     \
        X,                                    \
        Y,                                    \
        context);                             \
  }
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(std::int64_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float)
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX

#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T)                             \
  template <>                                                             \
  void ReduceSum<T, CUDAContext>(                                         \
      const int num_dims,                                                 \
      const int* dims,                                                    \
      const int num_axes,                                                 \
      const int* axes,                                                    \
      const T* X,                                                         \
      T* Y,                                                               \
      CUDAContext* context) {                                             \
    ReduceTensorCUDA(                                                     \
        num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context); \
  }
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(float)
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM

#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T)                            \
  template <>                                                             \
  void ReduceMean<T, CUDAContext>(                                        \
      const int num_dims,                                                 \
      const int* dims,                                                    \
      const int num_axes,                                                 \
      const int* axes,                                                    \
      const T* X,                                                         \
      T* Y,                                                               \
      CUDAContext* context) {                                             \
    ReduceMeanCUDAImpl<T>(num_dims, dims, num_axes, axes, X, Y, context); \
  }
CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(float)
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN

namespace {

template <typename T, int D>
__global__ void BroadcastCUDAKernel(
    const int Y_size,
    const SimpleArray<int, D> X_strides,
    const SimpleArray<int, D> Y_dims,
    const T* X,
    T* Y) {
  CUDA_1D_KERNEL_LOOP(Y_index, Y_size) {
    int X_index = 0;
    int Y_index_val = Y_index;
#pragma unroll
    for (int i = D - 1; i >= 0; --i) {
      X_index += X_strides.data[i] == 0
          ? 0
          : (Y_index_val % Y_dims.data[i]) * X_strides.data[i];
      Y_index_val /= Y_dims.data[i];
    }
#if __CUDA_ARCH__ >= 350
    Y[Y_index] = __ldg(X + X_index);
#else
    Y[Y_index] = X[X_index];
#endif
  }
}

template <typename T, int D>
void BroadcastCUDAImpl(
    const int X_ndim,
    const int* X_dims,
    const int* Y_dims,
    const T* X,
    T* Y,
    CUDAContext* context) {
  SimpleArray<int, D> X_strides_array;
  SimpleArray<int, D> Y_dims_array;
  const int d = D - X_ndim;
  std::fill(X_strides_array.data, X_strides_array.data + d, 0);
  int cur_stride = 1;
  for (int i = D - 1; i >= d; --i) {
    CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]);
    X_strides_array.data[i] = X_dims[i - d] == 1 ? 0 : cur_stride;
    cur_stride *= X_dims[i - d];
  }
  std::copy_n(Y_dims, D, Y_dims_array.data);
  const int Y_size =
      std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>());
  BroadcastCUDAKernel<T, D>
      <<<CAFFE_GET_BLOCKS(Y_size),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(Y_size, X_strides_array, Y_dims_array, X, Y);
}

} // namespace

#define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T)                                  \
  template <>                                                                 \
  void Broadcast<T, CUDAContext>(                                             \
      const int X_ndim,                                                       \
      const int* X_dims,                                                      \
      const int Y_ndim,                                                       \
      const int* Y_dims,                                                      \
      const T* X,                                                             \
      T* Y,                                                                   \
      CUDAContext* context) {                                                 \
    CAFFE_ENFORCE_LE(X_ndim, Y_ndim);                                         \
    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(                                   \
        Y_ndim, BroadcastCUDAImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \
  }
CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int32_t)
CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int64_t)
CAFFE2_SPECIALIZED_CUDA_BROADCAST(float)
CAFFE2_SPECIALIZED_CUDA_BROADCAST(double)
#undef CAFFE2_SPECIALIZED_CUDA_BROADCAST

namespace {

template <typename T>
__global__ void RowwiseMomentsCUDAKernel(
    const int rows,
    const int cols,
    const T* X,
    T* mean,
    T* variance) {
  __shared__ typename BlockReduce<T>::TempStorage m_storage;
  __shared__ typename BlockReduce<T>::TempStorage v_storage;
  for (int i = blockIdx.x; i < rows; i += gridDim.x) {
    T m_val = 0;
    T v_val = 0;
    for (int j = threadIdx.x; j < cols; j += blockDim.x) {
      const int X_index = i * cols + j;
#if __CUDA_ARCH__ >= 350
      m_val += __ldg(X + X_index);
      v_val += __ldg(X + X_index) * __ldg(X + X_index);
#else
      m_val += X[X_index];
      v_val += X[X_index] * X[X_index];
#endif
    }
    m_val = BlockReduce<T>(m_storage).Reduce(m_val, cub::Sum());
    v_val = BlockReduce<T>(v_storage).Reduce(v_val, cub::Sum());
    if (threadIdx.x == 0) {
      mean[i] = m_val / static_cast<T>(cols);
      variance[i] = v_val / static_cast<T>(cols) - mean[i] * mean[i];
    }
    __syncthreads();
  }
}

template <typename T, int D>
__global__ void MomentsCUDAKernel(
    const int outer_size,
    const int inner_size,
    SimpleArray<int, D> X_strides,
    SimpleArray<FixedDivisor<int>, D> Y_dims,
    const T* X,
    T* mean,
    T* variance) {
  __shared__ typename BlockReduce<T>::TempStorage m_storage;
  __shared__ typename BlockReduce<T>::TempStorage v_storage;
  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
    T m_val = 0;
    T v_val = 0;
    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
      int X_index = 0;
      int Y_index = i * inner_size + j;
#pragma unroll
      for (int d = D - 1; d >= 0; --d) {
        int r;
        Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
        X_index += r * X_strides.data[d];
      }
#if __CUDA_ARCH__ >= 350
      m_val += __ldg(X + X_index);
      v_val += __ldg(X + X_index) * __ldg(X + X_index);
#else
      m_val += X[X_index];
      v_val += X[X_index] * X[X_index];
#endif
    }
    m_val = BlockReduce<T>(m_storage).Reduce(m_val, cub::Sum());
    v_val = BlockReduce<T>(v_storage).Reduce(v_val, cub::Sum());
    if (threadIdx.x == 0) {
      mean[i] = m_val / static_cast<T>(inner_size);
      variance[i] = v_val / static_cast<T>(inner_size) - mean[i] * mean[i];
    }
    __syncthreads();
  }
}

template <typename T, int D>
void MomentsCUDAImpl(
    const int outer_size,
    const int inner_size,
    const int* dims,
    const int* axes,
    const T* X,
    T* mean,
    T* variance,
    CUDAContext* context) {
  SimpleArray<int, D> X_strides;
  SimpleArray<FixedDivisor<int>, D> Y_dims;
  utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
  for (int i = 0; i < D; ++i) {
    Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
  }
  MomentsCUDAKernel<T, D>
      <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(
          outer_size, inner_size, X_strides, Y_dims, X, mean, variance);
}

template <typename T>
void MomentsCUDA(
    const int num_dims,
    const int* dims,
    const int num_axes,
    const int* axes,
    const T* X,
    T* mean,
    T* variance,
    CUDAContext* context) {
  CAFFE_ENFORCE_LE(num_axes, num_dims);
  std::vector<int> transpose_axes(num_dims);
  utils::ComputeTransposeAxesForReduceOp(
      num_dims, num_axes, axes, transpose_axes.data());
  const int pivot = num_dims - num_axes;
  int outer_size = 1;
  for (int i = 0; i < pivot; ++i) {
    outer_size *= dims[transpose_axes[i]];
  }
  int inner_size = 1;
  for (int i = pivot; i < num_dims; ++i) {
    inner_size *= dims[transpose_axes[i]];
  }
  if (outer_size > 0 && inner_size > 0) {
    if (transpose_axes[pivot] == pivot) {
      RowwiseMomentsCUDAKernel<T>
          <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context->cuda_stream()>>>(
              outer_size, inner_size, X, mean, variance);
      return;
    }
    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(
        num_dims,
        MomentsCUDAImpl,
        T,
        outer_size,
        inner_size,
        dims,
        transpose_axes.data(),
        X,
        mean,
        variance,
        context);
  }
}

} // namespace

#define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T)                           \
  template <>                                                        \
  void Moments<T, CUDAContext>(                                      \
      const int num_dims,                                            \
      const int* dims,                                               \
      const int num_axes,                                            \
      const int* axes,                                               \
      const T* X,                                                    \
      T* mean,                                                       \
      T* variance,                                                   \
      CUDAContext* context) {                                        \
    MomentsCUDA<T>(                                                  \
        num_dims, dims, num_axes, axes, X, mean, variance, context); \
  }
CAFFE2_SPECIALIZED_CUDA_MOMENTS(float)
#undef CAFFE2_SPECIALIZED_CUDA_MOMENTS

namespace {

template <typename T, int D>
__global__ void TransposeCUDAKernel(
    const int size,
    const SimpleArray<int, D> X_strides,
    const SimpleArray<FixedDivisor<int>, D> Y_dims,
    const T* X,
    T* Y) {
  CUDA_1D_KERNEL_LOOP(Y_index, size) {
    int X_index = 0;
    int Y_index_val = Y_index;
#pragma unroll
    for (int i = D - 1; i >= 0; --i) {
      int d;
      Y_dims.data[i].DivMod(Y_index_val, &Y_index_val, &d);
      X_index += d * X_strides.data[i];
    }
#if __CUDA_ARCH__ >= 350
    Y[Y_index] = __ldg(X + X_index);
#else
    Y[Y_index] = X[X_index];
#endif
  }
}

template <typename T, int D>
void TransposeCUDAImpl(
    const int* dims,
    const int* axes,
    const T* X,
    T* Y,
    CUDAContext* context) {
  SimpleArray<int, D> X_strides;
  SimpleArray<FixedDivisor<int>, D> Y_dims;
  utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
  int size = 1;
  for (int i = 0; i < D; ++i) {
    Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
    size *= dims[i];
  }
  TransposeCUDAKernel<T, D>
      <<<CAFFE_GET_BLOCKS(size),
         CAFFE_CUDA_NUM_THREADS,
         0,
         context->cuda_stream()>>>(size, X_strides, Y_dims, X, Y);
}

} // namespace

#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T)                             \
  template <>                                                            \
  void Transpose<T, CUDAContext>(                                        \
      const int ndim,                                                    \
      const int* dims,                                                   \
      const int* axes,                                                   \
      const T* X,                                                        \
      T* Y,                                                              \
      CUDAContext* context) {                                            \
    if (utils::IsIdentityPermutation(ndim, axes)) {                      \
      const int size =                                                   \
          std::accumulate(dims, dims + ndim, 1, std::multiplies<int>()); \
      context->template Copy<T, CUDAContext, CUDAContext>(size, X, Y);   \
      return;                                                            \
    }                                                                    \
    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(                              \
        ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context);          \
  }
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float)
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double)
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int)
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(TIndex)
#undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE

} // namespace math
} // namespace caffe2