pytorch/caffe2/utils/math_cpu.cc

// Implementes the math functions for CPU.
// The implementation in this file allows us to route the underlying numerical
// computation library to different backends. Notably:
// (1) For all BLAS-related functions, one can explicitly request a BLAS backend
//     such as MKL, openblas or Atlas. To see the set of supported backends
//     currently provided, check //third_party/blas/.
// (2) If one chooses to link against MKL, we utilize MKL's vector math library
//     (VML) for a few functions such as Exp and Log.
// (3) Fallback implementations are provided in Eigen for cross-platform
//     support. Since Eigen is a header-only library and supports a number of
//     platforms, it allows one to quickly port Caffe2 to different platforms
//     where BLAS may not be present.

#include <atomic>
#include <chrono>
#include <random>
#include <unordered_set>

#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif  // CAFFE2_USE_MKL

#include "caffe2/utils/math.h"
#include "caffe2/utils/cpu_neon.h"
#include "caffe2/core/context.h"
#include "Eigen/Core"
#include "Eigen/Dense"

#if defined(_MSC_VER)
#include <process.h>
#endif

namespace caffe2 {
namespace math {

////////////////////////////////////////////////////////////////////////////////
// BLAS alternatives.
// Depending on whether we have specified an external BLAS library or not, we
// will delegate the Caffe math functions that are BLAS-related to either the
// CBLAS call or the Eigen implementation.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_EIGEN_FOR_BLAS

// Caffe2 gemm provides a simpler interface to the gemm functions, with the
// limitation that the data has to be contiguous in memory.
//
// The gemm call implements the following operation:
//
//                  C = alpha * op(A) * op(B) + beta * C
//
// where op(A) has size M x K, op(B) has size K x N, and C has size M x N. Each
// of A, B, and C are matrices and alpha and beta are scalars. Note that the
// most common use case of gemm will involve setting alpha to 1 and beta to 0.
//
// op(A) and op(B) represent the transformations that are done to A and B before
// the matrix multiply; depending on the flags set, op(A) is equal to A or A^T
// (transpose) if the argument TransA or TransB is set to CblasNoTrans or
// CblasTrans, respectively, for each of A and B.
template <>
void Gemm<float, CPUContext>(
    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
    const int M, const int N, const int K, const float alpha, const float* A,
    const float* B, const float beta, float* C, CPUContext* context) {
  auto C_mat = EigenMatrixMap<float>(C, N, M);
  if (beta == 0) {
    C_mat.setZero();
  } else {
    C_mat *= beta;
  }
  switch (TransA) {
  case CblasNoTrans: {
    switch (TransB) {
    case CblasNoTrans:
      C_mat.noalias() += alpha * (
          ConstEigenMatrixMap<float>(B, N, K) *
          ConstEigenMatrixMap<float>(A, K, M));
      return;
    case CblasTrans:
      C_mat.noalias() += alpha * (
          ConstEigenMatrixMap<float>(B, K, N).transpose() *
          ConstEigenMatrixMap<float>(A, K, M));
      return;
    default:
      LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
    }
  }
  case CblasTrans: {
    switch (TransB) {
    case CblasNoTrans:
      C_mat.noalias() += alpha * (
          ConstEigenMatrixMap<float>(B, N, K) *
          ConstEigenMatrixMap<float>(A, M, K).transpose());
      return;
    case CblasTrans:
      C_mat.noalias() += alpha * (
          ConstEigenMatrixMap<float>(B, K, N).transpose() *
          ConstEigenMatrixMap<float>(A, M, K).transpose());
      return;
    default:
      LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
    }
  }
  default:
    LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA";
  }
}

template <>
void GemmEx<float, CPUContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const int lda,
    const float* B,
    const int ldb,
    const float beta,
    float* C,
    const int ldc,
    CPUContext*) {
  using OuterStride = Eigen::OuterStride<Eigen::Dynamic>;
  using StridedMap = Eigen::Map<Eigen::MatrixXf, 0, OuterStride>;
  using ConstStridedMap = Eigen::Map<const Eigen::MatrixXf, 0, OuterStride>;
  auto C_mat = StridedMap(C, N, M, OuterStride(ldc));
  if (beta == 0) {
    C_mat.setZero();
  } else {
    C_mat *= beta;
  }
  switch (TransA) {
    case CblasNoTrans: {
      switch (TransB) {
        case CblasNoTrans:
          C_mat.noalias() +=
              alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
                       ConstStridedMap(A, K, M, OuterStride(lda)));
          return;
        case CblasTrans:
          C_mat.noalias() +=
              alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
                       ConstStridedMap(A, K, M, OuterStride(lda)));
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
      }
    }
    case CblasTrans: {
      switch (TransB) {
        case CblasNoTrans:
          C_mat.noalias() +=
              alpha * (ConstStridedMap(B, N, K, OuterStride(ldb)) *
                       ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
          return;
        case CblasTrans:
          C_mat.noalias() +=
              alpha * (ConstStridedMap(B, K, N, OuterStride(ldb)).transpose() *
                       ConstStridedMap(A, M, K, OuterStride(lda)).transpose());
          return;
        default:
          LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransB";
      }
    }
    default:
      LOG(FATAL) << "Unexpected CBLAS_TRANSPOSE for TransA";
  }
}

template <>
void Gemv<float, CPUContext>(
    const CBLAS_TRANSPOSE TransA,
    const int M,
    const int N,
    const float alpha,
    const float* A,
    const float* x,
    const float beta,
    float* y,
    CPUContext* context) {
  EigenVectorMap<float> y_vec(y, TransA == CblasNoTrans ? M : N);
  if (beta == 0) {
    // In Caffe2 we often do a lazy initialization, which may contain NaNs in
    // the float values. As a result, if beta is 0, we explicitly do a setzero.
    y_vec.setZero();
  } else {
    y_vec *= beta;
  }
  switch (TransA) {
    case CblasNoTrans: {
      y_vec.noalias() += alpha * (
          ConstEigenMatrixMap<float>(A, N, M).transpose() *
          ConstEigenVectorMap<float>(x, N));
      return;
    }
    case CblasTrans: {
      y_vec.noalias() += alpha * (
          ConstEigenMatrixMap<float>(A, N, M) *
          ConstEigenVectorMap<float>(x, M));
      return;
    }
    default:
      LOG(FATAL) << "Gemv float found an unexpected CBLAS_TRANSPOSE input.";
  }
}

#define CAFFE2_SPECIALIZED_SCALE(T)                                         \
  template <>                                                               \
  void Scale<T, CPUContext>(                                                \
      const int n, const T alpha, const T* x, T* y, CPUContext* context) {  \
    EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * alpha;         \
  }                                                                         \
  template <>                                                               \
  void Scale<T, CPUContext>(                                                \
      const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
    EigenVectorMap<T>(y, n) = ConstEigenVectorMap<T>(x, n) * (*alpha);      \
  }
CAFFE2_SPECIALIZED_SCALE(float)
CAFFE2_SPECIALIZED_SCALE(double)
#undef CAFFE2_SPECIALIZED_SCALE

#define CAFFE2_SPECIALIZED_DOT(T)                                              \
template<>                                                                     \
void Dot<T, CPUContext>(                                                       \
    const int N, const T* a, const T* b, T* y,                                 \
    CPUContext* context) {                                                     \
  *y = ConstEigenVectorMap<T>(a, N).dot(ConstEigenVectorMap<T>(b, N));         \
}
CAFFE2_SPECIALIZED_DOT(float)
CAFFE2_SPECIALIZED_DOT(double)
#undef CAFFE2_SPECIALIZED_DOT

#define CAFFE2_SPECIALIZED_AXPY(T)                                          \
  template <>                                                               \
  void Axpy<T, CPUContext>(                                                 \
      const int N, const T alpha, const T* x, T* Y, CPUContext* context) {  \
    EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * alpha;        \
  }                                                                         \
  template <>                                                               \
  void Axpy<T, CPUContext>(                                                 \
      const int N, const T* alpha, const T* x, T* Y, CPUContext* context) { \
    EigenVectorMap<T>(Y, N) += ConstEigenVectorMap<T>(x, N) * (*alpha);     \
  }
CAFFE2_SPECIALIZED_AXPY(float)
CAFFE2_SPECIALIZED_AXPY(double)
#undef CAFFE2_SPECIALIZED_AXPY

#define CAFFE2_SPECIALIZED_AXPBY(T)                                            \
template <>                                                                    \
void Axpby<T, CPUContext>(const int N, const T alpha, const T* x,              \
                          const T beta, T* y, CPUContext* context) {           \
  EigenVectorMap<T> y_vec(y, N);                                               \
  y_vec = y_vec * beta + ConstEigenVectorMap<T>(x, N) * alpha;                 \
}
CAFFE2_SPECIALIZED_AXPBY(float)
CAFFE2_SPECIALIZED_AXPBY(double)
#undef CAFFE2_SPECIALIZED_AXPBY

#else  // CAFFE2_USE_EIGEN_FOR_BLAS

template <>
void Gemm<float, CPUContext>(
    const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
    const int M, const int N, const int K, const float alpha, const float* A,
    const float* B, const float beta, float* C, CPUContext* context) {
  int lda = (TransA == CblasNoTrans) ? K : M;
  int ldb = (TransB == CblasNoTrans) ? N : K;
  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, N);
}

template <>
void GemmEx<float, CPUContext>(
    const CBLAS_TRANSPOSE TransA,
    const CBLAS_TRANSPOSE TransB,
    const int M,
    const int N,
    const int K,
    const float alpha,
    const float* A,
    const int lda,
    const float* B,
    const int ldb,
    const float beta,
    float* C,
    const int ldc,
    CPUContext* context) {
  cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb,
              beta, C, ldc);
}

template <>
void Gemv<float, CPUContext>(
    const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha,
    const float* A, const float* x, const float beta, float* y,
    CPUContext* context) {
  cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
}

#define CAFFE2_SPECIALIZED_SCALE(T, prefix)                                 \
  template <>                                                               \
  void Scale<T, CPUContext>(                                                \
      const int n, const T alpha, const T* x, T* y, CPUContext* context) {  \
    if (y != x)                                                             \
      cblas_##prefix##copy(n, x, 1, y, 1);                                  \
    cblas_##prefix##scal(n, alpha, y, 1);                                   \
  }                                                                         \
  template <>                                                               \
  void Scale<T, CPUContext>(                                                \
      const int n, const T* alpha, const T* x, T* y, CPUContext* context) { \
    if (y != x)                                                             \
      cblas_##prefix##copy(n, x, 1, y, 1);                                  \
    cblas_##prefix##scal(n, *alpha, y, 1);                                  \
  }
CAFFE2_SPECIALIZED_SCALE(float, s)
CAFFE2_SPECIALIZED_SCALE(double, d)
#undef CAFFE2_SPECIALIZED_SCALE

#define CAFFE2_SPECIALIZED_DOT(T, prefix)                                      \
template<>                                                                     \
void Dot<T, CPUContext>(                                                       \
    const int N, const T* a, const T* b, T* y,                                 \
    CPUContext* context) {                                                     \
  *y = cblas_##prefix##dot(N, a, 1, b, 1);                                     \
}
CAFFE2_SPECIALIZED_DOT(float, s)
CAFFE2_SPECIALIZED_DOT(double, d)
#undef CAFFE2_SPECIALIZED_DOT

#define CAFFE2_SPECIALIZED_AXPY(T, prefix)                                  \
  template <>                                                               \
  void Axpy<T, CPUContext>(                                                 \
      const int N, const T alpha, const T* x, T* y, CPUContext* context) {  \
    cblas_##prefix##axpy(N, alpha, x, 1, y, 1);                             \
  }                                                                         \
  template <>                                                               \
  void Axpy<T, CPUContext>(                                                 \
      const int N, const T* alpha, const T* x, T* y, CPUContext* context) { \
    cblas_##prefix##axpy(N, *alpha, x, 1, y, 1);                            \
  }
CAFFE2_SPECIALIZED_AXPY(float, s)
CAFFE2_SPECIALIZED_AXPY(double, d)
#undef CAFFE2_SPECIALIZED_AXPY

// cblas_[sd]axpby is not a standard blas function, and if MKL is not present,
// we will need to implement it.
#ifdef CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_AXPBY(T, prefix)                                    \
template <>                                                                    \
void Axpby<T, CPUContext>(const int N, const T alpha, const T* x,              \
                          const T beta, T* y, CPUContext* context) {           \
  cblas_##prefix##axpby(N, alpha, x, 1, beta, y, 1);                           \
}
#else  // CAFFE2_USE_MKL
#define CAFFE2_SPECIALIZED_AXPBY(T, prefix)                                    \
template <>                                                                    \
void Axpby<T, CPUContext>(const int N, const T alpha, const T* x,              \
                          const T beta, T* y, CPUContext* context) {           \
  cblas_##prefix##scal(N, beta, y, 1);                                         \
  cblas_##prefix##axpy(N, alpha, x, 1, y, 1);                                  \
}
#endif  // CAFFE2_USE_MKL
CAFFE2_SPECIALIZED_AXPBY(float, s)
CAFFE2_SPECIALIZED_AXPBY(double, d)
#undef CAFFE2_SPECIALIZED_AXPBY

#endif  // CAFFE2_USE_EIGEN_FOR_BLAS


////////////////////////////////////////////////////////////////////////////////
// MKL VML alternatives.
// Depending on whether we are using MKL, we will delegate the Caffe math
// functions that are VML-related to either the VML call or the Eigen
// implementation. If you are setting the flags (such as AVX) right for your CPU
// architecture, usually Eigen will deliver a throughput as fast as the VML
// functions.
////////////////////////////////////////////////////////////////////////////////
#ifdef CAFFE2_USE_MKL

#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, OriginalFunc, ...) \
  template <>                                                          \
  void Funcname<T, CPUContext>(                                        \
      const int N, const T* x, T* y, CPUContext* context) {            \
    OriginalFunc(N, x, y, ##__VA_ARGS__);                              \
  }
DELEGATE_SIMPLE_UNARY_FUNCTION(
    float,
    Exp,
    vmsExp,
    VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
DELEGATE_SIMPLE_UNARY_FUNCTION(
    double,
    Exp,
    vmdExp,
    VML_HA | VML_FTZDAZ_OFF | VML_ERRMODE_IGNORE)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, vsLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, vdLn)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, vsSqr)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, vdSqr)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION

#define DELEGATE_POWX_FUNCTION(T, OriginalFunc)                                \
template <>                                                                    \
void Powx<T, CPUContext>(                                                      \
    const int N, const T* a, T b, T* y, CPUContext* context) {                 \
  OriginalFunc(N, a, b, y);                                                    \
}
DELEGATE_POWX_FUNCTION(float, vsPowx)
DELEGATE_POWX_FUNCTION(double, vdPowx)
#undef DELEGATE_POWX_FUNCTION

#define DELEGATE_SIMPLE_BINARY_FUNCTION(T, Funcname, OriginalFunc)             \
template <>                                                                    \
void Funcname<T, CPUContext>(                                                  \
    const int N, const T* a, const T* b, T* y,                                 \
    CPUContext* context) {                                                     \
  OriginalFunc(N, a, b, y);                                                    \
}
DELEGATE_SIMPLE_BINARY_FUNCTION(float,  Add, vsAdd)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Add, vdAdd)
DELEGATE_SIMPLE_BINARY_FUNCTION(float,  Sub, vsSub)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Sub, vdSub)
DELEGATE_SIMPLE_BINARY_FUNCTION(float,  Mul, vsMul)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Mul, vdMul)
DELEGATE_SIMPLE_BINARY_FUNCTION(float,  Div, vsDiv)
DELEGATE_SIMPLE_BINARY_FUNCTION(double, Div, vdDiv)
#undef DELEGATE_SIMPLE_BINARY_FUNCTION

#else  // CAFFE2_USE_MKL

#define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr)                      \
template <>                                                                    \
void Funcname<T, CPUContext>(const int N, const T* x, T* y,                    \
                             CPUContext* context) {                            \
  EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(x, N).array().expr();       \
}
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Exp, exp)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Log, log)
DELEGATE_SIMPLE_UNARY_FUNCTION(float, Sqr, square)
DELEGATE_SIMPLE_UNARY_FUNCTION(double, Sqr, square)
#undef DELEGATE_SIMPLE_UNARY_FUNCTION

#define DELEGATE_POWX_FUNCTION(T)                                              \
template <>                                                                    \
void Powx<T, CPUContext>(                                                      \
    const int N, const T* a, T b, T* y, CPUContext* context) {                 \
  EigenVectorMap<T>(y, N) = ConstEigenVectorMap<T>(a, N).array().pow(b);       \
}
DELEGATE_POWX_FUNCTION(float)
DELEGATE_POWX_FUNCTION(double)
#undef DELEGATE_POWX_FUNCTION

#endif  // CAFFE2_USE_MKL


#define EIGEN_SIMPLE_BINARY_FUNCTION(T, Funcname, expr)                        \
template <>                                                                    \
void Funcname<T, CPUContext>(                                                  \
    const int N, const T* a, const T* b, T* y,                                 \
    CPUContext*) {                                                             \
  EigenVectorMap<T>(y, N) =                                                    \
      ConstEigenVectorMap<T>(a, N).array() expr                                \
      ConstEigenVectorMap<T>(b, N).array();                                    \
}

#ifdef CAFFE2_USE_MKL

#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr)                          \
EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr)                          \
EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)

#else

#define DEFINE_SIMPLE_BINARY_FUNCTION(Funcname, expr)                          \
EIGEN_SIMPLE_BINARY_FUNCTION(float, Funcname, expr)                            \
EIGEN_SIMPLE_BINARY_FUNCTION(double, Funcname, expr)                           \
EIGEN_SIMPLE_BINARY_FUNCTION(int32_t, Funcname, expr)                          \
EIGEN_SIMPLE_BINARY_FUNCTION(int64_t, Funcname, expr)

#endif

DEFINE_SIMPLE_BINARY_FUNCTION(Add, +)
DEFINE_SIMPLE_BINARY_FUNCTION(Sub, -)
DEFINE_SIMPLE_BINARY_FUNCTION(Mul, *)
DEFINE_SIMPLE_BINARY_FUNCTION(Div, /)

#undef EIGEN_SIMPLE_BINARY_FUNCTION
#undef DEFINE_FLOAT_BINARY_FUNCTION


////////////////////////////////////////////////////////////////////////////////
// Common math functions being used in Caffe that do not have a BLAS or MKL
// equivalent. For all these functions, we will simply implement them either via
// Eigen or via custom code.
////////////////////////////////////////////////////////////////////////////////

#define CAFFE2_SPECIALIZED_ROWWISEMAX(T)                                       \
template <> void RowwiseMax<T, CPUContext>(                                    \
    const int N, const int D, const T* x, T* y, CPUContext* context) {         \
  EigenVectorMap<T>(y, N) =                                                    \
      ConstEigenMatrixMap<T>(x, D, N).colwise().maxCoeff();                    \
}
CAFFE2_SPECIALIZED_ROWWISEMAX(float)

#define CAFFE2_SPECIALIZED_COLWISEMAX(T)                                       \
template <> void ColwiseMax<T, CPUContext>(                                    \
    const int N, const int D, const T* x, T* y, CPUContext* context) {         \
  EigenVectorMap<T>(y, D) =                                                    \
      ConstEigenMatrixMap<T>(x, D, N).rowwise().maxCoeff();                    \
}
CAFFE2_SPECIALIZED_COLWISEMAX(float)

// AddToRow and AddToCol adds the corresponding row/col vector b to the matrix a
// of shape M x N. The actual implementation uses eigen which is column major,
// so notice the row/column swap in the actual implementation.
#define DELEGATE_BROADCAST_BINARY_FUNCTION(T, Funcname, expr)               \
  template <>                                                               \
  void Funcname##ToRow<T, CPUContext>(                                      \
      const int M,                                                          \
      const int N,                                                          \
      const T* a,                                                           \
      const T* b,                                                           \
      T* y,                                                                 \
      CPUContext*) {                                                        \
    EigenArrayMap<T>(y, N, M) = ConstEigenArrayMap<T>(a, N, M).colwise()    \
                                    expr ConstEigenVectorArrayMap<T>(b, N); \
  }                                                                         \
  /* inplace versions */                                                    \
  template <>                                                               \
  void Funcname##ToRow<T, CPUContext>(                                      \
      const int M, const int N, const T* x, T* y, CPUContext* context) {    \
    EigenArrayMap<T>(y, N, M).colwise() expr## =                            \
        ConstEigenVectorArrayMap<T>(x, N);                                  \
  }                                                                         \
  template <>                                                               \
  void Funcname##ToCol<T, CPUContext>(                                      \
      const int M, const int N, const T* x, T* y, CPUContext* context) {    \
    EigenArrayMap<T>(y, N, M).rowwise() expr## =                            \
        ConstEigenVectorArrayMap<T>(x, M).transpose();                      \
  }

#define DEFINE_BROADCAST_BINARY_FUNCTION(name, op)                       \
  DELEGATE_BROADCAST_BINARY_FUNCTION(int32_t, name, op)                  \
  DELEGATE_BROADCAST_BINARY_FUNCTION(int64_t, name, op)                  \
  DELEGATE_BROADCAST_BINARY_FUNCTION(float, name, op)                    \
  DELEGATE_BROADCAST_BINARY_FUNCTION(double, name, op)

DEFINE_BROADCAST_BINARY_FUNCTION(Add, +)
DEFINE_BROADCAST_BINARY_FUNCTION(Sub, -)
DEFINE_BROADCAST_BINARY_FUNCTION(Mul, *)
DEFINE_BROADCAST_BINARY_FUNCTION(Div, /)

#undef DEFINE_BROADCAST_BINARY_FUNCTION
#undef DELEGATE_BROADCAST_BINARY_FUNCTION

#define CAFFE2_SPECIALIZED_SET(T)                                 \
  template <>                                                     \
  void Set<T, CPUContext>(                                        \
      const TIndex N, const T alpha, T* Y, CPUContext* context) { \
    if (alpha == (T)0) {                                          \
      memset(Y, 0, N * sizeof(T));                                \
    } else {                                                      \
      EigenVectorMap<T>(Y, N).setConstant(alpha);                 \
    }                                                             \
  }

CAFFE2_SPECIALIZED_SET(float);
CAFFE2_SPECIALIZED_SET(double);
CAFFE2_SPECIALIZED_SET(int8_t);
CAFFE2_SPECIALIZED_SET(int16_t);
CAFFE2_SPECIALIZED_SET(int);
CAFFE2_SPECIALIZED_SET(int64_t);
CAFFE2_SPECIALIZED_SET(bool);
CAFFE2_SPECIALIZED_SET(char);
CAFFE2_SPECIALIZED_SET(uint8_t);
CAFFE2_SPECIALIZED_SET(uint16_t);
#undef CAFFE2_SPECIALIZED_SET

#define CAFFE2_INSTANTIATE_BINARY_OP(name, op, T)                          \
  template <>                                                              \
  void name<T, CPUContext>(                                                \
      const int n, const T* a, const T* b, bool* y, CPUContext* context) { \
    for (int i = 0; i < n; ++i) {                                          \
      y[i] = a[i] op b[i];                                                 \
    }                                                                      \
  }                                                                        \
  template <>                                                              \
  void name##ToRow<T, CPUContext>(                                         \
      const int m,                                                         \
      const int n,                                                         \
      const T* a,                                                          \
      const T* b,                                                          \
      bool* y,                                                             \
      CPUContext* context) {                                               \
    for (int i = 0; i < n * m; ++i) {                                      \
      y[i] = a[i] op b[i % n];                                             \
    }                                                                      \
  }

#define CAFFE2_DEFINE_BINARY_OP(name, op)         \
  CAFFE2_INSTANTIATE_BINARY_OP(name, op, float)   \
  CAFFE2_INSTANTIATE_BINARY_OP(name, op, double)  \
  CAFFE2_INSTANTIATE_BINARY_OP(name, op, int32_t) \
  CAFFE2_INSTANTIATE_BINARY_OP(name, op, int64_t)

CAFFE2_DEFINE_BINARY_OP(LT, <);
CAFFE2_DEFINE_BINARY_OP(LE, <=);
CAFFE2_DEFINE_BINARY_OP(GT, >);
CAFFE2_DEFINE_BINARY_OP(GE, >=);

CAFFE2_INSTANTIATE_BINARY_OP(Or, |, bool);
CAFFE2_INSTANTIATE_BINARY_OP(And, &, bool);
CAFFE2_INSTANTIATE_BINARY_OP(Xor, ^, bool);

template <>
void Not<bool, CPUContext>(
    const int n,
    const bool* x,
    bool* y,
    CPUContext* context) {
  for (int i = 0; i < n; ++i) {
    y[i] = !x[i];
  }
}

#undef CAFFE2_DEFINE_BINARY_OP
#undef CAFFE2_INSTANTIATE_BINARY_OP

template <>
void RandUniform<float, CPUContext>(
    const int n, const float a, const float b, float* r,
    CPUContext* context) {
  std::uniform_real_distribution<float> distribution(a, b);
  for (int i = 0; i < n; ++i) {
    r[i] = distribution(context->RandGenerator());
  }
}

template <>
void RandUniform<int, CPUContext>(
    const int n, const int a, const int b, int* r,
    CPUContext* context) {
  std::uniform_int_distribution<int> distribution(a, b);
  for (int i = 0; i < n; ++i) {
    r[i] = distribution(context->RandGenerator());
  }
}

#define CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(T)                   \
  template <>                                                       \
  void RandUniformUnique<T, CPUContext>(                            \
      const size_t n,                                               \
      const T a,                                                    \
      const T b,                                                    \
      T* r,                                                         \
      const size_t m,                                               \
      const T* avoid,                                               \
      CPUContext* context) {                                        \
    CAFFE_ENFORCE_LE(                                               \
        n, b - a - m + 1, "Cannot satisfy the unique requirement"); \
    std::unordered_set<T> avoid_set(n);                             \
    if (m) {                                                        \
      avoid_set.insert(avoid, avoid + m);                           \
    }                                                               \
    std::uniform_int_distribution<T> distribution(a, b);            \
    T v = 0;                                                        \
    for (size_t i = 0; i < n; ++i) {                                \
      do {                                                          \
        v = distribution(context->RandGenerator());                 \
      } while (avoid_set.count(v));                                 \
      r[i] = v;                                                     \
      avoid_set.insert(v);                                          \
    }                                                               \
  }

CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int32_t);
CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE(int64_t);
#undef CAFFE2_SPECIALIZED_RAND_UNIFORM_UNIQUE

template <>
void RandGaussian<float, CPUContext>(
    const int n, const float mean, const float std, float* r,
    CPUContext* context) {
  std::normal_distribution<float> distribution(mean, std);
  for (int i = 0; i < n; ++i) {
    r[i] = distribution(context->RandGenerator());
  }
}

template<>
void Sum<float, CPUContext>(
    const int N, const float* x, float* y,
    CPUContext* context) {
  *y = ConstEigenVectorMap<float>(x, N).sum();
}

template<>
void Sum<double, CPUContext>(
    const int N, const double* x, double* y,
    CPUContext* context) {
  *y = ConstEigenVectorMap<double>(x, N).sum();
}

template <>
void Select<float, CPUContext>(
      const int N, const int D, const float* x, const int* idx, float* y,
      CPUContext* context) {
  for (int i = 0; i < N; ++i) {
    DCHECK_LT(idx[i], D);
    y[i] = x[i * D + idx[i]];
  }
}

template <>
void Im2col<float, CPUContext, StorageOrder::NCHW>(
    const float* data_im,
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    float* data_col,
    CPUContext* context) {
  const int output_h =
      (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
      1;
  const int output_w =
      (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
      1;

  // Fast path for zero padding and no dilation
  // From Torch, THNN_(unfolded_copy)
  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
      pad_t == 0 && pad_b == 0) {
    for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
      const auto nip = k / (kernel_h * kernel_w);
      const auto rest = k % (kernel_h * kernel_w);
      const auto kh = rest / kernel_w;
      const auto kw = rest % kernel_w;
      auto* dst = data_col + nip * (kernel_h * kernel_w * output_h * output_w) +
          kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
      const auto* src = data_im + nip * (height * width);
      for (auto y = 0; y < output_h; y++) {
        const auto iy = y * stride_h + kh;
        const auto ix = kw;
        if (stride_w == 1) {
          memcpy(
              dst + (y * output_w),
              src + (iy * width + ix),
              sizeof(float) * output_w);
        } else {
          for (auto x = 0; x < output_w; x++) {
            memcpy(
                dst + (y * output_w + x),
                src + (iy * width + ix + x * stride_w),
                sizeof(float));
          }
        }
      }
    }
    return;
  }

  // Fast path for equal padding
  if (pad_l == pad_r && pad_t == pad_b) {
    // From Intel, https://github.com/BVLC/caffe/pull/3536
    const int pad_h = pad_t;
    const int pad_w = pad_l;
    const int channel_size = height * width;
    for (int channel = channels; channel--; data_im += channel_size) {
      for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
        for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
          int input_row = -pad_h + kernel_row * dilation_h;
          for (int output_rows = output_h; output_rows; output_rows--) {
            if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
              for (int output_cols = output_w; output_cols; output_cols--) {
                *(data_col++) = 0;
              }
            } else {
              int input_col = -pad_w + kernel_col * dilation_w;
              for (int output_col = output_w; output_col; output_col--) {
                if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                  *(data_col++) = data_im[input_row * width + input_col];
                } else {
                  *(data_col++) = 0;
                }
                input_col += stride_w;
              }
            }
            input_row += stride_h;
          }
        }
      }
    }
    return;
  }

  // Baseline
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

  int channels_col = channels * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
        int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width)
          data_col[(c * height_col + h) * width_col + w] =
              data_im[(c_im * height + h_pad) * width + w_pad];
        else
          data_col[(c * height_col + h) * width_col + w] = 0;
      }
    }
  }
}

template <>
void Im2col<float, CPUContext, StorageOrder::NHWC>(
    const float* data_im,
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    float* data_col,
    CPUContext* context) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;

  int h_pad = -pad_t;
  for (int h = 0; h < height_col; ++h) {
    int w_pad = -pad_l;
    for (int w = 0; w < width_col; ++w) {
      for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
        for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
            memcpy(data_col, data_im + (ih * width + iw) * channels,
                   sizeof(float) * channels);
          } else {
            // This should be simply padded with zero.
            memset(data_col, 0, sizeof(float) * channels);
          }
          data_col += channels;
        }
      }
      w_pad += stride_w;
    }
    h_pad += stride_h;
  }
}

template <>
void Col2im<float, CPUContext, StorageOrder::NCHW>(
    const float* data_col,
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    float* data_im,
    CPUContext* context) {
  const int output_h =
      (height + pad_b + pad_t - (dilation_h * (kernel_h - 1) + 1)) / stride_h +
      1;
  const int output_w =
      (width + pad_l + pad_r - (dilation_w * (kernel_w - 1) + 1)) / stride_w +
      1;

  Set<float, CPUContext>(height * width * channels, 0, data_im, context);

  // Fast path for zero padding and no dilation
  // From Torch, modified THNN_(unfolded_acc)
  if (dilation_h == 1 && dilation_w == 1 && pad_l == 0 && pad_r == 0 &&
      pad_t == 0 && pad_b == 0) {
    for (auto k = 0; k < channels * kernel_h * kernel_w; k++) {
      const auto nip = k / (kernel_h * kernel_w);
      const auto rest = k % (kernel_h * kernel_w);
      const auto kh = rest / kernel_w;
      const auto kw = rest % kernel_w;
      const auto* dst = data_col +
          nip * (kernel_h * kernel_w * output_h * output_w) +
          kh * (kernel_w * output_h * output_w) + kw * (output_h * output_w);
      auto* src = data_im + nip * (height * width);
      for (auto y = 0; y < output_h; y++) {
        const auto iy = y * stride_h + kh;
        const auto ix = kw;
        if (stride_w == 1) {
          auto offsrc = src + (iy * width + ix);
          const auto offdst = dst + (y * output_w);
          for (auto i = 0; i < output_w; ++i) {
            offsrc[i] += offdst[i];
          }
        } else {
          for (auto x = 0; x < output_w; x++) {
            auto offsrc = src + (iy * width + ix + x * stride_w);
            const auto offdst = dst + (y * output_w + x);
            *offsrc += *offdst;
          }
        }
      }
    }
    return;
  }

  // Fast path for equal padding
  if (pad_l == pad_r && pad_t == pad_b) {
    // From Intel, https://github.com/BVLC/caffe/pull/3536
    const int pad_h = pad_t;
    const int pad_w = pad_l;
    const int channel_size = height * width;
    for (int channel = channels; channel--; data_im += channel_size) {
      for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
        for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
          int input_row = -pad_h + kernel_row * dilation_h;
          for (int output_rows = output_h; output_rows; output_rows--) {
            if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
              data_col += output_w;
            } else {
              int input_col = -pad_w + kernel_col * dilation_w;
              for (int output_col = output_w; output_col; output_col--) {
                if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
                  data_im[input_row * width + input_col] += *data_col;
                }
                data_col++;
                input_col += stride_w;
              }
            }
            input_row += stride_h;
          }
        }
      }
    }
    return;
  }

  // Fallback
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
  int channels_col = channels * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_h / kernel_w;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        int h_pad = h * stride_h - pad_t + h_offset * dilation_h;
        int w_pad = w * stride_w - pad_l + w_offset * dilation_w;
        if (h_pad >= 0 && h_pad < height && w_pad >= 0 && w_pad < width) {
          data_im[(c_im * height + h_pad) * width + w_pad] +=
              data_col[(c * height_col + h) * width_col + w];
        }
      }
    }
  }
}

template <>
void Col2im<float, CPUContext, StorageOrder::NHWC>(
    const float* data_col,
    const int channels,
    const int height,
    const int width,
    const int kernel_h,
    const int kernel_w,
    const int dilation_h,
    const int dilation_w,
    const int pad_t,
    const int pad_l,
    const int pad_b,
    const int pad_r,
    const int stride_h,
    const int stride_w,
    float* data_im,
    CPUContext* context) {
  const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
  const int dkernel_w = dilation_w * (kernel_w - 1) + 1;

  Set<float, CPUContext>(height * width * channels, 0, data_im, context);
  int height_col = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
  int width_col = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
  int h_pad = -pad_t;
  for (int h = 0; h < height_col; ++h) {
    int w_pad = -pad_l;
    for (int w = 0; w < width_col; ++w) {
      for (int ih = h_pad; ih < h_pad + dkernel_h; ih += dilation_h) {
        for (int iw = w_pad; iw < w_pad + dkernel_w; iw += dilation_w) {
          if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
            auto* data_im_patch = data_im + (ih * width + iw) * channels;
            Add<float, CPUContext>(
                  channels, data_im_patch, data_col, data_im_patch, context);
          }
          data_col += channels;
        }
      }
      w_pad += stride_w;
    }
    h_pad += stride_h;
  }
}

template <>
void BiasCHW<float, CPUContext>(
  const float* bias,
  const int bias_channels,
  const int image_size,
  float* image,
  CPUContext* context) {
  // Sum the per-channel bias into every image plane
  for (int c = 0; c < bias_channels; ++c) {
    float b = bias[c];

#ifdef __ARM_NEON__
    float32x4_t vBias = vdupq_n_f32(b);

    // We give alignment hints for additional speed, so handle the
    // non-vectorizable prologue separately
    constexpr int kVecSizeInFloat = sizeof(float32x4_t) / sizeof(float);

    // FIXME: if input < kVecSizeInFloat, can't vectorize at all

    int prologue =
      kVecSizeInFloat -
      // remainder in floats
      (((uintptr_t) image) % (sizeof(float32x4_t))) / sizeof(float);

    int i = 0;
    // Prologue loop
    for (; i < prologue; ++i) {
      image[i] += b;
    }

    // The loop is manually unrolled by 8
    constexpr int kUnroll = 8;
    constexpr int kFloatsPerLoop = kUnroll * kVecSizeInFloat;

    int remainder = image_size - prologue;
    int vectorizable = prologue + (remainder / kFloatsPerLoop) * kFloatsPerLoop;

    // Vectorizable body
    for (; i < vectorizable; i += kFloatsPerLoop) {
      // Manually unrolled
      float32x4_t v0 = vld1q_f32_aligned(image + i + 0);
      float32x4_t v1 = vld1q_f32_aligned(image + i + 4);
      float32x4_t v2 = vld1q_f32_aligned(image + i + 8);
      float32x4_t v3 = vld1q_f32_aligned(image + i + 12);
      float32x4_t v4 = vld1q_f32_aligned(image + i + 16);
      float32x4_t v5 = vld1q_f32_aligned(image + i + 20);
      float32x4_t v6 = vld1q_f32_aligned(image + i + 24);
      float32x4_t v7 = vld1q_f32_aligned(image + i + 28);

      v0 = vaddq_f32(v0, vBias);
      v1 = vaddq_f32(v1, vBias);
      v2 = vaddq_f32(v2, vBias);
      v3 = vaddq_f32(v3, vBias);
      v4 = vaddq_f32(v4, vBias);
      v5 = vaddq_f32(v5, vBias);
      v6 = vaddq_f32(v6, vBias);
      v7 = vaddq_f32(v7, vBias);

      vst1q_f32_aligned(image + i + 0, v0);
      vst1q_f32_aligned(image + i + 4, v1);
      vst1q_f32_aligned(image + i + 8, v2);
      vst1q_f32_aligned(image + i + 12, v3);
      vst1q_f32_aligned(image + i + 16, v4);
      vst1q_f32_aligned(image + i + 20, v5);
      vst1q_f32_aligned(image + i + 24, v6);
      vst1q_f32_aligned(image + i + 28, v7);
    }

    // Non-vectorizable epilogue
    for (; i < image_size; ++i) {
      image[i] += b;
    }
#else
    // Non-NEON CPU implementation
    for (int i = 0; i < image_size; ++i) {
      image[i] += b;
    }
#endif // __ARM_NEON__

    image += image_size;
  }
}

template <>
void CopyMatrix<CPUContext>(
    const size_t itemsize, const int M, const int N, const void* A,
    const int lda, void* B, const int ldb, CPUContext* context) {
  if (lda == N && ldb == N) {
    // can coalese to a single memcpy of size M * N
    memcpy(
        static_cast<char*>(B), static_cast<const char*>(A), itemsize * N * M);
    return;
  }

  for (int i = 0; i < M; ++i) {
    memcpy(static_cast<char*>(B) + ldb * i * itemsize,
           static_cast<const char*>(A) + lda * i * itemsize,
           itemsize * N);
  }
}

uint32_t randomNumberSeed() {
  // Originally copied from folly::randomNumberSeed (at 418ad4)
  // modified to use chrono instead of sys/time.h
  static std::atomic<uint32_t> seedInput(0);
  auto tv = std::chrono::system_clock::now().time_since_epoch();
  uint64_t usec = static_cast<uint64_t>(
      std::chrono::duration_cast<std::chrono::microseconds>(tv).count());
  uint32_t tv_sec = usec / 1000000;
  uint32_t tv_usec = usec % 1000000;
  const uint32_t kPrime0 = 51551;
  const uint32_t kPrime1 = 61631;
  const uint32_t kPrime2 = 64997;
  const uint32_t kPrime3 = 111857;
  return kPrime0 * (seedInput++) + kPrime1 * static_cast<uint32_t>(getpid()) +
      kPrime2 * tv_sec + kPrime3 * tv_usec;
}

}  // namespace math
}  // namespace caffe2