mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/16753 Separate elementwise level2 math functions i-am-not-moving-c2-to-c10 Reviewed By: houseroad Differential Revision: D13954928 fbshipit-source-id: 1ca7a5d3da96e32510f502e5e4e79168854bee67
359 lines
15 KiB
Text
359 lines
15 KiB
Text
#include "caffe2/utils/math/elementwise.h"
|
|
|
|
#include <thrust/functional.h>
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/utils/conversions.h"
|
|
#include "caffe2/utils/math/half_utils.h"
|
|
#include "caffe2/utils/math/utils.h"
|
|
|
|
namespace caffe2 {
|
|
namespace math {
|
|
|
|
namespace {
|
|
|
|
#define DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(T, Func, DeviceFunc) \
|
|
__global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \
|
|
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x; \
|
|
if (i < N) { \
|
|
Y[i] = DeviceFunc(X[i]); \
|
|
} \
|
|
}
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Exp, expf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Log, logf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cos, cosf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Acos, acosf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sin, sinf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Asin, asinf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tan, tanf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Atan, atanf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sinh, sinhf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cosh, coshf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Tanh, tanhf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Abs, fabsf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqr, utils::Square<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sqrt, sqrtf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Rsqrt, rsqrtf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cbrt, cbrtf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Erf, erff)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Erf, erf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
|
|
std::int32_t,
|
|
Cube,
|
|
utils::Cube<std::int32_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
|
|
std::int64_t,
|
|
Cube,
|
|
utils::Cube<std::int64_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Cube, utils::Cube<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Cube, utils::Cube<double>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(bool, Not, utils::Not<bool>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
|
|
std::int32_t,
|
|
Neg,
|
|
utils::Negate<std::int32_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
|
|
std::int64_t,
|
|
Neg,
|
|
utils::Negate<std::int64_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Neg, utils::Negate<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Neg, utils::Negate<double>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
|
|
std::int32_t,
|
|
Sign,
|
|
utils::Sign<std::int32_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(
|
|
std::int64_t,
|
|
Sign,
|
|
utils::Sign<std::int64_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Sign, utils::Sign<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Sign, utils::Sign<double>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(float, Inv, utils::Inv<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION(double, Inv, utils::Inv<double>)
|
|
#undef DELEGATE_SIMPLE_CUDA_UNARY_KERNEL_FUNCTION
|
|
|
|
template <typename T>
|
|
__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
|
|
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
|
|
if (i < N) {
|
|
#if __CUDA_ARCH__ >= 350 || defined(__HIP_PLATFORM_HCC__)
|
|
c10::cuda::compat::sincos(__ldg(X + i), S + i, C + i);
|
|
#else
|
|
c10::cuda::compat::sincos(X[i], S + i, C + i);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
template <typename T, class Func>
|
|
__global__ void SimpleBinaryCUDAKernel(
|
|
const int N,
|
|
const Func func,
|
|
const T* A,
|
|
const T* B,
|
|
T* C) {
|
|
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
|
|
if (i < N) {
|
|
C[i] = func(A[i], B[i]);
|
|
}
|
|
}
|
|
|
|
template <typename T, class Comp>
|
|
__global__ void SimpleCompareCUDAKernel(
|
|
const int N,
|
|
const Comp comp,
|
|
const T* A,
|
|
const T* B,
|
|
bool* C) {
|
|
const int i = blockIdx.x * CAFFE_CUDA_NUM_THREADS + threadIdx.x;
|
|
if (i < N) {
|
|
C[i] = comp(A[i], B[i]);
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func) \
|
|
template <> \
|
|
CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
|
|
const int N, const T* X, T* Y, CUDAContext* context) { \
|
|
if (N > 0) { \
|
|
const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
|
|
Func##CUDAKernel<<< \
|
|
M, \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, X, Y); \
|
|
} \
|
|
}
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sinh)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cosh)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tanh)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Erf)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Erf)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Cube)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Cube)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Neg)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Neg)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int32_t, Sign)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(std::int64_t, Sign)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(float, Inv)
|
|
DEFINE_SIMPLE_CUDA_UNARY_FUNCTION(double, Inv)
|
|
#undef DEFINE_SIMPLE_CUDA_UNARY_FUNCTION
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \
|
|
template <> \
|
|
CAFFE2_CUDA_EXPORT void SinCos<T, CUDAContext>( \
|
|
const int N, const T* X, T* S, T* C, CUDAContext* context) { \
|
|
if (N > 0) { \
|
|
const int K = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
|
|
SinCosCUDAKernel<T> \
|
|
<<<K, CAFFE_CUDA_NUM_THREADS, 0, context->cuda_stream()>>>( \
|
|
N, X, S, C); \
|
|
} \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
|
|
CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_SINCOS
|
|
|
|
#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(T, Func, DeviceFunc) \
|
|
template <> \
|
|
CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
|
|
const int N, const T* A, const T* B, T* C, CUDAContext* context) { \
|
|
if (N > 0) { \
|
|
const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
|
|
SimpleBinaryCUDAKernel<<< \
|
|
M, \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, DeviceFunc, A, B, C); \
|
|
} \
|
|
}
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
Add,
|
|
thrust::plus<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
Add,
|
|
thrust::plus<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Add, thrust::plus<float>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Add, thrust::plus<double>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Add, utils::HalfAddFunctor())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
Sub,
|
|
thrust::minus<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
Sub,
|
|
thrust::minus<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Sub, thrust::minus<float>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Sub, thrust::minus<double>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Sub, utils::HalfSubFunctor())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
Mul,
|
|
thrust::multiplies<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
Mul,
|
|
thrust::multiplies<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Mul, thrust::multiplies<float>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Mul, thrust::multiplies<double>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Mul, utils::HalfMulFunctor())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
Div,
|
|
thrust::divides<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
Div,
|
|
thrust::divides<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Div, thrust::divides<float>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Div, thrust::divides<double>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(at::Half, Div, utils::HalfDivFunctor())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Min, thrust::minimum<float>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Min, thrust::minimum<double>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, Max, thrust::maximum<float>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, Max, thrust::maximum<double>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, And, thrust::logical_and<bool>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Or, thrust::logical_or<bool>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, Xor, thrust::bit_xor<bool>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseAnd, thrust::bit_and<bool>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
BitwiseAnd,
|
|
thrust::bit_and<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
BitwiseAnd,
|
|
thrust::bit_and<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseOr, thrust::bit_or<bool>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
BitwiseOr,
|
|
thrust::bit_or<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
BitwiseOr,
|
|
thrust::bit_or<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, BitwiseXor, thrust::bit_xor<bool>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int32_t,
|
|
BitwiseXor,
|
|
thrust::bit_xor<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
std::int64_t,
|
|
BitwiseXor,
|
|
thrust::bit_xor<std::int64_t>())
|
|
#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION
|
|
|
|
#define DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(T, Func, DeviceComp) \
|
|
template <> \
|
|
CAFFE2_CUDA_EXPORT void Func<T, CUDAContext>( \
|
|
const int N, const T* A, const T* B, bool* C, CUDAContext* context) { \
|
|
if (N > 0) { \
|
|
const int M = DivUp(N, CAFFE_CUDA_NUM_THREADS); \
|
|
SimpleCompareCUDAKernel<<< \
|
|
M, \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, DeviceComp, A, B, C); \
|
|
} \
|
|
}
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, EQ, thrust::equal_to<bool>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int32_t,
|
|
EQ,
|
|
thrust::equal_to<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int64_t,
|
|
EQ,
|
|
thrust::equal_to<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, EQ, thrust::equal_to<float>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, EQ, thrust::equal_to<double>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, NE, thrust::not_equal_to<bool>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int32_t,
|
|
NE,
|
|
thrust::not_equal_to<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int64_t,
|
|
NE,
|
|
thrust::not_equal_to<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, NE, thrust::not_equal_to<float>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
double,
|
|
NE,
|
|
thrust::not_equal_to<double>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LT, thrust::less<bool>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int32_t,
|
|
LT,
|
|
thrust::less<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int64_t,
|
|
LT,
|
|
thrust::less<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LT, thrust::less<float>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LT, thrust::less<double>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, LE, thrust::less_equal<bool>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int32_t,
|
|
LE,
|
|
thrust::less_equal<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int64_t,
|
|
LE,
|
|
thrust::less_equal<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, LE, thrust::less_equal<float>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, LE, thrust::less_equal<double>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GT, thrust::greater<bool>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int32_t,
|
|
GT,
|
|
thrust::greater<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int64_t,
|
|
GT,
|
|
thrust::greater<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GT, thrust::greater<float>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(double, GT, thrust::greater<double>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(bool, GE, thrust::greater_equal<bool>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int32_t,
|
|
GE,
|
|
thrust::greater_equal<std::int32_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
std::int64_t,
|
|
GE,
|
|
thrust::greater_equal<std::int64_t>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(float, GE, thrust::greater_equal<float>())
|
|
DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION(
|
|
double,
|
|
GE,
|
|
thrust::greater_equal<double>())
|
|
#undef DELEGATE_SIMPLE_CUDA_COMPARE_FUNCTION
|
|
|
|
} // namespace math
|
|
} // namespace caffe2
|