mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Summary: Closes https://github.com/pytorch/pytorch/pull/9072 Use FixedDivisor in Reduce and Broadcast CUDA kernels Reviewed By: houseroad Differential Revision: D8710243 fbshipit-source-id: 6f1da12234898594a1be8c979d942aa515832aeb
3295 lines
103 KiB
Text
3295 lines
103 KiB
Text
// Implements the math functions for GPU.
|
|
|
|
#include "caffe2/utils/math.h"
|
|
|
|
#include <limits>
|
|
#include <numeric>
|
|
#include <vector>
|
|
|
|
#include <cub/block/block_reduce.cuh>
|
|
#include <cub/cub.cuh>
|
|
|
|
#include <thrust/functional.h>
|
|
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/utils/conversions.h"
|
|
#include "caffe2/utils/fixed_divisor.h"
|
|
#include "caffe2/utils/math_utils.h"
|
|
|
|
#if THRUST_VERSION >= 100800
|
|
#define THRUST_SUPPORTS_PER_THREAD
|
|
#endif // THRUST_VERSION >= 100800
|
|
|
|
namespace caffe2 {
|
|
namespace math {
|
|
|
|
namespace {
|
|
|
|
#define DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Func, expr) \
|
|
template <typename T> \
|
|
struct Func##Functor { \
|
|
inline __host__ __device__ T \
|
|
operator()(const T& lhs, const T& rhs) const { \
|
|
return lhs expr rhs; \
|
|
} \
|
|
}; \
|
|
template <> \
|
|
struct Func##Functor<float16> { \
|
|
inline __host__ __device__ float16 \
|
|
operator()(const float16& lhs, const float16& rhs) const { \
|
|
return convert::To<float, float16>(convert::To<float16, float>( \
|
|
lhs) expr convert::To<float16, float>(rhs)); \
|
|
} \
|
|
};
|
|
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Add, +)
|
|
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Sub, -)
|
|
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Mul, *)
|
|
DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR(Div, /)
|
|
#undef DELEGATE_SIMPLE_HOST_DEVICE_BINARY_FUNCTOR
|
|
|
|
template <typename T>
|
|
__global__ void SinCosCUDAKernel(const int N, const T* X, T* S, T* C) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
#if __CUDA_ARCH__ >= 350
|
|
sincos(__ldg(X + i), S + i, C + i);
|
|
#else
|
|
sincos(X[i], S + i, C + i);
|
|
#endif
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator>
|
|
__global__ void SimpleBinaryOpCUDAKernel(
|
|
const int N,
|
|
const BinaryOperator op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
C[i] = op(A[i], B[i]);
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
|
|
__global__ void RowwiseBinaryOpCUDAKenel(
|
|
const int size,
|
|
const FixedDivisor<int> cols,
|
|
const BinaryOperator op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
CUDA_1D_KERNEL_LOOP(C_index, size) {
|
|
const int j = cols.Mod(C_index);
|
|
const int A_index = broadcast_1st ? j : C_index;
|
|
const int B_index = broadcast_1st ? C_index : j;
|
|
C[C_index] = op(A[A_index], B[B_index]);
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
|
|
__global__ void ColwiseBinaryOpCUDAKenel(
|
|
const int size,
|
|
const FixedDivisor<int> cols,
|
|
const BinaryOperator op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
CUDA_1D_KERNEL_LOOP(C_index, size) {
|
|
const int i = cols.Div(C_index);
|
|
const int A_index = broadcast_1st ? i : C_index;
|
|
const int B_index = broadcast_1st ? C_index : i;
|
|
C[C_index] = op(A[A_index], B[B_index]);
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator, int D>
|
|
__global__ void BroadcastBinaryOpCUDAKernel(
|
|
const int size,
|
|
const SimpleArray<int, D> A_strides,
|
|
const SimpleArray<int, D> B_strides,
|
|
const SimpleArray<FixedDivisor<int>, D> C_dims,
|
|
const BinaryOperator op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C) {
|
|
CUDA_1D_KERNEL_LOOP(C_index, size) {
|
|
int A_index = 0;
|
|
int B_index = 0;
|
|
int C_index_val = C_index;
|
|
#pragma unroll
|
|
for (int i = D - 1; i >= 0; --i) {
|
|
int d;
|
|
C_dims.data[i].DivMod(C_index_val, &C_index_val, &d);
|
|
A_index += d * A_strides.data[i];
|
|
B_index += d * B_strides.data[i];
|
|
}
|
|
C[C_index] = op(A[A_index], B[B_index]);
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator>
|
|
void BinaryOpWith2DBroadcasting(
|
|
const int ndim,
|
|
const int* dims,
|
|
const int pivot,
|
|
const bool rowwise_broadcast,
|
|
const bool broadcast_1st,
|
|
const BinaryOperator& op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C,
|
|
CUDAContext* context) {
|
|
const int rows =
|
|
std::accumulate(dims, dims + pivot, 1, std::multiplies<int>());
|
|
const int cols =
|
|
std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies<int>());
|
|
if (rows == 0 || cols == 0) {
|
|
return;
|
|
}
|
|
const int size = rows * cols;
|
|
const FixedDivisor<int> cols_div(cols);
|
|
if (rowwise_broadcast) {
|
|
if (broadcast_1st) {
|
|
RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, true>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
|
|
} else {
|
|
RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
|
|
}
|
|
} else {
|
|
if (broadcast_1st) {
|
|
ColwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, true>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
|
|
} else {
|
|
ColwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator, int D>
|
|
void BroadcastBinaryOpImpl(
|
|
const int* A_dims,
|
|
const int* B_dims,
|
|
const int* C_dims,
|
|
const BinaryOperator& op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C,
|
|
CUDAContext* context) {
|
|
SimpleArray<int, D> A_strides_array;
|
|
SimpleArray<int, D> B_strides_array;
|
|
SimpleArray<FixedDivisor<int>, D> C_dims_array;
|
|
int A_stride = 1;
|
|
int B_stride = 1;
|
|
for (int i = D - 1; i >= 0; --i) {
|
|
if (C_dims[i] == 0) {
|
|
return;
|
|
}
|
|
A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride;
|
|
B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride;
|
|
A_stride *= A_dims[i];
|
|
B_stride *= B_dims[i];
|
|
C_dims_array.data[i] = FixedDivisor<int>(C_dims[i]);
|
|
}
|
|
const int size =
|
|
std::accumulate(C_dims, C_dims + D, 1, std::multiplies<int>());
|
|
BroadcastBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, D>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
size, A_strides_array, B_strides_array, C_dims_array, op, A, B, C);
|
|
}
|
|
|
|
template <typename TIn, typename TOut, class BinaryOperator>
|
|
void BroadcastBinaryOp(
|
|
const int A_ndim,
|
|
const int* A_dims,
|
|
const int B_ndim,
|
|
const int* B_dims,
|
|
const BinaryOperator& op,
|
|
const TIn* A,
|
|
const TIn* B,
|
|
TOut* C,
|
|
CUDAContext* context) {
|
|
const int ndim = std::max(A_ndim, B_ndim);
|
|
std::vector<int> A_dims_array(ndim);
|
|
std::vector<int> B_dims_array(ndim);
|
|
std::vector<int> C_dims_array(ndim);
|
|
utils::ComputeBroadcastBinaryOpDims(
|
|
A_ndim,
|
|
A_dims,
|
|
B_ndim,
|
|
B_dims,
|
|
A_dims_array.data(),
|
|
B_dims_array.data(),
|
|
C_dims_array.data());
|
|
if (A_dims_array == B_dims_array) {
|
|
const int size = std::accumulate(
|
|
C_dims_array.cbegin(), C_dims_array.cend(), 1, std::multiplies<int>());
|
|
SimpleBinaryOpCUDAKernel<TIn, TOut, BinaryOperator>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, op, A, B, C);
|
|
return;
|
|
}
|
|
int pivot;
|
|
bool broadcast_1st;
|
|
if (utils::IsRowwiseBroadcastBinaryOp(
|
|
ndim,
|
|
A_dims_array.data(),
|
|
B_dims_array.data(),
|
|
&pivot,
|
|
&broadcast_1st)) {
|
|
BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>(
|
|
ndim,
|
|
C_dims_array.data(),
|
|
pivot,
|
|
true,
|
|
broadcast_1st,
|
|
op,
|
|
A,
|
|
B,
|
|
C,
|
|
context);
|
|
return;
|
|
}
|
|
if (utils::IsColwiseBroadcastBinaryOp(
|
|
ndim,
|
|
A_dims_array.data(),
|
|
B_dims_array.data(),
|
|
&pivot,
|
|
&broadcast_1st)) {
|
|
BinaryOpWith2DBroadcasting<TIn, TOut, BinaryOperator>(
|
|
ndim,
|
|
C_dims_array.data(),
|
|
pivot,
|
|
false,
|
|
broadcast_1st,
|
|
op,
|
|
A,
|
|
B,
|
|
C,
|
|
context);
|
|
return;
|
|
}
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(
|
|
ndim,
|
|
BroadcastBinaryOpImpl,
|
|
TIn,
|
|
TOut,
|
|
BinaryOperator,
|
|
A_dims_array.data(),
|
|
B_dims_array.data(),
|
|
C_dims_array.data(),
|
|
op,
|
|
A,
|
|
B,
|
|
C,
|
|
context);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(T, Func, op) \
|
|
__global__ void Func##CUDAKernel(const int N, const T* X, T* Y) { \
|
|
CUDA_1D_KERNEL_LOOP(i, N) { \
|
|
Y[i] = op(X[i]); \
|
|
} \
|
|
} \
|
|
template <> \
|
|
void Func<T, CUDAContext>( \
|
|
const int N, const T* x, T* y, CUDAContext* context) { \
|
|
Func##CUDAKernel<<< \
|
|
CAFFE_GET_BLOCKS(N), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, x, y); \
|
|
}
|
|
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Exp, expf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Log, logf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cos, cosf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Acos, acosf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sin, sinf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Asin, asinf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Tan, tanf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Atan, atanf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Abs, fabsf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqr, utils::Square<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sqrt, sqrtf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Rsqrt, rsqrtf)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cbrt, cbrtf)
|
|
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Cube, utils::Cube<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Cube, utils::Cube<double>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
|
|
std::int32_t,
|
|
Cube,
|
|
utils::Cube<std::int32_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
|
|
std::int64_t,
|
|
Cube,
|
|
utils::Cube<std::int64_t>)
|
|
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(bool, Not, utils::Not)
|
|
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Neg, utils::Negate<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Neg, utils::Negate<double>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
|
|
std::int32_t,
|
|
Neg,
|
|
utils::Negate<std::int32_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
|
|
std::int64_t,
|
|
Neg,
|
|
utils::Negate<std::int64_t>)
|
|
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(float, Sign, utils::Sign<float>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(double, Sign, utils::Sign<double>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
|
|
std::int32_t,
|
|
Sign,
|
|
utils::Sign<std::int32_t>)
|
|
DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION(
|
|
std::int64_t,
|
|
Sign,
|
|
utils::Sign<std::int64_t>)
|
|
|
|
#undef DELEGATE_SIMPLE_CUDA_UNARY_FUNCTION
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_SINCOS(T) \
|
|
template <> \
|
|
void SinCos<T, CUDAContext>( \
|
|
const int N, const T* x, T* ys, T* yc, CUDAContext* context) { \
|
|
SinCosCUDAKernel<<< \
|
|
CAFFE_GET_BLOCKS(N), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, x, ys, yc); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_SINCOS(float)
|
|
CAFFE2_SPECIALIZED_CUDA_SINCOS(double)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_SINCOS
|
|
|
|
#define DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
|
|
template <> \
|
|
void Func<TIn, CUDAContext>( \
|
|
const int N, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CUDAContext* context) { \
|
|
SimpleBinaryOpCUDAKernel<TIn, TOut, Op<TIn>> \
|
|
<<<CAFFE_GET_BLOCKS(N), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, Op<TIn>(), A, B, C); \
|
|
}
|
|
|
|
#define DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)
|
|
|
|
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
|
|
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
|
|
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LT, thrust::less)
|
|
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
|
|
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
|
|
DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)
|
|
|
|
#undef DEFINE_SIMPLE_CUDA_COMPARE_FUNCTION
|
|
|
|
#define DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float, float, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(double, double, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(float16, float16, Func, Op)
|
|
|
|
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Add, AddFunctor)
|
|
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
|
|
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
|
|
DEFINE_SIMPLE_CUDA_BINARY_FUNCTION(Div, DivFunctor)
|
|
|
|
#undef DEFINE_SIMPLE_CUDA_BINARY_FUNCTION
|
|
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)
|
|
|
|
#define DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
|
|
|
|
DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
|
|
DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
|
|
DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
|
|
|
|
#undef DEFINE_SIMPLE_CUDA_BITWISE_BINARY_FUNCTION
|
|
|
|
DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
|
|
float,
|
|
float,
|
|
ElemwiseMax,
|
|
thrust::maximum);
|
|
|
|
#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION
|
|
|
|
#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
|
|
template <> \
|
|
void Rowwise##Func<TIn, CUDAContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CUDAContext* context) { \
|
|
if (rows == 0 || cols == 0) { \
|
|
return; \
|
|
} \
|
|
const int size = rows * cols; \
|
|
const FixedDivisor<int> cols_div(cols); \
|
|
RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true> \
|
|
<<<CAFFE_GET_BLOCKS(size), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
void Rowwise##Func<TIn, CUDAContext, false>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CUDAContext* context) { \
|
|
if (rows == 0 || cols == 0) { \
|
|
return; \
|
|
} \
|
|
const int size = rows * cols; \
|
|
const FixedDivisor<int> cols_div(cols); \
|
|
RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false> \
|
|
<<<CAFFE_GET_BLOCKS(size), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
void Colwise##Func<TIn, CUDAContext, true>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CUDAContext* context) { \
|
|
if (rows == 0 || cols == 0) { \
|
|
return; \
|
|
} \
|
|
const int size = rows * cols; \
|
|
const FixedDivisor<int> cols_div(cols); \
|
|
ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true> \
|
|
<<<CAFFE_GET_BLOCKS(size), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
|
|
} \
|
|
template <> \
|
|
void Colwise##Func<TIn, CUDAContext, false>( \
|
|
const int rows, \
|
|
const int cols, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CUDAContext* context) { \
|
|
if (rows == 0 || cols == 0) { \
|
|
return; \
|
|
} \
|
|
const int size = rows * cols; \
|
|
const FixedDivisor<int> cols_div(cols); \
|
|
ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false> \
|
|
<<<CAFFE_GET_BLOCKS(size), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
|
|
}
|
|
|
|
#define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)
|
|
|
|
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
|
|
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
|
|
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less)
|
|
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
|
|
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
|
|
DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)
|
|
|
|
#undef DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION
|
|
|
|
#define DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int64_t, std::int64_t, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op)
|
|
|
|
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor)
|
|
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
|
|
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
|
|
DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor)
|
|
|
|
#undef DEFINE_2D_BROADCAST_CUDA_BINARY_FUNCTION
|
|
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)
|
|
|
|
#define DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int64_t, std::int64_t, Func, Op)
|
|
|
|
DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
|
|
DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
|
|
DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
|
|
|
|
#undef DEFINE_2D_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION
|
|
|
|
#undef DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION
|
|
|
|
#define DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
|
|
template <> \
|
|
void Func<TIn, CUDAContext>( \
|
|
const int A_ndim, \
|
|
const int* A_dims, \
|
|
const int B_ndim, \
|
|
const int* B_dims, \
|
|
const TIn* A, \
|
|
const TIn* B, \
|
|
TOut* C, \
|
|
CUDAContext* context) { \
|
|
BroadcastBinaryOp<TIn, TOut, Op<TIn>>( \
|
|
A_ndim, A_dims, B_ndim, B_dims, Op<TIn>(), A, B, C, context); \
|
|
}
|
|
|
|
#define DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int32_t, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op)
|
|
|
|
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(EQ, thrust::equal_to)
|
|
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(NE, thrust::not_equal_to)
|
|
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LT, thrust::less)
|
|
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(LE, thrust::less_equal)
|
|
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GT, thrust::greater)
|
|
DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION(GE, thrust::greater_equal)
|
|
|
|
#undef DEFINE_BROADCAST_CUDA_COMPARE_FUNCTION
|
|
|
|
#define DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int64_t, std::int64_t, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float, float, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(double, double, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(float16, float16, Func, Op)
|
|
|
|
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Add, AddFunctor)
|
|
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Sub, SubFunctor)
|
|
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Mul, MulFunctor)
|
|
DEFINE_BROADCAST_CUDA_BINARY_FUNCTION(Div, DivFunctor)
|
|
|
|
#undef DEFINE_BROADCAST_CUDA_BINARY_FUNCTION
|
|
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, And, thrust::logical_and)
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Or, thrust::logical_or)
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Xor, thrust::bit_xor)
|
|
|
|
#define DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(bool, bool, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION( \
|
|
std::int32_t, std::int32_t, Func, Op) \
|
|
DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION(std::int64_t, std::int64_t, Func, Op)
|
|
|
|
DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseAnd, thrust::bit_and)
|
|
DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseOr, thrust::bit_or)
|
|
DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION(BitwiseXor, thrust::bit_xor)
|
|
|
|
#undef DEFINE_BROADCAST_CUDA_BITWISE_BINARY_FUNCTION
|
|
|
|
#undef DELEGATE_BROADCAST_CUDA_BINARY_FUNCTION
|
|
|
|
#define DELEGATE_REDUCTION_FUNCTION(T, Funcname, func) \
|
|
template <> \
|
|
void Funcname<T, CUDAContext>( \
|
|
const int N, \
|
|
const T* src, \
|
|
T* dst, \
|
|
Tensor<CUDAContext>* scratch_ptr, \
|
|
CUDAContext* context) { \
|
|
size_t memRequired = 0; \
|
|
cub::DeviceReduce::func( \
|
|
nullptr, memRequired, src, dst, N, context->cuda_stream()); \
|
|
auto buffer_size = \
|
|
static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T)); \
|
|
scratch_ptr->Resize(std::vector<TIndex>{buffer_size}); \
|
|
cub::DeviceReduce::func( \
|
|
static_cast<void*>(scratch_ptr->mutable_data<T>()), \
|
|
memRequired, \
|
|
src, \
|
|
dst, \
|
|
N, \
|
|
context->cuda_stream()); \
|
|
}
|
|
|
|
DELEGATE_REDUCTION_FUNCTION(float, ReduceMin, Min)
|
|
DELEGATE_REDUCTION_FUNCTION(float, ReduceMax, Max)
|
|
DELEGATE_REDUCTION_FUNCTION(int32_t, ReduceMax, Max)
|
|
DELEGATE_REDUCTION_FUNCTION(int64_t, ReduceMax, Max)
|
|
|
|
#undef DELEGATE_REDUCTION_FUNCTION
|
|
|
|
// Caffe2 gemm provides a simpler interface to the gemm functions, with the
|
|
// limitation that the data has to be contiguous in memory.
|
|
template <>
|
|
void Gemm<float, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* B,
|
|
const float beta,
|
|
float* C,
|
|
CUDAContext* context,
|
|
TensorProto::DataType math_type) {
|
|
// Note that cublas follows fortran order, so the order is different from
|
|
// the cblas convention.
|
|
int lda = (TransA == CblasNoTrans) ? K : M;
|
|
int ldb = (TransB == CblasNoTrans) ? N : K;
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
cublasOperation_t cuTransB =
|
|
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
CUBLAS_ENFORCE(cublasSgemm(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha,
|
|
B,
|
|
ldb,
|
|
A,
|
|
lda,
|
|
&beta,
|
|
C,
|
|
N));
|
|
}
|
|
|
|
template <>
|
|
void Gemm<float16, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float16* A,
|
|
const float16* B,
|
|
const float beta,
|
|
float16* C,
|
|
CUDAContext* context,
|
|
TensorProto::DataType math_type) {
|
|
// Note that cublas follows fortran order, so the order is different from
|
|
// the cblas convention.
|
|
int lda = (TransA == CblasNoTrans) ? K : M;
|
|
int ldb = (TransB == CblasNoTrans) ? N : K;
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
cublasOperation_t cuTransB =
|
|
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
if (math_type == TensorProto_DataType_FLOAT) {
|
|
CUBLAS_CHECK(cublasSgemmEx(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha,
|
|
B,
|
|
CUDA_R_16F,
|
|
ldb,
|
|
A,
|
|
CUDA_R_16F,
|
|
lda,
|
|
&beta,
|
|
C,
|
|
CUDA_R_16F,
|
|
N));
|
|
|
|
} else if (math_type == TensorProto_DataType_FLOAT16) {
|
|
// convert alpha, beta from float -> __half
|
|
auto alpha_fp16 = convert::floatToHalf(alpha);
|
|
auto beta_fp16 = convert::floatToHalf(beta);
|
|
|
|
// call cublasHgemm
|
|
CUBLAS_CHECK(cublasHgemm(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha_fp16,
|
|
(const __half*)B,
|
|
ldb,
|
|
(const __half*)A,
|
|
lda,
|
|
&beta_fp16,
|
|
(__half*)C,
|
|
N));
|
|
} else {
|
|
// fail
|
|
CAFFE_THROW("Unsupported math type");
|
|
}
|
|
}
|
|
|
|
template <>
|
|
void BiasCHW<float, CUDAContext>(
|
|
const float* bias,
|
|
const float* bias_multiplier,
|
|
const int bias_channels,
|
|
const int image_size,
|
|
float* image,
|
|
CUDAContext* context) {
|
|
Gemm<float, CUDAContext>(
|
|
CblasNoTrans,
|
|
CblasNoTrans,
|
|
bias_channels,
|
|
image_size,
|
|
1,
|
|
1,
|
|
bias,
|
|
bias_multiplier,
|
|
1,
|
|
image,
|
|
context);
|
|
}
|
|
|
|
template <>
|
|
void GemmBatched<float, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int batch_size,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* B,
|
|
const float beta,
|
|
float* C,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch,
|
|
TensorProto::DataType math_type) {
|
|
const int a_stride = M * K;
|
|
const int b_stride = K * N;
|
|
const int c_stride = M * N;
|
|
#if __CUDACC_VER_MAJOR__ < 8
|
|
// loop over matrices in the batch
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
math::Gemm<float, CUDAContext>(
|
|
TransA,
|
|
TransB,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A + a_stride * i,
|
|
B + b_stride * i,
|
|
beta,
|
|
C + c_stride * i,
|
|
context);
|
|
}
|
|
#else
|
|
// Note that cublas follows fortran order, so the order is different from
|
|
// the cblas convention.
|
|
const int lda = (TransA == CblasNoTrans) ? K : M;
|
|
const int ldb = (TransB == CblasNoTrans) ? N : K;
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
cublasOperation_t cuTransB =
|
|
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
CUBLAS_ENFORCE(cublasSgemmStridedBatched(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha,
|
|
B,
|
|
ldb,
|
|
b_stride,
|
|
A,
|
|
lda,
|
|
a_stride,
|
|
&beta,
|
|
C,
|
|
N,
|
|
c_stride,
|
|
batch_size));
|
|
#endif
|
|
}
|
|
|
|
namespace {
|
|
|
|
__global__ void FloatToHalfKernel(const int N, const float* X, half* Y) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
Y[i] = __float2half(X[i]);
|
|
}
|
|
}
|
|
|
|
__global__ void HalfToFloatKernel(const int N, const half* X, float* Y) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
Y[i] = __half2float(X[i]);
|
|
}
|
|
}
|
|
|
|
}; // namespace
|
|
|
|
template <>
|
|
void GemmBatched<float16, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int batch_size,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float16* A,
|
|
const float16* B,
|
|
const float beta,
|
|
float16* C,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch,
|
|
TensorProto::DataType math_type) {
|
|
const int a_stride = M * K;
|
|
const int b_stride = K * N;
|
|
const int c_stride = M * N;
|
|
#if __CUDACC_VER_MAJOR__ < 8
|
|
// loop over matrices in the batch
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
math::Gemm<float16, CUDAContext>(
|
|
TransA,
|
|
TransB,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A + a_stride * i,
|
|
B + b_stride * i,
|
|
beta,
|
|
C + c_stride * i,
|
|
context);
|
|
}
|
|
#else
|
|
// 3 options:
|
|
// 1) scratch != null = cast to fp32, SgemmStridedBatched, cast result to fp16
|
|
// 2) math_type == FLOAT, scratch == nullptr = looped SgemmEx
|
|
// 3) math_type == FLOAT16, scratch == nullptr = batched Hgemm
|
|
|
|
if (scratch != nullptr) {
|
|
const int A_size = a_stride * batch_size;
|
|
const int B_size = b_stride * batch_size;
|
|
// cast, cublasSgemmStridedBatched, cast
|
|
size_t in_elems = A_size + B_size;
|
|
size_t out_elems = c_stride * batch_size;
|
|
|
|
scratch->Resize(in_elems + out_elems);
|
|
float* scratch_ptr = scratch->mutable_data<float>();
|
|
|
|
float* A_fp32 = scratch_ptr;
|
|
float* B_fp32 = scratch_ptr + A_size;
|
|
float* C_fp32 = scratch_ptr + A_size + B_size;
|
|
|
|
// cast A, B into fp32
|
|
HalfToFloatKernel<<<
|
|
CAFFE_GET_BLOCKS(A_size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(A_size, (half*)A, A_fp32);
|
|
HalfToFloatKernel<<<
|
|
CAFFE_GET_BLOCKS(B_size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(B_size, (half*)B, B_fp32);
|
|
|
|
// run fp32 batched Gemm
|
|
GemmBatched<float, CUDAContext>(
|
|
TransA,
|
|
TransB,
|
|
batch_size,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A_fp32,
|
|
B_fp32,
|
|
beta,
|
|
C_fp32,
|
|
context);
|
|
|
|
// cast result back to fp16
|
|
FloatToHalfKernel<<<
|
|
CAFFE_GET_BLOCKS(batch_size * M * N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(batch_size * M * N, C_fp32, (half*)C);
|
|
} else {
|
|
if (math_type == TensorProto_DataType_FLOAT) {
|
|
// loop over matrices in the batch
|
|
for (int i = 0; i < batch_size; ++i) {
|
|
math::Gemm<float16, CUDAContext>(
|
|
TransA,
|
|
TransB,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A + a_stride * i,
|
|
B + b_stride * i,
|
|
beta,
|
|
C + c_stride * i,
|
|
context);
|
|
}
|
|
} else if (math_type == TensorProto_DataType_FLOAT16) {
|
|
// Note that cublas follows fortran order, so the order is different from
|
|
// the cblas convention.
|
|
const int lda = (TransA == CblasNoTrans) ? K : M;
|
|
const int ldb = (TransB == CblasNoTrans) ? N : K;
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
cublasOperation_t cuTransB =
|
|
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
|
|
// convert alpha, beta from float -> __half
|
|
auto alpha_fp16 = convert::floatToHalf(alpha);
|
|
auto beta_fp16 = convert::floatToHalf(beta);
|
|
CUBLAS_ENFORCE(cublasHgemmStridedBatched(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha_fp16,
|
|
(const __half*)B,
|
|
ldb,
|
|
b_stride,
|
|
(const __half*)A,
|
|
lda,
|
|
a_stride,
|
|
&beta_fp16,
|
|
(__half*)C,
|
|
N,
|
|
c_stride,
|
|
batch_size));
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
|
|
#if CUDA_VERSION >= 9000
|
|
|
|
// No change, but required. Defer to default CUDA engine
|
|
template <>
|
|
void Gemm<float, CUDAContext, TensorCoreEngine>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* B,
|
|
const float beta,
|
|
float* C,
|
|
CUDAContext* context,
|
|
TensorProto::DataType math_type) {
|
|
return Gemm<float, CUDAContext>(
|
|
TransA, TransB, M, N, K, alpha, A, B, beta, C, context, math_type);
|
|
}
|
|
|
|
template <>
|
|
void Gemm<float16, CUDAContext, TensorCoreEngine>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float16* A,
|
|
const float16* B,
|
|
const float beta,
|
|
float16* C,
|
|
CUDAContext* context,
|
|
TensorProto::DataType math_type) {
|
|
// Note that cublas follows fortran order, so the order is different from
|
|
// the cblas convention.
|
|
int lda = (TransA == CblasNoTrans) ? K : M;
|
|
int ldb = (TransB == CblasNoTrans) ? N : K;
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
cublasOperation_t cuTransB =
|
|
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
|
|
// enable TensorCore for this call on this handle
|
|
if (TensorCoreAvailable()) {
|
|
CUBLAS_ENFORCE(
|
|
cublasSetMathMode(context->cublas_handle(), CUBLAS_TENSOR_OP_MATH));
|
|
}
|
|
|
|
CUBLAS_CHECK(cublasGemmEx(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha,
|
|
B,
|
|
CUDA_R_16F,
|
|
ldb,
|
|
A,
|
|
CUDA_R_16F,
|
|
lda,
|
|
&beta,
|
|
C,
|
|
CUDA_R_16F,
|
|
N,
|
|
CUDA_R_32F,
|
|
CUBLAS_GEMM_DFALT_TENSOR_OP));
|
|
|
|
// Now disable TensorCore math for subsequent calls to this handle
|
|
if (TensorCoreAvailable()) {
|
|
CUBLAS_ENFORCE(
|
|
cublasSetMathMode(context->cublas_handle(), CUBLAS_DEFAULT_MATH));
|
|
}
|
|
}
|
|
|
|
template <>
|
|
void GemmBatched<float, CUDAContext, TensorCoreEngine>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int batch_size,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* B,
|
|
const float beta,
|
|
float* C,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch,
|
|
TensorProto::DataType math_type) {
|
|
return GemmBatched<float, CUDAContext, DefaultEngine>(
|
|
TransA,
|
|
TransB,
|
|
batch_size,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A,
|
|
B,
|
|
beta,
|
|
C,
|
|
context,
|
|
scratch,
|
|
math_type);
|
|
}
|
|
|
|
template <>
|
|
void GemmBatched<float16, CUDAContext, TensorCoreEngine>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int batch_size,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float16* A,
|
|
const float16* B,
|
|
const float beta,
|
|
float16* C,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch,
|
|
TensorProto::DataType math_type) {
|
|
return GemmBatched<float16, CUDAContext, DefaultEngine>(
|
|
TransA,
|
|
TransB,
|
|
batch_size,
|
|
M,
|
|
N,
|
|
K,
|
|
alpha,
|
|
A,
|
|
B,
|
|
beta,
|
|
C,
|
|
context,
|
|
scratch,
|
|
math_type);
|
|
}
|
|
|
|
#endif // CUDA_VERSION >= 9000
|
|
|
|
template <>
|
|
void GemmEx<float, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const CBLAS_TRANSPOSE TransB,
|
|
const int M,
|
|
const int N,
|
|
const int K,
|
|
const float alpha,
|
|
const float* A,
|
|
const int lda,
|
|
const float* B,
|
|
const int ldb,
|
|
const float beta,
|
|
float* C,
|
|
const int ldc,
|
|
CUDAContext* context) {
|
|
// Note that cublas follows fortran order, so the order is different from
|
|
// the cblas convention.
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
cublasOperation_t cuTransB =
|
|
(TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
|
|
CUBLAS_ENFORCE(cublasSgemm(
|
|
context->cublas_handle(),
|
|
cuTransB,
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
K,
|
|
&alpha,
|
|
B,
|
|
ldb,
|
|
A,
|
|
lda,
|
|
&beta,
|
|
C,
|
|
ldc));
|
|
}
|
|
|
|
template <>
|
|
void Gemv<float, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const int M,
|
|
const int N,
|
|
const float alpha,
|
|
const float* A,
|
|
const float* x,
|
|
const float beta,
|
|
float* y,
|
|
CUDAContext* context,
|
|
TensorProto::DataType math_type) {
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
|
|
CUBLAS_ENFORCE(cublasSgemv(
|
|
context->cublas_handle(),
|
|
cuTransA,
|
|
N,
|
|
M,
|
|
&alpha,
|
|
A,
|
|
N,
|
|
x,
|
|
1,
|
|
&beta,
|
|
y,
|
|
1));
|
|
}
|
|
|
|
// Batched Add variants
|
|
namespace {
|
|
|
|
template <typename T>
|
|
__global__ void AddStripedBatchKernel(
|
|
const int N,
|
|
const T* first,
|
|
T* Y,
|
|
const int stripe,
|
|
const int batch) {
|
|
for (int j = 0; j < batch; j++) {
|
|
const T* x = first + j * stripe;
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
float tmpY = convert::To<T, float>(Y[i]);
|
|
tmpY += convert::To<T, float>(x[i]);
|
|
Y[i] = convert::To<float, T>(tmpY);
|
|
}
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(T) \
|
|
template <> \
|
|
void AddStripedBatch<T, CUDAContext>( \
|
|
const int N, \
|
|
const T* first, \
|
|
T* Y, \
|
|
const int stripe, \
|
|
const int batch, \
|
|
CUDAContext* context) { \
|
|
AddStripedBatchKernel<T> \
|
|
<<<CAFFE_GET_BLOCKS(N), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, first, Y, stripe, batch); \
|
|
}
|
|
|
|
CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float);
|
|
CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH(float16);
|
|
#undef CAFFE2_SPECIALIZED_CUDA_ADD_STRIPED_BATCH
|
|
|
|
template <>
|
|
void Gemv<float16, CUDAContext>(
|
|
const CBLAS_TRANSPOSE TransA,
|
|
const int M,
|
|
const int N,
|
|
const float alpha,
|
|
const float16* A,
|
|
const float16* x,
|
|
const float beta,
|
|
float16* y,
|
|
CUDAContext* context,
|
|
TensorProto::DataType math_type) {
|
|
cublasOperation_t cuTransA =
|
|
(TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
|
|
|
|
// sort out what we need to call cublasSgemmEx / cublasHgemm
|
|
int m = (cuTransA == CUBLAS_OP_N) ? N : M;
|
|
int k = (cuTransA == CUBLAS_OP_N) ? M : N;
|
|
int LDA = (cuTransA == CUBLAS_OP_N) ? m : k;
|
|
int LDC = m;
|
|
|
|
if (math_type == TensorProto_DataType_FLOAT) {
|
|
CUBLAS_CHECK(cublasSgemmEx(
|
|
context->cublas_handle(),
|
|
cuTransA,
|
|
CUBLAS_OP_N,
|
|
m,
|
|
1,
|
|
k,
|
|
&alpha,
|
|
A,
|
|
CUDA_R_16F,
|
|
LDA,
|
|
x,
|
|
CUDA_R_16F,
|
|
k,
|
|
&beta,
|
|
y,
|
|
CUDA_R_16F,
|
|
LDC));
|
|
} else if (math_type == TensorProto_DataType_FLOAT16) {
|
|
auto alpha_fp16 = convert::floatToHalf(alpha);
|
|
auto beta_fp16 = convert::floatToHalf(beta);
|
|
|
|
CUBLAS_CHECK(cublasHgemm(
|
|
context->cublas_handle(),
|
|
cuTransA,
|
|
CUBLAS_OP_N,
|
|
m,
|
|
1,
|
|
k,
|
|
&alpha_fp16,
|
|
(const __half*)A,
|
|
LDA,
|
|
(const __half*)x,
|
|
k,
|
|
&beta_fp16,
|
|
(__half*)y,
|
|
LDC));
|
|
} else {
|
|
// fail
|
|
CAFFE_THROW("Unsupported math type");
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
template <typename T>
|
|
__global__ void SetKernel(const int N, const T alpha, T* Y) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
Y[i] = alpha;
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_SET(T) \
|
|
template <> \
|
|
void Set<T, CUDAContext>( \
|
|
const size_t N, const T alpha, T* Y, CUDAContext* context) { \
|
|
SetKernel<<< \
|
|
CAFFE_GET_BLOCKS(N), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>(N, alpha, Y); \
|
|
}
|
|
|
|
CAFFE2_SPECIALIZED_CUDA_SET(float);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(double);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(bool);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(int8_t);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(int16_t);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(float16);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(int);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(int64_t);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(char);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(uint8_t);
|
|
CAFFE2_SPECIALIZED_CUDA_SET(uint16_t);
|
|
#undef CAFFE2_SPECIALIZED_CUDA_SET
|
|
|
|
namespace {
|
|
template <typename T>
|
|
__global__ void
|
|
UniformShift(const size_t N, const float min, const float max, T* x) {
|
|
float scale = max - min;
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
x[i] = convert::To<float, T>(convert::To<T, float>(x[i]) * scale + min);
|
|
}
|
|
}
|
|
|
|
__global__ void
|
|
UniformIntFit(const size_t N, const int min, const int max, unsigned int* x) {
|
|
int* x_int = reinterpret_cast<int*>(x);
|
|
int range = (max - min + 1);
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
x_int[i] = min + static_cast<int>(x[i] % range);
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
void RandUniform<float, CUDAContext>(
|
|
const size_t n,
|
|
const float min,
|
|
const float max,
|
|
float* r,
|
|
CUDAContext* context) {
|
|
CURAND_ENFORCE(curandGenerateUniform(context->curand_generator(), r, n));
|
|
UniformShift<float>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, min, max, r);
|
|
}
|
|
|
|
template <>
|
|
void RandUniform<double, CUDAContext>(
|
|
const size_t n,
|
|
const double min,
|
|
const double max,
|
|
double* r,
|
|
CUDAContext* context) {
|
|
CURAND_ENFORCE(
|
|
curandGenerateUniformDouble(context->curand_generator(), r, n));
|
|
UniformShift<double>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, min, max, r);
|
|
}
|
|
|
|
template <>
|
|
void RandUniform<int, CUDAContext>(
|
|
const size_t n,
|
|
const int min,
|
|
const int max,
|
|
int* r,
|
|
CUDAContext* context) {
|
|
CURAND_ENFORCE(curandGenerate(
|
|
context->curand_generator(), reinterpret_cast<unsigned int*>(r), n));
|
|
UniformIntFit<<<
|
|
CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
n, min, max, reinterpret_cast<unsigned int*>(r));
|
|
}
|
|
|
|
template <typename T>
|
|
size_t HandleOddLengthRandGaussian(
|
|
const size_t n,
|
|
const T mean,
|
|
const T std,
|
|
T* r,
|
|
CUDAContext* context) {
|
|
if (n % 2 == 1) {
|
|
std::default_random_engine generator;
|
|
std::normal_distribution<T> distribution(mean, std);
|
|
const T random_value = distribution(generator);
|
|
math::Set<T, CUDAContext>(1, random_value, r + (n - 1), context);
|
|
return n - 1;
|
|
}
|
|
return n;
|
|
}
|
|
|
|
template <>
|
|
void RandGaussian<float, CUDAContext>(
|
|
const size_t n,
|
|
const float mean,
|
|
const float std,
|
|
float* r,
|
|
CUDAContext* context) {
|
|
// If n is odd, we add a random Gaussian value at the end manually
|
|
// and generate n-1 random values using curandGenerateNormal.
|
|
// curandGenerateNormal requires n to be even.
|
|
const size_t even_n =
|
|
HandleOddLengthRandGaussian<float>(n, mean, std, r, context);
|
|
CURAND_ENFORCE(
|
|
curandGenerateNormal(context->curand_generator(), r, even_n, mean, std));
|
|
}
|
|
|
|
template <>
|
|
void RandGaussian<double, CUDAContext>(
|
|
const size_t n,
|
|
const double mean,
|
|
const double std,
|
|
double* r,
|
|
CUDAContext* context) {
|
|
const size_t even_n =
|
|
HandleOddLengthRandGaussian<double>(n, mean, std, r, context);
|
|
CURAND_ENFORCE(curandGenerateNormalDouble(
|
|
context->curand_generator(), r, even_n, mean, std));
|
|
}
|
|
|
|
template <>
|
|
void Dot<float, CUDAContext>(
|
|
const int n,
|
|
const float* a,
|
|
const float* b,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
float result;
|
|
CUBLAS_ENFORCE(cublasSdot(context->cublas_handle(), n, a, 1, b, 1, &result));
|
|
context->Copy<float, CPUContext, CUDAContext>(1, &result, y);
|
|
}
|
|
|
|
template <>
|
|
void Dot<float16, CUDAContext>(
|
|
const int n,
|
|
const float16* a,
|
|
const float16* b,
|
|
float16* y,
|
|
CUDAContext* context) {
|
|
float16 result;
|
|
// execute with 32-bit math
|
|
CUBLAS_CHECK(cublasDotEx(
|
|
context->cublas_handle(),
|
|
n,
|
|
a,
|
|
CUDA_R_16F,
|
|
1,
|
|
b,
|
|
CUDA_R_16F,
|
|
1,
|
|
&result,
|
|
CUDA_R_16F,
|
|
CUDA_R_32F));
|
|
context->Copy<float16, CPUContext, CUDAContext>(1, &result, y);
|
|
}
|
|
|
|
// A previous version of caffe2 used Thrust but it turns out that thrust
|
|
// reduction has an implicit scratch space allocation and deallocation, which
|
|
// may interfere with NCCL and create a deadlock. Hence we are using a custom
|
|
// reduction here.
|
|
#define SUM_KERNEL_NTHREADS 128
|
|
template <typename T>
|
|
__global__ void SumKernel(const int N, const T* X, T* Y, bool square) {
|
|
const int idx = threadIdx.x;
|
|
__shared__ float reduction_buffer[SUM_KERNEL_NTHREADS];
|
|
|
|
reduction_buffer[idx] = 0;
|
|
|
|
// A multilevel reduction.
|
|
// N -> 128
|
|
if (!square) {
|
|
for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
|
|
reduction_buffer[idx] += convert::To<T, float>(X[i]);
|
|
}
|
|
} else {
|
|
for (int i = idx; i < N; i += SUM_KERNEL_NTHREADS) {
|
|
float Xi = convert::To<T, float>(X[i]);
|
|
reduction_buffer[idx] += Xi * Xi;
|
|
}
|
|
}
|
|
__syncthreads();
|
|
// 128 -> 32
|
|
if (idx < 32) {
|
|
reduction_buffer[idx] += reduction_buffer[idx + 32] +
|
|
reduction_buffer[idx + 64] + reduction_buffer[idx + 96];
|
|
}
|
|
__syncthreads();
|
|
// 32 -> 1
|
|
if (idx == 0) {
|
|
float tmp = 0;
|
|
for (int i = 0; i < 32; ++i) {
|
|
tmp += reduction_buffer[i];
|
|
}
|
|
*Y = convert::To<float, T>(tmp);
|
|
}
|
|
}
|
|
|
|
// According to the benchmarks script
|
|
// caffe2/caffe2/experiments/python/device_reduce_sum_bench.py,
|
|
// device reduce is slower for N <= 10000.
|
|
#define DEVICE_REDUCE_SIZE_THRESHOLD 10000
|
|
|
|
namespace {
|
|
|
|
template <typename T>
|
|
__global__ void SumConvertKernel(float* sum, T* dest) {
|
|
*dest = convert::To<float, T>(*sum);
|
|
}
|
|
|
|
template <typename T, typename IterT>
|
|
void SumGenericIter(
|
|
const int N,
|
|
IterT it,
|
|
T*& dest,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch_ptr) {
|
|
size_t memRequired = 0;
|
|
cub::DeviceReduce::Sum(
|
|
nullptr, memRequired, it, dest, N, context->cuda_stream());
|
|
auto buffer_size =
|
|
static_cast<TIndex>((memRequired + sizeof(T) - 1) / sizeof(T));
|
|
if (!dest) {
|
|
// allocate one more T at the end of scratch for dest
|
|
scratch_ptr->Resize(std::vector<TIndex>{buffer_size + 1});
|
|
dest = scratch_ptr->template mutable_data<T>() + buffer_size;
|
|
} else {
|
|
scratch_ptr->Resize(std::vector<TIndex>{buffer_size});
|
|
}
|
|
cub::DeviceReduce::Sum(
|
|
static_cast<void*>(scratch_ptr->template mutable_data<T>()),
|
|
memRequired,
|
|
it,
|
|
dest,
|
|
N,
|
|
context->cuda_stream());
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
void Sum<float, CUDAContext>(
|
|
const int N,
|
|
const float* x,
|
|
float* y,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch_ptr) {
|
|
if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
|
|
SumGenericIter<float>(N, x, y, context, scratch_ptr);
|
|
} else {
|
|
SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(
|
|
N, x, y, false);
|
|
}
|
|
}
|
|
|
|
template <>
|
|
void Sum<int32_t, CUDAContext>(
|
|
const int N,
|
|
const int32_t* x,
|
|
int32_t* y,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch_ptr) {
|
|
if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
|
|
SumGenericIter<int32_t>(N, x, y, context, scratch_ptr);
|
|
} else {
|
|
SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(
|
|
N, x, y, false);
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
template <typename T>
|
|
struct FloatTransform {
|
|
inline __host__ __device__ float operator()(const T v) const {
|
|
return convert::To<T, float>(v);
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
#define CAFFE2_MATH_SUM_FUNC(T) \
|
|
template <> \
|
|
void Sum<T, CUDAContext>( \
|
|
const int N, \
|
|
const T* x, \
|
|
T* y, \
|
|
CUDAContext* context, \
|
|
Tensor<CUDAContext>* scratch_ptr) { \
|
|
if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \
|
|
FloatTransform<T> transform; \
|
|
cub::TransformInputIterator<float, FloatTransform<T>, const T*> it( \
|
|
x, transform); \
|
|
float* sum = nullptr; \
|
|
SumGenericIter<float>(N, it, sum, context, scratch_ptr); \
|
|
SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \
|
|
} else { \
|
|
SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \
|
|
N, x, y, false); \
|
|
} \
|
|
}
|
|
|
|
CAFFE2_MATH_SUM_FUNC(float16)
|
|
#undef CAFFE2_MATH_SUM_FUNC
|
|
|
|
namespace {
|
|
template <typename T>
|
|
struct SqrTransform {
|
|
inline __host__ __device__ T operator()(const T v) const {
|
|
return v * v;
|
|
}
|
|
};
|
|
} // namespace
|
|
|
|
template <>
|
|
void SumSqr<float, CUDAContext>(
|
|
const int N,
|
|
const float* x,
|
|
float* y,
|
|
CUDAContext* context,
|
|
Tensor<CUDAContext>* scratch_ptr) {
|
|
if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) {
|
|
SqrTransform<float> transform;
|
|
cub::TransformInputIterator<float, SqrTransform<float>, const float*> it(
|
|
x, transform);
|
|
SumGenericIter<float>(N, it, y, context, scratch_ptr);
|
|
} else {
|
|
SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>(
|
|
N, x, y, true);
|
|
}
|
|
}
|
|
|
|
#define CAFFE2_MATH_SUMSQR_FUNC(T) \
|
|
template <> \
|
|
void SumSqr<T, CUDAContext>( \
|
|
const int N, \
|
|
const T* x, \
|
|
T* y, \
|
|
CUDAContext* context, \
|
|
Tensor<CUDAContext>* scratch_ptr) { \
|
|
if (scratch_ptr && N > DEVICE_REDUCE_SIZE_THRESHOLD) { \
|
|
FloatTransform<T> float_transform; \
|
|
cub::TransformInputIterator<float, FloatTransform<T>, const T*> \
|
|
float_it(x, float_transform); \
|
|
SqrTransform<float> sqr_transform; \
|
|
cub::TransformInputIterator< \
|
|
float, \
|
|
SqrTransform<float>, \
|
|
decltype(float_it)> \
|
|
it(float_it, sqr_transform); \
|
|
float* sum = nullptr; \
|
|
SumGenericIter<float>(N, it, sum, context, scratch_ptr); \
|
|
SumConvertKernel<<<1, 1, 0, context->cuda_stream()>>>(sum, y); \
|
|
} else { \
|
|
SumKernel<<<1, SUM_KERNEL_NTHREADS, 0, context->cuda_stream()>>>( \
|
|
N, x, y, true); \
|
|
} \
|
|
}
|
|
|
|
CAFFE2_MATH_SUMSQR_FUNC(float16)
|
|
#undef CAFFE2_MATH_SUMSQR_FUNC
|
|
#undef DEVICE_REDUCE_SIZE_THRESHOLD
|
|
|
|
namespace {
|
|
template <typename T>
|
|
__global__ void
|
|
SelectKernel(const int N, const int D, const T* x, const int* idx, T* y) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
y[i] = x[i * D + idx[i]];
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
void Select<float, CUDAContext>(
|
|
const int N,
|
|
const int D,
|
|
const float* x,
|
|
const int* idx,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
SelectKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(N, D, x, idx, y);
|
|
}
|
|
|
|
template <>
|
|
void Select<float16, CUDAContext>(
|
|
const int N,
|
|
const int D,
|
|
const float16* x,
|
|
const int* idx,
|
|
float16* y,
|
|
CUDAContext* context) {
|
|
SelectKernel<float16>
|
|
<<<CAFFE_GET_BLOCKS(N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(N, D, x, idx, y);
|
|
}
|
|
|
|
namespace {
|
|
template <typename T>
|
|
__global__ void ScaleKernel(const int n, const float alpha, const T* x, T* y) {
|
|
CUDA_1D_KERNEL_LOOP(i, n) {
|
|
// y[i] = convert::To<float,T>(convert::To<T, float>(x[i]) * alpha);
|
|
y[i] = convert::Get<T>(convert::Get<float>(x[i]) * alpha);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
__global__ void
|
|
ScaleKernelDeviceAlpha(const int n, const float* alpha, const T* x, T* y) {
|
|
CUDA_1D_KERNEL_LOOP(i, n) {
|
|
y[i] = x[i] * (*alpha);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
__global__ void PowKernel(const int n, const T* x, const T exponent, T* y) {
|
|
CUDA_1D_KERNEL_LOOP(i, n) {
|
|
y[i] = powf(x[i], exponent);
|
|
}
|
|
}
|
|
|
|
// fp16 specialization
|
|
template <>
|
|
__global__ void ScaleKernelDeviceAlpha(
|
|
const int n,
|
|
const float* alpha,
|
|
const float16* x,
|
|
float16* y) {
|
|
CUDA_1D_KERNEL_LOOP(i, n) {
|
|
y[i] = convert::To<float, float16>(
|
|
convert::To<float16, float>(x[i]) * (*alpha));
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
template <>
|
|
void Powx<float, CUDAContext>(
|
|
const int N,
|
|
const float* a,
|
|
const float b,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
PowKernel<<<
|
|
CAFFE_GET_BLOCKS(N),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(N, a, b, y);
|
|
}
|
|
|
|
template <>
|
|
void Scale<float, CUDAContext>(
|
|
const int n,
|
|
const float alpha,
|
|
const float* x,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
ScaleKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, alpha, x, y);
|
|
}
|
|
|
|
template <>
|
|
void Scale<float16, CUDAContext>(
|
|
const int n,
|
|
const float alpha,
|
|
const float16* x,
|
|
float16* y,
|
|
CUDAContext* context) {
|
|
ScaleKernel<float16>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, alpha, x, y);
|
|
}
|
|
|
|
template <>
|
|
void Scale<float, CUDAContext>(
|
|
const int n,
|
|
const float* alpha,
|
|
const float* x,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
ScaleKernelDeviceAlpha<float>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, alpha, x, y);
|
|
}
|
|
|
|
template <>
|
|
void Scale<float16, CUDAContext>(
|
|
const int n,
|
|
const float* alpha,
|
|
const float16* x,
|
|
float16* y,
|
|
CUDAContext* context) {
|
|
ScaleKernelDeviceAlpha<float16>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, alpha, x, y);
|
|
}
|
|
|
|
template <>
|
|
void Axpy<float, CUDAContext>(
|
|
const int N,
|
|
const float alpha,
|
|
const float* X,
|
|
float* Y,
|
|
CUDAContext* context) {
|
|
CUBLAS_ENFORCE(cublasSaxpy(context->cublas_handle(), N, &alpha, X, 1, Y, 1));
|
|
}
|
|
|
|
template <>
|
|
void Axpy<double, CUDAContext>(
|
|
const int N,
|
|
const float alpha,
|
|
const double* X,
|
|
double* Y,
|
|
CUDAContext* context) {
|
|
double alpha_d{alpha};
|
|
CUBLAS_ENFORCE(
|
|
cublasDaxpy(context->cublas_handle(), N, &alpha_d, X, 1, Y, 1));
|
|
}
|
|
|
|
template <>
|
|
void Axpy<float16, CUDAContext>(
|
|
const int N,
|
|
const float alpha,
|
|
const float16* X,
|
|
float16* Y,
|
|
CUDAContext* context) {
|
|
CUBLAS_CHECK(cublasAxpyEx(
|
|
context->cublas_handle(),
|
|
N,
|
|
&alpha,
|
|
CUDA_R_16F,
|
|
X,
|
|
CUDA_R_16F,
|
|
1,
|
|
Y,
|
|
CUDA_R_16F,
|
|
1,
|
|
CUDA_R_32F));
|
|
}
|
|
|
|
namespace {
|
|
template <typename T>
|
|
__global__ void AxpyKernel(const int n, const float* a, const T* x, T* y) {
|
|
CUDA_1D_KERNEL_LOOP(index, n) {
|
|
y[index] = convert::Get<T>(
|
|
convert::Get<float>(x[index]) * (*a) + convert::Get<float>(y[index]));
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
void Axpy<float, CUDAContext>(
|
|
const int n,
|
|
const float* alpha,
|
|
const float* X,
|
|
float* Y,
|
|
CUDAContext* context) {
|
|
AxpyKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, alpha, X, Y);
|
|
}
|
|
|
|
template <>
|
|
void Axpy<float16, CUDAContext>(
|
|
const int n,
|
|
const float* alpha,
|
|
const float16* X,
|
|
float16* Y,
|
|
CUDAContext* context) {
|
|
AxpyKernel<float16>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, alpha, X, Y);
|
|
}
|
|
|
|
namespace {
|
|
template <typename T>
|
|
__global__ void
|
|
AxpbyKernel(const int n, const T a, const T* x, const T b, T* y) {
|
|
CUDA_1D_KERNEL_LOOP(index, n) {
|
|
y[index] = x[index] * a + y[index] * b;
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
void Axpby<float, CUDAContext>(
|
|
const int n,
|
|
const float a,
|
|
const float* x,
|
|
const float b,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
AxpbyKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(n),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(n, a, x, b, y);
|
|
}
|
|
|
|
namespace {
|
|
|
|
template <typename T>
|
|
__global__ void Im2ColNCHWCUDAKernel(
|
|
const int n,
|
|
const int input_h,
|
|
const int input_w,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const int output_h,
|
|
const int output_w,
|
|
const T* img_data,
|
|
T* col_data) {
|
|
CUDA_1D_KERNEL_LOOP(index, n) {
|
|
const int w_out = index % output_w;
|
|
const int h_index = index / output_w;
|
|
const int h_out = h_index % output_h;
|
|
const int channel_in = h_index / output_h;
|
|
const int channel_out = channel_in * kernel_h * kernel_w;
|
|
const int h_in = h_out * stride_h - pad_t;
|
|
const int w_in = w_out * stride_w - pad_l;
|
|
const int output_size = output_h * output_w;
|
|
T* col_data_ptr =
|
|
col_data + (channel_out * output_h + h_out) * output_w + w_out;
|
|
const T* img_data_ptr =
|
|
img_data + (channel_in * input_h + h_in) * input_w + w_in;
|
|
int dh = 0;
|
|
for (int i = 0; i < kernel_h; ++i) {
|
|
int dw = 0;
|
|
for (int j = 0; j < kernel_w; ++j) {
|
|
const int h = h_in + dh;
|
|
const int w = w_in + dw;
|
|
#if __CUDA_ARCH__ >= 350
|
|
*col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
|
|
? __ldg(img_data_ptr + dh * input_w + dw)
|
|
: 0;
|
|
#else
|
|
*col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
|
|
? img_data_ptr[dh * input_w + dw]
|
|
: 0;
|
|
#endif
|
|
col_data_ptr += output_size;
|
|
dw += dilation_w;
|
|
}
|
|
dh += dilation_h;
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
__global__ void Im2ColNHWCCUDAKernel(
|
|
const int n,
|
|
const int input_h,
|
|
const int input_w,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const int output_w,
|
|
const int channels,
|
|
const T* img_data,
|
|
T* col_data) {
|
|
CUDA_1D_KERNEL_LOOP(index, n) {
|
|
const int channel_in = index % channels;
|
|
const int w_out = index / channels % output_w;
|
|
const int h_out = index / channels / output_w;
|
|
const int h_in = h_out * stride_h - pad_t;
|
|
const int w_in = w_out * stride_w - pad_l;
|
|
T* col_data_ptr = col_data +
|
|
(h_out * output_w + w_out) * channels * kernel_h * kernel_w +
|
|
channel_in;
|
|
int dh = 0;
|
|
for (int i = 0; i < kernel_h; ++i) {
|
|
int dw = 0;
|
|
for (int j = 0; j < kernel_w; ++j) {
|
|
const int h = h_in + dh;
|
|
const int w = w_in + dw;
|
|
#if __CUDA_ARCH__ >= 350
|
|
*col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
|
|
? __ldg(img_data + (h * input_w + w) * channels + channel_in)
|
|
: 0;
|
|
#else
|
|
*col_data_ptr = (h >= 0 && w >= 0 && h < input_h && w < input_w)
|
|
? img_data[(h * input_w + w) * channels + channel_in]
|
|
: 0;
|
|
#endif
|
|
col_data_ptr += channels;
|
|
dw += dilation_w;
|
|
}
|
|
dh += dilation_h;
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
__global__ void Col2ImNCHWCUDAKernel(
|
|
const int n,
|
|
const int input_h,
|
|
const int input_w,
|
|
const int patch_h,
|
|
const int patch_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const int output_h,
|
|
const int output_w,
|
|
const T* col_data,
|
|
T* img_data) {
|
|
const int dpatch_h = dilation_h * (patch_h - 1) + 1;
|
|
const int dpatch_w = dilation_w * (patch_w - 1) + 1;
|
|
|
|
CUDA_1D_KERNEL_LOOP(index, n) {
|
|
T val = 0;
|
|
const int w = index % input_w + pad_l;
|
|
const int h = index / input_w % input_h + pad_t;
|
|
const int c = index / (input_h * input_w);
|
|
|
|
// compute the start and end of the output
|
|
const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1;
|
|
const int w_col_end = min(w / stride_w + 1, output_w);
|
|
const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1;
|
|
const int h_col_end = min(h / stride_h + 1, output_h);
|
|
|
|
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
|
|
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
|
|
int h_k = (h - h_col * stride_h);
|
|
int w_k = (w - w_col * stride_w);
|
|
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
|
|
h_k /= dilation_h;
|
|
w_k /= dilation_w;
|
|
const int col_data_index =
|
|
(((c * patch_h + h_k) * patch_w + w_k) * output_h + h_col) *
|
|
output_w +
|
|
w_col;
|
|
#if __CUDA_ARCH__ >= 350
|
|
val += __ldg(col_data + col_data_index);
|
|
#else
|
|
val += col_data[col_data_index];
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
img_data[index] = val;
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
__global__ void Col2ImNHWCCUDAKernel(
|
|
const int n,
|
|
const int input_w,
|
|
const int channels,
|
|
const int patch_h,
|
|
const int patch_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const int output_h,
|
|
const int output_w,
|
|
const T* col_data,
|
|
T* img_data) {
|
|
const int dpatch_h = dilation_h * (patch_h - 1) + 1;
|
|
const int dpatch_w = dilation_w * (patch_w - 1) + 1;
|
|
|
|
CUDA_1D_KERNEL_LOOP(index, n) {
|
|
T val = 0;
|
|
const int c = index % channels;
|
|
const int w = index / channels % input_w + pad_l;
|
|
const int h = index / channels / input_w + pad_t;
|
|
// compute the start and end of the output
|
|
const int w_col_start = (w < dpatch_w) ? 0 : (w - dpatch_w) / stride_w + 1;
|
|
const int w_col_end = min(w / stride_w + 1, output_w);
|
|
const int h_col_start = (h < dpatch_h) ? 0 : (h - dpatch_h) / stride_h + 1;
|
|
const int h_col_end = min(h / stride_h + 1, output_h);
|
|
const int channels_col = patch_h * patch_w * channels;
|
|
|
|
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
|
|
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
|
|
int h_k = h - h_col * stride_h;
|
|
int w_k = w - w_col * stride_w;
|
|
if (h_k % dilation_h == 0 && w_k % dilation_w == 0) {
|
|
h_k /= dilation_h;
|
|
w_k /= dilation_w;
|
|
const int c_col = (h_k * patch_w + w_k) * channels + c;
|
|
#if __CUDA_ARCH__ >= 350
|
|
val += __ldg(
|
|
col_data + (h_col * output_w + w_col) * channels_col + c_col);
|
|
#else
|
|
val += col_data[(h_col * output_w + w_col) * channels_col + c_col];
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
img_data[index] = val;
|
|
}
|
|
}
|
|
|
|
template <typename T, int N, bool kCol2Im>
|
|
__global__ void Im2ColNdNCHWCUDAKernel(
|
|
const int outer_size,
|
|
const int inner_size,
|
|
const int kernel_size,
|
|
SimpleArray<int, N + 1> img_shape,
|
|
SimpleArray<int, N + 1> col_shape,
|
|
SimpleArray<int, N> kernel_shape,
|
|
SimpleArray<int, N> stride,
|
|
SimpleArray<int, N> dilation,
|
|
SimpleArray<int, N> pad,
|
|
const T* X_data,
|
|
T* Y_data) {
|
|
int d_offset[N];
|
|
int d_iter[N];
|
|
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
|
|
int offset_i = i;
|
|
#pragma unroll
|
|
for (int d_i = N - 1; d_i >= 0; --d_i) {
|
|
d_offset[d_i] = offset_i % kernel_shape.data[d_i];
|
|
offset_i /= kernel_shape.data[d_i];
|
|
}
|
|
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
|
|
int offset_j = j;
|
|
#pragma unroll
|
|
for (int d_i = N - 1; d_i >= 0; --d_i) {
|
|
d_iter[d_i] = offset_j % col_shape.data[d_i + 1];
|
|
offset_j /= col_shape.data[d_i + 1];
|
|
}
|
|
const int col_index = i * inner_size + j;
|
|
int img_index = i / kernel_size;
|
|
bool is_padding = false;
|
|
#pragma unroll
|
|
for (int d_i = 0; d_i < N; ++d_i) {
|
|
const int d_img = d_iter[d_i] * stride.data[d_i] - pad.data[d_i] +
|
|
d_offset[d_i] * dilation.data[d_i];
|
|
is_padding |= d_img < 0 || d_img >= img_shape.data[d_i + 1];
|
|
img_index = img_index * img_shape.data[d_i + 1] + d_img;
|
|
}
|
|
#if __CUDA_ARCH__ >= 350
|
|
if (!kCol2Im) {
|
|
Y_data[col_index] = is_padding ? 0 : __ldg(X_data + img_index);
|
|
} else if (!is_padding) {
|
|
atomicAdd(Y_data + img_index, __ldg(X_data + col_index));
|
|
}
|
|
#else
|
|
if (!kCol2Im) {
|
|
Y_data[col_index] = is_padding ? 0 : X_data[img_index];
|
|
} else if (!is_padding) {
|
|
atomicAdd(Y_data + img_index, X_data[col_index]);
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
|
|
template <typename T, int N>
|
|
void Im2ColNdNCHWCUDAImpl(
|
|
const int img_size,
|
|
const int col_size,
|
|
const int* img_shape,
|
|
const int* col_shape,
|
|
const int* kernel_shape,
|
|
const int* stride,
|
|
const int* dilation,
|
|
const int* pad,
|
|
const float* img_data,
|
|
float* col_data,
|
|
CUDAContext* context) {
|
|
const int outer_size = col_shape[0];
|
|
const int inner_size = col_size / outer_size;
|
|
const int kernel_size = std::accumulate(
|
|
kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
|
|
SimpleArray<int, N + 1> img_shape_array;
|
|
SimpleArray<int, N + 1> col_shape_array;
|
|
SimpleArray<int, N> kernel_shape_array;
|
|
SimpleArray<int, N> stride_array;
|
|
SimpleArray<int, N> dilation_array;
|
|
SimpleArray<int, N> pad_array;
|
|
std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int));
|
|
std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int));
|
|
std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int));
|
|
std::memcpy(stride_array.data, stride, N * sizeof(int));
|
|
std::memcpy(dilation_array.data, dilation, N * sizeof(int));
|
|
std::memcpy(pad_array.data, pad, N * sizeof(int));
|
|
Im2ColNdNCHWCUDAKernel<T, N, false>
|
|
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
outer_size,
|
|
inner_size,
|
|
kernel_size,
|
|
img_shape_array,
|
|
col_shape_array,
|
|
kernel_shape_array,
|
|
stride_array,
|
|
dilation_array,
|
|
pad_array,
|
|
img_data,
|
|
col_data);
|
|
}
|
|
|
|
template <typename T, int N>
|
|
void Col2ImNdNCHWCUDAImpl(
|
|
const int img_size,
|
|
const int col_size,
|
|
const int* img_shape,
|
|
const int* col_shape,
|
|
const int* kernel_shape,
|
|
const int* stride,
|
|
const int* dilation,
|
|
const int* pad,
|
|
const float* col_data,
|
|
float* img_data,
|
|
CUDAContext* context) {
|
|
const int outer_size = col_shape[0];
|
|
const int inner_size = col_size / outer_size;
|
|
const int kernel_size = std::accumulate(
|
|
kernel_shape, kernel_shape + N, 1, std::multiplies<int>());
|
|
SimpleArray<int, N + 1> img_shape_array;
|
|
SimpleArray<int, N + 1> col_shape_array;
|
|
SimpleArray<int, N> kernel_shape_array;
|
|
SimpleArray<int, N> stride_array;
|
|
SimpleArray<int, N> dilation_array;
|
|
SimpleArray<int, N> pad_array;
|
|
std::memcpy(img_shape_array.data, img_shape, (N + 1) * sizeof(int));
|
|
std::memcpy(col_shape_array.data, col_shape, (N + 1) * sizeof(int));
|
|
std::memcpy(kernel_shape_array.data, kernel_shape, N * sizeof(int));
|
|
std::memcpy(stride_array.data, stride, N * sizeof(int));
|
|
std::memcpy(dilation_array.data, dilation, N * sizeof(int));
|
|
std::memcpy(pad_array.data, pad, N * sizeof(int));
|
|
Set<T, CUDAContext>(img_size, 0, img_data, context);
|
|
Im2ColNdNCHWCUDAKernel<T, N, true>
|
|
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
outer_size,
|
|
inner_size,
|
|
kernel_size,
|
|
img_shape_array,
|
|
col_shape_array,
|
|
kernel_shape_array,
|
|
stride_array,
|
|
dilation_array,
|
|
pad_array,
|
|
col_data,
|
|
img_data);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
template <>
|
|
void Im2Col<float, CUDAContext, StorageOrder::NCHW>(
|
|
const int channels,
|
|
const int height,
|
|
const int width,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int pad_b,
|
|
const int pad_r,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const float* img_data,
|
|
float* col_data,
|
|
CUDAContext* context) {
|
|
const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
|
|
const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
|
|
const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
|
|
const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
|
|
const int num_kernels = channels * output_h * output_w;
|
|
Im2ColNCHWCUDAKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(num_kernels),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
num_kernels,
|
|
height,
|
|
width,
|
|
kernel_h,
|
|
kernel_w,
|
|
dilation_h,
|
|
dilation_w,
|
|
pad_t,
|
|
pad_l,
|
|
stride_h,
|
|
stride_w,
|
|
output_h,
|
|
output_w,
|
|
img_data,
|
|
col_data);
|
|
}
|
|
|
|
template <>
|
|
void Im2Col<float, CUDAContext, StorageOrder::NHWC>(
|
|
const int channels,
|
|
const int height,
|
|
const int width,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int pad_b,
|
|
const int pad_r,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const float* img_data,
|
|
float* col_data,
|
|
CUDAContext* context) {
|
|
const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
|
|
const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
|
|
const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
|
|
const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
|
|
const int num_kernels = output_h * output_w * channels;
|
|
Im2ColNHWCCUDAKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(num_kernels),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
num_kernels,
|
|
height,
|
|
width,
|
|
kernel_h,
|
|
kernel_w,
|
|
dilation_h,
|
|
dilation_w,
|
|
pad_t,
|
|
pad_l,
|
|
stride_h,
|
|
stride_w,
|
|
output_w,
|
|
channels,
|
|
img_data,
|
|
col_data);
|
|
}
|
|
|
|
template <>
|
|
void Col2Im<float, CUDAContext, StorageOrder::NCHW>(
|
|
const int channels,
|
|
const int height,
|
|
const int width,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int pad_b,
|
|
const int pad_r,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const float* col_data,
|
|
float* img_data,
|
|
CUDAContext* context) {
|
|
const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
|
|
const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
|
|
const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
|
|
const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
|
|
const int num_kernels = channels * height * width;
|
|
Col2ImNCHWCUDAKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(num_kernels),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
num_kernels,
|
|
height,
|
|
width,
|
|
kernel_h,
|
|
kernel_w,
|
|
dilation_h,
|
|
dilation_w,
|
|
pad_t,
|
|
pad_l,
|
|
stride_h,
|
|
stride_w,
|
|
output_h,
|
|
output_w,
|
|
col_data,
|
|
img_data);
|
|
}
|
|
|
|
template <>
|
|
void Col2Im<float, CUDAContext, StorageOrder::NHWC>(
|
|
const int channels,
|
|
const int height,
|
|
const int width,
|
|
const int kernel_h,
|
|
const int kernel_w,
|
|
const int dilation_h,
|
|
const int dilation_w,
|
|
const int pad_t,
|
|
const int pad_l,
|
|
const int pad_b,
|
|
const int pad_r,
|
|
const int stride_h,
|
|
const int stride_w,
|
|
const float* col_data,
|
|
float* img_data,
|
|
CUDAContext* context) {
|
|
const int dkernel_h = dilation_h * (kernel_h - 1) + 1;
|
|
const int dkernel_w = dilation_w * (kernel_w - 1) + 1;
|
|
const int output_h = (height + pad_t + pad_b - dkernel_h) / stride_h + 1;
|
|
const int output_w = (width + pad_l + pad_r - dkernel_w) / stride_w + 1;
|
|
const int num_kernels = height * width * channels;
|
|
Col2ImNHWCCUDAKernel<float>
|
|
<<<CAFFE_GET_BLOCKS(num_kernels),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
num_kernels,
|
|
width,
|
|
channels,
|
|
kernel_h,
|
|
kernel_w,
|
|
dilation_h,
|
|
dilation_w,
|
|
pad_t,
|
|
pad_l,
|
|
stride_h,
|
|
stride_w,
|
|
output_h,
|
|
output_w,
|
|
col_data,
|
|
img_data);
|
|
}
|
|
|
|
template <>
|
|
void Im2ColNd<float, CUDAContext, StorageOrder::NCHW>(
|
|
const int N,
|
|
const int img_size,
|
|
const int col_size,
|
|
const int* img_shape,
|
|
const int* col_shape,
|
|
const int* kernel_shape,
|
|
const int* stride,
|
|
const int* dilation,
|
|
const int* pad,
|
|
const float* img_data,
|
|
float* col_data,
|
|
CUDAContext* context) {
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(
|
|
N,
|
|
Im2ColNdNCHWCUDAImpl,
|
|
float,
|
|
img_size,
|
|
col_size,
|
|
img_shape,
|
|
col_shape,
|
|
kernel_shape,
|
|
stride,
|
|
dilation,
|
|
pad,
|
|
img_data,
|
|
col_data,
|
|
context);
|
|
}
|
|
|
|
template <>
|
|
void Col2ImNd<float, CUDAContext, StorageOrder::NCHW>(
|
|
const int N,
|
|
const int img_size,
|
|
const int col_size,
|
|
const int* img_shape,
|
|
const int* col_shape,
|
|
const int* kernel_shape,
|
|
const int* stride,
|
|
const int* dilation,
|
|
const int* pad,
|
|
const float* col_data,
|
|
float* img_data,
|
|
CUDAContext* context) {
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(
|
|
N,
|
|
Col2ImNdNCHWCUDAImpl,
|
|
float,
|
|
img_size,
|
|
col_size,
|
|
img_shape,
|
|
col_shape,
|
|
kernel_shape,
|
|
stride,
|
|
dilation,
|
|
pad,
|
|
col_data,
|
|
img_data,
|
|
context);
|
|
}
|
|
|
|
template <>
|
|
void CopyMatrix<CUDAContext>(
|
|
const size_t itemsize,
|
|
const int M,
|
|
const int N,
|
|
const void* A,
|
|
const int lda,
|
|
void* B,
|
|
const int ldb,
|
|
CUDAContext* context,
|
|
TypeMeta::TypedCopy copy) {
|
|
CAFFE_ENFORCE(!copy, "Copy constructor is not supported in CUDA context");
|
|
cudaMemcpy2DAsync(
|
|
B,
|
|
ldb * itemsize,
|
|
A,
|
|
lda * itemsize,
|
|
N * itemsize,
|
|
M,
|
|
cudaMemcpyDeviceToDevice,
|
|
context->cuda_stream());
|
|
}
|
|
|
|
template <>
|
|
void CopyVector<float, CUDAContext>(
|
|
const int N,
|
|
const float* src,
|
|
float* dst,
|
|
CUDAContext* context) {
|
|
if (src != dst && N > 0) {
|
|
cudaMemcpyAsync(
|
|
dst,
|
|
src,
|
|
sizeof(float) * N,
|
|
cudaMemcpyDeviceToDevice,
|
|
context->cuda_stream());
|
|
}
|
|
}
|
|
|
|
namespace {
|
|
|
|
template <typename T>
|
|
using BlockReduce = cub::BlockReduce<T, CAFFE_CUDA_NUM_THREADS>;
|
|
|
|
template <typename T, class Reducer>
|
|
__global__ void RowwiseReduceKernel(
|
|
const int rows,
|
|
const int cols,
|
|
const Reducer reducer,
|
|
const T init,
|
|
const T* X,
|
|
T* Y) {
|
|
__shared__ typename BlockReduce<T>::TempStorage temp_storage;
|
|
for (int i = blockIdx.x; i < rows; i += gridDim.x) {
|
|
T val = init;
|
|
for (int j = threadIdx.x; j < cols; j += blockDim.x) {
|
|
val = reducer(X[i * cols + j], val);
|
|
}
|
|
val = BlockReduce<T>(temp_storage).Reduce(val, reducer);
|
|
if (threadIdx.x == 0) {
|
|
Y[i] = val;
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
template <typename T, class Reducer>
|
|
__global__ void ColwiseReduceKernel(
|
|
const int rows,
|
|
const int cols,
|
|
const Reducer reducer,
|
|
const T init,
|
|
const T* X,
|
|
T* Y) {
|
|
__shared__ typename BlockReduce<T>::TempStorage temp_storage;
|
|
for (int i = blockIdx.x; i < cols; i += gridDim.x) {
|
|
T val = init;
|
|
for (int j = threadIdx.x; j < rows; j += blockDim.x) {
|
|
val = reducer(X[j * cols + i], val);
|
|
}
|
|
val = BlockReduce<T>(temp_storage).Reduce(val, reducer);
|
|
if (threadIdx.x == 0) {
|
|
Y[i] = val;
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(T) \
|
|
template <> \
|
|
void RowwiseMax<T, CUDAContext>( \
|
|
const int N, const int D, const T* x, T* y, CUDAContext* context) { \
|
|
RowwiseReduceKernel<<< \
|
|
std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>( \
|
|
N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX(float)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_ROWWISE_MAX
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(T) \
|
|
template <> \
|
|
void ColwiseMax<T, CUDAContext>( \
|
|
const int N, const int D, const T* x, T* y, CUDAContext* context) { \
|
|
ColwiseReduceKernel<<< \
|
|
std::min(D, CAFFE_MAXIMUM_NUM_BLOCKS), \
|
|
CAFFE_CUDA_NUM_THREADS, \
|
|
0, \
|
|
context->cuda_stream()>>>( \
|
|
N, D, cub::Max(), std::numeric_limits<T>::lowest(), x, y); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX(float)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_COLWISE_MAX
|
|
|
|
namespace {
|
|
__global__ void
|
|
maximum_kernel(const int N, const float alpha, const float* x, float* y) {
|
|
CUDA_1D_KERNEL_LOOP(i, N) {
|
|
y[i] = fmaxf(x[i], alpha);
|
|
}
|
|
}
|
|
} // namespace
|
|
|
|
template <>
|
|
void Maximum(
|
|
const int N,
|
|
const float alpha,
|
|
const float* x,
|
|
float* y,
|
|
CUDAContext* context) {
|
|
maximum_kernel<<<
|
|
std::min(N, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(N, alpha, x, y);
|
|
}
|
|
|
|
namespace {
|
|
|
|
template <typename T, class Reducer, int D>
|
|
__global__ void ReduceTensorCUDAKernel(
|
|
const int outer_size,
|
|
const int inner_size,
|
|
SimpleArray<int, D> X_strides,
|
|
SimpleArray<FixedDivisor<int>, D> Y_dims,
|
|
const Reducer reducer,
|
|
const T init,
|
|
const T* X,
|
|
T* Y) {
|
|
__shared__ typename BlockReduce<T>::TempStorage temp_storage;
|
|
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
|
|
T val = init;
|
|
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
|
|
int X_index = 0;
|
|
int Y_index = i * inner_size + j;
|
|
#pragma unroll
|
|
for (int d = D - 1; d >= 0; --d) {
|
|
int r;
|
|
Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
|
|
X_index += r * X_strides.data[d];
|
|
}
|
|
#if __CUDA_ARCH__ >= 350
|
|
val = reducer(val, __ldg(X + X_index));
|
|
#else
|
|
val = reducer(val, X[X_index]);
|
|
#endif
|
|
}
|
|
val = BlockReduce<T>(temp_storage).Reduce(val, reducer);
|
|
if (threadIdx.x == 0) {
|
|
Y[i] = val;
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
template <typename T, class Reducer, int D>
|
|
void ReduceTensorCUDAImpl(
|
|
const int outer_size,
|
|
const int inner_size,
|
|
const int* dims,
|
|
const int* axes,
|
|
const Reducer& reducer,
|
|
const T& init,
|
|
const T* X,
|
|
T* Y,
|
|
CUDAContext* context) {
|
|
SimpleArray<int, D> X_strides;
|
|
SimpleArray<FixedDivisor<int>, D> Y_dims;
|
|
utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
|
|
for (int i = 0; i < D; ++i) {
|
|
Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
|
|
}
|
|
ReduceTensorCUDAKernel<T, Reducer, D>
|
|
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
outer_size, inner_size, X_strides, Y_dims, reducer, init, X, Y);
|
|
}
|
|
|
|
template <typename T, class Reducer>
|
|
void ReduceTensorCUDA(
|
|
const int num_dims,
|
|
const int* dims,
|
|
const int num_axes,
|
|
const int* axes,
|
|
const Reducer& reducer,
|
|
const T& init,
|
|
const T* X,
|
|
T* Y,
|
|
CUDAContext* context) {
|
|
CAFFE_ENFORCE_LE(num_axes, num_dims);
|
|
if (X == Y) {
|
|
return;
|
|
}
|
|
std::vector<int> transpose_axes(num_dims);
|
|
utils::ComputeTransposeAxesForReduceOp(
|
|
num_dims, num_axes, axes, transpose_axes.data());
|
|
const int pivot = num_dims - num_axes;
|
|
int outer_size = 1;
|
|
for (int i = 0; i < pivot; ++i) {
|
|
outer_size *= dims[transpose_axes[i]];
|
|
}
|
|
int inner_size = 1;
|
|
for (int i = pivot; i < num_dims; ++i) {
|
|
inner_size *= dims[transpose_axes[i]];
|
|
}
|
|
if (outer_size > 0 && inner_size > 0) {
|
|
if (transpose_axes[pivot] == pivot) {
|
|
RowwiseReduceKernel<T>
|
|
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
outer_size, inner_size, reducer, init, X, Y);
|
|
return;
|
|
}
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
|
|
num_dims,
|
|
ReduceTensorCUDAImpl,
|
|
T,
|
|
Reducer,
|
|
outer_size,
|
|
inner_size,
|
|
dims,
|
|
transpose_axes.data(),
|
|
reducer,
|
|
init,
|
|
X,
|
|
Y,
|
|
context);
|
|
} else if (outer_size > 0) {
|
|
math::Set<T, CUDAContext>(outer_size, init, Y, context);
|
|
}
|
|
}
|
|
|
|
template <typename T>
|
|
void ReduceMeanCUDAImpl(
|
|
const int num_dims,
|
|
const int* dims,
|
|
const int num_axes,
|
|
const int* axes,
|
|
const T* X,
|
|
T* Y,
|
|
CUDAContext* context) {
|
|
ReduceTensorCUDA(
|
|
num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context);
|
|
const int X_size =
|
|
std::accumulate(dims, dims + num_dims, 1, std::multiplies<int>());
|
|
int scale = 1;
|
|
for (int i = 0; i < num_axes; ++i) {
|
|
scale *= dims[axes[i]];
|
|
}
|
|
const int Y_size = X_size / scale;
|
|
Scale<T, CUDAContext>(
|
|
Y_size, 1.0f / static_cast<float>(scale), Y, Y, context);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(T) \
|
|
template <> \
|
|
void ReduceMin<T, CUDAContext>( \
|
|
const int num_dims, \
|
|
const int* dims, \
|
|
const int num_axes, \
|
|
const int* axes, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CUDAContext* context) { \
|
|
ReduceTensorCUDA( \
|
|
num_dims, \
|
|
dims, \
|
|
num_axes, \
|
|
axes, \
|
|
cub::Min(), \
|
|
std::numeric_limits<T>::max(), \
|
|
X, \
|
|
Y, \
|
|
context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(std::int32_t)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(std::int64_t)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(float)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN(double)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MIN
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(T) \
|
|
template <> \
|
|
void ReduceMax<T, CUDAContext>( \
|
|
const int num_dims, \
|
|
const int* dims, \
|
|
const int num_axes, \
|
|
const int* axes, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CUDAContext* context) { \
|
|
ReduceTensorCUDA( \
|
|
num_dims, \
|
|
dims, \
|
|
num_axes, \
|
|
axes, \
|
|
cub::Max(), \
|
|
std::numeric_limits<T>::lowest(), \
|
|
X, \
|
|
Y, \
|
|
context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(std::int32_t)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(std::int64_t)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(float)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX(double)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MAX
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(T) \
|
|
template <> \
|
|
void ReduceSum<T, CUDAContext>( \
|
|
const int num_dims, \
|
|
const int* dims, \
|
|
const int num_axes, \
|
|
const int* axes, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CUDAContext* context) { \
|
|
ReduceTensorCUDA( \
|
|
num_dims, dims, num_axes, axes, cub::Sum(), T(0), X, Y, context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int32_t)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(std::int64_t)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(float)
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM(double)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_SUM
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(T) \
|
|
template <> \
|
|
void ReduceMean<T, CUDAContext>( \
|
|
const int num_dims, \
|
|
const int* dims, \
|
|
const int num_axes, \
|
|
const int* axes, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CUDAContext* context) { \
|
|
ReduceMeanCUDAImpl<T>(num_dims, dims, num_axes, axes, X, Y, context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN(float)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_REDUCE_MEAN
|
|
|
|
namespace {
|
|
|
|
template <typename T, int D>
|
|
__global__ void BroadcastCUDAKernel(
|
|
const int Y_size,
|
|
const SimpleArray<int, D> X_strides,
|
|
const SimpleArray<int, D> Y_dims,
|
|
const T* X,
|
|
T* Y) {
|
|
CUDA_1D_KERNEL_LOOP(Y_index, Y_size) {
|
|
int X_index = 0;
|
|
int Y_index_val = Y_index;
|
|
#pragma unroll
|
|
for (int i = D - 1; i >= 0; --i) {
|
|
X_index += X_strides.data[i] == 0
|
|
? 0
|
|
: (Y_index_val % Y_dims.data[i]) * X_strides.data[i];
|
|
Y_index_val /= Y_dims.data[i];
|
|
}
|
|
#if __CUDA_ARCH__ >= 350
|
|
Y[Y_index] = __ldg(X + X_index);
|
|
#else
|
|
Y[Y_index] = X[X_index];
|
|
#endif
|
|
}
|
|
}
|
|
|
|
template <typename T, int D>
|
|
void BroadcastCUDAImpl(
|
|
const int X_ndim,
|
|
const int* X_dims,
|
|
const int* Y_dims,
|
|
const T* X,
|
|
T* Y,
|
|
CUDAContext* context) {
|
|
SimpleArray<int, D> X_strides_array;
|
|
SimpleArray<int, D> Y_dims_array;
|
|
const int d = D - X_ndim;
|
|
std::fill(X_strides_array.data, X_strides_array.data + d, 0);
|
|
int cur_stride = 1;
|
|
for (int i = D - 1; i >= d; --i) {
|
|
CAFFE_ENFORCE(X_dims[i - d] == 1 || X_dims[i - d] == Y_dims[i]);
|
|
X_strides_array.data[i] = X_dims[i - d] == 1 ? 0 : cur_stride;
|
|
cur_stride *= X_dims[i - d];
|
|
}
|
|
std::copy_n(Y_dims, D, Y_dims_array.data);
|
|
const int Y_size =
|
|
std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>());
|
|
BroadcastCUDAKernel<T, D>
|
|
<<<CAFFE_GET_BLOCKS(Y_size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(Y_size, X_strides_array, Y_dims_array, X, Y);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_BROADCAST(T) \
|
|
template <> \
|
|
void Broadcast<T, CUDAContext>( \
|
|
const int X_ndim, \
|
|
const int* X_dims, \
|
|
const int Y_ndim, \
|
|
const int* Y_dims, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CUDAContext* context) { \
|
|
CAFFE_ENFORCE_LE(X_ndim, Y_ndim); \
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \
|
|
Y_ndim, BroadcastCUDAImpl, T, X_ndim, X_dims, Y_dims, X, Y, context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int32_t)
|
|
CAFFE2_SPECIALIZED_CUDA_BROADCAST(std::int64_t)
|
|
CAFFE2_SPECIALIZED_CUDA_BROADCAST(float)
|
|
CAFFE2_SPECIALIZED_CUDA_BROADCAST(double)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_BROADCAST
|
|
|
|
namespace {
|
|
|
|
template <typename T>
|
|
__global__ void RowwiseMomentsCUDAKernel(
|
|
const int rows,
|
|
const int cols,
|
|
const T* X,
|
|
T* mean,
|
|
T* variance) {
|
|
__shared__ typename BlockReduce<T>::TempStorage m_storage;
|
|
__shared__ typename BlockReduce<T>::TempStorage v_storage;
|
|
for (int i = blockIdx.x; i < rows; i += gridDim.x) {
|
|
T m_val = 0;
|
|
T v_val = 0;
|
|
for (int j = threadIdx.x; j < cols; j += blockDim.x) {
|
|
const int X_index = i * cols + j;
|
|
#if __CUDA_ARCH__ >= 350
|
|
m_val += __ldg(X + X_index);
|
|
v_val += __ldg(X + X_index) * __ldg(X + X_index);
|
|
#else
|
|
m_val += X[X_index];
|
|
v_val += X[X_index] * X[X_index];
|
|
#endif
|
|
}
|
|
m_val = BlockReduce<T>(m_storage).Reduce(m_val, cub::Sum());
|
|
v_val = BlockReduce<T>(v_storage).Reduce(v_val, cub::Sum());
|
|
if (threadIdx.x == 0) {
|
|
mean[i] = m_val / static_cast<T>(cols);
|
|
variance[i] = v_val / static_cast<T>(cols) - mean[i] * mean[i];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
template <typename T, int D>
|
|
__global__ void MomentsCUDAKernel(
|
|
const int outer_size,
|
|
const int inner_size,
|
|
SimpleArray<int, D> X_strides,
|
|
SimpleArray<FixedDivisor<int>, D> Y_dims,
|
|
const T* X,
|
|
T* mean,
|
|
T* variance) {
|
|
__shared__ typename BlockReduce<T>::TempStorage m_storage;
|
|
__shared__ typename BlockReduce<T>::TempStorage v_storage;
|
|
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
|
|
T m_val = 0;
|
|
T v_val = 0;
|
|
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
|
|
int X_index = 0;
|
|
int Y_index = i * inner_size + j;
|
|
#pragma unroll
|
|
for (int d = D - 1; d >= 0; --d) {
|
|
int r;
|
|
Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
|
|
X_index += r * X_strides.data[d];
|
|
}
|
|
#if __CUDA_ARCH__ >= 350
|
|
m_val += __ldg(X + X_index);
|
|
v_val += __ldg(X + X_index) * __ldg(X + X_index);
|
|
#else
|
|
m_val += X[X_index];
|
|
v_val += X[X_index] * X[X_index];
|
|
#endif
|
|
}
|
|
m_val = BlockReduce<T>(m_storage).Reduce(m_val, cub::Sum());
|
|
v_val = BlockReduce<T>(v_storage).Reduce(v_val, cub::Sum());
|
|
if (threadIdx.x == 0) {
|
|
mean[i] = m_val / static_cast<T>(inner_size);
|
|
variance[i] = v_val / static_cast<T>(inner_size) - mean[i] * mean[i];
|
|
}
|
|
__syncthreads();
|
|
}
|
|
}
|
|
|
|
template <typename T, int D>
|
|
void MomentsCUDAImpl(
|
|
const int outer_size,
|
|
const int inner_size,
|
|
const int* dims,
|
|
const int* axes,
|
|
const T* X,
|
|
T* mean,
|
|
T* variance,
|
|
CUDAContext* context) {
|
|
SimpleArray<int, D> X_strides;
|
|
SimpleArray<FixedDivisor<int>, D> Y_dims;
|
|
utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
|
|
for (int i = 0; i < D; ++i) {
|
|
Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
|
|
}
|
|
MomentsCUDAKernel<T, D>
|
|
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
outer_size, inner_size, X_strides, Y_dims, X, mean, variance);
|
|
}
|
|
|
|
template <typename T>
|
|
void MomentsCUDA(
|
|
const int num_dims,
|
|
const int* dims,
|
|
const int num_axes,
|
|
const int* axes,
|
|
const T* X,
|
|
T* mean,
|
|
T* variance,
|
|
CUDAContext* context) {
|
|
CAFFE_ENFORCE_LE(num_axes, num_dims);
|
|
std::vector<int> transpose_axes(num_dims);
|
|
utils::ComputeTransposeAxesForReduceOp(
|
|
num_dims, num_axes, axes, transpose_axes.data());
|
|
const int pivot = num_dims - num_axes;
|
|
int outer_size = 1;
|
|
for (int i = 0; i < pivot; ++i) {
|
|
outer_size *= dims[transpose_axes[i]];
|
|
}
|
|
int inner_size = 1;
|
|
for (int i = pivot; i < num_dims; ++i) {
|
|
inner_size *= dims[transpose_axes[i]];
|
|
}
|
|
if (outer_size > 0 && inner_size > 0) {
|
|
if (transpose_axes[pivot] == pivot) {
|
|
RowwiseMomentsCUDAKernel<T>
|
|
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(
|
|
outer_size, inner_size, X, mean, variance);
|
|
return;
|
|
}
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(
|
|
num_dims,
|
|
MomentsCUDAImpl,
|
|
T,
|
|
outer_size,
|
|
inner_size,
|
|
dims,
|
|
transpose_axes.data(),
|
|
X,
|
|
mean,
|
|
variance,
|
|
context);
|
|
}
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_MOMENTS(T) \
|
|
template <> \
|
|
void Moments<T, CUDAContext>( \
|
|
const int num_dims, \
|
|
const int* dims, \
|
|
const int num_axes, \
|
|
const int* axes, \
|
|
const T* X, \
|
|
T* mean, \
|
|
T* variance, \
|
|
CUDAContext* context) { \
|
|
MomentsCUDA<T>( \
|
|
num_dims, dims, num_axes, axes, X, mean, variance, context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_MOMENTS(float)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_MOMENTS
|
|
|
|
namespace {
|
|
|
|
template <typename T, int D>
|
|
__global__ void TransposeCUDAKernel(
|
|
const int size,
|
|
const SimpleArray<int, D> X_strides,
|
|
const SimpleArray<FixedDivisor<int>, D> Y_dims,
|
|
const T* X,
|
|
T* Y) {
|
|
CUDA_1D_KERNEL_LOOP(Y_index, size) {
|
|
int X_index = 0;
|
|
int Y_index_val = Y_index;
|
|
#pragma unroll
|
|
for (int i = D - 1; i >= 0; --i) {
|
|
int d;
|
|
Y_dims.data[i].DivMod(Y_index_val, &Y_index_val, &d);
|
|
X_index += d * X_strides.data[i];
|
|
}
|
|
#if __CUDA_ARCH__ >= 350
|
|
Y[Y_index] = __ldg(X + X_index);
|
|
#else
|
|
Y[Y_index] = X[X_index];
|
|
#endif
|
|
}
|
|
}
|
|
|
|
template <typename T, int D>
|
|
void TransposeCUDAImpl(
|
|
const int* dims,
|
|
const int* axes,
|
|
const T* X,
|
|
T* Y,
|
|
CUDAContext* context) {
|
|
SimpleArray<int, D> X_strides;
|
|
SimpleArray<FixedDivisor<int>, D> Y_dims;
|
|
utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
|
|
int size = 1;
|
|
for (int i = 0; i < D; ++i) {
|
|
Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
|
|
size *= dims[i];
|
|
}
|
|
TransposeCUDAKernel<T, D>
|
|
<<<CAFFE_GET_BLOCKS(size),
|
|
CAFFE_CUDA_NUM_THREADS,
|
|
0,
|
|
context->cuda_stream()>>>(size, X_strides, Y_dims, X, Y);
|
|
}
|
|
|
|
} // namespace
|
|
|
|
#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \
|
|
template <> \
|
|
void Transpose<T, CUDAContext>( \
|
|
const int ndim, \
|
|
const int* dims, \
|
|
const int* axes, \
|
|
const T* X, \
|
|
T* Y, \
|
|
CUDAContext* context) { \
|
|
if (utils::IsIdentityPermutation(ndim, axes)) { \
|
|
const int size = \
|
|
std::accumulate(dims, dims + ndim, 1, std::multiplies<int>()); \
|
|
context->template Copy<T, CUDAContext, CUDAContext>(size, X, Y); \
|
|
return; \
|
|
} \
|
|
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \
|
|
ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context); \
|
|
}
|
|
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float)
|
|
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double)
|
|
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(int)
|
|
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(TIndex)
|
|
#undef CAFFE2_SPECIALIZED_CUDA_TRANSPOSE
|
|
|
|
} // namespace math
|
|
} // namespace caffe2
|