Use FixedDivisor in Reduce and Broadcast CUDA kernels (#9072)

Summary:
Closes https://github.com/pytorch/pytorch/pull/9072

Use FixedDivisor in Reduce and Broadcast CUDA kernels

Reviewed By: houseroad

Differential Revision: D8710243

fbshipit-source-id: 6f1da12234898594a1be8c979d942aa515832aeb
Xiaomeng Yang 2018-07-01 00:09:35 -07:00 committed by Facebook Github Bot
parent 90fd4df695
commit 03e7953a98
18 changed files with 463 additions and 470 deletions
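
The change swaps the per-element integer "/" and "%" in the Reduce and Broadcast CUDA kernels for FixedDivisor<int>, which precomputes a magic multiplier and shift on the host so each device-side Div/Mod costs one multiply and one shift instead of a hardware integer division. Below is a minimal standalone sketch of the trick for divisors >= 2; MagicDivisor is a hypothetical name, not the caffe2 class, and it uses a simpler 64-bit reciprocal (requiring a compiler with unsigned __int128) where FixedDivisor derives a tighter 32-bit magic via Hacker's Delight.

// fixed_divisor_sketch.cc -- standalone illustration of divide-by-constant
// via a precomputed reciprocal. Hypothetical MagicDivisor, not the caffe2 API.
#include <cassert>
#include <cstdint>
#include <cstdio>

struct MagicDivisor {
  explicit MagicDivisor(std::uint32_t d) : d_(d) {
    assert(d >= 2);  // d == 1 would need a 65-bit magic; FixedDivisor special-cases it.
    // floor(2^64 / d) + 1: exact for every 32-bit n, since the rounding
    // error n * eps / 2^64 stays below 1/d.
    magic_ = ((static_cast<unsigned __int128>(1) << 64) / d) + 1;
  }
  std::uint32_t Div(std::uint32_t n) const {
    // One multiply and one shift; no division instruction.
    return static_cast<std::uint32_t>(
        (static_cast<unsigned __int128>(n) * magic_) >> 64);
  }
  std::uint32_t Mod(std::uint32_t n) const {
    return n - d_ * Div(n);
  }
  std::uint32_t d_;
  unsigned __int128 magic_;
};

int main() {
  const MagicDivisor div7(7);
  for (std::uint32_t n = 0; n < 1000000; ++n) {
    assert(div7.Div(n) == n / 7);
    assert(div7.Mod(n) == n % 7);
  }
  std::printf("999999 / 7 = %u, 999999 %% 7 = %u\n",
              div7.Div(999999), div7.Mod(999999));
  return 0;
}

Integer division compiles to a long multi-instruction sequence on GPUs, and the divisor in these kernels is a tensor stride that is fixed for the whole launch, so paying the magic-number setup once on the host is a clear win.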

View file

@@ -10,18 +10,18 @@ namespace {
template <typename T, class Compare, class Context>
void ComputeArgImpl(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const Compare& comp,
const T* X,
TIndex* Y,
Context* context) {
math::Set<TIndex, Context>(prev_size * next_size, TIndex(0), Y, context);
for (TIndex i = 0; i < prev_size; ++i) {
for (int i = 0; i < prev_size; ++i) {
const T* cur_X = X + i * n * next_size + next_size;
for (TIndex k = 1; k < n; ++k) {
for (TIndex j = 0; j < next_size; ++j) {
for (int k = 1; k < n; ++k) {
for (int j = 0; j < next_size; ++j) {
TIndex* cur_Y = Y + i * next_size + j;
if (comp(*cur_X, X[i * n * next_size + *cur_Y * next_size + j])) {
*cur_Y = k;
@@ -37,9 +37,9 @@ void ComputeArgImpl(
template <>
template <typename T>
bool ArgMaxReducer<CPUContext>::operator()(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const T* X,
TIndex* Y,
CPUContext* context) const {
@@ -50,9 +50,9 @@ bool ArgMaxReducer<CPUContext>::operator()(
template <>
template <typename T>
bool ArgMinReducer<CPUContext>::operator()(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const T* X,
TIndex* Y,
CPUContext* context) const {
@@ -157,14 +157,16 @@ Indices: [[1 0 0]
)DOC")
.Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0,
"Indices",
"*(type: Tensor`<float>`)* Tensor of indices for the largest values.")
.Output(
0,
"Indices",
"*(type: Tensor`<float>`)* Tensor of indices for the largest values.")
.Arg("axis", "*(type: int; default: -1)* The axis to get argmax.")
.Arg("keepdims",
"*(type: bool; default: True)* If True (default), the output tensor "
"shape will match the input tensor shape except the `axis` dimension "
"equals 1. Else, the `axis` dimension of the output tensor is removed.");
.Arg(
"keepdims",
"*(type: bool; default: True)* If True (default), the output tensor "
"shape will match the input tensor shape except the `axis` dimension "
"equals 1. Else, the `axis` dimension of the output tensor is removed.");
OPERATOR_SCHEMA(ArgMin)
.NumInputs(1)
@@ -227,16 +229,18 @@ Indices: [[4]
)DOC")
.Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
.Output(0,
"Indices",
"*(type: Tensor`<float>`)* Tensor of indices for the smallest values.")
.Output(
0,
"Indices",
"*(type: Tensor`<float>`)* Tensor of indices for the smallest values.")
.Arg("axis", "*(type: int; default: -1)* The axis to get argmin.")
.Arg("keepdims",
"*(type: bool; default: True)* If True (default), the output tensor "
"shape will match the input tensor shape except the `axis` dimension "
"equals 1. Else, the `axis` dimension of the output tensor is removed.");
.Arg(
"keepdims",
"*(type: bool; default: True)* If True (default), the output tensor "
"shape will match the input tensor shape except the `axis` dimension "
"equals 1. Else, the `axis` dimension of the output tensor is removed.");
NO_GRADIENT(ArgMax);
NO_GRADIENT(ArgMin);
SHOULD_NOT_DO_GRADIENT(ArgMax);
SHOULD_NOT_DO_GRADIENT(ArgMin);
} // namespace caffe2

View file

@@ -7,37 +7,41 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/fixed_divisor.h"
namespace caffe2 {
namespace {
template <typename T>
using KeyValuePair = cub::KeyValuePair<TIndex, T>;
template <typename K, typename V>
using KeyValuePair = cub::KeyValuePair<K, V>;
template <typename T>
using BlockReduce = cub::BlockReduce<KeyValuePair<T>, CAFFE_CUDA_NUM_THREADS>;
template <typename K, typename V>
using BlockReduce =
cub::BlockReduce<KeyValuePair<K, V>, CAFFE_CUDA_NUM_THREADS>;
template <typename T, class Reducer>
__global__ void ComputeArgCUDAKernel(
const TIndex outer_size,
const TIndex inner_size,
const TIndex stride,
const int outer_size,
const int inner_size,
const FixedDivisor<int> stride,
const Reducer reducer,
const T init,
const T* X,
TIndex* Y) {
__shared__ typename BlockReduce<T>::TempStorage temp_storage;
for (TIndex idx = blockIdx.x; idx < outer_size; idx += gridDim.x) {
const TIndex i = idx / stride;
const TIndex j = idx % stride;
KeyValuePair<T> kv = {-1, init};
for (TIndex k = threadIdx.x; k < inner_size; k += blockDim.x) {
kv = reducer({k, X[i * inner_size * stride + k * stride + j]}, kv);
__shared__ typename BlockReduce<int, T>::TempStorage temp_storage;
const int d = stride.d();
for (int idx = blockIdx.x; idx < outer_size; idx += gridDim.x) {
int i;
int j;
stride.DivMod(idx, &i, &j);
KeyValuePair<int, T> kv = {-1, init};
for (int k = threadIdx.x; k < inner_size; k += blockDim.x) {
kv = reducer({k, X[i * inner_size * d + k * d + j]}, kv);
}
kv = BlockReduce<T>(temp_storage).Reduce(kv, reducer);
kv = BlockReduce<int, T>(temp_storage).Reduce(kv, reducer);
if (threadIdx.x == 0) {
Y[idx] = kv.key;
Y[idx] = static_cast<TIndex>(kv.key);
}
__syncthreads();
}
@@ -48,21 +52,22 @@ __global__ void ComputeArgCUDAKernel(
template <>
template <typename T>
bool ArgMaxReducer<CUDAContext>::operator()(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const T* X,
TIndex* Y,
CUDAContext* context) const {
const TIndex outer_size = prev_size * next_size;
const int outer_size = prev_size * next_size;
const FixedDivisor<int> stride(next_size);
ComputeArgCUDAKernel<<<
std::min(outer_size, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)),
std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
outer_size,
n,
next_size,
stride,
cub::ArgMax(),
std::numeric_limits<T>::lowest(),
X,
@@ -73,21 +78,22 @@ bool ArgMaxReducer<CUDAContext>::operator()(
template <>
template <typename T>
bool ArgMinReducer<CUDAContext>::operator()(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const T* X,
TIndex* Y,
CUDAContext* context) const {
const TIndex outer_size = prev_size * next_size;
const int outer_size = prev_size * next_size;
const FixedDivisor<int> stride(next_size);
ComputeArgCUDAKernel<<<
std::min(outer_size, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)),
std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
outer_size,
n,
next_size,
stride,
cub::ArgMin(),
std::numeric_limits<T>::max(),
X,

View file

@@ -37,11 +37,11 @@ class ArgOp final : public Operator<Context> {
}
CAFFE_ENFORCE_GE(axis_, 0);
CAFFE_ENFORCE_LT(axis_, ndim);
const std::vector<TIndex>& X_dims = X.dims();
std::vector<TIndex> Y_dims;
const std::vector<int> X_dims(X.dims().cbegin(), X.dims().cend());
std::vector<int> Y_dims;
Y_dims.reserve(ndim);
TIndex prev_size = 1;
TIndex next_size = 1;
int prev_size = 1;
int next_size = 1;
for (int i = 0; i < axis_; ++i) {
Y_dims.push_back(X_dims[i]);
prev_size *= X_dims[i];
@@ -54,7 +54,7 @@ class ArgOp final : public Operator<Context> {
next_size *= X_dims[i];
}
Y->Resize(Y_dims);
const TIndex n = X_dims[axis_];
const int n = X_dims[axis_];
return reducer_(
prev_size,
next_size,
@@ -74,9 +74,9 @@ template <class Context>
struct ArgMaxReducer {
template <typename T>
bool operator()(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const T* X,
TIndex* Y,
Context* context) const;
@@ -86,9 +86,9 @@ template <class Context>
struct ArgMinReducer {
template <typename T>
bool operator()(
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
const int prev_size,
const int next_size,
const int n,
const T* X,
TIndex* Y,
Context* context) const;

View file

@@ -1,82 +0,0 @@
#ifndef CAFFE2_OPERATORS_ARG_OPS_EIGEN_H_
#define CAFFE2_OPERATORS_ARG_OPS_EIGEN_H_
#include "caffe2/core/context.h"
#include "caffe2/core/types.h"
#include "Eigen/Core"
#if EIGEN_VERSION_AT_LEAST(3, 3, 0)
#include "unsupported/Eigen/CXX11/Tensor"
namespace caffe2 {
namespace arg_ops_eigen {
template <typename T>
using EigenTensorMap1D = Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>>;
template <typename T>
using EigenTensorMap2D = Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>>;
template <typename T>
using EigenTensorMap3D = Eigen::TensorMap<Eigen::Tensor<T, 3, Eigen::RowMajor>>;
template <class Device, typename T>
void ComputeArgMaxEigen(
const Device& device,
const T* X,
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
TIndex* Y) {
if (next_size == 1) {
EigenTensorMap1D<TIndex>(Y, prev_size).device(device) =
EigenTensorMap2D<T>(const_cast<T*>(X), prev_size, n)
.argmax(1)
.template cast<TIndex>();
} else if (prev_size == 1) {
EigenTensorMap1D<TIndex>(Y, next_size).device(device) =
EigenTensorMap2D<T>(const_cast<T*>(X), n, next_size)
.argmax(0)
.template cast<TIndex>();
} else {
EigenTensorMap2D<TIndex>(Y, prev_size, next_size).device(device) =
EigenTensorMap3D<T>(const_cast<T*>(X), prev_size, n, next_size)
.argmax(1)
.template cast<TIndex>();
}
}
template <class Device, typename T>
void ComputeArgMinEigen(
const Device& device,
const T* X,
const TIndex prev_size,
const TIndex next_size,
const TIndex n,
TIndex* Y) {
if (next_size == 1) {
EigenTensorMap1D<TIndex>(Y, prev_size).device(device) =
EigenTensorMap2D<T>(const_cast<T*>(X), prev_size, n)
.argmin(1)
.template cast<TIndex>();
} else if (prev_size == 1) {
EigenTensorMap1D<TIndex>(Y, next_size).device(device) =
EigenTensorMap2D<T>(const_cast<T*>(X), n, next_size)
.argmin(0)
.template cast<TIndex>();
} else {
EigenTensorMap2D<TIndex>(Y, prev_size, next_size).device(device) =
EigenTensorMap3D<T>(const_cast<T*>(X), prev_size, n, next_size)
.argmin(1)
.template cast<TIndex>();
}
}
} // namespace arg_ops_eigen
} // namespace caffe2
#endif // EIGEN_VERSION_AT_LEAST(3, 3, 0)
#endif // CAFFE2_OPERATORS_ARG_OPS_EIGEN_H_

View file

@@ -31,4 +31,4 @@ struct CbrtGradientFunctor {
} // namespace caffe2
#endif
#endif // CAFFE2_OPERATORS_CBRT_OP_H_

View file

@@ -384,7 +384,7 @@ void reinterleaveMultithreaded(
size_t tileId) {
int h;
int c;
divOutputH.divMod((int)tileId, c, h);
divOutputH.DivMod((int)tileId, &c, &h);
REINTERLEAVE(N);
};

View file

@@ -8,6 +8,7 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/elementwise_ops_utils.h"
#include "caffe2/utils/fixed_divisor.h"
namespace caffe2 {
@@ -20,10 +21,10 @@ template <typename TGrad, typename TIn, int D>
__global__ void ComputeDivAGradientCUDAKernel(
const int outer_size,
const int inner_size,
const SimpleArray<int, D> C_dims,
const SimpleArray<FixedDivisor<int>, D> C_dims,
const SimpleArray<int, D> C_strides,
const SimpleArray<int, D> B_strides,
const SimpleArray<int, D> A_dims,
const SimpleArray<FixedDivisor<int>, D> A_dims,
const TGrad* dC,
const TIn* B,
TGrad* dA) {
@@ -36,17 +37,17 @@ __global__ void ComputeDivAGradientCUDAKernel(
int A_index_val = A_index;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
C_index += (A_index_val % A_dims.data[d]) * C_strides.data[d];
A_index_val /= A_dims.data[d];
int r;
A_dims.data[d].DivMod(A_index_val, &A_index_val, &r);
C_index += r * C_strides.data[d];
}
int B_index = 0;
int C_index_val = C_index;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
B_index += B_strides.data[d] == 0
? 0
: (C_index_val % C_dims.data[d]) * B_strides.data[d];
C_index_val /= C_dims.data[d];
int r;
C_dims.data[d].DivMod(C_index_val, &C_index_val, &r);
B_index += r * B_strides.data[d];
}
#if __CUDA_ARCH__ >= 350
sum += __ldg(dC + C_index) / __ldg(B + B_index);
@@ -83,7 +84,7 @@ __global__ void ComputeDivBGradientCUDAKernel(
const int outer_size,
const int inner_size,
const SimpleArray<int, D> C_strides,
const SimpleArray<int, D> B_dims,
const SimpleArray<FixedDivisor<int>, D> B_dims,
const TGrad* dC,
const TIn* B,
const TOut* C,
@@ -96,8 +97,9 @@
int B_index = i * inner_size + j;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
C_index += (B_index % B_dims.data[d]) * C_strides.data[d];
B_index /= B_dims.data[d];
int r;
B_dims.data[d].DivMod(B_index, &B_index, &r);
C_index += r * C_strides.data[d];
}
#if __CUDA_ARCH__ >= 350
sum += -__ldg(dC + C_index) * __ldg(C + C_index) / __ldg(B + i);
@@ -124,20 +126,20 @@ void ComputeDivAGradientCUDAImpl(
const TIn* B,
TGrad* dA,
CUDAContext* context) {
SimpleArray<int, D> C_dims_arr;
SimpleArray<FixedDivisor<int>, D> C_dims_arr;
SimpleArray<int, D> C_strides_arr;
SimpleArray<int, D> B_strides_arr;
SimpleArray<int, D> A_dims_arr;
std::copy_n(C_dims, D, C_dims_arr.data);
SimpleArray<FixedDivisor<int>, D> A_dims_arr;
for (int i = 0; i < D; ++i) {
C_dims_arr.data[i] = FixedDivisor<int>(C_dims[i]);
A_dims_arr.data[i] = FixedDivisor<int>(C_dims[A_axes[i]]);
}
math::utils::ComputeTransposedStrides(D, C_dims, A_axes, C_strides_arr.data);
int cur_stride = 1;
for (int i = D - 1; i >= 0; --i) {
B_strides_arr.data[i] = B_dims[i] == 1 ? 0 : cur_stride;
cur_stride *= B_dims[i];
}
for (int i = 0; i < D; ++i) {
A_dims_arr.data[i] = C_dims[A_axes[i]];
}
ComputeDivAGradientCUDAKernel<TGrad, TIn, D>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
@@ -166,10 +168,10 @@ void ComputeDivBGradientCUDAImpl(
TGrad* dB,
CUDAContext* context) {
SimpleArray<int, D> C_strides_arr;
SimpleArray<int, D> B_dims_arr;
SimpleArray<FixedDivisor<int>, D> B_dims_arr;
math::utils::ComputeTransposedStrides(D, C_dims, B_axes, C_strides_arr.data);
for (int i = 0; i < D; ++i) {
B_dims_arr.data[i] = C_dims[B_axes[i]];
B_dims_arr.data[i] = FixedDivisor<int>(C_dims[B_axes[i]]);
}
ComputeDivBGradientCUDAKernel<TGrad, TIn, TOut, D>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -202,7 +204,7 @@ void ComputeDivAGradientCUDA(
for (int i = pivot; i < ndim; ++i) {
inner_size *= C_dims[A_transpose_axes[i]];
}
if (outer_size > 0) {
if (outer_size > 0 && inner_size > 0) {
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
ndim,
ComputeDivAGradientCUDAImpl,
@@ -217,6 +219,8 @@ void ComputeDivAGradientCUDA(
B,
dA,
context);
} else if (outer_size > 0) {
math::Set<TGrad, CUDAContext>(outer_size, TGrad(0), dA, context);
}
}
@@ -242,7 +246,7 @@ void ComputeDivBGradientCUDA(
for (int i = pivot; i < ndim; ++i) {
inner_size *= C_dims[B_transpose_axes[i]];
}
if (outer_size > 0) {
if (outer_size > 0 && inner_size > 0) {
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(
ndim,
ComputeDivBGradientCUDAImpl,
@@ -258,6 +262,8 @@ void ComputeDivBGradientCUDA(
C,
dB,
context);
} else if (outer_size > 0) {
math::Set<TGrad, CUDAContext>(outer_size, TGrad(0), dB, context);
}
}

View file

@@ -8,6 +8,7 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/elementwise_ops_utils.h"
#include "caffe2/utils/fixed_divisor.h"
namespace caffe2 {
@@ -20,10 +21,10 @@ template <typename TGrad, typename TIn, int D>
__global__ void ComputeMulGradientCUDAKernel(
const int outer_size,
const int inner_size,
const SimpleArray<int, D> Y_dims,
const SimpleArray<FixedDivisor<int>, D> Y_dims,
const SimpleArray<int, D> Y_strides,
const SimpleArray<int, D> W_strides,
const SimpleArray<int, D> X_dims,
const SimpleArray<FixedDivisor<int>, D> X_dims,
const TGrad* dY,
const TIn* W,
TGrad* dX) {
@@ -36,17 +37,17 @@ __global__ void ComputeMulGradientCUDAKernel(
int X_index_val = X_index;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
Y_index += (X_index_val % X_dims.data[d]) * Y_strides.data[d];
X_index_val /= X_dims.data[d];
int r;
X_dims.data[d].DivMod(X_index_val, &X_index_val, &r);
Y_index += r * Y_strides.data[d];
}
int W_index = 0;
int Y_index_val = Y_index;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
W_index += W_strides.data[d] == 0
? 0
: (Y_index_val % Y_dims.data[d]) * W_strides.data[d];
Y_index_val /= Y_dims.data[d];
int r;
Y_dims.data[d].DivMod(Y_index_val, &Y_index_val, &r);
W_index += r * W_strides.data[d];
}
#if __CUDA_ARCH__ >= 350
sum += __ldg(dY + Y_index) * __ldg(W + W_index);
@@ -73,20 +74,20 @@ void ComputeMulGradientCUDAImpl(
const TIn* W,
TGrad* dX,
CUDAContext* context) {
SimpleArray<int, D> Y_dims_arr;
SimpleArray<FixedDivisor<int>, D> Y_dims_arr;
SimpleArray<int, D> Y_strides_arr;
SimpleArray<int, D> W_strides_arr;
SimpleArray<int, D> X_dims_arr;
std::copy_n(Y_dims, D, Y_dims_arr.data);
SimpleArray<FixedDivisor<int>, D> X_dims_arr;
for (int i = 0; i < D; ++i) {
Y_dims_arr.data[i] = FixedDivisor<int>(Y_dims[i]);
X_dims_arr.data[i] = FixedDivisor<int>(Y_dims[X_axes[i]]);
}
math::utils::ComputeTransposedStrides(D, Y_dims, X_axes, Y_strides_arr.data);
int cur_stride = 1;
for (int i = D - 1; i >= 0; --i) {
W_strides_arr.data[i] = W_dims[i] == 1 ? 0 : cur_stride;
cur_stride *= W_dims[i];
}
for (int i = 0; i < D; ++i) {
X_dims_arr.data[i] = Y_dims[X_axes[i]];
}
ComputeMulGradientCUDAKernel<TGrad, TIn, D>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
CAFFE_CUDA_NUM_THREADS,
@@ -126,7 +127,7 @@ void ComputeMulGradientCUDA(
for (int i = pivot; i < ndim; ++i) {
inner_size *= Y_dims[X_transpose_axes[i]];
}
if (outer_size > 0) {
if (outer_size > 0 && inner_size > 0) {
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
ndim,
ComputeMulGradientCUDAImpl,
@@ -141,6 +142,8 @@ void ComputeMulGradientCUDA(
W,
dX,
context);
} else if (outer_size > 0) {
math::Set<TGrad, CUDAContext>(outer_size, TGrad(0), dX, context);
}
}

View file

@@ -4,6 +4,7 @@
#include <functional>
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/fixed_divisor.h"
namespace caffe2 {
@@ -11,35 +12,34 @@ namespace {
template <typename T, int D>
__global__ void ComputeMomentsGradientCUDAKernel(
const int dX_size,
const SimpleArray<int, D> dY_strides,
const SimpleArray<int, D> dX_dims,
const int X_size,
const SimpleArray<int, D> Y_strides,
const SimpleArray<FixedDivisor<int>, D> X_dims,
const T scale,
const T* dmean,
const T* dvariance,
const T* X,
const T* mean,
T* dX) {
CUDA_1D_KERNEL_LOOP(dX_index, dX_size) {
int dY_index = 0;
int dX_index_val = dX_index;
CUDA_1D_KERNEL_LOOP(X_index, X_size) {
int Y_index = 0;
int X_index_val = X_index;
#pragma unroll
for (int i = D - 1; i >= 0; --i) {
dY_index += dY_strides.data[i] == 0
? 0
: (dX_index_val % dX_dims.data[i]) * dY_strides.data[i];
dX_index_val /= dX_dims.data[i];
int d;
X_dims.data[i].DivMod(X_index_val, &X_index_val, &d);
Y_index += d * Y_strides.data[i];
}
#if __CUDA_ARCH__ >= 350
dX[dX_index] =
(__ldg(dmean + dY_index) +
static_cast<T>(2) * (__ldg(X + dX_index) - __ldg(mean + dY_index)) *
__ldg(dvariance + dY_index)) *
dX[X_index] =
(__ldg(dmean + Y_index) +
static_cast<T>(2) * (__ldg(X + X_index) - __ldg(mean + Y_index)) *
__ldg(dvariance + Y_index)) *
scale;
#else
dX[dX_index] = (dmean[dY_index] +
static_cast<T>(2) * (X[dX_index] - mean[dY_index]) *
dvariance[dY_index]) *
dX[X_index] = (dmean[Y_index] +
static_cast<T>(2) * (X[X_index] - mean[Y_index]) *
dvariance[Y_index]) *
scale;
#endif
}
@@ -47,35 +47,38 @@ __global__ void ComputeMomentsGradientCUDAKernel(
template <typename T, int D>
void ComputeMomentsGradientCUDAImpl(
const int* dY_dims,
const int* dX_dims,
const int* Y_dims,
const int* X_dims,
const T* dmean,
const T* dvariance,
const T* X,
const T* mean,
T* dX,
CUDAContext* context) {
SimpleArray<int, D> dY_strides_array;
SimpleArray<int, D> dX_dims_array;
SimpleArray<int, D> Y_strides_array;
SimpleArray<FixedDivisor<int>, D> X_dims_array;
int cur_stride = 1;
for (int i = D - 1; i >= 0; --i) {
dY_strides_array.data[i] = dY_dims[i] == 1 ? 0 : cur_stride;
dX_dims_array.data[i] = dX_dims[i];
cur_stride *= dY_dims[i];
if (X_dims[i] == 0) {
return;
}
Y_strides_array.data[i] = Y_dims[i] == 1 ? 0 : cur_stride;
X_dims_array.data[i] = FixedDivisor<int>(X_dims[i]);
cur_stride *= Y_dims[i];
}
const int dY_size =
std::accumulate(dY_dims, dY_dims + D, 1, std::multiplies<int>());
const int dX_size =
std::accumulate(dX_dims, dX_dims + D, 1, std::multiplies<int>());
const T scale = static_cast<T>(dY_size) / static_cast<T>(dX_size);
const int Y_size =
std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>());
const int X_size =
std::accumulate(X_dims, X_dims + D, 1, std::multiplies<int>());
const T scale = static_cast<T>(Y_size) / static_cast<T>(X_size);
ComputeMomentsGradientCUDAKernel<T, D>
<<<CAFFE_GET_BLOCKS(dX_size),
<<<CAFFE_GET_BLOCKS(X_size),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
dX_size,
dY_strides_array,
dX_dims_array,
X_size,
Y_strides_array,
X_dims_array,
scale,
dmean,
dvariance,

View file

@@ -6,6 +6,7 @@
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/fixed_divisor.h"
namespace caffe2 {
@@ -13,61 +14,63 @@ namespace {
template <typename T, int D>
__global__ void ComputeReduceMinMaxGradientCUDAKernel(
const int dX_size,
const SimpleArray<int, D> dY_strides,
const SimpleArray<int, D> dX_dims,
const int X_size,
const SimpleArray<int, D> Y_strides,
const SimpleArray<FixedDivisor<int>, D> X_dims,
const T* dY_data,
const T* X_data,
const T* Y_data,
T* dX_data) {
CUDA_1D_KERNEL_LOOP(dX_index, dX_size) {
int dY_index = 0;
int dX_index_val = dX_index;
CUDA_1D_KERNEL_LOOP(X_index, X_size) {
int Y_index = 0;
int X_index_val = X_index;
#pragma unroll
for (int i = D - 1; i >= 0; --i) {
dY_index += dY_strides.data[i] == 0
? 0
: (dX_index_val % dX_dims.data[i]) * dY_strides.data[i];
dX_index_val /= dX_dims.data[i];
int d;
X_dims.data[i].DivMod(X_index_val, &X_index_val, &d);
Y_index += d * Y_strides.data[i];
}
#if __CUDA_ARCH__ >= 350
dX_data[dX_index] = __ldg(Y_data + dY_index) == __ldg(X_data + dX_index)
? __ldg(dY_data + dY_index)
dX_data[X_index] = __ldg(Y_data + Y_index) == __ldg(X_data + X_index)
? __ldg(dY_data + Y_index)
: T(0);
#else
dX_data[dX_index] =
Y_data[dY_index] == X_data[dX_index] ? dY_data[dY_index] : T(0);
dX_data[X_index] =
Y_data[Y_index] == X_data[X_index] ? dY_data[Y_index] : T(0);
#endif
}
}
template <typename T, int D>
void ComputeReduceMinMaxGradientCUDAImpl(
const int* dY_dims,
const int* dX_dims,
const int* Y_dims,
const int* X_dims,
const T* dY_data,
const T* X_data,
const T* Y_data,
T* dX_data,
CUDAContext* context) {
SimpleArray<int, D> dY_strides_array;
SimpleArray<int, D> dX_dims_array;
SimpleArray<int, D> Y_strides_array;
SimpleArray<FixedDivisor<int>, D> X_dims_array;
int cur_stride = 1;
for (int i = D - 1; i >= 0; --i) {
dY_strides_array.data[i] = dY_dims[i] == 1 ? 0 : cur_stride;
dX_dims_array.data[i] = dX_dims[i];
cur_stride *= dY_dims[i];
if (X_dims[i] == 0) {
return;
}
Y_strides_array.data[i] = Y_dims[i] == 1 ? 0 : cur_stride;
X_dims_array.data[i] = FixedDivisor<int>(X_dims[i]);
cur_stride *= Y_dims[i];
}
const int dX_size =
std::accumulate(dX_dims, dX_dims + D, 1, std::multiplies<int>());
const int X_size =
std::accumulate(X_dims, X_dims + D, 1, std::multiplies<int>());
ComputeReduceMinMaxGradientCUDAKernel<T, D>
<<<CAFFE_GET_BLOCKS(dX_size),
<<<CAFFE_GET_BLOCKS(X_size),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
dX_size,
dY_strides_array,
dX_dims_array,
X_size,
Y_strides_array,
X_dims_array,
dY_data,
X_data,
Y_data,

View file

@@ -32,7 +32,16 @@ class TransposeOp final : public Operator<Context> {
~TransposeOp() = default;
bool RunOnDevice() override {
// Do the actual transpose, which is implemented in DoRunWithType().
return DispatchHelper<TensorTypes<float, double, int, TIndex>>::call(
this, Input(0));
}
private:
template <typename T>
bool DoRunWithType() {
const auto& X = Input(0);
auto* Y = Output(0);
const int ndim = X.ndim();
if (axes_.empty()) {
axes_.resize(ndim);
@@ -40,17 +49,6 @@ class TransposeOp final : public Operator<Context> {
} else {
CAFFE_ENFORCE_EQ(ndim, axes_.size());
}
// Do the actual transpose, which is implemented in DoRunWithType().
return DispatchHelper<TensorTypes<float, double, int, TIndex>>::call(
this, Input(0));
}
protected:
template <typename T>
bool DoRunWithType() {
const auto& X = Input(0);
auto* Y = Output(0);
const int ndim = X.ndim();
const std::vector<int> X_dims(X.dims().cbegin(), X.dims().cend());
std::vector<int> Y_dims(ndim);
for (int i = 0; i < ndim; ++i) {

View file

@@ -76,9 +76,11 @@ class TestReduceOps(hu.HypothesisTestCase):
self.run_reduce_op_test(
"ReduceMax", X, keepdims, num_axes, np.max, gc, dc)
@given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(),
num_axes=st.integers(1, 4), **hu.gcs)
def test_reduce_sum(self, X, keepdims, num_axes, gc, dc):
@given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5),
t=st.integers(0, 5), keepdims=st.booleans(),
num_axes=st.integers(1, 3), **hu.gcs)
def test_reduce_sum(self, n, m, k, t, keepdims, num_axes, gc, dc):
X = np.random.randn(n, m, k, t).astype(np.float32)
self.run_reduce_op_test(
"ReduceSum", X, keepdims, num_axes, np.sum, gc, dc)
@@ -208,7 +210,8 @@ class TestReduceFrontReductions(hu.HypothesisTestCase):
workspace.FeedBlob('X', not_empty_X)
workspace.RunNet(workspace.GetNetName(net))
output = workspace.FetchBlob('output')
np.testing.assert_allclose(output, ref_sum(not_empty_X)[0], atol=1e-3)
np.testing.assert_allclose(
output, ref_sum(not_empty_X)[0], atol=1e-3)
workspace.FeedBlob('X', X)
workspace.RunNet(workspace.GetNetName(net))

View file

@@ -1,110 +1,115 @@
#ifndef CAFFE2_UTILS_FIXED_DIVISOR_H_
#define CAFFE2_UTILS_FIXED_DIVISOR_H_
#include <cstdlib>
#include <stdint.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#ifdef __CUDA_ARCH__
#define FIXED_DIVISOR_DECL inline __host__ __device__
#else
#define FIXED_DIVISOR_DECL inline
#endif
namespace caffe2 {
// Utility class for quickly calculating quotients and remainders for
// a known integer divisor
template <typename T>
class FixedDivisor {
};
class FixedDivisor {};
// Works for any positive divisor, 1 to INT_MAX. One 64-bit
// multiplication and one 64-bit shift are used to calculate the
// result.
template <>
class FixedDivisor<int32_t> {
class FixedDivisor<std::int32_t> {
public:
FixedDivisor(int32_t d) : d_(d) {
calcSignedMagic();
FixedDivisor() = default;
explicit FixedDivisor(const std::int32_t d) : d_(d) {
CalcSignedMagic();
}
uint64_t getMagic() const {
FIXED_DIVISOR_DECL std::int32_t d() const {
return d_;
}
FIXED_DIVISOR_DECL std::uint64_t magic() const {
return magic_;
}
int getShift() const {
FIXED_DIVISOR_DECL int shift() const {
return shift_;
}
/// Calculates `q = n / d`.
inline int32_t div(int32_t n) const {
FIXED_DIVISOR_DECL std::int32_t Div(const std::int32_t n) const {
// In lieu of a mulhi instruction being available, perform the
// work in uint64
uint64_t mul64 = magic_ * (uint64_t) n;
return (int32_t) (mul64 >> shift_);
return (int32_t)((magic_ * (uint64_t)n) >> shift_);
}
/// Calculates `r = n % d`.
inline int32_t mod(int32_t n) const {
return n - d_ * div(n);
FIXED_DIVISOR_DECL std::int32_t Mod(const std::int32_t n) const {
return n - d_ * Div(n);
}
/// Calculates `q = n / d` and `r = n % d` together.
inline void divMod(int32_t n, int32_t& q, int32_t& r) const {
const int32_t quotient = div(n);
q = quotient;
r = n - d_ * quotient;
FIXED_DIVISOR_DECL void
DivMod(const std::int32_t n, std::int32_t* q, int32_t* r) const {
*q = Div(n);
*r = n - d_ * *q;
}
private:
/**
Calculates magic multiplicative value and shift amount for
calculating `q = n / d` for signed 32-bit integers.
Implementation taken from Hacker's Delight section 10.
*/
void calcSignedMagic() {
// Calculates magic multiplicative value and shift amount for calculating `q =
// n / d` for signed 32-bit integers.
// Implementation taken from Hacker's Delight section 10.
void CalcSignedMagic() {
if (d_ == 1) {
magic_ = UINT64_C(0x1) << 32;
shift_ = 32;
return;
}
const uint32_t two31 = UINT32_C(0x80000000);
uint32_t ad = std::abs(d_);
uint32_t t = two31 + ((uint32_t) d_ >> 31);
uint32_t anc = t - 1 - t % ad; // Absolute value of nc.
uint32_t p = 31; // Init. p.
uint32_t q1 = two31 / anc; // Init. q1 = 2**p/|nc|.
uint32_t r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|).
uint32_t q2 = two31 / ad; // Init. q2 = 2**p/|d|.
uint32_t r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|).
uint32_t delta = 0;
const std::uint32_t two31 = UINT32_C(0x80000000);
const std::uint32_t ad = std::abs(d_);
const std::uint32_t t = two31 + ((uint32_t)d_ >> 31);
const std::uint32_t anc = t - 1 - t % ad; // Absolute value of nc.
std::uint32_t p = 31; // Init. p.
std::uint32_t q1 = two31 / anc; // Init. q1 = 2**p/|nc|.
std::uint32_t r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|).
std::uint32_t q2 = two31 / ad; // Init. q2 = 2**p/|d|.
std::uint32_t r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|).
std::uint32_t delta = 0;
do {
p = p + 1;
q1 = 2 * q1; // Update q1 = 2**p/|nc|.
r1 = 2 * r1; // Update r1 = rem(2**p, |nc|).
if (r1 >= anc) { // (Must be an unsigned
q1 = q1 + 1; // comparison here).
r1 = r1 - anc;
++p;
q1 <<= 1; // Update q1 = 2**p/|nc|.
r1 <<= 1; // Update r1 = rem(2**p, |nc|).
if (r1 >= anc) { // (Must be an unsigned
++q1; // comparison here).
r1 -= anc;
}
q2 = 2 * q2; // Update q2 = 2**p/|d|.
r2 = 2 * r2; // Update r2 = rem(2**p, |d|).
if (r2 >= ad) { // (Must be an unsigned
q2 = q2 + 1; // comparison here).
r2 = r2 - ad;
q2 <<= 1; // Update q2 = 2**p/|d|.
r2 <<= 1; // Update r2 = rem(2**p, |d|).
if (r2 >= ad) { // (Must be an unsigned
++q2; // comparison here).
r2 -= ad;
}
delta = ad - r2;
} while (q1 < delta || (q1 == delta && r1 == 0));
int32_t magic = q2 + 1;
std::int32_t magic = q2 + 1;
if (d_ < 0) {
magic = -magic;
}
shift_ = p;
magic_ = (uint64_t) (uint32_t) magic;
magic_ = (std::uint64_t)(std::uint32_t)magic;
}
int32_t d_;
uint64_t magic_;
std::int32_t d_ = 1;
std::uint64_t magic_;
int shift_;
};
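
As a usage note: the call sites in this commit construct the divisor once on the host and pass it by value into the kernel, so Div, Mod, and DivMod run on the device without hardware division. A minimal sketch of the renamed API, assuming the header above is on the include path:

#include <cstdint>

#include "caffe2/utils/fixed_divisor.h"

// Split a flat index into (outer, inner) coordinates the way the rewritten
// arg-reduce kernels do; stride wraps the inner dimension size.
void DecomposeIndex(const std::int32_t idx,
                    const caffe2::FixedDivisor<std::int32_t>& stride,
                    std::int32_t* i,
                    std::int32_t* j) {
  stride.DivMod(idx, i, j);  // *i = idx / d(), *j = idx % d()
}

Passing the divisor by value in kernel launch parameters (e.g. const FixedDivisor<int> stride above) works because the class is just a divisor, a 64-bit magic, and a shift.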

View file

@@ -1,4 +1,5 @@
#include "caffe2/utils/fixed_divisor.h"
#include <gtest/gtest.h>
#include <random>
@@ -7,73 +8,71 @@ namespace caffe2 {
namespace {
void compareDivMod(int32_t v, int32_t divisor) {
void CompareDivMod(int32_t v, int32_t divisor) {
auto fixed = FixedDivisor<int32_t>(divisor);
int nativeQ = v / divisor;
int nativeR = v % divisor;
int native_q = v / divisor;
int native_r = v % divisor;
int fixedQ = fixed.div(v);
int fixedR = fixed.mod(v);
int fixed_q = fixed.Div(v);
int fixed_r = fixed.Mod(v);
EXPECT_EQ(fixedQ, nativeQ) << v << " / " << divisor
<< " magic " << fixed.getMagic()
<< " shift " << fixed.getShift()
<< " quot " << fixedQ << " " << nativeQ;
EXPECT_EQ(native_q, fixed_q)
<< v << " / " << divisor << " magic " << fixed.magic() << " shift "
<< fixed.shift() << " quot " << fixed_q << " " << native_q;
EXPECT_EQ(fixedR, nativeR) << v << " / " << divisor
<< " magic " << fixed.getMagic()
<< " shift " << fixed.getShift()
<< " rem " << fixedR << " " << nativeR;
EXPECT_EQ(native_r, fixed_r)
<< v << " / " << divisor << " magic " << fixed.magic() << " shift "
<< fixed.shift() << " rem " << fixed_r << " " << native_r;
}
}
} // namespace
TEST(FixedDivisorTest, Test) {
TEST(FixedDivisorTest, FixedDivisorInt32Test) {
constexpr int32_t kMax = std::numeric_limits<int32_t>::max();
// divide by 1
compareDivMod(kMax, 1);
compareDivMod(0, 1);
compareDivMod(1, 1);
CompareDivMod(kMax, 1);
CompareDivMod(0, 1);
CompareDivMod(1, 1);
// divide by max
compareDivMod(kMax, kMax);
compareDivMod(0, kMax);
compareDivMod(1, kMax);
CompareDivMod(kMax, kMax);
CompareDivMod(0, kMax);
CompareDivMod(1, kMax);
// divide by random positive values
std::random_device rd;
std::uniform_int_distribution<int32_t> vDist(0, kMax);
std::uniform_int_distribution<int32_t> qDist(1, kMax);
std::uniform_int_distribution<int32_t> v_dist(0, kMax);
std::uniform_int_distribution<int32_t> q_dist(1, kMax);
std::uniform_int_distribution<int32_t> vSmallDist(0, 1000);
std::uniform_int_distribution<int32_t> qSmallDist(1, 1000);
std::uniform_int_distribution<int32_t> v_small_dist(0, 1000);
std::uniform_int_distribution<int32_t> q_small_dist(1, 1000);
for (int i = 0; i < 10000; ++i) {
auto q = qDist(rd);
auto v = vDist(rd);
auto qSmall = qSmallDist(rd);
auto vSmall = vSmallDist(rd);
auto q = q_dist(rd);
auto v = v_dist(rd);
auto q_small = q_small_dist(rd);
auto v_small = v_small_dist(rd);
// random value
compareDivMod(vSmall, qSmall);
compareDivMod(vSmall, q);
compareDivMod(v, qSmall);
compareDivMod(v, q);
CompareDivMod(v_small, q_small);
CompareDivMod(v_small, q);
CompareDivMod(v, q_small);
CompareDivMod(v, q);
// special values
compareDivMod(kMax, qSmall);
compareDivMod(0, qSmall);
compareDivMod(1, qSmall);
compareDivMod(kMax, q);
compareDivMod(0, q);
compareDivMod(1, q);
CompareDivMod(kMax, q_small);
CompareDivMod(0, q_small);
CompareDivMod(1, q_small);
CompareDivMod(kMax, q);
CompareDivMod(0, q);
CompareDivMod(1, q);
compareDivMod(vSmall, 1);
compareDivMod(vSmall, kMax);
compareDivMod(v, 1);
compareDivMod(v, kMax);
CompareDivMod(v_small, 1);
CompareDivMod(v_small, kMax);
CompareDivMod(v, 1);
CompareDivMod(v, kMax);
}
}
} // namespace caffe2
} // namespace caffe2

View file

@@ -13,6 +13,7 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/math_utils.h"
#if THRUST_VERSION >= 100800
@@ -71,15 +72,14 @@
template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
__global__ void RowwiseBinaryOpCUDAKenel(
const int rows,
const int cols,
const int size,
const FixedDivisor<int> cols,
const BinaryOperator op,
const TIn* A,
const TIn* B,
TOut* C) {
const int size = rows * cols;
CUDA_1D_KERNEL_LOOP(C_index, size) {
const int j = C_index % cols;
const int j = cols.Mod(C_index);
const int A_index = broadcast_1st ? j : C_index;
const int B_index = broadcast_1st ? C_index : j;
C[C_index] = op(A[A_index], B[B_index]);
@@ -88,15 +88,14 @@
template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
__global__ void ColwiseBinaryOpCUDAKenel(
const int rows,
const int cols,
const int size,
const FixedDivisor<int> cols,
const BinaryOperator op,
const TIn* A,
const TIn* B,
TOut* C) {
const int size = rows * cols;
CUDA_1D_KERNEL_LOOP(C_index, size) {
const int i = C_index / cols;
const int i = cols.Div(C_index);
const int A_index = broadcast_1st ? i : C_index;
const int B_index = broadcast_1st ? C_index : i;
C[C_index] = op(A[A_index], B[B_index]);
@@ -108,7 +107,7 @@ __global__ void BroadcastBinaryOpCUDAKernel(
const int size,
const SimpleArray<int, D> A_strides,
const SimpleArray<int, D> B_strides,
const SimpleArray<int, D> C_dims,
const SimpleArray<FixedDivisor<int>, D> C_dims,
const BinaryOperator op,
const TIn* A,
const TIn* B,
@@ -119,10 +118,10 @@ __global__ void BroadcastBinaryOpCUDAKernel(
int C_index_val = C_index;
#pragma unroll
for (int i = D - 1; i >= 0; --i) {
const int d = C_index_val % C_dims.data[i];
A_index += A_strides.data[i] == 0 ? 0 : d * A_strides.data[i];
B_index += B_strides.data[i] == 0 ? 0 : d * B_strides.data[i];
C_index_val /= C_dims.data[i];
int d;
C_dims.data[i].DivMod(C_index_val, &C_index_val, &d);
A_index += d * A_strides.data[i];
B_index += d * B_strides.data[i];
}
C[C_index] = op(A[A_index], B[B_index]);
}
@@ -144,20 +143,24 @@ void BinaryOpWith2DBroadcasting(
std::accumulate(dims, dims + pivot, 1, std::multiplies<int>());
const int cols =
std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies<int>());
if (rows == 0 || cols == 0) {
return;
}
const int size = rows * cols;
const FixedDivisor<int> cols_div(cols);
if (rowwise_broadcast) {
if (broadcast_1st) {
RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, true>
<<<CAFFE_GET_BLOCKS(size),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(rows, cols, op, A, B, C);
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
} else {
RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
<<<CAFFE_GET_BLOCKS(size),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(rows, cols, op, A, B, C);
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
}
} else {
if (broadcast_1st) {
@@ -165,13 +168,13 @@ void BinaryOpWith2DBroadcasting(
<<<CAFFE_GET_BLOCKS(size),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(rows, cols, op, A, B, C);
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
} else {
ColwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
<<<CAFFE_GET_BLOCKS(size),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(rows, cols, op, A, B, C);
context->cuda_stream()>>>(size, cols_div, op, A, B, C);
}
}
}
@@ -188,16 +191,19 @@ void BroadcastBinaryOpImpl(
CUDAContext* context) {
SimpleArray<int, D> A_strides_array;
SimpleArray<int, D> B_strides_array;
SimpleArray<int, D> C_dims_array;
SimpleArray<FixedDivisor<int>, D> C_dims_array;
int A_stride = 1;
int B_stride = 1;
for (int i = D - 1; i >= 0; --i) {
if (C_dims[i] == 0) {
return;
}
A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride;
B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride;
A_stride *= A_dims[i];
B_stride *= B_dims[i];
C_dims_array.data[i] = FixedDivisor<int>(C_dims[i]);
}
std::copy(C_dims, C_dims + D, C_dims_array.data);
const int size =
std::accumulate(C_dims, C_dims + D, 1, std::multiplies<int>());
BroadcastBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, D>
@@ -448,66 +454,82 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(
#undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION
#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
void Rowwise##Func<TIn, CUDAContext, true>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
const int size = rows * cols; \
RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \
} \
template <> \
void Rowwise##Func<TIn, CUDAContext, false>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
const int size = rows * cols; \
RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \
} \
template <> \
void Colwise##Func<TIn, CUDAContext, true>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
const int size = rows * cols; \
ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \
} \
template <> \
void Colwise##Func<TIn, CUDAContext, false>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
const int size = rows * cols; \
ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C); \
#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
template <> \
void Rowwise##Func<TIn, CUDAContext, true>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
if (rows == 0 || cols == 0) { \
return; \
} \
const int size = rows * cols; \
const FixedDivisor<int> cols_div(cols); \
RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
} \
template <> \
void Rowwise##Func<TIn, CUDAContext, false>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
if (rows == 0 || cols == 0) { \
return; \
} \
const int size = rows * cols; \
const FixedDivisor<int> cols_div(cols); \
RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
} \
template <> \
void Colwise##Func<TIn, CUDAContext, true>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
if (rows == 0 || cols == 0) { \
return; \
} \
const int size = rows * cols; \
const FixedDivisor<int> cols_div(cols); \
ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
} \
template <> \
void Colwise##Func<TIn, CUDAContext, false>( \
const int rows, \
const int cols, \
const TIn* A, \
const TIn* B, \
TOut* C, \
CUDAContext* context) { \
if (rows == 0 || cols == 0) { \
return; \
} \
const int size = rows * cols; \
const FixedDivisor<int> cols_div(cols); \
ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false> \
<<<CAFFE_GET_BLOCKS(size), \
CAFFE_CUDA_NUM_THREADS, \
0, \
context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
}
#define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \
@@ -2728,7 +2750,7 @@ __global__ void ReduceTensorCUDAKernel(
const int outer_size,
const int inner_size,
SimpleArray<int, D> X_strides,
SimpleArray<int, D> Y_dims,
SimpleArray<FixedDivisor<int>, D> Y_dims,
const Reducer reducer,
const T init,
const T* X,
@@ -2741,8 +2763,9 @@
int Y_index = i * inner_size + j;
#pragma unroll
for (int d = D - 1; d >= 0; --d) {
X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d];
Y_index /= Y_dims.data[d];
int r;
Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
X_index += r * X_strides.data[d];
}
#if __CUDA_ARCH__ >= 350
val = reducer(val, __ldg(X + X_index));
@@ -2770,10 +2793,10 @@
T* Y,
CUDAContext* context) {
SimpleArray<int, D> X_strides;
SimpleArray<int, D> Y_dims;
SimpleArray<FixedDivisor<int>, D> Y_dims;
utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
for (int i = 0; i < D; ++i) {
Y_dims.data[i] = dims[axes[i]];
Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
}
ReduceTensorCUDAKernel<T, Reducer, D>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -2810,7 +2833,7 @@ void ReduceTensorCUDA(
for (int i = pivot; i < num_dims; ++i) {
inner_size *= dims[transpose_axes[i]];
}
if (outer_size > 0) {
if (outer_size > 0 && inner_size > 0) {
if (transpose_axes[pivot] == pivot) {
RowwiseReduceKernel<T>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -2834,6 +2857,8 @@ void ReduceTensorCUDA(
X,
Y,
context);
} else if (outer_size > 0) {
math::Set<T, CUDAContext>(outer_size, init, Y, context);
}
}
@@ -3065,7 +3090,7 @@ __global__ void MomentsCUDAKernel(
const int outer_size,
const int inner_size,
SimpleArray<int, D> X_strides,
SimpleArray<int, D> Y_dims,
SimpleArray<FixedDivisor<int>, D> Y_dims,
const T* X,
T* mean,
T* variance) {
@@ -3078,9 +3103,10 @@ __global__ void MomentsCUDAKernel(
int X_index = 0;
int Y_index = i * inner_size + j;
#pragma unroll
for (int i = D - 1; i >= 0; --i) {
X_index += (Y_index % Y_dims.data[i]) * X_strides.data[i];
Y_index /= Y_dims.data[i];
for (int d = D - 1; d >= 0; --d) {
int r;
Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
X_index += r * X_strides.data[d];
}
#if __CUDA_ARCH__ >= 350
m_val += __ldg(X + X_index);
@@ -3111,10 +3137,10 @@
T* variance,
CUDAContext* context) {
SimpleArray<int, D> X_strides;
SimpleArray<int, D> Y_dims;
SimpleArray<FixedDivisor<int>, D> Y_dims;
utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
for (int i = 0; i < D; ++i) {
Y_dims.data[i] = dims[axes[i]];
Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
}
MomentsCUDAKernel<T, D>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -3147,7 +3173,7 @@ void MomentsCUDA(
for (int i = pivot; i < num_dims; ++i) {
inner_size *= dims[transpose_axes[i]];
}
if (outer_size > 0) {
if (outer_size > 0 && inner_size > 0) {
if (transpose_axes[pivot] == pivot) {
RowwiseMomentsCUDAKernel<T>
<<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -3197,7 +3223,7 @@ template <typename T, int D>
__global__ void TransposeCUDAKernel(
const int size,
const SimpleArray<int, D> X_strides,
const SimpleArray<int, D> Y_dims,
const SimpleArray<FixedDivisor<int>, D> Y_dims,
const T* X,
T* Y) {
CUDA_1D_KERNEL_LOOP(Y_index, size) {
@@ -3205,8 +3231,9 @@ __global__ void TransposeCUDAKernel(
int Y_index_val = Y_index;
#pragma unroll
for (int i = D - 1; i >= 0; --i) {
X_index += (Y_index_val % Y_dims.data[i]) * X_strides.data[i];
Y_index_val /= Y_dims.data[i];
int d;
Y_dims.data[i].DivMod(Y_index_val, &Y_index_val, &d);
X_index += d * X_strides.data[i];
}
#if __CUDA_ARCH__ >= 350
Y[Y_index] = __ldg(X + X_index);
@@ -3224,11 +3251,11 @@ void TransposeCUDAImpl(
T* Y,
CUDAContext* context) {
SimpleArray<int, D> X_strides;
SimpleArray<int, D> Y_dims;
SimpleArray<FixedDivisor<int>, D> Y_dims;
utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
int size = 1;
for (int i = 0; i < D; ++i) {
Y_dims.data[i] = dims[axes[i]];
Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
size *= dims[i];
}
TransposeCUDAKernel<T, D>
@@ -3240,17 +3267,23 @@ void TransposeCUDAImpl(
} // namespace
#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \
template <> \
void Transpose<T, CUDAContext>( \
const int ndim, \
const int* dims, \
const int* axes, \
const T* X, \
T* Y, \
CUDAContext* context) { \
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \
ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context); \
#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T) \
template <> \
void Transpose<T, CUDAContext>( \
const int ndim, \
const int* dims, \
const int* axes, \
const T* X, \
T* Y, \
CUDAContext* context) { \
if (utils::IsIdentityPermutation(ndim, axes)) { \
const int size = \
std::accumulate(dims, dims + ndim, 1, std::multiplies<int>()); \
context->template Copy<T, CUDAContext, CUDAContext>(size, X, Y); \
return; \
} \
DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1( \
ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context); \
}
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float)
CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double)

View file

@@ -30,6 +30,15 @@ int GetIndexFromDims(const int n, const int* dims, const int* index) {
return sum;
}
bool IsIdentityPermutation(const int n, const int* perm) {
for (int i = 0; i < n; ++i) {
if (perm[i] != i) {
return false;
}
}
return true;
}
void ComputeBroadcastBinaryOpDims(
const int A_ndim,
const int* A_dims,

View file

@@ -46,6 +46,9 @@ void IncreaseIndexInDims(const int n, const int* dims, int* index);
// Get index value from dims and index digits.
int GetIndexFromDims(const int n, const int* dims, const int* index);
// Checks if the input permutation is an identity permutation.
bool IsIdentityPermutation(const int n, const int* perm);
// Computes the broadcast binary operation dims.
void ComputeBroadcastBinaryOpDims(
const int A_ndim,

View file

@@ -77,7 +77,7 @@ static void compute_2d(const struct compute_2d_context* context, size_t linear_i
int q;
int r;
context->range_j.divMod((int) linear_index, q, r);
context->range_j.DivMod((int)linear_index, &q, &r);
context->function(context->argument, q, r);
}
@@ -121,7 +121,7 @@ static void compute_2d_tiled(const struct compute_2d_tiled_context* context, siz
int q;
int r;
context->tile_range_j.divMod(linear_index, q, r);
context->tile_range_j.DivMod(linear_index, &q, &r);
const size_t max_tile_i = context->tile_i;
const size_t max_tile_j = context->tile_j;
const size_t index_i = q * max_tile_i;