Use FixedDivisor in Reduce and Broadcast CUDA kernels (#9072)
Summary: Closes https://github.com/pytorch/pytorch/pull/9072

Use FixedDivisor in Reduce and Broadcast CUDA kernels

Reviewed By: houseroad

Differential Revision: D8710243

fbshipit-source-id: 6f1da12234898594a1be8c979d942aa515832aeb

This commit is contained in:
parent 90fd4df695
commit 03e7953a98
18 changed files with 463 additions and 470 deletions
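
Background for readers skimming the diff: a FixedDivisor precomputes a "magic" multiplier and shift for a fixed divisor d, so that n / d inside a kernel becomes one 64-bit multiply plus a shift instead of a hardware integer divide, which is slow on GPUs. The class touched below implements the signed 32-bit variant from Hacker's Delight, section 10; the following standalone sketch shows the simpler unsigned round-up form of the same trick and is illustrative only, not the code in this commit.

#include <cassert>
#include <cstdint>

// Sketch: division by a fixed positive divisor via multiply + shift.
// Valid for dividends n < 2^31; the FixedDivisor<int32_t> in this diff
// handles the general signed case.
struct PositiveDivisor {
  explicit PositiveDivisor(uint32_t d) : d_(d) {
    int log2_d = 0; // ceil(log2(d))
    while ((uint64_t{1} << log2_d) < d_) {
      ++log2_d;
    }
    shift_ = 31 + log2_d;
    magic_ = ((uint64_t{1} << shift_) + d_ - 1) / d_; // ceil(2^shift / d)
  }
  uint32_t Div(uint32_t n) const {
    return static_cast<uint32_t>((magic_ * n) >> shift_);
  }
  uint32_t Mod(uint32_t n) const {
    return n - d_ * Div(n); // remainder reuses the quotient: no divide
  }
  uint32_t d_;
  uint64_t magic_;
  int shift_;
};

int main() {
  const PositiveDivisor div7(7);
  for (uint32_t n = 0; n < (1u << 20); ++n) {
    assert(div7.Div(n) == n / 7 && div7.Mod(n) == n % 7);
  }
  return 0;
}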
@@ -10,18 +10,18 @@ namespace {

 template <typename T, class Compare, class Context>
 void ComputeArgImpl(
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
+    const int prev_size,
+    const int next_size,
+    const int n,
     const Compare& comp,
     const T* X,
     TIndex* Y,
     Context* context) {
   math::Set<TIndex, Context>(prev_size * next_size, TIndex(0), Y, context);
-  for (TIndex i = 0; i < prev_size; ++i) {
+  for (int i = 0; i < prev_size; ++i) {
     const T* cur_X = X + i * n * next_size + next_size;
-    for (TIndex k = 1; k < n; ++k) {
-      for (TIndex j = 0; j < next_size; ++j) {
+    for (int k = 1; k < n; ++k) {
+      for (int j = 0; j < next_size; ++j) {
         TIndex* cur_Y = Y + i * next_size + j;
         if (comp(*cur_X, X[i * n * next_size + *cur_Y * next_size + j])) {
           *cur_Y = k;

@@ -37,9 +37,9 @@ void ComputeArgImpl(
 template <>
 template <typename T>
 bool ArgMaxReducer<CPUContext>::operator()(
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
+    const int prev_size,
+    const int next_size,
+    const int n,
     const T* X,
     TIndex* Y,
     CPUContext* context) const {

@@ -50,9 +50,9 @@ bool ArgMaxReducer<CPUContext>::operator()(
 template <>
 template <typename T>
 bool ArgMinReducer<CPUContext>::operator()(
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
+    const int prev_size,
+    const int next_size,
+    const int n,
     const T* X,
     TIndex* Y,
     CPUContext* context) const {

@@ -157,14 +157,16 @@ Indices: [[1 0 0]

 )DOC")
     .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
-    .Output(0,
-        "Indices",
-        "*(type: Tensor`<float>`)* Tensor of indices for the largest values.")
+    .Output(
+        0,
+        "Indices",
+        "*(type: Tensor`<float>`)* Tensor of indices for the largest values.")
     .Arg("axis", "*(type: int; default: -1)* The axis to get argmax.")
-    .Arg("keepdims",
-        "*(type: bool; default: True)* If True (default), the output tensor "
-        "shape will match the input tensor shape except the `axis` dimension "
-        "equals 1. Else, the `axis` dimension of the output tensor is removed.");
+    .Arg(
+        "keepdims",
+        "*(type: bool; default: True)* If True (default), the output tensor "
+        "shape will match the input tensor shape except the `axis` dimension "
+        "equals 1. Else, the `axis` dimension of the output tensor is removed.");

 OPERATOR_SCHEMA(ArgMin)
     .NumInputs(1)

@@ -227,16 +229,18 @@ Indices: [[4]

 )DOC")
     .Input(0, "X", "*(type: Tensor`<float>`)* Input tensor.")
-    .Output(0,
-        "Indices",
-        "*(type: Tensor`<float>`)* Tensor of indices for the smallest values.")
+    .Output(
+        0,
+        "Indices",
+        "*(type: Tensor`<float>`)* Tensor of indices for the smallest values.")
     .Arg("axis", "*(type: int; default: -1)* The axis to get argmin.")
-    .Arg("keepdims",
-        "*(type: bool; default: True)* If True (default), the output tensor "
-        "shape will match the input tensor shape except the `axis` dimension "
-        "equals 1. Else, the `axis` dimension of the output tensor is removed.");
+    .Arg(
+        "keepdims",
+        "*(type: bool; default: True)* If True (default), the output tensor "
+        "shape will match the input tensor shape except the `axis` dimension "
+        "equals 1. Else, the `axis` dimension of the output tensor is removed.");

-NO_GRADIENT(ArgMax);
-NO_GRADIENT(ArgMin);
+SHOULD_NOT_DO_GRADIENT(ArgMax);
+SHOULD_NOT_DO_GRADIENT(ArgMin);

 } // namespace caffe2
@@ -7,37 +7,41 @@

 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/fixed_divisor.h"

 namespace caffe2 {

 namespace {

-template <typename T>
-using KeyValuePair = cub::KeyValuePair<TIndex, T>;
+template <typename K, typename V>
+using KeyValuePair = cub::KeyValuePair<K, V>;

-template <typename T>
-using BlockReduce = cub::BlockReduce<KeyValuePair<T>, CAFFE_CUDA_NUM_THREADS>;
+template <typename K, typename V>
+using BlockReduce =
+    cub::BlockReduce<KeyValuePair<K, V>, CAFFE_CUDA_NUM_THREADS>;

 template <typename T, class Reducer>
 __global__ void ComputeArgCUDAKernel(
-    const TIndex outer_size,
-    const TIndex inner_size,
-    const TIndex stride,
+    const int outer_size,
+    const int inner_size,
+    const FixedDivisor<int> stride,
     const Reducer reducer,
     const T init,
     const T* X,
     TIndex* Y) {
-  __shared__ typename BlockReduce<T>::TempStorage temp_storage;
-  for (TIndex idx = blockIdx.x; idx < outer_size; idx += gridDim.x) {
-    const TIndex i = idx / stride;
-    const TIndex j = idx % stride;
-    KeyValuePair<T> kv = {-1, init};
-    for (TIndex k = threadIdx.x; k < inner_size; k += blockDim.x) {
-      kv = reducer({k, X[i * inner_size * stride + k * stride + j]}, kv);
+  __shared__ typename BlockReduce<int, T>::TempStorage temp_storage;
+  const int d = stride.d();
+  for (int idx = blockIdx.x; idx < outer_size; idx += gridDim.x) {
+    int i;
+    int j;
+    stride.DivMod(idx, &i, &j);
+    KeyValuePair<int, T> kv = {-1, init};
+    for (int k = threadIdx.x; k < inner_size; k += blockDim.x) {
+      kv = reducer({k, X[i * inner_size * d + k * d + j]}, kv);
     }
-    kv = BlockReduce<T>(temp_storage).Reduce(kv, reducer);
+    kv = BlockReduce<int, T>(temp_storage).Reduce(kv, reducer);
     if (threadIdx.x == 0) {
-      Y[idx] = kv.key;
+      Y[idx] = static_cast<TIndex>(kv.key);
     }
     __syncthreads();
   }
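A note on the kv logic in the kernel above: each thread folds {index, value} candidates with cub::ArgMax or cub::ArgMin, now keyed by plain int instead of TIndex, and the winning key is cast back to TIndex only on the single write to Y. Below is a simplified host-side sketch of the fold; cub's real operators additionally break value ties toward the smaller index, which this sketch gets by visiting indices in ascending order.

#include <cassert>
#include <vector>

// {index, value} pair, standing in for cub::KeyValuePair<int, float>.
struct KV {
  int key;
  float value;
};

// Keep the pair with the larger value (simplified cub::ArgMax).
KV ArgMaxOp(const KV& a, const KV& b) {
  return b.value > a.value ? b : a;
}

int main() {
  const std::vector<float> x = {1.f, 5.f, 3.f, 5.f};
  KV kv = {-1, -1e30f}; // the {-1, init} seed used by the kernel
  for (int k = 0; k < static_cast<int>(x.size()); ++k) {
    kv = ArgMaxOp(kv, {k, x[k]});
  }
  assert(kv.key == 1); // first occurrence of the max wins
  return 0;
}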
@@ -48,21 +52,22 @@ __global__ void ComputeArgCUDAKernel(

 template <>
 template <typename T>
 bool ArgMaxReducer<CUDAContext>::operator()(
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
+    const int prev_size,
+    const int next_size,
+    const int n,
     const T* X,
     TIndex* Y,
     CUDAContext* context) const {
-  const TIndex outer_size = prev_size * next_size;
+  const int outer_size = prev_size * next_size;
+  const FixedDivisor<int> stride(next_size);
   ComputeArgCUDAKernel<<<
-      std::min(outer_size, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)),
+      std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      outer_size,
      n,
-      next_size,
+      stride,
      cub::ArgMax(),
      std::numeric_limits<T>::lowest(),
      X,

@@ -73,21 +78,22 @@ bool ArgMaxReducer<CUDAContext>::operator()(

 template <>
 template <typename T>
 bool ArgMinReducer<CUDAContext>::operator()(
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
+    const int prev_size,
+    const int next_size,
+    const int n,
     const T* X,
     TIndex* Y,
     CUDAContext* context) const {
-  const TIndex outer_size = prev_size * next_size;
+  const int outer_size = prev_size * next_size;
+  const FixedDivisor<int> stride(next_size);
   ComputeArgCUDAKernel<<<
-      std::min(outer_size, static_cast<TIndex>(CAFFE_MAXIMUM_NUM_BLOCKS)),
+      std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
      CAFFE_CUDA_NUM_THREADS,
      0,
      context->cuda_stream()>>>(
      outer_size,
      n,
-      next_size,
+      stride,
      cub::ArgMin(),
      std::numeric_limits<T>::max(),
      X,
@@ -37,11 +37,11 @@ class ArgOp final : public Operator<Context> {
     }
     CAFFE_ENFORCE_GE(axis_, 0);
     CAFFE_ENFORCE_LT(axis_, ndim);
-    const std::vector<TIndex>& X_dims = X.dims();
-    std::vector<TIndex> Y_dims;
+    const std::vector<int> X_dims(X.dims().cbegin(), X.dims().cend());
+    std::vector<int> Y_dims;
     Y_dims.reserve(ndim);
-    TIndex prev_size = 1;
-    TIndex next_size = 1;
+    int prev_size = 1;
+    int next_size = 1;
     for (int i = 0; i < axis_; ++i) {
       Y_dims.push_back(X_dims[i]);
       prev_size *= X_dims[i];

@@ -54,7 +54,7 @@ class ArgOp final : public Operator<Context> {
       next_size *= X_dims[i];
     }
     Y->Resize(Y_dims);
-    const TIndex n = X_dims[axis_];
+    const int n = X_dims[axis_];
     return reducer_(
         prev_size,
         next_size,

@@ -74,9 +74,9 @@ template <class Context>
 struct ArgMaxReducer {
   template <typename T>
   bool operator()(
-      const TIndex prev_size,
-      const TIndex next_size,
-      const TIndex n,
+      const int prev_size,
+      const int next_size,
+      const int n,
       const T* X,
       TIndex* Y,
       Context* context) const;

@@ -86,9 +86,9 @@ template <class Context>
 struct ArgMinReducer {
   template <typename T>
   bool operator()(
-      const TIndex prev_size,
-      const TIndex next_size,
-      const TIndex n,
+      const int prev_size,
+      const int next_size,
+      const int n,
       const T* X,
       TIndex* Y,
       Context* context) const;
@@ -1,82 +0,0 @@
-#ifndef CAFFE2_OPERATORS_ARG_OPS_EIGEN_H_
-#define CAFFE2_OPERATORS_ARG_OPS_EIGEN_H_
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/types.h"
-
-#include "Eigen/Core"
-
-#if EIGEN_VERSION_AT_LEAST(3, 3, 0)
-
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace caffe2 {
-namespace arg_ops_eigen {
-
-template <typename T>
-using EigenTensorMap1D = Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor>>;
-
-template <typename T>
-using EigenTensorMap2D = Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor>>;
-
-template <typename T>
-using EigenTensorMap3D = Eigen::TensorMap<Eigen::Tensor<T, 3, Eigen::RowMajor>>;
-
-template <class Device, typename T>
-void ComputeArgMaxEigen(
-    const Device& device,
-    const T* X,
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
-    TIndex* Y) {
-  if (next_size == 1) {
-    EigenTensorMap1D<TIndex>(Y, prev_size).device(device) =
-        EigenTensorMap2D<T>(const_cast<T*>(X), prev_size, n)
-            .argmax(1)
-            .template cast<TIndex>();
-  } else if (prev_size == 1) {
-    EigenTensorMap1D<TIndex>(Y, next_size).device(device) =
-        EigenTensorMap2D<T>(const_cast<T*>(X), n, next_size)
-            .argmax(0)
-            .template cast<TIndex>();
-  } else {
-    EigenTensorMap2D<TIndex>(Y, prev_size, next_size).device(device) =
-        EigenTensorMap3D<T>(const_cast<T*>(X), prev_size, n, next_size)
-            .argmax(1)
-            .template cast<TIndex>();
-  }
-}
-
-template <class Device, typename T>
-void ComputeArgMinEigen(
-    const Device& device,
-    const T* X,
-    const TIndex prev_size,
-    const TIndex next_size,
-    const TIndex n,
-    TIndex* Y) {
-  if (next_size == 1) {
-    EigenTensorMap1D<TIndex>(Y, prev_size).device(device) =
-        EigenTensorMap2D<T>(const_cast<T*>(X), prev_size, n)
-            .argmin(1)
-            .template cast<TIndex>();
-  } else if (prev_size == 1) {
-    EigenTensorMap1D<TIndex>(Y, next_size).device(device) =
-        EigenTensorMap2D<T>(const_cast<T*>(X), n, next_size)
-            .argmin(0)
-            .template cast<TIndex>();
-  } else {
-    EigenTensorMap2D<TIndex>(Y, prev_size, next_size).device(device) =
-        EigenTensorMap3D<T>(const_cast<T*>(X), prev_size, n, next_size)
-            .argmin(1)
-            .template cast<TIndex>();
-  }
-}
-
-} // namespace arg_ops_eigen
-} // namespace caffe2
-
-#endif // EIGEN_VERSION_AT_LEAST(3, 3, 0)
-
-#endif // CAFFE2_OPERATORS_ARG_OPS_EIGEN_H_
@@ -31,4 +31,4 @@ struct CbrtGradientFunctor {

 } // namespace caffe2

-#endif
+#endif // CAFFE2_OPERATORS_CBRT_OP_H_

@@ -384,7 +384,7 @@ void reinterleaveMultithreaded(
       size_t tileId) {
     int h;
     int c;
-    divOutputH.divMod((int)tileId, c, h);
+    divOutputH.DivMod((int)tileId, &c, &h);

     REINTERLEAVE(N);
   };
@@ -8,6 +8,7 @@

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/elementwise_ops_utils.h"
+#include "caffe2/utils/fixed_divisor.h"

 namespace caffe2 {

@@ -20,10 +21,10 @@ template <typename TGrad, typename TIn, int D>
 __global__ void ComputeDivAGradientCUDAKernel(
     const int outer_size,
     const int inner_size,
-    const SimpleArray<int, D> C_dims,
+    const SimpleArray<FixedDivisor<int>, D> C_dims,
     const SimpleArray<int, D> C_strides,
     const SimpleArray<int, D> B_strides,
-    const SimpleArray<int, D> A_dims,
+    const SimpleArray<FixedDivisor<int>, D> A_dims,
     const TGrad* dC,
     const TIn* B,
     TGrad* dA) {

@@ -36,17 +37,17 @@ __global__ void ComputeDivAGradientCUDAKernel(
     int A_index_val = A_index;
 #pragma unroll
     for (int d = D - 1; d >= 0; --d) {
-      C_index += (A_index_val % A_dims.data[d]) * C_strides.data[d];
-      A_index_val /= A_dims.data[d];
+      int r;
+      A_dims.data[d].DivMod(A_index_val, &A_index_val, &r);
+      C_index += r * C_strides.data[d];
     }
     int B_index = 0;
     int C_index_val = C_index;
 #pragma unroll
     for (int d = D - 1; d >= 0; --d) {
-      B_index += B_strides.data[d] == 0
-          ? 0
-          : (C_index_val % C_dims.data[d]) * B_strides.data[d];
-      C_index_val /= C_dims.data[d];
+      int r;
+      C_dims.data[d].DivMod(C_index_val, &C_index_val, &r);
+      B_index += r * B_strides.data[d];
     }
 #if __CUDA_ARCH__ >= 350
     sum += __ldg(dC + C_index) / __ldg(B + B_index);

@@ -83,7 +84,7 @@ __global__ void ComputeDivBGradientCUDAKernel(
     const int outer_size,
     const int inner_size,
     const SimpleArray<int, D> C_strides,
-    const SimpleArray<int, D> B_dims,
+    const SimpleArray<FixedDivisor<int>, D> B_dims,
     const TGrad* dC,
     const TIn* B,
     const TOut* C,

@@ -96,8 +97,9 @@ __global__ void ComputeDivBGradientCUDAKernel(
     int B_index = i * inner_size + j;
 #pragma unroll
     for (int d = D - 1; d >= 0; --d) {
-      C_index += (B_index % B_dims.data[d]) * C_strides.data[d];
-      B_index /= B_dims.data[d];
+      int r;
+      B_dims.data[d].DivMod(B_index, &B_index, &r);
+      C_index += r * C_strides.data[d];
     }
 #if __CUDA_ARCH__ >= 350
     sum += -__ldg(dC + C_index) * __ldg(C + C_index) / __ldg(B + i);

@@ -124,20 +126,20 @@ void ComputeDivAGradientCUDAImpl(
     const TIn* B,
     TGrad* dA,
     CUDAContext* context) {
-  SimpleArray<int, D> C_dims_arr;
+  SimpleArray<FixedDivisor<int>, D> C_dims_arr;
   SimpleArray<int, D> C_strides_arr;
   SimpleArray<int, D> B_strides_arr;
-  SimpleArray<int, D> A_dims_arr;
-  std::copy_n(C_dims, D, C_dims_arr.data);
+  SimpleArray<FixedDivisor<int>, D> A_dims_arr;
+  for (int i = 0; i < D; ++i) {
+    C_dims_arr.data[i] = FixedDivisor<int>(C_dims[i]);
+    A_dims_arr.data[i] = FixedDivisor<int>(C_dims[A_axes[i]]);
+  }
   math::utils::ComputeTransposedStrides(D, C_dims, A_axes, C_strides_arr.data);
   int cur_stride = 1;
   for (int i = D - 1; i >= 0; --i) {
     B_strides_arr.data[i] = B_dims[i] == 1 ? 0 : cur_stride;
     cur_stride *= B_dims[i];
   }
-  for (int i = 0; i < D; ++i) {
-    A_dims_arr.data[i] = C_dims[A_axes[i]];
-  }
   ComputeDivAGradientCUDAKernel<TGrad, TIn, D>
       <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
          CAFFE_CUDA_NUM_THREADS,

@@ -166,10 +168,10 @@ void ComputeDivBGradientCUDAImpl(
     TGrad* dB,
     CUDAContext* context) {
   SimpleArray<int, D> C_strides_arr;
-  SimpleArray<int, D> B_dims_arr;
+  SimpleArray<FixedDivisor<int>, D> B_dims_arr;
   math::utils::ComputeTransposedStrides(D, C_dims, B_axes, C_strides_arr.data);
   for (int i = 0; i < D; ++i) {
-    B_dims_arr.data[i] = C_dims[B_axes[i]];
+    B_dims_arr.data[i] = FixedDivisor<int>(C_dims[B_axes[i]]);
   }
   ComputeDivBGradientCUDAKernel<TGrad, TIn, TOut, D>
       <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),

@@ -202,7 +204,7 @@ void ComputeDivAGradientCUDA(
   for (int i = pivot; i < ndim; ++i) {
     inner_size *= C_dims[A_transpose_axes[i]];
   }
-  if (outer_size > 0) {
+  if (outer_size > 0 && inner_size > 0) {
     DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
         ndim,
         ComputeDivAGradientCUDAImpl,

@@ -217,6 +219,8 @@ void ComputeDivAGradientCUDA(
         B,
         dA,
         context);
+  } else if (outer_size > 0) {
+    math::Set<TGrad, CUDAContext>(outer_size, TGrad(0), dA, context);
   }
 }

@@ -242,7 +246,7 @@ void ComputeDivBGradientCUDA(
   for (int i = pivot; i < ndim; ++i) {
     inner_size *= C_dims[B_transpose_axes[i]];
   }
-  if (outer_size > 0) {
+  if (outer_size > 0 && inner_size > 0) {
     DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_3(
         ndim,
         ComputeDivBGradientCUDAImpl,

@@ -258,6 +262,8 @@ void ComputeDivBGradientCUDA(
         C,
         dB,
         context);
+  } else if (outer_size > 0) {
+    math::Set<TGrad, CUDAContext>(outer_size, TGrad(0), dB, context);
   }
 }
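The remapping loop above is the pattern this whole diff optimizes: peel one digit per dimension off a flat index, innermost first, and accumulate a strided offset into the other tensor, with one FixedDivisor::DivMod replacing the % and / pair per dimension. Here is a standalone sketch with plain operators standing in for DivMod; a broadcast dimension simply carries stride 0, which is also why the new kernels can drop the old strides[d] == 0 ? 0 : ... guard.

#include <cassert>

// Map a flat index through per-dimension digits onto strided storage.
// In the kernels, dims[d].DivMod(index, &index, &r) computes r and the
// updated index without hardware division.
int RemapIndex(int index, const int* dims, const int* strides, int ndim) {
  int out = 0;
  for (int d = ndim - 1; d >= 0; --d) {
    const int r = index % dims[d];
    index /= dims[d];
    out += r * strides[d];
  }
  return out;
}

int main() {
  // 2x3 row-major tensor: strides {3, 1} give the identity mapping.
  const int dims[2] = {2, 3};
  const int strides[2] = {3, 1};
  for (int i = 0; i < 6; ++i) {
    assert(RemapIndex(i, dims, strides, 2) == i);
  }
  // Broadcasting a dimension is stride 0: every row maps onto row 0.
  const int bcast[2] = {0, 1};
  for (int i = 0; i < 6; ++i) {
    assert(RemapIndex(i, dims, bcast, 2) == i % 3);
  }
  return 0;
}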
@@ -8,6 +8,7 @@

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/operators/elementwise_ops_utils.h"
+#include "caffe2/utils/fixed_divisor.h"

 namespace caffe2 {

@@ -20,10 +21,10 @@ template <typename TGrad, typename TIn, int D>
 __global__ void ComputeMulGradientCUDAKernel(
     const int outer_size,
     const int inner_size,
-    const SimpleArray<int, D> Y_dims,
+    const SimpleArray<FixedDivisor<int>, D> Y_dims,
     const SimpleArray<int, D> Y_strides,
     const SimpleArray<int, D> W_strides,
-    const SimpleArray<int, D> X_dims,
+    const SimpleArray<FixedDivisor<int>, D> X_dims,
     const TGrad* dY,
     const TIn* W,
     TGrad* dX) {

@@ -36,17 +37,17 @@ __global__ void ComputeMulGradientCUDAKernel(
     int X_index_val = X_index;
 #pragma unroll
     for (int d = D - 1; d >= 0; --d) {
-      Y_index += (X_index_val % X_dims.data[d]) * Y_strides.data[d];
-      X_index_val /= X_dims.data[d];
+      int r;
+      X_dims.data[d].DivMod(X_index_val, &X_index_val, &r);
+      Y_index += r * Y_strides.data[d];
     }
     int W_index = 0;
     int Y_index_val = Y_index;
 #pragma unroll
     for (int d = D - 1; d >= 0; --d) {
-      W_index += W_strides.data[d] == 0
-          ? 0
-          : (Y_index_val % Y_dims.data[d]) * W_strides.data[d];
-      Y_index_val /= Y_dims.data[d];
+      int r;
+      Y_dims.data[d].DivMod(Y_index_val, &Y_index_val, &r);
+      W_index += r * W_strides.data[d];
     }
 #if __CUDA_ARCH__ >= 350
     sum += __ldg(dY + Y_index) * __ldg(W + W_index);

@@ -73,20 +74,20 @@ void ComputeMulGradientCUDAImpl(
     const TIn* W,
     TGrad* dX,
     CUDAContext* context) {
-  SimpleArray<int, D> Y_dims_arr;
+  SimpleArray<FixedDivisor<int>, D> Y_dims_arr;
   SimpleArray<int, D> Y_strides_arr;
   SimpleArray<int, D> W_strides_arr;
-  SimpleArray<int, D> X_dims_arr;
-  std::copy_n(Y_dims, D, Y_dims_arr.data);
+  SimpleArray<FixedDivisor<int>, D> X_dims_arr;
+  for (int i = 0; i < D; ++i) {
+    Y_dims_arr.data[i] = FixedDivisor<int>(Y_dims[i]);
+    X_dims_arr.data[i] = FixedDivisor<int>(Y_dims[X_axes[i]]);
+  }
   math::utils::ComputeTransposedStrides(D, Y_dims, X_axes, Y_strides_arr.data);
   int cur_stride = 1;
   for (int i = D - 1; i >= 0; --i) {
     W_strides_arr.data[i] = W_dims[i] == 1 ? 0 : cur_stride;
     cur_stride *= W_dims[i];
   }
-  for (int i = 0; i < D; ++i) {
-    X_dims_arr.data[i] = Y_dims[X_axes[i]];
-  }
   ComputeMulGradientCUDAKernel<TGrad, TIn, D>
       <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),
          CAFFE_CUDA_NUM_THREADS,

@@ -126,7 +127,7 @@ void ComputeMulGradientCUDA(
   for (int i = pivot; i < ndim; ++i) {
     inner_size *= Y_dims[X_transpose_axes[i]];
   }
-  if (outer_size > 0) {
+  if (outer_size > 0 && inner_size > 0) {
     DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_2(
         ndim,
         ComputeMulGradientCUDAImpl,

@@ -141,6 +142,8 @@ void ComputeMulGradientCUDA(
         W,
         dX,
         context);
+  } else if (outer_size > 0) {
+    math::Set<TGrad, CUDAContext>(outer_size, TGrad(0), dX, context);
   }
 }
@@ -4,6 +4,7 @@
 #include <functional>

 #include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/fixed_divisor.h"

 namespace caffe2 {

@@ -11,35 +12,34 @@ namespace {

 template <typename T, int D>
 __global__ void ComputeMomentsGradientCUDAKernel(
-    const int dX_size,
-    const SimpleArray<int, D> dY_strides,
-    const SimpleArray<int, D> dX_dims,
+    const int X_size,
+    const SimpleArray<int, D> Y_strides,
+    const SimpleArray<FixedDivisor<int>, D> X_dims,
     const T scale,
     const T* dmean,
     const T* dvariance,
     const T* X,
     const T* mean,
     T* dX) {
-  CUDA_1D_KERNEL_LOOP(dX_index, dX_size) {
-    int dY_index = 0;
-    int dX_index_val = dX_index;
+  CUDA_1D_KERNEL_LOOP(X_index, X_size) {
+    int Y_index = 0;
+    int X_index_val = X_index;
 #pragma unroll
     for (int i = D - 1; i >= 0; --i) {
-      dY_index += dY_strides.data[i] == 0
-          ? 0
-          : (dX_index_val % dX_dims.data[i]) * dY_strides.data[i];
-      dX_index_val /= dX_dims.data[i];
+      int d;
+      X_dims.data[i].DivMod(X_index_val, &X_index_val, &d);
+      Y_index += d * Y_strides.data[i];
     }
 #if __CUDA_ARCH__ >= 350
-    dX[dX_index] =
-        (__ldg(dmean + dY_index) +
-         static_cast<T>(2) * (__ldg(X + dX_index) - __ldg(mean + dY_index)) *
-             __ldg(dvariance + dY_index)) *
+    dX[X_index] =
+        (__ldg(dmean + Y_index) +
+         static_cast<T>(2) * (__ldg(X + X_index) - __ldg(mean + Y_index)) *
+             __ldg(dvariance + Y_index)) *
         scale;
 #else
-    dX[dX_index] = (dmean[dY_index] +
-                    static_cast<T>(2) * (X[dX_index] - mean[dY_index]) *
-                        dvariance[dY_index]) *
+    dX[X_index] = (dmean[Y_index] +
+                   static_cast<T>(2) * (X[X_index] - mean[Y_index]) *
+                       dvariance[Y_index]) *
         scale;
 #endif
   }

@@ -47,35 +47,38 @@ __global__ void ComputeMomentsGradientCUDAKernel(

 template <typename T, int D>
 void ComputeMomentsGradientCUDAImpl(
-    const int* dY_dims,
-    const int* dX_dims,
+    const int* Y_dims,
+    const int* X_dims,
     const T* dmean,
     const T* dvariance,
     const T* X,
     const T* mean,
     T* dX,
     CUDAContext* context) {
-  SimpleArray<int, D> dY_strides_array;
-  SimpleArray<int, D> dX_dims_array;
+  SimpleArray<int, D> Y_strides_array;
+  SimpleArray<FixedDivisor<int>, D> X_dims_array;
   int cur_stride = 1;
   for (int i = D - 1; i >= 0; --i) {
-    dY_strides_array.data[i] = dY_dims[i] == 1 ? 0 : cur_stride;
-    dX_dims_array.data[i] = dX_dims[i];
-    cur_stride *= dY_dims[i];
+    if (X_dims[i] == 0) {
+      return;
+    }
+    Y_strides_array.data[i] = Y_dims[i] == 1 ? 0 : cur_stride;
+    X_dims_array.data[i] = FixedDivisor<int>(X_dims[i]);
+    cur_stride *= Y_dims[i];
   }
-  const int dY_size =
-      std::accumulate(dY_dims, dY_dims + D, 1, std::multiplies<int>());
-  const int dX_size =
-      std::accumulate(dX_dims, dX_dims + D, 1, std::multiplies<int>());
-  const T scale = static_cast<T>(dY_size) / static_cast<T>(dX_size);
+  const int Y_size =
+      std::accumulate(Y_dims, Y_dims + D, 1, std::multiplies<int>());
+  const int X_size =
+      std::accumulate(X_dims, X_dims + D, 1, std::multiplies<int>());
+  const T scale = static_cast<T>(Y_size) / static_cast<T>(X_size);
   ComputeMomentsGradientCUDAKernel<T, D>
-      <<<CAFFE_GET_BLOCKS(dX_size),
+      <<<CAFFE_GET_BLOCKS(X_size),
          CAFFE_CUDA_NUM_THREADS,
          0,
          context->cuda_stream()>>>(
-          dX_size,
-          dY_strides_array,
-          dX_dims_array,
+          X_size,
+          Y_strides_array,
+          X_dims_array,
          scale,
          dmean,
          dvariance,
@@ -6,6 +6,7 @@

 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context_gpu.h"
+#include "caffe2/utils/fixed_divisor.h"

 namespace caffe2 {

@@ -13,61 +14,63 @@ namespace {

 template <typename T, int D>
 __global__ void ComputeReduceMinMaxGradientCUDAKernel(
-    const int dX_size,
-    const SimpleArray<int, D> dY_strides,
-    const SimpleArray<int, D> dX_dims,
+    const int X_size,
+    const SimpleArray<int, D> Y_strides,
+    const SimpleArray<FixedDivisor<int>, D> X_dims,
     const T* dY_data,
     const T* X_data,
     const T* Y_data,
     T* dX_data) {
-  CUDA_1D_KERNEL_LOOP(dX_index, dX_size) {
-    int dY_index = 0;
-    int dX_index_val = dX_index;
+  CUDA_1D_KERNEL_LOOP(X_index, X_size) {
+    int Y_index = 0;
+    int X_index_val = X_index;
 #pragma unroll
     for (int i = D - 1; i >= 0; --i) {
-      dY_index += dY_strides.data[i] == 0
-          ? 0
-          : (dX_index_val % dX_dims.data[i]) * dY_strides.data[i];
-      dX_index_val /= dX_dims.data[i];
+      int d;
+      X_dims.data[i].DivMod(X_index_val, &X_index_val, &d);
+      Y_index += d * Y_strides.data[i];
    }
 #if __CUDA_ARCH__ >= 350
-    dX_data[dX_index] = __ldg(Y_data + dY_index) == __ldg(X_data + dX_index)
-        ? __ldg(dY_data + dY_index)
+    dX_data[X_index] = __ldg(Y_data + Y_index) == __ldg(X_data + X_index)
+        ? __ldg(dY_data + Y_index)
         : T(0);
 #else
-    dX_data[dX_index] =
-        Y_data[dY_index] == X_data[dX_index] ? dY_data[dY_index] : T(0);
+    dX_data[X_index] =
+        Y_data[Y_index] == X_data[X_index] ? dY_data[Y_index] : T(0);
 #endif
   }
 }

 template <typename T, int D>
 void ComputeReduceMinMaxGradientCUDAImpl(
-    const int* dY_dims,
-    const int* dX_dims,
+    const int* Y_dims,
+    const int* X_dims,
     const T* dY_data,
     const T* X_data,
     const T* Y_data,
     T* dX_data,
     CUDAContext* context) {
-  SimpleArray<int, D> dY_strides_array;
-  SimpleArray<int, D> dX_dims_array;
+  SimpleArray<int, D> Y_strides_array;
+  SimpleArray<FixedDivisor<int>, D> X_dims_array;
   int cur_stride = 1;
   for (int i = D - 1; i >= 0; --i) {
-    dY_strides_array.data[i] = dY_dims[i] == 1 ? 0 : cur_stride;
-    dX_dims_array.data[i] = dX_dims[i];
-    cur_stride *= dY_dims[i];
+    if (X_dims[i] == 0) {
+      return;
+    }
+    Y_strides_array.data[i] = Y_dims[i] == 1 ? 0 : cur_stride;
+    X_dims_array.data[i] = FixedDivisor<int>(X_dims[i]);
+    cur_stride *= Y_dims[i];
   }
-  const int dX_size =
-      std::accumulate(dX_dims, dX_dims + D, 1, std::multiplies<int>());
+  const int X_size =
+      std::accumulate(X_dims, X_dims + D, 1, std::multiplies<int>());
   ComputeReduceMinMaxGradientCUDAKernel<T, D>
-      <<<CAFFE_GET_BLOCKS(dX_size),
+      <<<CAFFE_GET_BLOCKS(X_size),
          CAFFE_CUDA_NUM_THREADS,
          0,
          context->cuda_stream()>>>(
-          dX_size,
-          dY_strides_array,
-          dX_dims_array,
+          X_size,
+          Y_strides_array,
+          X_dims_array,
          dY_data,
          X_data,
          Y_data,
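For context on the == test in the kernel above: the gradient of a min or max reduction flows only to the positions whose input equals the reduced output, and tied positions all receive the upstream gradient. A tiny host-side sketch of the rule applied per element:

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  const std::vector<float> x = {1.f, 4.f, 4.f, 2.f};
  const float y = 4.f;  // ReduceMax output over x
  const float dy = 1.f; // upstream gradient for that single output
  std::vector<float> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    dx[i] = x[i] == y ? dy : 0.f; // dX_data[X_index] in the kernel
  }
  assert(dx[0] == 0.f && dx[1] == 1.f && dx[2] == 1.f && dx[3] == 0.f);
  return 0;
}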
@@ -32,7 +32,16 @@ class TransposeOp final : public Operator<Context> {
   ~TransposeOp() = default;

   bool RunOnDevice() override {
+    // Do the actual transpose, which is implemented in DoRunWithType().
+    return DispatchHelper<TensorTypes<float, double, int, TIndex>>::call(
+        this, Input(0));
+  }
+
+ private:
+  template <typename T>
+  bool DoRunWithType() {
+    const auto& X = Input(0);
+    auto* Y = Output(0);
+    const int ndim = X.ndim();
     if (axes_.empty()) {
       axes_.resize(ndim);

@@ -40,17 +49,6 @@ class TransposeOp final : public Operator<Context> {
     } else {
       CAFFE_ENFORCE_EQ(ndim, axes_.size());
     }
-    // Do the actual transpose, which is implemented in DoRunWithType().
-    return DispatchHelper<TensorTypes<float, double, int, TIndex>>::call(
-        this, Input(0));
-  }
-
- protected:
-  template <typename T>
-  bool DoRunWithType() {
-    const auto& X = Input(0);
-    auto* Y = Output(0);
-    const int ndim = X.ndim();
     const std::vector<int> X_dims(X.dims().cbegin(), X.dims().cend());
     std::vector<int> Y_dims(ndim);
     for (int i = 0; i < ndim; ++i) {
@@ -76,9 +76,11 @@ class TestReduceOps(hu.HypothesisTestCase):
         self.run_reduce_op_test(
             "ReduceMax", X, keepdims, num_axes, np.max, gc, dc)

-    @given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(),
-           num_axes=st.integers(1, 4), **hu.gcs)
-    def test_reduce_sum(self, X, keepdims, num_axes, gc, dc):
+    @given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5),
+           t=st.integers(0, 5), keepdims=st.booleans(),
+           num_axes=st.integers(1, 3), **hu.gcs)
+    def test_reduce_sum(self, n, m, k, t, keepdims, num_axes, gc, dc):
+        X = np.random.randn(n, m, k, t).astype(np.float32)
         self.run_reduce_op_test(
             "ReduceSum", X, keepdims, num_axes, np.sum, gc, dc)

@@ -208,7 +210,8 @@ class TestReduceFrontReductions(hu.HypothesisTestCase):
         workspace.FeedBlob('X', not_empty_X)
         workspace.RunNet(workspace.GetNetName(net))
         output = workspace.FetchBlob('output')
-        np.testing.assert_allclose(output, ref_sum(not_empty_X)[0], atol=1e-3)
+        np.testing.assert_allclose(
+            output, ref_sum(not_empty_X)[0], atol=1e-3)

         workspace.FeedBlob('X', X)
         workspace.RunNet(workspace.GetNetName(net))
@@ -1,110 +1,115 @@
 #ifndef CAFFE2_UTILS_FIXED_DIVISOR_H_
 #define CAFFE2_UTILS_FIXED_DIVISOR_H_

-#include <cstdlib>
-#include <stdint.h>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+#ifdef __CUDA_ARCH__
+#define FIXED_DIVISOR_DECL inline __host__ __device__
+#else
+#define FIXED_DIVISOR_DECL inline
+#endif

 namespace caffe2 {

 // Utility class for quickly calculating quotients and remainders for
 // a known integer divisor
 template <typename T>
-class FixedDivisor {
-};
+class FixedDivisor {};

 // Works for any positive divisor, 1 to INT_MAX. One 64-bit
 // multiplication and one 64-bit shift is used to calculate the
 // result.
 template <>
-class FixedDivisor<int32_t> {
+class FixedDivisor<std::int32_t> {
  public:
-  FixedDivisor(int32_t d) : d_(d) {
-    calcSignedMagic();
+  FixedDivisor() = default;
+
+  explicit FixedDivisor(const std::int32_t d) : d_(d) {
+    CalcSignedMagic();
   }

-  uint64_t getMagic() const {
+  FIXED_DIVISOR_DECL std::int32_t d() const {
+    return d_;
+  }
+
+  FIXED_DIVISOR_DECL std::uint64_t magic() const {
     return magic_;
   }

-  int getShift() const {
+  FIXED_DIVISOR_DECL int shift() const {
     return shift_;
   }

   /// Calculates `q = n / d`.
-  inline int32_t div(int32_t n) const {
+  FIXED_DIVISOR_DECL std::int32_t Div(const std::int32_t n) const {
     // In lieu of a mulhi instruction being available, perform the
     // work in uint64
-    uint64_t mul64 = magic_ * (uint64_t) n;
-    return (int32_t) (mul64 >> shift_);
+    return (int32_t)((magic_ * (uint64_t)n) >> shift_);
   }

   /// Calculates `r = n % d`.
-  inline int32_t mod(int32_t n) const {
-    return n - d_ * div(n);
+  FIXED_DIVISOR_DECL std::int32_t Mod(const std::int32_t n) const {
+    return n - d_ * Div(n);
   }

   /// Calculates `q = n / d` and `r = n % d` together.
-  inline void divMod(int32_t n, int32_t& q, int32_t& r) const {
-    const int32_t quotient = div(n);
-    q = quotient;
-    r = n - d_ * quotient;
+  FIXED_DIVISOR_DECL void
+  DivMod(const std::int32_t n, std::int32_t* q, int32_t* r) const {
+    *q = Div(n);
+    *r = n - d_ * *q;
   }

  private:
-  /**
-    Calculates magic multiplicative value and shift amount for
-    calculating `q = n / d` for signed 32-bit integers.
-    Implementation taken from Hacker's Delight section 10.
-  */
-  void calcSignedMagic() {
+  // Calculates magic multiplicative value and shift amount for calculating `q =
+  // n / d` for signed 32-bit integers.
+  // Implementation taken from Hacker's Delight section 10.
+  void CalcSignedMagic() {
     if (d_ == 1) {
       magic_ = UINT64_C(0x1) << 32;
       shift_ = 32;
       return;
     }

-    const uint32_t two31 = UINT32_C(0x80000000);
-    uint32_t ad = std::abs(d_);
-    uint32_t t = two31 + ((uint32_t) d_ >> 31);
-    uint32_t anc = t - 1 - t % ad; // Absolute value of nc.
-    uint32_t p = 31; // Init. p.
-    uint32_t q1 = two31 / anc; // Init. q1 = 2**p/|nc|.
-    uint32_t r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|).
-    uint32_t q2 = two31 / ad; // Init. q2 = 2**p/|d|.
-    uint32_t r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|).
-    uint32_t delta = 0;
-
+    const std::uint32_t two31 = UINT32_C(0x80000000);
+    const std::uint32_t ad = std::abs(d_);
+    const std::uint32_t t = two31 + ((uint32_t)d_ >> 31);
+    const std::uint32_t anc = t - 1 - t % ad; // Absolute value of nc.
+    std::uint32_t p = 31; // Init. p.
+    std::uint32_t q1 = two31 / anc; // Init. q1 = 2**p/|nc|.
+    std::uint32_t r1 = two31 - q1 * anc; // Init. r1 = rem(2**p, |nc|).
+    std::uint32_t q2 = two31 / ad; // Init. q2 = 2**p/|d|.
+    std::uint32_t r2 = two31 - q2 * ad; // Init. r2 = rem(2**p, |d|).
+    std::uint32_t delta = 0;
     do {
-      p = p + 1;
-      q1 = 2 * q1; // Update q1 = 2**p/|nc|.
-      r1 = 2 * r1; // Update r1 = rem(2**p, |nc|).
-
-      if (r1 >= anc) { // (Must be an unsigned
-        q1 = q1 + 1; // comparison here).
-        r1 = r1 - anc;
+      ++p;
+      q1 <<= 1; // Update q1 = 2**p/|nc|.
+      r1 <<= 1; // Update r1 = rem(2**p, |nc|).
+      if (r1 >= anc) { // (Must be an unsigned
+        ++q1; // comparison here).
+        r1 -= anc;
       }
-
-      q2 = 2 * q2; // Update q2 = 2**p/|d|.
-      r2 = 2 * r2; // Update r2 = rem(2**p, |d|).
-
-      if (r2 >= ad) { // (Must be an unsigned
-        q2 = q2 + 1; // comparison here).
-        r2 = r2 - ad;
+      q2 <<= 1; // Update q2 = 2**p/|d|.
+      r2 <<= 1; // Update r2 = rem(2**p, |d|).
+      if (r2 >= ad) { // (Must be an unsigned
+        ++q2; // comparison here).
+        r2 -= ad;
       }
-
       delta = ad - r2;
     } while (q1 < delta || (q1 == delta && r1 == 0));

-    int32_t magic = q2 + 1;
+    std::int32_t magic = q2 + 1;
     if (d_ < 0) {
       magic = -magic;
     }
     shift_ = p;
-    magic_ = (uint64_t) (uint32_t) magic;
+    magic_ = (std::uint64_t)(std::uint32_t)magic;
   }

-  int32_t d_;
-  uint64_t magic_;
+  std::int32_t d_ = 1;
+  std::uint64_t magic_;
   int shift_;
 };
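Usage of the reworked header above, for reference: methods are CamelCase, DivMod takes pointer out-parameters, accessors d(), magic(), and shift() replace getMagic()/getShift(), and everything is marked FIXED_DIVISOR_DECL so it is callable from host and device alike. A small sketch that builds against the in-tree header:

#include <cassert>
#include <cstdint>

#include "caffe2/utils/fixed_divisor.h" // the header changed in this diff

int main() {
  const caffe2::FixedDivisor<std::int32_t> fd(7);
  assert(fd.d() == 7);     // the divisor is now queryable (see arg_ops.cu)
  assert(fd.Div(45) == 6); // one 64-bit multiply + shift
  assert(fd.Mod(45) == 3);
  std::int32_t q, r;
  fd.DivMod(45, &q, &r);   // quotient and remainder from one computation
  assert(q == 6 && r == 3);
  return 0;
}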
@@ -1,4 +1,5 @@
 #include "caffe2/utils/fixed_divisor.h"
+
 #include <gtest/gtest.h>

 #include <random>

@@ -7,73 +8,71 @@ namespace caffe2 {

 namespace {

-void compareDivMod(int32_t v, int32_t divisor) {
+void CompareDivMod(int32_t v, int32_t divisor) {
   auto fixed = FixedDivisor<int32_t>(divisor);

-  int nativeQ = v / divisor;
-  int nativeR = v % divisor;
+  int native_q = v / divisor;
+  int native_r = v % divisor;

-  int fixedQ = fixed.div(v);
-  int fixedR = fixed.mod(v);
+  int fixed_q = fixed.Div(v);
+  int fixed_r = fixed.Mod(v);

-  EXPECT_EQ(fixedQ, nativeQ) << v << " / " << divisor
-                             << " magic " << fixed.getMagic()
-                             << " shift " << fixed.getShift()
-                             << " quot " << fixedQ << " " << nativeQ;
+  EXPECT_EQ(native_q, fixed_q)
+      << v << " / " << divisor << " magic " << fixed.magic() << " shift "
+      << fixed.shift() << " quot " << fixed_q << " " << native_q;

-  EXPECT_EQ(fixedR, nativeR) << v << " / " << divisor
-                             << " magic " << fixed.getMagic()
-                             << " shift " << fixed.getShift()
-                             << " rem " << fixedR << " " << nativeR;
+  EXPECT_EQ(native_r, fixed_r)
+      << v << " / " << divisor << " magic " << fixed.magic() << " shift "
+      << fixed.shift() << " rem " << fixed_r << " " << native_r;
 }

-}
+} // namespace

-TEST(FixedDivisorTest, Test) {
+TEST(FixedDivisorTest, FixedDivisorInt32Test) {
   constexpr int32_t kMax = std::numeric_limits<int32_t>::max();

   // divide by 1
-  compareDivMod(kMax, 1);
-  compareDivMod(0, 1);
-  compareDivMod(1, 1);
+  CompareDivMod(kMax, 1);
+  CompareDivMod(0, 1);
+  CompareDivMod(1, 1);

   // divide by max
-  compareDivMod(kMax, kMax);
-  compareDivMod(0, kMax);
-  compareDivMod(1, kMax);
+  CompareDivMod(kMax, kMax);
+  CompareDivMod(0, kMax);
+  CompareDivMod(1, kMax);

   // divide by random positive values
   std::random_device rd;
-  std::uniform_int_distribution<int32_t> vDist(0, kMax);
-  std::uniform_int_distribution<int32_t> qDist(1, kMax);
+  std::uniform_int_distribution<int32_t> v_dist(0, kMax);
+  std::uniform_int_distribution<int32_t> q_dist(1, kMax);

-  std::uniform_int_distribution<int32_t> vSmallDist(0, 1000);
-  std::uniform_int_distribution<int32_t> qSmallDist(1, 1000);
+  std::uniform_int_distribution<int32_t> v_small_dist(0, 1000);
+  std::uniform_int_distribution<int32_t> q_small_dist(1, 1000);
   for (int i = 0; i < 10000; ++i) {
-    auto q = qDist(rd);
-    auto v = vDist(rd);
-    auto qSmall = qSmallDist(rd);
-    auto vSmall = vSmallDist(rd);
+    auto q = q_dist(rd);
+    auto v = v_dist(rd);
+    auto q_small = q_small_dist(rd);
+    auto v_small = v_small_dist(rd);

     // random value
-    compareDivMod(vSmall, qSmall);
-    compareDivMod(vSmall, q);
-    compareDivMod(v, qSmall);
-    compareDivMod(v, q);
+    CompareDivMod(v_small, q_small);
+    CompareDivMod(v_small, q);
+    CompareDivMod(v, q_small);
+    CompareDivMod(v, q);

     // special values
-    compareDivMod(kMax, qSmall);
-    compareDivMod(0, qSmall);
-    compareDivMod(1, qSmall);
-    compareDivMod(kMax, q);
-    compareDivMod(0, q);
-    compareDivMod(1, q);
+    CompareDivMod(kMax, q_small);
+    CompareDivMod(0, q_small);
+    CompareDivMod(1, q_small);
+    CompareDivMod(kMax, q);
+    CompareDivMod(0, q);
+    CompareDivMod(1, q);

-    compareDivMod(vSmall, 1);
-    compareDivMod(vSmall, kMax);
-    compareDivMod(v, 1);
-    compareDivMod(v, kMax);
+    CompareDivMod(v_small, 1);
+    CompareDivMod(v_small, kMax);
+    CompareDivMod(v, 1);
+    CompareDivMod(v, kMax);
   }
 }

 } // namespace caffe2
@@ -13,6 +13,7 @@

 #include "caffe2/core/context_gpu.h"
 #include "caffe2/utils/conversions.h"
+#include "caffe2/utils/fixed_divisor.h"
 #include "caffe2/utils/math_utils.h"

 #if THRUST_VERSION >= 100800

@@ -71,15 +72,14 @@ __global__ void SimpleBinaryOpCUDAKernel(

 template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
 __global__ void RowwiseBinaryOpCUDAKenel(
-    const int rows,
-    const int cols,
+    const int size,
+    const FixedDivisor<int> cols,
     const BinaryOperator op,
     const TIn* A,
     const TIn* B,
     TOut* C) {
-  const int size = rows * cols;
   CUDA_1D_KERNEL_LOOP(C_index, size) {
-    const int j = C_index % cols;
+    const int j = cols.Mod(C_index);
     const int A_index = broadcast_1st ? j : C_index;
     const int B_index = broadcast_1st ? C_index : j;
     C[C_index] = op(A[A_index], B[B_index]);

@@ -88,15 +88,14 @@ __global__ void RowwiseBinaryOpCUDAKenel(

 template <typename TIn, typename TOut, class BinaryOperator, bool broadcast_1st>
 __global__ void ColwiseBinaryOpCUDAKenel(
-    const int rows,
-    const int cols,
+    const int size,
+    const FixedDivisor<int> cols,
     const BinaryOperator op,
     const TIn* A,
     const TIn* B,
     TOut* C) {
-  const int size = rows * cols;
   CUDA_1D_KERNEL_LOOP(C_index, size) {
-    const int i = C_index / cols;
+    const int i = cols.Div(C_index);
     const int A_index = broadcast_1st ? i : C_index;
     const int B_index = broadcast_1st ? C_index : i;
     C[C_index] = op(A[A_index], B[B_index]);

@@ -108,7 +107,7 @@ __global__ void BroadcastBinaryOpCUDAKernel(
     const int size,
     const SimpleArray<int, D> A_strides,
     const SimpleArray<int, D> B_strides,
-    const SimpleArray<int, D> C_dims,
+    const SimpleArray<FixedDivisor<int>, D> C_dims,
     const BinaryOperator op,
     const TIn* A,
     const TIn* B,

@@ -119,10 +118,10 @@ __global__ void BroadcastBinaryOpCUDAKernel(
     int C_index_val = C_index;
 #pragma unroll
     for (int i = D - 1; i >= 0; --i) {
-      const int d = C_index_val % C_dims.data[i];
-      A_index += A_strides.data[i] == 0 ? 0 : d * A_strides.data[i];
-      B_index += B_strides.data[i] == 0 ? 0 : d * B_strides.data[i];
-      C_index_val /= C_dims.data[i];
+      int d;
+      C_dims.data[i].DivMod(C_index_val, &C_index_val, &d);
+      A_index += d * A_strides.data[i];
+      B_index += d * B_strides.data[i];
     }
     C[C_index] = op(A[A_index], B[B_index]);
   }

@@ -144,20 +143,24 @@ void BinaryOpWith2DBroadcasting(
       std::accumulate(dims, dims + pivot, 1, std::multiplies<int>());
   const int cols =
       std::accumulate(dims + pivot, dims + ndim, 1, std::multiplies<int>());
+  if (rows == 0 || cols == 0) {
+    return;
+  }
   const int size = rows * cols;
+  const FixedDivisor<int> cols_div(cols);
   if (rowwise_broadcast) {
     if (broadcast_1st) {
       RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, true>
           <<<CAFFE_GET_BLOCKS(size),
              CAFFE_CUDA_NUM_THREADS,
             0,
-             context->cuda_stream()>>>(rows, cols, op, A, B, C);
+             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
     } else {
       RowwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
           <<<CAFFE_GET_BLOCKS(size),
              CAFFE_CUDA_NUM_THREADS,
             0,
-             context->cuda_stream()>>>(rows, cols, op, A, B, C);
+             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
     }
   } else {
     if (broadcast_1st) {

@@ -165,13 +168,13 @@ void BinaryOpWith2DBroadcasting(
           <<<CAFFE_GET_BLOCKS(size),
              CAFFE_CUDA_NUM_THREADS,
              0,
-             context->cuda_stream()>>>(rows, cols, op, A, B, C);
+             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
     } else {
       ColwiseBinaryOpCUDAKenel<TIn, TOut, BinaryOperator, false>
           <<<CAFFE_GET_BLOCKS(size),
              CAFFE_CUDA_NUM_THREADS,
              0,
-             context->cuda_stream()>>>(rows, cols, op, A, B, C);
+             context->cuda_stream()>>>(size, cols_div, op, A, B, C);
     }
   }
 }

@@ -188,16 +191,19 @@ void BroadcastBinaryOpImpl(
     CUDAContext* context) {
   SimpleArray<int, D> A_strides_array;
   SimpleArray<int, D> B_strides_array;
-  SimpleArray<int, D> C_dims_array;
+  SimpleArray<FixedDivisor<int>, D> C_dims_array;
   int A_stride = 1;
   int B_stride = 1;
   for (int i = D - 1; i >= 0; --i) {
+    if (C_dims[i] == 0) {
+      return;
+    }
     A_strides_array.data[i] = A_dims[i] == 1 ? 0 : A_stride;
     B_strides_array.data[i] = B_dims[i] == 1 ? 0 : B_stride;
     A_stride *= A_dims[i];
     B_stride *= B_dims[i];
+    C_dims_array.data[i] = FixedDivisor<int>(C_dims[i]);
   }
-  std::copy(C_dims, C_dims + D, C_dims_array.data);
   const int size =
       std::accumulate(C_dims, C_dims + D, 1, std::multiplies<int>());
   BroadcastBinaryOpCUDAKernel<TIn, TOut, BinaryOperator, D>
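On the two broadcast layouts handled above: for C = op(A, B) with B broadcast along rows, B is indexed by the column j = C_index % cols; with B broadcast along columns, by the row i = C_index / cols. The kernels now receive cols as a FixedDivisor<int> so Mod and Div avoid hardware division; this sketch uses the plain operators for clarity.

#include <cassert>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  const std::vector<int> a = {10, 20, 30, 40, 50, 60}; // 2x3, row-major
  const std::vector<int> b_row = {1, 2, 3};  // one value per column
  const std::vector<int> b_col = {100, 200}; // one value per row
  std::vector<int> c(rows * cols);
  for (int idx = 0; idx < rows * cols; ++idx) {
    const int j = idx % cols; // RowwiseBinaryOpCUDAKenel: cols.Mod(idx)
    c[idx] = a[idx] + b_row[j];
  }
  assert(c[4] == 50 + 2);
  for (int idx = 0; idx < rows * cols; ++idx) {
    const int i = idx / cols; // ColwiseBinaryOpCUDAKenel: cols.Div(idx)
    c[idx] = a[idx] + b_col[i];
  }
  assert(c[4] == 50 + 200);
  return 0;
}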
@@ -448,66 +454,82 @@ DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION(

 #undef DELEGATE_SIMPLE_CUDA_BINARY_FUNCTION

-#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op) \
-  template <>                                                           \
-  void Rowwise##Func<TIn, CUDAContext, true>(                           \
-      const int rows,                                                   \
-      const int cols,                                                   \
-      const TIn* A,                                                     \
-      const TIn* B,                                                     \
-      TOut* C,                                                          \
-      CUDAContext* context) {                                           \
-    const int size = rows * cols;                                       \
-    RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true>                  \
-        <<<CAFFE_GET_BLOCKS(size),                                      \
-           CAFFE_CUDA_NUM_THREADS,                                      \
-           0,                                                           \
-           context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C);   \
-  }                                                                     \
-  template <>                                                           \
-  void Rowwise##Func<TIn, CUDAContext, false>(                          \
-      const int rows,                                                   \
-      const int cols,                                                   \
-      const TIn* A,                                                     \
-      const TIn* B,                                                     \
-      TOut* C,                                                          \
-      CUDAContext* context) {                                           \
-    const int size = rows * cols;                                       \
-    RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false>                 \
-        <<<CAFFE_GET_BLOCKS(size),                                      \
-           CAFFE_CUDA_NUM_THREADS,                                      \
-           0,                                                           \
-           context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C);   \
-  }                                                                     \
-  template <>                                                           \
-  void Colwise##Func<TIn, CUDAContext, true>(                           \
-      const int rows,                                                   \
-      const int cols,                                                   \
-      const TIn* A,                                                     \
-      const TIn* B,                                                     \
-      TOut* C,                                                          \
-      CUDAContext* context) {                                           \
-    const int size = rows * cols;                                       \
-    ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true>                  \
-        <<<CAFFE_GET_BLOCKS(size),                                      \
-           CAFFE_CUDA_NUM_THREADS,                                      \
-           0,                                                           \
-           context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C);   \
-  }                                                                     \
-  template <>                                                           \
-  void Colwise##Func<TIn, CUDAContext, false>(                          \
-      const int rows,                                                   \
-      const int cols,                                                   \
-      const TIn* A,                                                     \
-      const TIn* B,                                                     \
-      TOut* C,                                                          \
-      CUDAContext* context) {                                           \
-    const int size = rows * cols;                                       \
-    ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false>                 \
-        <<<CAFFE_GET_BLOCKS(size),                                      \
-           CAFFE_CUDA_NUM_THREADS,                                      \
-           0,                                                           \
-           context->cuda_stream()>>>(rows, cols, Op<TIn>(), A, B, C);   \
+#define DELEGATE_2D_BROADCAST_CUDA_BINARY_FUNCTION(TIn, TOut, Func, Op)   \
+  template <>                                                             \
+  void Rowwise##Func<TIn, CUDAContext, true>(                             \
+      const int rows,                                                     \
+      const int cols,                                                     \
+      const TIn* A,                                                       \
+      const TIn* B,                                                       \
+      TOut* C,                                                            \
+      CUDAContext* context) {                                             \
+    if (rows == 0 || cols == 0) {                                         \
+      return;                                                             \
+    }                                                                     \
+    const int size = rows * cols;                                         \
+    const FixedDivisor<int> cols_div(cols);                               \
+    RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true>                    \
+        <<<CAFFE_GET_BLOCKS(size),                                        \
+           CAFFE_CUDA_NUM_THREADS,                                        \
+           0,                                                             \
+           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
+  }                                                                       \
+  template <>                                                             \
+  void Rowwise##Func<TIn, CUDAContext, false>(                            \
+      const int rows,                                                     \
+      const int cols,                                                     \
+      const TIn* A,                                                       \
+      const TIn* B,                                                       \
+      TOut* C,                                                            \
+      CUDAContext* context) {                                             \
+    if (rows == 0 || cols == 0) {                                         \
+      return;                                                             \
+    }                                                                     \
+    const int size = rows * cols;                                         \
+    const FixedDivisor<int> cols_div(cols);                               \
+    RowwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false>                   \
+        <<<CAFFE_GET_BLOCKS(size),                                        \
+           CAFFE_CUDA_NUM_THREADS,                                        \
+           0,                                                             \
+           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
+  }                                                                       \
+  template <>                                                             \
+  void Colwise##Func<TIn, CUDAContext, true>(                             \
+      const int rows,                                                     \
+      const int cols,                                                     \
+      const TIn* A,                                                       \
+      const TIn* B,                                                       \
+      TOut* C,                                                            \
+      CUDAContext* context) {                                             \
+    if (rows == 0 || cols == 0) {                                         \
+      return;                                                             \
+    }                                                                     \
+    const int size = rows * cols;                                         \
+    const FixedDivisor<int> cols_div(cols);                               \
+    ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, true>                    \
+        <<<CAFFE_GET_BLOCKS(size),                                        \
+           CAFFE_CUDA_NUM_THREADS,                                        \
+           0,                                                             \
+           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
+  }                                                                       \
+  template <>                                                             \
+  void Colwise##Func<TIn, CUDAContext, false>(                            \
+      const int rows,                                                     \
+      const int cols,                                                     \
+      const TIn* A,                                                       \
+      const TIn* B,                                                       \
+      TOut* C,                                                            \
+      CUDAContext* context) {                                             \
+    if (rows == 0 || cols == 0) {                                         \
+      return;                                                             \
+    }                                                                     \
+    const int size = rows * cols;                                         \
+    const FixedDivisor<int> cols_div(cols);                               \
+    ColwiseBinaryOpCUDAKenel<TIn, TOut, Op<TIn>, false>                   \
+        <<<CAFFE_GET_BLOCKS(size),                                        \
+           CAFFE_CUDA_NUM_THREADS,                                        \
+           0,                                                             \
+           context->cuda_stream()>>>(size, cols_div, Op<TIn>(), A, B, C); \
   }

 #define DEFINE_2D_BROADCAST_CUDA_COMPARE_FUNCTION(Func, Op) \
@@ -2728,7 +2750,7 @@ __global__ void ReduceTensorCUDAKernel(
     const int outer_size,
     const int inner_size,
     SimpleArray<int, D> X_strides,
-    SimpleArray<int, D> Y_dims,
+    SimpleArray<FixedDivisor<int>, D> Y_dims,
     const Reducer reducer,
     const T init,
     const T* X,

@@ -2741,8 +2763,9 @@ __global__ void ReduceTensorCUDAKernel(
     int Y_index = i * inner_size + j;
 #pragma unroll
     for (int d = D - 1; d >= 0; --d) {
-      X_index += (Y_index % Y_dims.data[d]) * X_strides.data[d];
-      Y_index /= Y_dims.data[d];
+      int r;
+      Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
+      X_index += r * X_strides.data[d];
     }
 #if __CUDA_ARCH__ >= 350
     val = reducer(val, __ldg(X + X_index));

@@ -2770,10 +2793,10 @@ void ReduceTensorCUDAImpl(
     T* Y,
     CUDAContext* context) {
   SimpleArray<int, D> X_strides;
-  SimpleArray<int, D> Y_dims;
+  SimpleArray<FixedDivisor<int>, D> Y_dims;
   utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
   for (int i = 0; i < D; ++i) {
-    Y_dims.data[i] = dims[axes[i]];
+    Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
   }
   ReduceTensorCUDAKernel<T, Reducer, D>
       <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),

@@ -2810,7 +2833,7 @@ void ReduceTensorCUDA(
   for (int i = pivot; i < num_dims; ++i) {
     inner_size *= dims[transpose_axes[i]];
   }
-  if (outer_size > 0) {
+  if (outer_size > 0 && inner_size > 0) {
     if (transpose_axes[pivot] == pivot) {
       RowwiseReduceKernel<T>
           <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),

@@ -2834,6 +2857,8 @@ void ReduceTensorCUDA(
         X,
         Y,
         context);
+  } else if (outer_size > 0) {
+    math::Set<T, CUDAContext>(outer_size, init, Y, context);
   }
 }

@@ -3065,7 +3090,7 @@ __global__ void MomentsCUDAKernel(
     const int outer_size,
     const int inner_size,
     SimpleArray<int, D> X_strides,
-    SimpleArray<int, D> Y_dims,
+    SimpleArray<FixedDivisor<int>, D> Y_dims,
     const T* X,
     T* mean,
     T* variance) {

@@ -3078,9 +3103,10 @@ __global__ void MomentsCUDAKernel(
     int X_index = 0;
     int Y_index = i * inner_size + j;
 #pragma unroll
-    for (int i = D - 1; i >= 0; --i) {
-      X_index += (Y_index % Y_dims.data[i]) * X_strides.data[i];
-      Y_index /= Y_dims.data[i];
+    for (int d = D - 1; d >= 0; --d) {
+      int r;
+      Y_dims.data[d].DivMod(Y_index, &Y_index, &r);
+      X_index += r * X_strides.data[d];
     }
 #if __CUDA_ARCH__ >= 350
     m_val += __ldg(X + X_index);

@@ -3111,10 +3137,10 @@ void MomentsCUDAImpl(
     T* variance,
     CUDAContext* context) {
   SimpleArray<int, D> X_strides;
-  SimpleArray<int, D> Y_dims;
+  SimpleArray<FixedDivisor<int>, D> Y_dims;
   utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
   for (int i = 0; i < D; ++i) {
-    Y_dims.data[i] = dims[axes[i]];
+    Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
   }
   MomentsCUDAKernel<T, D>
       <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),

@@ -3147,7 +3173,7 @@ void MomentsCUDA(
   for (int i = pivot; i < num_dims; ++i) {
     inner_size *= dims[transpose_axes[i]];
   }
-  if (outer_size > 0) {
+  if (outer_size > 0 && inner_size > 0) {
     if (transpose_axes[pivot] == pivot) {
       RowwiseMomentsCUDAKernel<T>
           <<<std::min(outer_size, CAFFE_MAXIMUM_NUM_BLOCKS),

@@ -3197,7 +3223,7 @@ template <typename T, int D>
 __global__ void TransposeCUDAKernel(
     const int size,
     const SimpleArray<int, D> X_strides,
-    const SimpleArray<int, D> Y_dims,
+    const SimpleArray<FixedDivisor<int>, D> Y_dims,
     const T* X,
     T* Y) {
   CUDA_1D_KERNEL_LOOP(Y_index, size) {

@@ -3205,8 +3231,9 @@ __global__ void TransposeCUDAKernel(
     int Y_index_val = Y_index;
 #pragma unroll
     for (int i = D - 1; i >= 0; --i) {
-      X_index += (Y_index_val % Y_dims.data[i]) * X_strides.data[i];
-      Y_index_val /= Y_dims.data[i];
+      int d;
+      Y_dims.data[i].DivMod(Y_index_val, &Y_index_val, &d);
+      X_index += d * X_strides.data[i];
     }
 #if __CUDA_ARCH__ >= 350
     Y[Y_index] = __ldg(X + X_index);

@@ -3224,11 +3251,11 @@ void TransposeCUDAImpl(
     T* Y,
     CUDAContext* context) {
   SimpleArray<int, D> X_strides;
-  SimpleArray<int, D> Y_dims;
+  SimpleArray<FixedDivisor<int>, D> Y_dims;
   utils::ComputeTransposedStrides(D, dims, axes, X_strides.data);
   int size = 1;
   for (int i = 0; i < D; ++i) {
-    Y_dims.data[i] = dims[axes[i]];
+    Y_dims.data[i] = FixedDivisor<int>(dims[axes[i]]);
     size *= dims[i];
   }
   TransposeCUDAKernel<T, D>

@@ -3240,17 +3267,23 @@ void TransposeCUDAImpl(

 } // namespace

-#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T)                    \
-  template <>                                                   \
-  void Transpose<T, CUDAContext>(                               \
-      const int ndim,                                           \
-      const int* dims,                                          \
-      const int* axes,                                          \
-      const T* X,                                               \
-      T* Y,                                                     \
-      CUDAContext* context) {                                   \
-    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(                     \
-        ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context); \
+#define CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(T)                               \
+  template <>                                                              \
+  void Transpose<T, CUDAContext>(                                          \
+      const int ndim,                                                      \
+      const int* dims,                                                     \
+      const int* axes,                                                     \
+      const T* X,                                                          \
+      T* Y,                                                                \
+      CUDAContext* context) {                                              \
+    if (utils::IsIdentityPermutation(ndim, axes)) {                        \
+      const int size =                                                     \
+          std::accumulate(dims, dims + ndim, 1, std::multiplies<int>());   \
+      context->template Copy<T, CUDAContext, CUDAContext>(size, X, Y);     \
+      return;                                                              \
+    }                                                                      \
+    DISPATCH_FUNCTION_BY_VALUE_WITH_TYPE_1(                                \
+        ndim, TransposeCUDAImpl, T, dims, axes, X, Y, context);            \
   }
 CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(float)
 CAFFE2_SPECIALIZED_CUDA_TRANSPOSE(double)
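The new Transpose fast path above skips the kernel dispatch entirely when axes is the identity permutation, since the transpose is then a plain copy. Below is a standalone emulation using the IsIdentityPermutation helper added just after this note, with std::copy standing in for the device-to-device Copy in the macro:

#include <algorithm>
#include <cassert>
#include <functional>
#include <numeric>
#include <vector>

// Same logic as the helper added to the math utils below.
bool IsIdentityPermutation(const int n, const int* perm) {
  for (int i = 0; i < n; ++i) {
    if (perm[i] != i) {
      return false;
    }
  }
  return true;
}

int main() {
  const int dims[3] = {2, 3, 4};
  const int axes[3] = {0, 1, 2}; // identity: no data movement required
  if (IsIdentityPermutation(3, axes)) {
    const int size =
        std::accumulate(dims, dims + 3, 1, std::multiplies<int>());
    std::vector<float> x(size, 1.0f), y(size);
    std::copy(x.begin(), x.end(), y.begin()); // kernel launch avoided
    assert(y == x);
  }
  return 0;
}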
@@ -30,6 +30,15 @@ int GetIndexFromDims(const int n, const int* dims, const int* index) {
   return sum;
 }

+bool IsIdentityPermutation(const int n, const int* perm) {
+  for (int i = 0; i < n; ++i) {
+    if (perm[i] != i) {
+      return false;
+    }
+  }
+  return true;
+}
+
 void ComputeBroadcastBinaryOpDims(
     const int A_ndim,
     const int* A_dims,

@@ -46,6 +46,9 @@ void IncreaseIndexInDims(const int n, const int* dims, int* index);
 // Get index value from dims and index digits.
 int GetIndexFromDims(const int n, const int* dims, const int* index);

+// Checks if the input permutation is an identity permutation.
+bool IsIdentityPermutation(const int n, const int* perm);
+
 // Computes the broadcast binary operation dims.
 void ComputeBroadcastBinaryOpDims(
     const int A_ndim,
@@ -77,7 +77,7 @@ static void compute_2d(const struct compute_2d_context* context, size_t linear_index) {

   int q;
   int r;
-  context->range_j.divMod((int) linear_index, q, r);
+  context->range_j.DivMod((int)linear_index, &q, &r);
   context->function(context->argument, q, r);
 }

@@ -121,7 +121,7 @@ static void compute_2d_tiled(const struct compute_2d_tiled_context* context, size_t linear_index) {
   int q;
   int r;

-  context->tile_range_j.divMod(linear_index, q, r);
+  context->tile_range_j.DivMod(linear_index, &q, &r);
   const size_t max_tile_i = context->tile_i;
   const size_t max_tile_j = context->tile_j;
   const size_t index_i = q * max_tile_i;