#include "caffe2/operators/elementwise_ops.h"
|
|
|
|
#include "caffe2/utils/cub_namespace.cuh"
|
|
#include <cub/block/block_load.cuh>
|
|
#include <cub/block/block_reduce.cuh>
|
|
#include <cub/device/device_reduce.cuh>
|
|
|
|
#include "caffe2/core/common_gpu.h"
|
|
#include "caffe2/core/context_gpu.h"
|
|
#include "caffe2/utils/conversions.h"
|
|
|
|
#ifdef __HIPCC__
|
|
#if TORCH_HIP_VERSION < 210
|
|
// rocblas doesn't fully support fp16 yet
|
|
#define ROCBLAS_FP16 0
|
|
#endif
|
|
#endif
|
|
|
|
namespace caffe2 {

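// Unary elementwise ops: Not is registered over bool inputs only, Sign over
// all numeric input types.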
REGISTER_CUDA_OPERATOR(
    Not,
    UnaryElementwiseOp<BoolTypes, CUDAContext, NotFunctor<CUDAContext>>);
REGISTER_CUDA_OPERATOR(
    Sign,
    UnaryElementwiseOp<NumericTypes, CUDAContext, SignFunctor<CUDAContext>>);

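// Comparison ops (EQ/NE/LT/LE/GT/GE): each expands to a BinaryElementwiseOp
// over bool/int/float/double inputs whose output type is pinned to bool via
// FixedType<bool>; Op##Functor pastes the op name onto its functor type.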
#define REGISTER_CUDA_COMPARE_OPERATOR(Op)                    \
  REGISTER_CUDA_OPERATOR(                                     \
      Op,                                                     \
      BinaryElementwiseOp<                                    \
          TensorTypes<bool, int32_t, int64_t, float, double>, \
          CUDAContext,                                        \
          Op##Functor<CUDAContext>,                           \
          FixedType<bool>>)

REGISTER_CUDA_COMPARE_OPERATOR(EQ);
REGISTER_CUDA_COMPARE_OPERATOR(NE);
REGISTER_CUDA_COMPARE_OPERATOR(LT);
REGISTER_CUDA_COMPARE_OPERATOR(LE);
REGISTER_CUDA_COMPARE_OPERATOR(GT);
REGISTER_CUDA_COMPARE_OPERATOR(GE);

#undef REGISTER_CUDA_COMPARE_OPERATOR

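// Logical binary ops (And/Or/Xor) take bool inputs and produce bool outputs,
// so no output-type override is needed.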
#define REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Op) \
  REGISTER_CUDA_OPERATOR(                         \
      Op,                                         \
      BinaryElementwiseOp<BoolTypes, CUDAContext, Op##Functor<CUDAContext>>)

REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(And);
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Or);
REGISTER_CUDA_LOGICAL_BINARY_OPERATOR(Xor);

#undef REGISTER_CUDA_LOGICAL_BINARY_OPERATOR

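// Bitwise binary ops are additionally defined over the integer types
// (IntBoolTypes covers bool and the integral tensor types).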
#define REGISTER_CUDA_BITWISE_BINARY_OPERATOR(Op) \
  REGISTER_CUDA_OPERATOR(                         \
      Op,                                         \
      BinaryElementwiseOp<                        \
          IntBoolTypes,                           \
          CUDAContext,                            \
          Op##Functor<CUDAContext>>)

REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseAnd);
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseOr);
REGISTER_CUDA_BITWISE_BINARY_OPERATOR(BitwiseXor);

#undef REGISTER_CUDA_BITWISE_BINARY_OPERATOR

namespace {

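// Kernel for the post == 1 broadcast case: the input is viewed as a (pre, N)
// matrix and each thread sums one column. Accumulation happens in float
// regardless of T to limit rounding error for half-precision inputs.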
template <typename T>
__global__ void
reduce_sum_like_post1(const T* g_idata, T* g_odata, int pre, int N) {
  int n = blockIdx.x * blockDim.x + threadIdx.x;
  if (n >= N) {
    return;
  }

  float sum = 0.0;
  for (int i = 0; i < pre; ++i) {
    sum += convert::To<T, float>(g_idata[i * N + n]);
  }

  g_odata[n] = convert::To<float, T>(sum);
}

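// Full reduction of N elements to a single value using the standard two-pass
// CUB pattern: the first DeviceReduce::Sum call (null workspace pointer) only
// queries the scratch size, the second call performs the reduction. The
// scratch space is carved out of `buffer`, rounding the byte count up to a
// whole number of T elements.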
template <typename T>
void device_reduce(
    const T* d_in,
    T* d_out,
    int N,
    Tensor* buffer,
    CUDAContext* context) {
  // Determine temporary device storage requirements
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(
      NULL, temp_storage_bytes, d_in, d_out, N, context->cuda_stream());

  auto buffer_size = temp_storage_bytes / sizeof(T);
  buffer_size += temp_storage_bytes % sizeof(T) != 0 ? 1 : 0;
  buffer->Resize(buffer_size);
  void* d_temp_storage = static_cast<void*>(buffer->template mutable_data<T>());
  // Run sum-reduction
  cub::DeviceReduce::Sum(
      d_temp_storage,
      temp_storage_bytes,
      d_in,
      d_out,
      N,
      context->cuda_stream());
}

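// fp16 specialization: instead of CUB, the sum is computed as a dot product
// of `in` with a constant one: the second vector argument uses stride 0, so
// only element 0 of `buffer` (set to 1.0) is ever read. rocblas_hdot serves
// recent ROCm; cublasDotEx (fp16 inputs, fp32 accumulation) serves CUDA.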
template <>
void device_reduce<at::Half>(
    const at::Half* in,
    at::Half* out,
    int N,
    Tensor* buffer,
    CUDAContext* context) {
  (void)N; // Suppress unused variable warning
  (void)buffer; // Suppress unused variable warning
  (void)context; // Suppress unused variable warning
#if TORCH_HIP_VERSION >= 210
  auto buffer_size = 1;

  if (buffer->numel() != buffer_size) {
    buffer->Resize(buffer_size);

    math::Set<at::Half, CUDAContext>(
        N,
        convert::To<float, at::Half>(1.),
        buffer->template mutable_data<at::Half>(),
        context);
  }

  CUBLAS_ENFORCE(rocblas_hdot(
      context->cublas_handle(),
      N,
      reinterpret_cast<const rocblas_half*>(in),
      1,
      reinterpret_cast<const rocblas_half*>(buffer->data<at::Half>()),
      0,
      reinterpret_cast<rocblas_half*>(out)));
// Guarded with defined(__HIPCC__) so that plain CUDA builds, where
// TORCH_HIP_VERSION is undefined and evaluates to 0 (hence < 210), fall
// through to the cublasDotEx path below instead of throwing.
#elif defined(__HIPCC__) && TORCH_HIP_VERSION < 210
  CAFFE_THROW("HIP rocblas doesn't fully support fp16 device_reduce yet.");
#else
  auto buffer_size = 1;

  if (buffer->numel() != buffer_size) {
    buffer->Resize(buffer_size);

    math::Set<at::Half, CUDAContext>(
        N,
        convert::To<float, at::Half>(1.),
        buffer->template mutable_data<at::Half>(),
        context);
  }

  CUBLAS_ENFORCE(cublasDotEx(
      context->cublas_handle(),
      N,
      in,
      CUDA_R_16F,
      1,
      buffer->data<at::Half>(),
      CUDA_R_16F,
      0,
      out,
      CUDA_R_16F,
      CUDA_R_32F));
#endif
}

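// General broadcast case (post > 1): one thread block per output element n.
// Threads stride across the pre * post input slice belonging to n and
// accumulate partial sums in float; the partials are then combined with a
// cub::BlockReduce shared-memory reduction and thread 0 writes the result.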
template <typename T, int BLOCK_THREADS>
__global__ void
reduce_sum_like(const T* g_idata, T* g_odata, int pre, int N, int post) {
  int n = blockIdx.x;
  float sum = 0.0;
  int limit = pre * post;
  for (int i = threadIdx.x; i < limit; i += blockDim.x) {
    int curPre = i / post;
    int curPost = i % post;

    sum +=
        convert::To<T, float>(g_idata[curPre * N * post + n * post + curPost]);
  }
  // Combine per-thread partial sums with a shared-memory block reduction.
  typedef cub::BlockReduce<float, BLOCK_THREADS> BlockReduceT;
  __shared__ typename BlockReduceT::TempStorage temp_storage;
  float aggregate = BlockReduceT(temp_storage).Sum(sum);
  if (threadIdx.x == 0) {
    g_odata[n] = convert::To<float, T>(aggregate);
  }
}
} // namespace

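// SumReduceLikeOp reduces A down to the shape of B under the legacy Caffe2
// broadcast rules. ComputeLegacyBroadcastSizes decomposes A's shape around
// the matched block as (pre, n, post); the kernel is then chosen by post:
// a full device reduction when B is a scalar, the one-thread-per-output
// kernel when post == 1, and the block-reduce kernel otherwise, with the
// block size stepped down as post shrinks.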
template <>
template <typename T>
bool SumReduceLikeOp<CUDAContext>::DoRunWithType() {
  const auto& A = Input(0);
  const auto& B = Input(1);
  auto* C = Output(0);
  auto count = A.size();
  CAFFE_ENFORCE(&B != C, "In-place is not allowed.");
  C->ResizeLike(B);
  const T* Adata = A.template data<T>();
  auto* Cdata = C->template mutable_data<T>();

  if (C->size() == 0) {
    // The output is empty; nothing to do, not even launching the CUDA kernel.
    return true;
  }

  if (B.size() == 1) {
    device_reduce<T>(Adata, Cdata, count, &sum_buffer_, &context_);
  } else {
    size_t pre, n, post;
    std::tie(pre, n, post) =
        elementwise_ops_utils::ComputeLegacyBroadcastSizes(A, B, axis_);
    // Because shape(B) was checked to be contained in shape(A) above, post
    // and pre cannot both be 1 at the same time.
    if (post == 1) {
      reduce_sum_like_post1<T>
          <<<CAFFE_GET_BLOCKS(n),
             CAFFE_CUDA_NUM_THREADS,
             0,
             context_.cuda_stream()>>>(Adata, Cdata, pre, n);
      C10_CUDA_KERNEL_LAUNCH_CHECK();
    } else {
      if (post >= 128) {
        reduce_sum_like<T, 512>
            <<<n, 512, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (post >= 64) {
        reduce_sum_like<T, 128>
            <<<n, 128, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else if (post >= 32) {
        reduce_sum_like<T, 64>
            <<<n, 64, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      } else {
        reduce_sum_like<T, 32>
            <<<n, 32, 0, context_.cuda_stream()>>>(Adata, Cdata, pre, n, post);
        C10_CUDA_KERNEL_LAUNCH_CHECK();
      }
    }
  }
  return true;
}

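// Dispatch is limited to float and fp16 inputs; other numeric types are not
// routed through this CUDA implementation.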
template <>
bool SumReduceLikeOp<CUDAContext>::RunOnDevice() {
  return DispatchHelper<TensorTypes<float, at::Half>>::call(this, Input(0));
}

REGISTER_CUDA_OPERATOR(SumReduceLike, SumReduceLikeOp<CUDAContext>);

} // namespace caffe2