Refactoring code related to WARP_SIZE. (#3623)

1. Centralize its definition in common.cuh.
2. Rename it to GPU_WARP_SIZE which can be extended to AMD GPU later.
3. Centralize warp shuffle functions.

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in:
Weixing Zhang 2020-04-22 15:19:06 -07:00 committed by GitHub
parent bb9b0ba5b3
commit e4fc83252d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 128 additions and 163 deletions

View file

@ -32,35 +32,6 @@ namespace cuda {
using namespace onnxruntime::cuda;
template <typename T>
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_sync(mask, value, srcLane, width);
#else
return __shfl(value, srcLane, width);
#endif
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
template <typename U>
__device__ void cuWelfordOnlineSum(
const U curr,
@ -82,8 +53,7 @@ __device__ void cuChanOnlineSum(
const U countB,
U& mu,
U& sigma2,
U& count,
const int& warp_size) {
U& count) {
U delta = muB - mu;
U nA = count;
U nB = countB;
@ -108,10 +78,9 @@ __device__ void cuWelfordMuSigma2(
const int i1,
U& mu,
U& sigma2,
U* buf,
const int warp_size) {
U* buf) {
// Assumptions:
// 1) blockDim.x == warpSize
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
@ -140,10 +109,10 @@ __device__ void cuWelfordMuSigma2(
// intra-warp reductions
for (int l = 0; l <= 4; ++l) {
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
U muB = WARP_SHFL(mu, srcLaneB, warp_size);
U countB = WARP_SHFL(count, srcLaneB, warp_size);
U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
U muB = WARP_SHFL(mu, srcLaneB);
U countB = WARP_SHFL(count, srcLaneB);
U sigma2B = WARP_SHFL(sigma2, srcLaneB);
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
@ -164,7 +133,7 @@ __device__ void cuWelfordMuSigma2(
U muB = ubuf[2 * threadIdx.y];
U sigma2B = ubuf[2 * threadIdx.y + 1];
U countB = ibuf[threadIdx.y];
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
@ -178,8 +147,8 @@ __device__ void cuWelfordMuSigma2(
sigma2 = ubuf[1] / U(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0, warp_size);
sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
}
}
}
@ -192,10 +161,9 @@ __device__ void cuWelfordMuSigma2(
const int i1,
float& mu,
float& sigma2,
float* buf,
const int warp_size) {
float* buf) {
// Assumptions:
// 1) blockDim.x == warpSize
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
@ -235,10 +203,10 @@ __device__ void cuWelfordMuSigma2(
// intra-warp reductions
for (int l = 0; l <= 4; ++l) {
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
float muB = WARP_SHFL(mu, srcLaneB, warp_size);
float countB = WARP_SHFL(count, srcLaneB, warp_size);
float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
float muB = WARP_SHFL(mu, srcLaneB);
float countB = WARP_SHFL(count, srcLaneB);
float sigma2B = WARP_SHFL(sigma2, srcLaneB);
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
@ -259,7 +227,7 @@ __device__ void cuWelfordMuSigma2(
float muB = ubuf[2 * threadIdx.y];
float sigma2B = ubuf[2 * threadIdx.y + 1];
float countB = ibuf[threadIdx.y];
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
@ -273,8 +241,8 @@ __device__ void cuWelfordMuSigma2(
sigma2 = ubuf[1] / float(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0, warp_size);
sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
}
}
}
@ -337,17 +305,16 @@ __global__ void cuApplyLayerNorm(
const int n2,
const U epsilon,
const T* __restrict__ gamma,
const T* __restrict__ beta,
int warp_size) {
const T* __restrict__ beta) {
// Assumptions:
// 1) blockDim.x == warpSize
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensors are contiguous
//
for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
SharedMemory<U> shared;
U* buf = shared.getPointer();
U mu, sigma2;
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
const T* lvals = vals + i1 * n2;
T* ovals = output_vals + i1 * n2;
U c_invvar = rsqrt(sigma2 + epsilon);
@ -385,6 +352,8 @@ void HostApplyLayerNorm(
const T* beta) {
const uint64_t maxGridY = prop.maxGridSize[1];
const int warp_size = prop.warpSize;
ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
const dim3 threads(warp_size, 4, 1);
const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
int nshared =
@ -396,7 +365,7 @@ void HostApplyLayerNorm(
input,
n1, n2,
U(epsilon),
gamma, beta, warp_size);
gamma, beta);
}
#define LAYERNORM_LINEAR_IMPL(T, U) \

View file

@ -3,7 +3,6 @@ Copyright(c) 2016 Facebook Inc.
==============================================================================*/
/* Modifications Copyright (c) Microsoft. */
#pragma once
#include "core/providers/cuda/cu_inc/common.cuh"
#include "core/providers/cuda/cu_inc/binary_elementwise_impl.cuh"
#include "core/providers/cuda/shared_inc/cuda_utils.h"

View file

@ -219,9 +219,9 @@ __device__ __inline__ T _Gelu(T a) {
return a * _Normcdf(a);
}
// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
// For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
#ifndef CUDA_LONG
#define CUDA_LONG int32_t
#endif
@ -239,6 +239,7 @@ struct GridDim {
};
};
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
if (id >= N) \
@ -253,5 +254,48 @@ struct GridDim {
#define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
#endif // __APPLE__
// WARP related definitions and functions
constexpr int GPU_WARP_SIZE = 32;
template <typename T>
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_sync(mask, value, srcLane, width);
#else
return __shfl(value, srcLane, width);
#endif
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_up_sync(mask, value, delta, width);
#else
return __shfl_up(value, delta, width);
#endif
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_down_sync(mask, value, delta, width);
#else
return __shfl_down(value, delta, width);
#endif
}
} // namespace cuda
} // namespace onnxruntime

View file

@ -32,7 +32,7 @@ namespace cuda {
// The template arguments have the following meaning:
// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
// A "WARP" contains "CUDA_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
// A "WARP" contains "GPU_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
// This is important because it means only __shfl_ instructions are required for reductions.
// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
// CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
@ -48,7 +48,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
__global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batch_size, int stride, int element_count) {
// WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
constexpr int next_power_of_two = 1 << log2_elements;
constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
@ -143,7 +143,7 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
const int next_power_of_two = 1 << log2_elements;
// This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
// This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

View file

@ -23,23 +23,12 @@
namespace onnxruntime {
namespace cuda {
constexpr int CUDA_WARP_SIZE = 32;
inline int log2_ceil(int value) {
int log2_value = 0;
while ((1 << log2_value) < value) ++log2_value;
return log2_value;
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) {
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
template <typename T>
struct Add {
__device__ __forceinline__ T operator()(T a, T b) const {

View file

@ -99,7 +99,7 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output
TOut value_ = value;
#pragma unroll
for (int stride = NUM_THREADS_PER_WARP / 2; stride > 0; stride /= 2) {
value_ += __shfl_down_sync(ALL_ONE_MASK, value_, stride);
value_ += WARP_SHFL_DOWN(value_, stride);
}
// Return early if only one warp is used for reduction.
@ -366,10 +366,9 @@ template<typename TIn, typename TOut, typename TBuf>
void call_reduce_matrix_rows(const TIn *input, TOut *output, int m, int n) {
constexpr int max_num_threads_in_block = 512;
constexpr int max_num_blocks_in_grid = 512;
constexpr int warp_size = 32;
constexpr int load_count_per_thread = 4;
const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, warp_size)));
const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE)));
const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));

View file

@ -30,7 +30,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
__global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, const input_t* output, int batch_size, int stride, int element_count) {
// WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel.
constexpr int next_power_of_two = 1 << log2_elements;
constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
@ -116,7 +116,7 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const
const int next_power_of_two = 1 << log2_elements;
// This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
// This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;

View file

@ -30,34 +30,6 @@ namespace cuda {
using namespace onnxruntime::cuda;
template <typename T>
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_sync(mask, value, srcLane, width);
#else
return __shfl(value, srcLane, width);
#endif
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
return __shfl_xor_sync(mask, value, laneMask, width);
#else
return __shfl_xor(value, laneMask, width);
#endif
}
template <typename U>
__device__ void cuWelfordOnlineSum(
const U curr,
@ -79,8 +51,7 @@ __device__ void cuChanOnlineSum(
const U countB,
U& mu,
U& sigma2,
U& count,
const int& warp_size) {
U& count) {
U delta = muB - mu;
U nA = count;
U nB = countB;
@ -105,10 +76,9 @@ __device__ void cuWelfordMuSigma2(
const int i1,
U& mu,
U& sigma2,
U* buf,
const int warp_size) {
U* buf) {
// Assumptions:
// 1) blockDim.x == warpSize
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
@ -137,10 +107,10 @@ __device__ void cuWelfordMuSigma2(
// intra-warp reductions
for (int l = 0; l <= 4; ++l) {
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
U muB = WARP_SHFL(mu, srcLaneB, warp_size);
U countB = WARP_SHFL(count, srcLaneB, warp_size);
U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
U muB = WARP_SHFL(mu, srcLaneB);
U countB = WARP_SHFL(count, srcLaneB);
U sigma2B = WARP_SHFL(sigma2, srcLaneB);
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
@ -161,7 +131,7 @@ __device__ void cuWelfordMuSigma2(
U muB = ubuf[2 * threadIdx.y];
U sigma2B = ubuf[2 * threadIdx.y + 1];
U countB = ibuf[threadIdx.y];
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
@ -175,8 +145,8 @@ __device__ void cuWelfordMuSigma2(
sigma2 = ubuf[1] / U(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0, warp_size);
sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
}
}
}
@ -189,10 +159,9 @@ __device__ void cuWelfordMuSigma2(
const int i1,
float& mu,
float& sigma2,
float* buf,
const int warp_size) {
float* buf) {
// Assumptions:
// 1) blockDim.x == warpSize
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensor is contiguous
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
//
@ -232,10 +201,10 @@ __device__ void cuWelfordMuSigma2(
// intra-warp reductions
for (int l = 0; l <= 4; ++l) {
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
float muB = WARP_SHFL(mu, srcLaneB, warp_size);
float countB = WARP_SHFL(count, srcLaneB, warp_size);
float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
float muB = WARP_SHFL(mu, srcLaneB);
float countB = WARP_SHFL(count, srcLaneB);
float sigma2B = WARP_SHFL(sigma2, srcLaneB);
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
}
// threadIdx.x == 0 has correct values for each warp
// inter-warp reductions
@ -256,7 +225,7 @@ __device__ void cuWelfordMuSigma2(
float muB = ubuf[2 * threadIdx.y];
float sigma2B = ubuf[2 * threadIdx.y + 1];
float countB = ibuf[threadIdx.y];
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
}
__syncthreads();
}
@ -270,8 +239,8 @@ __device__ void cuWelfordMuSigma2(
sigma2 = ubuf[1] / float(n2);
// don't care about final value of count, we know count == n2
} else {
mu = WARP_SHFL(mu, 0, warp_size);
sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
mu = WARP_SHFL(mu, 0);
sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
}
}
}
@ -334,17 +303,16 @@ __global__ void cuApplyLayerNorm(
const int n2,
const U epsilon,
const T* __restrict__ gamma,
const T* __restrict__ beta,
int warp_size) {
const T* __restrict__ beta) {
// Assumptions:
// 1) blockDim.x == warpSize
// 1) blockDim.x == GPU_WARP_SIZE
// 2) Tensors are contiguous
//
for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
SharedMemory<U> shared;
U* buf = shared.getPointer();
U mu, sigma2;
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
const T* lvals = vals + i1 * n2;
T* ovals = output_vals + i1 * n2;
U c_invvar = rsqrt(sigma2 + epsilon);
@ -382,6 +350,8 @@ void HostApplyLayerNorm(
const T* beta) {
const uint64_t maxGridY = prop.maxGridSize[1];
const int warp_size = prop.warpSize;
ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
const dim3 threads(warp_size, 4, 1);
const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
int nshared =
@ -393,7 +363,7 @@ void HostApplyLayerNorm(
input,
n1, n2,
U(epsilon),
gamma, beta, warp_size);
gamma, beta);
}
#define LAYERNORM_LINEAR_IMPL(T, U) \
@ -727,7 +697,10 @@ void HostLayerNormGradient(
U* part_grad_gamma,
U* part_grad_beta,
const int part_size) {
const dim3 threads2(prop.warpSize, 4, 1);
const int warp_size = prop.warpSize;
ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
const dim3 threads2(warp_size, 4, 1);
const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
const int nshared2_b = threads2.x * threads2.y * sizeof(U);
@ -742,7 +715,7 @@ void HostLayerNormGradient(
part_grad_gamma,
part_grad_beta);
const dim3 threads3(prop.warpSize, 8, 1);
const dim3 threads3(warp_size, 8, 1);
const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
const int nshared3 = threads3.x * threads3.y * sizeof(U);
cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, 0>>>(
@ -756,7 +729,7 @@ void HostLayerNormGradient(
// compute grad_input
const uint64_t maxGridY = prop.maxGridSize[1];
const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
const dim3 threads1(prop.warpSize, 4, 1);
const dim3 threads1(warp_size, 4, 1);
int nshared =
threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
cuComputeGradInput<<<blocks1, threads1, nshared, 0>>>(

View file

@ -467,17 +467,16 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
}
}
// Thread count in a block must be a multiple of 32.
constexpr int warp_size = 32;
// Thread count in a block must be a multiple of GPU_WARP_SIZE.
#pragma unroll
for (int stride = warp_size / 2; stride > 0; stride /= 2) {
w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
d_sum += __shfl_down_sync(0xFFFFFFFF, d_sum, stride);
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
w_sum += WARP_SHFL_DOWN(w_sum, stride);
d_sum += WARP_SHFL_DOWN(d_sum, stride);
}
const int warp_count_in_block = blockDim.x / warp_size;
const int lid = threadIdx.x % warp_size;
const int wid = threadIdx.x / warp_size;
const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
const int lid = threadIdx.x % GPU_WARP_SIZE;
const int wid = threadIdx.x / GPU_WARP_SIZE;
// Shape is 2 x warp_count_in_block.
extern __shared__ unsigned char shared_memory_[];
@ -511,13 +510,11 @@ template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename
void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group) {
// thread count per block.
constexpr int thread_count = ChunkGroup<4>::thread_count_per_block;
// warp size of GPU.
constexpr int warp_size = 32;
// shared memory's size per block.
const int shared_memory_size = thread_count / warp_size * 2 * sizeof(TBuf);
const int shared_memory_size = thread_count / GPU_WARP_SIZE * 2 * sizeof(TBuf);
// Enforce assumptions used inside this reduction CUDA kernel.
ORT_ENFORCE(thread_count % warp_size == 0);
ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
LambMultiTensorReductionImpl<TIn1, TIn2, TOut1, TOut2, TBuf><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group);

View file

@ -46,16 +46,15 @@ __global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output)
}
}
// Thread count in a block must be a multiple of 32.
constexpr int warp_size = 32;
// Thread count in a block must be a multiple of GPU_WARP_SIZE.
#pragma unroll
for (int stride = warp_size / 2; stride > 0; stride /= 2) {
w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
w_sum += WARP_SHFL_DOWN(w_sum, stride);
}
const int warp_count_in_block = blockDim.x / warp_size;
const int lid = threadIdx.x % warp_size;
const int wid = threadIdx.x / warp_size;
const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
const int lid = threadIdx.x % GPU_WARP_SIZE;
const int wid = threadIdx.x / GPU_WARP_SIZE;
// Shape is 2 x warp_count_in_block.
extern __shared__ unsigned char shared_memory_[];
@ -84,13 +83,11 @@ template <typename TIn, typename TOut, typename TBuf, typename TInOp, typename T
void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) {
// thread count per block.
constexpr int thread_count = ChunkGroup<1>::thread_count_per_block;
// warp size of GPU.
constexpr int warp_size = 32;
// shared memory's size per block.
const int shared_memory_size = thread_count / warp_size * sizeof(TBuf);
const int shared_memory_size = thread_count / GPU_WARP_SIZE * sizeof(TBuf);
// Enforce assumptions used inside this reduction CUDA kernel.
ORT_ENFORCE(thread_count % warp_size == 0);
ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
_MultiTensorReduceImpl<TIn, TOut, TBuf, TInOp, TOutOp><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group, output);

View file

@ -11,8 +11,6 @@
namespace onnxruntime {
namespace cuda {
static constexpr int WARP_SIZE = 32;
template <typename T>
__global__ void _Iota(
cub::CountingInputIterator<T> input,
@ -47,7 +45,7 @@ __global__ void _GatherGradImpl(
#pragma unroll
for (int ii = 0; ii < SZ; ii++) {
int feature_dim = start_feature + ii * WARP_SIZE;
int feature_dim = start_feature + ii * GPU_WARP_SIZE;
if (feature_dim < stride) {
gradient[ii] = static_cast<T>(grad_output[grad_row + feature_dim]);
weight[ii] = static_cast<T>(grad_weight[weight_row + feature_dim]);
@ -61,7 +59,7 @@ __global__ void _GatherGradImpl(
#pragma unroll
for (int ii = 0; ii < SZ; ii++) {
int feature_dim = start_feature + ii * WARP_SIZE;
int feature_dim = start_feature + ii * GPU_WARP_SIZE;
if (feature_dim < stride) {
grad_weight[weight_row + feature_dim] = static_cast<T>(weight[ii]);
}
@ -114,8 +112,8 @@ void GatherGradImpl(
original_indices.get(), original_indices_sorted.get(),
num_indices));
dim3 block(GPU_WARP_SIZE, 4);
dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, 128));
dim3 block(WARP_SIZE, 4);
_GatherGradImpl<<<grid, block>>>(
indices_data_sorted.get(),