mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-20 02:07:56 +00:00
Refactoring code related to WARP_SIZE. (#3623)
1. Centralize its definition in common.cuh. 2. Rename it to GPU_WARP_SIZE which can be extended to AMD GPU later. 3. Centralize warp shuffle functions. Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
This commit is contained in:
parent
bb9b0ba5b3
commit
e4fc83252d
11 changed files with 128 additions and 163 deletions
|
|
@ -32,35 +32,6 @@ namespace cuda {
|
|||
|
||||
using namespace onnxruntime::cuda;
|
||||
|
||||
template <typename T>
|
||||
|
||||
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
|
||||
return __shfl_sync(mask, value, srcLane, width);
|
||||
|
||||
#else
|
||||
|
||||
return __shfl(value, srcLane, width);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
|
||||
return __shfl_xor_sync(mask, value, laneMask, width);
|
||||
|
||||
#else
|
||||
|
||||
return __shfl_xor(value, laneMask, width);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename U>
|
||||
__device__ void cuWelfordOnlineSum(
|
||||
const U curr,
|
||||
|
|
@ -82,8 +53,7 @@ __device__ void cuChanOnlineSum(
|
|||
const U countB,
|
||||
U& mu,
|
||||
U& sigma2,
|
||||
U& count,
|
||||
const int& warp_size) {
|
||||
U& count) {
|
||||
U delta = muB - mu;
|
||||
U nA = count;
|
||||
U nB = countB;
|
||||
|
|
@ -108,10 +78,9 @@ __device__ void cuWelfordMuSigma2(
|
|||
const int i1,
|
||||
U& mu,
|
||||
U& sigma2,
|
||||
U* buf,
|
||||
const int warp_size) {
|
||||
U* buf) {
|
||||
// Assumptions:
|
||||
// 1) blockDim.x == warpSize
|
||||
// 1) blockDim.x == GPU_WARP_SIZE
|
||||
// 2) Tensor is contiguous
|
||||
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
|
||||
//
|
||||
|
|
@ -140,10 +109,10 @@ __device__ void cuWelfordMuSigma2(
|
|||
// intra-warp reductions
|
||||
for (int l = 0; l <= 4; ++l) {
|
||||
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
|
||||
U muB = WARP_SHFL(mu, srcLaneB, warp_size);
|
||||
U countB = WARP_SHFL(count, srcLaneB, warp_size);
|
||||
U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
U muB = WARP_SHFL(mu, srcLaneB);
|
||||
U countB = WARP_SHFL(count, srcLaneB);
|
||||
U sigma2B = WARP_SHFL(sigma2, srcLaneB);
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
// threadIdx.x == 0 has correct values for each warp
|
||||
// inter-warp reductions
|
||||
|
|
@ -164,7 +133,7 @@ __device__ void cuWelfordMuSigma2(
|
|||
U muB = ubuf[2 * threadIdx.y];
|
||||
U sigma2B = ubuf[2 * threadIdx.y + 1];
|
||||
U countB = ibuf[threadIdx.y];
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
|
@ -178,8 +147,8 @@ __device__ void cuWelfordMuSigma2(
|
|||
sigma2 = ubuf[1] / U(n2);
|
||||
// don't care about final value of count, we know count == n2
|
||||
} else {
|
||||
mu = WARP_SHFL(mu, 0, warp_size);
|
||||
sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
|
||||
mu = WARP_SHFL(mu, 0);
|
||||
sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -192,10 +161,9 @@ __device__ void cuWelfordMuSigma2(
|
|||
const int i1,
|
||||
float& mu,
|
||||
float& sigma2,
|
||||
float* buf,
|
||||
const int warp_size) {
|
||||
float* buf) {
|
||||
// Assumptions:
|
||||
// 1) blockDim.x == warpSize
|
||||
// 1) blockDim.x == GPU_WARP_SIZE
|
||||
// 2) Tensor is contiguous
|
||||
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
|
||||
//
|
||||
|
|
@ -235,10 +203,10 @@ __device__ void cuWelfordMuSigma2(
|
|||
// intra-warp reductions
|
||||
for (int l = 0; l <= 4; ++l) {
|
||||
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
|
||||
float muB = WARP_SHFL(mu, srcLaneB, warp_size);
|
||||
float countB = WARP_SHFL(count, srcLaneB, warp_size);
|
||||
float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
float muB = WARP_SHFL(mu, srcLaneB);
|
||||
float countB = WARP_SHFL(count, srcLaneB);
|
||||
float sigma2B = WARP_SHFL(sigma2, srcLaneB);
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
// threadIdx.x == 0 has correct values for each warp
|
||||
// inter-warp reductions
|
||||
|
|
@ -259,7 +227,7 @@ __device__ void cuWelfordMuSigma2(
|
|||
float muB = ubuf[2 * threadIdx.y];
|
||||
float sigma2B = ubuf[2 * threadIdx.y + 1];
|
||||
float countB = ibuf[threadIdx.y];
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
|
@ -273,8 +241,8 @@ __device__ void cuWelfordMuSigma2(
|
|||
sigma2 = ubuf[1] / float(n2);
|
||||
// don't care about final value of count, we know count == n2
|
||||
} else {
|
||||
mu = WARP_SHFL(mu, 0, warp_size);
|
||||
sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
|
||||
mu = WARP_SHFL(mu, 0);
|
||||
sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -337,17 +305,16 @@ __global__ void cuApplyLayerNorm(
|
|||
const int n2,
|
||||
const U epsilon,
|
||||
const T* __restrict__ gamma,
|
||||
const T* __restrict__ beta,
|
||||
int warp_size) {
|
||||
const T* __restrict__ beta) {
|
||||
// Assumptions:
|
||||
// 1) blockDim.x == warpSize
|
||||
// 1) blockDim.x == GPU_WARP_SIZE
|
||||
// 2) Tensors are contiguous
|
||||
//
|
||||
for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
|
||||
SharedMemory<U> shared;
|
||||
U* buf = shared.getPointer();
|
||||
U mu, sigma2;
|
||||
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
|
||||
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
|
||||
const T* lvals = vals + i1 * n2;
|
||||
T* ovals = output_vals + i1 * n2;
|
||||
U c_invvar = rsqrt(sigma2 + epsilon);
|
||||
|
|
@ -385,6 +352,8 @@ void HostApplyLayerNorm(
|
|||
const T* beta) {
|
||||
const uint64_t maxGridY = prop.maxGridSize[1];
|
||||
const int warp_size = prop.warpSize;
|
||||
ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
|
||||
|
||||
const dim3 threads(warp_size, 4, 1);
|
||||
const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
|
||||
int nshared =
|
||||
|
|
@ -396,7 +365,7 @@ void HostApplyLayerNorm(
|
|||
input,
|
||||
n1, n2,
|
||||
U(epsilon),
|
||||
gamma, beta, warp_size);
|
||||
gamma, beta);
|
||||
}
|
||||
|
||||
#define LAYERNORM_LINEAR_IMPL(T, U) \
|
||||
|
|
|
|||
|
|
@ -3,7 +3,6 @@ Copyright(c) 2016 Facebook Inc.
|
|||
==============================================================================*/
|
||||
/* Modifications Copyright (c) Microsoft. */
|
||||
|
||||
#pragma once
|
||||
#include "core/providers/cuda/cu_inc/common.cuh"
|
||||
#include "core/providers/cuda/cu_inc/binary_elementwise_impl.cuh"
|
||||
#include "core/providers/cuda/shared_inc/cuda_utils.h"
|
||||
|
|
|
|||
|
|
@ -219,9 +219,9 @@ __device__ __inline__ T _Gelu(T a) {
|
|||
return a * _Normcdf(a);
|
||||
}
|
||||
|
||||
|
||||
// We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
|
||||
// For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
|
||||
|
||||
#ifndef CUDA_LONG
|
||||
#define CUDA_LONG int32_t
|
||||
#endif
|
||||
|
|
@ -239,6 +239,7 @@ struct GridDim {
|
|||
};
|
||||
};
|
||||
|
||||
|
||||
#define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \
|
||||
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \
|
||||
if (id >= N) \
|
||||
|
|
@ -253,5 +254,48 @@ struct GridDim {
|
|||
#define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
|
||||
#endif // __APPLE__
|
||||
|
||||
// WARP related definitions and functions
|
||||
constexpr int GPU_WARP_SIZE = 32;
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_sync(mask, value, srcLane, width);
|
||||
#else
|
||||
return __shfl(value, srcLane, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_xor_sync(mask, value, laneMask, width);
|
||||
#else
|
||||
return __shfl_xor(value, laneMask, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_up_sync(mask, value, delta, width);
|
||||
#else
|
||||
return __shfl_up(value, delta, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_down_sync(mask, value, delta, width);
|
||||
#else
|
||||
return __shfl_down(value, delta, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace cuda
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ namespace cuda {
|
|||
// The template arguments have the following meaning:
|
||||
// One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
|
||||
// WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
|
||||
// A "WARP" contains "CUDA_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
|
||||
// A "WARP" contains "GPU_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
|
||||
// This is important because it means only __shfl_ instructions are required for reductions.
|
||||
// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
|
||||
// CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
|
||||
|
|
@ -48,7 +48,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
|
|||
__global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batch_size, int stride, int element_count) {
|
||||
// WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
|
||||
constexpr int next_power_of_two = 1 << log2_elements;
|
||||
constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
|
||||
constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
|
||||
constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
|
||||
constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
|
||||
|
||||
|
|
@ -143,7 +143,7 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
|
|||
const int next_power_of_two = 1 << log2_elements;
|
||||
|
||||
// This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
|
||||
int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
|
||||
int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
|
||||
|
||||
// This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
|
||||
int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
|
||||
|
|
|
|||
|
|
@ -23,23 +23,12 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
constexpr int CUDA_WARP_SIZE = 32;
|
||||
|
||||
inline int log2_ceil(int value) {
|
||||
int log2_value = 0;
|
||||
while ((1 << log2_value) < value) ++log2_value;
|
||||
return log2_value;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_xor_sync(mask, value, laneMask, width);
|
||||
#else
|
||||
return __shfl_xor(value, laneMask, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
struct Add {
|
||||
__device__ __forceinline__ T operator()(T a, T b) const {
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output
|
|||
TOut value_ = value;
|
||||
#pragma unroll
|
||||
for (int stride = NUM_THREADS_PER_WARP / 2; stride > 0; stride /= 2) {
|
||||
value_ += __shfl_down_sync(ALL_ONE_MASK, value_, stride);
|
||||
value_ += WARP_SHFL_DOWN(value_, stride);
|
||||
}
|
||||
|
||||
// Return early if only one warp is used for reduction.
|
||||
|
|
@ -366,10 +366,9 @@ template<typename TIn, typename TOut, typename TBuf>
|
|||
void call_reduce_matrix_rows(const TIn *input, TOut *output, int m, int n) {
|
||||
constexpr int max_num_threads_in_block = 512;
|
||||
constexpr int max_num_blocks_in_grid = 512;
|
||||
constexpr int warp_size = 32;
|
||||
constexpr int load_count_per_thread = 4;
|
||||
|
||||
const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, warp_size)));
|
||||
const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE)));
|
||||
const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
|
||||
const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
|
||||
const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
|
|||
__global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, const input_t* output, int batch_size, int stride, int element_count) {
|
||||
// WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel.
|
||||
constexpr int next_power_of_two = 1 << log2_elements;
|
||||
constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
|
||||
constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
|
||||
constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
|
||||
constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
|
||||
|
||||
|
|
@ -116,7 +116,7 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const
|
|||
const int next_power_of_two = 1 << log2_elements;
|
||||
|
||||
// This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
|
||||
int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
|
||||
int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
|
||||
|
||||
// This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
|
||||
int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
|
||||
|
|
|
|||
|
|
@ -30,34 +30,6 @@ namespace cuda {
|
|||
|
||||
using namespace onnxruntime::cuda;
|
||||
|
||||
template <typename T>
|
||||
|
||||
__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
|
||||
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __shfl_sync(mask, value, srcLane, width);
|
||||
#else
|
||||
return __shfl(value, srcLane, width);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
||||
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
|
||||
|
||||
{
|
||||
#if CUDA_VERSION >= 9000
|
||||
|
||||
return __shfl_xor_sync(mask, value, laneMask, width);
|
||||
|
||||
#else
|
||||
|
||||
return __shfl_xor(value, laneMask, width);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename U>
|
||||
__device__ void cuWelfordOnlineSum(
|
||||
const U curr,
|
||||
|
|
@ -79,8 +51,7 @@ __device__ void cuChanOnlineSum(
|
|||
const U countB,
|
||||
U& mu,
|
||||
U& sigma2,
|
||||
U& count,
|
||||
const int& warp_size) {
|
||||
U& count) {
|
||||
U delta = muB - mu;
|
||||
U nA = count;
|
||||
U nB = countB;
|
||||
|
|
@ -105,10 +76,9 @@ __device__ void cuWelfordMuSigma2(
|
|||
const int i1,
|
||||
U& mu,
|
||||
U& sigma2,
|
||||
U* buf,
|
||||
const int warp_size) {
|
||||
U* buf) {
|
||||
// Assumptions:
|
||||
// 1) blockDim.x == warpSize
|
||||
// 1) blockDim.x == GPU_WARP_SIZE
|
||||
// 2) Tensor is contiguous
|
||||
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
|
||||
//
|
||||
|
|
@ -137,10 +107,10 @@ __device__ void cuWelfordMuSigma2(
|
|||
// intra-warp reductions
|
||||
for (int l = 0; l <= 4; ++l) {
|
||||
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
|
||||
U muB = WARP_SHFL(mu, srcLaneB, warp_size);
|
||||
U countB = WARP_SHFL(count, srcLaneB, warp_size);
|
||||
U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
U muB = WARP_SHFL(mu, srcLaneB);
|
||||
U countB = WARP_SHFL(count, srcLaneB);
|
||||
U sigma2B = WARP_SHFL(sigma2, srcLaneB);
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
// threadIdx.x == 0 has correct values for each warp
|
||||
// inter-warp reductions
|
||||
|
|
@ -161,7 +131,7 @@ __device__ void cuWelfordMuSigma2(
|
|||
U muB = ubuf[2 * threadIdx.y];
|
||||
U sigma2B = ubuf[2 * threadIdx.y + 1];
|
||||
U countB = ibuf[threadIdx.y];
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
|
@ -175,8 +145,8 @@ __device__ void cuWelfordMuSigma2(
|
|||
sigma2 = ubuf[1] / U(n2);
|
||||
// don't care about final value of count, we know count == n2
|
||||
} else {
|
||||
mu = WARP_SHFL(mu, 0, warp_size);
|
||||
sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
|
||||
mu = WARP_SHFL(mu, 0);
|
||||
sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -189,10 +159,9 @@ __device__ void cuWelfordMuSigma2(
|
|||
const int i1,
|
||||
float& mu,
|
||||
float& sigma2,
|
||||
float* buf,
|
||||
const int warp_size) {
|
||||
float* buf) {
|
||||
// Assumptions:
|
||||
// 1) blockDim.x == warpSize
|
||||
// 1) blockDim.x == GPU_WARP_SIZE
|
||||
// 2) Tensor is contiguous
|
||||
// 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
|
||||
//
|
||||
|
|
@ -232,10 +201,10 @@ __device__ void cuWelfordMuSigma2(
|
|||
// intra-warp reductions
|
||||
for (int l = 0; l <= 4; ++l) {
|
||||
int srcLaneB = (threadIdx.x + (1 << l)) & 31;
|
||||
float muB = WARP_SHFL(mu, srcLaneB, warp_size);
|
||||
float countB = WARP_SHFL(count, srcLaneB, warp_size);
|
||||
float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
float muB = WARP_SHFL(mu, srcLaneB);
|
||||
float countB = WARP_SHFL(count, srcLaneB);
|
||||
float sigma2B = WARP_SHFL(sigma2, srcLaneB);
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
// threadIdx.x == 0 has correct values for each warp
|
||||
// inter-warp reductions
|
||||
|
|
@ -256,7 +225,7 @@ __device__ void cuWelfordMuSigma2(
|
|||
float muB = ubuf[2 * threadIdx.y];
|
||||
float sigma2B = ubuf[2 * threadIdx.y + 1];
|
||||
float countB = ibuf[threadIdx.y];
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
|
||||
cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
|
@ -270,8 +239,8 @@ __device__ void cuWelfordMuSigma2(
|
|||
sigma2 = ubuf[1] / float(n2);
|
||||
// don't care about final value of count, we know count == n2
|
||||
} else {
|
||||
mu = WARP_SHFL(mu, 0, warp_size);
|
||||
sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
|
||||
mu = WARP_SHFL(mu, 0);
|
||||
sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -334,17 +303,16 @@ __global__ void cuApplyLayerNorm(
|
|||
const int n2,
|
||||
const U epsilon,
|
||||
const T* __restrict__ gamma,
|
||||
const T* __restrict__ beta,
|
||||
int warp_size) {
|
||||
const T* __restrict__ beta) {
|
||||
// Assumptions:
|
||||
// 1) blockDim.x == warpSize
|
||||
// 1) blockDim.x == GPU_WARP_SIZE
|
||||
// 2) Tensors are contiguous
|
||||
//
|
||||
for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
|
||||
SharedMemory<U> shared;
|
||||
U* buf = shared.getPointer();
|
||||
U mu, sigma2;
|
||||
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
|
||||
cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
|
||||
const T* lvals = vals + i1 * n2;
|
||||
T* ovals = output_vals + i1 * n2;
|
||||
U c_invvar = rsqrt(sigma2 + epsilon);
|
||||
|
|
@ -382,6 +350,8 @@ void HostApplyLayerNorm(
|
|||
const T* beta) {
|
||||
const uint64_t maxGridY = prop.maxGridSize[1];
|
||||
const int warp_size = prop.warpSize;
|
||||
ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
|
||||
|
||||
const dim3 threads(warp_size, 4, 1);
|
||||
const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
|
||||
int nshared =
|
||||
|
|
@ -393,7 +363,7 @@ void HostApplyLayerNorm(
|
|||
input,
|
||||
n1, n2,
|
||||
U(epsilon),
|
||||
gamma, beta, warp_size);
|
||||
gamma, beta);
|
||||
}
|
||||
|
||||
#define LAYERNORM_LINEAR_IMPL(T, U) \
|
||||
|
|
@ -727,7 +697,10 @@ void HostLayerNormGradient(
|
|||
U* part_grad_gamma,
|
||||
U* part_grad_beta,
|
||||
const int part_size) {
|
||||
const dim3 threads2(prop.warpSize, 4, 1);
|
||||
const int warp_size = prop.warpSize;
|
||||
ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
|
||||
|
||||
const dim3 threads2(warp_size, 4, 1);
|
||||
const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
|
||||
const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
|
||||
const int nshared2_b = threads2.x * threads2.y * sizeof(U);
|
||||
|
|
@ -742,7 +715,7 @@ void HostLayerNormGradient(
|
|||
part_grad_gamma,
|
||||
part_grad_beta);
|
||||
|
||||
const dim3 threads3(prop.warpSize, 8, 1);
|
||||
const dim3 threads3(warp_size, 8, 1);
|
||||
const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
|
||||
const int nshared3 = threads3.x * threads3.y * sizeof(U);
|
||||
cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, 0>>>(
|
||||
|
|
@ -756,7 +729,7 @@ void HostLayerNormGradient(
|
|||
// compute grad_input
|
||||
const uint64_t maxGridY = prop.maxGridSize[1];
|
||||
const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
|
||||
const dim3 threads1(prop.warpSize, 4, 1);
|
||||
const dim3 threads1(warp_size, 4, 1);
|
||||
int nshared =
|
||||
threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
|
||||
cuComputeGradInput<<<blocks1, threads1, nshared, 0>>>(
|
||||
|
|
|
|||
|
|
@ -467,17 +467,16 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
|
|||
}
|
||||
}
|
||||
|
||||
// Thread count in a block must be a multiple of 32.
|
||||
constexpr int warp_size = 32;
|
||||
// Thread count in a block must be a multiple of GPU_WARP_SIZE.
|
||||
#pragma unroll
|
||||
for (int stride = warp_size / 2; stride > 0; stride /= 2) {
|
||||
w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
|
||||
d_sum += __shfl_down_sync(0xFFFFFFFF, d_sum, stride);
|
||||
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
|
||||
w_sum += WARP_SHFL_DOWN(w_sum, stride);
|
||||
d_sum += WARP_SHFL_DOWN(d_sum, stride);
|
||||
}
|
||||
|
||||
const int warp_count_in_block = blockDim.x / warp_size;
|
||||
const int lid = threadIdx.x % warp_size;
|
||||
const int wid = threadIdx.x / warp_size;
|
||||
const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
|
||||
const int lid = threadIdx.x % GPU_WARP_SIZE;
|
||||
const int wid = threadIdx.x / GPU_WARP_SIZE;
|
||||
|
||||
// Shape is 2 x warp_count_in_block.
|
||||
extern __shared__ unsigned char shared_memory_[];
|
||||
|
|
@ -511,13 +510,11 @@ template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename
|
|||
void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group) {
|
||||
// thread count per block.
|
||||
constexpr int thread_count = ChunkGroup<4>::thread_count_per_block;
|
||||
// warp size of GPU.
|
||||
constexpr int warp_size = 32;
|
||||
// shared memory's size per block.
|
||||
const int shared_memory_size = thread_count / warp_size * 2 * sizeof(TBuf);
|
||||
const int shared_memory_size = thread_count / GPU_WARP_SIZE * 2 * sizeof(TBuf);
|
||||
|
||||
// Enforce assumptions used inside this reduction CUDA kernel.
|
||||
ORT_ENFORCE(thread_count % warp_size == 0);
|
||||
ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
|
||||
ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
|
||||
|
||||
LambMultiTensorReductionImpl<TIn1, TIn2, TOut1, TOut2, TBuf><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group);
|
||||
|
|
|
|||
|
|
@ -46,16 +46,15 @@ __global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output)
|
|||
}
|
||||
}
|
||||
|
||||
// Thread count in a block must be a multiple of 32.
|
||||
constexpr int warp_size = 32;
|
||||
// Thread count in a block must be a multiple of GPU_WARP_SIZE.
|
||||
#pragma unroll
|
||||
for (int stride = warp_size / 2; stride > 0; stride /= 2) {
|
||||
w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
|
||||
for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
|
||||
w_sum += WARP_SHFL_DOWN(w_sum, stride);
|
||||
}
|
||||
|
||||
const int warp_count_in_block = blockDim.x / warp_size;
|
||||
const int lid = threadIdx.x % warp_size;
|
||||
const int wid = threadIdx.x / warp_size;
|
||||
const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
|
||||
const int lid = threadIdx.x % GPU_WARP_SIZE;
|
||||
const int wid = threadIdx.x / GPU_WARP_SIZE;
|
||||
|
||||
// Shape is 2 x warp_count_in_block.
|
||||
extern __shared__ unsigned char shared_memory_[];
|
||||
|
|
@ -84,13 +83,11 @@ template <typename TIn, typename TOut, typename TBuf, typename TInOp, typename T
|
|||
void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) {
|
||||
// thread count per block.
|
||||
constexpr int thread_count = ChunkGroup<1>::thread_count_per_block;
|
||||
// warp size of GPU.
|
||||
constexpr int warp_size = 32;
|
||||
// shared memory's size per block.
|
||||
const int shared_memory_size = thread_count / warp_size * sizeof(TBuf);
|
||||
const int shared_memory_size = thread_count / GPU_WARP_SIZE * sizeof(TBuf);
|
||||
|
||||
// Enforce assumptions used inside this reduction CUDA kernel.
|
||||
ORT_ENFORCE(thread_count % warp_size == 0);
|
||||
ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
|
||||
ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
|
||||
|
||||
_MultiTensorReduceImpl<TIn, TOut, TBuf, TInOp, TOutOp><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group, output);
|
||||
|
|
|
|||
|
|
@ -11,8 +11,6 @@
|
|||
namespace onnxruntime {
|
||||
namespace cuda {
|
||||
|
||||
static constexpr int WARP_SIZE = 32;
|
||||
|
||||
template <typename T>
|
||||
__global__ void _Iota(
|
||||
cub::CountingInputIterator<T> input,
|
||||
|
|
@ -47,7 +45,7 @@ __global__ void _GatherGradImpl(
|
|||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < SZ; ii++) {
|
||||
int feature_dim = start_feature + ii * WARP_SIZE;
|
||||
int feature_dim = start_feature + ii * GPU_WARP_SIZE;
|
||||
if (feature_dim < stride) {
|
||||
gradient[ii] = static_cast<T>(grad_output[grad_row + feature_dim]);
|
||||
weight[ii] = static_cast<T>(grad_weight[weight_row + feature_dim]);
|
||||
|
|
@ -61,7 +59,7 @@ __global__ void _GatherGradImpl(
|
|||
|
||||
#pragma unroll
|
||||
for (int ii = 0; ii < SZ; ii++) {
|
||||
int feature_dim = start_feature + ii * WARP_SIZE;
|
||||
int feature_dim = start_feature + ii * GPU_WARP_SIZE;
|
||||
if (feature_dim < stride) {
|
||||
grad_weight[weight_row + feature_dim] = static_cast<T>(weight[ii]);
|
||||
}
|
||||
|
|
@ -114,8 +112,8 @@ void GatherGradImpl(
|
|||
original_indices.get(), original_indices_sorted.get(),
|
||||
num_indices));
|
||||
|
||||
dim3 block(GPU_WARP_SIZE, 4);
|
||||
dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, 128));
|
||||
dim3 block(WARP_SIZE, 4);
|
||||
|
||||
_GatherGradImpl<<<grid, block>>>(
|
||||
indices_data_sorted.get(),
|
||||
|
|
|
|||
Loading…
Reference in a new issue