diff --git a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu index e65e22727a..d6867949d0 100644 --- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu +++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu @@ -32,35 +32,6 @@ namespace cuda { using namespace onnxruntime::cuda; -template - -__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff) -{ -#if CUDA_VERSION >= 9000 - - return __shfl_sync(mask, value, srcLane, width); - -#else - - return __shfl(value, srcLane, width); - -#endif -} - -template -__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) -{ -#if CUDA_VERSION >= 9000 - - return __shfl_xor_sync(mask, value, laneMask, width); - -#else - - return __shfl_xor(value, laneMask, width); - -#endif -} - template __device__ void cuWelfordOnlineSum( const U curr, @@ -82,8 +53,7 @@ __device__ void cuChanOnlineSum( const U countB, U& mu, U& sigma2, - U& count, - const int& warp_size) { + U& count) { U delta = muB - mu; U nA = count; U nB = countB; @@ -108,10 +78,9 @@ __device__ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf, - const int warp_size) { + U* buf) { // Assumptions: - // 1) blockDim.x == warpSize + // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensor is contiguous // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. // @@ -140,10 +109,10 @@ __device__ void cuWelfordMuSigma2( // intra-warp reductions for (int l = 0; l <= 4; ++l) { int srcLaneB = (threadIdx.x + (1 << l)) & 31; - U muB = WARP_SHFL(mu, srcLaneB, warp_size); - U countB = WARP_SHFL(count, srcLaneB, warp_size); - U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + U muB = WARP_SHFL(mu, srcLaneB); + U countB = WARP_SHFL(count, srcLaneB); + U sigma2B = WARP_SHFL(sigma2, srcLaneB); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -164,7 +133,7 @@ __device__ void cuWelfordMuSigma2( U muB = ubuf[2 * threadIdx.y]; U sigma2B = ubuf[2 * threadIdx.y + 1]; U countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } __syncthreads(); } @@ -178,8 +147,8 @@ __device__ void cuWelfordMuSigma2( sigma2 = ubuf[1] / U(n2); // don't care about final value of count, we know count == n2 } else { - mu = WARP_SHFL(mu, 0, warp_size); - sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size); + mu = WARP_SHFL(mu, 0); + sigma2 = WARP_SHFL(sigma2 / U(n2), 0); } } } @@ -192,10 +161,9 @@ __device__ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf, - const int warp_size) { + float* buf) { // Assumptions: - // 1) blockDim.x == warpSize + // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensor is contiguous // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. // @@ -235,10 +203,10 @@ __device__ void cuWelfordMuSigma2( // intra-warp reductions for (int l = 0; l <= 4; ++l) { int srcLaneB = (threadIdx.x + (1 << l)) & 31; - float muB = WARP_SHFL(mu, srcLaneB, warp_size); - float countB = WARP_SHFL(count, srcLaneB, warp_size); - float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + float muB = WARP_SHFL(mu, srcLaneB); + float countB = WARP_SHFL(count, srcLaneB); + float sigma2B = WARP_SHFL(sigma2, srcLaneB); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -259,7 +227,7 @@ __device__ void cuWelfordMuSigma2( float muB = ubuf[2 * threadIdx.y]; float sigma2B = ubuf[2 * threadIdx.y + 1]; float countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } __syncthreads(); } @@ -273,8 +241,8 @@ __device__ void cuWelfordMuSigma2( sigma2 = ubuf[1] / float(n2); // don't care about final value of count, we know count == n2 } else { - mu = WARP_SHFL(mu, 0, warp_size); - sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size); + mu = WARP_SHFL(mu, 0); + sigma2 = WARP_SHFL(sigma2 / float(n2), 0); } } } @@ -337,17 +305,16 @@ __global__ void cuApplyLayerNorm( const int n2, const U epsilon, const T* __restrict__ gamma, - const T* __restrict__ beta, - int warp_size) { + const T* __restrict__ beta) { // Assumptions: - // 1) blockDim.x == warpSize + // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensors are contiguous // for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) { SharedMemory shared; U* buf = shared.getPointer(); U mu, sigma2; - cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size); + cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf); const T* lvals = vals + i1 * n2; T* ovals = output_vals + i1 * n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -385,6 +352,8 @@ void HostApplyLayerNorm( const T* beta) { const uint64_t maxGridY = prop.maxGridSize[1]; const int warp_size = prop.warpSize; + ORT_ENFORCE(warp_size == GPU_WARP_SIZE); + const dim3 threads(warp_size, 4, 1); const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); int nshared = @@ -396,7 +365,7 @@ void HostApplyLayerNorm( input, n1, n2, U(epsilon), - gamma, beta, warp_size); + gamma, beta); } #define LAYERNORM_LINEAR_IMPL(T, U) \ diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu index 995bf54480..0f8cc58521 100644 --- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu +++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu @@ -3,7 +3,6 @@ Copyright(c) 2016 Facebook Inc. ==============================================================================*/ /* Modifications Copyright (c) Microsoft. */ -#pragma once #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/cu_inc/binary_elementwise_impl.cuh" #include "core/providers/cuda/shared_inc/cuda_utils.h" diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 6884d236c9..96395a947d 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -219,9 +219,9 @@ __device__ __inline__ T _Gelu(T a) { return a * _Normcdf(a); } + // We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type. - #ifndef CUDA_LONG #define CUDA_LONG int32_t #endif @@ -239,6 +239,7 @@ struct GridDim { }; }; + #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N) \ CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; \ if (id >= N) \ @@ -253,5 +254,48 @@ struct GridDim { #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__) #endif // __APPLE__ +// WARP related definitions and functions +constexpr int GPU_WARP_SIZE = 32; + +template +__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_sync(mask, value, srcLane, width); +#else + return __shfl(value, srcLane, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_up_sync(mask, value, delta, width); +#else + return __shfl_up(value, delta, width); +#endif +} + +template +__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff) +{ +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(mask, value, delta, width); +#else + return __shfl_down(value, delta, width); +#endif +} + } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/math/softmax_impl.cu b/onnxruntime/core/providers/cuda/math/softmax_impl.cu index 71e7a495fb..6aec65a38c 100644 --- a/onnxruntime/core/providers/cuda/math/softmax_impl.cu +++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cu @@ -32,7 +32,7 @@ namespace cuda { // The template arguments have the following meaning: // One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples. // WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small. -// A "WARP" contains "CUDA_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp. +// A "WARP" contains "GPU_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp. // This is important because it means only __shfl_ instructions are required for reductions. // Note that this means WARP_SIZE must be a power of two and <= architecture warp size. // CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch. @@ -48,7 +48,7 @@ template -__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) { -#if CUDA_VERSION >= 9000 - return __shfl_xor_sync(mask, value, laneMask, width); -#else - return __shfl_xor(value, laneMask, width); -#endif -} - template struct Add { __device__ __forceinline__ T operator()(T a, T b) const { diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu index de437d08a2..142cb3b302 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu +++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu @@ -99,7 +99,7 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output TOut value_ = value; #pragma unroll for (int stride = NUM_THREADS_PER_WARP / 2; stride > 0; stride /= 2) { - value_ += __shfl_down_sync(ALL_ONE_MASK, value_, stride); + value_ += WARP_SHFL_DOWN(value_, stride); } // Return early if only one warp is used for reduction. @@ -366,10 +366,9 @@ template void call_reduce_matrix_rows(const TIn *input, TOut *output, int m, int n) { constexpr int max_num_threads_in_block = 512; constexpr int max_num_blocks_in_grid = 512; - constexpr int warp_size = 32; constexpr int load_count_per_thread = 4; - const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, warp_size))); + const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE))); const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread))); const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid)); const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4)); diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu index c6f69990b9..0f4140c1f8 100644 --- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu @@ -30,7 +30,7 @@ template - -__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff) - -{ -#if CUDA_VERSION >= 9000 - return __shfl_sync(mask, value, srcLane, width); -#else - return __shfl(value, srcLane, width); -#endif -} - -template - -__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) - -{ -#if CUDA_VERSION >= 9000 - - return __shfl_xor_sync(mask, value, laneMask, width); - -#else - - return __shfl_xor(value, laneMask, width); - -#endif -} - template __device__ void cuWelfordOnlineSum( const U curr, @@ -79,8 +51,7 @@ __device__ void cuChanOnlineSum( const U countB, U& mu, U& sigma2, - U& count, - const int& warp_size) { + U& count) { U delta = muB - mu; U nA = count; U nB = countB; @@ -105,10 +76,9 @@ __device__ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf, - const int warp_size) { + U* buf) { // Assumptions: - // 1) blockDim.x == warpSize + // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensor is contiguous // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. // @@ -137,10 +107,10 @@ __device__ void cuWelfordMuSigma2( // intra-warp reductions for (int l = 0; l <= 4; ++l) { int srcLaneB = (threadIdx.x + (1 << l)) & 31; - U muB = WARP_SHFL(mu, srcLaneB, warp_size); - U countB = WARP_SHFL(count, srcLaneB, warp_size); - U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + U muB = WARP_SHFL(mu, srcLaneB); + U countB = WARP_SHFL(count, srcLaneB); + U sigma2B = WARP_SHFL(sigma2, srcLaneB); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -161,7 +131,7 @@ __device__ void cuWelfordMuSigma2( U muB = ubuf[2 * threadIdx.y]; U sigma2B = ubuf[2 * threadIdx.y + 1]; U countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } __syncthreads(); } @@ -175,8 +145,8 @@ __device__ void cuWelfordMuSigma2( sigma2 = ubuf[1] / U(n2); // don't care about final value of count, we know count == n2 } else { - mu = WARP_SHFL(mu, 0, warp_size); - sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size); + mu = WARP_SHFL(mu, 0); + sigma2 = WARP_SHFL(sigma2 / U(n2), 0); } } } @@ -189,10 +159,9 @@ __device__ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf, - const int warp_size) { + float* buf) { // Assumptions: - // 1) blockDim.x == warpSize + // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensor is contiguous // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available. // @@ -232,10 +201,10 @@ __device__ void cuWelfordMuSigma2( // intra-warp reductions for (int l = 0; l <= 4; ++l) { int srcLaneB = (threadIdx.x + (1 << l)) & 31; - float muB = WARP_SHFL(mu, srcLaneB, warp_size); - float countB = WARP_SHFL(count, srcLaneB, warp_size); - float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size); - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + float muB = WARP_SHFL(mu, srcLaneB); + float countB = WARP_SHFL(count, srcLaneB); + float sigma2B = WARP_SHFL(sigma2, srcLaneB); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -256,7 +225,7 @@ __device__ void cuWelfordMuSigma2( float muB = ubuf[2 * threadIdx.y]; float sigma2B = ubuf[2 * threadIdx.y + 1]; float countB = ibuf[threadIdx.y]; - cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } __syncthreads(); } @@ -270,8 +239,8 @@ __device__ void cuWelfordMuSigma2( sigma2 = ubuf[1] / float(n2); // don't care about final value of count, we know count == n2 } else { - mu = WARP_SHFL(mu, 0, warp_size); - sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size); + mu = WARP_SHFL(mu, 0); + sigma2 = WARP_SHFL(sigma2 / float(n2), 0); } } } @@ -334,17 +303,16 @@ __global__ void cuApplyLayerNorm( const int n2, const U epsilon, const T* __restrict__ gamma, - const T* __restrict__ beta, - int warp_size) { + const T* __restrict__ beta) { // Assumptions: - // 1) blockDim.x == warpSize + // 1) blockDim.x == GPU_WARP_SIZE // 2) Tensors are contiguous // for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) { SharedMemory shared; U* buf = shared.getPointer(); U mu, sigma2; - cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size); + cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf); const T* lvals = vals + i1 * n2; T* ovals = output_vals + i1 * n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -382,6 +350,8 @@ void HostApplyLayerNorm( const T* beta) { const uint64_t maxGridY = prop.maxGridSize[1]; const int warp_size = prop.warpSize; + ORT_ENFORCE(warp_size == GPU_WARP_SIZE); + const dim3 threads(warp_size, 4, 1); const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); int nshared = @@ -393,7 +363,7 @@ void HostApplyLayerNorm( input, n1, n2, U(epsilon), - gamma, beta, warp_size); + gamma, beta); } #define LAYERNORM_LINEAR_IMPL(T, U) \ @@ -727,7 +697,10 @@ void HostLayerNormGradient( U* part_grad_gamma, U* part_grad_beta, const int part_size) { - const dim3 threads2(prop.warpSize, 4, 1); + const int warp_size = prop.warpSize; + ORT_ENFORCE(warp_size == GPU_WARP_SIZE); + + const dim3 threads2(warp_size, 4, 1); const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); const int nshared2_b = threads2.x * threads2.y * sizeof(U); @@ -742,7 +715,7 @@ void HostLayerNormGradient( part_grad_gamma, part_grad_beta); - const dim3 threads3(prop.warpSize, 8, 1); + const dim3 threads3(warp_size, 8, 1); const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -756,7 +729,7 @@ void HostLayerNormGradient( // compute grad_input const uint64_t maxGridY = prop.maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(prop.warpSize, 4, 1); + const dim3 threads1(warp_size, 4, 1); int nshared = threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0; cuComputeGradInput<<>>( diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu index 94832b484f..1444d67cc8 100644 --- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu +++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu @@ -467,17 +467,16 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) { } } - // Thread count in a block must be a multiple of 32. - constexpr int warp_size = 32; + // Thread count in a block must be a multiple of GPU_WARP_SIZE. #pragma unroll - for (int stride = warp_size / 2; stride > 0; stride /= 2) { - w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride); - d_sum += __shfl_down_sync(0xFFFFFFFF, d_sum, stride); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + w_sum += WARP_SHFL_DOWN(w_sum, stride); + d_sum += WARP_SHFL_DOWN(d_sum, stride); } - const int warp_count_in_block = blockDim.x / warp_size; - const int lid = threadIdx.x % warp_size; - const int wid = threadIdx.x / warp_size; + const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE; + const int lid = threadIdx.x % GPU_WARP_SIZE; + const int wid = threadIdx.x / GPU_WARP_SIZE; // Shape is 2 x warp_count_in_block. extern __shared__ unsigned char shared_memory_[]; @@ -511,13 +510,11 @@ template ::operator()(ChunkGroup<4> chunk_group) { // thread count per block. constexpr int thread_count = ChunkGroup<4>::thread_count_per_block; - // warp size of GPU. - constexpr int warp_size = 32; // shared memory's size per block. - const int shared_memory_size = thread_count / warp_size * 2 * sizeof(TBuf); + const int shared_memory_size = thread_count / GPU_WARP_SIZE * 2 * sizeof(TBuf); // Enforce assumptions used inside this reduction CUDA kernel. - ORT_ENFORCE(thread_count % warp_size == 0); + ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0); ORT_ENFORCE((thread_count & (thread_count - 1)) == 0); LambMultiTensorReductionImpl<<>>(chunk_group); diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu index 5509962f54..da89a00d22 100644 --- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu +++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu @@ -46,16 +46,15 @@ __global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output) } } - // Thread count in a block must be a multiple of 32. - constexpr int warp_size = 32; + // Thread count in a block must be a multiple of GPU_WARP_SIZE. #pragma unroll - for (int stride = warp_size / 2; stride > 0; stride /= 2) { - w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + w_sum += WARP_SHFL_DOWN(w_sum, stride); } - const int warp_count_in_block = blockDim.x / warp_size; - const int lid = threadIdx.x % warp_size; - const int wid = threadIdx.x / warp_size; + const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE; + const int lid = threadIdx.x % GPU_WARP_SIZE; + const int wid = threadIdx.x / GPU_WARP_SIZE; // Shape is 2 x warp_count_in_block. extern __shared__ unsigned char shared_memory_[]; @@ -84,13 +83,11 @@ template chunk_group, TOut* output) { // thread count per block. constexpr int thread_count = ChunkGroup<1>::thread_count_per_block; - // warp size of GPU. - constexpr int warp_size = 32; // shared memory's size per block. - const int shared_memory_size = thread_count / warp_size * sizeof(TBuf); + const int shared_memory_size = thread_count / GPU_WARP_SIZE * sizeof(TBuf); // Enforce assumptions used inside this reduction CUDA kernel. - ORT_ENFORCE(thread_count % warp_size == 0); + ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0); ORT_ENFORCE((thread_count & (thread_count - 1)) == 0); _MultiTensorReduceImpl<<>>(chunk_group, output); diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu index ac1f60d842..1fc15f2218 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu +++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu @@ -11,8 +11,6 @@ namespace onnxruntime { namespace cuda { -static constexpr int WARP_SIZE = 32; - template __global__ void _Iota( cub::CountingInputIterator input, @@ -47,7 +45,7 @@ __global__ void _GatherGradImpl( #pragma unroll for (int ii = 0; ii < SZ; ii++) { - int feature_dim = start_feature + ii * WARP_SIZE; + int feature_dim = start_feature + ii * GPU_WARP_SIZE; if (feature_dim < stride) { gradient[ii] = static_cast(grad_output[grad_row + feature_dim]); weight[ii] = static_cast(grad_weight[weight_row + feature_dim]); @@ -61,7 +59,7 @@ __global__ void _GatherGradImpl( #pragma unroll for (int ii = 0; ii < SZ; ii++) { - int feature_dim = start_feature + ii * WARP_SIZE; + int feature_dim = start_feature + ii * GPU_WARP_SIZE; if (feature_dim < stride) { grad_weight[weight_row + feature_dim] = static_cast(weight[ii]); } @@ -114,8 +112,8 @@ void GatherGradImpl( original_indices.get(), original_indices_sorted.get(), num_indices)); + dim3 block(GPU_WARP_SIZE, 4); dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, 128)); - dim3 block(WARP_SIZE, 4); _GatherGradImpl<<>>( indices_data_sorted.get(),