Refactoring code related to WARP_SIZE. (#3623)

1. Centralize its definition in common.cuh. 2. Rename it to GPU_WARP_SIZE which can be extended to AMD GPU later. 3. Centralize warp shuffle functions. Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
2026-06-20 02:07:56 +00:00 · 2020-04-22 15:19:06 -07:00 · 2020-04-22 15:19:06 -07:00 · e4fc83252d
commit e4fc83252d
parent bb9b0ba5b3
11 changed files with 128 additions and 163 deletions
--- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu
@ -32,35 +32,6 @@ namespace cuda {

 using namespace onnxruntime::cuda;

-template <typename T>
-
-__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
-{
-#if CUDA_VERSION >= 9000
-
-  return __shfl_sync(mask, value, srcLane, width);
-
-#else
-
-  return __shfl(value, srcLane, width);
-
-#endif
-}
-
-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
-{
-#if CUDA_VERSION >= 9000
-
-  return __shfl_xor_sync(mask, value, laneMask, width);
-
-#else
-
-  return __shfl_xor(value, laneMask, width);
-
-#endif
-}
-
 template <typename U>
 __device__ void cuWelfordOnlineSum(
    const U curr,
@ -82,8 +53,7 @@ __device__ void cuChanOnlineSum(
    const U countB,
    U& mu,
    U& sigma2,
-    U& count,
-    const int& warp_size) {
+    U& count) {
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
@ -108,10 +78,9 @@ __device__ void cuWelfordMuSigma2(
    const int i1,
    U& mu,
    U& sigma2,
-    U* buf,
-    const int warp_size) {
+    U* buf) {
  // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
@ -140,10 +109,10 @@ __device__ void cuWelfordMuSigma2(
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      U muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      U countB = WARP_SHFL(count, srcLaneB, warp_size);
-      U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      U muB = WARP_SHFL(mu, srcLaneB);
+      U countB = WARP_SHFL(count, srcLaneB);
+      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
@ -164,7 +133,7 @@ __device__ void cuWelfordMuSigma2(
          U muB = ubuf[2 * threadIdx.y];
          U sigma2B = ubuf[2 * threadIdx.y + 1];
          U countB = ibuf[threadIdx.y];
-          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
@ -178,8 +147,8 @@ __device__ void cuWelfordMuSigma2(
      sigma2 = ubuf[1] / U(n2);
      // don't care about final value of count, we know count == n2
    } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
    }
  }
 }
@ -192,10 +161,9 @@ __device__ void cuWelfordMuSigma2(
    const int i1,
    float& mu,
    float& sigma2,
-    float* buf,
-    const int warp_size) {
+    float* buf) {
  // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
@ -235,10 +203,10 @@ __device__ void cuWelfordMuSigma2(
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      float muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      float countB = WARP_SHFL(count, srcLaneB, warp_size);
-      float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      float muB = WARP_SHFL(mu, srcLaneB);
+      float countB = WARP_SHFL(count, srcLaneB);
+      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
@ -259,7 +227,7 @@ __device__ void cuWelfordMuSigma2(
          float muB = ubuf[2 * threadIdx.y];
          float sigma2B = ubuf[2 * threadIdx.y + 1];
          float countB = ibuf[threadIdx.y];
-          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
@ -273,8 +241,8 @@ __device__ void cuWelfordMuSigma2(
      sigma2 = ubuf[1] / float(n2);
      // don't care about final value of count, we know count == n2
    } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
    }
  }
 }
@ -337,17 +305,16 @@ __global__ void cuApplyLayerNorm(
    const int n2,
    const U epsilon,
    const T* __restrict__ gamma,
-    const T* __restrict__ beta,
-    int warp_size) {
+    const T* __restrict__ beta) {
  // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensors are contiguous
  //
  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu, sigma2;
-    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
+    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
    const T* lvals = vals + i1 * n2;
    T* ovals = output_vals + i1 * n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
@ -385,6 +352,8 @@ void HostApplyLayerNorm(
    const T* beta) {
  const uint64_t maxGridY = prop.maxGridSize[1];
  const int warp_size = prop.warpSize;
+  ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
+
  const dim3 threads(warp_size, 4, 1);
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared =
@ -396,7 +365,7 @@ void HostApplyLayerNorm(
      input,
      n1, n2,
      U(epsilon),
-      gamma, beta, warp_size);
+      gamma, beta);
 }

 #define LAYERNORM_LINEAR_IMPL(T, U)                                                                       \
--- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu
@ -3,7 +3,6 @@ Copyright(c) 2016 Facebook Inc.
 ==============================================================================*/
 /* Modifications Copyright (c) Microsoft. */

-#pragma once
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include "core/providers/cuda/cu_inc/binary_elementwise_impl.cuh"
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@ -219,9 +219,9 @@ __device__ __inline__ T _Gelu(T a) {
  return a * _Normcdf(a);
 }

+
 // We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
 // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
-
 #ifndef CUDA_LONG
 #define CUDA_LONG int32_t
 #endif
@ -239,6 +239,7 @@ struct GridDim {
  };
 };

+
 #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N)          \
  CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;     \
  if (id >= N)                                              \
@ -253,5 +254,48 @@ struct GridDim {
 #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
 #endif // __APPLE__

+// WARP related definitions and functions
+constexpr int GPU_WARP_SIZE = 32;
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(mask, value, srcLane, width);
+#else
+  return __shfl(value, srcLane, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+  return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_up_sync(mask, value, delta, width);
+#else
+  return __shfl_up(value, delta, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(mask, value, delta, width);
+#else
+  return __shfl_down(value, delta, width);
+#endif
+}
+
 }  // namespace cuda
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/cuda/math/softmax_impl.cu
+++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cu
@ -32,7 +32,7 @@ namespace cuda {
 // The template arguments have the following meaning:
 // One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
 // WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
-// A "WARP" contains "CUDA_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
+// A "WARP" contains "GPU_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
 // This is important because it means only __shfl_ instructions are required for reductions.
 // Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
 // CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
@ -48,7 +48,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
 __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batch_size, int stride, int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
-  constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+  constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;

@ -143,7 +143,7 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
    const int next_power_of_two = 1 << log2_elements;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
-    int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+    int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
--- a/onnxruntime/core/providers/cuda/math/softmax_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cuh
@ -23,23 +23,12 @@
 namespace onnxruntime {
 namespace cuda {

-constexpr int CUDA_WARP_SIZE = 32;
-
 inline int log2_ceil(int value) {
  int log2_value = 0;
  while ((1 << log2_value) < value) ++log2_value;
  return log2_value;
 }

-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) {
-#if CUDA_VERSION >= 9000
-  return __shfl_xor_sync(mask, value, laneMask, width);
-#else
-  return __shfl_xor(value, laneMask, width);
-#endif
-}
-
 template <typename T>
 struct Add {
  __device__ __forceinline__ T operator()(T a, T b) const {
--- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
@ -99,7 +99,7 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output
  TOut value_ = value;
 #pragma unroll
  for (int stride = NUM_THREADS_PER_WARP / 2; stride > 0; stride /= 2) {
-    value_ += __shfl_down_sync(ALL_ONE_MASK, value_, stride);
+    value_ += WARP_SHFL_DOWN(value_, stride);
  }

  // Return early if only one warp is used for reduction.
@ -366,10 +366,9 @@ template<typename TIn, typename TOut, typename TBuf>
 void call_reduce_matrix_rows(const TIn *input, TOut *output, int m, int n) {
  constexpr int max_num_threads_in_block = 512;
  constexpr int max_num_blocks_in_grid = 512;
-  constexpr int warp_size = 32;
  constexpr int load_count_per_thread = 4;

-  const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, warp_size)));
+  const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE)));
  const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
  const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
  const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));
--- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu
@ -30,7 +30,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
 __global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, const input_t* output, int batch_size, int stride, int element_count) {
  // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel.
  constexpr int next_power_of_two = 1 << log2_elements;
-  constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+  constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
  constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
  constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;

@ -116,7 +116,7 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const
    const int next_power_of_two = 1 << log2_elements;

    // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
-    int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+    int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;

    // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
    int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
--- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu
@ -30,34 +30,6 @@ namespace cuda {

 using namespace onnxruntime::cuda;

-template <typename T>
-
-__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
-
-{
-#if CUDA_VERSION >= 9000
-  return __shfl_sync(mask, value, srcLane, width);
-#else
-  return __shfl(value, srcLane, width);
-#endif
-}
-
-template <typename T>
-
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
-
-{
-#if CUDA_VERSION >= 9000
-
-  return __shfl_xor_sync(mask, value, laneMask, width);
-
-#else
-
-  return __shfl_xor(value, laneMask, width);
-
-#endif
-}
-
 template <typename U>
 __device__ void cuWelfordOnlineSum(
    const U curr,
@ -79,8 +51,7 @@ __device__ void cuChanOnlineSum(
    const U countB,
    U& mu,
    U& sigma2,
-    U& count,
-    const int& warp_size) {
+    U& count) {
  U delta = muB - mu;
  U nA = count;
  U nB = countB;
@ -105,10 +76,9 @@ __device__ void cuWelfordMuSigma2(
    const int i1,
    U& mu,
    U& sigma2,
-    U* buf,
-    const int warp_size) {
+    U* buf) {
  // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
@ -137,10 +107,10 @@ __device__ void cuWelfordMuSigma2(
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      U muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      U countB = WARP_SHFL(count, srcLaneB, warp_size);
-      U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      U muB = WARP_SHFL(mu, srcLaneB);
+      U countB = WARP_SHFL(count, srcLaneB);
+      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
@ -161,7 +131,7 @@ __device__ void cuWelfordMuSigma2(
          U muB = ubuf[2 * threadIdx.y];
          U sigma2B = ubuf[2 * threadIdx.y + 1];
          U countB = ibuf[threadIdx.y];
-          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
@ -175,8 +145,8 @@ __device__ void cuWelfordMuSigma2(
      sigma2 = ubuf[1] / U(n2);
      // don't care about final value of count, we know count == n2
    } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
    }
  }
 }
@ -189,10 +159,9 @@ __device__ void cuWelfordMuSigma2(
    const int i1,
    float& mu,
    float& sigma2,
-    float* buf,
-    const int warp_size) {
+    float* buf) {
  // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensor is contiguous
  // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
  //
@ -232,10 +201,10 @@ __device__ void cuWelfordMuSigma2(
    // intra-warp reductions
    for (int l = 0; l <= 4; ++l) {
      int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      float muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      float countB = WARP_SHFL(count, srcLaneB, warp_size);
-      float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      float muB = WARP_SHFL(mu, srcLaneB);
+      float countB = WARP_SHFL(count, srcLaneB);
+      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
    }
    // threadIdx.x == 0 has correct values for each warp
    // inter-warp reductions
@ -256,7 +225,7 @@ __device__ void cuWelfordMuSigma2(
          float muB = ubuf[2 * threadIdx.y];
          float sigma2B = ubuf[2 * threadIdx.y + 1];
          float countB = ibuf[threadIdx.y];
-          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
        }
        __syncthreads();
      }
@ -270,8 +239,8 @@ __device__ void cuWelfordMuSigma2(
      sigma2 = ubuf[1] / float(n2);
      // don't care about final value of count, we know count == n2
    } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
    }
  }
 }
@ -334,17 +303,16 @@ __global__ void cuApplyLayerNorm(
    const int n2,
    const U epsilon,
    const T* __restrict__ gamma,
-    const T* __restrict__ beta,
-    int warp_size) {
+    const T* __restrict__ beta) {
  // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
  // 2) Tensors are contiguous
  //
  for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
    SharedMemory<U> shared;
    U* buf = shared.getPointer();
    U mu, sigma2;
-    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
+    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
    const T* lvals = vals + i1 * n2;
    T* ovals = output_vals + i1 * n2;
    U c_invvar = rsqrt(sigma2 + epsilon);
@ -382,6 +350,8 @@ void HostApplyLayerNorm(
    const T* beta) {
  const uint64_t maxGridY = prop.maxGridSize[1];
  const int warp_size = prop.warpSize;
+  ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
+
  const dim3 threads(warp_size, 4, 1);
  const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
  int nshared =
@ -393,7 +363,7 @@ void HostApplyLayerNorm(
      input,
      n1, n2,
      U(epsilon),
-      gamma, beta, warp_size);
+      gamma, beta);
 }

 #define LAYERNORM_LINEAR_IMPL(T, U)                                                                       \
@ -727,7 +697,10 @@ void HostLayerNormGradient(
    U* part_grad_gamma,
    U* part_grad_beta,
    const int part_size) {
-  const dim3 threads2(prop.warpSize, 4, 1);
+  const int warp_size = prop.warpSize;
+  ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
+
+  const dim3 threads2(warp_size, 4, 1);
  const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
  const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
  const int nshared2_b = threads2.x * threads2.y * sizeof(U);
@ -742,7 +715,7 @@ void HostLayerNormGradient(
      part_grad_gamma,
      part_grad_beta);

-  const dim3 threads3(prop.warpSize, 8, 1);
+  const dim3 threads3(warp_size, 8, 1);
  const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
  const int nshared3 = threads3.x * threads3.y * sizeof(U);
  cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, 0>>>(
@ -756,7 +729,7 @@ void HostLayerNormGradient(
  // compute grad_input
  const uint64_t maxGridY = prop.maxGridSize[1];
  const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
-  const dim3 threads1(prop.warpSize, 4, 1);
+  const dim3 threads1(warp_size, 4, 1);
  int nshared =
      threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
  cuComputeGradInput<<<blocks1, threads1, nshared, 0>>>(
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu
@ -467,17 +467,16 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
    }
  }

-  // Thread count in a block must be a multiple of 32.
-  constexpr int warp_size = 32;
+  // Thread count in a block must be a multiple of GPU_WARP_SIZE.
 #pragma unroll
-  for (int stride = warp_size / 2; stride > 0; stride /= 2) {
-    w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
-    d_sum += __shfl_down_sync(0xFFFFFFFF, d_sum, stride);
+  for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
+    w_sum += WARP_SHFL_DOWN(w_sum, stride);
+    d_sum += WARP_SHFL_DOWN(d_sum, stride);
  }

-  const int warp_count_in_block = blockDim.x / warp_size;
-  const int lid = threadIdx.x % warp_size;
-  const int wid = threadIdx.x / warp_size;
+  const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
+  const int lid = threadIdx.x % GPU_WARP_SIZE;
+  const int wid = threadIdx.x / GPU_WARP_SIZE;

  // Shape is 2 x warp_count_in_block.
  extern __shared__ unsigned char shared_memory_[];
@ -511,13 +510,11 @@ template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename
 void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group) {
  // thread count per block.
  constexpr int thread_count = ChunkGroup<4>::thread_count_per_block;
-  // warp size of GPU.
-  constexpr int warp_size = 32;
  // shared memory's size per block.
-  const int shared_memory_size = thread_count / warp_size * 2 * sizeof(TBuf);
+  const int shared_memory_size = thread_count / GPU_WARP_SIZE * 2 * sizeof(TBuf);

  // Enforce assumptions used inside this reduction CUDA kernel.
-  ORT_ENFORCE(thread_count % warp_size == 0);
+  ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
  ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);

  LambMultiTensorReductionImpl<TIn1, TIn2, TOut1, TOut2, TBuf><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group);
--- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu
+++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu
@ -46,16 +46,15 @@ __global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output)
    }
  }

-  // Thread count in a block must be a multiple of 32.
-  constexpr int warp_size = 32;
+  // Thread count in a block must be a multiple of GPU_WARP_SIZE.
 #pragma unroll
-  for (int stride = warp_size / 2; stride > 0; stride /= 2) {
-    w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
+  for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
+    w_sum += WARP_SHFL_DOWN(w_sum, stride);
  }

-  const int warp_count_in_block = blockDim.x / warp_size;
-  const int lid = threadIdx.x % warp_size;
-  const int wid = threadIdx.x / warp_size;
+  const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
+  const int lid = threadIdx.x % GPU_WARP_SIZE;
+  const int wid = threadIdx.x / GPU_WARP_SIZE;

  // Shape is 2 x warp_count_in_block.
  extern __shared__ unsigned char shared_memory_[];
@ -84,13 +83,11 @@ template <typename TIn, typename TOut, typename TBuf, typename TInOp, typename T
 void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) {
  // thread count per block.
  constexpr int thread_count = ChunkGroup<1>::thread_count_per_block;
-  // warp size of GPU.
-  constexpr int warp_size = 32;
  // shared memory's size per block.
-  const int shared_memory_size = thread_count / warp_size * sizeof(TBuf);
+  const int shared_memory_size = thread_count / GPU_WARP_SIZE * sizeof(TBuf);

  // Enforce assumptions used inside this reduction CUDA kernel.
-  ORT_ENFORCE(thread_count % warp_size == 0);
+  ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
  ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);

  _MultiTensorReduceImpl<TIn, TOut, TBuf, TInOp, TOutOp><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group, output);
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu
@ -11,8 +11,6 @@
 namespace onnxruntime {
 namespace cuda {

-static constexpr int WARP_SIZE = 32;
-
 template <typename T>
 __global__ void _Iota(
    cub::CountingInputIterator<T> input,
@ -47,7 +45,7 @@ __global__ void _GatherGradImpl(

 #pragma unroll
        for (int ii = 0; ii < SZ; ii++) {
-          int feature_dim = start_feature + ii * WARP_SIZE;
+          int feature_dim = start_feature + ii * GPU_WARP_SIZE;
          if (feature_dim < stride) {
            gradient[ii] = static_cast<T>(grad_output[grad_row + feature_dim]);
            weight[ii] = static_cast<T>(grad_weight[weight_row + feature_dim]);
@ -61,7 +59,7 @@ __global__ void _GatherGradImpl(

 #pragma unroll
        for (int ii = 0; ii < SZ; ii++) {
-          int feature_dim = start_feature + ii * WARP_SIZE;
+          int feature_dim = start_feature + ii * GPU_WARP_SIZE;
          if (feature_dim < stride) {
            grad_weight[weight_row + feature_dim] = static_cast<T>(weight[ii]);
          }
@ -114,8 +112,8 @@ void GatherGradImpl(
      original_indices.get(), original_indices_sorted.get(),
      num_indices));

+  dim3 block(GPU_WARP_SIZE, 4);
  dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, 128));
-  dim3 block(WARP_SIZE, 4);

  _GatherGradImpl<<<grid, block>>>(
      indices_data_sorted.get(),