diff --git a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu
index e65e22727a..d6867949d0 100644
--- a/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/layer_norm_impl.cu
@@ -32,35 +32,6 @@ namespace cuda {
 
 using namespace onnxruntime::cuda;
 
-template <typename T>
-
-__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
-{
-#if CUDA_VERSION >= 9000
-
-  return __shfl_sync(mask, value, srcLane, width);
-
-#else
-
-  return __shfl(value, srcLane, width);
-
-#endif
-}
-
-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
-{
-#if CUDA_VERSION >= 9000
-
-  return __shfl_xor_sync(mask, value, laneMask, width);
-
-#else
-
-  return __shfl_xor(value, laneMask, width);
-
-#endif
-}
-
 template <typename U>
 __device__ void cuWelfordOnlineSum(
     const U curr,
@@ -82,8 +53,7 @@ __device__ void cuChanOnlineSum(
     const U countB,
     U& mu,
     U& sigma2,
-    U& count,
-    const int& warp_size) {
+    U& count) {
   U delta = muB - mu;
   U nA = count;
   U nB = countB;
@@ -108,10 +78,9 @@ __device__ void cuWelfordMuSigma2(
     const int i1,
     U& mu,
     U& sigma2,
-    U* buf,
-    const int warp_size) {
+    U* buf) {
   // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
   // 2) Tensor is contiguous
   // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
   //
@@ -140,10 +109,10 @@ __device__ void cuWelfordMuSigma2(
     // intra-warp reductions
     for (int l = 0; l <= 4; ++l) {
       int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      U muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      U countB = WARP_SHFL(count, srcLaneB, warp_size);
-      U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      U muB = WARP_SHFL(mu, srcLaneB);
+      U countB = WARP_SHFL(count, srcLaneB);
+      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
     }
     // threadIdx.x == 0 has correct values for each warp
     // inter-warp reductions
@@ -164,7 +133,7 @@ __device__ void cuWelfordMuSigma2(
           U muB = ubuf[2 * threadIdx.y];
           U sigma2B = ubuf[2 * threadIdx.y + 1];
           U countB = ibuf[threadIdx.y];
-          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
         }
         __syncthreads();
       }
@@ -178,8 +147,8 @@ __device__ void cuWelfordMuSigma2(
       sigma2 = ubuf[1] / U(n2);
       // don't care about final value of count, we know count == n2
     } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
     }
   }
 }
@@ -192,10 +161,9 @@ __device__ void cuWelfordMuSigma2(
     const int i1,
     float& mu,
     float& sigma2,
-    float* buf,
-    const int warp_size) {
+    float* buf) {
   // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
   // 2) Tensor is contiguous
   // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
   //
@@ -235,10 +203,10 @@ __device__ void cuWelfordMuSigma2(
     // intra-warp reductions
     for (int l = 0; l <= 4; ++l) {
       int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      float muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      float countB = WARP_SHFL(count, srcLaneB, warp_size);
-      float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      float muB = WARP_SHFL(mu, srcLaneB);
+      float countB = WARP_SHFL(count, srcLaneB);
+      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
     }
     // threadIdx.x == 0 has correct values for each warp
     // inter-warp reductions
@@ -259,7 +227,7 @@ __device__ void cuWelfordMuSigma2(
           float muB = ubuf[2 * threadIdx.y];
           float sigma2B = ubuf[2 * threadIdx.y + 1];
           float countB = ibuf[threadIdx.y];
-          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
         }
         __syncthreads();
       }
@@ -273,8 +241,8 @@ __device__ void cuWelfordMuSigma2(
       sigma2 = ubuf[1] / float(n2);
       // don't care about final value of count, we know count == n2
     } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
     }
   }
 }
@@ -337,17 +305,16 @@ __global__ void cuApplyLayerNorm(
     const int n2,
     const U epsilon,
     const T* __restrict__ gamma,
-    const T* __restrict__ beta,
-    int warp_size) {
+    const T* __restrict__ beta) {
   // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
   // 2) Tensors are contiguous
   //
   for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
     SharedMemory<U> shared;
     U* buf = shared.getPointer();
     U mu, sigma2;
-    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
+    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
     const T* lvals = vals + i1 * n2;
     T* ovals = output_vals + i1 * n2;
     U c_invvar = rsqrt(sigma2 + epsilon);
@@ -385,6 +352,8 @@ void HostApplyLayerNorm(
     const T* beta) {
   const uint64_t maxGridY = prop.maxGridSize[1];
   const int warp_size = prop.warpSize;
+  ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
+
   const dim3 threads(warp_size, 4, 1);
   const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
   int nshared =
@@ -396,7 +365,7 @@ void HostApplyLayerNorm(
       input,
       n1, n2,
       U(epsilon),
-      gamma, beta, warp_size);
+      gamma, beta);
 }
 
 #define LAYERNORM_LINEAR_IMPL(T, U)                                                                       \
diff --git a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu
index 995bf54480..0f8cc58521 100644
--- a/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/math/fft_ops_impl.cu
@@ -3,7 +3,6 @@ Copyright(c) 2016 Facebook Inc.
 ==============================================================================*/
 /* Modifications Copyright (c) Microsoft. */
 
-#pragma once
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include "core/providers/cuda/cu_inc/binary_elementwise_impl.cuh"
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
index 6884d236c9..96395a947d 100644
--- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh
+++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh
@@ -219,9 +219,9 @@ __device__ __inline__ T _Gelu(T a) {
   return a * _Normcdf(a);
 }
 
+
 // We would like to use 64-bit integer to support large matrices. However, CUDA seems to support only 32-bit integer
 // For now, use int32_t to ensure that both Linux and Windows see this as 32 bit integer type.
-
 #ifndef CUDA_LONG
 #define CUDA_LONG int32_t
 #endif
@@ -239,6 +239,7 @@ struct GridDim {
   };
 };
 
+
 #define CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N)          \
   CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;     \
   if (id >= N)                                              \
@@ -253,5 +254,48 @@ struct GridDim {
 #define CUDA_KERNEL_ASSERT(...) assert(__VA_ARGS__)
 #endif // __APPLE__
 
+// WARP related definitions and functions
+constexpr int GPU_WARP_SIZE = 32;
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_sync(mask, value, srcLane, width);
+#else
+  return __shfl(value, srcLane, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_xor_sync(mask, value, laneMask, width);
+#else
+  return __shfl_xor(value, laneMask, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_UP(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_up_sync(mask, value, delta, width);
+#else
+  return __shfl_up(value, delta, width);
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T WARP_SHFL_DOWN(T value, unsigned int delta, int width = GPU_WARP_SIZE, unsigned int mask = 0xffffffff)
+{
+#if CUDA_VERSION >= 9000
+  return __shfl_down_sync(mask, value, delta, width);
+#else
+  return __shfl_down(value, delta, width);
+#endif
+}
+
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/math/softmax_impl.cu b/onnxruntime/core/providers/cuda/math/softmax_impl.cu
index 71e7a495fb..6aec65a38c 100644
--- a/onnxruntime/core/providers/cuda/math/softmax_impl.cu
+++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cu
@@ -32,7 +32,7 @@ namespace cuda {
 // The template arguments have the following meaning:
 // One "WARP" works on one "BATCH". One "BATCH" contains "WARP_BATCH" samples.
 // WARP_BATCH is equal to 1 when element_count is large, and > 1 when element_count is small.
-// A "WARP" contains "CUDA_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
+// A "WARP" contains "GPU_WARP_SIZE" threads, these treads are guaranteed to belong to the same warp.
 // This is important because it means only __shfl_ instructions are required for reductions.
 // Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
 // CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
@@ -48,7 +48,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
 __global__ void softmax_warp_forward(output_t* dst, const input_t* src, int batch_size, int stride, int element_count) {
   // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_forward_kernel.
   constexpr int next_power_of_two = 1 << log2_elements;
-  constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+  constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
   constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
   constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
 
@@ -143,7 +143,7 @@ void dispatch_softmax_forward(output_t* dst, const input_t* src, int softmax_ele
     const int next_power_of_two = 1 << log2_elements;
 
     // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_forward.
-    int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+    int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
 
     // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward.
     int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
diff --git a/onnxruntime/core/providers/cuda/math/softmax_impl.cuh b/onnxruntime/core/providers/cuda/math/softmax_impl.cuh
index 2ca5c4b80d..b9c07da7f2 100644
--- a/onnxruntime/core/providers/cuda/math/softmax_impl.cuh
+++ b/onnxruntime/core/providers/cuda/math/softmax_impl.cuh
@@ -23,23 +23,12 @@
 namespace onnxruntime {
 namespace cuda {
 
-constexpr int CUDA_WARP_SIZE = 32;
-
 inline int log2_ceil(int value) {
   int log2_value = 0;
   while ((1 << log2_value) < value) ++log2_value;
   return log2_value;
 }
 
-template <typename T>
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff) {
-#if CUDA_VERSION >= 9000
-  return __shfl_xor_sync(mask, value, laneMask, width);
-#else
-  return __shfl_xor(value, laneMask, width);
-#endif
-}
-
 template <typename T>
 struct Add {
   __device__ __forceinline__ T operator()(T a, T b) const {
diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
index de437d08a2..142cb3b302 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cu
@@ -99,7 +99,7 @@ __global__ void reduce_all_kernel(const int size, const TIn * data, TOut* output
   TOut value_ = value;
 #pragma unroll
   for (int stride = NUM_THREADS_PER_WARP / 2; stride > 0; stride /= 2) {
-    value_ += __shfl_down_sync(ALL_ONE_MASK, value_, stride);
+    value_ += WARP_SHFL_DOWN(value_, stride);
   }
 
   // Return early if only one warp is used for reduction.
@@ -366,10 +366,9 @@ template<typename TIn, typename TOut, typename TBuf>
 void call_reduce_matrix_rows(const TIn *input, TOut *output, int m, int n) {
   constexpr int max_num_threads_in_block = 512;
   constexpr int max_num_blocks_in_grid = 512;
-  constexpr int warp_size = 32;
   constexpr int load_count_per_thread = 4;
 
-  const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, warp_size)));
+  const int block_x_dim = least_pow2_bound(std::max(1, std::min(n, GPU_WARP_SIZE)));
   const int block_y_dim = least_pow2_bound(std::max(1, std::min(max_num_threads_in_block / block_x_dim, m / load_count_per_thread)));
   const int grid_x_dim = std::max(1, std::min(n / block_x_dim, max_num_blocks_in_grid));
   const int grid_y_dim = std::max(1, std::min(max_num_blocks_in_grid / grid_x_dim, m / block_y_dim / 4));
diff --git a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu
index c6f69990b9..0f4140c1f8 100644
--- a/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/math/softmax_grad_impl.cu
@@ -30,7 +30,7 @@ template <typename input_t, typename output_t, typename acc_t, int log2_elements
 __global__ void softmax_warp_backward(output_t* gradInput, const input_t* grad, const input_t* output, int batch_size, int stride, int element_count) {
   // WARP_SIZE and WARP_BATCH must match the return values batches_per_warp and warp_size of method warp_softmax_backward_kernel.
   constexpr int next_power_of_two = 1 << log2_elements;
-  constexpr int WARP_SIZE = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+  constexpr int WARP_SIZE = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
   constexpr int WARP_ITERATIONS = next_power_of_two / WARP_SIZE;
   constexpr int WARP_BATCH = (next_power_of_two <= 128) ? 2 : 1;
 
@@ -116,7 +116,7 @@ void dispatch_softmax_backward(output_t* grad_input, const input_t* grad, const
     const int next_power_of_two = 1 << log2_elements;
 
     // This value must match the WARP_SIZE constexpr value computed inside softmax_warp_backward.
-    int warp_size = (next_power_of_two < CUDA_WARP_SIZE) ? next_power_of_two : CUDA_WARP_SIZE;
+    int warp_size = (next_power_of_two < GPU_WARP_SIZE) ? next_power_of_two : GPU_WARP_SIZE;
 
     // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward.
     int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1;
diff --git a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu
index dc16bcc72d..e8ba8a0e6c 100644
--- a/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/nn/layer_norm_impl.cu
@@ -30,34 +30,6 @@ namespace cuda {
 
 using namespace onnxruntime::cuda;
 
-template <typename T>
-
-__device__ __forceinline__ T WARP_SHFL(T value, int srcLane, int width, unsigned int mask = 0xffffffff)
-
-{
-#if CUDA_VERSION >= 9000
-  return __shfl_sync(mask, value, srcLane, width);
-#else
-  return __shfl(value, srcLane, width);
-#endif
-}
-
-template <typename T>
-
-__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
-
-{
-#if CUDA_VERSION >= 9000
-
-  return __shfl_xor_sync(mask, value, laneMask, width);
-
-#else
-
-  return __shfl_xor(value, laneMask, width);
-
-#endif
-}
-
 template <typename U>
 __device__ void cuWelfordOnlineSum(
     const U curr,
@@ -79,8 +51,7 @@ __device__ void cuChanOnlineSum(
     const U countB,
     U& mu,
     U& sigma2,
-    U& count,
-    const int& warp_size) {
+    U& count) {
   U delta = muB - mu;
   U nA = count;
   U nB = countB;
@@ -105,10 +76,9 @@ __device__ void cuWelfordMuSigma2(
     const int i1,
     U& mu,
     U& sigma2,
-    U* buf,
-    const int warp_size) {
+    U* buf) {
   // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
   // 2) Tensor is contiguous
   // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
   //
@@ -137,10 +107,10 @@ __device__ void cuWelfordMuSigma2(
     // intra-warp reductions
     for (int l = 0; l <= 4; ++l) {
       int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      U muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      U countB = WARP_SHFL(count, srcLaneB, warp_size);
-      U sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      U muB = WARP_SHFL(mu, srcLaneB);
+      U countB = WARP_SHFL(count, srcLaneB);
+      U sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
     }
     // threadIdx.x == 0 has correct values for each warp
     // inter-warp reductions
@@ -161,7 +131,7 @@ __device__ void cuWelfordMuSigma2(
           U muB = ubuf[2 * threadIdx.y];
           U sigma2B = ubuf[2 * threadIdx.y + 1];
           U countB = ibuf[threadIdx.y];
-          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum<U>(muB, sigma2B, countB, mu, sigma2, count);
         }
         __syncthreads();
       }
@@ -175,8 +145,8 @@ __device__ void cuWelfordMuSigma2(
       sigma2 = ubuf[1] / U(n2);
       // don't care about final value of count, we know count == n2
     } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / U(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / U(n2), 0);
     }
   }
 }
@@ -189,10 +159,9 @@ __device__ void cuWelfordMuSigma2(
     const int i1,
     float& mu,
     float& sigma2,
-    float* buf,
-    const int warp_size) {
+    float* buf) {
   // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
   // 2) Tensor is contiguous
   // 3) 2*blockDim.y*sizeof(U)+blockDim.y*sizeof(int) shared memory available.
   //
@@ -232,10 +201,10 @@ __device__ void cuWelfordMuSigma2(
     // intra-warp reductions
     for (int l = 0; l <= 4; ++l) {
       int srcLaneB = (threadIdx.x + (1 << l)) & 31;
-      float muB = WARP_SHFL(mu, srcLaneB, warp_size);
-      float countB = WARP_SHFL(count, srcLaneB, warp_size);
-      float sigma2B = WARP_SHFL(sigma2, srcLaneB, warp_size);
-      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+      float muB = WARP_SHFL(mu, srcLaneB);
+      float countB = WARP_SHFL(count, srcLaneB);
+      float sigma2B = WARP_SHFL(sigma2, srcLaneB);
+      cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
     }
     // threadIdx.x == 0 has correct values for each warp
     // inter-warp reductions
@@ -256,7 +225,7 @@ __device__ void cuWelfordMuSigma2(
           float muB = ubuf[2 * threadIdx.y];
           float sigma2B = ubuf[2 * threadIdx.y + 1];
           float countB = ibuf[threadIdx.y];
-          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count, warp_size);
+          cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count);
         }
         __syncthreads();
       }
@@ -270,8 +239,8 @@ __device__ void cuWelfordMuSigma2(
       sigma2 = ubuf[1] / float(n2);
       // don't care about final value of count, we know count == n2
     } else {
-      mu = WARP_SHFL(mu, 0, warp_size);
-      sigma2 = WARP_SHFL(sigma2 / float(n2), 0, warp_size);
+      mu = WARP_SHFL(mu, 0);
+      sigma2 = WARP_SHFL(sigma2 / float(n2), 0);
     }
   }
 }
@@ -334,17 +303,16 @@ __global__ void cuApplyLayerNorm(
     const int n2,
     const U epsilon,
     const T* __restrict__ gamma,
-    const T* __restrict__ beta,
-    int warp_size) {
+    const T* __restrict__ beta) {
   // Assumptions:
-  // 1) blockDim.x == warpSize
+  // 1) blockDim.x == GPU_WARP_SIZE
   // 2) Tensors are contiguous
   //
   for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) {
     SharedMemory<U> shared;
     U* buf = shared.getPointer();
     U mu, sigma2;
-    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf, warp_size);
+    cuWelfordMuSigma2(vals, n1, n2, i1, mu, sigma2, buf);
     const T* lvals = vals + i1 * n2;
     T* ovals = output_vals + i1 * n2;
     U c_invvar = rsqrt(sigma2 + epsilon);
@@ -382,6 +350,8 @@ void HostApplyLayerNorm(
     const T* beta) {
   const uint64_t maxGridY = prop.maxGridSize[1];
   const int warp_size = prop.warpSize;
+  ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
+
   const dim3 threads(warp_size, 4, 1);
   const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1);
   int nshared =
@@ -393,7 +363,7 @@ void HostApplyLayerNorm(
       input,
       n1, n2,
       U(epsilon),
-      gamma, beta, warp_size);
+      gamma, beta);
 }
 
 #define LAYERNORM_LINEAR_IMPL(T, U)                                                                       \
@@ -727,7 +697,10 @@ void HostLayerNormGradient(
     U* part_grad_gamma,
     U* part_grad_beta,
     const int part_size) {
-  const dim3 threads2(prop.warpSize, 4, 1);
+  const int warp_size = prop.warpSize;
+  ORT_ENFORCE(warp_size == GPU_WARP_SIZE);
+
+  const dim3 threads2(warp_size, 4, 1);
   const dim3 blocks2((n2 + threads2.x - 1) / threads2.x, part_size, 1);
   const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1);
   const int nshared2_b = threads2.x * threads2.y * sizeof(U);
@@ -742,7 +715,7 @@ void HostLayerNormGradient(
       part_grad_gamma,
       part_grad_beta);
 
-  const dim3 threads3(prop.warpSize, 8, 1);
+  const dim3 threads3(warp_size, 8, 1);
   const dim3 blocks3((n2 + threads2.x - 1) / threads2.x, 1, 1);
   const int nshared3 = threads3.x * threads3.y * sizeof(U);
   cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, 0>>>(
@@ -756,7 +729,7 @@ void HostLayerNormGradient(
   // compute grad_input
   const uint64_t maxGridY = prop.maxGridSize[1];
   const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1);
-  const dim3 threads1(prop.warpSize, 4, 1);
+  const dim3 threads1(warp_size, 4, 1);
   int nshared =
       threads1.y > 1 ? threads1.y * threads1.x * sizeof(U) : 0;
   cuComputeGradInput<<<blocks1, threads1, nshared, 0>>>(
diff --git a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu
index 94832b484f..1444d67cc8 100644
--- a/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu
+++ b/orttraining/orttraining/training_ops/cuda/optimizer/lamb.cu
@@ -467,17 +467,16 @@ __global__ void LambMultiTensorReductionImpl(ChunkGroup<4> chunk_group) {
     }
   }
 
-  // Thread count in a block must be a multiple of 32.
-  constexpr int warp_size = 32;
+  // Thread count in a block must be a multiple of GPU_WARP_SIZE.
 #pragma unroll
-  for (int stride = warp_size / 2; stride > 0; stride /= 2) {
-    w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
-    d_sum += __shfl_down_sync(0xFFFFFFFF, d_sum, stride);
+  for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
+    w_sum += WARP_SHFL_DOWN(w_sum, stride);
+    d_sum += WARP_SHFL_DOWN(d_sum, stride);
   }
 
-  const int warp_count_in_block = blockDim.x / warp_size;
-  const int lid = threadIdx.x % warp_size;
-  const int wid = threadIdx.x / warp_size;
+  const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
+  const int lid = threadIdx.x % GPU_WARP_SIZE;
+  const int wid = threadIdx.x / GPU_WARP_SIZE;
 
   // Shape is 2 x warp_count_in_block.
   extern __shared__ unsigned char shared_memory_[];
@@ -511,13 +510,11 @@ template <typename TIn1, typename TIn2, typename TOut1, typename TOut2, typename
 void LambMultiTensorReductionFunctor<TIn1, TIn2, TOut1, TOut2, TBuf>::operator()(ChunkGroup<4> chunk_group) {
   // thread count per block.
   constexpr int thread_count = ChunkGroup<4>::thread_count_per_block;
-  // warp size of GPU.
-  constexpr int warp_size = 32;
   // shared memory's size per block.
-  const int shared_memory_size = thread_count / warp_size * 2 * sizeof(TBuf);
+  const int shared_memory_size = thread_count / GPU_WARP_SIZE * 2 * sizeof(TBuf);
 
   // Enforce assumptions used inside this reduction CUDA kernel.
-  ORT_ENFORCE(thread_count % warp_size == 0);
+  ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
   ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
 
   LambMultiTensorReductionImpl<TIn1, TIn2, TOut1, TOut2, TBuf><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group);
diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu
index 5509962f54..da89a00d22 100644
--- a/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu
+++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_all.cu
@@ -46,16 +46,15 @@ __global__ void _MultiTensorReduceImpl(ChunkGroup<1> chunk_group, TOut* output)
     }
   }
 
-  // Thread count in a block must be a multiple of 32.
-  constexpr int warp_size = 32;
+  // Thread count in a block must be a multiple of GPU_WARP_SIZE.
 #pragma unroll
-  for (int stride = warp_size / 2; stride > 0; stride /= 2) {
-    w_sum += __shfl_down_sync(0xFFFFFFFF, w_sum, stride);
+  for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) {
+    w_sum += WARP_SHFL_DOWN(w_sum, stride);
   }
 
-  const int warp_count_in_block = blockDim.x / warp_size;
-  const int lid = threadIdx.x % warp_size;
-  const int wid = threadIdx.x / warp_size;
+  const int warp_count_in_block = blockDim.x / GPU_WARP_SIZE;
+  const int lid = threadIdx.x % GPU_WARP_SIZE;
+  const int wid = threadIdx.x / GPU_WARP_SIZE;
 
   // Shape is 2 x warp_count_in_block.
   extern __shared__ unsigned char shared_memory_[];
@@ -84,13 +83,11 @@ template <typename TIn, typename TOut, typename TBuf, typename TInOp, typename T
 void MultiTensorReduce(ChunkGroup<1> chunk_group, TOut* output) {
   // thread count per block.
   constexpr int thread_count = ChunkGroup<1>::thread_count_per_block;
-  // warp size of GPU.
-  constexpr int warp_size = 32;
   // shared memory's size per block.
-  const int shared_memory_size = thread_count / warp_size * sizeof(TBuf);
+  const int shared_memory_size = thread_count / GPU_WARP_SIZE * sizeof(TBuf);
 
   // Enforce assumptions used inside this reduction CUDA kernel.
-  ORT_ENFORCE(thread_count % warp_size == 0);
+  ORT_ENFORCE(thread_count % GPU_WARP_SIZE == 0);
   ORT_ENFORCE((thread_count & (thread_count - 1)) == 0);
 
   _MultiTensorReduceImpl<TIn, TOut, TBuf, TInOp, TOutOp><<<chunk_group.chunk_count, thread_count, shared_memory_size>>>(chunk_group, output);
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu
index ac1f60d842..1fc15f2218 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu
+++ b/orttraining/orttraining/training_ops/cuda/tensor/gather_grad_impl.cu
@@ -11,8 +11,6 @@
 namespace onnxruntime {
 namespace cuda {
 
-static constexpr int WARP_SIZE = 32;
-
 template <typename T>
 __global__ void _Iota(
     cub::CountingInputIterator<T> input,
@@ -47,7 +45,7 @@ __global__ void _GatherGradImpl(
 
 #pragma unroll
         for (int ii = 0; ii < SZ; ii++) {
-          int feature_dim = start_feature + ii * WARP_SIZE;
+          int feature_dim = start_feature + ii * GPU_WARP_SIZE;
           if (feature_dim < stride) {
             gradient[ii] = static_cast<T>(grad_output[grad_row + feature_dim]);
             weight[ii] = static_cast<T>(grad_weight[weight_row + feature_dim]);
@@ -61,7 +59,7 @@ __global__ void _GatherGradImpl(
 
 #pragma unroll
         for (int ii = 0; ii < SZ; ii++) {
-          int feature_dim = start_feature + ii * WARP_SIZE;
+          int feature_dim = start_feature + ii * GPU_WARP_SIZE;
           if (feature_dim < stride) {
             grad_weight[weight_row + feature_dim] = static_cast<T>(weight[ii]);
           }
@@ -114,8 +112,8 @@ void GatherGradImpl(
       original_indices.get(), original_indices_sorted.get(),
       num_indices));
 
+  dim3 block(GPU_WARP_SIZE, 4);
   dim3 grid(CeilDiv(num_indices, 4), CeilDiv(stride, 128));
-  dim3 block(WARP_SIZE, 4);
 
   _GatherGradImpl<<<grid, block>>>(
       indices_data_sorted.get(),