diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
index fe797a6beb..6ea89a7e86 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@@ -3,9 +3,6 @@
 
 #include "core/providers/cpu/reduction/reduction_ops.h"
 #include "core/providers/common.h"
-#include "core/util/math_cpuonly.h"
-#include "core/providers/cpu/containers.h"
-#include "core/platform/threadpool.h"
 
 using namespace std;
 namespace onnxruntime {
@@ -151,24 +148,6 @@ REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 1, 10);
 REGISTER_UNARY_ELEMENTWISE_VERSIONED_KERNEL(ArgMin, 11, 11);
 REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 12);
 
-namespace contrib {
-#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T)                \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
-      ReduceSumTraining,                                          \
-      kMSDomain,                                                  \
-      1,                                                          \
-      T,                                                          \
-      kCpuExecutionProvider,                                      \
-      KernelDefBuilder()                                          \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
-      ReduceSumTraining<T>);
-
-REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float)
-REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
-REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
-REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
-}  // namespace contrib
-
 // When all reduce axes are located at the tail of the dims, quite general cases, transpose and extra
 // copy could be skipped to improve performance. If required by check_no_transpose = true, then
 // the calling code will check if the data was transposed and act accordingly.
@@ -184,8 +163,8 @@ bool PrepareForReduce(const Tensor* input_tensor_ptr,
                       const std::vector<int64_t>& axes_,
                       bool keepdims_,
                       /*out*/ std::vector<int64_t>& reduced_dims,
-                      bool check_no_transpose = false,
-                      const TensorShape* input_shape_override = nullptr) {
+                      bool check_no_transpose,
+                      const TensorShape* input_shape_override) {
   ORT_ENFORCE(input_tensor_ptr != nullptr, "Input to be reduced is null");
 
   if (input_shape_override) {
@@ -614,12 +593,11 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
 }
 
 template <typename T>
-static void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
-                          int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
-                          concurrency::ThreadPool* tp) {
+void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
+                   int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
+                   concurrency::ThreadPool* tp) {
   if (no_transpose) {
     auto lambda = [input_data, blocks, output_data](ptrdiff_t i) {
-
       // The ConstEigenMatrixMap type is expanded to work around a MS compiler issue
       output_data[i] = Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>>(input_data + (i * blocks), blocks).sum();
     };
@@ -668,43 +646,6 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
   return Status::OK();
 }
 
-template <typename T>
-Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
-  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
-  int64_t block_size;
-  int64_t blocks;
-  std::vector<int64_t> reduced_dims;
-  const Tensor* input = ctx->Input<Tensor>(0);
-
-  //override the attribute value with the input value for reduction_axes
-  const Tensor* axes_tensor = ctx->Input<Tensor>(1);
-  ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
-  ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
-              "An axes tensor must be a vector tensor.");
-  auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
-  const auto* data = axes_tensor->template Data<int64_t>();
-  std::vector<int64_t> axes(data, data + nDims);
-  if (axes.size() > 0) {
-    ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
-  }
-
-  // empty axes and no-op
-  if (axes.empty() && noop_with_empty_axes_) {
-    auto* output = ctx->Output(0, input->Shape());
-    memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes() * sizeof(T));
-    return Status::OK();
-  }
-
-  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
-
-  auto* output = ctx->Output(0, reduced_dims);
-
-  ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
-                no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
-
-  return Status::OK();
-}
-
 template <typename T>
 Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
   FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
@@ -870,4 +811,14 @@ template class ReduceSum<int32_t>;
 template class ReduceSum<double>;
 template class ReduceSum<int64_t>;
 
+#define REGISTER_REDUCESUMCORE_TYPED(T)                                                                         \
+  template void ReduceSumCore<T>(const T* input_data, T* output_data, bool no_transpose,                        \
+                                 int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data, \
+                                 concurrency::ThreadPool* tp);
+
+REGISTER_REDUCESUMCORE_TYPED(float)
+REGISTER_REDUCESUMCORE_TYPED(double)
+REGISTER_REDUCESUMCORE_TYPED(int32_t)
+REGISTER_REDUCESUMCORE_TYPED(int64_t)
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
index a6dbff884a..9ef1b43a40 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.h
@@ -7,9 +7,28 @@
 #include "core/common/common.h"
 #include "core/common/optional.h"
 #include "core/framework/op_kernel.h"
+#include "core/providers/cpu/containers.h"
+#include "core/util/math_cpuonly.h"
+#include "core/platform/threadpool.h"
 
 namespace onnxruntime {
 
+template <typename T>
+bool PrepareForReduce(const Tensor* input_tensor_ptr,
+                      FastAllocVector<T>& transposed_input_data,
+                      int64_t& block_size,
+                      int64_t& blocks,
+                      const std::vector<int64_t>& axes_,
+                      bool keepdims_,
+                      /*out*/ std::vector<int64_t>& reduced_dims,
+                      bool check_no_transpose = false,
+                      const TensorShape* input_shape_override = nullptr);
+
+template <typename T>
+void ReduceSumCore(const T* input_data, T* output_data, bool no_transpose,
+                   int64_t blocks, int64_t block_size, FastAllocVector<T>& transposed_input_data,
+                   concurrency::ThreadPool* tp);
+
 template <bool allow_multi_axes>
 class ReduceKernelBase {
  protected:
@@ -132,21 +151,6 @@ class ReduceSum final : public ReduceKernel<true> {
                      const TensorShape* input_shape_override = nullptr);
 };
 
-template <typename T>
-class ReduceSumTraining final : public ReduceKernel<true> {
- public:
-  ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
-  }
-
-  Status Compute(OpKernelContext* context) const override;
-
-  // For external calls requiring ReduceSumTraining implementation - will return the reduced output.
-  //`input_shape_override` overrides the shape of `input` for compute purposes.
-  static Tensor Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
-                     AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
-                     const TensorShape* input_shape_override = nullptr);
-};
-
 template <typename T>
 class ReduceSumSquare final : public ReduceKernel<true> {
  public:
diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
index 592dc1ed51..4723246fcf 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
@@ -60,56 +60,6 @@ namespace cuda {
       KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
       name<T>);
 
-#define REGISTER_MS_KERNEL_TYPED(name, T)                         \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
-      name,                                                       \
-      kMSDomain,                                                  \
-      1,                                                          \
-      T,                                                          \
-      kCudaExecutionProvider,                                     \
-      KernelDefBuilder()                                          \
-          .InputMemoryType<OrtMemTypeCPUInput>(1)                 \
-          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
-      name<T>);
-
-// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so
-// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor.
-// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t
-// is implemented below, so CUDA can seamlessly work.
-class CudnnReduceDescriptor final {
- public:
-  CudnnReduceDescriptor() : desc_(nullptr) {
-  }
-
-  ~CudnnReduceDescriptor() {
-    if (desc_ != nullptr) {
-      cudnnDestroyReduceTensorDescriptor(desc_);
-      desc_ = nullptr;
-    }
-  }
-
-  CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete;
-  CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete;
-
-  Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) {
-    if (!desc_)
-      CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_));
-
-    CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor(
-        desc_,
-        op,
-        type,
-        CUDNN_PROPAGATE_NAN,
-        indices,
-        CUDNN_32BIT_INDICES));  // currently only the 32-bit (unsigned int) type is supported.
-    return Status::OK();
-  }
-
-  operator cudnnReduceTensorDescriptor_t() const { return desc_; }
-
- private:
-  cudnnReduceTensorDescriptor_t desc_;
-};
 
 // TODO ReduceKernel::ReduceKernelShared() is still used by some other training classes though it's not used here - this should be refactored.
 template <bool allow_multi_axes>
@@ -314,11 +264,11 @@ template Status ReduceKernel<true>::ReduceKernelShared<MLFloat16, MLFloat16, CUD
     std::vector<int64_t>& output_dims) const;
 
 // `input_shape_override` (if provided) is the input shape for compute purposes
-static Status PrepareForReduce(const Tensor* X,
-                               bool keepdims,
-                               const std::vector<int64_t>& axes,
-                               PrepareReduceMetadata& prepare_reduce_metadata,
-                               const TensorShape* input_shape_override = nullptr) {
+Status PrepareForReduce(const Tensor* X,
+                        bool keepdims,
+                        const std::vector<int64_t>& axes,
+                        PrepareReduceMetadata& prepare_reduce_metadata,
+                        const TensorShape* input_shape_override) {
   ORT_ENFORCE(nullptr != X);
 
   const TensorShape& input_shape = input_shape_override ? *input_shape_override : X->Shape();
@@ -391,11 +341,11 @@ static Status PrepareForReduce(const Tensor* X,
 
 // `input_shape_override` is the input shape for compute purposes (if provided)
 template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
-static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
-                                /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
-                                const std::vector<int64_t>& axes,
-                                bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
-                                const TensorShape* input_shape_override = nullptr) {
+Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
+                         /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
+                         const std::vector<int64_t>& axes,
+                         bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
+                         const TensorShape* input_shape_override) {
   typedef typename ToCudaType<T>::MappedType CudaT;
   const TensorShape& input_shape = input_shape_override ? *input_shape_override : input.Shape();
 
@@ -610,33 +560,10 @@ template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
 Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
   const Tensor* X = ctx->Input<Tensor>(0);
 
-  const std::string& op_name = this->KernelDef().OpName();
-  std::vector<int64_t> axes_values = axes_;
-  if (op_name == "ReduceSumTraining") {
-    //override the attribute value with the input value for reduction_axes
-    const Tensor* axes_tensor = ctx->Input<Tensor>(1);
-    ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
-    ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
-    auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
-    const auto* data = axes_tensor->template Data<int64_t>();
-    std::vector<int64_t> axes(data, data + nDims);
-    axes_values = axes;
-    if (axes.size() > 0) {
-      ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
-    }
-
-    // empty axes and no-op
-    if (axes.empty() && noop_with_empty_axes_) {
-      auto* Y = ctx->Output(0, X->Shape());
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes() * sizeof(T), cudaMemcpyDeviceToDevice));
-      return Status::OK();
-    }
-  }
-
   PrepareReduceMetadata prepare_reduce_metadata;
   ORT_RETURN_IF_ERROR(PrepareForReduce(X,
                                        keepdims_,
-                                       axes_values,
+                                       axes_,
                                        prepare_reduce_metadata));
   Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
   bool fast_reduction = fast_reduction_;
@@ -646,7 +573,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
       fast_reduction = false;
   }
 
-  return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_values,
+  return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_,
                                                    calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
 }
 
@@ -657,33 +584,11 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
 
   const Tensor* X = ctx->Input<Tensor>(0);
 
-  const std::string& op_name = this->KernelDef().OpName();
-  std::vector<int64_t> axes_values = axes_;
-  if (op_name == "ReduceSumTraining") {
-    //override the attribute value with the input value for reduction_axes
-    const Tensor* axes_tensor = ctx->Input<Tensor>(1);
-    ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
-    auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
-    const auto* data = axes_tensor->template Data<int64_t>();
-    std::vector<int64_t> axes(data, data + nDims);
-    axes_values = axes;
-    if (axes.size() > 0) {
-      ORT_ENFORCE(noop_with_empty_axes_ == false, "Noop when axes is not empty is not allowed.");
-    }
-
-    // empty axes and no-op
-    if (axes.empty() && noop_with_empty_axes_) {
-      auto* Y = ctx->Output(0, X->Shape());
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes() * sizeof(int32_t), cudaMemcpyDeviceToDevice));
-      return Status::OK();
-    }
-  }
-
   PrepareReduceMetadata prepare_reduce_metadata;
 
   ORT_RETURN_IF_ERROR(PrepareForReduce(X,
                                        keepdims_,
-                                       axes_values,
+                                       axes_,
                                        prepare_reduce_metadata));
 
   Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
@@ -982,11 +887,6 @@ REGISTER_KERNEL_TYPED_12(ReduceMin, int32_t)
 REGISTER_KERNEL_TYPED_12(ReduceMin, int8_t)
 REGISTER_KERNEL_TYPED_12(ReduceMin, uint8_t)
 
-REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16)
-REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float)
-REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double)
-REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t)
-
 REGISTER_KERNEL_HFD(ReduceProd)
 REGISTER_KERNEL_HFD(ReduceSum)
 REGISTER_KERNEL_HFD(ReduceLogSum)
diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h
index 8cc8111f25..8f3ddf80c8 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.h
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.h
@@ -59,6 +59,10 @@ class ReduceKernel : public CudaKernel, public ReduceKernelBase<allow_multi_axes
   template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices = CUDNN_REDUCE_TENSOR_NO_INDICES>
   Status ComputeImpl(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const;
 
+  // Used by ReduceSumTraining which will have axes as input
+  template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices = CUDNN_REDUCE_TENSOR_NO_INDICES>
+  Status ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const;
+
   template <typename T, typename OutT, cudnnReduceTensorIndices_t ReduceTensorIndices>
   Status ReduceKernelShared(
       const T* X,
@@ -175,18 +179,6 @@ class ReduceSum final : public ReduceKernel<true> {
   }
 };
 
-template <typename T>
-class ReduceSumTraining final : public ReduceKernel<true> {
- public:
-  ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
-    fast_reduction_ = true;
-  }
-
-  Status ComputeInternal(OpKernelContext* ctx) const override {
-    return ComputeImpl<T>(ctx, CUDNN_REDUCE_TENSOR_ADD);
-  }
-};
-
 template <typename T>
 class ReduceLogSum final : public ReduceKernel<true> {
  public:
@@ -223,5 +215,57 @@ class ReduceLogSumExp final : public ReduceKernel<true> {
   }
 };
 
+Status PrepareForReduce(const Tensor* X,
+                        bool keepdims,
+                        const std::vector<int64_t>& axes,
+                        PrepareReduceMetadata& prepare_reduce_metadata,
+                        const TensorShape* input_shape_override = nullptr);
+
+template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
+Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
+                         /*out*/ Tensor& output, cudnnReduceTensorOp_t cudnn_reduce_op,
+                         const std::vector<int64_t>& axes,
+                         bool calculate_log, bool calculate_sqt, bool log_sum_exp, bool fast_reduction,
+                         const TensorShape* input_shape_override = nullptr);
+
+// CUDA's reduction descriptor cudnnReduceTensorDescriptor_t is a pointer so
+// it's safer to wrap it with automatically memory deleter as CudnnReduceDescriptor.
+// An implicit caster from CudnnReduceDescriptor to cudnnReduceTensorDescriptor_t
+// is implemented below, so CUDA can seamlessly work.
+class CudnnReduceDescriptor final {
+ public:
+  CudnnReduceDescriptor() : desc_(nullptr) {
+  }
+
+  ~CudnnReduceDescriptor() {
+    if (desc_ != nullptr) {
+      cudnnDestroyReduceTensorDescriptor(desc_);
+      desc_ = nullptr;
+    }
+  }
+
+  CudnnReduceDescriptor(const CudnnReduceDescriptor&) = delete;
+  CudnnReduceDescriptor& operator=(const CudnnReduceDescriptor&) = delete;
+
+  Status Set(cudnnReduceTensorOp_t op, cudnnDataType_t type, cudnnReduceTensorIndices_t indices) {
+    if (!desc_)
+      CUDNN_RETURN_IF_ERROR(cudnnCreateReduceTensorDescriptor(&desc_));
+
+    CUDNN_RETURN_IF_ERROR(cudnnSetReduceTensorDescriptor(
+        desc_,
+        op,
+        type,
+        CUDNN_PROPAGATE_NAN,
+        indices,
+        CUDNN_32BIT_INDICES));  // currently only the 32-bit (unsigned int) type is supported.
+    return Status::OK();
+  }
+
+  operator cudnnReduceTensorDescriptor_t() const { return desc_; }
+
+ private:
+  cudnnReduceTensorDescriptor_t desc_;
+};
+
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc
new file mode 100644
index 0000000000..0732a43441
--- /dev/null
+++ b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.cc
@@ -0,0 +1,66 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "orttraining/training_ops/cpu/reduction/reduction_ops.h"
+#include "core/providers/common.h"
+#include "core/util/math_cpuonly.h"
+#include "core/providers/cpu/containers.h"
+#include "core/platform/threadpool.h"
+
+using namespace std;
+namespace onnxruntime {
+namespace contrib {
+
+#define REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(T)                \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
+      ReduceSumTraining,                                          \
+      kMSDomain,                                                  \
+      1,                                                          \
+      T,                                                          \
+      kCpuExecutionProvider,                                      \
+      KernelDefBuilder()                                          \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      ReduceSumTraining<T>);
+
+REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(float)
+REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(double)
+REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int32_t)
+REGISTER_REDUCESUMTRAINING_KERNEL_TYPED(int64_t)
+
+
+template <typename T>
+Status ReduceSumTraining<T>::Compute(OpKernelContext* ctx) const {
+  FastAllocVector<T> transposed_input_data(GetAllocator<T>(*ctx));
+  int64_t block_size;
+  int64_t blocks;
+  std::vector<int64_t> reduced_dims;
+  const Tensor* input = ctx->Input<Tensor>(0);
+
+  //override the attribute value with the input value for reduction_axes
+  const Tensor* axes_tensor = ctx->Input<Tensor>(1);
+  ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
+  ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1,
+              "An axes tensor must be a vector tensor.");
+  auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
+  const auto* data = axes_tensor->template Data<int64_t>();
+  std::vector<int64_t> axes(data, data + nDims);
+
+  // empty axes and no-op
+  if (axes.empty() && noop_with_empty_axes_) {
+    auto* output = ctx->Output(0, input->Shape());
+    memcpy(output->template MutableData<T>(), input->template Data<T>(), input->SizeInBytes());
+    return Status::OK();
+  }
+
+  bool no_transpose = PrepareForReduce<T>(input, transposed_input_data, block_size, blocks, axes, keepdims_, reduced_dims, true);
+
+  auto* output = ctx->Output(0, reduced_dims);
+
+  ReduceSumCore(input->template Data<T>(), output->template MutableData<T>(),
+                no_transpose, blocks, block_size, transposed_input_data, ctx->GetOperatorThreadPool());
+
+  return Status::OK();
+}
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.h b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.h
new file mode 100644
index 0000000000..2ec372c718
--- /dev/null
+++ b/orttraining/orttraining/training_ops/cpu/reduction/reduction_ops.h
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/providers/cpu/reduction/reduction_ops.h"
+#include "core/common/common.h"
+#include "core/common/optional.h"
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+template <typename T>
+class ReduceSumTraining final : public ReduceKernel<true> {
+ public:
+  ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
+  }
+
+  Status Compute(OpKernelContext* context) const override;
+
+  // For external calls requiring ReduceSumTraining implementation - will return the reduced output.
+  //`input_shape_override` overrides the shape of `input` for compute purposes.
+  static Tensor Impl(const Tensor& input, const std::vector<int64_t>& reduce_axes,
+                     AllocatorPtr allocator, concurrency::ThreadPool* tp, bool keep_dims,
+                     const TensorShape* input_shape_override = nullptr);
+};
+
+}  // namespace contrib
+}  // namespace onnxruntime
+
diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc
new file mode 100644
index 0000000000..f07a4fc982
--- /dev/null
+++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.cc
@@ -0,0 +1,168 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "orttraining/training_ops/cuda/reduction/reduction_ops.h"
+#include "core/providers/common.h"
+#include "core/providers/cuda/cudnn_common.h"
+#include "core/providers/cuda/math/unary_elementwise_ops_impl.h"
+#include "core/providers/cuda/math/binary_elementwise_ops_impl.h"
+#include "core/providers/cuda/math/binary_elementwise_ops.h"
+#include "core/providers/cpu/tensor/utils.h"
+#include "core/framework/op_kernel_context_internal.h"
+
+using namespace onnxruntime::common;
+namespace onnxruntime {
+namespace cuda {
+
+#define REGISTER_MS_KERNEL_TYPED(name, T)                         \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                  \
+      name,                                                       \
+      kMSDomain,                                                  \
+      1,                                                          \
+      T,                                                          \
+      kCudaExecutionProvider,                                     \
+      KernelDefBuilder()                                          \
+          .InputMemoryType<OrtMemTypeCPUInput>(1)                 \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>()), \
+      name<T>);
+
+REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, MLFloat16)
+REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, float)
+REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, double)
+REGISTER_MS_KERNEL_TYPED(ReduceSumTraining, int32_t)
+
+template <bool allow_multi_axes>
+template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
+Status ReduceKernel<allow_multi_axes>::ComputeImplEx(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
+  const Tensor* X = ctx->Input<Tensor>(0);
+
+  //override the attribute value with the input value for reduction_axes
+  const Tensor* axes_tensor = ctx->Input<Tensor>(1);
+  ORT_ENFORCE(axes_tensor != nullptr, "Axes input is null");
+  ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
+  auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
+  const auto* data = axes_tensor->template Data<int64_t>();
+  std::vector<int64_t> axes(data, data + nDims);
+
+  // empty axes and no-op
+  if (axes.empty() && noop_with_empty_axes_) {
+    auto* Y = ctx->Output(0, X->Shape());
+    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<T>(), X->template Data<T>(), X->SizeInBytes(), cudaMemcpyDeviceToDevice));
+    return Status::OK();
+  }
+
+
+  PrepareReduceMetadata prepare_reduce_metadata;
+  ORT_RETURN_IF_ERROR(PrepareForReduce(X,
+                                       keepdims_,
+                                       axes,
+                                       prepare_reduce_metadata));
+  Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
+  bool fast_reduction = fast_reduction_;
+  if (fast_reduction) {
+    auto ctx_internal = static_cast<OpKernelContextInternal*>(ctx);
+    if (ctx_internal && ctx_internal->GetUseDeterministicCompute())
+      fast_reduction = false;
+  }
+
+  return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes,
+                                                   calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
+}
+
+template <>
+template <>
+Status ReduceKernel<true>::ComputeImplEx<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(OpKernelContext* ctx, cudnnReduceTensorOp_t cudnn_reduce_op) const {
+  typedef typename ToCudaType<int32_t>::MappedType CudaT;
+
+  const Tensor* X = ctx->Input<Tensor>(0);
+
+  //override the attribute value with the input value for reduction_axes
+  const Tensor* axes_tensor = ctx->Input<Tensor>(1);
+  ORT_ENFORCE(axes_tensor->Shape().NumDimensions() == 1, "An axes tensor must be a vector tensor.");
+  auto nDims = static_cast<size_t>(axes_tensor->Shape()[0]);
+  const auto* data = axes_tensor->template Data<int64_t>();
+  std::vector<int64_t> axes(data, data + nDims);
+
+  // empty axes and no-op
+  if (axes.empty() && noop_with_empty_axes_) {
+    auto* Y = ctx->Output(0, X->Shape());
+    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), X->SizeInBytes(), cudaMemcpyDeviceToDevice));
+    return Status::OK();
+  }
+  
+
+
+  PrepareReduceMetadata prepare_reduce_metadata;
+
+  ORT_RETURN_IF_ERROR(PrepareForReduce(X,
+                                       keepdims_,
+                                       axes,
+                                       prepare_reduce_metadata));
+
+  Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
+
+  int64_t input_count = prepare_reduce_metadata.input_count;
+  int64_t output_count = prepare_reduce_metadata.output_count;
+  std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
+  std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
+
+  // special case when there is a dim value of 0 in the shape.
+  if (input_count == 0) {
+    assert(Y->Shape().Size() == 0);
+    return Status::OK();
+  }
+
+  // cudnnReduceTensor for ReduceSum has issue if input and output has same size, we just need to copy the data for this case
+  if (input_count == output_count) {
+    if (Y->template MutableData<int32_t>() != X->template Data<int32_t>()) {
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(Y->template MutableData<int32_t>(), X->template Data<int32_t>(), input_count * sizeof(int32_t), cudaMemcpyDeviceToDevice));
+    }
+    return Status::OK();
+  }
+
+  // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
+  // Therefore zeroing out the memory is required
+  CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
+
+  size_t indices_bytes = 0;
+  size_t workspace_bytes = 0;
+  CudnnTensor input_tensor;
+  CudnnTensor output_tensor;
+  CudnnReduceDescriptor reduce_desc;
+
+  cudnnDataType_t cudnn_type_X = CUDNN_DATA_FLOAT;
+  IAllocatorUniquePtr<float> temp_X = GetScratchBuffer<float>(input_count);
+  Impl_Cast<CudaT, float>(reinterpret_cast<const CudaT*>(X->template Data<int32_t>()), temp_X.get(), X->Shape().Size());
+
+  ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, CUDNN_REDUCE_TENSOR_FLATTENED_INDICES));
+  ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_cudnn, cudnn_type_X));
+  ORT_RETURN_IF_ERROR(output_tensor.Set(output_dims_cudnn, cudnn_type_X));
+  CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &indices_bytes));
+  CUDNN_RETURN_IF_ERROR(cudnnGetReductionWorkspaceSize(CudnnHandle(), reduce_desc, input_tensor, output_tensor, &workspace_bytes));
+  IAllocatorUniquePtr<uint32_t> indices_cuda = GetScratchBuffer<uint32_t>(indices_bytes);
+  IAllocatorUniquePtr<CudaT> workspace_cuda = GetScratchBuffer<CudaT>(workspace_bytes);
+
+  const auto one = Consts<float>::One;
+  const auto zero = Consts<float>::Zero;
+  auto temp_Y = GetScratchBuffer<float>(output_count);
+  CUDNN_RETURN_IF_ERROR(cudnnReduceTensor(CudnnHandle(),
+                                          reduce_desc,
+                                          indices_cuda.get(),
+                                          indices_bytes,
+                                          workspace_cuda.get(),
+                                          workspace_bytes,
+                                          &one,
+                                          input_tensor,
+                                          temp_X.get(),
+                                          &zero,
+                                          output_tensor,
+                                          temp_Y.get()));
+
+  Impl_Cast<float, int32_t>(temp_Y.get(), Y->template MutableData<int32_t>(), output_count);
+
+  return Status::OK();
+}
+
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.h b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.h
new file mode 100644
index 0000000000..d1da42e625
--- /dev/null
+++ b/orttraining/orttraining/training_ops/cuda/reduction/reduction_ops.h
@@ -0,0 +1,26 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+#include "core/common/optional.h"
+#include "core/providers/cuda/cuda_common.h"
+#include "core/providers/cuda/reduction/reduction_ops.h"
+#include "core/providers/cuda/reduction/reduction_functions.h"
+
+namespace onnxruntime {
+namespace cuda {
+
+template <typename T>
+class ReduceSumTraining final : public ReduceKernel<true> {
+ public:
+  ReduceSumTraining(const OpKernelInfo& info) : ReduceKernel<true>(info) {
+    fast_reduction_ = true;
+  }
+
+  Status ComputeInternal(OpKernelContext* ctx) const override {
+    return ComputeImplEx<T>(ctx, CUDNN_REDUCE_TENSOR_ADD);
+  }
+};
+
+}  // namespace cuda
+}  // namespace onnxruntime
diff --git a/winml/test/common/protobufHelpers.cpp b/winml/test/common/protobufHelpers.cpp
index 3838cfa78e..f84c9042c7 100644
--- a/winml/test/common/protobufHelpers.cpp
+++ b/winml/test/common/protobufHelpers.cpp
@@ -80,6 +80,16 @@ std::vector<int64_t> GetTypeSpecificDataFromTensorProto(
     onnx::TensorProto tensorProto) {
   return std::vector<int64_t>(std::begin(tensorProto.int64_data()), std::end(tensorProto.int64_data()));
 }
+template <>
+std::vector<uint8_t> GetTypeSpecificDataFromTensorProto(
+    onnx::TensorProto tensorProto) {
+  return std::vector<uint8_t>(std::begin(tensorProto.int32_data()), std::end(tensorProto.int32_data()));
+}
+template <>
+std::vector<double> GetTypeSpecificDataFromTensorProto(
+    onnx::TensorProto tensorProto) {
+  return std::vector<double>(std::begin(tensorProto.double_data()), std::end(tensorProto.double_data()));
+}
 
 template <typename DataType>
 std::vector<DataType> GetTensorDataFromTensorProto(
@@ -139,6 +149,10 @@ ITensor ProtobufHelpers::LoadTensorFromProtobufFile(
         return TensorInt64Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<int64_t>(tensorProto, elementCount));
       case (onnx::TensorProto::DataType::TensorProto_DataType_STRING):
         return TensorString::CreateFromIterable(tensorShape, GetTensorStringDataFromTensorProto(tensorProto, elementCount));
+      case (onnx::TensorProto::DataType::TensorProto_DataType_UINT8):
+        return TensorUInt8Bit::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<uint8_t>(tensorProto, elementCount));
+      case (onnx::TensorProto::DataType::TensorProto_DataType_DOUBLE):
+        return TensorDouble::CreateFromIterable(tensorShape, GetTensorDataFromTensorProto<double>(tensorProto, elementCount));
       default:
         throw winrt::hresult_invalid_argument(L"Tensor type for creating tensor from protobuf file not supported.");
         break;