diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc
index cb58191c1d..6889fcfbb1 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_functions.cc
@@ -4,6 +4,8 @@
 #include "core/providers/cuda/reduction/reduction_functions.h"
 
 #include <algorithm>
+#include <cassert>
+#include <iterator>
 #include <utility>
 
 #include "core/common/optional.h"
@@ -13,6 +15,88 @@
 namespace onnxruntime {
 namespace cuda {
 
+namespace {
+// std::make_reverse_iterator is not implemented in older versions of GCC
+#if !defined(__GNUC__) || __GNUC__ >= 5
+using std::make_reverse_iterator;
+#else
+template <typename It>
+std::reverse_iterator<It> make_reverse_iterator(It it) {
+  return std::reverse_iterator<It>(it);
+}
+#endif
+
+// gets min and max of single contiguous range of axes if available
+optional<std::pair<int64_t, int64_t>> GetMinAndMaxContiguousAxes(
+    int64_t rank,
+    const std::vector<int64_t>& dims,
+    const std::vector<int64_t>& original_axes) {
+  assert(rank == static_cast<int64_t>(dims.size()));
+
+  // empty axes means reduce all dimensions
+  if (original_axes.empty()) {
+    return std::make_pair(int64_t{0}, rank - 1);
+  }
+
+  // normalize axis values and sort
+  const std::vector<int64_t> axes = [&original_axes, rank]() {
+    std::vector<int64_t> result(original_axes);
+    std::for_each(
+        result.begin(), result.end(),
+        [rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
+    std::sort(result.begin(), result.end());
+    return result;
+  }();
+
+  assert(!axes.empty());
+
+  const auto is_dim_one = [](int64_t dim) { return dim == 1; };
+
+  for (auto a = axes.begin(), b = axes.begin() + 1;
+       b != axes.end();
+       ++a, ++b) {
+    ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
+
+    // if axis values are adjacent, the axes are contiguous
+    if (*a + 1 == *b) {
+      continue;
+    }
+
+    // if all dimension values between adjacent axes are 1,
+    // treat the axes as contiguous
+    if (std::all_of(dims.begin() + *a + 1, dims.begin() + *b, is_dim_one)) {
+      continue;
+    }
+
+    // otherwise, not contiguous
+    return nullopt;
+  }
+
+  // expand axes over surrounding dimensions with value of 1
+  const int64_t min_axis = [&dims, &axes, &is_dim_one]() {
+    const auto& min_given_axis = axes.front();
+    // note that std::reverse_iterator(it) refers to the element at (it-1)
+    // it -> reverse it: element offset of -1
+    const auto before_min_given_axis_rit =
+        make_reverse_iterator(dims.begin() + min_given_axis);
+    const auto before_min_axis_rit =
+        std::find_if_not(before_min_given_axis_rit, dims.rend(), is_dim_one);
+    // reverse it -> it: element offset of +1
+    return std::distance(dims.begin(), before_min_axis_rit.base());
+  }();
+
+  const int64_t max_axis = [&dims, &axes, &is_dim_one]() {
+    const auto& max_given_axis = axes.back();
+    const auto after_max_given_axis_it = dims.begin() + max_given_axis + 1;
+    const auto after_max_axis_it =
+        std::find_if_not(after_max_given_axis_it, dims.end(), is_dim_one);
+    return std::distance(dims.begin(), after_max_axis_it - 1);
+  }();
+
+  return std::make_pair(min_axis, max_axis);
+}
+}  // namespace
+
 ApplicableMatrixReduction get_applicable_matrix_reduction(
     const cudnnReduceTensorOp_t cudnn_reduce_op,
     const std::vector<int64_t>& dims, const std::vector<int64_t>& original_axes,
@@ -22,53 +106,25 @@ ApplicableMatrixReduction get_applicable_matrix_reduction(
   }
 
   const auto rank = gsl::narrow<int64_t>(dims.size());
-
-  // min and max of single contiguous range of axes
-  const auto minmax_axes = [&]() -> optional<std::pair<int64_t, int64_t>> {
-    // empty axes means reduce all dimensions
-    if (original_axes.empty()) {
-      return std::make_pair(int64_t{0}, rank - 1);
-    }
-
-    // normalize axis values and sort
-    const std::vector<int64_t> axes = [&original_axes, rank]() {
-      std::vector<int64_t> result(original_axes);
-      std::for_each(
-          result.begin(), result.end(),
-          [rank](int64_t& axis) { axis = HandleNegativeAxis(axis, rank); });
-      std::sort(result.begin(), result.end());
-      return result;
-    }();
-
-    for (auto a = axes.begin(), b = axes.begin() + 1;
-         b != axes.end();
-         ++a, ++b) {
-      ORT_ENFORCE(*a != *b, "axes must not contain duplicate values");
-      if (*a + 1 != *b) {  // not contiguous
-        return {};
-      }
-    }
-
-    return std::make_pair(axes.front(), axes.back());
-  }();
-
-  if (!minmax_axes.has_value()) {
+  const auto min_and_max_axes = GetMinAndMaxContiguousAxes(rank, dims, original_axes);
+  if (!min_and_max_axes.has_value()) {
     return ApplicableMatrixReduction::None;
   }
 
-  const auto& min_axis = minmax_axes.value().first;
-  const auto& max_axis = minmax_axes.value().second;
+  const auto& min_axis = min_and_max_axes.value().first;
+  const auto& max_axis = min_and_max_axes.value().second;
 
   // axes from beginning means row reduction, axes to end means column reduction
-  // currently we don't handle axes from beginning to end, but that could be either
+  // for axes from beginning to end, either works and we do row reduction
   const bool axes_from_beginning = min_axis == 0;
   const bool axes_to_end = max_axis == rank - 1;
 
-  // handle axes anchored to one of beginning or end, not both
-  if (axes_from_beginning == axes_to_end) {
+  // handle axes anchored to beginning or end
+  if (!axes_from_beginning && !axes_to_end) {
     return ApplicableMatrixReduction::None;
   }
 
+  // the axis index right after the last flattened into matrix rows
   const int64_t m_end_axis = axes_from_beginning ? max_axis + 1 : min_axis;
 
   const TensorShape& shape = TensorShape::ReinterpretBaseType(dims);
diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
index d85e414b5d..8f5b1bb45a 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
@@ -663,7 +663,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
   Tensor* Y = ctx->Output(0, prepare_reduce_metadata.squeezed_output_dims);
   const bool fast_reduction = fast_reduction_ && !ctx->GetUseDeterministicCompute();
 
-  return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes_,
+  return ReduceComputeCore<T, ReduceTensorIndices>(*cuda_ep_, *X, prepare_reduce_metadata, *Y, cudnn_reduce_op, axes,
                                                    calculate_log_, calculate_sqt_, log_sum_exp_, fast_reduction);
 }
 
diff --git a/onnxruntime/test/providers/cuda/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/reduction_functions_test.cc
index f504cf1161..e7aebfe1b3 100644
--- a/onnxruntime/test/providers/cuda/reduction_functions_test.cc
+++ b/onnxruntime/test/providers/cuda/reduction_functions_test.cc
@@ -7,11 +7,15 @@
 
 #include "gtest/gtest.h"
 
+#include "core/common/optional.h"
 #include "core/providers/cuda/reduction/reduction_functions.h"
 #include "test/common/tensor_op_test_utils.h"
 #include "test/util/include/asserts.h"
 
+using onnxruntime::test::RandomValueGenerator;
+
 namespace onnxruntime {
+namespace cuda {
 namespace test {
 
 namespace {
@@ -55,7 +59,7 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f)
     expected_output_mean += input_value / float(size);
   }
   const auto buffer_size_in_bytes =
-      cuda::compute_reduction_buffer_size<float>(size);
+      compute_reduction_buffer_size<float>(size);
 
   auto device_input = AllocateDeviceMemory<float>(size);
   auto device_output_sum = AllocateDeviceMemory<float>();
@@ -65,19 +69,19 @@ void TestReduceRowToScalarApis(int size, float relative_error_tolerance = 1e-4f)
 
   cudaMemcpy(device_input.get(), input.data(), size * sizeof(float), cudaMemcpyHostToDevice);
 
-  ASSERT_STATUS_OK(cuda::reduce_sum(
+  ASSERT_STATUS_OK(reduce_sum(
       device_input.get(),
       device_output_sum.get(),
       size,
       buffer.get(),
       buffer_size_in_bytes));
-  ASSERT_STATUS_OK(cuda::reduce_square_sum(
+  ASSERT_STATUS_OK(reduce_square_sum(
       device_input.get(),
       device_output_square_sum.get(),
       size,
       buffer.get(),
       buffer_size_in_bytes));
-  ASSERT_STATUS_OK(cuda::reduce_mean(
+  ASSERT_STATUS_OK(reduce_mean(
       device_input.get(),
       device_output_mean.get(),
       size,
@@ -116,10 +120,10 @@ void TestReduceRowsToRow(int m, int n, bool reset_initial_output, float relative
 
   if (!reset_initial_output) {
     // manually initialize output data
-    cuda::Fill(d_out.get(), initial_value, n);
+    Fill(d_out.get(), initial_value, n);
   }
 
-  ASSERT_STATUS_OK(cuda::reduce_matrix_rows(
+  ASSERT_STATUS_OK(reduce_matrix_rows(
       d_in.get(), d_out.get(),
       m, n,
       reset_initial_output));
@@ -155,10 +159,10 @@ void TestReduceColumnsToColumn(int m, int n, float relative_error_tolerance = 1e
   cudaMemcpy(d_in.get(), values.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
 
   size_t buffer_size_in_bytes =
-      cuda::compute_reduce_matrix_columns_buffer_size<float>(m, n);
+      compute_reduce_matrix_columns_buffer_size<float>(m, n);
   auto d_buffer = AllocateDeviceMemory<char>(buffer_size_in_bytes);
 
-  ASSERT_STATUS_OK(cuda::reduce_matrix_columns(
+  ASSERT_STATUS_OK(reduce_matrix_columns(
       d_in.get(), d_out.get(),
       m, n,
       d_buffer.get(), buffer_size_in_bytes));
@@ -202,7 +206,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
   const size_t max_buffer_offset = 15;
 
   const size_t buffer_size_in_bytes =
-      cuda::compute_reduce_matrix_columns_buffer_size<double>(m, n) + max_buffer_offset;
+      compute_reduce_matrix_columns_buffer_size<double>(m, n) + max_buffer_offset;
 
   auto d_input = AllocateDeviceMemory<double>(m * n);
   auto d_output = AllocateDeviceMemory<double>(m);
@@ -217,7 +221,7 @@ TEST(ReductionFunctionsTest, BufferOffsets) {
     const auto input = random.Uniform<double>({m, n}, 1.0, 10.0);
     cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(double), cudaMemcpyHostToDevice);
 
-    ASSERT_STATUS_OK(cuda::reduce_matrix_columns(
+    ASSERT_STATUS_OK(reduce_matrix_columns(
         d_input.get(), d_output.get(),
         m, n,
         d_buffer.get() + buffer_offset,
@@ -234,7 +238,7 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
 
   // this should be too small
   const size_t buffer_size_in_bytes =
-      cuda::compute_reduce_matrix_columns_buffer_size<float>(m, n) / 10;
+      compute_reduce_matrix_columns_buffer_size<float>(m, n) / 10;
 
   auto d_input = AllocateDeviceMemory<float>(m * n);
   auto d_output = AllocateDeviceMemory<float>(m);
@@ -245,64 +249,108 @@ TEST(ReductionFunctionsTest, InvalidBufferSize) {
   cudaMemcpy(d_input.get(), input.data(), m * n * sizeof(float), cudaMemcpyHostToDevice);
 
   const auto status =
-      cuda::reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes);
+      reduce_matrix_columns(d_input.get(), d_output.get(), m, n, d_buffer.get(), buffer_size_in_bytes);
   ASSERT_FALSE(status.IsOK());
 }
 
 TEST(ReductionFunctionsTest, GetApplicableMatrixReduction) {
+  auto test_get_applicable_matrix_reduction =
+      [](cudnnReduceTensorOp_t cudnn_op,
+         const std::vector<int64_t>& dims, const std::vector<int64_t>& axes,
+         ApplicableMatrixReduction expected_reduction,
+         const optional<int>& expected_m = nullopt,
+         const optional<int>& expected_n = nullopt) {
+        SCOPED_TRACE(MakeString(
+            "cudnn_op: ", cudnn_op,
+            ", dims: ", TensorShape::ReinterpretBaseType(dims),
+            ", axes: ", TensorShape::ReinterpretBaseType(axes)));
+        int m{}, n{};
+        EXPECT_EQ(
+            static_cast<int>(get_applicable_matrix_reduction(cudnn_op, dims, axes, m, n)),
+            static_cast<int>(expected_reduction));
+        if (expected_m) {
+          EXPECT_EQ(m, *expected_m);
+        }
+        if (expected_n) {
+          EXPECT_EQ(n, *expected_n);
+        }
+      };
+
   const cudnnReduceTensorOp_t valid_op_type = CUDNN_REDUCE_TENSOR_ADD;
-  int m{}, n{};
 
   // contiguous axes from beginning
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16}, {0, 1}, m, n),
-      cuda::ApplicableMatrixReduction::Rows);
-  EXPECT_EQ(m, 2 * 4);
-  EXPECT_EQ(n, 8 * 16);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16}, {0, 1},
+      ApplicableMatrixReduction::Rows, 2 * 4, 8 * 16);
 
   // contiguous axes to end
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16}, {1, 2, 3}, m, n),
-      cuda::ApplicableMatrixReduction::Columns);
-  EXPECT_EQ(m, 2);
-  EXPECT_EQ(n, 4 * 8 * 16);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16}, {1, 2, 3},
+      ApplicableMatrixReduction::Columns, 2, 4 * 8 * 16);
 
   // single axis
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16}, {3}, m, n),
-      cuda::ApplicableMatrixReduction::Columns);
-  EXPECT_EQ(m, 2 * 4 * 8);
-  EXPECT_EQ(n, 16);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16}, {3},
+      ApplicableMatrixReduction::Columns, 2 * 4 * 8, 16);
+
+  // empty axes
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16}, {},
+      ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1);
+
+  // all axes
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3},
+      ApplicableMatrixReduction::Rows, 2 * 4 * 8 * 16, 1);
+
+  // handle ones
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {0},
+      ApplicableMatrixReduction::Rows, 1, 2 * 4 * 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1},
+      ApplicableMatrixReduction::Rows, 2, 4 * 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3},
+      ApplicableMatrixReduction::Rows, 2, 4 * 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4},
+      ApplicableMatrixReduction::Rows, 2 * 4, 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {1, 3, 4, 6},
+      ApplicableMatrixReduction::Rows, 2 * 4 * 8, 1);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 4, 6},
+      ApplicableMatrixReduction::Columns, 2, 4 * 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {4, 6},
+      ApplicableMatrixReduction::Columns, 2, 4 * 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {6},
+      ApplicableMatrixReduction::Columns, 2 * 4, 8);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {7},
+      ApplicableMatrixReduction::Columns, 2 * 4 * 8, 1);
 
   // unsupported axes
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16}, {0, 1, 2, 3}, m, n),
-      cuda::ApplicableMatrixReduction::None);
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16}, {}, m, n),
-      cuda::ApplicableMatrixReduction::None);
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4}, m, n),
-      cuda::ApplicableMatrixReduction::None);
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          valid_op_type, {2, 4, 8, 16}, {1, 2}, m, n),
-      cuda::ApplicableMatrixReduction::None);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16, 32, 64}, {0, 1, 3, 4},
+      ApplicableMatrixReduction::None);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {2, 4, 8, 16}, {1, 2},
+      ApplicableMatrixReduction::None);
+  test_get_applicable_matrix_reduction(
+      valid_op_type, {1, 2, 1, 1, 4, 1, 8, 1}, {3, 6},
+      ApplicableMatrixReduction::None);
 
   // invalid op type
-  EXPECT_EQ(
-      cuda::get_applicable_matrix_reduction(
-          CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1}, m, n),
-      cuda::ApplicableMatrixReduction::None);
+  test_get_applicable_matrix_reduction(
+      CUDNN_REDUCE_TENSOR_MAX, {2, 4, 8, 16}, {0, 1},
+      ApplicableMatrixReduction::None);
 }
 
 }  // namespace test
+}  // namespace cuda
 }  // namespace onnxruntime
 
 #endif