diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
index 161a39e9cf..10952bd85a 100644
--- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc
@@ -75,6 +75,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
   std::vector<int64_t> squeezed_output_dims;
   output_dims.reserve(input_dims.size());
 
+  // explicit 'axes' provided => reduce only on given axis values
   if (axes_.size() > 0) {
     output_dims = input_dims;
     for (auto reduced_axis : axes_) {
@@ -88,6 +89,7 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
       reduced[axis] = true;
     }
   } else {
+    // no 'axes' provided (i.e. the default axes are used) => reduce on all dims
     for (auto dim : input_dims) {
       ORT_ENFORCE(keepdims_ || dim != 0,
                   "Can't reduce on dim with value of 0 if 'keepdims' is false. "
@@ -99,12 +101,18 @@ Status ReduceKernel<allow_multi_axes>::ComputeImpl(OpKernelContext* ctx, cudnnRe
   }
 
   if (keepdims_) {
+    // since keepdims is set, the final output dim is the same as the output_dims computed above
     squeezed_output_dims = output_dims;
-  } else {
+  } else if (axes_.size() > 0) {
+    // we are not going to keep the reduced dims, hence compute the final output dim accordingly
+    squeezed_output_dims.reserve(rank);  // even though we won't use the full capacity, it is better to reserve for peak possible usage
     for (size_t i = 0; i < rank; ++i) {
       if (!reduced[i])
         squeezed_output_dims.push_back(input_dims[i]);
     }
+  } else {
+    // 'axes' is empty and keepdims is false => we reduce on all axes AND drop all dims,
+    // so the result is just a scalar; 'squeezed_output_dims' is intentionally left empty (no-op)
   }
 
   Tensor* Y = ctx->Output(0, TensorShape(squeezed_output_dims));
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 93a56808b3..1806ea726d 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -77,7 +77,7 @@ TEST(ReductionOpTest, ReduceL1_default_axes_keepdims) {
   test.Run();
 }
 
-TEST(ReductionOpTest, ReduceL1_do_not_keepdims) {
+TEST(ReductionOpTest, ReduceL1_do_not_keep_dims) {
   OpTester test("ReduceL1");
   test.AddAttribute("axes", std::vector<int64_t>{2});
   test.AddAttribute("keepdims", (int64_t)0);
@@ -94,7 +94,7 @@ TEST(ReductionOpTest, ReduceL1_do_not_keepdims) {
   test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
 }
 
-TEST(ReductionOpTest, ReduceL1_do_not_keepdims_2) {
+TEST(ReductionOpTest, ReduceL1_do_not_keep_dims_2) {
   OpTester test("ReduceL1");
   test.AddAttribute("axes", std::vector<int64_t>{0});
   test.AddAttribute("keepdims", (int64_t)0);
@@ -169,6 +169,22 @@ TEST(ReductionOpTest, ReduceL2_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceL2_default_axes_do_not_keep_dims) {  // ReduceL2 with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceL2");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values (1..12) laid out as 3 x 2 x 2
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {}, {25.49509757f});  // empty dims => scalar output; sqrt(1^2 + 2^2 + ... + 12^2) = sqrt(650)
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceL2_do_not_keepdims) {
   OpTester test("ReduceL2");
   test.AddAttribute("axes", std::vector<int64_t>{2});
@@ -304,6 +320,22 @@ TEST(ReductionOpTest, ReduceLogSumExp_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceLogSumExp_default_axes_do_not_keep_dims) {  // ReduceLogSumExp with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceLogSumExp");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values laid out as 3 x 2 x 2
+                       {5.0f, 1.0f,
+                        20.0f, 2.0f,
+
+                        30.0f, 1.0f,
+                        40.0f, 2.0f,
+
+                        55.0f, 1.0f,
+                        60.0f, 2.0f});
+  test.AddOutput<float>("reduced", {}, {60.00671387f});  // empty dims => scalar output; log(sum(exp(x))) over all 12 values, dominated by the max element 60
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceLogSumExp_do_not_keepdims) {
   OpTester test("ReduceLogSumExp");
   test.AddAttribute("axes", std::vector<int64_t>{1});
@@ -398,6 +430,22 @@ TEST(ReductionOpTest, ReduceMax_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceMax_default_axes_do_not_keep_dims) {  // ReduceMax with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceMax");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values laid out as 3 x 2 x 2
+                       {5.0f, 1.0f,
+                        20.0f, 2.0f,
+
+                        30.0f, 1.0f,
+                        40.0f, 2.0f,
+
+                        55.0f, 1.0f,
+                        60.0f, 2.0f});
+  test.AddOutput<float>("reduced", {}, {60.0f});  // empty dims => scalar output; 60 is the largest of all 12 input values
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceMax_do_not_keepdims) {
   OpTester test("ReduceMax");
   test.AddAttribute("axes", std::vector<int64_t>{1});
@@ -509,6 +557,22 @@ TEST(ReductionOpTest, ReduceMean_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceMean_default_axes_do_not_keep_dims) {  // ReduceMean with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceMean");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values laid out as 3 x 2 x 2
+                       {5.0f, 1.0f,
+                        20.0f, 2.0f,
+
+                        30.0f, 1.0f,
+                        40.0f, 2.0f,
+
+                        55.0f, 1.0f,
+                        60.0f, 2.0f});
+  test.AddOutput<float>("reduced", {}, {18.25f});  // empty dims => scalar output; (5+1+20+2+30+1+40+2+55+1+60+2) / 12 = 219 / 12 = 18.25
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceMean_do_not_keepdims) {
   OpTester test("ReduceMean");
   test.AddAttribute("axes", std::vector<int64_t>{1});
@@ -603,6 +667,32 @@ TEST(ReductionOpTest, ReduceMin_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceMin_default_axes_do_not_keep_dims) {  // ReduceMin with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceMin");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values laid out as 3 x 2 x 2
+                       {5.0f, 1.0f,
+                        20.0f, 2.0f,
+
+                        30.0f, 1.0f,
+                        40.0f, 2.0f,
+
+                        55.0f, 1.0f,
+                        60.0f, 2.0f});
+  test.AddOutput<float>("reduced", {}, {1.0f});  // empty dims => scalar output; 1 is the smallest of all 12 input values
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
+TEST(ReductionOpTest, ReduceMin_default_axes_do_not_keep_dims_2D) {  // same scenario as the 3 x 2 x 2 ReduceMin case, but with a 2 x 2 input
+  OpTester test("ReduceMin");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {2, 2},  // 4 values laid out as 2 x 2
+                       {5.0f, 1.0f,
+                        20.0f, 2.0f});
+  test.AddOutput<float>("reduced", {}, {1.0f});  // empty dims => scalar output; 1 is the smallest of all 4 input values
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceMin_do_not_keepdims) {
   OpTester test("ReduceMin");
   test.AddAttribute("axes", std::vector<int64_t>{1});
@@ -799,6 +889,22 @@ TEST(ReductionOpTest, ReduceSum_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceSum_default_axes_do_not_keep_dims) {  // ReduceSum with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceSum");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values (1..12) laid out as 3 x 2 x 2
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {}, {78.0f});  // empty dims => scalar output; 1 + 2 + ... + 12 = 78
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceSum_do_not_keepdims) {
   OpTester test("ReduceSum");
   test.AddAttribute("axes", std::vector<int64_t>{1});
@@ -904,6 +1010,22 @@ TEST(ReductionOpTest, ReduceSumSquare_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceSumSquare_default_axes_do_not_keep_dims) {  // ReduceSumSquare with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceSumSquare");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values (1..12) laid out as 3 x 2 x 2
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {}, {650.0f});  // empty dims => scalar output; 1^2 + 2^2 + ... + 12^2 = 650
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceSumSquare_do_not_keepdims) {
   OpTester test("ReduceSumSquare");
   test.AddAttribute("axes", std::vector<int64_t>{1});
@@ -963,6 +1085,22 @@ TEST(ReductionOpTest, ReduceProd_default_axes_keepdims) {
   test.Run();
 }
 
+TEST(ReductionOpTest, ReduceProd_default_axes_do_not_keep_dims) {  // ReduceProd with no 'axes' attribute (default axes) and keepdims = 0
+  OpTester test("ReduceProd");
+  test.AddAttribute("keepdims", static_cast<int64_t>(0));  // keepdims = 0: the reduced dims are not kept in the output
+  test.AddInput<float>("data", {3, 2, 2},  // 12 values (1..12) laid out as 3 x 2 x 2
+                       {1.0f, 2.0f,
+                        3.0f, 4.0f,
+
+                        5.0f, 6.0f,
+                        7.0f, 8.0f,
+
+                        9.0f, 10.0f,
+                        11.0f, 12.0f});
+  test.AddOutput<float>("reduced", {}, {479001600.f});  // empty dims => scalar output; 1 * 2 * ... * 12 = 12! = 479001600
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //TensorRT: full reduce without keepDimensions is not supported with explicit batch
+}
+
 TEST(ReductionOpTest, ReduceProd_do_not_keepdims) {
   OpTester test("ReduceProd");
   test.AddAttribute("axes", std::vector<int64_t>{1});