diff --git a/onnxruntime/core/providers/cpu/nn/conv_attributes.h b/onnxruntime/core/providers/cpu/nn/conv_attributes.h
index b31030acc5..0c20916371 100644
--- a/onnxruntime/core/providers/cpu/nn/conv_attributes.h
+++ b/onnxruntime/core/providers/cpu/nn/conv_attributes.h
@@ -190,7 +190,8 @@ struct ConvAttributes {
                                           bool& post_slicing_needed,
                                           TensorShapeVector& slice_starts,
                                           TensorShapeVector& slice_ends,
-                                          TensorShapeVector& slice_axes) const {
+                                          TensorShapeVector& slice_axes,
+                                          bool channels_last = false) const {
     size_t rank = input_shape.NumDimensions();
     // Make sure all "metadata" containers have the right number of elements
     if (rank > strides_p.size())
@@ -256,7 +257,11 @@ struct ConvAttributes {
           }
 
           post_slicing_needed = true;
-          slice_axes.push_back(static_cast<int64_t>(dim) + 2);
+          if (channels_last) {
+            slice_axes.push_back(static_cast<int64_t>(dim) + 1);
+          } else {
+            slice_axes.push_back(static_cast<int64_t>(dim) + 2);
+          }
           slice_starts.push_back(excess_output_head);
           slice_ends.push_back(excess_output_head + output_dim_size);                      // we may modify this below
           output_shape_with_revised_pads.push_back(excess_output_head + output_dim_size);  // we may modify this below
@@ -286,7 +291,11 @@ struct ConvAttributes {
               // Head has not been over-padded. Only tail pads need to be modified.
               post_slicing_needed = true;
 
-              slice_axes.push_back(static_cast<int64_t>(dim) + 2);
+              if (channels_last) {
+                slice_axes.push_back(static_cast<int64_t>(dim) + 1);
+              } else {
+                slice_axes.push_back(static_cast<int64_t>(dim) + 2);
+              }
               slice_starts.push_back(0);
               slice_ends.push_back(output_dim_size - revised_dim_size);
             }
diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index bb2755f54a..81db3c4186 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -88,7 +88,7 @@ Status SliceOutUnwantedOutputSection(cudaStream_t stream,
 
 template <typename T, bool NHWC>
 Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected) const {
-  //set X
+  // set X
   const Tensor* X = context->Input<Tensor>(0);
   const TensorShape& x_shape = X->Shape();
   const auto x_dims = x_shape.AsShapeVector();
@@ -180,7 +180,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
     TensorShapeVector y_dims_with_adjusted_pads(y_dims);
     ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(spatial_shape, kernel_shape,
                                                                      strides, dilations, pads, y_dims, y_dims_with_adjusted_pads,
-                                                                     post_slicing_required, slice_starts, slice_ends, slice_axes));
+                                                                     post_slicing_required, slice_starts, slice_ends, slice_axes,
+                                                                     channels_last));
     if (channels_last) {
       y_dims.push_back(M);
       y_dims_with_adjusted_pads.push_back(M);
@@ -287,7 +288,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
       TensorShapeVector b_dims(2 + kernel_shape.size(), 1);
       b_dims[1] = b_shape[0];
       ORT_RETURN_IF_ERROR(s_.b_tensor.Set(b_dims, CudnnTensor::GetDataType<CudaT>()));
-      //s_.b_data = reinterpret_cast<const CudaT*>(B->Data<T>());
+      // s_.b_data = reinterpret_cast<const CudaT*>(B->Data<T>());
     } else if (bias_expected) {
       TensorShapeVector b_dims(2 + kernel_shape.size(), 1);
       b_dims[1] = w_dims[0];
diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc
index 2468e3f048..6846813c7c 100644
--- a/onnxruntime/core/providers/rocm/nn/conv.cc
+++ b/onnxruntime/core/providers/rocm/nn/conv.cc
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+
 #include "core/providers/rocm/nn/conv.h"
 #include "core/common/span_utils.h"
 #include "core/providers/rocm/nn/conv_impl.h"
@@ -180,7 +181,8 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
     TensorShapeVector y_dims_with_adjusted_pads(y_dims);
     ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShapeWithAdjustedPads(spatial_shape, kernel_shape,
                                                                      strides, dilations, pads, y_dims, y_dims_with_adjusted_pads,
-                                                                     post_slicing_required, slice_starts, slice_ends, slice_axes));
+                                                                     post_slicing_required, slice_starts, slice_ends, slice_axes,
+                                                                     channels_last));
 
     if (channels_last) {
       y_dims.push_back(M);
@@ -346,19 +348,11 @@ Status Conv<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
                                                   s_.y_data,
                                                   workspace.get(),
                                                   s_.workspace_bytes));
-  if (nullptr != s_.b_data) {
-    constexpr bool channels_last = NHWC;
-    if (channels_last) {
-      const Tensor* B = context->Input<Tensor>(2);
-      const auto& b_shape = B->Shape();
 
-      ConvBiasImpl(Stream(context), reinterpret_cast<HipT*>(s_.Y->MutableData<T>()),
-                   reinterpret_cast<const HipT*>(B->Data<T>()),
-                   reinterpret_cast<HipT*>(s_.Y->MutableData<T>()), b_shape[0], s_.Y->Shape().Size());
-    } else {
-      MIOPEN_RETURN_IF_ERROR(miopenConvolutionForwardBias(miopen_handle, &alpha, s_.b_tensor, s_.b_data,
-                                                          &beta, s_.y_tensor, s_.y_data));
-    }
+  constexpr bool channels_last = NHWC;
+  if (nullptr != s_.b_data && !channels_last) {
+    MIOPEN_RETURN_IF_ERROR(miopenConvolutionForwardBias(miopen_handle, &alpha, s_.b_tensor, s_.b_data,
+                                                        &beta, s_.y_tensor, s_.y_data));
   }
   // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions
   // This may have lead to extra results that are unnecessary and hence we slice that off here
@@ -367,6 +361,14 @@ Status Conv<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
                                                       s_.Y->MutableDataRaw(), s_.y_dims.GetDims(), s_.slice_starts,
                                                       s_.slice_ends, s_.slice_axes, s_.element_size));
   }
+  if (nullptr != s_.b_data && channels_last) {
+    const Tensor* B = context->Input<Tensor>(2);
+    const auto& b_shape = B->Shape();
+
+    ConvBiasImpl(Stream(context), reinterpret_cast<const HipT*>(s_.Y->MutableDataRaw()),
+                 reinterpret_cast<const HipT*>(B->Data<T>()),
+                 reinterpret_cast<HipT*>(s_.Y->MutableDataRaw()), b_shape[0], s_.Y->Shape().Size());
+  }
   return Status::OK();
 }
 
diff --git a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
index 3d00885dd0..a79677357c 100644
--- a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
+++ b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
@@ -229,5 +229,51 @@ TEST(NhwcConvTest, Conv2D_AutoPad2) {
   RunNhwcConv(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape);
 }
 
+TEST(NhwcConvTest, Conv2D_asymmetric_padding1) {
+  NhwcConvOpAndTestAttributes attrs = {
+      "",                           // auto_pad
+      vector<int64_t>{1, 1},        // dilations
+      1,                            // group
+      vector<int64_t>{3, 3},        // kernel_shape
+      vector<int64_t>{1, 1, 0, 0},  // pads
+      vector<int64_t>{1, 1},        // strides
+      {}                            // excluded EPs
+  };
+
+  vector<float> X = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  vector<int64_t> X_shape = {1, 3, 3, 1};
+  vector<float> W = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f};
+  vector<int64_t> W_shape = {1, 3, 3, 1};
+  vector<float> B = {1.f};
+  vector<int64_t> B_shape = {1};
+  vector<int64_t> Y_shape = {1, 2, 2, 1};
+  auto expected_vals = {13.f, 22.f, 28.f, 46.f};
+
+  RunNhwcConv(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+}
+
+TEST(NhwcConvTest, Conv2D_asymmetric_padding2) {
+  NhwcConvOpAndTestAttributes attrs = {
+      "",                           // auto_pad
+      vector<int64_t>{1, 1},        // dilations
+      1,                            // group
+      vector<int64_t>{3, 3},        // kernel_shape
+      vector<int64_t>{0, 0, 1, 1},  // pads
+      vector<int64_t>{1, 1},        // strides
+      {}                            // excluded EPs
+  };
+
+  vector<float> X = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
+  vector<int64_t> X_shape = {1, 3, 3, 1};
+  vector<float> W = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f, 1.f};
+  vector<int64_t> W_shape = {1, 3, 3, 1};
+  vector<float> B = {1.f};
+  vector<int64_t> B_shape = {1};
+  vector<int64_t> Y_shape = {1, 2, 2, 1};
+  auto expected_vals = {46.f, 34.f, 40.f, 29.f};
+
+  RunNhwcConv(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
+}
+
 }  // namespace test
 }  // namespace onnxruntime