diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc
index 4581d1f83a..e7d69e95a2 100644
--- a/onnxruntime/core/providers/cpu/nn/conv.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv.cc
@@ -71,7 +71,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
   const size_t kernel_rank = kernel_shape.size();
 
   BufferUniquePtr col_buffer;
-  std::vector<int64_t> col_buffer_shape;
 
   // Pointwise convolutions can use the original input tensor in place,
   // otherwise a temporary buffer is required for the im2col transform.
@@ -81,13 +80,6 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
 
     auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(T)) * col_buffer_size);
     col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
-
-    if (kernel_rank != 2) {
-      const auto& output_dims = output_shape.GetDims();
-      col_buffer_shape.reserve(1 + output_dims.size());
-      col_buffer_shape.push_back(kernel_dim);
-      col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
-    }
   }
 
   T* col_buffer_data = static_cast<T*>(col_buffer.get());
@@ -120,10 +112,9 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
         } else {
           math::Im2colNd<T, StorageOrder::NCHW>()(
               Xdata + group_id * X_offset,
-              X->Shape().GetDims().data() + 1,
-              col_buffer_shape.data(),
-              C * input_image_size,
-              col_buffer_size,
+              input_shape.GetDims().data(),
+              output_shape.GetDims().data(),
+              kernel_dim,
               kernel_shape.data(),
               strides.data(),
               dilations.data(),
@@ -251,19 +242,13 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
     BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc));
     auto* col_buffer_data = static_cast<float*>(col_buffer.get());
 
-    TensorShape image_shape = X->Shape().Slice(1);
-    std::vector<int64_t> col_buffer_shape{kernel_dim};
-    col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(),
-                            output_shape.GetDims().end());
-
     for (int image_id = 0; image_id < N; ++image_id) {
       for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
         math::Im2colNd<float, StorageOrder::NCHW>()(
             Xdata + group_id * X_offset,
-            image_shape.GetDims().data(),
-            col_buffer_shape.data(),
-            C * input_image_size,
-            col_buffer_size,
+            input_shape.GetDims().data(),
+            output_shape.GetDims().data(),
+            kernel_dim,
             kernel_shape.data(),
             strides.data(),
             dilations.data(),
diff --git a/onnxruntime/core/providers/cpu/nn/conv_integer.cc b/onnxruntime/core/providers/cpu/nn/conv_integer.cc
index 10ef6a3133..850da62f46 100644
--- a/onnxruntime/core/providers/cpu/nn/conv_integer.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv_integer.cc
@@ -92,7 +92,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
   const size_t kernel_rank = kernel_shape.size();
 
   BufferUniquePtr col_buffer;
-  std::vector<int64_t> col_buffer_shape;
 
   // Pointwise convolutions can use the original input tensor in place,
   // otherwise a temporary buffer is required for the im2col transform.
@@ -102,13 +101,6 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
 
     auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
     col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
-
-    if (kernel_rank != 2) {
-      const auto& output_dims = output_shape.GetDims();
-      col_buffer_shape.reserve(1 + output_dims.size());
-      col_buffer_shape.push_back(kernel_dim);
-      col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
-    }
   }
 
   auto* col_buffer_data = static_cast<uint8_t*>(col_buffer.get());
@@ -143,10 +135,9 @@ Status ConvInteger::Compute(OpKernelContext* context) const {
         } else {
           math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
               Xdata,
-              X->Shape().GetDims().data() + 1,
-              col_buffer_shape.data(),
-              C * input_image_size,
-              col_buffer_size,
+              input_shape.GetDims().data(),
+              output_shape.GetDims().data(),
+              kernel_dim,
               kernel_shape.data(),
               strides.data(),
               dilations.data(),
diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
index 0e07642868..67f3cca4cb 100644
--- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc
@@ -74,9 +74,6 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
   const T* filter_data = p.F->template Data<T>();
   T* Ydata = p.Y->template MutableData<T>();
 
-  std::vector<int64_t> col_buffer_shape{kernel_dim};
-  col_buffer_shape.insert(col_buffer_shape.end(), p.input_shape.GetDims().begin(), p.input_shape.GetDims().end());
-
   if (p.X->Shape().NumDimensions() == 4) {
     for (auto image_id = 0; image_id < p.N; ++image_id) {
       for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
@@ -124,8 +121,7 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
       Ydata += Y_offset * conv_transpose_attrs_.group;
     }
   } else {
-    TensorShape output_shape = p.Y->Shape().Slice(1);
-    output_shape[0] = output_shape[0] / conv_transpose_attrs_.group;
+    TensorShape output_shape = p.Y->Shape().Slice(2);
 
     for (auto image_id = 0; image_id < p.N; ++image_id) {
       for (int group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) {
@@ -147,9 +143,9 @@ Status ConvTranspose<T>::DoConvTranspose(OpKernelContext* context, bool dynamic_
         math::Col2imNd<T, CPUMathUtil, StorageOrder::NCHW>(
             col_buffer_data,
             output_shape.GetDims().data(),
-            col_buffer_shape.data(),
-            output_shape.Size(),
-            col_buffer_size,
+            p.input_shape.GetDims().data(),
+            kernel_dim,
+            Y_offset,
             p.kernel_shape.data(),
             p.strides.data(),
             p.dilations.data(),
diff --git a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
index 328233ad6b..1b09dc0905 100644
--- a/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
+++ b/onnxruntime/core/providers/cpu/nn/qlinearconv.cc
@@ -75,7 +75,6 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
   const Tensor* B = context->Input<Tensor>(8);
 
   const int64_t N = X->Shape()[0];
-  const int64_t C = X->Shape()[1];
   const int64_t M = W->Shape()[0];
   ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
 
@@ -125,20 +124,12 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
   ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
 
   BufferUniquePtr col_buffer;
-  std::vector<int64_t> col_buffer_shape;
 
   // Pointwise convolutions can use the original input tensor in place,
   // otherwise a temporary buffer is required for the im2col transform.
   if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
     auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
     col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
-
-    if (kernel_rank != 2) {
-      const auto& output_dims = output_shape.GetDims();
-      col_buffer_shape.reserve(1 + output_dims.size());
-      col_buffer_shape.push_back(kernel_dim);
-      col_buffer_shape.insert(col_buffer_shape.end(), output_dims.begin(), output_dims.end());
-    }
   }
 
   auto* col_buffer_data = static_cast<uint8_t*>(col_buffer.get());
@@ -187,10 +178,9 @@ Status QLinearConv<uint8_t>::Compute(OpKernelContext* context) const {
         } else {
           math::Im2colNd<uint8_t, StorageOrder::NCHW>()(
               Xdata,
-              X->Shape().GetDims().data() + 1,
-              col_buffer_shape.data(),
-              C * input_image_size,
-              col_buffer_size,
+              input_shape.GetDims().data(),
+              output_shape.GetDims().data(),
+              kernel_dim,
               kernel_shape.data(),
               strides.data(),
               dilations.data(),
@@ -450,7 +440,6 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
   ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W_shape, kernel_shape));
 
   const size_t kernel_rank = kernel_shape.size();
-  ORT_ENFORCE(kernel_rank == 2, "QLinearConv : must be 2D convolution");
 
   std::vector<int64_t> pads(conv_attrs_.pads);
   if (pads.empty()) {
@@ -544,9 +533,10 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
   auto* transpose_output = static_cast<uint8_t*>(alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * Y_offset));
   BufferUniquePtr transpose_output_buffer(transpose_output, BufferDeleter(alloc));
 
+  BufferUniquePtr col_buffer;
+
   // Pointwise convolutions can use the original input tensor in place,
   // otherwise a temporary buffer is required for the im2col transform.
-  BufferUniquePtr col_buffer;
   if (kernel_size != 1 || !conv_attrs_.HasStridesOneAndNoPadding()) {
     auto* col_data = alloc->Alloc(SafeInt<size_t>(sizeof(uint8_t)) * col_buffer_size);
     col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc));
@@ -582,6 +572,23 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
                     static_cast<size_t>(group_input_channels),
                     static_cast<size_t>(input_image_size));
 
+      if (kernel_rank != 2 && col_buffer_data != nullptr) {
+        // Run one Im2colNd over the whole image for non-2D kernels; parallelize it later if needed
+        math::Im2colNd<uint8_t, StorageOrder::NHWC>()(
+            transpose_input,
+            input_shape.GetDims().data(),
+            output_shape.GetDims().data(),
+            kernel_dim,
+            kernel_shape.data(),
+            strides.data(),
+            dilations.data(),
+            pads.data(),
+            static_cast<int>(kernel_rank),
+            col_buffer_data,
+            false,
+            X_zero_point_value);
+      }
+
       auto conv_worker = [&](ptrdiff_t batch) {
         auto work = concurrency::ThreadPool::PartitionWork(batch, thread_count, static_cast<ptrdiff_t>(output_image_size));
         int64_t output_start = static_cast<int64_t>(work.start);
@@ -592,24 +599,26 @@ Status QLinearConv<int8_t>::Compute(OpKernelContext* context) const {
         uint8_t* worker_gemm_input;
         if (col_buffer_data != nullptr) {
           worker_gemm_input = col_buffer_data + output_start * kernel_dim;
-          math::Im2col<uint8_t, StorageOrder::NHWC>()(
-              transpose_input,
-              group_input_channels,
-              input_shape[0],
-              input_shape[1],
-              kernel_shape[0],
-              kernel_shape[1],
-              dilations[0],
-              dilations[1],
-              pads[0],
-              pads[1],
-              strides[0],
-              strides[1],
-              output_shape[1],
-              output_start,
-              output_count,
-              worker_gemm_input,
-              X_zero_point_value);
+          if (kernel_rank == 2) {
+            math::Im2col<uint8_t, StorageOrder::NHWC>()(
+                transpose_input,
+                group_input_channels,
+                input_shape[0],
+                input_shape[1],
+                kernel_shape[0],
+                kernel_shape[1],
+                dilations[0],
+                dilations[1],
+                pads[0],
+                pads[1],
+                strides[0],
+                strides[1],
+                output_shape[1],
+                output_start,
+                output_count,
+                worker_gemm_input,
+                X_zero_point_value);
+          }
         } else {
           worker_gemm_input = transpose_input + output_start * kernel_dim;
         }
diff --git a/onnxruntime/core/util/math.h b/onnxruntime/core/util/math.h
index 16138e680f..14755102cd 100644
--- a/onnxruntime/core/util/math.h
+++ b/onnxruntime/core/util/math.h
@@ -254,9 +254,8 @@ struct Im2colNd {
   void operator()(
       const T* data_img,
       const int64_t* im_shape,
-      const int64_t* col_shape,
-      int64_t img_size,
-      int64_t col_size,
+      const int64_t* output_shape,
+      int64_t channels_col,
       const int64_t* kernel_shape,
       const int64_t* stride,
       const int64_t* dilation,
@@ -267,78 +266,13 @@ struct Im2colNd {
       T padding_value = 0);
 };
 
-template <typename T>
-struct Im2colNd<T, StorageOrder::NCHW> {
-  void operator()(const T* data_img, const int64_t* im_shape, const int64_t* col_shape, int64_t /*img_size*/,
-                  int64_t /*col_size*/, const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
-                  const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
-                  T padding_value = 0) {
-    int64_t kernel_size = 1;
-    for (int64_t i = 0; i < N; ++i) {
-      kernel_size *= kernel_shape[i];
-    }
-    int64_t channels_col = col_shape[0];
-    std::vector<int64_t> d_offset(N, 0);
-    std::vector<int64_t> d_iter(N, 0);
-    for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
-      // Loop over spatial axes in reverse order to compute a per-axis offset.
-      int64_t offset = c_col;
-      for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
-        if (d_i < N - 1) {
-          offset /= kernel_shape[d_i + 1];
-        }
-        d_offset[d_i] = offset % kernel_shape[d_i];
-      }
-      for (bool incremented = true; incremented;) {
-        // Loop over spatial axes in forward order to compute the indices in the
-        // image and column, and whether the index lies in the padding.
-        int64_t index_col = c_col;
-        int64_t index_im = c_col / kernel_size;
-        bool is_padding = false;
-        for (int64_t d_i = 0; d_i < N; ++d_i) {
-          int64_t d = d_iter[d_i];
-          int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
-          is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
-          index_col *= col_shape[d_i + 1];
-          index_col += d;
-          index_im *= im_shape[d_i + 1];
-          index_im += d_im;
-        }
-        if (!accumulate_output) {
-          if (is_padding) {
-            data_col[index_col] = padding_value;
-          } else {
-            data_col[index_col] = data_img[index_im];
-          }
-        } else if (!is_padding) {  // col2im
-          data_col[index_im] += data_img[index_col];
-        }
-        // Loop over spatial axes in reverse order to choose an index,
-        // like counting.
-        incremented = false;
-        for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
-          int64_t d_max = col_shape[d_i + 1];
-          ORT_ENFORCE(d_iter[d_i] < d_max);
-          if (d_iter[d_i] == d_max - 1) {
-            d_iter[d_i] = 0;
-          } else {  // d_iter[d_i] < d_max - 1
-            ++d_iter[d_i];
-            incremented = true;
-            break;
-          }
-        }
-      }  // while(incremented) {
-    }    // for (int c = 0; c < channels_col; ++c) {
-  }
-};
-
 template <typename T, class Provider, int order>
 void Col2imNd(
     const T* data_col,
     const int64_t* img_shape,
-    const int64_t* col_shape,
+    const int64_t* output_shape,
+    int64_t channels_col,
     int64_t img_size,
-    int64_t col_size,
     const int64_t* kernel_shape,
     const int64_t* stride,
     const int64_t* dilation,
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index b9455d6340..1b38f2512d 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -210,7 +210,6 @@ template void Gemv<double, CPUMathUtil>(const CBLAS_TRANSPOSE TransA, int M, int
 SPECIALIZED_AXPY(float)
 #undef SPECIALIZED_AXPY
 
-
 #define DELEGATE_SIMPLE_UNARY_FUNCTION(T, Funcname, expr)                  \
   template <>                                                              \
   void Funcname<T, CPUMathUtil>(int N, const T* x, T* y, CPUMathUtil*) {   \
@@ -420,6 +419,130 @@ void Im2col<T, StorageOrder::NHWC>::operator()(const T* data_im, int64_t channel
 
 template struct Im2col<uint8_t, StorageOrder::NHWC>;
 
+// Advances dims, an N-axis counter bounded per axis by shape, to the next position (last axis fastest); false once it wraps.
+static inline bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) {
+  bool has_next_output = false;
+  for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
+    int64_t d_max = shape[d_i];
+    ORT_ENFORCE(dims[d_i] < d_max);  // the current index must lie inside this axis' extent
+    if (dims[d_i] == d_max - 1) {
+      dims[d_i] = 0;  // axis exhausted: reset it and carry into the next-outer axis
+    } else {  // dims[d_i] < d_max - 1
+      ++dims[d_i];
+      has_next_output = true;
+      break;
+    }
+  }
+  return has_next_output;  // false means every axis was reset, so dims is all zeros again
+}
+
+template <typename T>
+struct Im2colNd<T, StorageOrder::NCHW> {  // N-axis im2col; with accumulate_output set it runs the reverse (col2im) direction
+  void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col,
+                  const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
+                  const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
+                  T padding_value = 0) {
+    int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies<int64_t>());
+    std::vector<int64_t> d_offset(N, 0);  // per-axis kernel coordinate of the current column row
+    std::vector<int64_t> d_iter(N, 0);  // per-axis output coordinate; all zeros again after each full sweep
+    for (int64_t c_col = 0; c_col < channels_col; ++c_col) {
+      // Decompose c_col into per-axis kernel coordinates, walking the spatial axes in reverse order.
+      int64_t offset = c_col;
+      for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
+        if (d_i < N - 1) {
+          offset /= kernel_shape[d_i + 1];
+        }
+        d_offset[d_i] = offset % kernel_shape[d_i];
+      }
+      do {
+        // Loop over spatial axes in forward order to compute the flat indices into the
+        // image and the column buffer, and whether the image index lies in the padding.
+        int64_t index_col = c_col;
+        int64_t index_im = c_col / kernel_size;  // image channel that this column row reads from
+        bool is_padding = false;
+        for (int64_t d_i = 0; d_i < N; ++d_i) {
+          int64_t d = d_iter[d_i];
+          int64_t d_im = d * stride[d_i] - pad[d_i] + d_offset[d_i] * dilation[d_i];
+          is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]);
+          index_col *= output_shape[d_i];
+          index_col += d;
+          index_im *= im_shape[d_i];
+          index_im += d_im;
+        }
+        if (!accumulate_output) {
+          if (is_padding) {
+            data_col[index_col] = padding_value;
+          } else {
+            data_col[index_col] = data_img[index_im];
+          }
+        } else if (!is_padding) {  // col2im: roles swap, data_img is read as the column buffer and data_col is the image
+          data_col[index_im] += data_img[index_col];
+        }
+      } while (NextPosition(N, output_shape, d_iter.data()));
+    }  // for (int64_t c_col = 0; c_col < channels_col; ++c_col)
+  }
+};
+
+template struct Im2colNd<float, StorageOrder::NCHW>;
+template struct Im2colNd<uint8_t, StorageOrder::NCHW>;
+
+template <typename T>
+struct Im2colNd<T, StorageOrder::NHWC> {  // N-axis im2col with channels innermost; accumulate_output selects the col2im direction
+  void operator()(const T* data_img, const int64_t* im_shape, const int64_t* output_shape, int64_t channels_col,
+                  const int64_t* kernel_shape, const int64_t* stride, const int64_t* dilation,
+                  const int64_t* pad, int64_t N, T* data_col, bool accumulate_output = false,
+                  T padding_value = 0) {
+    int64_t kernel_size = std::accumulate(kernel_shape, kernel_shape + N, 1LL, std::multiplies<int64_t>());
+    int64_t input_channels = channels_col / kernel_size;
+    ORT_ENFORCE(input_channels * kernel_size == channels_col, "Dimensions not match!");  // channels_col must be a multiple of kernel_size
+
+    // Current output position, one coordinate per spatial axis (batch and channel are not part of it)
+    std::vector<int64_t> d_output(N, 0);
+    // Current kernel position, one coordinate per spatial axis (no output/input channel axes)
+    std::vector<int64_t> d_kernel(N, 0);
+
+    // Outer loop: visit every output position; each one owns channels_col consecutive column elements
+    int64_t outer_col_index = 0;
+    do {
+      // Inner loop: visit every kernel position; each one owns input_channels consecutive column elements
+      int64_t inner_col_index = 0;
+      do {
+        // Loop over spatial axes in forward order to compute the flat spatial index into the
+        // image, and whether that index lies in the padding.
+        int64_t index_im = 0;
+        bool is_padding = false;
+        for (int64_t d_i = 0; d_i < N; ++d_i) {
+          int64_t d_im = d_output[d_i] * stride[d_i] - pad[d_i] + d_kernel[d_i] * dilation[d_i];
+          is_padding |= !is_a_ge_zero_and_a_lt_b(d_im, im_shape[d_i]);
+          index_im *= im_shape[d_i];
+          index_im += d_im;
+        }
+        index_im *= input_channels;  // channels are the innermost image axis
+        auto index_col = outer_col_index + inner_col_index;
+
+        if (!accumulate_output) {
+          if (is_padding) {
+            std::fill_n(data_col + index_col, input_channels, padding_value);
+          } else {
+            std::copy_n(data_img + index_im, input_channels, data_col + index_col);
+          }
+        } else if (!is_padding) {  // col2im: roles swap, data_img is read at the column index and data_col written at the image index
+          const T* ptr_im = data_img + index_col;
+          T* ptr_col = data_col + index_im;
+          for (int64_t i = 0; i < input_channels; ++i) {
+            *ptr_col++ += *ptr_im++;
+          }
+        }
+        inner_col_index += input_channels;
+      } while (NextPosition(N, kernel_shape, d_kernel.data()));  // d_kernel is all zeros again when this loop exits
+
+      outer_col_index += channels_col;
+    } while (NextPosition(N, output_shape, d_output.data()));
+  }
+};
+
+template struct Im2colNd<uint8_t, StorageOrder::NHWC>;
+
 template <>
 void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64_t channels, int64_t height,
                                                     int64_t width, int64_t kernel_h, int64_t kernel_w,
@@ -558,7 +681,7 @@ void Col2im<float, CPUMathUtil, StorageOrder::NHWC>(const float* data_col, int64
 
 template <>
 void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, const int64_t* img_shape,
-                                                      const int64_t* col_shape, int64_t img_size, int64_t col_size,
+                                                      const int64_t* output_shape, int64_t channels_col, int64_t img_size,
                                                       const int64_t* kernel_shape, const int64_t* stride,
                                                       const int64_t* dilation, const int64_t* pad, int64_t N,
                                                       float* data_img, CPUMathUtil* context) {
@@ -566,9 +689,8 @@ void Col2imNd<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, con
   Im2colNd<float, StorageOrder::NCHW>()(
       data_col,
       img_shape,
-      col_shape,
-      img_size,
-      col_size,
+      output_shape,
+      channels_col,
       kernel_shape,
       stride,
       dilation,
diff --git a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
index a48e411a1b..9f6cc00b2e 100644
--- a/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
+++ b/onnxruntime/test/providers/cpu/nn/qlinearconv_op_test.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <algorithm>
+#include "core/util/math.h"
 #include "gtest/gtest.h"
 #include "test/providers/provider_test_utils.h"
 #include "core/mlas/inc/mlas.h"
@@ -334,6 +336,23 @@ class QLinearConvOpTester {
     return static_cast<T>(RoundHalfToEven(f) + requantize_values.zero_point_);
   }
 
+  static bool NextPosition(int64_t N, const int64_t* shape, int64_t* dims) {
+    // Step dims to the next position within shape, last axis fastest (like counting); returns false once it wraps.
+    bool incremented = false;
+    for (int64_t d_i = N - 1; d_i >= 0; --d_i) {
+      int64_t d_max = shape[d_i];
+      ORT_ENFORCE(dims[d_i] < d_max);  // the current index must lie inside this axis' extent
+      if (dims[d_i] == d_max - 1) {
+        dims[d_i] = 0;  // axis exhausted: reset it and carry into the next-outer axis
+      } else {  // dims[d_i] < d_max - 1
+        ++dims[d_i];
+        incremented = true;
+        break;
+      }
+    }
+    return incremented;  // false means every axis was reset, so dims is all zeros again
+  }
+
   void ComputeExpectedOutput(std::vector<T1>& Y_data, std::vector<int64_t>& Y_shape) {
     ORT_ENFORCE(W_.shape_.size() > 2);
     ORT_ENFORCE(X_.shape_.size() == W_.shape_.size());
@@ -377,20 +396,10 @@ class QLinearConvOpTester {
     const int64_t* output_shape = Y_shape.data() + 2;
     Y_data.resize(ShapeSize(Y_shape));
 
-    const int64_t input_h = input_shape[0];
-    const int64_t input_w = input_shape[1];
-    const int64_t input_image_size = input_h * input_w;
-    const int64_t kernel_h = kernel_shape[0];
-    const int64_t kernel_w = kernel_shape[1];
-    const int64_t kernel_size = kernel_h * kernel_w;
-    const int64_t output_h = output_shape[0];
-    const int64_t output_w = output_shape[1];
-    const int64_t pad_t = pads[0];
-    const int64_t pad_l = pads[1];
-    const int64_t dilation_h = dilations[0];
-    const int64_t dilation_w = dilations[1];
-    const int64_t stride_h = strides[0];
-    const int64_t stride_w = strides[1];
+    const int64_t input_image_size = std::accumulate(
+        input_shape, input_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
+    const int64_t kernel_size = std::accumulate(
+        kernel_shape, kernel_shape + kernel_rank, 1LL, std::multiplies<int64_t>());
     const int32_t X_zero_point = X_.zero_point_;
 
     const T1* Xdata = X_.data_.data();
@@ -409,29 +418,34 @@ class QLinearConvOpTester {
           float weight_scale = W_.scale_[(W_.scale_.size() == 1) ? 0 : channel_index];
           float requantize_scale = (X_.scale_[0] * weight_scale) / output_scale_;
 
-          for (int64_t oh = 0; oh < output_h; oh++) {
-            for (int64_t ow = 0; ow < output_w; ow++) {
-              int32_t sum = bias;
-              const T1* input_image = Xdata;
-              const T2* weight_data = weight_row;
-              for (int64_t ic = 0; ic < group_input_channels; ic++) {
-                for (int64_t kh = 0; kh < kernel_h; kh++) {
-                  int64_t ih = kh * dilation_h + oh * stride_h - pad_t;
-                  for (int64_t kw = 0; kw < kernel_w; kw++) {
-                    int64_t iw = kw * dilation_w + ow * stride_w - pad_l;
-                    int32_t w_value = static_cast<int32_t>(*weight_data++);
-                    if (static_cast<uint64_t>(ih) < static_cast<uint64_t>(input_h) &&
-                        static_cast<uint64_t>(iw) < static_cast<uint64_t>(input_w)) {
-                      int32_t x_value = static_cast<int32_t>(input_image[ih * input_w + iw]) - X_zero_point;
-                      sum += x_value * w_value;
-                    }
-                  }
+          std::vector<int64_t> d_output(kernel_rank, 0);
+          std::vector<int64_t> d_kernel(kernel_rank, 0);
+          do {
+            int32_t sum = bias;
+            const T1* input_image = Xdata;
+            const T2* weight_data = weight_row;
+            for (int64_t ic = 0; ic < group_input_channels; ic++) {
+              do {
+                int64_t input_offset = 0;
+                bool is_padding = false;
+                for (size_t axis = 0; axis < kernel_rank; ++axis) {
+                  int64_t input_dim = d_kernel[axis] * dilations[axis] + d_output[axis] * strides[axis] - pads[axis];
+                  is_padding |= !math::is_a_ge_zero_and_a_lt_b(input_dim, input_shape[axis]);
+                  input_offset *= input_shape[axis];
+                  input_offset += input_dim;
                 }
-                input_image += input_image_size;
-              }
-              *Ydata++ = RequantizeOutput<T1>(sum, requantize_scale, requantize_values);
+                int32_t w_value = static_cast<int32_t>(*weight_data++);
+                if (!is_padding) {
+                  int32_t x_value = static_cast<int32_t>(input_image[input_offset]) - X_zero_point;
+                  sum += x_value * w_value;
+                }
+              } while (NextPosition(kernel_rank, kernel_shape, d_kernel.data()));
+
+              input_image += input_image_size;
             }
-          }
+            *Ydata++ = RequantizeOutput<T1>(sum, requantize_scale, requantize_values);
+
+          } while (NextPosition(kernel_rank, output_shape, d_output.data()));
 
           weight_row += group_input_channels * kernel_size;
         }
@@ -538,6 +552,16 @@ class QLinearConvOpTester {
   }
 };
 
+TEST(QLinearConvTest, Conv1D_U8S8) {  // rank-3 tensors: a convolution over a single spatial axis
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({3, 24, 15}, .05f, 4);
+  test.GenerateRandomWeights({32, 24, 3}, .125f, 0);  // kernel extent 3 on the one spatial axis
+  test.GenerateRandomBias();
+  test.SetPads({1, 1});  // presumably (begin, end) padding for the single axis, per the ONNX pads layout -- confirm
+  test.SetOutputScaleAndZeroPoint(.55f, 54);
+  test.Run();
+}
+
 TEST(QLinearConvTest, Conv2D_U8S8) {
   QLinearConvOpTester<uint8_t, int8_t> test;
   test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4);
@@ -548,6 +572,52 @@ TEST(QLinearConvTest, Conv2D_U8S8) {
   test.Run();
 }
 
+TEST(QLinearConvTest, Conv3D_U8S8) {  // rank-5 tensors: a convolution over three spatial axes
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4);
+  test.GenerateRandomWeights({5, 2, 3, 3, 3}, .125f, 0);  // kernel extent 3 on each of the three spatial axes
+  test.GenerateRandomBias();
+  test.SetPads({1, 1, 1, 1, 1, 1});  // presumably all begins followed by all ends, per the ONNX pads layout -- confirm
+  test.SetOutputScaleAndZeroPoint(.55f, 54);
+  test.Run();
+}
+
+TEST(QLinearConvTest, Conv1D_U8S8_Pointwise) {  // single spatial axis with a kernel extent of 1
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({3, 24, 15}, .05f, 4);
+  test.GenerateRandomWeights({32, 24, 1}, .125f, 0);  // kernel extent 1; no pads, strides or dilations are set in this test
+  test.GenerateRandomBias();
+  test.SetOutputScaleAndZeroPoint(.55f, 54);
+  test.Run();
+}
+
+TEST(QLinearConvTest, Conv2D_U8S8_Pointwise) {  // two spatial axes with a kernel extent of 1 on each
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({3, 24, 15, 11}, .05f, 4);
+  test.GenerateRandomWeights({32, 24, 1, 1}, .125f, 0);  // 1x1 kernel; no pads, strides or dilations are set in this test
+  test.GenerateRandomBias();
+  test.SetOutputScaleAndZeroPoint(.55f, 54);
+  test.Run();
+}
+
+TEST(QLinearConvTest, Conv3D_U8S8_Pointwise) {  // three spatial axes with a kernel extent of 1 on each
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({2, 2, 15, 11, 6}, .05f, 4);
+  test.GenerateRandomWeights({5, 2, 1, 1, 1}, .125f, 0);  // 1x1x1 kernel; no pads, strides or dilations are set in this test
+  test.GenerateRandomBias();
+  test.SetOutputScaleAndZeroPoint(.55f, 54);
+  test.Run();
+}
+
+TEST(QLinearConvTest, Conv1D_U8S8_Dilations) {  // single spatial axis with a dilated kernel; no bias is generated
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({1, 4, 19}, .02f, 20);
+  test.GenerateRandomWeights({6, 4, 3}, .11f, 0);
+  test.SetDilations({2});  // one dilation value for the one spatial axis
+  test.SetOutputScaleAndZeroPoint(.24f, 15);
+  test.Run();
+}
+
 TEST(QLinearConvTest, Conv2D_U8S8_Dilations) {
   QLinearConvOpTester<uint8_t, int8_t> test;
   test.GenerateRandomInput({1, 4, 19, 16}, .02f, 20);
@@ -557,6 +627,24 @@ TEST(QLinearConvTest, Conv2D_U8S8_Dilations) {
   test.Run();
 }
 
+TEST(QLinearConvTest, Conv3D_U8S8_Dilations) {  // three spatial axes with a dilated kernel; no bias is generated
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({1, 2, 19, 16, 8}, .02f, 20);
+  test.GenerateRandomWeights({6, 2, 3, 2, 2}, .11f, 0);  // kernel extents differ per axis (3, 2, 2)
+  test.SetDilations({2, 2, 2});  // one dilation value per spatial axis
+  test.SetOutputScaleAndZeroPoint(.24f, 15);
+  test.Run();
+}
+
+TEST(QLinearConvTest, Conv1D_U8S8_Strides) {  // single spatial axis with a non-unit stride; no bias is generated
+  QLinearConvOpTester<uint8_t, int8_t> test;
+  test.GenerateRandomInput({1, 7, 18}, .04f, 16);
+  test.GenerateRandomWeights({5, 7, 2}, .14f, 0);
+  test.SetStrides({2});  // one stride value for the one spatial axis
+  test.SetOutputScaleAndZeroPoint(.31f, 30);
+  test.Run();
+}
+
 TEST(QLinearConvTest, Conv2D_U8S8_Strides) {
   QLinearConvOpTester<uint8_t, int8_t> test;
   test.GenerateRandomInput({1, 7, 18, 24}, .04f, 16);
@@ -566,6 +654,26 @@ TEST(QLinearConvTest, Conv2D_U8S8_Strides) {
   test.Run();
 }
 
+TEST(QLinearConvTest, Conv3D_U8S8_Strides) {  // 3-D strided case; note that no bias is generated here
+  QLinearConvOpTester<uint8_t, int8_t> test;  // NOTE(review): presumably <activation type, weight type>, per the U8S8 name -- confirm
+  test.GenerateRandomInput({1, 3, 18, 24, 18}, .04f, 16);  // rank-5 shape; presumably N, C, then three spatial dims -- confirm
+  test.GenerateRandomWeights({2, 3, 2, 3, 2}, .14f, 0);  // kernel extents {2, 3, 2}
+  test.SetStrides({2, 2, 2});  // one stride entry per spatial axis
+  test.SetOutputScaleAndZeroPoint(.31f, 30);  // output scale .31, output zero point 30
+  test.Run();
+}
+
+TEST(QLinearConvTest, Conv1D_U8S8_Groups) {  // 1-D grouped case with bias and padding
+  QLinearConvOpTester<uint8_t, int8_t> test;  // NOTE(review): presumably <activation type, weight type>, per the U8S8 name -- confirm
+  test.GenerateRandomInput({1, 8, 13}, .03f, 7);  // rank-3 shape; presumably N, C, one spatial dim -- confirm
+  test.GenerateRandomWeights({12, 4, 3}, .10f, 0);  // second dim 4 = input dim 8 / 2 groups
+  test.GenerateRandomBias();
+  test.SetPads({1, 1});  // two entries for the single spatial axis (presumably begin, end)
+  test.SetGroups(2);
+  test.SetOutputScaleAndZeroPoint(.76f, 88);  // output scale .76, output zero point 88
+  test.Run();
+}
+
 TEST(QLinearConvTest, Conv2D_U8S8_Groups) {
   QLinearConvOpTester<uint8_t, int8_t> test;
   test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7);
@@ -577,6 +685,17 @@ TEST(QLinearConvTest, Conv2D_U8S8_Groups) {
   test.Run();
 }
 
+TEST(QLinearConvTest, Conv3D_U8S8_Groups) {  // 3-D grouped case with bias and padding
+  QLinearConvOpTester<uint8_t, int8_t> test;  // NOTE(review): presumably <activation type, weight type>, per the U8S8 name -- confirm
+  test.GenerateRandomInput({1, 4, 13, 17, 13}, .03f, 7);  // rank-5 shape; presumably N, C, then three spatial dims -- confirm
+  test.GenerateRandomWeights({6, 2, 3, 3, 3}, .10f, 0);  // second dim 2 = input dim 4 / 2 groups
+  test.GenerateRandomBias();
+  test.SetPads({1, 1, 1, 1, 1, 1});  // six entries: two per spatial axis, all equal to 1
+  test.SetGroups(2);
+  test.SetOutputScaleAndZeroPoint(.76f, 88);  // output scale .76, output zero point 88
+  test.Run();
+}
+
 TEST(QLinearConvTest, Conv2D_U8S8_Groups_PerChannel) {
   QLinearConvOpTester<uint8_t, int8_t> test;
   test.GenerateRandomInput({1, 8, 13, 17}, .03f, 7);
diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
index 1afe5e85c5..b8fa8a2a0e 100644
--- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc
+++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc
@@ -703,6 +703,21 @@ TEST(GradientCheckerTest, ConvGrad) {
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
 
+  // conv3d: 3x3x3 kernel, pads of 1 on every side, no strides attribute (expected y_shape equals x_shape).
+  {
+    TensorShape x_shape({2, 1, 5, 5, 5});
+    TensorShape w_shape({1, 1, 3, 3, 3});
+    TensorShape b_shape({1});
+    TensorShape y_shape({2, 1, 5, 5, 5});  // per axis: 5 + 1 + 1 - 3 + 1 = 5, assuming a stride of 1
+    gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                          {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
+                                           MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1})},
+                                          // TODO: ConvGrad does not handle the case where W does not have a gradient.
+                                          // The check for not has_gradient needs to be disabled for this test to pass.
+                                          false);
+    EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
+  }
+
   //conv_with_strides
   {
     TensorShape x_shape({2, 1, 7, 5});
@@ -718,6 +733,22 @@ TEST(GradientCheckerTest, ConvGrad) {
                                           false);
     EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
   }
+
+  // conv3d_with_strides: 3x3x3 kernel, pads of 1 on every side, stride 2 along each spatial axis.
+  {
+    TensorShape x_shape({2, 1, 7, 5, 5});
+    TensorShape w_shape({1, 1, 3, 3, 3});
+    TensorShape b_shape({1});
+    TensorShape y_shape({2, 1, 4, 3, 3});  // floor((7 + 2 - 3) / 2) + 1 = 4 and floor((5 + 2 - 3) / 2) + 1 = 3
+    gradient_checker.ComputeGradientError(op_def, {x_shape, w_shape, b_shape}, {y_shape}, &max_error,
+                                          {MakeAttribute("kernel_shape", std::vector<int64_t>{3, 3, 3}),
+                                           MakeAttribute("pads", std::vector<int64_t>{1, 1, 1, 1, 1, 1}),
+                                           MakeAttribute("strides", std::vector<int64_t>{2, 2, 2})},
+                                          // TODO: ConvGrad does not handle the case where W does not have a gradient.
+                                          // The check for not has_gradient needs to be disabled for this test to pass.
+                                          false);
+    EXPECT_IS_TINIER_THAN(max_error, error_tolerance);
+  }
 }
 
 static void TestConcatOpGrad(const std::string& op_type,
diff --git a/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc b/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc
index 27a6729716..416b79d326 100644
--- a/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc
+++ b/orttraining/orttraining/training_ops/cpu/nn/conv_grad.cc
@@ -98,11 +98,6 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
                               &CPUMathUtil::Instance());
   }
 
-  TensorShape image_shape = X->Shape().Slice(1);
-  std::vector<int64_t> col_buffer_shape{kernel_dim};
-  col_buffer_shape.insert(col_buffer_shape.end(), output_shape.GetDims().begin(),
-                          output_shape.GetDims().end());
-
   for (int image_id = 0; image_id < N; ++image_id) {
     for (int group_id = 0; group_id < conv_attrs_.group; ++group_id) {
       if (Is2DKernel) {
@@ -125,10 +120,9 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
       } else {
         math::Im2colNd<T, StorageOrder::NCHW>()(
             Xdata + group_id * X_offset,
-            image_shape.GetDims().data(),
-            col_buffer_shape.data(),
-            C * input_image_size,
-            col_buffer_size,
+            input_shape.GetDims().data(),
+            output_shape.GetDims().data(),
+            kernel_dim,
             kernel_shape.data(),
             strides.data(),
             dilations.data(),
@@ -208,10 +202,10 @@ Status ConvGrad<T>::Compute(OpKernelContext* context) const {
         } else {
           math::Col2imNd<T, CPUMathUtil, StorageOrder::NCHW>(
               col_buffer_data,
-              image_shape.GetDims().data(),
-              col_buffer_shape.data(),
+              input_shape.GetDims().data(),
+              output_shape.GetDims().data(),
+              kernel_dim,
               C * input_image_size,
-              col_buffer_size,
               kernel_shape.data(),
               strides.data(),
               dilations.data(),