diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 99c265729d..00289c145c 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -1181,6 +1181,7 @@ Do not modify directly.*
 |FusedMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |Gelu|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |GroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *out* Y:**T**|1+|**M** = tensor(float), tensor(float16)<br/> **T** = tensor(float), tensor(float16)|
+|NhwcConv|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |QLinearAdd|*in* A:**T**<br> *in* A_scale:**tensor(float)**<br> *in* A_zero_point:**T**<br> *in* B:**T**<br> *in* B_scale:**tensor(float)**<br> *in* B_zero_point:**T**<br> *in* C_scale:**tensor(float)**<br> *in* C_zero_point:**T**<br> *out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |QLinearSigmoid|*in* X:**T**<br> *in* X_scale:**tensor(float)**<br> *in* X_zero_point:**T**<br> *in* Y_scale:**tensor(float)**<br> *in* Y_zero_point:**T**<br> *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|1+|**T1** = tensor(float)<br/> **T2** = tensor(uint8)|
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvInteger.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvInteger.cpp
index 4a689b3065..6a2f867ed8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvInteger.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvInteger.cpp
@@ -9,14 +9,14 @@ namespace Dml
 class DmlOperatorConvInteger : public DmlOperator, public ConvolutionHelperBase
 {
 private:
-    enum InputTensors 
-    { 
-        IN_X, 
-        IN_X_ZERO_POINT, 
-        IN_F, 
-        IN_F_ZERO_POINT, 
+    enum InputTensors
+    {
+        IN_X,
+        IN_X_ZERO_POINT,
+        IN_F,
+        IN_F_ZERO_POINT,
     };
-    
+
 public:
     using Self = DmlOperatorConvInteger;
 
@@ -24,15 +24,15 @@ public:
         const MLOperatorKernelCreationContext& kernelInfo
         )
     :   DmlOperator(kernelInfo),
-        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, 0, 1)
+        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, false, 0, 1)
     {
         std::vector<std::optional<uint32_t>> kernelInputIndices = {0, 2, 1, 3};
         std::vector<std::optional<uint32_t>> kernelOutputIndices = {0};
 
         DmlOperator::Initialize(kernelInfo, kernelInputIndices);
 
-        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the 
-        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's), 
+        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the
+        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's),
         // but 1D tensors actually need to insert the 1 between C and W. e.g. [2,3,4] becomes [2,3,1,4]
         m_inputTensorDescs[IN_X] = CreateTensorDescFromInput(kernelInfo, 0/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
         m_inputTensorDescs[IN_F] = CreateTensorDescFromInput(kernelInfo, 1/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
@@ -42,9 +42,9 @@ public:
         // Resize the Filter ZeroPoint to be the same dimension as the input tensor.
         // The 1D tensor needs to be moved to the C channel.
         m_inputTensorDescs[IN_F_ZERO_POINT] = CreateTensorDescFromInput(
-            kernelInfo, 
-            3/*Onnx Index*/, 
-            TensorAxis::DoNotCoerce, 
+            kernelInfo,
+            3/*Onnx Index*/,
+            TensorAxis::DoNotCoerce,
             TensorAxis::C,
             TensorAxis::LeftAligned,
             std::nullopt,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvolution.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvolution.cpp
index 548ab65cc0..9c343e9a76 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvolution.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvolution.cpp
@@ -15,10 +15,11 @@ public:
         const MLOperatorKernelCreationContext& kernelInfo,
         DML_CONVOLUTION_MODE mode,
         DML_CONVOLUTION_DIRECTION direction,
-        bool hasDynamicPads
+        bool hasDynamicPads,
+        bool isNhwc
         )
     :   DmlOperator(kernelInfo),
-        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), direction == DML_CONVOLUTION_DIRECTION_BACKWARD, hasDynamicPads, 0, 1)
+        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), direction == DML_CONVOLUTION_DIRECTION_BACKWARD, hasDynamicPads, isNhwc, 0, 1)
     {
         uint32_t biasIndex = hasDynamicPads ? 3 : 2;
         bool hasBiasInput = kernelInfo.GetInputCount() > biasIndex;
@@ -33,6 +34,43 @@ public:
         // e.g. [2,3,4] becomes [2,3,1,4]
         m_inputTensorDescs[0] = CreateTensorDescFromInput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
         m_inputTensorDescs[1] = CreateTensorDescFromInput(kernelInfo, 1, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
+        m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
+
+        if (isNhwc)
+        {
+            // Restrict to 4D like other implementations
+            ML_CHECK_VALID_ARGUMENT(m_inputTensorDescs[0].GetDimensionCount() == 4);
+            const auto inputSizes = m_inputTensorDescs[0].GetSizes();
+            const uint32_t inputBatch = inputSizes[0];
+            const uint32_t inputHeight = inputSizes[1];
+            const uint32_t inputWidth = inputSizes[2];
+            const uint32_t inputChannels = inputSizes[3];
+            const std::array<uint32_t, 4> nchwInputSizes = {inputBatch, inputChannels, inputHeight, inputWidth};
+            const std::array<uint32_t, 4> nchwInputStrides = {inputHeight * inputWidth * inputChannels, 1, inputWidth * inputChannels, inputChannels};
+            m_inputTensorDescs[0] = TensorDesc(m_inputTensorDescs[0].GetDmlDataType(), nchwInputSizes, nchwInputStrides);
+
+            // Restrict to 4D like other implementations
+            ML_CHECK_VALID_ARGUMENT(m_inputTensorDescs[1].GetDimensionCount() == 4);
+            const auto weightSizes = m_inputTensorDescs[1].GetSizes();
+            const uint32_t featureMaps = weightSizes[0];
+            const uint32_t kernelHeight = weightSizes[1];
+            const uint32_t kernelWidth = weightSizes[2];
+            const uint32_t channelsPerGroup = weightSizes[3];
+            const std::array<uint32_t, 4> nchwKernelSizes = {featureMaps, channelsPerGroup, kernelHeight, kernelWidth};
+            const std::array<uint32_t, 4> nchwKernelStrides = {kernelHeight * kernelWidth * channelsPerGroup, 1, kernelWidth * channelsPerGroup, channelsPerGroup};
+            m_inputTensorDescs[1] = TensorDesc(m_inputTensorDescs[1].GetDmlDataType(), nchwKernelSizes, nchwKernelStrides);
+
+            // Restrict to 4D like other implementations
+            ML_CHECK_VALID_ARGUMENT(m_outputTensorDescs[0].GetDimensionCount() == 4);
+            const auto outputSizes = m_outputTensorDescs[0].GetSizes();
+            const uint32_t outputBatch = outputSizes[0];
+            const uint32_t outputHeight = outputSizes[1];
+            const uint32_t outputWidth = outputSizes[2];
+            const uint32_t outputChannels = outputSizes[3];
+            const std::array<uint32_t, 4> nchwOutputSizes = {outputBatch, outputChannels, outputHeight, outputWidth};
+            const std::array<uint32_t, 4> nchwOutputStrides = {outputHeight * outputWidth * outputChannels, 1, outputWidth * outputChannels, outputChannels};
+            m_outputTensorDescs[0] = TensorDesc(m_outputTensorDescs[0].GetDmlDataType(), nchwOutputSizes, nchwOutputStrides);
+        }
 
         // Bias is optional so only adjust it if it exists.
         if (hasBiasInput)
@@ -47,9 +85,9 @@ public:
             // Resize the bias to be the same dimension as the input tensor.
             // The 1D tensor needs to be moved to the C channel.
             m_inputTensorDescs[biasIndex] = CreateTensorDescFromInput(
-                kernelInfo, 
-                biasIndex, 
-                TensorAxis::DoNotCoerce, 
+                kernelInfo,
+                biasIndex,
+                TensorAxis::DoNotCoerce,
                 TensorAxis::C,
                 TensorAxis::LeftAligned,
                 std::nullopt,
@@ -57,8 +95,6 @@ public:
                 );
         }
 
-        m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
-
         std::optional<ActivationOperatorDesc> fusedActivation = FusionHelpers::TryGetFusedActivationDesc(kernelInfo);
         DML_OPERATOR_DESC fusedActivationDmlDesc = fusedActivation ? fusedActivation->GetDmlDesc() : DML_OPERATOR_DESC();
         std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
@@ -95,20 +131,21 @@ public:
 };
 
 // A specific type of operation for registration.
-template <DML_CONVOLUTION_MODE Mode, DML_CONVOLUTION_DIRECTION Direction, bool hasDynamicPads = false>
+template <DML_CONVOLUTION_MODE Mode, DML_CONVOLUTION_DIRECTION Direction, bool hasDynamicPads = false, bool isNhwc = false>
 class DmlOperatorConvolutionTemplate : public DmlOperatorConvolution
 {
 public:
     DmlOperatorConvolutionTemplate(const MLOperatorKernelCreationContext& kernelInfo)
-    :   DmlOperatorConvolution(kernelInfo, Mode, Direction, hasDynamicPads)
+    :   DmlOperatorConvolution(kernelInfo, Mode, Direction, hasDynamicPads, isNhwc)
     {
     }
 };
 
 DML_OP_DEFINE_CREATION_FUNCTION(Conv,                           DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD>);
+DML_OP_DEFINE_CREATION_FUNCTION(NhwcConv,                       DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD, false, true>);
 DML_OP_DEFINE_CREATION_FUNCTION(ConvTranspose,                  DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD>);
-DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConv,                      DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD>);
-DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConvTranspose,             DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD>);
+DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConv,                   DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD>);
+DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConvTranspose,          DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD>);
 DML_OP_DEFINE_CREATION_FUNCTION(ConvTransposeWithDynamicPads,   DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD, true>);
 
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConv.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConv.cpp
index 91f730da7a..d45fdef3c8 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConv.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConv.cpp
@@ -9,19 +9,19 @@ namespace Dml
 class DmlOperatorQLinearConv : public DmlOperator, public ConvolutionHelperBase
 {
 private:
-    enum InputTensors 
-    { 
-        IN_X, 
+    enum InputTensors
+    {
+        IN_X,
         IN_X_SCALE,
-        IN_X_ZERO_POINT, 
-        IN_F, 
+        IN_X_ZERO_POINT,
+        IN_F,
         IN_F_SCALE,
-        IN_F_ZERO_POINT, 
+        IN_F_ZERO_POINT,
         IN_BIAS,
         IN_Y_SCALE,
         IN_Y_ZERO_POINT
     };
-    
+
 public:
     using Self = DmlOperatorQLinearConv;
 
@@ -29,15 +29,15 @@ public:
         const MLOperatorKernelCreationContext& kernelInfo
         )
     :   DmlOperator(kernelInfo),
-        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, 0, 3)
+        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, false, 0, 3)
     {
         std::vector<std::optional<uint32_t>> kernelInputIndices = {0, 1, 2, 3, 4, 5, 8, 6, 7};
         std::vector<std::optional<uint32_t>> kernelOutputIndices = {0};
 
         DmlOperator::Initialize(kernelInfo, kernelInputIndices);
 
-        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the 
-        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's), 
+        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the
+        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's),
         // but 1D tensors actually need to insert the 1 between C and W. e.g. [2,3,4] becomes [2,3,1,4]
         m_inputTensorDescs[IN_X] = CreateTensorDescFromInput(kernelInfo, 0/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
         m_inputTensorDescs[IN_F] = CreateTensorDescFromInput(kernelInfo, 3/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
@@ -56,9 +56,9 @@ public:
             // Resize the bias to be the same dimension as the input tensor.
             // The 1D tensor needs to be moved to the C channel.
             m_inputTensorDescs[IN_BIAS] = CreateTensorDescFromInput(
-                kernelInfo, 
-                8/*Onnx Index*/, 
-                TensorAxis::DoNotCoerce, 
+                kernelInfo,
+                8/*Onnx Index*/,
+                TensorAxis::DoNotCoerce,
                 TensorAxis::C,
                 TensorAxis::LeftAligned,
                 std::nullopt,
@@ -69,9 +69,9 @@ public:
         // Resize the Filter ZeroPoint to be the same dimension as the input tensor.
         // The 1D tensor needs to be moved to the C channel.
         m_inputTensorDescs[IN_F_ZERO_POINT] = CreateTensorDescFromInput(
-            kernelInfo, 
-            5/*Onnx Index*/, 
-            TensorAxis::DoNotCoerce, 
+            kernelInfo,
+            5/*Onnx Index*/,
+            TensorAxis::DoNotCoerce,
             TensorAxis::C,
             TensorAxis::LeftAligned,
             std::nullopt,
@@ -80,9 +80,9 @@ public:
         // Resize the Filter Scale to be the same dimension as the input tensor.
         // The 1D tensor needs to be moved to the C channel.
         m_inputTensorDescs[IN_F_SCALE] = CreateTensorDescFromInput(
-            kernelInfo, 
-            4/*Onnx Index*/, 
-            TensorAxis::DoNotCoerce, 
+            kernelInfo,
+            4/*Onnx Index*/,
+            TensorAxis::DoNotCoerce,
             TensorAxis::C,
             TensorAxis::LeftAligned,
             std::nullopt,
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
index e3067a4a22..f8e9e1a262 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
@@ -194,6 +194,7 @@ struct OperatorRegistrationInformation
 DML_OP_EXTERN_CREATION_FUNCTION(Copy);
 DML_OP_EXTERN_CREATION_FUNCTION(FC);
 DML_OP_EXTERN_CREATION_FUNCTION(Conv);
+DML_OP_EXTERN_CREATION_FUNCTION(NhwcConv);
 DML_OP_EXTERN_CREATION_FUNCTION(ConvTranspose);
 DML_OP_EXTERN_CREATION_FUNCTION(ConvTransposeWithDynamicPads);
 DML_OP_EXTERN_CREATION_FUNCTION(AveragePool);
@@ -528,6 +529,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation
     // Deep Learning Standard Layers
     {REG_INFO(      7,  Conv,                               typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
     {REG_INFO(     11,  Conv,                               typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
+    {REG_INFO_MS(   1,  NhwcConv,                           typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
     {REG_INFO(      7,  ConvTranspose,                      typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
     {REG_INFO(     11,  ConvTranspose,                      typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
     {REG_INFO(      7,  AveragePool,                        typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
index 00e3e1fcd5..d91f7be447 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
@@ -292,11 +292,12 @@ namespace OperatorHelper
     // are ordered such that they are at the end (e.g. NCHW or NCDHW).
     std::vector<DimensionType> InitializeKernelOutputDimensions(
         gsl::span<const DimensionType> inputDimensions,
-        const KernelArgs& args
+        const KernelArgs& args,
+        bool isNhwc
     )
     {
         ML_CHECK_VALID_ARGUMENT(gsl::narrow_cast<uint32_t>(inputDimensions.size()) >= args.spatialDimensionCount);
-        int dimOffset = gsl::narrow_cast<int>(inputDimensions.size()) - args.spatialDimensionCount;
+        int dimOffset = isNhwc ? 1 : gsl::narrow_cast<int>(inputDimensions.size()) - args.spatialDimensionCount;
 
         std::vector<DimensionType> outputDimensions(inputDimensions.begin(), inputDimensions.end());
 
@@ -478,7 +479,8 @@ namespace OperatorHelper
 
     void ResolveAutoPadding(
         KernelArgs& args,
-        gsl::span<const DimensionType> inputDimensions
+        gsl::span<const DimensionType> inputDimensions,
+        bool isNhwc
     )
     {
         if (!args.autoPad)
@@ -490,7 +492,9 @@ namespace OperatorHelper
         uint32_t spatialDimensionCount = gsl::narrow_cast<uint32_t>(inputDimensions.size()) - NonspatialDimensionCount;
         ML_CHECK_VALID_ARGUMENT(spatialDimensionCount <= NcdhwSpatialDimensionCount); // Support up to 3D convolution (in 5D tensor).
 
-        const int dimOffset = gsl::narrow_cast<int>(inputDimensions.size()) - spatialDimensionCount;
+        ML_CHECK_VALID_ARGUMENT(!isNhwc || inputDimensions.size() == 4);
+
+        const int dimOffset = isNhwc ? 1 : gsl::narrow_cast<int>(inputDimensions.size()) - spatialDimensionCount;
 
         for (size_t dim = 0; dim < spatialDimensionCount; ++dim)
         {
@@ -763,8 +767,16 @@ namespace OperatorHelper
         ResolvingPadding(inputDimensions);
 
         m_outputShapes.resize(1);
-        m_outputShapes[0] = InitializeKernelOutputDimensions(inputDimensions, m_kernel);
-        m_outputShapes[0].GetShape()[C] = filterDims[K];
+        m_outputShapes[0] = InitializeKernelOutputDimensions(inputDimensions, m_kernel, m_isNhwc);
+
+        if (m_isNhwc)
+        {
+            m_outputShapes[0].GetShape()[static_cast<uint32_t>(NhwcInputDims::C)] = filterDims[K];
+        }
+        else
+        {
+            m_outputShapes[0].GetShape()[C] = filterDims[K];
+        }
     }
 
     void ConvolutionHelperBase::InitializeKernelAndShapesTransposed(
@@ -868,7 +880,7 @@ namespace OperatorHelper
 
     void ConvolutionHelperBase::ResolvingPadding(gsl::span<const DimensionType> inputDimensions)
     {
-        ResolveAutoPadding(m_kernel, inputDimensions);
+        ResolveAutoPadding(m_kernel, inputDimensions, m_isNhwc);
     }
 
     std::vector<EdgeShapes> GemmHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
index 7ce27bfebe..bc1ef2dac0 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
@@ -204,7 +204,8 @@ struct KernelArgs
 
 std::vector<DimensionType> InitializeKernelOutputDimensions(
     gsl::span<const DimensionType> inputDimensions,
-    const KernelArgs& args);
+    const KernelArgs& args,
+    bool isNhwc = false);
 
 std::vector<DimensionType> InitializeKernelOutputDimsTranspose(
     gsl::span<const DimensionType> inputDimensions,
@@ -219,7 +220,8 @@ KernelArgs InitializeKernel(
 
 void ResolveAutoPadding(
     KernelArgs& args,
-    gsl::span<const DimensionType> inputDimensions);
+    gsl::span<const DimensionType> inputDimensions,
+    bool isNhwc = false);
 
 void MatMulShapeMapping(
     std::vector<DimensionType>& inputShape0,
@@ -450,13 +452,15 @@ class ConvolutionHelperBase
 public:
     enum FilterDims { K };
     enum InputDims { N, C, H, W };
+    enum class NhwcInputDims { N, H, W, C };
 
 public:
     // Info_t is used to obtain attributes which will be used for calculating the output shape later.
     template<typename Info_t, typename Shape_t>
-    ConvolutionHelperBase(const Info_t& info, const Shape_t& shape, bool transpose, bool hasDynamicPads, uint32_t inputTensorIndex, uint32_t filterTensorIndex) :
+    ConvolutionHelperBase(const Info_t& info, const Shape_t& shape, bool transpose, bool hasDynamicPads, bool isNhwc, uint32_t inputTensorIndex, uint32_t filterTensorIndex) :
         m_inputTensorIndex(inputTensorIndex),
         m_filterTensorIndex(filterTensorIndex),
+        m_isNhwc(isNhwc),
         m_kernel(InitializeKernel(info, shape.GetInputTensorDimensionCount(inputTensorIndex), shape.GetInputTensorShape(filterTensorIndex)))
     {
         m_groupCount = info.template GetOptionalAttribute<uint32_t>(AttrName::Group, 1);
@@ -487,6 +491,7 @@ protected:
     uint32_t m_groupCount;
     uint32_t m_inputTensorIndex;
     uint32_t m_filterTensorIndex;
+    bool m_isNhwc;
     KernelArgs m_kernel;
     std::vector<EdgeShapes> m_outputShapes;
 };
@@ -495,28 +500,35 @@ class ConvHelper : public ConvolutionHelperBase
 {
 public:
     template<typename Info_t, typename Shape_t>
-    ConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, 0, 1) {}
+    ConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, false, 0, 1) {}
+};
+
+class NhwcConvHelper : public ConvolutionHelperBase
+{
+public:
+    template<typename Info_t, typename Shape_t>
+    NhwcConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, true, 0, 1) {}
 };
 
 class ConvTransposeHelper : public ConvolutionHelperBase
 {
 public:
     template<typename Info_t, typename Shape_t>
-    ConvTransposeHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, false, 0, 1) {}
+    ConvTransposeHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, false, false, 0, 1) {}
 };
 
 class ConvTransposeWithDynamicPadsHelper : public ConvolutionHelperBase
 {
 public:
     template<typename Info_t, typename Shape_t>
-    ConvTransposeWithDynamicPadsHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, true, 0, 1) {}
+    ConvTransposeWithDynamicPadsHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, true, false, 0, 1) {}
 };
 
 class QLinearConvHelper : public ConvolutionHelperBase
 {
 public:
     template<typename Info_t, typename Shape_t>
-    QLinearConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, 0, 3) {}
+    QLinearConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, false, 0, 3) {}
 };
 
 class GemmHelper
@@ -1416,6 +1428,7 @@ public:
 };
 
 using ShapeInferenceHelper_Conv = ConvHelper;
+using ShapeInferenceHelper_NhwcConv = NhwcConvHelper;
 using ShapeInferenceHelper_ConvTranspose = ConvTransposeHelper;
 using ShapeInferenceHelper_ConvTransposeWithDynamicPads = ConvTransposeWithDynamicPadsHelper;
 using ShapeInferenceHelper_ConvInteger = ConvHelper;
diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
index d5e4f8f134..b6fe40e5b7 100644
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
@@ -410,6 +410,7 @@ namespace OperatorHelper
         static const int sc_sinceVer_Attention = 1;
         static const int sc_sinceVer_SkipLayerNormalization = 1;
         static const int sc_sinceVer_EmbedLayerNormalization = 1;
+        static const int sc_sinceVer_NhwcConv = 1;
         static const int sc_sinceVer_BiasAdd = 1;
         static const int sc_sinceVer_QuickGelu = 1;
         static const int sc_sinceVer_GroupNorm = 1;
diff --git a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
index a79677357c..3f298b0a8f 100644
--- a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
+++ b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
@@ -32,8 +32,9 @@ void TestNhwcConvOp(const NhwcConvOpAndTestAttributes& attributes,
   int min_cuda_architecture = use_float16 ? 530 : 0;
   bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
   bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
+  bool enable_dml = (nullptr != DefaultDmlExecutionProvider().get());
 
-  if (enable_cuda || enable_rocm) {
+  if (enable_cuda || enable_rocm || enable_dml) {
     OpTester test("NhwcConv", 1, onnxruntime::kMSDomain);
     test.AddAttribute("group", attributes.group);
     test.AddAttribute("kernel_shape", attributes.kernel_shape);
@@ -82,6 +83,10 @@ void TestNhwcConvOp(const NhwcConvOpAndTestAttributes& attributes,
       execution_providers.push_back(DefaultRocmExecutionProvider());
     }
 
+    if (enable_dml) {
+      execution_providers.push_back(DefaultDmlExecutionProvider());
+    }
+
     test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
   }
 }