[DML EP] Add NhwcConv (#15194)

2026-07-24 19:43:35 +00:00 · 2023-04-10 23:16:09 -07:00 · 2023-04-10 23:16:09 -07:00 · c5b6ee1a99
commit c5b6ee1a99
parent 9acbfc6a29
9 changed files with 129 additions and 58 deletions
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@ -1181,6 +1181,7 @@ Do not modify directly.*
 |FusedMatMul|*in* A:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |Gelu|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |GroupNorm|*in* X:**T**<br> *in* gamma:**M**<br> *in* beta:**M**<br> *out* Y:**T**|1+|**M** = tensor(float), tensor(float16)<br/> **T** = tensor(float), tensor(float16)|
+|NhwcConv|*in* X:**T**<br> *in* W:**T**<br> *in* B:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |QLinearAdd|*in* A:**T**<br> *in* A_scale:**tensor(float)**<br> *in* A_zero_point:**T**<br> *in* B:**T**<br> *in* B_scale:**tensor(float)**<br> *in* B_zero_point:**T**<br> *in* C_scale:**tensor(float)**<br> *in* C_zero_point:**T**<br> *out* C:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |QLinearSigmoid|*in* X:**T**<br> *in* X_scale:**tensor(float)**<br> *in* X_zero_point:**T**<br> *in* Y_scale:**tensor(float)**<br> *in* Y_zero_point:**T**<br> *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|1+|**T1** = tensor(float)<br/> **T2** = tensor(uint8)|
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvInteger.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvInteger.cpp
@ -9,14 +9,14 @@ namespace Dml
 class DmlOperatorConvInteger : public DmlOperator, public ConvolutionHelperBase
 {
 private:
-    enum InputTensors 
-    { 
-        IN_X, 
-        IN_X_ZERO_POINT, 
-        IN_F, 
-        IN_F_ZERO_POINT, 
+    enum InputTensors
+    {
+        IN_X,
+        IN_X_ZERO_POINT,
+        IN_F,
+        IN_F_ZERO_POINT,
    };
-    
+
 public:
    using Self = DmlOperatorConvInteger;

@ -24,15 +24,15 @@ public:
        const MLOperatorKernelCreationContext& kernelInfo
        )
    :   DmlOperator(kernelInfo),
-        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, 0, 1)
+        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, false, 0, 1)
    {
        std::vector<std::optional<uint32_t>> kernelInputIndices = {0, 2, 1, 3};
        std::vector<std::optional<uint32_t>> kernelOutputIndices = {0};

        DmlOperator::Initialize(kernelInfo, kernelInputIndices);

-        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the 
-        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's), 
+        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the
+        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's),
        // but 1D tensors actually need to insert the 1 between C and W. e.g. [2,3,4] becomes [2,3,1,4]
        m_inputTensorDescs[IN_X] = CreateTensorDescFromInput(kernelInfo, 0/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
        m_inputTensorDescs[IN_F] = CreateTensorDescFromInput(kernelInfo, 1/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
@ -42,9 +42,9 @@ public:
        // Resize the Filter ZeroPoint to be the same dimension as the input tensor.
        // The 1D tensor needs to be moved to the C channel.
        m_inputTensorDescs[IN_F_ZERO_POINT] = CreateTensorDescFromInput(
-            kernelInfo, 
-            3/*Onnx Index*/, 
-            TensorAxis::DoNotCoerce, 
+            kernelInfo,
+            3/*Onnx Index*/,
+            TensorAxis::DoNotCoerce,
            TensorAxis::C,
            TensorAxis::LeftAligned,
            std::nullopt,
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvolution.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorConvolution.cpp
@ -15,10 +15,11 @@ public:
        const MLOperatorKernelCreationContext& kernelInfo,
        DML_CONVOLUTION_MODE mode,
        DML_CONVOLUTION_DIRECTION direction,
-        bool hasDynamicPads
+        bool hasDynamicPads,
+        bool isNhwc
        )
    :   DmlOperator(kernelInfo),
-        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), direction == DML_CONVOLUTION_DIRECTION_BACKWARD, hasDynamicPads, 0, 1)
+        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), direction == DML_CONVOLUTION_DIRECTION_BACKWARD, hasDynamicPads, isNhwc, 0, 1)
    {
        uint32_t biasIndex = hasDynamicPads ? 3 : 2;
        bool hasBiasInput = kernelInfo.GetInputCount() > biasIndex;
@ -33,6 +34,43 @@ public:
        // e.g. [2,3,4] becomes [2,3,1,4]
        m_inputTensorDescs[0] = CreateTensorDescFromInput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
        m_inputTensorDescs[1] = CreateTensorDescFromInput(kernelInfo, 1, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
+        m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
+
+        if (isNhwc)
+        {
+            // Restrict the input to 4D like other implementations; it is laid out as [N, H, W, C], so describe it with NCHW sizes and NHWC strides.
+            ML_CHECK_VALID_ARGUMENT(m_inputTensorDescs[0].GetDimensionCount() == 4);
+            const auto inputSizes = m_inputTensorDescs[0].GetSizes();
+            const uint32_t inputBatch = inputSizes[0];
+            const uint32_t inputHeight = inputSizes[1];
+            const uint32_t inputWidth = inputSizes[2];
+            const uint32_t inputChannels = inputSizes[3];
+            const std::array<uint32_t, 4> nchwInputSizes = {inputBatch, inputChannels, inputHeight, inputWidth};
+            const std::array<uint32_t, 4> nchwInputStrides = {inputHeight * inputWidth * inputChannels, 1, inputWidth * inputChannels, inputChannels};
+            m_inputTensorDescs[0] = TensorDesc(m_inputTensorDescs[0].GetDmlDataType(), nchwInputSizes, nchwInputStrides);
+
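+            // Restrict the filter to 4D like other implementations; it is laid out as [M, kH, kW, C/group], so describe it with NCHW sizes and NHWC strides.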
+            ML_CHECK_VALID_ARGUMENT(m_inputTensorDescs[1].GetDimensionCount() == 4);
+            const auto weightSizes = m_inputTensorDescs[1].GetSizes();
+            const uint32_t featureMaps = weightSizes[0];
+            const uint32_t kernelHeight = weightSizes[1];
+            const uint32_t kernelWidth = weightSizes[2];
+            const uint32_t channelsPerGroup = weightSizes[3];
+            const std::array<uint32_t, 4> nchwKernelSizes = {featureMaps, channelsPerGroup, kernelHeight, kernelWidth};
+            const std::array<uint32_t, 4> nchwKernelStrides = {kernelHeight * kernelWidth * channelsPerGroup, 1, kernelWidth * channelsPerGroup, channelsPerGroup};
+            m_inputTensorDescs[1] = TensorDesc(m_inputTensorDescs[1].GetDmlDataType(), nchwKernelSizes, nchwKernelStrides);
+
+            // Restrict the output to 4D like other implementations; it is laid out as [N, H, W, C], so describe it with NCHW sizes and NHWC strides.
+            ML_CHECK_VALID_ARGUMENT(m_outputTensorDescs[0].GetDimensionCount() == 4);
+            const auto outputSizes = m_outputTensorDescs[0].GetSizes();
+            const uint32_t outputBatch = outputSizes[0];
+            const uint32_t outputHeight = outputSizes[1];
+            const uint32_t outputWidth = outputSizes[2];
+            const uint32_t outputChannels = outputSizes[3];
+            const std::array<uint32_t, 4> nchwOutputSizes = {outputBatch, outputChannels, outputHeight, outputWidth};
+            const std::array<uint32_t, 4> nchwOutputStrides = {outputHeight * outputWidth * outputChannels, 1, outputWidth * outputChannels, outputChannels};
+            m_outputTensorDescs[0] = TensorDesc(m_outputTensorDescs[0].GetDmlDataType(), nchwOutputSizes, nchwOutputStrides);
+        }

        // Bias is optional so only adjust it if it exists.
        if (hasBiasInput)
@ -47,9 +85,9 @@ public:
            // Resize the bias to be the same dimension as the input tensor.
            // The 1D tensor needs to be moved to the C channel.
            m_inputTensorDescs[biasIndex] = CreateTensorDescFromInput(
-                kernelInfo, 
-                biasIndex, 
-                TensorAxis::DoNotCoerce, 
+                kernelInfo,
+                biasIndex,
+                TensorAxis::DoNotCoerce,
                TensorAxis::C,
                TensorAxis::LeftAligned,
                std::nullopt,
@ -57,8 +95,6 @@ public:
                );
        }

-        m_outputTensorDescs[0] = CreateTensorDescFromOutput(kernelInfo, 0, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
-
        std::optional<ActivationOperatorDesc> fusedActivation = FusionHelpers::TryGetFusedActivationDesc(kernelInfo);
        DML_OPERATOR_DESC fusedActivationDmlDesc = fusedActivation ? fusedActivation->GetDmlDesc() : DML_OPERATOR_DESC();
        std::vector<DML_TENSOR_DESC> inputDescs = GetDmlInputDescs();
@ -95,20 +131,21 @@ public:
 };

 // A specific type of operation for registration.
-template <DML_CONVOLUTION_MODE Mode, DML_CONVOLUTION_DIRECTION Direction, bool hasDynamicPads = false>
+template <DML_CONVOLUTION_MODE Mode, DML_CONVOLUTION_DIRECTION Direction, bool hasDynamicPads = false, bool isNhwc = false>
 class DmlOperatorConvolutionTemplate : public DmlOperatorConvolution
 {
 public:
    DmlOperatorConvolutionTemplate(const MLOperatorKernelCreationContext& kernelInfo)
-    :   DmlOperatorConvolution(kernelInfo, Mode, Direction, hasDynamicPads)
+    :   DmlOperatorConvolution(kernelInfo, Mode, Direction, hasDynamicPads, isNhwc)
    {
    }
 };

 DML_OP_DEFINE_CREATION_FUNCTION(Conv,                           DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD>);
+DML_OP_DEFINE_CREATION_FUNCTION(NhwcConv,                       DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD, false, true>);
 DML_OP_DEFINE_CREATION_FUNCTION(ConvTranspose,                  DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD>);
-DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConv,                      DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD>);
-DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConvTranspose,             DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD>);
+DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConv,                   DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_FORWARD>);
+DML_OP_DEFINE_CREATION_FUNCTION(DmlFusedConvTranspose,          DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD>);
 DML_OP_DEFINE_CREATION_FUNCTION(ConvTransposeWithDynamicPads,   DmlOperatorConvolutionTemplate<DML_CONVOLUTION_MODE_CROSS_CORRELATION, DML_CONVOLUTION_DIRECTION_BACKWARD, true>);

 } // namespace Dml
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConv.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/DmlOperatorQLinearConv.cpp
@ -9,19 +9,19 @@ namespace Dml
 class DmlOperatorQLinearConv : public DmlOperator, public ConvolutionHelperBase
 {
 private:
-    enum InputTensors 
-    { 
-        IN_X, 
+    enum InputTensors
+    {
+        IN_X,
        IN_X_SCALE,
-        IN_X_ZERO_POINT, 
-        IN_F, 
+        IN_X_ZERO_POINT,
+        IN_F,
        IN_F_SCALE,
-        IN_F_ZERO_POINT, 
+        IN_F_ZERO_POINT,
        IN_BIAS,
        IN_Y_SCALE,
        IN_Y_ZERO_POINT
    };
-    
+
 public:
    using Self = DmlOperatorQLinearConv;

@ -29,15 +29,15 @@ public:
        const MLOperatorKernelCreationContext& kernelInfo
        )
    :   DmlOperator(kernelInfo),
-        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, 0, 3)
+        ConvolutionHelperBase(kernelInfo, kernelInfo.GetTensorShapeDescription(), false, false, false, 0, 3)
    {
        std::vector<std::optional<uint32_t>> kernelInputIndices = {0, 1, 2, 3, 4, 5, 8, 6, 7};
        std::vector<std::optional<uint32_t>> kernelOutputIndices = {0};

        DmlOperator::Initialize(kernelInfo, kernelInputIndices);

-        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the 
-        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's), 
+        // DirectML is limited to handle only 2D. So for 1D tensors, massage the tensor descriptions. By default, the
+        // TensorDesc simply right aligns all the values up to 4D (padding the leading dimensions with 1's),
        // but 1D tensors actually need to insert the 1 between C and W. e.g. [2,3,4] becomes [2,3,1,4]
        m_inputTensorDescs[IN_X] = CreateTensorDescFromInput(kernelInfo, 0/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
        m_inputTensorDescs[IN_F] = CreateTensorDescFromInput(kernelInfo, 3/*Onnx Index*/, TensorAxis::DoNotCoerce, TensorAxis::NoPlacementAdjustment, NonspatialDimensionCount, std::nullopt);
@ -56,9 +56,9 @@ public:
            // Resize the bias to be the same dimension as the input tensor.
            // The 1D tensor needs to be moved to the C channel.
            m_inputTensorDescs[IN_BIAS] = CreateTensorDescFromInput(
-                kernelInfo, 
-                8/*Onnx Index*/, 
-                TensorAxis::DoNotCoerce, 
+                kernelInfo,
+                8/*Onnx Index*/,
+                TensorAxis::DoNotCoerce,
                TensorAxis::C,
                TensorAxis::LeftAligned,
                std::nullopt,
@ -69,9 +69,9 @@ public:
        // Resize the Filter ZeroPoint to be the same dimension as the input tensor.
        // The 1D tensor needs to be moved to the C channel.
        m_inputTensorDescs[IN_F_ZERO_POINT] = CreateTensorDescFromInput(
-            kernelInfo, 
-            5/*Onnx Index*/, 
-            TensorAxis::DoNotCoerce, 
+            kernelInfo,
+            5/*Onnx Index*/,
+            TensorAxis::DoNotCoerce,
            TensorAxis::C,
            TensorAxis::LeftAligned,
            std::nullopt,
@ -80,9 +80,9 @@ public:
        // Resize the Filter Scale to be the same dimension as the input tensor.
        // The 1D tensor needs to be moved to the C channel.
        m_inputTensorDescs[IN_F_SCALE] = CreateTensorDescFromInput(
-            kernelInfo, 
-            4/*Onnx Index*/, 
-            TensorAxis::DoNotCoerce, 
+            kernelInfo,
+            4/*Onnx Index*/,
+            TensorAxis::DoNotCoerce,
            TensorAxis::C,
            TensorAxis::LeftAligned,
            std::nullopt,
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp
@ -194,6 +194,7 @@ struct OperatorRegistrationInformation
 DML_OP_EXTERN_CREATION_FUNCTION(Copy);
 DML_OP_EXTERN_CREATION_FUNCTION(FC);
 DML_OP_EXTERN_CREATION_FUNCTION(Conv);
+DML_OP_EXTERN_CREATION_FUNCTION(NhwcConv);
 DML_OP_EXTERN_CREATION_FUNCTION(ConvTranspose);
 DML_OP_EXTERN_CREATION_FUNCTION(ConvTransposeWithDynamicPads);
 DML_OP_EXTERN_CREATION_FUNCTION(AveragePool);
@ -528,6 +529,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation
    // Deep Learning Standard Layers
    {REG_INFO(      7,  Conv,                               typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
    {REG_INFO(     11,  Conv,                               typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
+    {REG_INFO_MS(   1,  NhwcConv,                           typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
    {REG_INFO(      7,  ConvTranspose,                      typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
    {REG_INFO(     11,  ConvTranspose,                      typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
    {REG_INFO(      7,  AveragePool,                        typeNameListDefault,            supportedTypeListFloat16to32,           DmlGraphSupport::Supported)},
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
@ -292,11 +292,12 @@ namespace OperatorHelper
    // are ordered such that they are at the end (e.g. NCHW or NCDHW).
    std::vector<DimensionType> InitializeKernelOutputDimensions(
        gsl::span<const DimensionType> inputDimensions,
-        const KernelArgs& args
+        const KernelArgs& args,
+        bool isNhwc
    )
    {
        ML_CHECK_VALID_ARGUMENT(gsl::narrow_cast<uint32_t>(inputDimensions.size()) >= args.spatialDimensionCount);
-        int dimOffset = gsl::narrow_cast<int>(inputDimensions.size()) - args.spatialDimensionCount;
+        int dimOffset = isNhwc ? 1 : gsl::narrow_cast<int>(inputDimensions.size()) - args.spatialDimensionCount;

        std::vector<DimensionType> outputDimensions(inputDimensions.begin(), inputDimensions.end());

@ -478,7 +479,8 @@ namespace OperatorHelper

    void ResolveAutoPadding(
        KernelArgs& args,
-        gsl::span<const DimensionType> inputDimensions
+        gsl::span<const DimensionType> inputDimensions,
+        bool isNhwc
    )
    {
        if (!args.autoPad)
@ -490,7 +492,9 @@ namespace OperatorHelper
        uint32_t spatialDimensionCount = gsl::narrow_cast<uint32_t>(inputDimensions.size()) - NonspatialDimensionCount;
        ML_CHECK_VALID_ARGUMENT(spatialDimensionCount <= NcdhwSpatialDimensionCount); // Support up to 3D convolution (in 5D tensor).

-        const int dimOffset = gsl::narrow_cast<int>(inputDimensions.size()) - spatialDimensionCount;
+        ML_CHECK_VALID_ARGUMENT(!isNhwc || inputDimensions.size() == 4);
+
+        const int dimOffset = isNhwc ? 1 : gsl::narrow_cast<int>(inputDimensions.size()) - spatialDimensionCount;

        for (size_t dim = 0; dim < spatialDimensionCount; ++dim)
        {
@ -763,8 +767,16 @@ namespace OperatorHelper
        ResolvingPadding(inputDimensions);

        m_outputShapes.resize(1);
-        m_outputShapes[0] = InitializeKernelOutputDimensions(inputDimensions, m_kernel);
-        m_outputShapes[0].GetShape()[C] = filterDims[K];
+        m_outputShapes[0] = InitializeKernelOutputDimensions(inputDimensions, m_kernel, m_isNhwc);
+
+        if (m_isNhwc)
+        {
+            m_outputShapes[0].GetShape()[static_cast<uint32_t>(NhwcInputDims::C)] = filterDims[K];
+        }
+        else
+        {
+            m_outputShapes[0].GetShape()[C] = filterDims[K];
+        }
    }

    void ConvolutionHelperBase::InitializeKernelAndShapesTransposed(
@ -868,7 +880,7 @@ namespace OperatorHelper

    void ConvolutionHelperBase::ResolvingPadding(gsl::span<const DimensionType> inputDimensions)
    {
-        ResolveAutoPadding(m_kernel, inputDimensions);
+        ResolveAutoPadding(m_kernel, inputDimensions, m_isNhwc);
    }

    std::vector<EdgeShapes> GemmHelper::GetOutputShapes(const MLShapeInferenceContext& shapeInfo) const
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.h
@ -204,7 +204,8 @@ struct KernelArgs

 std::vector<DimensionType> InitializeKernelOutputDimensions(
    gsl::span<const DimensionType> inputDimensions,
-    const KernelArgs& args);
+    const KernelArgs& args,
+    bool isNhwc = false);

 std::vector<DimensionType> InitializeKernelOutputDimsTranspose(
    gsl::span<const DimensionType> inputDimensions,
@ -219,7 +220,8 @@ KernelArgs InitializeKernel(

 void ResolveAutoPadding(
    KernelArgs& args,
-    gsl::span<const DimensionType> inputDimensions);
+    gsl::span<const DimensionType> inputDimensions,
+    bool isNhwc = false);

 void MatMulShapeMapping(
    std::vector<DimensionType>& inputShape0,
@ -450,13 +452,15 @@ class ConvolutionHelperBase
 public:
    enum FilterDims { K };
    enum InputDims { N, C, H, W };
+    enum class NhwcInputDims { N, H, W, C };

 public:
    // Info_t is used to obtain attributes which will be used for calculating the output shape later.
    template<typename Info_t, typename Shape_t>
-    ConvolutionHelperBase(const Info_t& info, const Shape_t& shape, bool transpose, bool hasDynamicPads, uint32_t inputTensorIndex, uint32_t filterTensorIndex) :
+    ConvolutionHelperBase(const Info_t& info, const Shape_t& shape, bool transpose, bool hasDynamicPads, bool isNhwc, uint32_t inputTensorIndex, uint32_t filterTensorIndex) :
        m_inputTensorIndex(inputTensorIndex),
        m_filterTensorIndex(filterTensorIndex),
+        m_isNhwc(isNhwc),
        m_kernel(InitializeKernel(info, shape.GetInputTensorDimensionCount(inputTensorIndex), shape.GetInputTensorShape(filterTensorIndex)))
    {
        m_groupCount = info.template GetOptionalAttribute<uint32_t>(AttrName::Group, 1);
@ -487,6 +491,7 @@ protected:
    uint32_t m_groupCount;
    uint32_t m_inputTensorIndex;
    uint32_t m_filterTensorIndex;
+    bool m_isNhwc;
    KernelArgs m_kernel;
    std::vector<EdgeShapes> m_outputShapes;
 };
@ -495,28 +500,35 @@ class ConvHelper : public ConvolutionHelperBase
 {
 public:
    template<typename Info_t, typename Shape_t>
-    ConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, 0, 1) {}
+    ConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, false, 0, 1) {}
+};
+
+class NhwcConvHelper : public ConvolutionHelperBase
+{
+public:
+    template<typename Info_t, typename Shape_t>
+    NhwcConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, true, 0, 1) {}
 };

 class ConvTransposeHelper : public ConvolutionHelperBase
 {
 public:
    template<typename Info_t, typename Shape_t>
-    ConvTransposeHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, false, 0, 1) {}
+    ConvTransposeHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, false, false, 0, 1) {}
 };

 class ConvTransposeWithDynamicPadsHelper : public ConvolutionHelperBase
 {
 public:
    template<typename Info_t, typename Shape_t>
-    ConvTransposeWithDynamicPadsHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, true, 0, 1) {}
+    ConvTransposeWithDynamicPadsHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, true, true, false, 0, 1) {}
 };

 class QLinearConvHelper : public ConvolutionHelperBase
 {
 public:
    template<typename Info_t, typename Shape_t>
-    QLinearConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, 0, 3) {}
+    QLinearConvHelper(const Info_t& info, const Shape_t& shape) : ConvolutionHelperBase(info, shape, false, false, false, 0, 3) {}
 };

 class GemmHelper
@ -1416,6 +1428,7 @@ public:
 };

 using ShapeInferenceHelper_Conv = ConvHelper;
+using ShapeInferenceHelper_NhwcConv = NhwcConvHelper;
 using ShapeInferenceHelper_ConvTranspose = ConvTransposeHelper;
 using ShapeInferenceHelper_ConvTransposeWithDynamicPads = ConvTransposeWithDynamicPadsHelper;
 using ShapeInferenceHelper_ConvInteger = ConvHelper;
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h
@ -410,6 +410,7 @@ namespace OperatorHelper
        static const int sc_sinceVer_Attention = 1;
        static const int sc_sinceVer_SkipLayerNormalization = 1;
        static const int sc_sinceVer_EmbedLayerNormalization = 1;
+        static const int sc_sinceVer_NhwcConv = 1;
        static const int sc_sinceVer_BiasAdd = 1;
        static const int sc_sinceVer_QuickGelu = 1;
        static const int sc_sinceVer_GroupNorm = 1;
--- a/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
+++ b/onnxruntime/test/contrib_ops/nhwc_conv_op_test.cc
@ -32,8 +32,9 @@ void TestNhwcConvOp(const NhwcConvOpAndTestAttributes& attributes,
  int min_cuda_architecture = use_float16 ? 530 : 0;
  bool enable_cuda = HasCudaEnvironment(min_cuda_architecture);
  bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get());
+  bool enable_dml = (nullptr != DefaultDmlExecutionProvider().get());

-  if (enable_cuda || enable_rocm) {
+  if (enable_cuda || enable_rocm || enable_dml) {
    OpTester test("NhwcConv", 1, onnxruntime::kMSDomain);
    test.AddAttribute("group", attributes.group);
    test.AddAttribute("kernel_shape", attributes.kernel_shape);
@ -82,6 +83,10 @@ void TestNhwcConvOp(const NhwcConvOpAndTestAttributes& attributes,
      execution_providers.push_back(DefaultRocmExecutionProvider());
    }

+    if (enable_dml) {
+      execution_providers.push_back(DefaultDmlExecutionProvider());
+    }
+
    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
  }
 }