From d4341ea2de8c2ada0ddbfbf4acd577851d9cf897 Mon Sep 17 00:00:00 2001
From: Xiang Zhang <xianz@microsoft.com>
Date: Thu, 2 Jul 2020 01:24:12 +0000
Subject: [PATCH] Merged PR 4870266: Refactor fused graph kernel so dmlxp and
 ort share the same code

Related work items: #26719246
---
 .../src/FusedGraphKernel.cpp                  | 311 ++---------------
 .../src/GraphKernelHelper.cpp                 | 313 ++++++++++++++++++
 .../src/GraphKernelHelper.h                   |  67 ++++
 3 files changed, 412 insertions(+), 279 deletions(-)
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp
 create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
index 0b357e4f68..f3f4caab3f 100644
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -5,20 +5,12 @@
 
 #include "MLOperatorAuthorImpl.h"
 #include "FusedGraphKernel.h"
+#include "GraphKernelHelper.h"
 
 using namespace Windows::AI::MachineLearning::Adapter;
 
 namespace Dml
 {
-    template <typename T>
-    static T AlignToPow2(T offset, T alignment)
-    {
-        static_assert(std::is_unsigned_v<T>);
-        assert(alignment != 0);
-        assert((alignment & (alignment - 1)) == 0);
-        return (offset + alignment - 1) & ~(alignment - 1);
-    }
-
     class FusedGraphKernel : public onnxruntime::OpKernel
     {
     public:
@@ -73,37 +65,10 @@ namespace Dml
 
             const uint32_t graphInputCount = kernelInfo.GetInputCount();
 
-            auto gpuGraphInputConstnessGetter = [&kernelInfo, &fusedNodeInputDefs, &transferredInitializerMap](uint32_t index)
-            {
-                // Transferred initializers are uploaded to GPU memory
-                auto iter = transferredInitializerMap.find(fusedNodeInputDefs[index]->Name());
-                if (iter != transferredInitializerMap.end())
-                {
-                    return true;
-                }
-
-                // If an initializer wasn't transferred, the constant input may be available from ORT
-                const onnxruntime::Tensor* inputTensor = nullptr;
-                if (!kernelInfo.TryGetConstantInput(index, &inputTensor) || inputTensor == nullptr)
-                {
-                    return false;
-                }
-
-                // Check that the constant ORT input is in GPU memory
-                if (!strcmp(inputTensor->Location().name, onnxruntime::CPU) ||
-                    inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput ||
-                    inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput)
-                {
-                    return false;
-                }
-
-                return true;
-            };
-
             m_inputsConstant.resize(graphInputCount);
             for (uint32_t i = 0; i < graphInputCount; ++i)
             {
-                m_inputsConstant[i] = gpuGraphInputConstnessGetter(i);
+              m_inputsConstant[i] = GraphKernelHelper::GetGraphInputConstness(i, kernelInfo, fusedNodeInputDefs, transferredInitializerMap);
             }
 
             GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
@@ -118,116 +83,27 @@ namespace Dml
                 device.Get(),
                 m_executionHandle);
 
-            // Determine the last input which uses an initializer, so initializers can be freed incrementally
-            // while processing each input in order.
-            std::map<const onnx::TensorProto*, uint32_t> initializerToLastInputIndexMap;
-            for (uint32_t i = 0; i < graphInputCount; i++)
-            {
-                auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name());
-                if (iter != transferredInitializerMap.end())
-                {
-                    initializerToLastInputIndexMap[&iter->second] = i;
-                }
-		    }
-
-            // Walk through each graph edge and mark used inputs
-            m_inputsUsed.assign(graphInputCount, false);
-            for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges)
-            {
-                m_inputsUsed[edge.GraphInputIndex] = true;
-            }
-
             // Populate input bindings for operator initialization
-            std::vector<ComPtr<ID3D12Resource>> initInputResources; // For lifetime control
+            std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> initInputResources;  // For lifetime control
             std::vector<DML_BUFFER_BINDING> initInputBindings(graphInputCount);
             m_nonOwnedGraphInputsFromInitializers.resize(graphInputCount);
-            std::vector<ComPtr<ID3D12Resource>> initializeResourceRefs;
-
-            for (uint32_t i = 0; i < initInputBindings.size(); i++)
-            {
-                // If the input isn't actually used by the graph, nothing ever needs to be bound (either for
-                // initialization or execution). So just throw away the transferred initializer and skip this input.
-                if (!m_inputsUsed[i])
-                {
-                    transferredInitializerMap.erase(fusedNodeInputDefs[i]->Name());
-                    continue;
-                }
-
-                // Look for the initializer among those transferred from the graph during partitioning
-                auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name());
-                if (iter != transferredInitializerMap.end())
-                {
-                    std::byte* tensorPtr = nullptr;
-                    size_t tensorByteSize = 0;
-                    std::unique_ptr<std::byte[]> unpackedTensor;
-
-                    auto& initializer = iter->second;
-
-                    // The tensor may be stored as raw data or in typed fields.
-                    if (initializer.has_raw_data())
-                    {
-                        tensorPtr = (std::byte*)(initializer.raw_data().c_str());
-                        tensorByteSize = initializer.raw_data().size();
-                    }
-                    else
-                    {
-                        std::tie(unpackedTensor, tensorByteSize) = UnpackTensor(initializer);
-                        tensorPtr = unpackedTensor.get(); 
-                    }
-
-                    // Tensor sizes in DML must be a multiple of 4 bytes large.
-                    tensorByteSize = AlignToPow2<size_t>(tensorByteSize, 4);
-
-                    if (!m_inputsConstant[i])
-                    {
-                        // Store the resource to use during execution
-                        ComPtr<ID3D12Resource> defaultBuffer = CreateResource(tensorPtr, tensorByteSize);
-                        m_nonOwnedGraphInputsFromInitializers[i] = defaultBuffer;
-                        initializeResourceRefs.push_back(std::move(defaultBuffer));
-                    }
-                    else
-                    {
-                        ComPtr<ID3D12Resource> initializeInputBuffer;
-
-                        // D3D_FEATURE_LEVEL_1_0_CORE doesn't support Custom heaps
-                        if (m_provider->IsMcdmDevice())
-                        {
-                            initializeInputBuffer = CreateResource(tensorPtr, tensorByteSize);
-                        }
-                        else
-                        {
-                            initializeInputBuffer = CreateCpuResource(tensorPtr, tensorByteSize);
-                        }
-
-                        // Set the binding for operator initialization to the buffer
-                        initInputBindings[i].Buffer = initializeInputBuffer.Get();
-                        initInputBindings[i].SizeInBytes = tensorByteSize;
-                        initializeResourceRefs.push_back(std::move(initializeInputBuffer));
-                    }
-
-                    // Free the initializer if this is the last usage of it.
-                    if (initializerToLastInputIndexMap[&initializer] == i)
-                    {
-                        transferredInitializerMap.erase(iter);
-                    }
-                }
-                else if (m_inputsConstant[i])
-                {                
-                    const onnxruntime::Tensor* inputTensor = nullptr;
-                    THROW_HR_IF(E_UNEXPECTED, !kernelInfo.TryGetConstantInput(i, &inputTensor));
-
-                    uint64_t allocId;
-                    UnwrapTensor(inputTensor, &initInputBindings[i].Buffer, &allocId);
-                    initInputBindings[i].SizeInBytes = initInputBindings[i].Buffer->GetDesc().Width;
-
-                    initInputBindings[i].Buffer->Release(); // Avoid holding an additional reference
-                    initInputResources.push_back(initInputBindings[i].Buffer);
-                }
-            }
-
-            // All initializers should have been consumed and freed above
-            assert(transferredInitializerMap.empty());
+            std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> initializeResourceRefs;
+            
+            GraphKernelHelper::PopulateInputBindings(
+                m_provider.Get(),
+                m_winmlProvider.Get(),
+                m_inputsConstant,
+                kernelInfo,
+                graphDesc,
+                fusedNodeInputDefs,
+                m_inputsUsed,
+                initInputBindings,
+                initInputResources,
+                m_nonOwnedGraphInputsFromInitializers,
+                initializeResourceRefs,
+                transferredInitializerMap);
 
+            DML_GRAPH_DESC dmlGraphDesc = {};
             std::vector<DML_OPERATOR_GRAPH_NODE_DESC> dmlOperatorGraphNodes(graphDesc.nodes.size());
             std::vector<DML_GRAPH_NODE_DESC> dmlGraphNodes(graphDesc.nodes.size());
 
@@ -235,38 +111,15 @@ namespace Dml
             std::vector<DML_GRAPH_EDGE_DESC> dmlOutputEdges(graphDesc.outputEdges.size());
             std::vector<DML_GRAPH_EDGE_DESC> dmlIntermediateEdges(graphDesc.intermediateEdges.size());
 
-            for (size_t i = 0; i < graphDesc.nodes.size(); ++i)
-            {
-                dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{ graphDesc.nodes[i].op.Get() };
-                dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{ DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i] };
-            }
-
-            for (size_t i = 0; i < graphDesc.inputEdges.size(); ++i)
-            {
-                dmlInputEdges[i] = DML_GRAPH_EDGE_DESC{ DML_GRAPH_EDGE_TYPE_INPUT, &graphDesc.inputEdges[i] };
-            }
-
-            for (size_t i = 0; i < graphDesc.outputEdges.size(); ++i)
-            {
-                dmlOutputEdges[i] = DML_GRAPH_EDGE_DESC{ DML_GRAPH_EDGE_TYPE_OUTPUT, &graphDesc.outputEdges[i] };
-            }
-
-            for (size_t i = 0; i < graphDesc.intermediateEdges.size(); ++i)
-            {
-                dmlIntermediateEdges[i] = DML_GRAPH_EDGE_DESC{ DML_GRAPH_EDGE_TYPE_INTERMEDIATE, &graphDesc.intermediateEdges[i] };
-            }
-
-            DML_GRAPH_DESC dmlGraphDesc = {};
-            dmlGraphDesc.InputCount = graphInputCount;
-            dmlGraphDesc.OutputCount = kernelInfo.GetOutputCount();
-            dmlGraphDesc.NodeCount = gsl::narrow_cast<uint32_t>(dmlGraphNodes.size());
-            dmlGraphDesc.Nodes = dmlGraphNodes.data();
-            dmlGraphDesc.InputEdgeCount = gsl::narrow_cast<uint32_t>(dmlInputEdges.size());
-            dmlGraphDesc.InputEdges = dmlInputEdges.data();
-            dmlGraphDesc.OutputEdgeCount = gsl::narrow_cast<uint32_t>(dmlOutputEdges.size());
-            dmlGraphDesc.OutputEdges = dmlOutputEdges.data();
-            dmlGraphDesc.IntermediateEdgeCount = gsl::narrow_cast<uint32_t>(dmlIntermediateEdges.size());
-            dmlGraphDesc.IntermediateEdges = dmlIntermediateEdges.data();
+            GraphKernelHelper::ConvertGraphDesc(
+                graphDesc, 
+                dmlGraphDesc, 
+                kernelInfo,
+                dmlOperatorGraphNodes,
+                dmlGraphNodes,
+                dmlInputEdges,
+                dmlOutputEdges,
+                dmlIntermediateEdges);
 
             DML_EXECUTION_FLAGS executionFlags = DML_EXECUTION_FLAG_NONE;
             if (graphDesc.reuseCommandList)
@@ -534,10 +387,10 @@ namespace Dml
                         const onnxruntime::Tensor* tensor = kernelContext->Input<onnxruntime::Tensor>(i);
 
                         uint64_t allocId;
-                        UnwrapTensor(tensor, &inputBindings[i].Buffer, &allocId);
+                        GraphKernelHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &inputBindings[i].Buffer, &allocId);
                         inputBindingsChanged = inputBindingsChanged || (!allocId || m_inputBindingAllocIds[i] != allocId);
                         inputBindings[i].Buffer->Release(); // Avoid holding an additional reference
-                        inputBindings[i].SizeInBytes = AlignToPow2<size_t>(tensor->SizeInBytes(), 4);
+                        inputBindings[i].SizeInBytes = GraphKernelHelper::AlignToPow2<size_t>(tensor->SizeInBytes(), 4);
                         inputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &inputBindings[i]};
                         m_inputBindingAllocIds[i] = allocId;
                     }
@@ -571,10 +424,10 @@ namespace Dml
                     );
 
                 uint64_t allocId;
-                UnwrapTensor(tensor, &outputBindings[i].Buffer, &allocId);
+                GraphKernelHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &outputBindings[i].Buffer, &allocId);
                 outputBindingsChanged = outputBindingsChanged || (!allocId || m_outputBindingAllocIds[i] != allocId);
                 outputBindings[i].Buffer->Release(); // Avoid holding an additional reference
-                outputBindings[i].SizeInBytes = AlignToPow2<size_t>(tensor->SizeInBytes(), 4);
+                outputBindings[i].SizeInBytes = GraphKernelHelper::AlignToPow2<size_t>(tensor->SizeInBytes(), 4);
                 outputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &outputBindings[i]};
                 m_outputBindingAllocIds[i] = allocId;
             }
@@ -624,106 +477,6 @@ namespace Dml
             m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get());
         }
 
-        void UnwrapTensor(const onnxruntime::Tensor* tensor, ID3D12Resource** resource, uint64_t* allocId) const
-        {
-            IUnknown* allocationUnk = static_cast<IUnknown*>(const_cast<void*>(tensor->DataRaw()));
-            ComPtr<IUnknown> resourceUnk;
-            m_winmlProvider->GetABIDataInterface(false, allocationUnk, &resourceUnk);
-
-            *allocId = m_winmlProvider->TryGetPooledAllocationId(allocationUnk, 0);
-
-            THROW_IF_FAILED(resourceUnk->QueryInterface(resource));
-        }
-
-        ComPtr<ID3D12Resource> CreateResource(const std::byte* tensorPtr, size_t tensorByteSize) const
-        {
-            ComPtr<ID3D12Resource> buffer;
-
-            D3D12_HEAP_PROPERTIES heapProperties = {
-                D3D12_HEAP_TYPE_DEFAULT,
-                D3D12_CPU_PAGE_PROPERTY_UNKNOWN,
-                D3D12_MEMORY_POOL_UNKNOWN,
-                0,
-                0
-            };
-
-            D3D12_RESOURCE_DESC resourceDesc = {
-                D3D12_RESOURCE_DIMENSION_BUFFER,
-                0,
-                static_cast<uint64_t>((tensorByteSize + 3) & ~3),
-                1,
-                1,
-                1,
-                DXGI_FORMAT_UNKNOWN,
-                { 1, 0 },
-                D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
-                D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS
-            };
-
-            ComPtr<ID3D12Device> d3dDevice;
-            THROW_IF_FAILED(m_provider->GetD3DDevice(d3dDevice.GetAddressOf()));
-
-            THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
-                &heapProperties,
-                D3D12_HEAP_FLAG_NONE,
-                &resourceDesc,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                nullptr,
-                IID_PPV_ARGS(buffer.GetAddressOf())
-            ));
-
-            THROW_IF_FAILED(m_provider->UploadToResource(buffer.Get(), tensorPtr, tensorByteSize));
-
-            return buffer;
-        }
-
-        ComPtr<ID3D12Resource> CreateCpuResource(const std::byte* tensorPtr, size_t tensorByteSize) const
-        {
-            ComPtr<ID3D12Resource> buffer;
-
-            D3D12_HEAP_PROPERTIES heapProperties = {
-                D3D12_HEAP_TYPE_CUSTOM,
-                D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE,
-                D3D12_MEMORY_POOL_L0,
-                0,
-                0
-            };
-
-            D3D12_RESOURCE_DESC resourceDesc = {
-                D3D12_RESOURCE_DIMENSION_BUFFER,
-                0,
-                static_cast<uint64_t>((tensorByteSize + 3) & ~3),
-                1,
-                1,
-                1,
-                DXGI_FORMAT_UNKNOWN,
-                { 1, 0 },
-                D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
-                D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS
-            };
-
-            ComPtr<ID3D12Device> d3dDevice;
-            THROW_IF_FAILED(m_provider->GetD3DDevice(d3dDevice.GetAddressOf()));
-
-            THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
-                &heapProperties,
-                D3D12_HEAP_FLAG_NONE,
-                &resourceDesc,
-                D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
-                nullptr,
-                IID_PPV_ARGS(buffer.GetAddressOf())
-            ));
-
-            // Map the buffer and copy the data
-            void* bufferData = nullptr;
-            D3D12_RANGE range = {0, tensorByteSize};
-            THROW_IF_FAILED(buffer->Map(0, &range, &bufferData));
-            memcpy(bufferData, tensorPtr, tensorByteSize);
-            buffer->Unmap(0, &range);
-
-            return buffer;
-        }
-
         ComPtr<IDMLCompiledOperator> m_compiledExecutionPlanOperator;
         std::vector<bool> m_inputsUsed;
         const void* m_executionHandle = nullptr;
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp
new file mode 100644
index 0000000000..fa9792b08c
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp
@@ -0,0 +1,313 @@
+#include "precomp.h"
+
+#include "GraphKernelHelper.h"
+
+namespace Dml
+{
+namespace GraphKernelHelper 
+{
+    Microsoft::WRL::ComPtr<ID3D12Resource>
+    CreateResource(
+        Dml::IExecutionProvider* provider,
+        const std::byte* tensorPtr,
+        size_t tensorByteSize)
+    {
+        Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
+
+        D3D12_HEAP_PROPERTIES heapProperties = {
+            D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
+
+        D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER,
+                                            0,
+                                            static_cast<uint64_t>((tensorByteSize + 3) & ~3),
+                                            1,
+                                            1,
+                                            1,
+                                            DXGI_FORMAT_UNKNOWN,
+                                            {1, 0},
+                                            D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
+                                            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
+
+        Microsoft::WRL::ComPtr<ID3D12Device> d3dDevice;
+        THROW_IF_FAILED(provider->GetD3DDevice(d3dDevice.GetAddressOf()));
+
+        THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
+            &heapProperties,
+            D3D12_HEAP_FLAG_NONE,
+            &resourceDesc,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            nullptr,
+            IID_PPV_ARGS(buffer.GetAddressOf())));
+
+        THROW_IF_FAILED(provider->UploadToResource(buffer.Get(), tensorPtr, tensorByteSize));
+
+        return buffer;
+    }
+
+    Microsoft::WRL::ComPtr<ID3D12Resource>
+    CreateCpuResource(
+        Dml::IExecutionProvider* provider,
+        const std::byte* tensorPtr,
+        size_t tensorByteSize)
+    {
+        Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
+
+        D3D12_HEAP_PROPERTIES heapProperties = {
+            D3D12_HEAP_TYPE_CUSTOM, D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE, D3D12_MEMORY_POOL_L0, 0, 0};
+
+        D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER,
+                                            0,
+                                            static_cast<uint64_t>((tensorByteSize + 3) & ~3),
+                                            1,
+                                            1,
+                                            1,
+                                            DXGI_FORMAT_UNKNOWN,
+                                            {1, 0},
+                                            D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
+                                            D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS};
+
+        Microsoft::WRL::ComPtr<ID3D12Device> d3dDevice;
+        THROW_IF_FAILED(provider->GetD3DDevice(d3dDevice.GetAddressOf()));
+
+        THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
+            &heapProperties,
+            D3D12_HEAP_FLAG_NONE,
+            &resourceDesc,
+            D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
+            nullptr,
+            IID_PPV_ARGS(buffer.GetAddressOf())));
+
+        // Map the buffer and copy the data
+        void* bufferData = nullptr;
+        D3D12_RANGE range = {0, tensorByteSize};
+        THROW_IF_FAILED(buffer->Map(0, &range, &bufferData));
+        memcpy(bufferData, tensorPtr, tensorByteSize);
+        buffer->Unmap(0, &range);
+
+        return buffer;
+    }
+
+    void UnwrapTensor(
+        IWinmlExecutionProvider* winmlProvider,
+        const onnxruntime::Tensor* tensor,
+        ID3D12Resource** resource,
+        uint64_t* allocId) 
+    {
+        IUnknown* allocationUnk = static_cast<IUnknown*>(const_cast<void*>(tensor->DataRaw()));
+        Microsoft::WRL::ComPtr<IUnknown> resourceUnk;
+        winmlProvider->GetABIDataInterface(false, allocationUnk, &resourceUnk);
+
+        *allocId = winmlProvider->TryGetPooledAllocationId(allocationUnk, 0);
+
+        THROW_IF_FAILED(resourceUnk->QueryInterface(resource));
+    }
+
+    bool GetGraphInputConstness(
+        uint32_t index,
+        const onnxruntime::OpKernelInfo& kernelInfo,
+        const onnxruntime::ConstPointerContainer<std::vector<onnxruntime::NodeArg*>>& fusedNodeInputDefs,
+        const std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap) 
+    {
+        // Transferred initializers are uploaded to GPU memory
+        auto iter = transferredInitializerMap.find(fusedNodeInputDefs[index]->Name());
+        if (iter != transferredInitializerMap.end())
+        {
+            return true;
+        }
+
+        // If an initializer wasn't transferred, the constant input may be available from ORT
+        const onnxruntime::Tensor* inputTensor = nullptr;
+        if (!kernelInfo.TryGetConstantInput(index, &inputTensor) || inputTensor == nullptr)
+        {
+            return false;
+        }
+
+        // Check that the constant ORT input is in GPU memory
+        if (!strcmp(inputTensor->Location().name, onnxruntime::CPU) ||
+            inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput ||
+            inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput)
+        {
+            return false;
+        }
+
+        return true;
+    };
+
+    std::vector<std::vector<std::byte>> PopulateInputBindings(
+        Dml::IExecutionProvider* provider,
+        IWinmlExecutionProvider* winmlProvider,
+        const std::vector<uint8_t>& inputsConstant,
+        const onnxruntime::OpKernelInfo& kernelInfo,
+        const Dml::GraphDescBuilder::GraphDesc& graphDesc,
+        const onnxruntime::ConstPointerContainer<std::vector<onnxruntime::NodeArg*>>& fusedNodeInputDefs,
+        _Out_ std::vector<bool>& inputsUsed,
+        _Out_ std::vector<DML_BUFFER_BINDING>& initInputBindings,
+        _Out_ std::vector<ComPtr<ID3D12Resource>>& initInputResources,
+        _Out_ std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
+        _Out_ std::vector<ComPtr<ID3D12Resource>>& initializeResourceRefs,
+        _Inout_ std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap)
+    {
+        std::vector<std::vector<std::byte>> inputRawData;
+
+        const uint32_t graphInputCount = kernelInfo.GetInputCount();
+        // Determine the last input which uses an initializer, so initializers can be freed incrementally
+        // while processing each input in order.
+        std::map<const onnx::TensorProto*, uint32_t> initializerToLastInputIndexMap;
+        for (uint32_t i = 0; i < graphInputCount; i++) 
+        {
+            auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name());
+            if (iter != transferredInitializerMap.end()) {
+                initializerToLastInputIndexMap[&iter->second] = i;
+            }
+        }
+
+        // Walk through each graph edge and mark used inputs
+        inputsUsed.assign(graphInputCount, false);
+        for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges) {
+            inputsUsed[edge.GraphInputIndex] = true;
+        }
+        for (uint32_t i = 0; i < initInputBindings.size(); i++)
+        {
+            // If the input isn't actually used by the graph, nothing ever needs to be bound (either for
+            // initialization or execution). So just throw away the transferred initializer and skip this input.
+            if (!inputsUsed[i])
+            {
+                transferredInitializerMap.erase(fusedNodeInputDefs[i]->Name());
+                inputRawData.push_back(std::vector<std::byte>());
+                continue;
+            }
+
+            // Look for the initializer among those transferred from the graph during partitioning
+            auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name());
+            if (iter != transferredInitializerMap.end())
+            {
+                std::byte* tensorPtr = nullptr;
+                size_t tensorByteSize = 0;
+                std::unique_ptr<std::byte[]> unpackedTensor;
+
+                auto& initializer = iter->second;
+
+                // The tensor may be stored as raw data or in typed fields.
+                if (initializer.has_raw_data())
+                {
+                    tensorPtr = (std::byte*)(initializer.raw_data().c_str());
+                    tensorByteSize = initializer.raw_data().size();
+                }
+                else
+                {
+                    std::tie(unpackedTensor, tensorByteSize) = UnpackTensor(initializer);
+                    tensorPtr = unpackedTensor.get(); 
+                }
+
+                // Tensor sizes in DML must be a multiple of 4 bytes large.
+                tensorByteSize = AlignToPow2<size_t>(tensorByteSize, 4);
+
+                inputRawData.push_back(std::vector<std::byte>(tensorPtr, tensorPtr + tensorByteSize));
+
+                if (!inputsConstant[i])
+                {
+                    // Store the resource to use during execution
+                    ComPtr<ID3D12Resource> defaultBuffer = CreateResource(provider, tensorPtr, tensorByteSize);
+                    nonOwnedGraphInputsFromInitializers[i] = defaultBuffer;
+                    initializeResourceRefs.push_back(std::move(defaultBuffer));
+                }
+                else
+                {
+                    ComPtr<ID3D12Resource> initializeInputBuffer;
+
+                    // D3D_FEATURE_LEVEL_1_0_CORE doesn't support Custom heaps
+                    if (provider->IsMcdmDevice())
+                    {
+                        initializeInputBuffer = CreateResource(provider, tensorPtr, tensorByteSize);
+                    }
+                    else
+                    {
+                        initializeInputBuffer = CreateCpuResource(provider, tensorPtr, tensorByteSize);
+                    }
+
+                    // Set the binding for operator initialization to the buffer
+                    initInputBindings[i].Buffer = initializeInputBuffer.Get();
+                    initInputBindings[i].SizeInBytes = tensorByteSize;
+                    initializeResourceRefs.push_back(std::move(initializeInputBuffer));
+                }
+
+                // Free the initializer if this is the last usage of it.
+                if (initializerToLastInputIndexMap[&initializer] == i)
+                {
+                    transferredInitializerMap.erase(iter);
+                }
+            }
+            else if (inputsConstant[i])
+            {                
+                const onnxruntime::Tensor* inputTensor = nullptr;
+                THROW_HR_IF(E_UNEXPECTED, !kernelInfo.TryGetConstantInput(i, &inputTensor));
+
+                const std::byte* tensorData = reinterpret_cast<const std::byte*>(inputTensor->DataRaw());
+                inputRawData.push_back(
+                    std::vector<std::byte>(tensorData, tensorData + inputTensor->SizeInBytes()));
+
+                uint64_t allocId;
+                UnwrapTensor(winmlProvider, inputTensor, &initInputBindings[i].Buffer, &allocId);
+                initInputBindings[i].SizeInBytes = initInputBindings[i].Buffer->GetDesc().Width;
+
+                initInputBindings[i].Buffer->Release(); // Avoid holding an additional reference
+                initInputResources.push_back(initInputBindings[i].Buffer);
+            } 
+            else 
+            {
+                inputRawData.push_back(std::vector<std::byte>());
+            }
+        }
+
+        // All initializers should have been consumed and freed above
+        assert(transferredInitializerMap.empty());
+        return inputRawData;
+    }
+
+    void ConvertGraphDesc(
+        const Dml::GraphDescBuilder::GraphDesc& graphDesc,
+        _Out_ DML_GRAPH_DESC& dmlGraphDesc,
+        const onnxruntime::OpKernelInfo& kernelInfo,
+        _Out_ std::vector<DML_OPERATOR_GRAPH_NODE_DESC>& dmlOperatorGraphNodes,
+        _Out_ std::vector<DML_GRAPH_NODE_DESC>& dmlGraphNodes,
+        _Out_ std::vector<DML_GRAPH_EDGE_DESC>& dmlInputEdges,
+        _Out_ std::vector<DML_GRAPH_EDGE_DESC>& dmlOutputEdges,
+        _Out_ std::vector<DML_GRAPH_EDGE_DESC>& dmlIntermediateEdges)
+    {
+        const uint32_t graphInputCount = kernelInfo.GetInputCount();
+
+        for (size_t i = 0; i < graphDesc.nodes.size(); ++i)
+        {
+            dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{graphDesc.nodes[i].op.Get()};
+            dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i]};
+        }
+
+        for (size_t i = 0; i < graphDesc.inputEdges.size(); ++i)
+        {
+            dmlInputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INPUT, &graphDesc.inputEdges[i]};
+        }
+
+        for (size_t i = 0; i < graphDesc.outputEdges.size(); ++i)
+        {
+            dmlOutputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_OUTPUT, &graphDesc.outputEdges[i]};
+        }
+
+        for (size_t i = 0; i < graphDesc.intermediateEdges.size(); ++i)
+        {
+            dmlIntermediateEdges[i] =
+                DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, &graphDesc.intermediateEdges[i]};
+        }
+
+        dmlGraphDesc.InputCount = graphInputCount;
+        dmlGraphDesc.OutputCount = kernelInfo.GetOutputCount();
+        dmlGraphDesc.NodeCount = gsl::narrow_cast<uint32_t>(dmlGraphNodes.size());
+        dmlGraphDesc.Nodes = dmlGraphNodes.data();
+        dmlGraphDesc.InputEdgeCount = gsl::narrow_cast<uint32_t>(dmlInputEdges.size());
+        dmlGraphDesc.InputEdges = dmlInputEdges.data();
+        dmlGraphDesc.OutputEdgeCount = gsl::narrow_cast<uint32_t>(dmlOutputEdges.size());
+        dmlGraphDesc.OutputEdges = dmlOutputEdges.data();
+        dmlGraphDesc.IntermediateEdgeCount = gsl::narrow_cast<uint32_t>(dmlIntermediateEdges.size());
+        dmlGraphDesc.IntermediateEdges = dmlIntermediateEdges.data();
+    }
+}  // namespace GraphKernelHelper
+}  // namespace Dml
\ No newline at end of file
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h
new file mode 100644
index 0000000000..b1b2e87cf8
--- /dev/null
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h
@@ -0,0 +1,67 @@
+#include "GraphDescBuilder.h"
+
+namespace Dml
+{
+namespace GraphKernelHelper 
+{
+    using namespace Windows::AI::MachineLearning::Adapter;
+
+    template <typename T>
+    static T AlignToPow2(T offset, T alignment)
+    {
+        static_assert(std::is_unsigned_v<T>);
+        assert(alignment != 0);
+        assert((alignment & (alignment - 1)) == 0);
+        return (offset + alignment - 1) & ~(alignment - 1);
+    }
+    
+    Microsoft::WRL::ComPtr<ID3D12Resource>
+    CreateResource(
+        Dml::IExecutionProvider* provider,
+        const std::byte* tensorPtr,
+        size_t tensorByteSize);
+
+    Microsoft::WRL::ComPtr<ID3D12Resource>
+    CreateCpuResource(
+        Dml::IExecutionProvider* provider,
+        const std::byte* tensorPtr,
+        size_t tensorByteSize);
+
+    void UnwrapTensor(
+        IWinmlExecutionProvider* winmlProvider,
+        const onnxruntime::Tensor* tensor,
+        ID3D12Resource** resource,
+        uint64_t* allocId);
+
+    bool GetGraphInputConstness(
+        uint32_t index,
+        const onnxruntime::OpKernelInfo& kernelInfo,
+        const onnxruntime::ConstPointerContainer<std::vector<onnxruntime::NodeArg*>>& fusedNodeInputDefs,
+        const std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap);
+
+    std::vector<std::vector<std::byte>> PopulateInputBindings(
+        Dml::IExecutionProvider* provider,
+        IWinmlExecutionProvider* winmlProvider,
+        const std::vector<uint8_t>& inputsConstant,
+        const onnxruntime::OpKernelInfo& kernelInfo,
+        const Dml::GraphDescBuilder::GraphDesc& graphDesc,
+        const onnxruntime::ConstPointerContainer<std::vector<onnxruntime::NodeArg*>>& fusedNodeInputDefs,
+        _Out_ std::vector<bool>& inputsUsed,
+        _Out_ std::vector<DML_BUFFER_BINDING>& initInputBindings,
+        _Out_ std::vector<ComPtr<ID3D12Resource>>& initInputResources,
+        _Out_ std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
+        _Out_ std::vector<ComPtr<ID3D12Resource>>& initializeResourceRefs,
+        _Inout_ std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap);
+
+    void ConvertGraphDesc(
+        const Dml::GraphDescBuilder::GraphDesc& graphDesc,
+        _Out_ DML_GRAPH_DESC& dmlGraphDesc,
+        const onnxruntime::OpKernelInfo& kernelInfo,
+        _Out_ std::vector<DML_OPERATOR_GRAPH_NODE_DESC>& dmlOperatorGraphNodes,
+        _Out_ std::vector<DML_GRAPH_NODE_DESC>& dmlGraphNodes,
+        _Out_ std::vector<DML_GRAPH_EDGE_DESC>& dmlInputEdges,
+        _Out_ std::vector<DML_GRAPH_EDGE_DESC>& dmlOutputEdges,
+        _Out_ std::vector<DML_GRAPH_EDGE_DESC>& dmlIntermediateEdges);
+    
+}  // namespace GraphKernelHelper
+}  // namespace Dml
\ No newline at end of file