[DML EP] Split fused kernels when the persistent resource is too big (#16780)

The approach is the following: 1. Build partitions. 2. Try compiling each partition into an `IDMLCompiledOperator`. 3. If the compiled operator's persistent resource is bigger than 4GB, tell the partitioner to split the partition in the middle and try again. 4. Once all partitions have been successfully compiled into an `IDMLCompiledOperator`, fuse each partition into an ORT operator and register them all. This change is conceptually simple (essentially a retry mechanism), but it required a lot of refactoring to make sure that we don't modify the graph until **all** partitions have been compiled successfully, because partially modifying the graph before knowing that every partition can be compiled would break future retries. This path is not expected to be used often, and even when it is, the loop is rarely expected to run more than twice. This is a very specific edge case for large models that were able to merge a large number of nodes into a single partition.
2026-05-14 20:48:00 +00:00 · 2023-08-09 19:53:15 -07:00 · 2023-08-09 19:53:15 -07:00 · 7201dbebe5
commit 7201dbebe5
parent e951f837e4
7 changed files with 293 additions and 196 deletions
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.cpp
@ -106,7 +106,7 @@ namespace DmlGraphFusionHelper
    void ProcessInputData(
        const ExecutionProviderImpl* providerImpl,
        const std::vector<uint8_t>& isInputsUploadedByDmlEP,
-        std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,
+        const std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,
        const gsl::span<const std::string> subGraphInputArgNames,
        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
        onnxruntime::Graph& graph,
@ -325,37 +325,60 @@ namespace DmlGraphFusionHelper
        dmlGraphDesc.IntermediateEdges = dmlIntermediateEdges.data();
    }

-    void CreateIDmlCompiledOperatorAndRegisterKernel(
-        onnxruntime::Graph& graph,
-        const onnxruntime::IndexedSubGraph& indexedSubGraph,
-        const onnxruntime::Node& fusedNode,
-        const std::unordered_map<std::string, GraphNodeProperties>& partitionNodePropsMap,
-        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
-        const ExecutionProviderImpl* providerImpl,
-        onnxruntime::KernelRegistry* registryForPartitionKernels)
+    onnxruntime::IndexedSubGraph CreateIndexedSubGraph(
+        GraphPartition* partition,
+        uint32_t partitionIndex,
+        const std::string& partitionKernelPrefix)
    {
-        // convert partitionONNXGraph into DML EP GraphDesc
-        const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
-        const uint32_t fusedNodeOutputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->outputs.size());
+        assert(partition->IsDmlGraphPartition());

-        std::vector<uint8_t> isInputsUploadedByDmlEP(fusedNodeInputCount);
-        for (uint32_t index = 0; index < fusedNodeInputCount; ++index)
+        onnxruntime::IndexedSubGraph indexedSubGraph;
+        // Create a definition for the node.  The name must be unique.
+        auto def = std::make_unique<onnxruntime::IndexedSubGraph::MetaDef>();
+        def->name = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_NAME_PREFIX + partitionKernelPrefix + std::to_string(partitionIndex);
+        def->domain = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_DOMAIN;
+        def->since_version = 1;
+        def->inputs.insert(def->inputs.begin(), partition->GetInputs().begin(), partition->GetInputs().end());
+        def->outputs.insert(def->outputs.begin(), partition->GetOutputs().begin(), partition->GetOutputs().end());
+
+        indexedSubGraph.SetMetaDef(std::move(def));
+        indexedSubGraph.nodes = std::move(partition->GetNodeIndices());
+
+        return indexedSubGraph;
+    }
+
+    std::unordered_map<std::string, GraphNodeProperties> CreatePartitionNodePropsMap(
+        const onnxruntime::Graph& graph,
+        const onnxruntime::IndexedSubGraph& indexedSubGraph,
+        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>&& graphNodePropertyMap)
+    {
+        // Populate properties which will be passed to OpKernel for this graph via the function below
+        std::unordered_map<std::string, GraphNodeProperties> partitionNodePropsMap;
+        for (auto nodeIndex : indexedSubGraph.nodes)
        {
-            auto iter = initializerNameToInitializerMap.find(indexedSubGraph.GetMetaDef()->inputs[index]);
-            isInputsUploadedByDmlEP[index] = iter != initializerNameToInitializerMap.end() ? true : false;
+            const onnxruntime::Node* node = graph.GetNode(nodeIndex);
+
+#ifdef PRINT_PARTITON_INFO
+            printf("Partition %u\t%s\n", partitionIndex, GraphDescBuilder::GetUniqueNodeName(*node).c_str());
+#endif
+            partitionNodePropsMap.insert(std::make_pair(
+                GraphDescBuilder::GetUniqueNodeName(*node), std::move(graphNodePropertyMap[node])));
        }

-        ComPtr<IDMLDevice> device;
-        ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf()));
-        GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
-            isInputsUploadedByDmlEP.data(),
-            isInputsUploadedByDmlEP.size(),
-            initializerNameToInitializerMap,
-            graph,
-            indexedSubGraph,
-            partitionNodePropsMap,
-            device.Get(),
-            providerImpl);
+#ifdef PRINT_PARTITON_INFO
+        printf("\n");
+#endif
+
+        return partitionNodePropsMap;
+    }
+
+    Microsoft::WRL::ComPtr<IDMLCompiledOperator> TryCreateCompiledOperator(
+        const GraphDescBuilder::GraphDesc& graphDesc,
+        const onnxruntime::IndexedSubGraph& indexedSubGraph,
+        const ExecutionProviderImpl* providerImpl)
+    {
+        const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
+        const uint32_t fusedNodeOutputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->outputs.size());

        // convert DML EP GraphDesc into DML_GRAPH_DESC and create IDMLCompiledOperator
        DML_GRAPH_DESC dmlGraphDesc = {};
@ -387,14 +410,42 @@ namespace DmlGraphFusionHelper
            executionFlags |= DML_EXECUTION_FLAG_DISABLE_META_COMMANDS;
        }

+        ComPtr<IDMLDevice> device;
+        ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf()));
+
        ComPtr<IDMLDevice1> device1;
        ORT_THROW_IF_FAILED(device.As(&device1));
+
        ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator;
        ORT_THROW_IF_FAILED(device1->CompileGraph(
            &dmlGraphDesc,
            executionFlags,
            IID_PPV_ARGS(&compiledExecutionPlanOperator)));

+        // UINT32_MAX is currently the maximum number of bytes allowed by D3D12 for the offset of a view over a resource
+        if (compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize > UINT32_MAX)
+        {
+            return nullptr;
+        }
+
+        return compiledExecutionPlanOperator;
+    }
+
+    void FusePartitionAndRegisterKernel(
+        onnxruntime::Graph& graph,
+        onnxruntime::KernelRegistry* registryForPartitionKernels,
+        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
+        const ExecutionProviderImpl* providerImpl,
+        const onnxruntime::IndexedSubGraph& indexedSubGraph,
+        std::vector<uint8_t>&& isInputsUploadedByDmlEP,
+        const GraphDescBuilder::GraphDesc& graphDesc,
+        Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator)
+    {
+        auto& fusedNode = graph.BeginFuseSubGraph(indexedSubGraph, indexedSubGraph.GetMetaDef()->name);
+        fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider);
+
+        const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
+
        // Populate input bindings for operator initialization
        std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> initializeResourceRefs; // For lifetime control
        std::vector<DML_BUFFER_BINDING> initInputBindings(fusedNodeInputCount);
@ -424,8 +475,8 @@ namespace DmlGraphFusionHelper
                                  nonOwnedGraphInputsFromInitializers,
                                  initializeResourceRefs,
                                  initInputBindings,
-                                  isInputsUploadedByDmlEP,
-                                  inputsUsed]
+                                  isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP),
+                                  inputsUsed = std::move(inputsUsed)]
                    (onnxruntime::FuncManager& func_mgr, const onnxruntime::OpKernelInfo& info, std::unique_ptr<onnxruntime::OpKernel>& out) mutable ->onnxruntime::Status
        {
            out.reset(CreateFusedGraphKernel(info,
@ -435,8 +486,8 @@ namespace DmlGraphFusionHelper
                                             nonOwnedGraphInputsFromInitializers,
                                             initializeResourceRefs,
                                             initInputBindings,
-                                             isInputsUploadedByDmlEP,
-                                             inputsUsed));
+                                             std::move(isInputsUploadedByDmlEP),
+                                             std::move(inputsUsed)));
            return Status::OK();
        };

@ -447,58 +498,7 @@ namespace DmlGraphFusionHelper
            .SinceVersion(indexedSubGraph.GetMetaDef()->since_version)
            .Provider(onnxruntime::kDmlExecutionProvider);
        ORT_THROW_IF_ERROR(registryForPartitionKernels->Register(builder, fused_kernel_func));
-    }

-    void FusePartitionAndRegisterKernel(
-        GraphPartition* partition,
-        uint32_t partitionIndex,
-        onnxruntime::Graph& graph,
-        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
-        onnxruntime::KernelRegistry* registryForPartitionKernels,
-        const std::string& partitionKernelPrefix,
-        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
-        const ExecutionProviderImpl* providerImpl)
-    {
-        assert(partition->IsDmlGraphPartition());
-
-        onnxruntime::IndexedSubGraph indexedSubGraph;
-        // Create a definition for the node.  The name must be unique.
-        auto def = std::make_unique<onnxruntime::IndexedSubGraph::MetaDef>();
-        def->name = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_NAME_PREFIX + partitionKernelPrefix + std::to_string(partitionIndex);
-        def->domain = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_DOMAIN;
-        def->since_version = 1;
-        def->inputs.insert(def->inputs.begin(), partition->GetInputs().begin(), partition->GetInputs().end());
-        def->outputs.insert(def->outputs.begin(), partition->GetOutputs().begin(), partition->GetOutputs().end());
-
-        indexedSubGraph.SetMetaDef(std::move(def));
-        indexedSubGraph.nodes = std::move(partition->GetNodeIndices());
-        auto& fusedNode = graph.BeginFuseSubGraph(indexedSubGraph, indexedSubGraph.GetMetaDef()->name);
-        fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider);
-
-        // Populate properties which will be passed to OpKernel for this graph via the function below
-        std::unordered_map<std::string, GraphNodeProperties> partitionNodePropsMap;
-        for (auto nodeIndex : indexedSubGraph.nodes)
-        {
-            const onnxruntime::Node* node = graph.GetNode(nodeIndex);
-
-#ifdef PRINT_PARTITON_INFO
-            printf("Partition %u\t%s\n", partitionIndex, GraphDescBuilder::GetUniqueNodeName(*node).c_str());
-#endif
-            partitionNodePropsMap.insert(std::make_pair(
-                GraphDescBuilder::GetUniqueNodeName(*node), std::move(graphNodePropertyMap[node])));
-        }
-
-#ifdef PRINT_PARTITON_INFO
-        printf("\n");
-#endif
-        CreateIDmlCompiledOperatorAndRegisterKernel(
-            graph,
-            indexedSubGraph,
-            fusedNode,
-            partitionNodePropsMap,
-            initializerNameToInitializerMap,
-            providerImpl,
-            registryForPartitionKernels);
        graph.FinalizeFuseSubGraph(indexedSubGraph, fusedNode);
    }
 }
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionHelper.h
@ -56,23 +56,29 @@ namespace DmlGraphFusionHelper
        _Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlOutputEdges,
        _Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlIntermediateEdges);

-    void CreateIDmlCompiledOperatorAndRegisterKernel(
-        onnxruntime::Graph& graph,
-        const onnxruntime::IndexedSubGraph& indexedSubGraph,
-        const onnxruntime::Node& fusedNode,
-        const std::unordered_map<std::string, GraphNodeProperties>& partitionNodePropsMap,
-        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& isInitializerTransferable,
-        const ExecutionProviderImpl* providerImpl,
-        onnxruntime::KernelRegistry* registryForPartitionKernels);
-
-    void FusePartitionAndRegisterKernel(
+    onnxruntime::IndexedSubGraph CreateIndexedSubGraph(
        GraphPartition* partition,
        uint32_t partitionIndex,
-        onnxruntime::Graph& graph,
-        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
-        onnxruntime::KernelRegistry* registryForPartitionKernels,
-        const std::string& partitionKernelPrefix,
-        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& isInitializerTransferable,
+        const std::string& partitionKernelPrefix);
+
+    std::unordered_map<std::string, GraphNodeProperties> CreatePartitionNodePropsMap(
+        const onnxruntime::Graph& graph,
+        const onnxruntime::IndexedSubGraph& indexedSubGraph,
+        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>&& graphNodePropertyMap);
+
+    Microsoft::WRL::ComPtr<IDMLCompiledOperator> TryCreateCompiledOperator(
+        const GraphDescBuilder::GraphDesc& graphDesc,
+        const onnxruntime::IndexedSubGraph& indexedSubGraph,
        const ExecutionProviderImpl* providerImpl);
+
+    void FusePartitionAndRegisterKernel(
+        onnxruntime::Graph& graph,
+        onnxruntime::KernelRegistry* registryForPartitionKernels,
+        const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
+        const ExecutionProviderImpl* providerImpl,
+        const onnxruntime::IndexedSubGraph& indexedSubGraph,
+        std::vector<uint8_t>&& isInputsUploadedByDmlEP,
+        const GraphDescBuilder::GraphDesc& graphDesc,
+        Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator);
 }
 }
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp
@ -23,7 +23,16 @@ namespace Dml
         m_providerImpl(static_cast<const ExecutionProvider*>(provider)->GetImpl())
    {
    }
-	
+
+    struct CompiledPartitionInfo
+    {
+        Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledOperator;
+        onnxruntime::IndexedSubGraph indexedSubGraph;
+        std::vector<uint8_t> isInputsUploadedByDmlEP;
+        GraphDescBuilder::GraphDesc graphDesc;
+        std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
+    };
+
    onnxruntime::common::Status DmlGraphFusionTransformer::ApplyImpl(
        onnxruntime::Graph& graph,
        bool& modified,
@ -37,96 +46,173 @@ namespace Dml
                                                             gsl::make_span(&registry, 1),
                                                             kernel_type_str_resolver};

-        // Initializers needed by any graph partition
-        std::unordered_set<std::string> requiredInitializerMap;
-        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties> graphNodePropertyMap;
-        onnxruntime::GraphViewer graphViewer(graph);
-        std::vector<std::unique_ptr<GraphPartition>> partitions = BuildPartitions(
-            graphViewer,
-            *m_providerImpl->GetInternalRegistrationInfoMap(), 
-            kernel_lookup,
-            m_providerImpl->GetSupportedDeviceDataTypeMask(),
-            graphNodePropertyMap, 
-            requiredInitializerMap);
+        std::vector<std::shared_ptr<CompiledPartitionInfo>> compiledPartitionInfos;
+        std::vector<onnxruntime::NodeIndex> additionalSplittingNodes;

-        // Create a map between each initialized tensor and the partition(s) it is part of.
-        auto initializerPartitionMap = DmlGraphFusionHelper::GetInitializerToPartitionMap(graphViewer, partitions);
-
-        for (uint32_t partitionIndex = 0; partitionIndex < partitions.size(); ++partitionIndex)
+        do
        {
-            auto& partition = partitions[partitionIndex];
+            // Initializers needed by any graph partition
+            std::unordered_set<std::string> requiredInitializerMap;
+            std::unordered_map<const onnxruntime::Node*, GraphNodeProperties> graphNodePropertyMap;
+            onnxruntime::GraphViewer graphViewer(graph);
+            std::vector<std::unique_ptr<GraphPartition>> partitions = BuildPartitions(
+                graphViewer,
+                *m_providerImpl->GetInternalRegistrationInfoMap(),
+                kernel_lookup,
+                m_providerImpl->GetSupportedDeviceDataTypeMask(),
+                graphNodePropertyMap,
+                requiredInitializerMap,
+                additionalSplittingNodes);

-            if (partition->GetRootMergedPartition() != partition.get() ||
-                !partition->IsDmlPartition())
+            // Reset the splitting nodes for the current iteration
+            additionalSplittingNodes.clear();
+
+            // Reset the compiled operators for the current iteration
+            compiledPartitionInfos.clear();
+            compiledPartitionInfos.resize(partitions.size());
+
+            // Create a map between each initialized tensor and the partition(s) it is part of.
+            auto initializerPartitionMap = DmlGraphFusionHelper::GetInitializerToPartitionMap(graphViewer, partitions);
+
+            for (uint32_t partitionIndex = 0; partitionIndex < partitions.size(); ++partitionIndex)
            {
-                continue;
-            }
+                auto& partition = partitions[partitionIndex];

-            // This map will tell which initializer can be removed from onnxruntime::Graph (and from it's field 
-            // onnx::GraphProto) while we upload the initializer to GPU. 
-            // Why we want to remove the initializer from ORT?
-            //  1. To keep the peak memory usage as low as possible. That's why we are doing incremental upload to GPU.
-            // What is initializer?
-            //  An initializer is a input tensor to an operator or the graph itself, which is contant and will never change.
-            // Why are we uploading the initialzer now?
-            //  This prevents OnnxRuntime from allocating GPU resources and uploading those initializers,
-            //  so the partiton's kernel can do so. In the process, it will pre-process weights while consuming a CPU
-            //  backed resource, avoiding an extra set of GPU resources in memory.
-            std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
-
-            
-            if (partition->IsDmlGraphPartition())
-            {
-                // populate transferredInitializerMap
-                for (const auto& input : partition->GetInputs())
+                if (partition->GetRootMergedPartition() != partition.get() ||
+                    !partition->IsDmlPartition())
                {
-                    const onnx::TensorProto* tensor = nullptr;
-                    if (graph.GetInitializedTensor(input, tensor))
-                    {
-                        // It's only safe to transfer tensors which are used by this partition alone.
-                        auto iter = initializerPartitionMap.find(tensor);
-                        assert(iter != initializerPartitionMap.end());
-                        if (iter->second.size() > 1)
-                        {
-                            // By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
-                            // to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
-                            // The size threshold for this should be no smaller than that used to combine initializers in the constant
-                            // sharing transform to prevent that transform from hurting performance.
-                            // If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
-                            const uint64_t maximumElementsForDuplicationTensor = 64;
-                            static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
-
-                            uint64_t totalElementCount = 1;
-                            for (int i = 0; i < tensor->dims().size(); ++i)
-                            {
-                                totalElementCount *= tensor->dims()[i];
-                            }
-
-                            if (totalElementCount <=  maximumElementsForDuplicationTensor ||
-                                requiredInitializerMap.find(input) != requiredInitializerMap.end())
-                            {
-                                isInitializerTransferable[input] = {tensor, false};
-                            }
-
-                            continue;
-                        }
-                        isInitializerTransferable[input] = {tensor, true};
-                    }
+                    continue;
                }

-                std::string partitionKernelPrefix = std::to_string(m_providerImpl->GetPartitionKernelPrefixVal()) + "_";
-                m_providerImpl->IncreasePartitionKernelPrefixVal();
+                // This map will tell which initializer can be removed from onnxruntime::Graph (and from it's field
+                // onnx::GraphProto) while we upload the initializer to GPU.
+                // Why we want to remove the initializer from ORT?
+                //  1. To keep the peak memory usage as low as possible. That's why we are doing incremental upload to GPU.
+                // What is initializer?
+                //  An initializer is a input tensor to an operator or the graph itself, which is contant and will never change.
+                // Why are we uploading the initialzer now?
+                //  This prevents OnnxRuntime from allocating GPU resources and uploading those initializers,
+                //  so the partiton's kernel can do so. In the process, it will pre-process weights while consuming a CPU
+                //  backed resource, avoiding an extra set of GPU resources in memory.
+                std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;

+                if (partition->IsDmlGraphPartition())
+                {
+                    // populate isInitializerTransferable
+                    for (const auto& input : partition->GetInputs())
+                    {
+                        const onnx::TensorProto* tensor = nullptr;
+                        if (graph.GetInitializedTensor(input, tensor))
+                        {
+                            // It's only safe to transfer tensors which are used by this partition alone.
+                            auto iter = initializerPartitionMap.find(tensor);
+                            assert(iter != initializerPartitionMap.end());
+                            if (iter->second.size() > 1)
+                            {
+                                // By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
+                                // to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
+                                // The size threshold for this should be no smaller than that used to combine initializers in the constant
+                                // sharing transform to prevent that transform from hurting performance.
+                                // If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
+                                constexpr uint64_t maximumElementsForDuplicationTensor = 64;
+                                static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
+
+                                uint64_t totalElementCount = 1;
+                                for (int i = 0; i < tensor->dims().size(); ++i)
+                                {
+                                    totalElementCount *= tensor->dims()[i];
+                                }
+
+                                if (totalElementCount <=  maximumElementsForDuplicationTensor ||
+                                    requiredInitializerMap.find(input) != requiredInitializerMap.end())
+                                {
+                                    isInitializerTransferable[input] = {tensor, false};
+                                }
+
+                                continue;
+                            }
+                            isInitializerTransferable[input] = {tensor, true};
+                        }
+                    }
+
+                    std::string partitionKernelPrefix = std::to_string(m_providerImpl->GetPartitionKernelPrefixVal()) + "_";
+                    m_providerImpl->IncreasePartitionKernelPrefixVal();
+
+                    auto indexedSubGraph = DmlGraphFusionHelper::CreateIndexedSubGraph(partition.get(), partitionIndex, partitionKernelPrefix);
+
+                    // Create a map of which inputs are uploaded by the DML EP
+                    const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
+                    std::vector<uint8_t> isInputsUploadedByDmlEP(fusedNodeInputCount);
+                    for (uint32_t index = 0; index < fusedNodeInputCount; ++index)
+                    {
+                        auto iter = isInitializerTransferable.find(indexedSubGraph.GetMetaDef()->inputs[index]);
+                        isInputsUploadedByDmlEP[index] = iter != isInitializerTransferable.end() ? true : false;
+                    }
+
+                    auto partitionNodePropsMap = DmlGraphFusionHelper::CreatePartitionNodePropsMap(
+                        graph,
+                        indexedSubGraph,
+                        std::move(graphNodePropertyMap));
+
+                    // Convert partitionONNXGraph into DML EP GraphDesc
+                    ComPtr<IDMLDevice> device;
+                    ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf()));
+                    GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
+                        isInputsUploadedByDmlEP.data(),
+                        isInputsUploadedByDmlEP.size(),
+                        isInitializerTransferable,
+                        graph,
+                        indexedSubGraph,
+                        partitionNodePropsMap,
+                        device.Get(),
+                        m_providerImpl);
+
+                    // Compile the operator
+                    auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator(
+                        graphDesc,
+                        indexedSubGraph,
+                        m_providerImpl);
+
+                    if (!compiledPartition)
+                    {
+                        // Fail early if even a single operator is too big to compile. This is highly unlikely.
+                        ORT_THROW_HR_IF(E_INVALIDARG, indexedSubGraph.nodes.size() < 2);
+
+                        // Tell the partitioner to split the current partition in half, in the middle
+                        additionalSplittingNodes.push_back(indexedSubGraph.nodes[indexedSubGraph.nodes.size() / 2]);
+
+                        // Exit early since we need to repartition
+                        break;
+                    }
+                    else
+                    {
+                        auto compiledPartitionInfo = std::make_shared<CompiledPartitionInfo>();
+                        compiledPartitionInfo->compiledOperator = std::move(compiledPartition);
+                        compiledPartitionInfo->indexedSubGraph = std::move(indexedSubGraph);
+                        compiledPartitionInfo->isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP);
+                        compiledPartitionInfo->graphDesc = std::move(graphDesc);
+                        compiledPartitionInfo->isInitializerTransferable = std::move(isInitializerTransferable);
+                        compiledPartitionInfos[partitionIndex] = std::move(compiledPartitionInfo);
+                    }
+                }
+            }
+        }
+        while (!additionalSplittingNodes.empty());
+
+        for (auto&& compiledPartitionInfo : compiledPartitionInfos)
+        {
+            // Null compiled operators were not DML partitions
+            if (compiledPartitionInfo)
+            {
                DmlGraphFusionHelper::FusePartitionAndRegisterKernel(
-                    partition.get(), 
-                    partitionIndex, 
-                    graph, 
-                    graphNodePropertyMap,
+                    graph,
                    m_providerImpl->GetKernelRegistry().get(),
-                    partitionKernelPrefix,
-                    isInitializerTransferable,
-                    m_providerImpl
-                );
+                    compiledPartitionInfo->isInitializerTransferable,
+                    m_providerImpl,
+                    compiledPartitionInfo->indexedSubGraph,
+                    std::move(compiledPartitionInfo->isInputsUploadedByDmlEP),
+                    compiledPartitionInfo->graphDesc,
+                    compiledPartitionInfo->compiledOperator);
            }
        }

--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@ -24,13 +24,13 @@ namespace Dml
            std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
            std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>>& initializeResourceRefs,
            std::vector<DML_BUFFER_BINDING> initInputBindings,
-            std::vector<uint8_t>& isInputsUploadedByDmlEP,
-            std::vector<bool>& inputsUsed) :
+            std::vector<uint8_t>&& isInputsUploadedByDmlEP,
+            std::vector<bool>&& inputsUsed) :
        OpKernel(kernelInfo),
        m_compiledExecutionPlanOperator(compiledExecutionPlanOperator),
-        m_inputsUsed(inputsUsed),
+        m_inputsUsed(std::move(inputsUsed)),
        m_outputShapes(outputShapes),
-        m_isInputsUploadedByDmlEP(isInputsUploadedByDmlEP),
+        m_isInputsUploadedByDmlEP(std::move(isInputsUploadedByDmlEP)),
        m_nonOwnedGraphInputsFromInitializers(nonOwnedGraphInputsFromInitializers)
        {
            // Get the execution provider interfaces
@ -443,8 +443,8 @@ namespace Dml
        std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
        std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>>& initializeResourceRefs,
        std::vector<DML_BUFFER_BINDING> initInputBindings,
-        std::vector<uint8_t>& isInputsUploadedByDmlEP,
-        std::vector<bool>& inputsUsed
+        std::vector<uint8_t>&& isInputsUploadedByDmlEP,
+        std::vector<bool>&& inputsUsed
        )
    {
        return new FusedGraphKernel(
@ -455,8 +455,8 @@ namespace Dml
            nonOwnedGraphInputsFromInitializers,
            initializeResourceRefs,
            initInputBindings,
-            isInputsUploadedByDmlEP,
-            inputsUsed
+            std::move(isInputsUploadedByDmlEP),
+            std::move(inputsUsed)
        );
    }
 } // namespace Dml
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.h
@ -15,7 +15,7 @@ namespace Dml
        std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
        std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>>& initializeResourceRefs,
        std::vector<DML_BUFFER_BINDING> initInputBindings,
-        std::vector<uint8_t>& isInputsUploadedByDmlEP,
-        std::vector<bool>& inputsUsed
+        std::vector<uint8_t>&& isInputsUploadedByDmlEP,
+        std::vector<bool>&& inputsUsed
    );
 } // namespace Dml
--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.cpp
@ -209,14 +209,15 @@ namespace Dml

    // Creates a partition for a node which is not a DML graph node, and finalizes partitions
    // which are inputs of the new partition.
-    std::unique_ptr<GraphPartition> CreateNonGraphNodePartitionAndFinalizeInputs(
+    std::unique_ptr<GraphPartition> CreatePartitionAndFinalizeInputs(
        const onnxruntime::Node& node,
        bool isDmlNode,
+        bool isDmlGraphPartitionNode,
        std::unordered_map<std::string, GraphPartition*>& nodeNameToPartitionMap
    )
    {
        std::unique_ptr<GraphPartition> partition = std::make_unique<GraphPartition>();
-        partition->SetIsDmlGraphPartition(false);
+        partition->SetIsDmlGraphPartition(isDmlGraphPartitionNode);
        partition->SetIsDmlPartition(isDmlNode);
        partition->AddNodeIndex(node.Index());

@ -383,7 +384,7 @@ namespace Dml
        uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE.
        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
        std::unordered_set<std::string>& requiredInitializerMap,
-        std::function<void(const onnxruntime::Node&)> onNodeUnsupportedInGraph)
+        gsl::span<const onnxruntime::NodeIndex> additionalSplittingNodes)
    {
        // Nodes are uniquely identified by the name of their first output argument
        std::vector<std::unique_ptr<GraphPartition>> partitions;
@ -420,6 +421,8 @@ namespace Dml
        // Check whether this graph is a subgraph, or contains any node with a subgraph.
        bool modelUsesSubgraph = ModelUsesSubgraph(graph);

+        uint32_t splittingNodeIndex = 0;
+
        // Build up partitions while traversing the graph.
        for (size_t nodeIndex : toplogicalOrder)
        {
@ -456,12 +459,14 @@ namespace Dml
            // anyhow due to CPU/GPU copies.
            if (modelUsesSubgraph || !isDmlGraphNode)
            {
-                if (onNodeUnsupportedInGraph)
-                {
-                    onNodeUnsupportedInGraph(node);
-                }
+                partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, false, nodeNameToPartitionMap));
+                continue;
+            }

-                partitions.push_back(CreateNonGraphNodePartitionAndFinalizeInputs(node, isDmlNode, nodeNameToPartitionMap));
+            if (splittingNodeIndex < additionalSplittingNodes.size() && additionalSplittingNodes[splittingNodeIndex] == nodeIndex)
+            {
+                partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, isDmlGraphNode, nodeNameToPartitionMap));
+                ++splittingNodeIndex;
                continue;
            }

--- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h
+++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphPartitioner.h
@ -48,5 +48,5 @@ namespace Dml
        uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE.
        std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
        std::unordered_set<std::string>& requiredInitializerMap,
-        std::function<void(const onnxruntime::Node&)> onNodeUnsupportedInGraph = nullptr);
+        gsl::span<const onnxruntime::NodeIndex> additionalSplittingNodes);
 } // namespace Dml