[DML EP] Split fused kernels when the persistent resource is too big (#16780)

The approach is the following:

1. Build partitions
2. Try compiling each partition into a `IDMLCompiledOperator`
3. If the compiled operator's persistent resource is bigger than 4GB,
tell the partitioner to split the partition in the middle and try again.
4. Once all partitions have been successfully compiled into an
`IDMLCompiledOperator`, fuse the partitions into an ORT operator and
register them all.

This change is relatively simple (basically a basic retry mechanism),
but it required a lot of refactoring just to make sure that we don't
modify the graph until **all** partitions have been compiled
successfully. This is because partly modifying the graph before making
sure that all partitions can be compiled will break future retries.

This path is not expected to be exercised often, and even when it is, the loop
should rarely iterate more than twice. It covers a very specific edge case:
large models in which a large number of nodes were merged into a single
partition.
This commit is contained in:
Patrice Vignola 2023-08-09 19:53:15 -07:00 committed by GitHub
parent e951f837e4
commit 7201dbebe5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 293 additions and 196 deletions

View file

@ -106,7 +106,7 @@ namespace DmlGraphFusionHelper
void ProcessInputData(
const ExecutionProviderImpl* providerImpl,
const std::vector<uint8_t>& isInputsUploadedByDmlEP,
std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,
const std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,
const gsl::span<const std::string> subGraphInputArgNames,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
onnxruntime::Graph& graph,
@ -325,37 +325,60 @@ namespace DmlGraphFusionHelper
dmlGraphDesc.IntermediateEdges = dmlIntermediateEdges.data();
}
void CreateIDmlCompiledOperatorAndRegisterKernel(
onnxruntime::Graph& graph,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
const onnxruntime::Node& fusedNode,
const std::unordered_map<std::string, GraphNodeProperties>& partitionNodePropsMap,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
const ExecutionProviderImpl* providerImpl,
onnxruntime::KernelRegistry* registryForPartitionKernels)
onnxruntime::IndexedSubGraph CreateIndexedSubGraph(
GraphPartition* partition,
uint32_t partitionIndex,
const std::string& partitionKernelPrefix)
{
// convert partitionONNXGraph into DML EP GraphDesc
const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
const uint32_t fusedNodeOutputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->outputs.size());
assert(partition->IsDmlGraphPartition());
std::vector<uint8_t> isInputsUploadedByDmlEP(fusedNodeInputCount);
for (uint32_t index = 0; index < fusedNodeInputCount; ++index)
onnxruntime::IndexedSubGraph indexedSubGraph;
// Create a definition for the node. The name must be unique.
auto def = std::make_unique<onnxruntime::IndexedSubGraph::MetaDef>();
def->name = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_NAME_PREFIX + partitionKernelPrefix + std::to_string(partitionIndex);
def->domain = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_DOMAIN;
def->since_version = 1;
def->inputs.insert(def->inputs.begin(), partition->GetInputs().begin(), partition->GetInputs().end());
def->outputs.insert(def->outputs.begin(), partition->GetOutputs().begin(), partition->GetOutputs().end());
indexedSubGraph.SetMetaDef(std::move(def));
indexedSubGraph.nodes = std::move(partition->GetNodeIndices());
return indexedSubGraph;
}
std::unordered_map<std::string, GraphNodeProperties> CreatePartitionNodePropsMap(
const onnxruntime::Graph& graph,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>&& graphNodePropertyMap)
{
// Populate properties which will be passed to OpKernel for this graph via the function below
std::unordered_map<std::string, GraphNodeProperties> partitionNodePropsMap;
for (auto nodeIndex : indexedSubGraph.nodes)
{
auto iter = initializerNameToInitializerMap.find(indexedSubGraph.GetMetaDef()->inputs[index]);
isInputsUploadedByDmlEP[index] = iter != initializerNameToInitializerMap.end() ? true : false;
const onnxruntime::Node* node = graph.GetNode(nodeIndex);
#ifdef PRINT_PARTITON_INFO
printf("Partition %u\t%s\n", partitionIndex, GraphDescBuilder::GetUniqueNodeName(*node).c_str());
#endif
partitionNodePropsMap.insert(std::make_pair(
GraphDescBuilder::GetUniqueNodeName(*node), std::move(graphNodePropertyMap[node])));
}
ComPtr<IDMLDevice> device;
ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf()));
GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
isInputsUploadedByDmlEP.data(),
isInputsUploadedByDmlEP.size(),
initializerNameToInitializerMap,
graph,
indexedSubGraph,
partitionNodePropsMap,
device.Get(),
providerImpl);
#ifdef PRINT_PARTITON_INFO
printf("\n");
#endif
return partitionNodePropsMap;
}
Microsoft::WRL::ComPtr<IDMLCompiledOperator> TryCreateCompiledOperator(
const GraphDescBuilder::GraphDesc& graphDesc,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
const ExecutionProviderImpl* providerImpl)
{
const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
const uint32_t fusedNodeOutputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->outputs.size());
// convert DML EP GraphDesc into DML_GRAPH_DESC and create IDMLCompiledOperator
DML_GRAPH_DESC dmlGraphDesc = {};
@ -387,14 +410,42 @@ namespace DmlGraphFusionHelper
executionFlags |= DML_EXECUTION_FLAG_DISABLE_META_COMMANDS;
}
ComPtr<IDMLDevice> device;
ORT_THROW_IF_FAILED(providerImpl->GetDmlDevice(device.GetAddressOf()));
ComPtr<IDMLDevice1> device1;
ORT_THROW_IF_FAILED(device.As(&device1));
ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator;
ORT_THROW_IF_FAILED(device1->CompileGraph(
&dmlGraphDesc,
executionFlags,
IID_PPV_ARGS(&compiledExecutionPlanOperator)));
// UINT32_MAX is currently the maximum number of bytes allowed by D3D12 for the offset of a view over a resource
if (compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize > UINT32_MAX)
{
return nullptr;
}
return compiledExecutionPlanOperator;
}
void FusePartitionAndRegisterKernel(
onnxruntime::Graph& graph,
onnxruntime::KernelRegistry* registryForPartitionKernels,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
const ExecutionProviderImpl* providerImpl,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
const GraphDescBuilder::GraphDesc& graphDesc,
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator)
{
auto& fusedNode = graph.BeginFuseSubGraph(indexedSubGraph, indexedSubGraph.GetMetaDef()->name);
fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider);
const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
// Populate input bindings for operator initialization
std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> initializeResourceRefs; // For lifetime control
std::vector<DML_BUFFER_BINDING> initInputBindings(fusedNodeInputCount);
@ -424,8 +475,8 @@ namespace DmlGraphFusionHelper
nonOwnedGraphInputsFromInitializers,
initializeResourceRefs,
initInputBindings,
isInputsUploadedByDmlEP,
inputsUsed]
isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP),
inputsUsed = std::move(inputsUsed)]
(onnxruntime::FuncManager& func_mgr, const onnxruntime::OpKernelInfo& info, std::unique_ptr<onnxruntime::OpKernel>& out) mutable ->onnxruntime::Status
{
out.reset(CreateFusedGraphKernel(info,
@ -435,8 +486,8 @@ namespace DmlGraphFusionHelper
nonOwnedGraphInputsFromInitializers,
initializeResourceRefs,
initInputBindings,
isInputsUploadedByDmlEP,
inputsUsed));
std::move(isInputsUploadedByDmlEP),
std::move(inputsUsed)));
return Status::OK();
};
@ -447,58 +498,7 @@ namespace DmlGraphFusionHelper
.SinceVersion(indexedSubGraph.GetMetaDef()->since_version)
.Provider(onnxruntime::kDmlExecutionProvider);
ORT_THROW_IF_ERROR(registryForPartitionKernels->Register(builder, fused_kernel_func));
}
// Fuses an entire DML graph partition into a single ORT node, then compiles and
// registers a kernel for it. The partition must be a DML *graph* partition
// (asserted below); non-graph partitions are handled elsewhere.
void FusePartitionAndRegisterKernel(
GraphPartition* partition,
uint32_t partitionIndex,
onnxruntime::Graph& graph,
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
onnxruntime::KernelRegistry* registryForPartitionKernels,
const std::string& partitionKernelPrefix,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
const ExecutionProviderImpl* providerImpl)
{
assert(partition->IsDmlGraphPartition());
onnxruntime::IndexedSubGraph indexedSubGraph;
// Create a definition for the node. The name must be unique.
auto def = std::make_unique<onnxruntime::IndexedSubGraph::MetaDef>();
def->name = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_NAME_PREFIX + partitionKernelPrefix + std::to_string(partitionIndex);
def->domain = DmlGraphFusionTransformer::DML_GRAPH_FUSION_NODE_DOMAIN;
def->since_version = 1;
// The fused node's inputs/outputs are exactly the partition's boundary tensors.
def->inputs.insert(def->inputs.begin(), partition->GetInputs().begin(), partition->GetInputs().end());
def->outputs.insert(def->outputs.begin(), partition->GetOutputs().begin(), partition->GetOutputs().end());
indexedSubGraph.SetMetaDef(std::move(def));
// NOTE: this moves the node indices out of the partition, leaving it empty.
indexedSubGraph.nodes = std::move(partition->GetNodeIndices());
// Begin the fusion; the original nodes are not removed until
// FinalizeFuseSubGraph is called at the end of this function.
auto& fusedNode = graph.BeginFuseSubGraph(indexedSubGraph, indexedSubGraph.GetMetaDef()->name);
fusedNode.SetExecutionProviderType(onnxruntime::kDmlExecutionProvider);
// Populate properties which will be passed to OpKernel for this graph via the function below.
// Keys are unique node names; values are moved out of graphNodePropertyMap.
std::unordered_map<std::string, GraphNodeProperties> partitionNodePropsMap;
for (auto nodeIndex : indexedSubGraph.nodes)
{
const onnxruntime::Node* node = graph.GetNode(nodeIndex);
#ifdef PRINT_PARTITON_INFO
printf("Partition %u\t%s\n", partitionIndex, GraphDescBuilder::GetUniqueNodeName(*node).c_str());
#endif
partitionNodePropsMap.insert(std::make_pair(
GraphDescBuilder::GetUniqueNodeName(*node), std::move(graphNodePropertyMap[node])));
}
#ifdef PRINT_PARTITON_INFO
printf("\n");
#endif
// Compile the subgraph into an IDMLCompiledOperator and register the fused
// kernel with the partition kernel registry.
CreateIDmlCompiledOperatorAndRegisterKernel(
graph,
indexedSubGraph,
fusedNode,
partitionNodePropsMap,
initializerNameToInitializerMap,
providerImpl,
registryForPartitionKernels);
// Commit the fusion: replaces the subgraph's nodes with the fused node.
graph.FinalizeFuseSubGraph(indexedSubGraph, fusedNode);
}
}

View file

@ -56,23 +56,29 @@ namespace DmlGraphFusionHelper
_Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlOutputEdges,
_Inout_ std::vector<DML_GRAPH_EDGE_DESC>& dmlIntermediateEdges);
void CreateIDmlCompiledOperatorAndRegisterKernel(
onnxruntime::Graph& graph,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
const onnxruntime::Node& fusedNode,
const std::unordered_map<std::string, GraphNodeProperties>& partitionNodePropsMap,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& isInitializerTransferable,
const ExecutionProviderImpl* providerImpl,
onnxruntime::KernelRegistry* registryForPartitionKernels);
void FusePartitionAndRegisterKernel(
onnxruntime::IndexedSubGraph CreateIndexedSubGraph(
GraphPartition* partition,
uint32_t partitionIndex,
onnxruntime::Graph& graph,
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
onnxruntime::KernelRegistry* registryForPartitionKernels,
const std::string& partitionKernelPrefix,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& isInitializerTransferable,
const std::string& partitionKernelPrefix);
std::unordered_map<std::string, GraphNodeProperties> CreatePartitionNodePropsMap(
const onnxruntime::Graph& graph,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>&& graphNodePropertyMap);
Microsoft::WRL::ComPtr<IDMLCompiledOperator> TryCreateCompiledOperator(
const GraphDescBuilder::GraphDesc& graphDesc,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
const ExecutionProviderImpl* providerImpl);
void FusePartitionAndRegisterKernel(
onnxruntime::Graph& graph,
onnxruntime::KernelRegistry* registryForPartitionKernels,
const std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>>& initializerNameToInitializerMap,
const ExecutionProviderImpl* providerImpl,
const onnxruntime::IndexedSubGraph& indexedSubGraph,
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
const GraphDescBuilder::GraphDesc& graphDesc,
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator);
}
}

View file

@ -23,7 +23,16 @@ namespace Dml
m_providerImpl(static_cast<const ExecutionProvider*>(provider)->GetImpl())
{
}
// Holds everything produced while compiling one graph partition, so that the
// onnxruntime::Graph is only mutated (fusion + kernel registration) after ALL
// partitions have compiled successfully. This is what makes the
// split-and-retry loop safe: a failed compile can restart partitioning
// without having partially modified the graph.
struct CompiledPartitionInfo
{
// Compiled DML operator for the partition; null entries mark partitions
// that were not DML graph partitions and need no fusion.
Microsoft::WRL::ComPtr<IDMLCompiledOperator> compiledOperator;
// The subgraph (MetaDef + node indices) that will be fused into one node.
onnxruntime::IndexedSubGraph indexedSubGraph;
// One flag per subgraph input: nonzero if the DML EP uploads that input
// (i.e. the input was found in isInitializerTransferable).
std::vector<uint8_t> isInputsUploadedByDmlEP;
// DML EP graph description built from the partition's ONNX nodes.
GraphDescBuilder::GraphDesc graphDesc;
// Initializer name -> (tensor, can-be-transferred) map; 'true' means the
// tensor is used by this partition alone and may be removed from the graph
// after upload.
std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
};
onnxruntime::common::Status DmlGraphFusionTransformer::ApplyImpl(
onnxruntime::Graph& graph,
bool& modified,
@ -37,96 +46,173 @@ namespace Dml
gsl::make_span(&registry, 1),
kernel_type_str_resolver};
// Initializers needed by any graph partition
std::unordered_set<std::string> requiredInitializerMap;
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties> graphNodePropertyMap;
onnxruntime::GraphViewer graphViewer(graph);
std::vector<std::unique_ptr<GraphPartition>> partitions = BuildPartitions(
graphViewer,
*m_providerImpl->GetInternalRegistrationInfoMap(),
kernel_lookup,
m_providerImpl->GetSupportedDeviceDataTypeMask(),
graphNodePropertyMap,
requiredInitializerMap);
std::vector<std::shared_ptr<CompiledPartitionInfo>> compiledPartitionInfos;
std::vector<onnxruntime::NodeIndex> additionalSplittingNodes;
// Create a map between each initialized tensor and the partition(s) it is part of.
auto initializerPartitionMap = DmlGraphFusionHelper::GetInitializerToPartitionMap(graphViewer, partitions);
for (uint32_t partitionIndex = 0; partitionIndex < partitions.size(); ++partitionIndex)
do
{
auto& partition = partitions[partitionIndex];
// Initializers needed by any graph partition
std::unordered_set<std::string> requiredInitializerMap;
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties> graphNodePropertyMap;
onnxruntime::GraphViewer graphViewer(graph);
std::vector<std::unique_ptr<GraphPartition>> partitions = BuildPartitions(
graphViewer,
*m_providerImpl->GetInternalRegistrationInfoMap(),
kernel_lookup,
m_providerImpl->GetSupportedDeviceDataTypeMask(),
graphNodePropertyMap,
requiredInitializerMap,
additionalSplittingNodes);
if (partition->GetRootMergedPartition() != partition.get() ||
!partition->IsDmlPartition())
// Reset the splitting nodes for the current iteration
additionalSplittingNodes.clear();
// Reset the compiled operators for the current iteration
compiledPartitionInfos.clear();
compiledPartitionInfos.resize(partitions.size());
// Create a map between each initialized tensor and the partition(s) it is part of.
auto initializerPartitionMap = DmlGraphFusionHelper::GetInitializerToPartitionMap(graphViewer, partitions);
for (uint32_t partitionIndex = 0; partitionIndex < partitions.size(); ++partitionIndex)
{
continue;
}
auto& partition = partitions[partitionIndex];
// This map will tell which initializer can be removed from onnxruntime::Graph (and from it's field
// onnx::GraphProto) while we upload the initializer to GPU.
// Why we want to remove the initializer from ORT?
// 1. To keep the peak memory usage as low as possible. That's why we are doing incremental upload to GPU.
// What is initializer?
// An initializer is an input tensor to an operator or the graph itself, which is constant and will never change.
// Why are we uploading the initializer now?
// This prevents OnnxRuntime from allocating GPU resources and uploading those initializers,
// so the partition's kernel can do so. In the process, it will pre-process weights while consuming a CPU
// backed resource, avoiding an extra set of GPU resources in memory.
std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
if (partition->IsDmlGraphPartition())
{
// populate transferredInitializerMap
for (const auto& input : partition->GetInputs())
if (partition->GetRootMergedPartition() != partition.get() ||
!partition->IsDmlPartition())
{
const onnx::TensorProto* tensor = nullptr;
if (graph.GetInitializedTensor(input, tensor))
{
// It's only safe to transfer tensors which are used by this partition alone.
auto iter = initializerPartitionMap.find(tensor);
assert(iter != initializerPartitionMap.end());
if (iter->second.size() > 1)
{
// By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
// to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
// The size threshold for this should be no smaller than that used to combine initializers in the constant
// sharing transform to prevent that transform from hurting performance.
// If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
const uint64_t maximumElementsForDuplicationTensor = 64;
static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
uint64_t totalElementCount = 1;
for (int i = 0; i < tensor->dims().size(); ++i)
{
totalElementCount *= tensor->dims()[i];
}
if (totalElementCount <= maximumElementsForDuplicationTensor ||
requiredInitializerMap.find(input) != requiredInitializerMap.end())
{
isInitializerTransferable[input] = {tensor, false};
}
continue;
}
isInitializerTransferable[input] = {tensor, true};
}
continue;
}
std::string partitionKernelPrefix = std::to_string(m_providerImpl->GetPartitionKernelPrefixVal()) + "_";
m_providerImpl->IncreasePartitionKernelPrefixVal();
// This map will tell which initializer can be removed from onnxruntime::Graph (and from it's field
// onnx::GraphProto) while we upload the initializer to GPU.
// Why we want to remove the initializer from ORT?
// 1. To keep the peak memory usage as low as possible. That's why we are doing incremental upload to GPU.
// What is initializer?
// An initializer is an input tensor to an operator or the graph itself, which is constant and will never change.
// Why are we uploading the initializer now?
// This prevents OnnxRuntime from allocating GPU resources and uploading those initializers,
// so the partition's kernel can do so. In the process, it will pre-process weights while consuming a CPU
// backed resource, avoiding an extra set of GPU resources in memory.
std::unordered_map<std::string, std::pair<const ONNX_NAMESPACE::TensorProto*, bool>> isInitializerTransferable;
if (partition->IsDmlGraphPartition())
{
// populate isInitializerTransferable
for (const auto& input : partition->GetInputs())
{
const onnx::TensorProto* tensor = nullptr;
if (graph.GetInitializedTensor(input, tensor))
{
// It's only safe to transfer tensors which are used by this partition alone.
auto iter = initializerPartitionMap.find(tensor);
assert(iter != initializerPartitionMap.end());
if (iter->second.size() > 1)
{
// By including non-transferrable tensors in isInitializerTransferable, it causes DML to upload and preprocess them
// to duplicate locations rather than treating them as being non-constant, which is helpful for optimization.
// The size threshold for this should be no smaller than that used to combine initializers in the constant
// sharing transform to prevent that transform from hurting performance.
// If the kernel relies on this input to be initialized, it should also be small enough to copy cheaply.
constexpr uint64_t maximumElementsForDuplicationTensor = 64;
static_assert(maximumElementsForDuplicationTensor >= onnxruntime::ConstantSharing::TENSOR_ELEM_COUNT_THRESHOLD);
uint64_t totalElementCount = 1;
for (int i = 0; i < tensor->dims().size(); ++i)
{
totalElementCount *= tensor->dims()[i];
}
if (totalElementCount <= maximumElementsForDuplicationTensor ||
requiredInitializerMap.find(input) != requiredInitializerMap.end())
{
isInitializerTransferable[input] = {tensor, false};
}
continue;
}
isInitializerTransferable[input] = {tensor, true};
}
}
std::string partitionKernelPrefix = std::to_string(m_providerImpl->GetPartitionKernelPrefixVal()) + "_";
m_providerImpl->IncreasePartitionKernelPrefixVal();
auto indexedSubGraph = DmlGraphFusionHelper::CreateIndexedSubGraph(partition.get(), partitionIndex, partitionKernelPrefix);
// Create a map of which inputs are uploaded by the DML EP
const uint32_t fusedNodeInputCount = gsl::narrow_cast<uint32_t>(indexedSubGraph.GetMetaDef()->inputs.size());
std::vector<uint8_t> isInputsUploadedByDmlEP(fusedNodeInputCount);
for (uint32_t index = 0; index < fusedNodeInputCount; ++index)
{
auto iter = isInitializerTransferable.find(indexedSubGraph.GetMetaDef()->inputs[index]);
isInputsUploadedByDmlEP[index] = iter != isInitializerTransferable.end() ? true : false;
}
auto partitionNodePropsMap = DmlGraphFusionHelper::CreatePartitionNodePropsMap(
graph,
indexedSubGraph,
std::move(graphNodePropertyMap));
// Convert partitionONNXGraph into DML EP GraphDesc
ComPtr<IDMLDevice> device;
ORT_THROW_IF_FAILED(m_providerImpl->GetDmlDevice(device.GetAddressOf()));
GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
isInputsUploadedByDmlEP.data(),
isInputsUploadedByDmlEP.size(),
isInitializerTransferable,
graph,
indexedSubGraph,
partitionNodePropsMap,
device.Get(),
m_providerImpl);
// Compile the operator
auto compiledPartition = DmlGraphFusionHelper::TryCreateCompiledOperator(
graphDesc,
indexedSubGraph,
m_providerImpl);
if (!compiledPartition)
{
// Fail early if even a single operator is too big to compile. This is highly unlikely.
ORT_THROW_HR_IF(E_INVALIDARG, indexedSubGraph.nodes.size() < 2);
// Tell the partitioner to split the current partition in half, in the middle
additionalSplittingNodes.push_back(indexedSubGraph.nodes[indexedSubGraph.nodes.size() / 2]);
// Exit early since we need to repartition
break;
}
else
{
auto compiledPartitionInfo = std::make_shared<CompiledPartitionInfo>();
compiledPartitionInfo->compiledOperator = std::move(compiledPartition);
compiledPartitionInfo->indexedSubGraph = std::move(indexedSubGraph);
compiledPartitionInfo->isInputsUploadedByDmlEP = std::move(isInputsUploadedByDmlEP);
compiledPartitionInfo->graphDesc = std::move(graphDesc);
compiledPartitionInfo->isInitializerTransferable = std::move(isInitializerTransferable);
compiledPartitionInfos[partitionIndex] = std::move(compiledPartitionInfo);
}
}
}
}
while (!additionalSplittingNodes.empty());
for (auto&& compiledPartitionInfo : compiledPartitionInfos)
{
// Null compiled operators were not DML partitions
if (compiledPartitionInfo)
{
DmlGraphFusionHelper::FusePartitionAndRegisterKernel(
partition.get(),
partitionIndex,
graph,
graphNodePropertyMap,
graph,
m_providerImpl->GetKernelRegistry().get(),
partitionKernelPrefix,
isInitializerTransferable,
m_providerImpl
);
compiledPartitionInfo->isInitializerTransferable,
m_providerImpl,
compiledPartitionInfo->indexedSubGraph,
std::move(compiledPartitionInfo->isInputsUploadedByDmlEP),
compiledPartitionInfo->graphDesc,
compiledPartitionInfo->compiledOperator);
}
}

View file

@ -24,13 +24,13 @@ namespace Dml
std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>>& initializeResourceRefs,
std::vector<DML_BUFFER_BINDING> initInputBindings,
std::vector<uint8_t>& isInputsUploadedByDmlEP,
std::vector<bool>& inputsUsed) :
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
std::vector<bool>&& inputsUsed) :
OpKernel(kernelInfo),
m_compiledExecutionPlanOperator(compiledExecutionPlanOperator),
m_inputsUsed(inputsUsed),
m_inputsUsed(std::move(inputsUsed)),
m_outputShapes(outputShapes),
m_isInputsUploadedByDmlEP(isInputsUploadedByDmlEP),
m_isInputsUploadedByDmlEP(std::move(isInputsUploadedByDmlEP)),
m_nonOwnedGraphInputsFromInitializers(nonOwnedGraphInputsFromInitializers)
{
// Get the execution provider interfaces
@ -443,8 +443,8 @@ namespace Dml
std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>>& initializeResourceRefs,
std::vector<DML_BUFFER_BINDING> initInputBindings,
std::vector<uint8_t>& isInputsUploadedByDmlEP,
std::vector<bool>& inputsUsed
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
std::vector<bool>&& inputsUsed
)
{
return new FusedGraphKernel(
@ -455,8 +455,8 @@ namespace Dml
nonOwnedGraphInputsFromInitializers,
initializeResourceRefs,
initInputBindings,
isInputsUploadedByDmlEP,
inputsUsed
std::move(isInputsUploadedByDmlEP),
std::move(inputsUsed)
);
}
} // namespace Dml

View file

@ -15,7 +15,7 @@ namespace Dml
std::vector<ComPtr<ID3D12Resource>>& nonOwnedGraphInputsFromInitializers,
std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>>& initializeResourceRefs,
std::vector<DML_BUFFER_BINDING> initInputBindings,
std::vector<uint8_t>& isInputsUploadedByDmlEP,
std::vector<bool>& inputsUsed
std::vector<uint8_t>&& isInputsUploadedByDmlEP,
std::vector<bool>&& inputsUsed
);
} // namespace Dml

View file

@ -209,14 +209,15 @@ namespace Dml
// Creates a partition for a node which is not a DML graph node, and finalizes partitions
// which are inputs of the new partition.
std::unique_ptr<GraphPartition> CreateNonGraphNodePartitionAndFinalizeInputs(
std::unique_ptr<GraphPartition> CreatePartitionAndFinalizeInputs(
const onnxruntime::Node& node,
bool isDmlNode,
bool isDmlGraphPartitionNode,
std::unordered_map<std::string, GraphPartition*>& nodeNameToPartitionMap
)
{
std::unique_ptr<GraphPartition> partition = std::make_unique<GraphPartition>();
partition->SetIsDmlGraphPartition(false);
partition->SetIsDmlGraphPartition(isDmlGraphPartitionNode);
partition->SetIsDmlPartition(isDmlNode);
partition->AddNodeIndex(node.Index());
@ -383,7 +384,7 @@ namespace Dml
uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE.
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
std::unordered_set<std::string>& requiredInitializerMap,
std::function<void(const onnxruntime::Node&)> onNodeUnsupportedInGraph)
gsl::span<const onnxruntime::NodeIndex> additionalSplittingNodes)
{
// Nodes are uniquely identified by the name of their first output argument
std::vector<std::unique_ptr<GraphPartition>> partitions;
@ -420,6 +421,8 @@ namespace Dml
// Check whether this graph is a subgraph, or contains any node with a subgraph.
bool modelUsesSubgraph = ModelUsesSubgraph(graph);
uint32_t splittingNodeIndex = 0;
// Build up partitions while traversing the graph.
for (size_t nodeIndex : toplogicalOrder)
{
@ -456,12 +459,14 @@ namespace Dml
// anyhow due to CPU/GPU copies.
if (modelUsesSubgraph || !isDmlGraphNode)
{
if (onNodeUnsupportedInGraph)
{
onNodeUnsupportedInGraph(node);
}
partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, false, nodeNameToPartitionMap));
continue;
}
partitions.push_back(CreateNonGraphNodePartitionAndFinalizeInputs(node, isDmlNode, nodeNameToPartitionMap));
if (splittingNodeIndex < additionalSplittingNodes.size() && additionalSplittingNodes[splittingNodeIndex] == nodeIndex)
{
partitions.push_back(CreatePartitionAndFinalizeInputs(node, isDmlNode, isDmlGraphNode, nodeNameToPartitionMap));
++splittingNodeIndex;
continue;
}

View file

@ -48,5 +48,5 @@ namespace Dml
uint32_t supportedDeviceDataTypeMask, // Each bit corresponds to each DML_TENSOR_DATA_TYPE.
std::unordered_map<const onnxruntime::Node*, GraphNodeProperties>& graphNodePropertyMap,
std::unordered_set<std::string>& requiredInitializerMap,
std::function<void(const onnxruntime::Node&)> onNodeUnsupportedInGraph = nullptr);
gsl::span<const onnxruntime::NodeIndex> additionalSplittingNodes);
} // namespace Dml