From d4341ea2de8c2ada0ddbfbf4acd577851d9cf897 Mon Sep 17 00:00:00 2001 From: Xiang Zhang Date: Thu, 2 Jul 2020 01:24:12 +0000 Subject: [PATCH] Merged PR 4870266: Refactor fused graph kernel so dmlxp and ort share the same code Related work items: #26719246 --- .../src/FusedGraphKernel.cpp | 311 ++--------------- .../src/GraphKernelHelper.cpp | 313 ++++++++++++++++++ .../src/GraphKernelHelper.h | 67 ++++ 3 files changed, 412 insertions(+), 279 deletions(-) create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp create mode 100644 onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp index 0b357e4f68..f3f4caab3f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp @@ -5,20 +5,12 @@ #include "MLOperatorAuthorImpl.h" #include "FusedGraphKernel.h" +#include "GraphKernelHelper.h" using namespace Windows::AI::MachineLearning::Adapter; namespace Dml { - template - static T AlignToPow2(T offset, T alignment) - { - static_assert(std::is_unsigned_v); - assert(alignment != 0); - assert((alignment & (alignment - 1)) == 0); - return (offset + alignment - 1) & ~(alignment - 1); - } - class FusedGraphKernel : public onnxruntime::OpKernel { public: @@ -73,37 +65,10 @@ namespace Dml const uint32_t graphInputCount = kernelInfo.GetInputCount(); - auto gpuGraphInputConstnessGetter = [&kernelInfo, &fusedNodeInputDefs, &transferredInitializerMap](uint32_t index) - { - // Transferred initializers are uploaded to GPU memory - auto iter = transferredInitializerMap.find(fusedNodeInputDefs[index]->Name()); - if (iter != transferredInitializerMap.end()) - { - return true; - } - - // If an initializer wasn't transferred, the constant input may be available from ORT - const onnxruntime::Tensor* inputTensor = nullptr; - if (!kernelInfo.TryGetConstantInput(index, &inputTensor) || inputTensor == nullptr) - { - return false; - } - - // Check that the constant ORT input is in GPU memory - if (!strcmp(inputTensor->Location().name, onnxruntime::CPU) || - inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput || - inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput) - { - return false; - } - - return true; - }; - m_inputsConstant.resize(graphInputCount); for (uint32_t i = 0; i < graphInputCount; ++i) { - m_inputsConstant[i] = gpuGraphInputConstnessGetter(i); + m_inputsConstant[i] = GraphKernelHelper::GetGraphInputConstness(i, kernelInfo, fusedNodeInputDefs, transferredInitializerMap); } GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc( @@ -118,116 +83,27 @@ namespace Dml device.Get(), m_executionHandle); - // Determine the last input which uses an initializer, so initializers can be freed incrementally - // while processing each input in order. - std::map initializerToLastInputIndexMap; - for (uint32_t i = 0; i < graphInputCount; i++) - { - auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name()); - if (iter != transferredInitializerMap.end()) - { - initializerToLastInputIndexMap[&iter->second] = i; - } - } - - // Walk through each graph edge and mark used inputs - m_inputsUsed.assign(graphInputCount, false); - for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges) - { - m_inputsUsed[edge.GraphInputIndex] = true; - } - // Populate input bindings for operator initialization - std::vector> initInputResources; // For lifetime control + std::vector> initInputResources; // For lifetime control std::vector initInputBindings(graphInputCount); m_nonOwnedGraphInputsFromInitializers.resize(graphInputCount); - std::vector> initializeResourceRefs; - - for (uint32_t i = 0; i < initInputBindings.size(); i++) - { - // If the input isn't actually used by the graph, nothing ever needs to be bound (either for - // initialization or execution). So just throw away the transferred initializer and skip this input. - if (!m_inputsUsed[i]) - { - transferredInitializerMap.erase(fusedNodeInputDefs[i]->Name()); - continue; - } - - // Look for the initializer among those transferred from the graph during partitioning - auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name()); - if (iter != transferredInitializerMap.end()) - { - std::byte* tensorPtr = nullptr; - size_t tensorByteSize = 0; - std::unique_ptr unpackedTensor; - - auto& initializer = iter->second; - - // The tensor may be stored as raw data or in typed fields. - if (initializer.has_raw_data()) - { - tensorPtr = (std::byte*)(initializer.raw_data().c_str()); - tensorByteSize = initializer.raw_data().size(); - } - else - { - std::tie(unpackedTensor, tensorByteSize) = UnpackTensor(initializer); - tensorPtr = unpackedTensor.get(); - } - - // Tensor sizes in DML must be a multiple of 4 bytes large. - tensorByteSize = AlignToPow2(tensorByteSize, 4); - - if (!m_inputsConstant[i]) - { - // Store the resource to use during execution - ComPtr defaultBuffer = CreateResource(tensorPtr, tensorByteSize); - m_nonOwnedGraphInputsFromInitializers[i] = defaultBuffer; - initializeResourceRefs.push_back(std::move(defaultBuffer)); - } - else - { - ComPtr initializeInputBuffer; - - // D3D_FEATURE_LEVEL_1_0_CORE doesn't support Custom heaps - if (m_provider->IsMcdmDevice()) - { - initializeInputBuffer = CreateResource(tensorPtr, tensorByteSize); - } - else - { - initializeInputBuffer = CreateCpuResource(tensorPtr, tensorByteSize); - } - - // Set the binding for operator initialization to the buffer - initInputBindings[i].Buffer = initializeInputBuffer.Get(); - initInputBindings[i].SizeInBytes = tensorByteSize; - initializeResourceRefs.push_back(std::move(initializeInputBuffer)); - } - - // Free the initializer if this is the last usage of it. - if (initializerToLastInputIndexMap[&initializer] == i) - { - transferredInitializerMap.erase(iter); - } - } - else if (m_inputsConstant[i]) - { - const onnxruntime::Tensor* inputTensor = nullptr; - THROW_HR_IF(E_UNEXPECTED, !kernelInfo.TryGetConstantInput(i, &inputTensor)); - - uint64_t allocId; - UnwrapTensor(inputTensor, &initInputBindings[i].Buffer, &allocId); - initInputBindings[i].SizeInBytes = initInputBindings[i].Buffer->GetDesc().Width; - - initInputBindings[i].Buffer->Release(); // Avoid holding an additional reference - initInputResources.push_back(initInputBindings[i].Buffer); - } - } - - // All initializers should have been consumed and freed above - assert(transferredInitializerMap.empty()); + std::vector> initializeResourceRefs; + + GraphKernelHelper::PopulateInputBindings( + m_provider.Get(), + m_winmlProvider.Get(), + m_inputsConstant, + kernelInfo, + graphDesc, + fusedNodeInputDefs, + m_inputsUsed, + initInputBindings, + initInputResources, + m_nonOwnedGraphInputsFromInitializers, + initializeResourceRefs, + transferredInitializerMap); + DML_GRAPH_DESC dmlGraphDesc = {}; std::vector dmlOperatorGraphNodes(graphDesc.nodes.size()); std::vector dmlGraphNodes(graphDesc.nodes.size()); @@ -235,38 +111,15 @@ namespace Dml std::vector dmlOutputEdges(graphDesc.outputEdges.size()); std::vector dmlIntermediateEdges(graphDesc.intermediateEdges.size()); - for (size_t i = 0; i < graphDesc.nodes.size(); ++i) - { - dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{ graphDesc.nodes[i].op.Get() }; - dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{ DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i] }; - } - - for (size_t i = 0; i < graphDesc.inputEdges.size(); ++i) - { - dmlInputEdges[i] = DML_GRAPH_EDGE_DESC{ DML_GRAPH_EDGE_TYPE_INPUT, &graphDesc.inputEdges[i] }; - } - - for (size_t i = 0; i < graphDesc.outputEdges.size(); ++i) - { - dmlOutputEdges[i] = DML_GRAPH_EDGE_DESC{ DML_GRAPH_EDGE_TYPE_OUTPUT, &graphDesc.outputEdges[i] }; - } - - for (size_t i = 0; i < graphDesc.intermediateEdges.size(); ++i) - { - dmlIntermediateEdges[i] = DML_GRAPH_EDGE_DESC{ DML_GRAPH_EDGE_TYPE_INTERMEDIATE, &graphDesc.intermediateEdges[i] }; - } - - DML_GRAPH_DESC dmlGraphDesc = {}; - dmlGraphDesc.InputCount = graphInputCount; - dmlGraphDesc.OutputCount = kernelInfo.GetOutputCount(); - dmlGraphDesc.NodeCount = gsl::narrow_cast(dmlGraphNodes.size()); - dmlGraphDesc.Nodes = dmlGraphNodes.data(); - dmlGraphDesc.InputEdgeCount = gsl::narrow_cast(dmlInputEdges.size()); - dmlGraphDesc.InputEdges = dmlInputEdges.data(); - dmlGraphDesc.OutputEdgeCount = gsl::narrow_cast(dmlOutputEdges.size()); - dmlGraphDesc.OutputEdges = dmlOutputEdges.data(); - dmlGraphDesc.IntermediateEdgeCount = gsl::narrow_cast(dmlIntermediateEdges.size()); - dmlGraphDesc.IntermediateEdges = dmlIntermediateEdges.data(); + GraphKernelHelper::ConvertGraphDesc( + graphDesc, + dmlGraphDesc, + kernelInfo, + dmlOperatorGraphNodes, + dmlGraphNodes, + dmlInputEdges, + dmlOutputEdges, + dmlIntermediateEdges); DML_EXECUTION_FLAGS executionFlags = DML_EXECUTION_FLAG_NONE; if (graphDesc.reuseCommandList) @@ -534,10 +387,10 @@ namespace Dml const onnxruntime::Tensor* tensor = kernelContext->Input(i); uint64_t allocId; - UnwrapTensor(tensor, &inputBindings[i].Buffer, &allocId); + GraphKernelHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &inputBindings[i].Buffer, &allocId); inputBindingsChanged = inputBindingsChanged || (!allocId || m_inputBindingAllocIds[i] != allocId); inputBindings[i].Buffer->Release(); // Avoid holding an additional reference - inputBindings[i].SizeInBytes = AlignToPow2(tensor->SizeInBytes(), 4); + inputBindings[i].SizeInBytes = GraphKernelHelper::AlignToPow2(tensor->SizeInBytes(), 4); inputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &inputBindings[i]}; m_inputBindingAllocIds[i] = allocId; } @@ -571,10 +424,10 @@ namespace Dml ); uint64_t allocId; - UnwrapTensor(tensor, &outputBindings[i].Buffer, &allocId); + GraphKernelHelper::UnwrapTensor(m_winmlProvider.Get(), tensor, &outputBindings[i].Buffer, &allocId); outputBindingsChanged = outputBindingsChanged || (!allocId || m_outputBindingAllocIds[i] != allocId); outputBindings[i].Buffer->Release(); // Avoid holding an additional reference - outputBindings[i].SizeInBytes = AlignToPow2(tensor->SizeInBytes(), 4); + outputBindings[i].SizeInBytes = GraphKernelHelper::AlignToPow2(tensor->SizeInBytes(), 4); outputBindingDescs[i] = {DML_BINDING_TYPE_BUFFER, &outputBindings[i]}; m_outputBindingAllocIds[i] = allocId; } @@ -624,106 +477,6 @@ namespace Dml m_winmlProvider->QueueReference(m_persistentResourceAllocatorUnk.Get()); } - void UnwrapTensor(const onnxruntime::Tensor* tensor, ID3D12Resource** resource, uint64_t* allocId) const - { - IUnknown* allocationUnk = static_cast(const_cast(tensor->DataRaw())); - ComPtr resourceUnk; - m_winmlProvider->GetABIDataInterface(false, allocationUnk, &resourceUnk); - - *allocId = m_winmlProvider->TryGetPooledAllocationId(allocationUnk, 0); - - THROW_IF_FAILED(resourceUnk->QueryInterface(resource)); - } - - ComPtr CreateResource(const std::byte* tensorPtr, size_t tensorByteSize) const - { - ComPtr buffer; - - D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_DEFAULT, - D3D12_CPU_PAGE_PROPERTY_UNKNOWN, - D3D12_MEMORY_POOL_UNKNOWN, - 0, - 0 - }; - - D3D12_RESOURCE_DESC resourceDesc = { - D3D12_RESOURCE_DIMENSION_BUFFER, - 0, - static_cast((tensorByteSize + 3) & ~3), - 1, - 1, - 1, - DXGI_FORMAT_UNKNOWN, - { 1, 0 }, - D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS - }; - - ComPtr d3dDevice; - THROW_IF_FAILED(m_provider->GetD3DDevice(d3dDevice.GetAddressOf())); - - THROW_IF_FAILED(d3dDevice->CreateCommittedResource( - &heapProperties, - D3D12_HEAP_FLAG_NONE, - &resourceDesc, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - nullptr, - IID_PPV_ARGS(buffer.GetAddressOf()) - )); - - THROW_IF_FAILED(m_provider->UploadToResource(buffer.Get(), tensorPtr, tensorByteSize)); - - return buffer; - } - - ComPtr CreateCpuResource(const std::byte* tensorPtr, size_t tensorByteSize) const - { - ComPtr buffer; - - D3D12_HEAP_PROPERTIES heapProperties = { - D3D12_HEAP_TYPE_CUSTOM, - D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE, - D3D12_MEMORY_POOL_L0, - 0, - 0 - }; - - D3D12_RESOURCE_DESC resourceDesc = { - D3D12_RESOURCE_DIMENSION_BUFFER, - 0, - static_cast((tensorByteSize + 3) & ~3), - 1, - 1, - 1, - DXGI_FORMAT_UNKNOWN, - { 1, 0 }, - D3D12_TEXTURE_LAYOUT_ROW_MAJOR, - D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS - }; - - ComPtr d3dDevice; - THROW_IF_FAILED(m_provider->GetD3DDevice(d3dDevice.GetAddressOf())); - - THROW_IF_FAILED(d3dDevice->CreateCommittedResource( - &heapProperties, - D3D12_HEAP_FLAG_NONE, - &resourceDesc, - D3D12_RESOURCE_STATE_UNORDERED_ACCESS, - nullptr, - IID_PPV_ARGS(buffer.GetAddressOf()) - )); - - // Map the buffer and copy the data - void* bufferData = nullptr; - D3D12_RANGE range = {0, tensorByteSize}; - THROW_IF_FAILED(buffer->Map(0, &range, &bufferData)); - memcpy(bufferData, tensorPtr, tensorByteSize); - buffer->Unmap(0, &range); - - return buffer; - } - ComPtr m_compiledExecutionPlanOperator; std::vector m_inputsUsed; const void* m_executionHandle = nullptr; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp new file mode 100644 index 0000000000..fa9792b08c --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.cpp @@ -0,0 +1,313 @@ +#include "precomp.h" + +#include "GraphKernelHelper.h" + +namespace Dml +{ +namespace GraphKernelHelper +{ + Microsoft::WRL::ComPtr + CreateResource( + Dml::IExecutionProvider* provider, + const std::byte* tensorPtr, + size_t tensorByteSize) + { + Microsoft::WRL::ComPtr buffer; + + D3D12_HEAP_PROPERTIES heapProperties = { + D3D12_HEAP_TYPE_DEFAULT, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0}; + + D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER, + 0, + static_cast((tensorByteSize + 3) & ~3), + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + {1, 0}, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; + + Microsoft::WRL::ComPtr d3dDevice; + THROW_IF_FAILED(provider->GetD3DDevice(d3dDevice.GetAddressOf())); + + THROW_IF_FAILED(d3dDevice->CreateCommittedResource( + &heapProperties, + D3D12_HEAP_FLAG_NONE, + &resourceDesc, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + nullptr, + IID_PPV_ARGS(buffer.GetAddressOf()))); + + THROW_IF_FAILED(provider->UploadToResource(buffer.Get(), tensorPtr, tensorByteSize)); + + return buffer; + } + + Microsoft::WRL::ComPtr + CreateCpuResource( + Dml::IExecutionProvider* provider, + const std::byte* tensorPtr, + size_t tensorByteSize) + { + Microsoft::WRL::ComPtr buffer; + + D3D12_HEAP_PROPERTIES heapProperties = { + D3D12_HEAP_TYPE_CUSTOM, D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE, D3D12_MEMORY_POOL_L0, 0, 0}; + + D3D12_RESOURCE_DESC resourceDesc = {D3D12_RESOURCE_DIMENSION_BUFFER, + 0, + static_cast((tensorByteSize + 3) & ~3), + 1, + 1, + 1, + DXGI_FORMAT_UNKNOWN, + {1, 0}, + D3D12_TEXTURE_LAYOUT_ROW_MAJOR, + D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS}; + + Microsoft::WRL::ComPtr d3dDevice; + THROW_IF_FAILED(provider->GetD3DDevice(d3dDevice.GetAddressOf())); + + THROW_IF_FAILED(d3dDevice->CreateCommittedResource( + &heapProperties, + D3D12_HEAP_FLAG_NONE, + &resourceDesc, + D3D12_RESOURCE_STATE_UNORDERED_ACCESS, + nullptr, + IID_PPV_ARGS(buffer.GetAddressOf()))); + + // Map the buffer and copy the data + void* bufferData = nullptr; + D3D12_RANGE range = {0, tensorByteSize}; + THROW_IF_FAILED(buffer->Map(0, &range, &bufferData)); + memcpy(bufferData, tensorPtr, tensorByteSize); + buffer->Unmap(0, &range); + + return buffer; + } + + void UnwrapTensor( + IWinmlExecutionProvider* winmlProvider, + const onnxruntime::Tensor* tensor, + ID3D12Resource** resource, + uint64_t* allocId) + { + IUnknown* allocationUnk = static_cast(const_cast(tensor->DataRaw())); + Microsoft::WRL::ComPtr resourceUnk; + winmlProvider->GetABIDataInterface(false, allocationUnk, &resourceUnk); + + *allocId = winmlProvider->TryGetPooledAllocationId(allocationUnk, 0); + + THROW_IF_FAILED(resourceUnk->QueryInterface(resource)); + } + + bool GetGraphInputConstness( + uint32_t index, + const onnxruntime::OpKernelInfo& kernelInfo, + const onnxruntime::ConstPointerContainer>& fusedNodeInputDefs, + const std::unordered_map& transferredInitializerMap) + { + // Transferred initializers are uploaded to GPU memory + auto iter = transferredInitializerMap.find(fusedNodeInputDefs[index]->Name()); + if (iter != transferredInitializerMap.end()) + { + return true; + } + + // If an initializer wasn't transferred, the constant input may be available from ORT + const onnxruntime::Tensor* inputTensor = nullptr; + if (!kernelInfo.TryGetConstantInput(index, &inputTensor) || inputTensor == nullptr) + { + return false; + } + + // Check that the constant ORT input is in GPU memory + if (!strcmp(inputTensor->Location().name, onnxruntime::CPU) || + inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUOutput || + inputTensor->Location().mem_type == ::OrtMemType::OrtMemTypeCPUInput) + { + return false; + } + + return true; + }; + + std::vector> PopulateInputBindings( + Dml::IExecutionProvider* provider, + IWinmlExecutionProvider* winmlProvider, + const std::vector& inputsConstant, + const onnxruntime::OpKernelInfo& kernelInfo, + const Dml::GraphDescBuilder::GraphDesc& graphDesc, + const onnxruntime::ConstPointerContainer>& fusedNodeInputDefs, + _Out_ std::vector& inputsUsed, + _Out_ std::vector& initInputBindings, + _Out_ std::vector>& initInputResources, + _Out_ std::vector>& nonOwnedGraphInputsFromInitializers, + _Out_ std::vector>& initializeResourceRefs, + _Inout_ std::unordered_map& transferredInitializerMap) + { + std::vector> inputRawData; + + const uint32_t graphInputCount = kernelInfo.GetInputCount(); + // Determine the last input which uses an initializer, so initializers can be freed incrementally + // while processing each input in order. + std::map initializerToLastInputIndexMap; + for (uint32_t i = 0; i < graphInputCount; i++) + { + auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name()); + if (iter != transferredInitializerMap.end()) { + initializerToLastInputIndexMap[&iter->second] = i; + } + } + + // Walk through each graph edge and mark used inputs + inputsUsed.assign(graphInputCount, false); + for (const DML_INPUT_GRAPH_EDGE_DESC& edge : graphDesc.inputEdges) { + inputsUsed[edge.GraphInputIndex] = true; + } + for (uint32_t i = 0; i < initInputBindings.size(); i++) + { + // If the input isn't actually used by the graph, nothing ever needs to be bound (either for + // initialization or execution). So just throw away the transferred initializer and skip this input. + if (!inputsUsed[i]) + { + transferredInitializerMap.erase(fusedNodeInputDefs[i]->Name()); + inputRawData.push_back(std::vector()); + continue; + } + + // Look for the initializer among those transferred from the graph during partitioning + auto iter = transferredInitializerMap.find(fusedNodeInputDefs[i]->Name()); + if (iter != transferredInitializerMap.end()) + { + std::byte* tensorPtr = nullptr; + size_t tensorByteSize = 0; + std::unique_ptr unpackedTensor; + + auto& initializer = iter->second; + + // The tensor may be stored as raw data or in typed fields. + if (initializer.has_raw_data()) + { + tensorPtr = (std::byte*)(initializer.raw_data().c_str()); + tensorByteSize = initializer.raw_data().size(); + } + else + { + std::tie(unpackedTensor, tensorByteSize) = UnpackTensor(initializer); + tensorPtr = unpackedTensor.get(); + } + + // Tensor sizes in DML must be a multiple of 4 bytes large. + tensorByteSize = AlignToPow2(tensorByteSize, 4); + + inputRawData.push_back(std::vector(tensorPtr, tensorPtr + tensorByteSize)); + + if (!inputsConstant[i]) + { + // Store the resource to use during execution + ComPtr defaultBuffer = CreateResource(provider, tensorPtr, tensorByteSize); + nonOwnedGraphInputsFromInitializers[i] = defaultBuffer; + initializeResourceRefs.push_back(std::move(defaultBuffer)); + } + else + { + ComPtr initializeInputBuffer; + + // D3D_FEATURE_LEVEL_1_0_CORE doesn't support Custom heaps + if (provider->IsMcdmDevice()) + { + initializeInputBuffer = CreateResource(provider, tensorPtr, tensorByteSize); + } + else + { + initializeInputBuffer = CreateCpuResource(provider, tensorPtr, tensorByteSize); + } + + // Set the binding for operator initialization to the buffer + initInputBindings[i].Buffer = initializeInputBuffer.Get(); + initInputBindings[i].SizeInBytes = tensorByteSize; + initializeResourceRefs.push_back(std::move(initializeInputBuffer)); + } + + // Free the initializer if this is the last usage of it. + if (initializerToLastInputIndexMap[&initializer] == i) + { + transferredInitializerMap.erase(iter); + } + } + else if (inputsConstant[i]) + { + const onnxruntime::Tensor* inputTensor = nullptr; + THROW_HR_IF(E_UNEXPECTED, !kernelInfo.TryGetConstantInput(i, &inputTensor)); + + const std::byte* tensorData = reinterpret_cast(inputTensor->DataRaw()); + inputRawData.push_back( + std::vector(tensorData, tensorData + inputTensor->SizeInBytes())); + + uint64_t allocId; + UnwrapTensor(winmlProvider, inputTensor, &initInputBindings[i].Buffer, &allocId); + initInputBindings[i].SizeInBytes = initInputBindings[i].Buffer->GetDesc().Width; + + initInputBindings[i].Buffer->Release(); // Avoid holding an additional reference + initInputResources.push_back(initInputBindings[i].Buffer); + } + else + { + inputRawData.push_back(std::vector()); + } + } + + // All initializers should have been consumed and freed above + assert(transferredInitializerMap.empty()); + return inputRawData; + } + + void ConvertGraphDesc( + const Dml::GraphDescBuilder::GraphDesc& graphDesc, + _Out_ DML_GRAPH_DESC& dmlGraphDesc, + const onnxruntime::OpKernelInfo& kernelInfo, + _Out_ std::vector& dmlOperatorGraphNodes, + _Out_ std::vector& dmlGraphNodes, + _Out_ std::vector& dmlInputEdges, + _Out_ std::vector& dmlOutputEdges, + _Out_ std::vector& dmlIntermediateEdges) + { + const uint32_t graphInputCount = kernelInfo.GetInputCount(); + + for (size_t i = 0; i < graphDesc.nodes.size(); ++i) + { + dmlOperatorGraphNodes[i] = DML_OPERATOR_GRAPH_NODE_DESC{graphDesc.nodes[i].op.Get()}; + dmlGraphNodes[i] = DML_GRAPH_NODE_DESC{DML_GRAPH_NODE_TYPE_OPERATOR, &dmlOperatorGraphNodes[i]}; + } + + for (size_t i = 0; i < graphDesc.inputEdges.size(); ++i) + { + dmlInputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INPUT, &graphDesc.inputEdges[i]}; + } + + for (size_t i = 0; i < graphDesc.outputEdges.size(); ++i) + { + dmlOutputEdges[i] = DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_OUTPUT, &graphDesc.outputEdges[i]}; + } + + for (size_t i = 0; i < graphDesc.intermediateEdges.size(); ++i) + { + dmlIntermediateEdges[i] = + DML_GRAPH_EDGE_DESC{DML_GRAPH_EDGE_TYPE_INTERMEDIATE, &graphDesc.intermediateEdges[i]}; + } + + dmlGraphDesc.InputCount = graphInputCount; + dmlGraphDesc.OutputCount = kernelInfo.GetOutputCount(); + dmlGraphDesc.NodeCount = gsl::narrow_cast(dmlGraphNodes.size()); + dmlGraphDesc.Nodes = dmlGraphNodes.data(); + dmlGraphDesc.InputEdgeCount = gsl::narrow_cast(dmlInputEdges.size()); + dmlGraphDesc.InputEdges = dmlInputEdges.data(); + dmlGraphDesc.OutputEdgeCount = gsl::narrow_cast(dmlOutputEdges.size()); + dmlGraphDesc.OutputEdges = dmlOutputEdges.data(); + dmlGraphDesc.IntermediateEdgeCount = gsl::narrow_cast(dmlIntermediateEdges.size()); + dmlGraphDesc.IntermediateEdges = dmlIntermediateEdges.data(); + } +} // namespace GraphKernelHelper +} // namespace Dml \ No newline at end of file diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h new file mode 100644 index 0000000000..b1b2e87cf8 --- /dev/null +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphKernelHelper.h @@ -0,0 +1,67 @@ +#include "GraphDescBuilder.h" + +namespace Dml +{ +namespace GraphKernelHelper +{ + using namespace Windows::AI::MachineLearning::Adapter; + + template + static T AlignToPow2(T offset, T alignment) + { + static_assert(std::is_unsigned_v); + assert(alignment != 0); + assert((alignment & (alignment - 1)) == 0); + return (offset + alignment - 1) & ~(alignment - 1); + } + + Microsoft::WRL::ComPtr + CreateResource( + Dml::IExecutionProvider* provider, + const std::byte* tensorPtr, + size_t tensorByteSize); + + Microsoft::WRL::ComPtr + CreateCpuResource( + Dml::IExecutionProvider* provider, + const std::byte* tensorPtr, + size_t tensorByteSize); + + void UnwrapTensor( + IWinmlExecutionProvider* winmlProvider, + const onnxruntime::Tensor* tensor, + ID3D12Resource** resource, + uint64_t* allocId); + + bool GetGraphInputConstness( + uint32_t index, + const onnxruntime::OpKernelInfo& kernelInfo, + const onnxruntime::ConstPointerContainer>& fusedNodeInputDefs, + const std::unordered_map& transferredInitializerMap); + + std::vector> PopulateInputBindings( + Dml::IExecutionProvider* provider, + IWinmlExecutionProvider* winmlProvider, + const std::vector& inputsConstant, + const onnxruntime::OpKernelInfo& kernelInfo, + const Dml::GraphDescBuilder::GraphDesc& graphDesc, + const onnxruntime::ConstPointerContainer>& fusedNodeInputDefs, + _Out_ std::vector& inputsUsed, + _Out_ std::vector& initInputBindings, + _Out_ std::vector>& initInputResources, + _Out_ std::vector>& nonOwnedGraphInputsFromInitializers, + _Out_ std::vector>& initializeResourceRefs, + _Inout_ std::unordered_map& transferredInitializerMap); + + void ConvertGraphDesc( + const Dml::GraphDescBuilder::GraphDesc& graphDesc, + _Out_ DML_GRAPH_DESC& dmlGraphDesc, + const onnxruntime::OpKernelInfo& kernelInfo, + _Out_ std::vector& dmlOperatorGraphNodes, + _Out_ std::vector& dmlGraphNodes, + _Out_ std::vector& dmlInputEdges, + _Out_ std::vector& dmlOutputEdges, + _Out_ std::vector& dmlIntermediateEdges); + +} // namespace GraphKernelHelper +} // namespace Dml \ No newline at end of file