onnxruntime/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp
Sheil Kumar 84c1340f9b
Refactor implementation of Tensor<T> and underlying buffer stores to improve binary size and maintainability (#5836)
* refactor tensor buffers to make cleaner

* refactor to make tensor backing buffer implementation smaller and cleaner

* missed virtual on destructor

* remove unnecessary static_pointer_cast

* add string vector accessor

Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
2020-11-18 14:56:47 -08:00

695 lines
No EOL
35 KiB
C++

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include <winmeta.h> // winmeta needed for TraceLoggingKeyword
#include <TraceLoggingProvider.h>
#include <TraceloggingConfig.h>
#include <evntrace.h>
#include <MemoryBuffer.h>
#include "inc/VideoFrameToTensorConverter.h"
#include "CpuTensorizer.h"
#include "inc/D3DDeviceCache.h"
#include "LearningModelDevice.h"
#include "EventTimer.h"
#include "robuffer.h"
#include "inc/DisjointBufferHelpers.h"
using namespace Microsoft::WRL;
using namespace Windows::Graphics::DirectX::Direct3D11;
using namespace _winml;
// RAII telemetry scope: emits a "Start" event on construction and a matching
// "Stop" event on destruction for the DX12-texture -> GPU-tensor conversion.
class DX12TextureToGPUTensorTelemetryEvent {
 public:
  DX12TextureToGPUTensorTelemetryEvent(const ImageTensorDescription& tensorDesc)
      : runtime_session_id_(telemetry_helper.GetRuntimeSessionId()) {
    TraceLoggingWrite(
      winml_trace_logging_provider,
      "DX12TextureToGPUTensorStart",
      TraceLoggingKeyword(WINML_PROVIDER_KEYWORD_DEFAULT),
      TraceLoggingHexInt32(tensorDesc.channelType, "Type"),
      TraceLoggingInt64(tensorDesc.sizes[2], "Height"),
      TraceLoggingInt64(tensorDesc.sizes[3], "Width"),
      TraceLoggingInt32(runtime_session_id_, "runtimeSessionId"),
      TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
      TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
      TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES));
  }
  ~DX12TextureToGPUTensorTelemetryEvent() {
    TraceLoggingWrite(
      winml_trace_logging_provider,
      "DX12TextureToGPUTensorStop",
      TraceLoggingKeyword(WINML_PROVIDER_KEYWORD_DEFAULT),
      TraceLoggingHexInt32(S_OK, "HRESULT"),
      TraceLoggingInt32(runtime_session_id_, "runtimeSessionId"),
      TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
      TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
      TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES));
  }

 private:
  // Session id captured at construction so Start/Stop events correlate.
  int runtime_session_id_;
};
// RAII telemetry scope: emits a "Start" event on construction and a matching
// "Stop" event on destruction for the SoftwareBitmap -> GPU-tensor conversion.
class SoftwareBitmapToGPUTensorTelemetryEvent {
 public:
  SoftwareBitmapToGPUTensorTelemetryEvent(const ImageTensorDescription& tensorDesc)
      : runtime_session_id_(telemetry_helper.GetRuntimeSessionId()) {
    TraceLoggingWrite(
      winml_trace_logging_provider,
      "SoftwareBitmapToGPUTensorStart",
      TraceLoggingKeyword(WINML_PROVIDER_KEYWORD_DEFAULT),
      TraceLoggingHexInt32(tensorDesc.channelType, "Type"),
      TraceLoggingInt64(tensorDesc.sizes[2], "Height"),
      TraceLoggingInt64(tensorDesc.sizes[3], "Width"),
      TraceLoggingInt32(runtime_session_id_, "runtimeSessionId"),
      TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
      TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
      TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES));
  }
  ~SoftwareBitmapToGPUTensorTelemetryEvent() {
    TraceLoggingWrite(
      winml_trace_logging_provider,
      "SoftwareBitmapToGPUTensorStop",
      TraceLoggingKeyword(WINML_PROVIDER_KEYWORD_DEFAULT),
      TraceLoggingHexInt32(S_OK, "HRESULT"),
      TraceLoggingInt32(runtime_session_id_, "runtimeSessionId"),
      TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
      TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
      TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES));
  }

 private:
  // Session id captured at construction so Start/Stop events correlate.
  int runtime_session_id_;
};
// RAII telemetry scope: emits a "Start" event on construction and a matching
// "Stop" event on destruction for the SoftwareBitmap -> CPU-tensor conversion.
class ConvertVideoFrameWithSoftwareBitmapToCPUTensorTelemetryEvent {
 public:
  ConvertVideoFrameWithSoftwareBitmapToCPUTensorTelemetryEvent(const ImageTensorDescription& tensorDesc)
      : runtime_session_id_(telemetry_helper.GetRuntimeSessionId()) {
    TraceLoggingWrite(
      winml_trace_logging_provider,
      "ConvertVideoFrameWithSoftwareBitmapToCPUTensorStart",
      TraceLoggingKeyword(WINML_PROVIDER_KEYWORD_DEFAULT),
      TraceLoggingHexInt32(tensorDesc.channelType, "Type"),
      TraceLoggingInt64(tensorDesc.sizes[2], "Height"),
      TraceLoggingInt64(tensorDesc.sizes[3], "Width"),
      TraceLoggingInt32(runtime_session_id_, "runtimeSessionId"),
      TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
      TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
      TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES));
  }
  ~ConvertVideoFrameWithSoftwareBitmapToCPUTensorTelemetryEvent() {
    TraceLoggingWrite(
      winml_trace_logging_provider,
      "ConvertVideoFrameWithSoftwareBitmapToCPUTensorStop",
      TraceLoggingKeyword(WINML_PROVIDER_KEYWORD_DEFAULT),
      TraceLoggingHexInt32(S_OK, "HRESULT"),
      TraceLoggingInt32(runtime_session_id_, "runtimeSessionId"),
      TelemetryPrivacyDataTag(PDT_ProductAndServiceUsage),
      TraceLoggingBool(true, "UTCReplace_AppSessionGuid"),
      TraceLoggingKeyword(MICROSOFT_KEYWORD_MEASURES));
  }

 private:
  // Session id captured at construction so Start/Stop events correlate.
  int runtime_session_id_;
};
// Tensorizes a VideoFrame into a caller-provided CPU buffer. GPU-backed frames
// and frames whose size/format do not match the tensor are first converted
// through a cached Bgra8 staging VideoFrame; matching CPU frames are
// tensorized directly.
void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor(
  _In_ const wm::IVideoFrame& inputVideoFrame,
  _In_ const wgi::BitmapBounds& inputBounds,
  _In_ const ImageTensorDescription& tensorDesc,
  _Out_ BYTE* pOutputCPUTensor) {
  CWinMLAutoLock lock(&lock_);

  // A VideoFrame is backed by exactly one of a SoftwareBitmap (CPU) or a
  // Direct3D surface (GPU); both-set or neither-set is an invalid frame.
  wgi::SoftwareBitmap inputBitmap = inputVideoFrame.SoftwareBitmap();
  wgdx::Direct3D11::IDirect3DSurface inputSurface = inputVideoFrame.Direct3DSurface();
  if ((inputBitmap == nullptr) == (inputSurface == nullptr)) {
    WINML_THROW_IF_FAILED(E_INVALIDARG);
  }

  const UINT32 tensorHeight = static_cast<UINT32>(tensorDesc.sizes[2]);
  const UINT32 tensorWidth = static_cast<UINT32>(tensorDesc.sizes[3]);

  const bool needsStagingConversion =
    inputSurface || _winmli::NeedsVideoFrameConversion(inputVideoFrame, {}, inputBounds, tensorWidth, tensorHeight);
  if (!needsStagingConversion) {
    // The CPU bitmap already matches the tensor dimensions/format; tensorize it as-is.
    ConvertSoftwareBitmapToCPUTensor(inputVideoFrame.SoftwareBitmap(), tensorDesc, inputBounds, pOutputCPUTensor);
    return;
  }

  const wgi::BitmapBounds targetBounds = {0, 0, tensorWidth, tensorHeight};

  // Lazily (re)create the cached staging frame when it is absent or no longer
  // matches the tensor dimensions.
  if (converted_video_frame_ == nullptr ||
      _winmli::NeedsVideoFrameConversion(converted_video_frame_, {}, targetBounds, tensorWidth, tensorHeight)) {
    converted_video_frame_ = wm::VideoFrame::CreateWithSoftwareBitmap(
      wgi::SoftwareBitmap(wgi::BitmapPixelFormat::Bgra8, tensorWidth, tensorHeight));
  }

  // Crop/scale/convert the input into the staging frame, then tensorize the
  // staging frame's bitmap over its full bounds.
  _winmli::ConvertVideoFrameToVideoFrame(
    inputVideoFrame,
    inputBounds,
    tensorWidth,
    tensorHeight,
    converted_video_frame_);
  ConvertSoftwareBitmapToCPUTensor(
    converted_video_frame_.SoftwareBitmap(),
    tensorDesc,
    targetBounds,
    pOutputCPUTensor);
}
// Opens a D3D11 texture on a D3D12 device via an NT shared handle and returns
// the resulting D3D12 resource.
ComPtr<ID3D12Resource> VideoFrameToTensorConverter::ShareD3D11Texture(ID3D11Texture2D* pTexture, ID3D12Device* pDevice)
{
  assert(pTexture != nullptr);
  assert(pDevice != nullptr);

  // Grab the DXGI sharing interface from the D3D11 texture.
  ComPtr<IDXGIResource1> dxgiResource;
  WINML_THROW_IF_FAILED(pTexture->QueryInterface(IID_PPV_ARGS(&dxgiResource)));

  // Create the shared handle and ensure it is closed on every exit path.
  HANDLE rawHandle = nullptr;
  WINML_THROW_IF_FAILED(dxgiResource->CreateSharedHandle(nullptr, GENERIC_ALL, nullptr, &rawHandle));
  wil::unique_handle sharedHandle(rawHandle);

  // Open the same texture on the D3D12 device; the runtime keeps the resource
  // alive after the handle itself is closed.
  ComPtr<ID3D12Resource> sharedResource;
  WINML_THROW_IF_FAILED(pDevice->OpenSharedHandle(sharedHandle.get(), IID_PPV_ARGS(&sharedResource)));

  // Record the handle *value* only; it is later compared against the value
  // cached on a video-frame texture to validate the cache, never dereferenced
  // as a live handle (wil::unique_handle closes it when this scope exits).
  shared_handle_ = sharedHandle.get();

  return sharedResource;
}
// Tensorizes a VideoFrame into the batched D3D12 output buffer at batchIdx.
// CPU (SoftwareBitmap) frames go through the upload-heap path; GPU
// (Direct3DSurface) frames are shared onto the session's D3D12 device and
// tensorized with a compute shader.
void VideoFrameToTensorConverter::VideoFrameToDX12Tensor(
_In_ const UINT32 batchIdx,
_In_ winml::LearningModelSession& session,
_In_ const wm::IVideoFrame& inputVideoFrame,
_In_ const wgi::BitmapBounds& inputBounds,
_In_ const ImageTensorDescription& tensorDesc,
_Inout_ ID3D12Resource* pOutputTensor) {
// Validate Tensor description
WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16, "Target tensor description must either be kImageTensorDataTypeFloat32, or kImageTensorDataTypeFloat16. %d was supplied.", tensorDesc.dataType);
WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeRGB8 || tensorDesc.sizes[1] == 3, "Target tensor description expects kImageTensorChannelTypeRGB8, but has %lld channels specified instead of 3.", tensorDesc.sizes[1]);
WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeBGR8 || tensorDesc.sizes[1] == 3, "Target tensor description expects kImageTensorChannelTypeBGR8, but has %lld channels specified instead of 3.", tensorDesc.sizes[1]);
WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeGRAY8 || tensorDesc.sizes[1] == 1, "Target tensor description expects kImageTensorChannelTypeGRAY8, but has %lld channels specified instead of 1.", tensorDesc.sizes[1]);
CWinMLAutoLock lock(&lock_);
auto device = session.Device().as<winmlp::LearningModelDevice>();
_winml::D3DDeviceCache* pDeviceCache = device->GetD3DDeviceCache();
wgdx::Direct3D11::IDirect3DSurface spDirect3DSurface = inputVideoFrame.Direct3DSurface();
if (inputVideoFrame.SoftwareBitmap()) {
// CPU-backed frame: upload + tensorize via the upload heap.
ConvertSoftwareBitmapToGPUTensor(batchIdx, inputVideoFrame, *pDeviceCache, inputBounds, tensorDesc, pOutputTensor);
} else if (spDirect3DSurface) {
ComPtr<ID3D11Texture2D> spVideoFrameTexture;
wgi::BitmapBounds scaledBounds = inputBounds;
// TODO: Scale during the tensorization phase instead of using the video frame pipeline when the input bounds are not the same size as the tensor
if (!_winmli::DirectXPixelFormatSupported(spDirect3DSurface.Description().Format) || static_cast<UINT>(inputBounds.Width) != tensorDesc.sizes[3] || static_cast<UINT>(inputBounds.Height) != tensorDesc.sizes[2]) {
// Force the VideoFrame to not do a conversion if the format is supported since we do it during the tensorization anyway
wgdx::DirectXPixelFormat newFormat = _winmli::DirectXPixelFormatSupported(spDirect3DSurface.Description().Format)
? spDirect3DSurface.Description().Format
: _winmli::GetDirectXPixelFormatFromChannelType(tensorDesc.channelType);
// Change the input bounds since the video frame pipeline already cropped the texture
scaledBounds = {0, 0, static_cast<uint32_t>(tensorDesc.sizes[3]), static_cast<uint32_t>(tensorDesc.sizes[2])};
// Use the Video Frame pipeline if we don't have our own converter for this color format
spVideoFrameTexture = CreateTextureFromUnsupportedColorFormat(inputVideoFrame, inputBounds, scaledBounds, newFormat);
} else {
// If the color format is known or the input widths are not smaller than the tensor desc, just use the video frame as is
spVideoFrameTexture = _winmli::GetTextureFromDirect3DSurface(spDirect3DSurface);
}
D3D11_TEXTURE2D_DESC videoFrameTextureDesc;
spVideoFrameTexture->GetDesc(&videoFrameTextureDesc);
if (_winmli::TextureIsOnDevice(spVideoFrameTexture.Get(), pDeviceCache->GetD3D11Device())) {
// The texture is on our device, so we can just create own texture, share it and cache it
if (!D3D11_cached_texture_) {
// First use: create the cached texture and share it with the D3D12 device.
WINML_THROW_IF_FAILED(pDeviceCache->GetD3D11Device()->CreateTexture2D(&videoFrameTextureDesc, nullptr, &D3D11_cached_texture_));
input_D3D12_resource_ = ShareD3D11Texture(D3D11_cached_texture_.Get(), pDeviceCache->GetD3D12Device());
} else {
D3D11_TEXTURE2D_DESC cachedTextureDesc;
D3D11_cached_texture_->GetDesc(&cachedTextureDesc);
if (cachedTextureDesc.Width != scaledBounds.Width || cachedTextureDesc.Height != scaledBounds.Height || cachedTextureDesc.Format != videoFrameTextureDesc.Format) {
// The dimensions or format don't match, so we need to re-create our texture
WINML_THROW_IF_FAILED(pDeviceCache->GetD3D11Device()->CreateTexture2D(&videoFrameTextureDesc, nullptr, &D3D11_cached_texture_));
input_D3D12_resource_ = ShareD3D11Texture(D3D11_cached_texture_.Get(), pDeviceCache->GetD3D12Device());
}
}
// Copy the (possibly cropped/converted) frame into the cached shared texture.
CopyTextureIntoTexture(spVideoFrameTexture.Get(), scaledBounds, D3D11_cached_texture_.Get());
} else {
// We are not on the same device, so we can't rely on our cached texture
ComPtr<ID3D11Device> spTextureDevice;
spVideoFrameTexture->GetDevice(&spTextureDevice);
ComPtr<ID3D11Texture2D> spSharedD3D11Texture;
HANDLE sharedHandle = nullptr;
UINT comPtrSize = static_cast<UINT>(sizeof(spSharedD3D11Texture.GetAddressOf()));
UINT handleSize = static_cast<UINT>(sizeof(sharedHandle));
// Cache validity requires both the shared-texture interface stored on the
// frame texture AND that its recorded handle value matches shared_handle_.
if ((FAILED(spVideoFrameTexture->GetPrivateData(d3d11_texture_GUID_, &comPtrSize, spSharedD3D11Texture.GetAddressOf())) || !spSharedD3D11Texture.Get()) || (FAILED(spVideoFrameTexture->GetPrivateData(handle_GUID_, &handleSize, &sharedHandle)) || sharedHandle != shared_handle_)) {
// Create a new shared texture that we cache on the video frame texture
WINML_THROW_IF_FAILED(spTextureDevice->CreateTexture2D(&videoFrameTextureDesc, nullptr, &spSharedD3D11Texture));
input_D3D12_resource_ = ShareD3D11Texture(spSharedD3D11Texture.Get(), pDeviceCache->GetD3D12Device());
// Cache the shared texture on the video frame texture in order to tie their lifetime together
WINML_THROW_IF_FAILED(spVideoFrameTexture->SetPrivateDataInterface(d3d11_texture_GUID_, spSharedD3D11Texture.Get()));
WINML_THROW_IF_FAILED(spVideoFrameTexture->SetPrivateData(handle_GUID_, sizeof(shared_handle_), &shared_handle_));
}
// Copy from the video frame texture to the shared texture
CopyTextureIntoTexture(spVideoFrameTexture.Get(), scaledBounds, spSharedD3D11Texture.Get());
}
// Sync to make sure that the D3D11 texture is done copying
SyncD3D11ToD3D12(*pDeviceCache, spVideoFrameTexture.Get());
// We cropped the texture, shared it and converted it to a known color format, so it's time to tensorize
// TODO: merge all videoframes to a single DX12Texture Resource before call ConvertDX12TextureToGPUTensor.
ConvertDX12TextureToGPUTensor(batchIdx, input_D3D12_resource_.Get(), *pDeviceCache, tensorDesc, pOutputTensor);
} else {
// Invalid video frame
WINML_THROW_IF_FAILED(E_INVALIDARG);
}
}
// Tensorizes a D3D12 texture into the batched output buffer at batchIdx using
// the tensorize compute shader. Validates the input texture format/size, the
// tensor description, and that the output buffer is a large-enough
// UAV-capable default-heap buffer on the same device, then records and
// submits the dispatch.
void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor(
  _In_ UINT32 batchIdx,
  _In_ ID3D12Resource* pInputResource,
  _In_ _winml::D3DDeviceCache& device_cache,
  _In_ const ImageTensorDescription& tensorDesc,
  _Inout_ ID3D12Resource* pOutputResource) {
  assert(pInputResource != nullptr);
  assert(pOutputResource != nullptr);
  CWinMLAutoLock lock(&lock_);
  D3D12_RESOURCE_DESC inputDesc = pInputResource->GetDesc();
  D3D12_RESOURCE_DESC outputDesc = pOutputResource->GetDesc();
  ComPtr<ID3D12Device> spDx12Device = device_cache.GetD3D12Device();
  // we're inside a lock from the caller of this function, so it's ok to use this static
  static EventTimer eventTimer;
  std::optional<DX12TextureToGPUTensorTelemetryEvent> telemetryLogger;
  if (eventTimer.Start()) {
    telemetryLogger.emplace(tensorDesc);
  }
  // Validate input description
  // BUGFIX: error messages corrected — the format message was garbled and the
  // width check previously reported "height ... Width is set to zero".
  WINML_THROW_HR_IF_FALSE_MSG(
    E_INVALIDARG,
    inputDesc.Format == DXGI_FORMAT_B8G8R8X8_UNORM || inputDesc.Format == DXGI_FORMAT_B8G8R8A8_UNORM || inputDesc.Format == DXGI_FORMAT_R8G8B8A8_UNORM || inputDesc.Format == DXGI_FORMAT_R8_UNORM,
    "Input image format was %d. Input image format must be Bgra8, Rgba8 or Gray8.",
    inputDesc.Format);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, inputDesc.Width != 0, "Invalid input image width provided. Width is set to zero.");
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, inputDesc.Height != 0, "Invalid input image height provided. Height is set to zero.");
  // Validate Tensor description
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16, "Target tensor description must either be kImageTensorDataTypeFloat32, or kImageTensorDataTypeFloat16. %d was supplied.", tensorDesc.dataType);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeRGB8 || tensorDesc.sizes[1] == 3, "Target tensor description expects kImageTensorChannelTypeRGB8, but has %lld channels specified instead of 3.", tensorDesc.sizes[1]);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeBGR8 || tensorDesc.sizes[1] == 3, "Target tensor description expects kImageTensorChannelTypeBGR8, but has %lld channels specified instead of 3.", tensorDesc.sizes[1]);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeGRAY8 || tensorDesc.sizes[1] == 1, "Target tensor description expects kImageTensorChannelTypeGRAY8, but has %lld channels specified instead of 1.", tensorDesc.sizes[1]);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.sizes[2] == inputDesc.Height, "Target tensor height (%lld) does not match input height (%d).", tensorDesc.sizes[2], inputDesc.Height);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.sizes[3] == (UINT)inputDesc.Width, "Target tensor width (%lld) does not match input width (%d).", tensorDesc.sizes[3], (UINT)inputDesc.Width);
  UINT uiTensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? sizeof(FLOAT) : sizeof(uint16_t);
  // Validate Tensor Resource
  {
    D3D12_HEAP_PROPERTIES outputHeapProperties;
    D3D12_HEAP_FLAGS outputHeapFlags;
    WINML_THROW_IF_FAILED(pOutputResource->GetHeapProperties(&outputHeapProperties, &outputHeapFlags));
    // Overflow-checked element count across all tensor dimensions.
    UINT64 ullNumElementsTensor = 1;
    for (UINT uiIdx = 0; uiIdx < kImageTensorDimensionCountMax; uiIdx++) {
      WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, tensorDesc.sizes[uiIdx], &ullNumElementsTensor));
    }
    if (ullNumElementsTensor > UINT_MAX) {
      WINML_THROW_IF_FAILED(E_INVALIDARG);
    }
    UINT64 ullTensorSize = 0;
    WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize));
    // The output must be a UAV-capable default-heap buffer big enough for the tensor.
    if (outputDesc.Width < ullTensorSize ||
        outputDesc.Height != 1 ||
        outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER ||
        !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) ||
        outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) {
      WINML_THROW_IF_FAILED(E_INVALIDARG);
    }
  }
  {
    ComPtr<ID3D12Device> spDx12DeviceIn, spDx12DeviceOut;
    WINML_THROW_IF_FAILED(pInputResource->GetDevice(IID_PPV_ARGS(&spDx12DeviceIn)));
    WINML_THROW_IF_FAILED(pOutputResource->GetDevice(IID_PPV_ARGS(&spDx12DeviceOut)));
    if (spDx12Device != spDx12DeviceIn || spDx12Device != spDx12DeviceOut) {
      // Both input and output should have the same device
      WINML_THROW_IF_FAILED(E_INVALIDARG);
    }
  }
  // Create descriptor heaps.
  UINT srvUavDescriptorSize = spDx12Device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV);
  if (descriptor_heap_ == nullptr) {
    // Describe and create a shader resource view (SRV) and unordered access view (UAV) descriptor heap.
    D3D12_DESCRIPTOR_HEAP_DESC srvUavHeapDesc = {};
    srvUavHeapDesc.NumDescriptors = DescriptorCount;
    srvUavHeapDesc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV;
    srvUavHeapDesc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_SHADER_VISIBLE;
    WINML_THROW_IF_FAILED(spDx12Device->CreateDescriptorHeap(&srvUavHeapDesc, IID_PPV_ARGS(&descriptor_heap_)));
    descriptor_heap_->SetName(L"Tensorize Descriptor Heap");
  }
  // Create SRV and UAV for input and output respectively
  {
    D3D12_SHADER_RESOURCE_VIEW_DESC srvDesc = {};
    srvDesc.Shader4ComponentMapping = D3D12_DEFAULT_SHADER_4_COMPONENT_MAPPING;
    srvDesc.Format = inputDesc.Format;
    srvDesc.ViewDimension = D3D12_SRV_DIMENSION_TEXTURE2D;
    srvDesc.Texture2D.MipLevels = 1;
    CD3DX12_CPU_DESCRIPTOR_HANDLE srvHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize);
    spDx12Device->CreateShaderResourceView(pInputResource, &srvDesc, srvHandle);
    // The UAV addresses this batch item's slice of the batched output buffer.
    D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = CreateUAVDescription(batchIdx, outputDesc, tensorDesc);
    CD3DX12_CPU_DESCRIPTOR_HANDLE uavHandle(descriptor_heap_->GetCPUDescriptorHandleForHeapStart(), UavBufferIdx, srvUavDescriptorSize);
    spDx12Device->CreateUnorderedAccessView(pOutputResource, nullptr, &uavDesc, uavHandle);
  }
  //
  // Pipeline setup for shader operation
  //
  PipelineStateCacheType type = PipelineStateCacheType::kFloat32;
  if (tensorDesc.dataType == kImageTensorDataTypeFloat16) {
    type = PipelineStateCacheType::kFloat16;
  }
  // Set the origin format
  PipelineStateCacheFormat formatFrom = PipelineStateCacheFormat::kBGR8;
  if (inputDesc.Format == DXGI_FORMAT_R8G8B8A8_UNORM) {
    formatFrom = PipelineStateCacheFormat::kRGB8;
  } else if (inputDesc.Format == DXGI_FORMAT_R8_UNORM) {
    formatFrom = PipelineStateCacheFormat::kGRAY8;
  }
  // Set the destination format
  PipelineStateCacheFormat formatTo = PipelineStateCacheFormat::kBGR8;
  if (tensorDesc.channelType == kImageTensorChannelTypeRGB8) {
    formatTo = PipelineStateCacheFormat::kRGB8;
  } else if (tensorDesc.channelType == kImageTensorChannelTypeGRAY8) {
    formatTo = PipelineStateCacheFormat::kGRAY8;
  }
  root_signature_ = device_cache.GetTensorizeRootSignature();
  pipeline_state_ = device_cache.GetCachedPipelineState(type, formatFrom, formatTo, PipelineStateCacheOperation::kTensorize);
  ResetCommandList(device_cache);
  // Write compute commands into the command list and put it into the queue.
  {
    command_list_->SetComputeRootSignature(root_signature_.Get());
    ID3D12DescriptorHeap* ppHeaps[] = {descriptor_heap_.Get()};
    command_list_->SetDescriptorHeaps(_countof(ppHeaps), ppHeaps);
    // This code currently re-uses the same decriptors each execution, which is unsafe if previous executions are in flight.
    if (fence_completion_value_ > 0)
    {
      device_cache.WaitForFenceValue(fence_completion_value_);
    }
    CD3DX12_GPU_DESCRIPTOR_HANDLE srvHandle(descriptor_heap_->GetGPUDescriptorHandleForHeapStart(), SrvBufferIdx, srvUavDescriptorSize);
    CD3DX12_GPU_DESCRIPTOR_HANDLE uavHandle(descriptor_heap_->GetGPUDescriptorHandleForHeapStart(), UavBufferIdx, srvUavDescriptorSize);
    {
      // Root constants carry the source image dimensions to the shader.
      ConstantBufferCS constantBufferCS = {};
      constantBufferCS.height = inputDesc.Height;
      constantBufferCS.width = (UINT)inputDesc.Width;
      command_list_->SetComputeRoot32BitConstants(0, 2, &constantBufferCS, 0);
    }
    command_list_->SetComputeRootDescriptorTable(1, srvHandle);
    command_list_->SetComputeRootDescriptorTable(2, uavHandle);
    // Thread-group size is 16x4; round dispatch dimensions up.
    UINT64 dispatchWidth = (inputDesc.Width - 1) / 16 + 1;
    UINT64 dispatchHeight = (inputDesc.Height - 1) / 4 + 1;
    command_list_->Dispatch(static_cast<uint32_t>(dispatchWidth), static_cast<uint32_t>(dispatchHeight), 1);
    WINML_THROW_IF_FAILED(command_list_->Close());
    ID3D12CommandList* pComputeToGPUCLs[] = {command_list_.Get()};
    device_cache.GetCommandQueue()->ExecuteCommandLists(ARRAYSIZE(pComputeToGPUCLs), pComputeToGPUCLs);
    // Remember the fence value so the next call can wait before re-using descriptors.
    fence_completion_value_ = device_cache.QueueFenceToD3D12();
  }
}
// Tensorizes a CPU SoftwareBitmap-backed VideoFrame into the batched D3D12
// output buffer at batchIdx: converts/scales the bitmap on the CPU if needed,
// tensorizes it directly into a mapped upload heap, then records a copy into
// the output resource.
void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor(
  _In_ UINT32 batchIdx,
  _In_ const wm::IVideoFrame& videoFrame,
  _In_ _winml::D3DDeviceCache& device_cache,
  _In_ const wgi::BitmapBounds& inputBounds,
  _In_ const ImageTensorDescription& tensorDesc,
  _Inout_ ID3D12Resource* pOutputResource) {
  assert(pOutputResource != nullptr);
  assert(videoFrame.SoftwareBitmap() != nullptr);
  // we're inside a lock from the caller of this function, so it's ok to use this static
  static EventTimer eventTimer;
  std::optional<SoftwareBitmapToGPUTensorTelemetryEvent> telemetryLogger;
  if (eventTimer.Start()) {
    telemetryLogger.emplace(tensorDesc);
  }
  wgi::SoftwareBitmap convertedSoftwareBitmap = nullptr;
  wgi::BitmapBounds scaledBounds = inputBounds;
  // TODO: Scale during the tensorization phase instead of using the video frame pipeline when the input bounds are not the same size as the tensor
  if (static_cast<UINT>(inputBounds.Width) != tensorDesc.sizes[3] || static_cast<UINT>(inputBounds.Height) != tensorDesc.sizes[2]) {
    scaledBounds = {0, 0, static_cast<uint32_t>(tensorDesc.sizes[3]), static_cast<uint32_t>(tensorDesc.sizes[2])};
    // Force the VideoFrame to not do a conversion if the format is supported since we do it during the tensorization anyway
    wgi::BitmapPixelFormat newPixelFormat = _winmli::SoftwareBitmapFormatSupported(videoFrame.SoftwareBitmap())
      ? videoFrame.SoftwareBitmap().BitmapPixelFormat()
      : _winmli::GetBitmapPixelFormatFromChannelType(tensorDesc.channelType);
    convertedSoftwareBitmap = wgi::SoftwareBitmap(newPixelFormat, static_cast<int32_t>(tensorDesc.sizes[3]), static_cast<int32_t>(tensorDesc.sizes[2]));
    wm::VideoFrame convertedVideoFrame = wm::VideoFrame::CreateWithSoftwareBitmap(convertedSoftwareBitmap);
    // Blocking crop/scale through the VideoFrame pipeline.
    videoFrame.as<wm::IVideoFrame2>().CopyToAsync(convertedVideoFrame, inputBounds, scaledBounds).get();
    convertedSoftwareBitmap = convertedVideoFrame.SoftwareBitmap();
  } else if (!_winmli::SoftwareBitmapFormatSupported(videoFrame.SoftwareBitmap())) {
    // Right size but an unsupported pixel format: convert in place on the CPU.
    convertedSoftwareBitmap = wgi::SoftwareBitmap::Convert(videoFrame.SoftwareBitmap(), _winmli::GetBitmapPixelFormatFromChannelType(tensorDesc.channelType));
  } else {
    // We don't need a conversion
    convertedSoftwareBitmap = videoFrame.SoftwareBitmap();
  }
  assert(convertedSoftwareBitmap != nullptr);
  D3D12_RESOURCE_DESC outputDesc = pOutputResource->GetDesc();
  uint32_t tensorElementSize = tensorDesc.dataType == kImageTensorDataTypeFloat32 ? 4 : 2;
  uint32_t bufferSize = static_cast<uint32_t>(tensorDesc.sizes[1] * tensorDesc.sizes[2] * tensorDesc.sizes[3] * tensorElementSize);
  // TODO: Make an allocator for upload heaps
  if (!upload_heap_ || upload_heap_->GetDesc().Width < bufferSize) {
    WINML_THROW_IF_FAILED(device_cache.GetD3D12Device()->CreateCommittedResource(
      &CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD),
      D3D12_HEAP_FLAG_NONE,
      &CD3DX12_RESOURCE_DESC::Buffer(bufferSize),
      D3D12_RESOURCE_STATE_GENERIC_READ,
      nullptr,
      IID_PPV_ARGS(&upload_heap_)));
  }
  void* pCPUTensorBuffer = nullptr;
  WINML_THROW_IF_FAILED(upload_heap_->Map(0, &CD3DX12_RANGE(0, 0), &pCPUTensorBuffer));
  // We avoid the Video Frame pipeline by manually sending the CPU data to the GPU, and we tensorize while we are filling the
  // upload heap. The image may already have been cropped/scaled by the video frame pipeline, so we send the scaled bounds
  // instead of the initial input bounds
  ConvertSoftwareBitmapToCPUTensor(convertedSoftwareBitmap, tensorDesc, scaledBounds, pCPUTensorBuffer);
  upload_heap_->Unmap(0, &CD3DX12_RANGE(0, bufferSize));
  ResetCommandList(device_cache);
  auto barrier = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST);
  command_list_->ResourceBarrier(1, &barrier);
  command_list_->CopyBufferRegion(pOutputResource, bufferSize * batchIdx, upload_heap_.Get(), 0, bufferSize);
  // BUGFIX: transition the output back to UNORDERED_ACCESS after the copy.
  // Previously the resource was left in COPY_DEST, which is inconsistent with
  // ConvertBuffersToBatchedGPUTensor and invalid for a subsequent batched call
  // (whose barrier again declares a from-state of UNORDERED_ACCESS) or for the
  // compute/DML work that consumes the buffer as a UAV.
  auto barrierBack = CD3DX12_RESOURCE_BARRIER::Transition(pOutputResource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
  command_list_->ResourceBarrier(1, &barrierBack);
  WINML_THROW_IF_FAILED(command_list_->Close());
  ID3D12CommandList* ppCommandLists[] = {command_list_.Get()};
  device_cache.GetCommandQueue()->ExecuteCommandLists(_countof(ppCommandLists), ppCommandLists);
}
// Copies a set of (possibly disjoint) CPU buffers, already laid out as tensor
// data, into a single batched GPU buffer via the cached upload heap, then
// transitions the output to UNORDERED_ACCESS for downstream compute use.
void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor(
const std::vector<wss::IBuffer>& buffers,
size_t buffer_size_in_bytes,
_winml::D3DDeviceCache& device_cache,
ID3D12Resource* output_resource) {
// Copy the cpu memory into the gpu resource
// Grow-only upload heap cache: recreate only when too small.
if (!upload_heap_ || upload_heap_->GetDesc().Width < buffer_size_in_bytes) {
WINML_THROW_IF_FAILED(device_cache.GetD3D12Device()->CreateCommittedResource(
&CD3DX12_HEAP_PROPERTIES(D3D12_HEAP_TYPE_UPLOAD),
D3D12_HEAP_FLAG_NONE,
&CD3DX12_RESOURCE_DESC::Buffer(buffer_size_in_bytes),
D3D12_RESOURCE_STATE_GENERIC_READ,
nullptr,
IID_PPV_ARGS(&upload_heap_)));
}
byte* gpu_buffer = nullptr;
// Empty read range (0,0): CPU will only write to the mapped pointer.
WINML_THROW_IF_FAILED(upload_heap_->Map(0, &CD3DX12_RANGE(0, 0), reinterpret_cast<void**>(&gpu_buffer)));
auto gpu_buffer_span = gsl::span<byte>(gpu_buffer, buffer_size_in_bytes);
// Gather the source buffers contiguously into the mapped upload heap; the
// callback exposes each IBuffer's raw bytes (up to its Capacity) as a span.
_winml::LoadSpanFromDisjointBuffers(
buffers.size(),
[&](size_t i) {
byte* buffer_start = nullptr;
auto byte_access = buffers[i].as<Windows::Storage::Streams::IBufferByteAccess>();
byte_access->Buffer(&buffer_start);
return gsl::span<byte>(buffer_start, static_cast<size_t>(buffers[i].Capacity()));
},
gpu_buffer_span);
// Written range tells the runtime how many bytes the CPU modified.
upload_heap_->Unmap(0, &CD3DX12_RANGE(0, buffer_size_in_bytes));
ResetCommandList(device_cache);
// COMMON -> COPY_DEST for the upload copy, then COPY_DEST -> UNORDERED_ACCESS
// so the batched tensor can be consumed as a UAV.
auto barrier1 = CD3DX12_RESOURCE_BARRIER::Transition(output_resource, D3D12_RESOURCE_STATE_COMMON, D3D12_RESOURCE_STATE_COPY_DEST);
command_list_->ResourceBarrier(1, &barrier1);
command_list_->CopyBufferRegion(output_resource, 0, upload_heap_.Get(), 0, buffer_size_in_bytes);
auto barrier2 = CD3DX12_RESOURCE_BARRIER::Transition(output_resource, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
command_list_->ResourceBarrier(1, &barrier2);
WINML_THROW_IF_FAILED(command_list_->Close());
ID3D12CommandList* lists[] = {command_list_.Get()};
device_cache.GetCommandQueue()->ExecuteCommandLists(_countof(lists), lists);
}
// Builds the UAV description for one image's slice of the batched output
// tensor buffer. fp32 is exposed as a structured buffer, fp16 as a typed
// R16_FLOAT buffer; any other data type throws E_INVALIDARG.
D3D12_UNORDERED_ACCESS_VIEW_DESC VideoFrameToTensorConverter::CreateUAVDescription(
  const UINT32 batchIdx,
  const D3D12_RESOURCE_DESC& resourceDesc,
  const _winml::ImageTensorDescription& desc) {
  const UINT elementSize =
    desc.dataType == kImageTensorDataTypeFloat32 ? sizeof(UINT) : sizeof(uint16_t);
  // Elements in a single image: channels * height * width.
  const UINT elementsPerImage = static_cast<UINT>(desc.sizes[1] * desc.sizes[2] * desc.sizes[3]);

  D3D12_UNORDERED_ACCESS_VIEW_DESC uavDesc = {};
  uavDesc.ViewDimension = D3D12_UAV_DIMENSION_BUFFER;
  // Seek past the preceding images in the batch.
  uavDesc.Buffer.FirstElement = batchIdx * desc.sizes[1] * desc.sizes[2] * desc.sizes[3];
  uavDesc.Buffer.NumElements = elementsPerImage;
  uavDesc.Buffer.CounterOffsetInBytes = 0;
  uavDesc.Buffer.Flags = D3D12_BUFFER_UAV_FLAG_NONE;

  if (desc.dataType == kImageTensorDataTypeFloat32) {
    // fp32 uses a structured buffer: the format stays UNKNOWN and the stride
    // carries the element size.
    uavDesc.Format = DXGI_FORMAT_UNKNOWN;
    uavDesc.Buffer.StructureByteStride = elementSize;
  } else if (desc.dataType == kImageTensorDataTypeFloat16) {
    // fp16 uses a typed (unstructured) buffer because structured buffers don't
    // support fp16 on most hardware: the format must be a concrete known
    // format and the stride must be zero.
    uavDesc.Format = DXGI_FORMAT_R16_FLOAT;
    uavDesc.Buffer.StructureByteStride = 0;
  } else {
    WINML_THROW_HR_IF_FALSE_MSG(
      E_INVALIDARG,
      false,
      "Tensorization conversion is only supported to kImageTensorDataTypeFloat32, or kImageTensorDataTypeFloat16.");
  }

  return uavDesc;
}
// Tensorizes a SoftwareBitmap (Bgra8/Rgba8/Gray8) into a caller-provided CPU
// buffer as fp32 or fp16, cropping to inputBounds. The bitmap is locked
// read-only and its raw bytes handed to CpuTensorizer.
void VideoFrameToTensorConverter::ConvertSoftwareBitmapToCPUTensor(
  _In_ const wgi::SoftwareBitmap& softwareBitmap,
  _In_ const _winml::ImageTensorDescription& tensorDesc,
  _In_ const wgi::BitmapBounds& inputBounds,
  _Inout_ void* pCPUTensor) {
  assert(softwareBitmap != nullptr);
  // we're inside a lock from the caller of this function, so it's ok to use this static
  static EventTimer eventTimer;
  std::optional<ConvertVideoFrameWithSoftwareBitmapToCPUTensorTelemetryEvent> telemetryLogger;
  if (eventTimer.Start()) {
    telemetryLogger.emplace(tensorDesc);
  }
  auto height = softwareBitmap.PixelHeight();
  auto width = softwareBitmap.PixelWidth();
  auto format = softwareBitmap.BitmapPixelFormat();
  // Validate input description
  // BUGFIX: error messages corrected — the format message was garbled and the
  // width check previously reported "Height is set to zero".
  WINML_THROW_HR_IF_FALSE_MSG(
    E_INVALIDARG,
    format == wgi::BitmapPixelFormat::Bgra8 || format == wgi::BitmapPixelFormat::Rgba8 || format == wgi::BitmapPixelFormat::Gray8,
    "Input image format was %d. Input image format must be Bgra8, Rgba8 or Gray8.",
    format);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, height > 0, "Invalid input image height provided. Height is set to zero.");
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, width > 0, "Invalid input image width provided. Width is set to zero.");
  // Validate Tensor description
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.dataType == kImageTensorDataTypeFloat32 || tensorDesc.dataType == kImageTensorDataTypeFloat16, "Target tensor description must either be kImageTensorDataTypeFloat32, or kImageTensorDataTypeFloat16. %d was supplied.", tensorDesc.dataType);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeRGB8 || tensorDesc.sizes[1] == 3, "Target tensor description expects kImageTensorChannelTypeRGB8, but has %lld channels specified instead of 3.", tensorDesc.sizes[1]);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeBGR8 || tensorDesc.sizes[1] == 3, "Target tensor description expects kImageTensorChannelTypeBGR8, but has %lld channels specified instead of 3.", tensorDesc.sizes[1]);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.channelType != kImageTensorChannelTypeGRAY8 || tensorDesc.sizes[1] == 1, "Target tensor description expects kImageTensorChannelTypeGRAY8, but has %lld channels specified instead of 1.", tensorDesc.sizes[1]);
  WINML_THROW_HR_IF_FALSE_MSG(
    E_INVALIDARG,
    tensorDesc.channelType == kImageTensorChannelTypeGRAY8 ||
    tensorDesc.channelType == kImageTensorChannelTypeBGR8 ||
    tensorDesc.channelType == kImageTensorChannelTypeRGB8,
    "Target tensor description expects kImageTensorChannelTypeGRAY8, kImageTensorChannelTypeBGR8, or kImageTensorChannelTypeRGB8 but has %d was specified.",
    tensorDesc.channelType);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.sizes[2] == (UINT)inputBounds.Height, "Target tensor height (%lld) does not match input height (%d).", tensorDesc.sizes[2], (UINT)inputBounds.Height);
  WINML_THROW_HR_IF_FALSE_MSG(E_INVALIDARG, tensorDesc.sizes[3] == (UINT)inputBounds.Width, "Target tensor width (%lld) does not match input width (%d).", tensorDesc.sizes[3], (UINT)inputBounds.Width);
  // get the byte buffer out of a softwarebitmap
  BYTE* pData = nullptr;
  UINT32 bufferSize = 0;
  wgi::BitmapBuffer spBitmapBuffer(softwareBitmap.LockBuffer(wgi::BitmapBufferAccessMode::Read));
  wf::IMemoryBufferReference reference = spBitmapBuffer.CreateReference();
  auto spByteAccess = reference.as<Windows::Foundation::IMemoryBufferByteAccess>();
  WINML_THROW_IF_FAILED(spByteAccess->GetBuffer(&pData, &bufferSize));
  // Row stride in bytes, derived from the locked buffer size.
  UINT32 bufferWidth = bufferSize / height;
  ImageTensorChannelType channelType = _winmli::GetChannelTypeFromSoftwareBitmap(softwareBitmap);
  if (tensorDesc.dataType == _winml::kImageTensorDataTypeFloat32) {
    WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData<float>(
      channelType,
      tensorDesc.channelType,
      tensorDesc.pixelRange,
      pData,
      bufferWidth,
      inputBounds,
      reinterpret_cast<float*>(pCPUTensor)));
  } else if (tensorDesc.dataType == _winml::kImageTensorDataTypeFloat16) {
    WINML_THROW_IF_FAILED(CpuTensorizer::TensorizeData<DirectX::PackedVector::HALF>(
      channelType,
      tensorDesc.channelType,
      tensorDesc.pixelRange,
      pData,
      bufferWidth,
      inputBounds,
      reinterpret_cast<DirectX::PackedVector::HALF*>(pCPUTensor)));
  }
}