// onnxruntime/winml/lib/Api/impl/TensorBuffer.h
// From: Enable direct tensorization and detensorization to many buffers in WinML (#5791)

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once

#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"

#include "DisjointBufferHelpers.h"

namespace _winml {
// An IBuffer implementation backed by a std::vector. Note that Length() is
// left unimplemented; callers read the backing size through Capacity().
class VectorBuffer : public winrt::implements<
                         VectorBuffer,
                         wss::IBuffer,
                         Windows::Storage::Streams::IBufferByteAccess> {
 public:
  VectorBuffer(size_t size) : buffer_(size) {}

  uint32_t Capacity() const {
    return static_cast<uint32_t>(buffer_.size());
  }

  uint32_t Length() const {
    throw winrt::hresult_error(E_NOTIMPL);
  }

  void Length(uint32_t /*value*/) {
    throw winrt::hresult_error(E_NOTIMPL);
  }

  STDMETHOD(Buffer)
  (uint8_t** value) {
    RETURN_HR_IF_NULL(E_POINTER, value);
    *value = buffer_.data();
    return S_OK;
  }

 private:
  std::vector<BYTE> buffer_;
};
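// Illustrative use of VectorBuffer (a minimal sketch, not part of this header):
//   wss::IBuffer buffer = winrt::make<VectorBuffer>(256);
//   uint8_t* data = nullptr;
//   buffer.as<Windows::Storage::Streams::IBufferByteAccess>()->Buffer(&data);
//   // `data` now addresses 256 zero-initialized bytes owned by the vector.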
template <typename T>
class TensorBuffer {
  wss::IBuffer combined_buffer_;
  std::vector<wss::IBuffer> buffers_;
  size_t size_;

  TensorBuffer(size_t size) : size_(size),
                              combined_buffer_(winrt::make<VectorBuffer>(size * sizeof(T))),
                              buffers_{combined_buffer_} {
    auto buffer = BufferAt(0);

    // The initial release of WinML (RS5) shipped with behavior that would
    // zero-initialize uninitialized tensors. After measuring, the performance impact
    // of memsetting the memory buffer is quite small (<1ms for 3-channel 720x720 TensorFloats).
    // To maintain parity with RS5 behavior, we always zero out the memory buffer.
    memset(buffer.data(), 0, buffer.size_bytes());
  }

  // Wrap a single caller-provided buffer, which then doubles as the combined buffer.
  // (This overload is required by Create(size_t, wss::IBuffer) below.)
  TensorBuffer(size_t size, wss::IBuffer buffer) : size_(size),
                                                   combined_buffer_(buffer),
                                                   buffers_{buffer} {}

  TensorBuffer(
      size_t size,
      wfc::IIterable<wss::IBuffer> const& buffers) : size_(size),
                                                     combined_buffer_(nullptr),
                                                     buffers_(begin(buffers), end(buffers)) {
    if (buffers_.size() == 1) {
      combined_buffer_ = buffers_[0];
    } else {
      // When there are many buffers, the combined buffer is a separately allocated
      // value that concatenates all of them. It is created lazily, since the extra
      // memory should not be allocated when it is not needed (for example, on the GPU path).
    }
  }
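  // Invariant: buffers_ always holds the backing views; combined_buffer_ is the
  // single backing buffer when there is exactly one, and otherwise stays null
  // until CombinedBuffer() lazily allocates a contiguous staging copy.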
  // Lazily allocates the contiguous combined buffer on first use.
  auto CombinedBuffer() {
    if (combined_buffer_ == nullptr) {
      combined_buffer_ = winrt::make<VectorBuffer>(size_ * sizeof(T));
    }
    return BufferFrom(combined_buffer_);
  }
 public:
  static auto Create(size_t size) {
    return std::shared_ptr<TensorBuffer>(new TensorBuffer(size));
  }

  static auto Create(
      size_t size,
      wss::IBuffer buffer) {
    return std::shared_ptr<TensorBuffer>(new TensorBuffer(size, buffer));
  }

  static auto Create(
      size_t size,
      wfc::IIterable<wss::IBuffer> const& buffers) {
    return std::shared_ptr<TensorBuffer>(new TensorBuffer(size, buffers));
  }
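  // Illustrative creation paths (a minimal sketch; the variable names are hypothetical):
  //   auto owned    = TensorBuffer<float>::Create(count);                   // self-allocated, zeroed
  //   auto wrapped  = TensorBuffer<float>::Create(count, caller_buffer);    // wraps one IBuffer
  //   auto disjoint = TensorBuffer<float>::Create(count, buffer_iterable);  // spans many IBuffers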
  auto NumElements() {
    return size_;
  }

  auto SizeInBytes() {
    return size_ * sizeof(T);
  }

  auto NumBuffers() {
    return buffers_.size();
  }

  auto& Buffers() {
    return buffers_;
  }
  auto Buffer(bool should_sync_buffer) {
    if (buffers_.size() == 1) {
      // Single-buffer fast path: no temporary buffer is needed to concatenate
      // disjoint buffers into one.
      return BufferAt(0);
    }

    auto span = CombinedBuffer();
    if (should_sync_buffer) {
      // Gather the disjoint backing buffers into the combined buffer so the
      // caller sees one contiguous view of the tensor data.
      _winml::LoadOrStoreDisjointBuffers(
          true /*load buffer*/,
          buffers_.size(),
          [this](size_t i) { return BufferAt(i); },
          span);
    }
    return span;
  }
  // Scatters the combined buffer back out to the disjoint backing buffers.
  // Returns true when a write-back was actually performed.
  auto Flush() {
    auto should_flush = buffers_.size() != 1;
    if (should_flush) {
      auto span = CombinedBuffer();
      _winml::LoadOrStoreDisjointBuffers(
          false /*store buffer*/,
          buffers_.size(),
          [this](size_t i) { return BufferAt(i); },
          span);
    }
    return should_flush;
  }
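  // Round-trip sketch for a tensor backed by many buffers (hypothetical usage):
  //   auto span = tensor_buffer->Buffer(true /*should_sync_buffer*/);  // gather into one view
  //   /* read or mutate span in place */
  //   tensor_buffer->Flush();                                          // scatter changes back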
  auto Set(size_t size_in_bytes, const T* data) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        size_in_bytes <= (size_ * sizeof(T)),
        "Argument size (%llu) exceeds the tensor size (%llu).",
        static_cast<uint64_t>(size_in_bytes),
        static_cast<uint64_t>(size_ * sizeof(T)));

    // Scatter the source bytes directly across the disjoint backing buffers.
    gsl::span<byte> span(reinterpret_cast<byte*>(const_cast<T*>(data)), size_in_bytes);
    _winml::LoadOrStoreDisjointBuffers(
        false /*store buffer*/,
        buffers_.size(),
        [this](size_t i) { return BufferAt(i); },
        span);
  }

  auto Set(std::vector<T>&& moveableData) {
    Set(moveableData.size() * sizeof(T), moveableData.data());
  }
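  // Note: despite taking an rvalue, the overload above copies the bytes rather
  // than adopting the vector's storage; the destination may span many IBuffers,
  // so a contiguous std::vector cannot simply be moved in.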
 private:
  // Extracts a writable byte span from an IBuffer via IBufferByteAccess.
  // Capacity() is used rather than Length(), which VectorBuffer does not implement.
  auto BufferFrom(wss::IBuffer buffer) {
    byte* current_data = nullptr;
    auto bufferByteAccess = buffer.as<Windows::Storage::Streams::IBufferByteAccess>();
    bufferByteAccess->Buffer(&current_data);
    return gsl::span<byte>(
        current_data,
        static_cast<size_t>(buffer.Capacity()));
  }

  auto BufferAt(size_t index) {
    return BufferFrom(buffers_[index]);
  }
};
// Specialization for string tensors. Strings are variable-length, so the data
// is held as std::string elements rather than as raw IBuffer bytes.
template <>
class TensorBuffer<std::string> {
  std::vector<std::string> buffer_;

  TensorBuffer(size_t size) : buffer_(size) {}

 public:
  static auto Create(size_t size) {
    return std::shared_ptr<TensorBuffer>(new TensorBuffer(size));
  }

  auto NumElements() {
    return buffer_.size();
  }

  // Strings have no fixed per-element byte size, so this reports the element count.
  auto SizeInBytes() {
    return buffer_.size();
  }

  auto NumBuffers() {
    return 1;
  }

  auto Flush() {
    return false;
  }

  // String tensors are not backed by IBuffers, so raw buffer access is unsupported.
  auto Buffers() -> std::vector<wss::IBuffer>& {
    WINML_THROW_HR(E_UNEXPECTED);
  }

  auto BufferAt(size_t index) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        index == 0,
        "TensorString can only be backed by a single buffer!");
    return gsl::span<byte>(reinterpret_cast<byte*>(buffer_.data()), buffer_.size());
  }

  auto Buffer(bool /*should_sync_buffer*/) {
    return BufferAt(0);
  }

  auto Set(size_t size, std::string_view* data) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        size <= buffer_.size(),
        "Argument size (%d) exceeds the tensor size (%d).",
        static_cast<int>(size),
        static_cast<int>(buffer_.size()));

    // Copy each string_view into the backing std::string elements.
    std::copy(data, data + size, buffer_.begin());
  }
};
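// Illustrative use of the std::string specialization (a minimal sketch; the
// variable names are hypothetical):
//   auto strings = TensorBuffer<std::string>::Create(2);
//   std::string_view values[] = {"hello", "world"};
//   strings->Set(2, values);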
} // namespace _winml