Refactor implementation of Tensor<T> and underlying buffer stores to improve binary size and maintainability (#5836)

* refactor tensor buffers to make cleaner

* refactor to make tensor backing buffer implementation smaller and cleaner

* missed virtual on destructor

* remove unnecessary static_pointer_cast

* add string vector accessor

Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
This commit is contained in:
Sheil Kumar 2020-11-18 14:56:47 -08:00 committed by GitHub
parent 85f945a875
commit 84c1340f9b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 548 additions and 401 deletions

View file

@ -400,15 +400,18 @@ endif(onnxruntime_USE_DML)
# Add static library that will be archived/linked for both static/dynamic library
add_library(winml_lib_api STATIC
${winml_lib_api_dir}/impl/FeatureCompatibility.h
${winml_lib_api_dir}/impl/IData.h
${winml_lib_api_dir}/impl/IMapFeatureValue.h
${winml_lib_api_dir}/impl/ISequenceFeatureValue.h
${winml_lib_api_dir}/impl/MapBase.h
${winml_lib_api_dir}/impl/NumericData.h
${winml_lib_api_dir}/impl/SequenceBase.h
${winml_lib_api_dir}/impl/StringData.h
${winml_lib_api_dir}/impl/Tensor.h
${winml_lib_api_dir}/impl/TensorBase.h
${winml_lib_api_dir}/impl/TensorBuffer.h
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
@ -429,8 +432,11 @@ add_library(winml_lib_api STATIC
${winml_lib_api_dir}/MapFeatureDescriptor.h
${winml_lib_api_dir}/SequenceFeatureDescriptor.cpp
${winml_lib_api_dir}/SequenceFeatureDescriptor.h
${winml_lib_api_dir}/StringData.cpp
${winml_lib_api_dir}/TensorFeatureDescriptor.cpp
${winml_lib_api_dir}/TensorFeatureDescriptor.h
${winml_lib_api_dir}/VectorBackedBuffer.h
${winml_lib_api_dir}/VectorBackedBuffer.cpp
${winml_lib_api_dir}/pch/pch.h
)

View file

@ -3,7 +3,7 @@
namespace _winml {
void LoadOrStoreDisjointBuffers(
static void LoadOrStoreDisjointBuffers(
bool should_load_buffer,
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
@ -31,4 +31,18 @@ void LoadOrStoreDisjointBuffers(
}
}
void LoadSpanFromDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span) {
LoadOrStoreDisjointBuffers(true /*load into the span*/, num_buffers, get_buffer, buffer_span);
}
void StoreSpanIntoDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span) {
LoadOrStoreDisjointBuffers(false /*store into buffers*/, num_buffers, get_buffer, buffer_span);
}
} // namespace _winml

View file

@ -630,8 +630,7 @@ void TensorToVideoFrameConverter::ConvertBatchedDX12TensorToBuffers(
byte* readback_buffer = nullptr;
WINML_THROW_IF_FAILED(readback_heap_->Map(0, &CD3DX12_RANGE(0, buffer_size_in_bytes), reinterpret_cast<void**>(&readback_buffer)));
auto readback_buffer_span = gsl::span<byte>(readback_buffer, buffer_size_in_bytes);
_winml::LoadOrStoreDisjointBuffers(
false /*load disjoint buffers into*/,
_winml::StoreSpanIntoDisjointBuffers(
buffers.size(),
[&](size_t i) {
byte* buffer_start = nullptr;

View file

@ -559,8 +559,7 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor(
WINML_THROW_IF_FAILED(upload_heap_->Map(0, &CD3DX12_RANGE(0, 0), reinterpret_cast<void**>(&gpu_buffer)));
auto gpu_buffer_span = gsl::span<byte>(gpu_buffer, buffer_size_in_bytes);
_winml::LoadOrStoreDisjointBuffers(
true /*load disjoint buffers into*/,
_winml::LoadSpanFromDisjointBuffers(
buffers.size(),
[&](size_t i) {
byte* buffer_start = nullptr;

View file

@ -7,8 +7,12 @@
namespace _winml {
void LoadOrStoreDisjointBuffers(
bool should_load_buffer,
void LoadSpanFromDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span);
void StoreSpanIntoDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span);

View file

@ -0,0 +1,129 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include "impl/NumericData.h"
#include "VectorBackedBuffer.h"
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
#include "DisjointBufferHelpers.h"
namespace _winml {

// Factory for numeric tensor backing data. Instances are always owned by a
// shared_ptr, matching the idata contract used by Tensor<T>.
std::shared_ptr<_winml::idata> numeric_data::create(
    size_t num_elements,
    size_t element_size_in_bytes,
    wfc::IIterable<wss::IBuffer> const& buffers) {
  return std::make_shared<numeric_data>(num_elements, element_size_in_bytes, buffers);
}

// Fix: the member-initializer list is ordered to match the declaration order in
// NumericData.h (combined_buffer_, buffers_, num_elements_, element_size_in_bytes_).
// Members are always initialized in declaration order regardless of list order,
// so the previous list only triggered -Wreorder; the initializers are
// independent and behavior is unchanged.
numeric_data::numeric_data(
    size_t num_elements, size_t element_size_in_bytes, wfc::IIterable<wss::IBuffer> const& buffers) :
        combined_buffer_(nullptr),
        buffers_(),
        num_elements_(num_elements),
        element_size_in_bytes_(element_size_in_bytes) {
  if (buffers != nullptr) {
    buffers_ = {begin(buffers), end(buffers)};
  }

  if (buffers_.size() == 0) {
    // No caller-supplied buffers: allocate a single owned backing buffer.
    combined_buffer_ = winrt::make<vector_backed_buffer>(num_elements * element_size_in_bytes);
    buffers_ = {combined_buffer_};
    auto buffer = buffer_at(0);

    // The initial release of WinML (RS5) shipped with behavior that would
    // zero-initialize uninitialized tensors. After measuring, the performance impact
    // of memsetting the memory buffer is quite small (<1ms for 3channel 720x720 TensorFloats).
    // To maintain parity with RS5 behavior, we always zero out the memory buffer.
    memset(buffer.data(), 0, buffer.size_bytes());
  } else if (buffers_.size() == 1) {
    // A single caller-supplied buffer doubles as the combined buffer.
    combined_buffer_ = buffers_[0];
  } else {
    // If there are many buffers, then the combined buffer will be a separately allocated value that combines all of the buffers.
    // This needs to be lazily done however, as the extra memory should not be allocated when not needed (GPU).
  }
}

// Number of elements (not bytes) in the tensor.
size_t numeric_data::num_elements() {
  return num_elements_;
}

// Total tensor size in bytes.
size_t numeric_data::size_in_bytes() {
  return num_elements_ * element_size_in_bytes_;
}

// Number of backing buffers (1 unless the caller supplied a disjoint set).
size_t numeric_data::num_buffers() {
  return buffers_.size();
}

// Direct access to the backing buffers.
std::vector<wss::IBuffer>& numeric_data::buffers() {
  return buffers_;
}

// Returns one contiguous span over the tensor data. When the data is split
// across multiple buffers, the combined buffer is (lazily) allocated and, if
// should_sync_buffer is set, refreshed from the disjoint backing buffers.
gsl::span<byte> numeric_data::buffer(bool should_sync_buffer) {
  if (buffers_.size() == 1) {
    // Single buffer optimization to not create a temporary buffer that concatenates disjoint buffers into one.
    return buffer_at(0);
  }

  auto span = combined_buffer();
  if (should_sync_buffer) {
    _winml::LoadSpanFromDisjointBuffers(
        buffers_.size(),
        [this](size_t i) { return buffer_at(i); },
        span);
  }
  return span;
}

// Writes the combined buffer's contents back into the disjoint backing buffers.
// Returns true if a flush was actually performed (more than one buffer).
bool numeric_data::flush() {
  auto should_flush = buffers_.size() != 1;
  if (should_flush) {
    auto span = combined_buffer();
    _winml::StoreSpanIntoDisjointBuffers(
        buffers_.size(),
        [this](size_t i) { return buffer_at(i); },
        span);
  }
  return should_flush;
}

// Copies data_size bytes of caller data into the backing buffers.
// Throws E_INVALIDARG when the data is larger than the tensor.
void numeric_data::set(size_t data_size, const byte* data) {
  WINML_THROW_HR_IF_FALSE_MSG(
      E_INVALIDARG,
      data_size <= (num_elements_ * element_size_in_bytes_),
      "Argument size (%llu) exceeds the tensor size (%llu).",
      static_cast<uint64_t>(data_size),
      static_cast<uint64_t>(num_elements_ * element_size_in_bytes_));

  // const_cast is acceptable here: the store helper only reads from the
  // source span and writes into the backing buffers.
  gsl::span<byte> span(const_cast<byte*>(data), data_size);
  _winml::StoreSpanIntoDisjointBuffers(
      buffers_.size(),
      [this](size_t i) { return buffer_at(i); },
      span);
}

// Maps an IBuffer's raw memory as a byte span via IBufferByteAccess.
static gsl::span<byte> get_span_from_ibuffer(wss::IBuffer buffer) {
  byte* current_data = nullptr;
  auto bufferByteAccess = buffer.as<Windows::Storage::Streams::IBufferByteAccess>();
  bufferByteAccess->Buffer(&current_data);
  return gsl::span<byte>(
      current_data,
      static_cast<size_t>(buffer.Capacity()));
}

// Span over the index-th backing buffer.
gsl::span<byte> numeric_data::buffer_at(size_t index) {
  return get_span_from_ibuffer(buffers_[index]);
}

// Span over the combined buffer, lazily allocating it on first use.
gsl::span<byte> numeric_data::combined_buffer() {
  if (combined_buffer_ == nullptr) {
    combined_buffer_ = winrt::make<vector_backed_buffer>(num_elements_ * element_size_in_bytes_);
  }
  return get_span_from_ibuffer(combined_buffer_);
}

}  // namespace _winml

View file

@ -0,0 +1,62 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include "impl/StringData.h"
namespace _winml {

// string_data backs TensorString values with a std::vector<std::string>.
// Strings are variable-width and CPU-only, so the byte-oriented idata members
// (size_in_bytes, buffers, byte-wise set) intentionally throw E_UNEXPECTED.

// Constructs storage for `size` default-empty strings.
string_data::string_data(size_t size) :
    buffer_(size) {}

// Factory: instances are always owned by a shared_ptr, per the idata contract.
std::shared_ptr<_winml::idata> string_data::create(size_t size) {
  return std::make_shared<string_data>(size);
}

// Number of strings in the tensor.
size_t string_data::num_elements() {
  return buffer_.size();
}

// Not meaningful for variable-width string storage; always throws.
size_t string_data::size_in_bytes() {
  WINML_THROW_HR(E_UNEXPECTED);
}

size_t string_data::num_buffers() {
  return 1;
}

// Nothing to write back: the std::string vector is the only storage.
bool string_data::flush() {
  // Vacuously true
  return true;
}

// String data has no IBuffer backing; always throws.
std::vector<wss::IBuffer>& string_data::buffers() {
  WINML_THROW_HR(E_UNEXPECTED);
}

// Exposes the std::string array as raw bytes.
// NOTE(review): the span length is the element count, not the byte size of
// the array — confirm consumers only use .data() plus num_elements() here.
gsl::span<byte> string_data::buffer(bool /*should_sync_buffer*/) {
  return gsl::span<byte>(reinterpret_cast<byte*>(buffer_.data()), buffer_.size());
}

// Copies num_elements string_views into the backing strings.
// Throws E_INVALIDARG when num_elements exceeds the tensor size.
void string_data::set(size_t num_elements, const std::string_view* data) {
  WINML_THROW_HR_IF_FALSE_MSG(
      E_INVALIDARG,
      num_elements <= buffer_.size(),
      "Argument size (%d) exceeds the tensor size (%d).",
      static_cast<int>(num_elements),
      static_cast<int>(buffer_.size()));

  // Copy
  std::copy(data, data + num_elements, buffer_.begin());
}

// Byte-wise set is unsupported for strings; always throws.
void string_data::set(size_t /*data_size*/, const byte* /*data*/) {
  WINML_THROW_HR(E_UNEXPECTED);
}

// Direct access to the underlying strings (used by string tensor accessors).
std::vector<std::string>& string_data::get_backing_vector() {
  return buffer_;
}

}  // namespace _winml

View file

@ -0,0 +1,29 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include "VectorBackedBuffer.h"
namespace _winml {

// Allocates `size` value-initialized (zeroed) bytes.
vector_backed_buffer::vector_backed_buffer(size_t size) : buffer_(size) {}

// Capacity of the underlying vector, in bytes.
uint32_t vector_backed_buffer::Capacity() const {
  return static_cast<uint32_t>(buffer_.size());
}

// Length is unimplemented for this internal buffer type (throws E_NOTIMPL);
// presumably callers only use Capacity() and IBufferByteAccess — confirm.
uint32_t vector_backed_buffer::Length() const {
  throw winrt::hresult_error(E_NOTIMPL);
}

void vector_backed_buffer::Length(uint32_t /*value*/) {
  throw winrt::hresult_error(E_NOTIMPL);
}

// IBufferByteAccess::Buffer — hands out the raw pointer to the vector storage.
STDMETHODIMP vector_backed_buffer::Buffer(uint8_t** value) {
  RETURN_HR_IF_NULL(E_POINTER, value);
  *value = buffer_.data();
  return S_OK;
}

}  // namespace _winml

View file

@ -0,0 +1,28 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
namespace _winml {
// An IBuffer implementation backed by a std::vector<BYTE>, used to allocate
// CPU tensor memory that is handed out through the WinRT buffer interfaces.
class vector_backed_buffer : public winrt::implements<
                                 vector_backed_buffer,
                                 wss::IBuffer,
                                 Windows::Storage::Streams::IBufferByteAccess> {
 public:
  // Allocates a buffer of `size` bytes. Marked explicit to prevent accidental
  // implicit conversion from size_t; interface-compatible, since call sites
  // construct it directly via winrt::make.
  explicit vector_backed_buffer(size_t size);

  // Total capacity in bytes.
  uint32_t Capacity() const;
  // Length accessors are not implemented for this internal buffer type.
  uint32_t Length() const;
  void Length(uint32_t /*value*/);

  // IBufferByteAccess: returns the raw byte pointer to the buffer contents.
  STDMETHOD(Buffer)(uint8_t** value);

 private:
  std::vector<BYTE> buffer_;
};
} // namespace _winml

View file

@ -0,0 +1,25 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "IEngine.h"
// idata is the abstract interface over the backing stores (numeric or string)
// that hold a tensor's CPU-side data; Tensor<T> accesses its storage through it.
namespace _winml {
// Abstract backing store for tensor data. Implemented by numeric_data (raw,
// byte-addressable, possibly split across several buffers) and string_data
// (CPU-only std::string storage, which throws E_UNEXPECTED from the members
// that do not apply to strings).
struct idata {
  // Defaulted virtual destructor so implementations destroy correctly when
  // deleted through an idata pointer.
  virtual ~idata() = default;

  // Number of elements (not bytes) in the tensor.
  virtual size_t num_elements() = 0;
  // Total size in bytes.
  virtual size_t size_in_bytes() = 0;
  // Number of backing buffers.
  virtual size_t num_buffers() = 0;
  // Direct access to the backing buffers.
  virtual std::vector<wss::IBuffer>& buffers() = 0;
  // Single contiguous span over the data; should_sync_buffer requests a
  // refresh from the disjoint backing buffers first.
  virtual gsl::span<byte> buffer(bool should_sync_buffer) = 0;
  // Write the combined view back into the backing buffers; returns whether a
  // flush was performed.
  virtual bool flush() = 0;
  // Copy raw caller bytes into the store.
  virtual void set(size_t data_size, const byte* data) = 0;
};
} // namespace _winml

View file

@ -0,0 +1,46 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "IData.h"
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
namespace _winml {
class numeric_data : public _winml::idata {
 public:
  // Factory: creates the store as a shared_ptr (the only supported ownership).
  static std::shared_ptr<_winml::idata> create(
      size_t num_elements,
      size_t element_size_in_bytes,
      wfc::IIterable<wss::IBuffer> const& buffers);

  // Public so std::make_shared can construct it; prefer create() so instances
  // are always owned by a shared_ptr.
  numeric_data(size_t num_elements, size_t element_size_in_bytes, wfc::IIterable<wss::IBuffer> const& buffers);

  // Span over the index-th backing buffer.
  gsl::span<byte> buffer_at(size_t index);
  // Span over the combined (concatenated) buffer, lazily allocated when the
  // data is split across multiple buffers.
  gsl::span<byte> combined_buffer();

 public:
  size_t num_elements() override;
  size_t size_in_bytes() override;
  size_t num_buffers() override;

  // Buffer accessors
  std::vector<wss::IBuffer>& buffers() override;
  gsl::span<byte> buffer(bool should_sync_buffer) override;

  // Flush to buffers API
  bool flush() override;

  // Set APIs
  void set(size_t data_size, const byte* data) override;

 private:
  wss::IBuffer combined_buffer_;
  std::vector<wss::IBuffer> buffers_;
  size_t num_elements_;
  size_t element_size_in_bytes_;
};
} // namespace _winml

View file

@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "IData.h"
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
namespace _winml {
class string_data : public _winml::idata {
 public:
  // Factory: creates storage for `size` strings, owned by a shared_ptr.
  static std::shared_ptr<_winml::idata> create(size_t size);
  string_data(size_t size);

  size_t num_elements() override;
  // Throws E_UNEXPECTED: strings are variable-width, so a byte size is undefined.
  size_t size_in_bytes() override;
  size_t num_buffers() override;

  // Buffer accessors
  // buffers() throws E_UNEXPECTED: string data has no IBuffer backing.
  std::vector<wss::IBuffer>& buffers() override;
  gsl::span<byte> buffer(bool should_sync_buffer) override;

  // Flush to buffers API
  bool flush() override;

  // Set APIs
  // Byte-wise set throws E_UNEXPECTED; use the string_view overload below.
  void set(size_t data_size, const byte* data) override;

 public:
  // Copies num_elements string_views into the backing strings.
  void set(size_t num_elements, const std::string_view* data);
  // Direct access to the underlying strings (used by string tensor accessors).
  std::vector<std::string>& get_backing_vector();

 private:
  std::vector<std::string> buffer_;
};
} // namespace _winml

View file

@ -3,7 +3,8 @@
#pragma once
#include "TensorBuffer.h"
#include "NumericData.h"
#include "StringData.h"
//
// the Tensor class is the actual object for CPU memory buffers.
@ -12,81 +13,83 @@
//
namespace _winml {
// Returns the element count implied by a tensor shape: the product of all
// dimensions. An empty shape (a scalar) yields 1, the product identity.
inline size_t compute_size_of_shape(const std::vector<int64_t>& shape) {
  int64_t element_count = 1;
  for (const auto dimension : shape) {
    element_count *= dimension;
  }
  return static_cast<size_t>(element_count);
}
// Creates the backing data store for a numeric tensor of element type T.
// The element count is derived from the shape; buffers may be null, in which
// case the store allocates (and zeroes) its own memory.
template <typename T>
inline auto create_data(
    const std::vector<int64_t>& shape,
    const wfc::IIterable<wss::IBuffer>& buffers) {
  return _winml::numeric_data::create(compute_size_of_shape(shape), sizeof(T), buffers);
}
// Specialization for string tensors: string data owns its std::vector<std::string>
// storage and is CPU-only, so any caller-supplied buffers are ignored.
template <>
inline auto create_data<std::string>(
    const std::vector<int64_t>& shape,
    const wfc::IIterable<wss::IBuffer>& /*buffers*/) {
  return _winml::string_data::create(compute_size_of_shape(shape));
}
template <typename T>
class Tensor {
private:
std::shared_ptr<TensorBuffer<T>> buffer_;
std::shared_ptr<_winml::idata> data_;
std::vector<int64_t> shape_;
public:
private:
Tensor() = delete;
Tensor(
std::vector<int64_t> const& shape,
wfc::IIterable<wss::IBuffer> const& buffers) :
shape_(shape),
buffer_(TensorBuffer<T>::Create(
static_cast<size_t>(std::accumulate(
std::begin(shape), std::end(shape),
static_cast<int64_t>(1), std::multiplies<int64_t>())),
buffers)) {}
public:
Tensor(const std::vector<int64_t>& shape) :
shape_(shape),
data_(create_data<T>(shape, nullptr)) {}
Tensor(
std::vector<int64_t> const& shape) : shape_(shape),
buffer_(TensorBuffer<T>::Create(
static_cast<size_t>(std::accumulate(
std::begin(shape), std::end(shape),
static_cast<int64_t>(1),
std::multiplies<int64_t>())))) {}
Tensor(
std::vector<int64_t> const&& shape) : shape_(std::move(shape)),
buffer_(TensorBuffer<T>::Create(
static_cast<size_t>(std::accumulate(
std::begin(shape), std::end(shape),
static_cast<int64_t>(1),
std::multiplies<int64_t>())))) {
}
auto number_of_elements() const {
return buffer_->NumElements();
}
const std::vector<int64_t>& shape,
const wfc::IIterable<wss::IBuffer>& buffers) :
shape_(shape),
data_(create_data<T>(shape, buffers)) {}
auto size_in_bytes() const {
return buffer_->SizeInBytes();
return data_->size_in_bytes();
}
auto num_buffers() {
return buffer_->NumBuffers();
return data_->num_buffers();
}
auto& buffers() {
return buffer_->Buffers();
return data_->buffers();
}
auto buffer(bool should_sync_buffer = true) {
auto span = buffer_->Buffer(should_sync_buffer);
return gsl::span<T>(reinterpret_cast<T*>(span.data()), buffer_->NumElements());
gsl::span<T> buffer(bool should_sync_buffer = true) {
auto span = data_->buffer(should_sync_buffer);
return gsl::span<T>(reinterpret_cast<T*>(span.data()), data_->num_elements());
}
auto flush() {
return buffer_->Flush();
return data_->flush();
}
void set(size_t size, const T* pData) {
buffer_->Set(size * sizeof(T), pData);
}
void set(std::vector<T>&& other) {
buffer_->Set(other);
void set(size_t size, const T* data) {
auto size_in_bytes = size * sizeof(T);
data_->set(size_in_bytes, reinterpret_cast<const byte*>(data));
}
const std::vector<int64_t>& shape() const {
return shape_;
}
auto get_tensor_buffer() {
return buffer_;
auto get_data() {
return data_;
}
};
} // namespace _winml

View file

@ -74,28 +74,28 @@ struct TensorBase : TBase {
/// b) TensorBase(winrt::Windows::Foundation::Collections::IIterable<int64_t> const& shape)
/// 3) use provided backing gpu memory
/// a) TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* pResource)
TensorBase() : m_resources(std::make_shared<TensorResources<T>>()) {
TensorBase() : resources_(std::make_shared<TensorResources<T>>()) {
}
TensorBase(wfc::IIterable<int64_t> const& shape) : shape_(begin(shape), end(shape)),
m_resources(std::make_shared<TensorResources<T>>()) {
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
resources_(std::make_shared<TensorResources<T>>()) {
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
}
TensorBase(std::vector<int64_t> const& shape) : shape_(shape),
m_resources(std::make_shared<TensorResources<T>>()) {
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
resources_(std::make_shared<TensorResources<T>>()) {
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
}
TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* resource) : shape_(shape),
m_resources(std::make_shared<TensorResources<T>>()) {
resources_(std::make_shared<TensorResources<T>>()) {
// This Api is not supported for TensorString
WINML_THROW_HR_IF_TRUE_MSG(
E_ILLEGAL_METHOD_CALL,
(std::is_same<T, std::string>::value),
"TensorString objects cannot be created from a ID3D12Resource!");
GetGpuResource().copy_from(resource);
GpuTensor().copy_from(resource);
}
HRESULT CreateGPUMLValue(ID3D12Resource* resource, BindingContext& context, IValue** out) {
@ -117,21 +117,21 @@ struct TensorBase : TBase {
auto engine = session->GetEngine();
auto should_sync_buffer = context.type == _winml::BindingType::kInput;
if (GetCpuResource() != nullptr) {
if (CpuTensor() != nullptr) {
return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out);
}
// If there is no matching cpu resource, then fallback to a gpu resource
if (GetGpuResource() != nullptr) {
return CreateGPUMLValue(GetGpuResource().get(), context, out);
if (GpuTensor() != nullptr) {
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
WINML_THROW_HR(WINML_ERR_INVALID_BINDING);
}
HRESULT GPUTensorize(_winml::BindingContext& context, IValue** out) {
if (GetGpuResource() != nullptr) {
return CreateGPUMLValue(GetGpuResource().get(), context, out);
if (GpuTensor() != nullptr) {
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
// Get engine
@ -142,8 +142,8 @@ struct TensorBase : TBase {
auto should_sync_buffer = context.type == _winml::BindingType::kInput;
// If there is no matching gpu resource, then fallback to a cpu resource
if (GetCpuResource() != nullptr) {
auto num_backing_buffers = GetCpuResource()->num_buffers();
if (CpuTensor() != nullptr) {
auto num_backing_buffers = CpuTensor()->num_buffers();
if (num_backing_buffers == 1) {
// If we have a single backing cpu buffer, there is no need to create GPU resources.
// The engine will use the buffer provided, and perform the needed copies into the GPU context as needed.
@ -154,24 +154,24 @@ struct TensorBase : TBase {
// If we are binding inputs, then a GPU resource needs to be allocated, and individual buffer contents need
// to be copied directly into a gpu resource.
if (GetGpuResource() == nullptr) {
GetGpuResource() = CreateD3D12Resource(session);
if (GpuTensor() == nullptr) {
GpuTensor() = CreateD3D12Resource(session);
}
_winml::ConverterResourceDescription descriptor = {};
descriptor.pixel_format = static_cast<DWORD>(wgdx::DirectXPixelFormat::Unknown);
descriptor.width = static_cast<int>(GetCpuResource()->size_in_bytes());
descriptor.width = static_cast<int>(CpuTensor()->size_in_bytes());
descriptor.height = static_cast<int>(1);
descriptor.luid = device->GetD3DDevice()->GetAdapterLuid(); // Converted image on GPU
context.converter = _winml::PoolObjectWrapper::Create(device->TensorizerStore()->Fetch(descriptor));
context.converter->Get()->Tensorizer->ConvertBuffersToBatchedGPUTensor(
GetCpuResource()->buffers(),
GetCpuResource()->size_in_bytes(),
CpuTensor()->buffers(),
CpuTensor()->size_in_bytes(),
*device->GetD3DDeviceCache(),
GetGpuResource().get());
GpuTensor().get());
return CreateGPUMLValue(GetGpuResource().get(), context, out);
return CreateGPUMLValue(GpuTensor().get(), context, out);
} else if (context.type == _winml::BindingType::kOutput) {
// If we are binding outputs, then the buffers do not need to bound. If the engine produces a output on the gpu
@ -179,8 +179,8 @@ struct TensorBase : TBase {
// into the output buffers without temporary intermediary buffers! No binding here is necessary.
// If the output produces a cpu buffer (even in the GPU case), we will already have a cpu buffer, and just need
// to copy back to the output buffers, no binding is necessary.
GetGpuResource() = CreateD3D12Resource(session);
return CreateGPUMLValue(GetGpuResource().get(), context, out);
GpuTensor() = CreateD3D12Resource(session);
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
}
}
@ -188,11 +188,11 @@ struct TensorBase : TBase {
if (TensorKind() == winml::TensorKind::String) {
// Lazily allocate the cpu TensorString resource
// TensorStrings are CPU only, and so a gpu resource cannot be allocated for them.
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out);
} else {
GetGpuResource() = CreateD3D12Resource(session);
return CreateGPUMLValue(GetGpuResource().get(), context, out);
GpuTensor() = CreateD3D12Resource(session);
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
}
@ -242,8 +242,8 @@ struct TensorBase : TBase {
void EnsureBufferNotInUse() {
auto isBufferInUse =
std::any_of(
m_outstandingReferences.begin(),
m_outstandingReferences.end(),
outstanding_references_.begin(),
outstanding_references_.end(),
[](auto weakRef) { return weakRef.get() != nullptr; });
WINML_THROW_HR_IF_TRUE_MSG(WINML_ERR_INVALID_BINDING, isBufferInUse, "The tensor has outstanding memory buffer references that must be closed prior to evaluation!");
@ -254,7 +254,7 @@ struct TensorBase : TBase {
(_winml::BindingContext& context, IValue** out) {
RETURN_HR_IF_NULL_MSG(
WINML_ERR_INVALID_BINDING,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
EnsureBufferNotInUse();
@ -289,7 +289,7 @@ struct TensorBase : TBase {
// the conditions of ASSERT_TEMPLATE_PARAMETERS_EXACT() are met.
ASSERT_TEMPLATE_PARAMETERS<ElementType, ElementViewType>();
GetCpuResource()->set(size, reinterpret_cast<ElementType*>(data));
CpuTensor()->set(size, reinterpret_cast<ElementType*>(data));
}
template <>
@ -297,7 +297,8 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<std::string, winrt::hstring>();
GetCpuResource()->get_tensor_buffer()->Set(size, reinterpret_cast<std::string_view*>(data));
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
string_data->set(size, reinterpret_cast<std::string_view*>(data));
}
template <typename ElementType = T, typename ElementViewType = ViewT>
@ -307,8 +308,8 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS<ElementType, ElementViewType>();
RETURN_IF_FAILED_MSG(engine->CreateTensorValueFromExternalBuffer(
GetCpuResource()->buffer(sync_buffer).data(), GetCpuResource()->size_in_bytes(), GetCpuResource()->shape().data(),
GetCpuResource()->shape().size(), TensorKind(), value),
CpuTensor()->buffer(sync_buffer).data(), CpuTensor()->size_in_bytes(), CpuTensor()->shape().data(),
CpuTensor()->shape().size(), TensorKind(), value),
"Failed to prepare buffer for copy back from device resource.");
return S_OK;
}
@ -318,17 +319,19 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<std::string, winrt::hstring>();
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
std::vector<const char*> raw_values;
auto string_array = static_cast<std::string*>(GetCpuResource()->buffer().data());
std::transform(
string_array,
string_array + GetCpuResource()->number_of_elements(),
std::begin(string_vector),
std::end(string_vector),
std::back_inserter(raw_values),
[&](auto& str) { return str.c_str(); });
RETURN_IF_FAILED_MSG(engine->CreateStringTensorValueFromDataWithCopy(
raw_values.data(), raw_values.size(), GetCpuResource()->shape().data(),
GetCpuResource()->shape().size(), value),
raw_values.data(), raw_values.size(), CpuTensor()->shape().data(),
CpuTensor()->shape().size(), value),
"Failed to prepare buffer for copy back from device resource.");
return S_OK;
}
@ -338,7 +341,7 @@ struct TensorBase : TBase {
(BindingContext& context, IValue* value) {
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached during evaluation!");
_winml::Resource updated_resource;
@ -348,14 +351,14 @@ struct TensorBase : TBase {
RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!");
// make sure we always have a CPU resource
if (GetCpuResource() == nullptr) {
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
if (CpuTensor() == nullptr) {
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
}
bool is_cpu;
if (SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu) {
// Get the data pointer and size
auto buffer = GetCpuResource()->buffer(false);
auto buffer = CpuTensor()->buffer(false);
if (updated_resource.get() != reinterpret_cast<void*>(buffer.data())) {
// Only copy the data if the source and destination are not the same!
@ -366,7 +369,7 @@ struct TensorBase : TBase {
} else {
// If the engine wrote to the data directly, it is possible that the underlying data was held by many buffers
// In that case the underlying buffers will not match the engine output, and they need to be flushed.
GetCpuResource()->flush();
CpuTensor()->flush();
}
} else {
// If we got a gpu resource, we should move the data to the cpu so accessors can retrieve the data.
@ -377,7 +380,7 @@ struct TensorBase : TBase {
auto device = session->Device().as<winmlp::LearningModelDevice>();
auto engine = session->GetEngine();
if (GetCpuResource()->num_buffers() == 1) {
if (CpuTensor()->num_buffers() == 1) {
winrt::com_ptr<IValue> dest;
RETURN_IF_FAILED_MSG(CreateTensorValueFromExternalBuffer(engine, false, dest.put()),
"Failed to prepare buffer for copy back from device resource.");
@ -395,7 +398,7 @@ struct TensorBase : TBase {
d3dResource,
buffer_size_in_bytes,
*device->GetD3DDeviceCache(),
GetCpuResource()->buffers());
CpuTensor()->buffers());
// Reset the Allocator before return to the Cache. Must Sync this background thread to that completion before we do.
device->GetD3DDeviceCache()->SyncD3D12ToCPU();
@ -615,7 +618,7 @@ struct TensorBase : TBase {
// Ensure that CreateReference is only called when there is 1 buffer.
WINML_THROW_HR_IF_TRUE_MSG(
E_ILLEGAL_METHOD_CALL,
GetCpuResource() != nullptr && GetCpuResource()->num_buffers() != 1, "A single buffer reference cannot be retrieved when the tensor is backed by multiple buffers!");
CpuTensor() != nullptr && CpuTensor()->num_buffers() != 1, "A single buffer reference cannot be retrieved when the tensor is backed by multiple buffers!");
// Create a TensorMemoryBufferReference<T>
@ -624,11 +627,11 @@ struct TensorBase : TBase {
// "has been closed. In that case, the returned IMemoryBufferReference is already closed."
// Creating a TensorMemoryBufferReference<T> with a null pointer is equivalent to creating it as closed.
auto memoryBufferReference = winrt::make<TensorMemoryBufferReference<T>>(shape_, m_resources);
auto memoryBufferReference = winrt::make<TensorMemoryBufferReference<T>>(shape_, resources_);
// Create and cache a weak reference to the TensorMemoryBufferReference<T>
winrt::weak_ref<TensorMemoryBufferReference<T>> weak(memoryBufferReference.as<TensorMemoryBufferReference<T>>());
m_outstandingReferences.push_back(weak);
outstanding_references_.push_back(weak);
// Return the strong ref to the caller
return memoryBufferReference;
@ -638,7 +641,7 @@ struct TensorBase : TBase {
// IMemoryBuffer::Close
void Close() try {
// Let go of the lifetime of the resources, this is will indicate that the memorybuffer is closed
m_resources = nullptr;
resources_ = nullptr;
}
WINML_CATCH_ALL
@ -653,10 +656,10 @@ struct TensorBase : TBase {
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
return m_resources->GetBuffer(shape_, value, capacity);
return resources_->GetBuffer(shape_, value, capacity);
}
// ITensorNative::GetD3D12Resource
@ -667,10 +670,10 @@ struct TensorBase : TBase {
RETURN_HR_IF(ERROR_INVALID_FUNCTION, (std::is_same<T, std::string>::value));
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
GetGpuResource().copy_to(ppResource);
GpuTensor().copy_to(ppResource);
return S_OK;
}
WINML_CATCH_ALL_COM
@ -689,12 +692,11 @@ struct TensorBase : TBase {
// owned IVectorView object.
// Get the raw buffer pointer from the native tensor implementation.
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<ElementType*>(buffer.data());
// Copy data that will be passed back to caller.
auto copy = std::vector<ElementType>(element_data, element_data + number_of_elements);
auto copy = std::vector<ElementType>(element_data, element_data + buffer.size());
// Create IVectorView from copied data.
return winrt::single_threaded_vector<ElementViewType>(std::move(copy)).GetView();
@ -707,18 +709,17 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<_winml::Half, float>();
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<_winml::Half*>(buffer.data());
// Copy the HALFs to floats
std::vector<float> float_value(number_of_elements);
std::vector<float> float_value(buffer.size());
DirectX::PackedVector::XMConvertHalfToFloatStream(
float_value.data(),
sizeof(float) /* output stride */,
reinterpret_cast<DirectX::PackedVector::HALF*>(element_data),
sizeof(_winml::Half) /* input stride */,
number_of_elements);
buffer.size());
// Create IVectorView from copied data.
return winrt::single_threaded_vector<float>(std::move(float_value)).GetView();
@ -731,16 +732,15 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<std::string, winrt::hstring>();
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto element_data = static_cast<std::string*>(buffer.data());
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
auto copy = std::vector<winrt::hstring>(number_of_elements, L"");
auto copy = std::vector<winrt::hstring>(string_vector.size(), L"");
std::generate(
copy.begin(),
copy.end(),
[n = 0, &element_data]() mutable {
return _winml::Strings::HStringFromUTF8(element_data[n++]);
[n = 0, &string_vector]() mutable {
return _winml::Strings::HStringFromUTF8(string_vector[n++]);
});
return winrt::single_threaded_vector<winrt::hstring>(std::move(copy)).GetView();
@ -752,14 +752,13 @@ struct TensorBase : TBase {
wfc::IVectorView<uint8_t> GetAsVectorView<int8_t, uint8_t>() try {
ASSERT_TEMPLATE_PARAMETERS<int8_t, uint8_t>();
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<int8_t*>(buffer.data());
// Copy data that will be passed back to caller.
gsl::span<uint8_t> span(reinterpret_cast<uint8_t*>(element_data), number_of_elements);
std::vector<uint8_t> copy(span.begin(), span.begin() + number_of_elements);
gsl::span<uint8_t> span(reinterpret_cast<uint8_t*>(element_data), buffer.size());
std::vector<uint8_t> copy(span.begin(), span.begin() + buffer.size());
// Create IVectorView from copied data.
return winrt::single_threaded_vector<uint8_t>(std::move(copy)).GetView();
@ -809,10 +808,10 @@ struct TensorBase : TBase {
RETURN_HR_IF_NULL(E_POINTER, pIsPlaceHolder);
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
*pIsPlaceHolder = GetCpuResource() == nullptr && GetGpuResource() == nullptr;
*pIsPlaceHolder = CpuTensor() == nullptr && GpuTensor() == nullptr;
return S_OK;
}
@ -827,7 +826,7 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS_EXACT<ElementType, ElementViewType>();
shape_ = shape;
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
}
template <>
@ -837,7 +836,7 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS<_winml::Half, float>();
shape_ = shape;
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
}
template <>
@ -847,7 +846,7 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS<int8_t, uint8_t>();
shape_ = shape;
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
}
// Specialized version to convert hstring to string
@ -875,12 +874,12 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
// This method accepts data as an array, T[], from the caller.
// This is a non-destructive API, so the caller data is
// left untouched, and the data is copied into internal buffers.
GetCpuResource()->set(data.size(), data.data());
CpuTensor()->set(data.size(), data.data());
}
// Specialized version to convert floats to float16
@ -892,13 +891,12 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<_winml::Half*>(buffer.data());
THROW_HR_IF(E_UNEXPECTED, data.size() != number_of_elements);
THROW_HR_IF(E_UNEXPECTED, data.size() != buffer.size());
DirectX::PackedVector::XMConvertFloatToHalfStream(
reinterpret_cast<DirectX::PackedVector::HALF*>(element_data),
sizeof(_winml::Half) /* output stride */,
@ -916,12 +914,12 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto size = data.size();
auto pData = data.data();
GetCpuResource()->set(size, reinterpret_cast<int8_t*>(const_cast<uint8_t*>(pData)));
CpuTensor()->set(size, reinterpret_cast<int8_t*>(const_cast<uint8_t*>(pData)));
}
// Specialized version to convert hstring to string
@ -933,17 +931,16 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
THROW_HR_IF(E_UNEXPECTED, data.size() > number_of_elements);
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
auto element_data = static_cast<std::string*>(buffer.data());
THROW_HR_IF(E_UNEXPECTED, data.size() > string_vector.size());
// Convert and copy into the underlying buffer
std::transform(
data.begin(), data.end(), element_data,
data.begin(), data.end(), std::begin(string_vector),
[](auto& element) mutable {
return _winml::Strings::UTF8FromHString(element);
});
@ -962,9 +959,9 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<ElementType*>(buffer.data());
// This method accepts data as an IVectorView<T>.
@ -983,9 +980,9 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<_winml::Half*>(buffer.data());
// Now that we take in IIterables and not vector views
@ -1009,9 +1006,9 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<int8_t*>(buffer.data());
std::transform(begin(data), end(data), element_data, [](auto element) { return static_cast<int8_t>(element); });
}
@ -1026,39 +1023,39 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto element_data = static_cast<std::string*>(buffer.data());
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
// Convert and copy into the underlying buffer
std::transform(begin(data), end(data), element_data, [](const auto& element) {
std::transform(begin(data), end(data), std::begin(string_vector), [](const auto& element) {
return _winml::Strings::UTF8FromHString(element);
});
}
std::shared_ptr<_winml::Tensor<T>>& GetCpuResource() {
std::shared_ptr<_winml::Tensor<T>>& CpuTensor() {
WINML_THROW_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources are detached!");
return m_resources->CpuResource;
return resources_->cpu_resource_;
}
winrt::com_ptr<ID3D12Resource>& GetGpuResource() {
winrt::com_ptr<ID3D12Resource>& GpuTensor() {
WINML_THROW_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources are detached!");
return m_resources->GpuResource;
return resources_->gpu_resource_;
}
private:
std::vector<int64_t> shape_;
std::shared_ptr<TensorResources<T>> m_resources;
std::vector<winrt::weak_ref<TensorMemoryBufferReference<T>>> m_outstandingReferences;
std::shared_ptr<TensorResources<T>> resources_;
std::vector<winrt::weak_ref<TensorMemoryBufferReference<T>>> outstanding_references_;
bool m_isClosed = false;
};

View file

@ -1,234 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
#include "DisjointBufferHelpers.h"
namespace _winml {
// IBuffer implementation backed by a std::vector<BYTE>.
// Serves as the internally allocated storage for tensors created without a
// caller-provided buffer, and as the lazily created "combined" staging buffer
// when a tensor is backed by multiple disjoint buffers (see TensorBuffer<T>).
class VectorBuffer : public winrt::implements<
                         VectorBuffer,
                         wss::IBuffer,
                         Windows::Storage::Streams::IBufferByteAccess> {
 public:
  // Allocates a backing vector of `size` bytes (value-initialized by std::vector).
  VectorBuffer(size_t size) : buffer_(size) {}

  // Total number of bytes in the backing store.
  uint32_t Capacity() const {
    return static_cast<uint32_t>(buffer_.size());
  }

  // Length is not tracked separately from Capacity; consumers of this type go
  // through IBufferByteAccess/Capacity instead, so both Length accessors throw.
  uint32_t Length() const {
    throw winrt::hresult_error(E_NOTIMPL);
  }

  void Length(uint32_t /*value*/) {
    throw winrt::hresult_error(E_NOTIMPL);
  }

  // IBufferByteAccess: exposes the raw byte pointer of the backing vector.
  STDMETHOD(Buffer)
  (uint8_t** value) {
    RETURN_HR_IF_NULL(E_POINTER, value);
    *value = buffer_.data();
    return S_OK;
  }

 private:
  std::vector<BYTE> buffer_;
};
template <typename T>
class TensorBuffer {
wss::IBuffer combined_buffer_;
std::vector<wss::IBuffer> buffers_;
size_t size_;
TensorBuffer(size_t size) :
size_(size),
combined_buffer_(winrt::make<VectorBuffer>(size * sizeof(T))),
buffers_ { combined_buffer_ } {
auto buffer = BufferAt(0);
// The initial release of WinML (RS5) shipped with behavior that would
// zero-initialize uninitialized tensors. After measuring, the performance impact
// of memsetting the memory buffer is quite small (<1ms for 3channel 720x720 TensorFloats).
// To maintain parity with RS5 behavior, we always zero out the memory buffer.
memset(buffer.data(), 0, buffer.size_bytes());
}
TensorBuffer(
size_t size,
wfc::IIterable<wss::IBuffer> const& buffers) : size_(size),
combined_buffer_(nullptr),
buffers_(begin(buffers), end(buffers)) {
if (buffers_.size() == 1) {
combined_buffer_ = buffers_[0];
} else {
// If there are many buffers, then the combined buffer will be a separately allocated value that combines all of the buffers.
// This needs to be lazily done however, as the extra memory should not be allocated when not needed (GPU).
}
}
auto CombinedBuffer() {
if (combined_buffer_ == nullptr) {
combined_buffer_ = winrt::make<VectorBuffer>(size_ * sizeof(T));
}
return BufferFrom(combined_buffer_);
}
public:
static auto Create(size_t size) {
return std::shared_ptr<TensorBuffer>(new TensorBuffer(size));
}
static auto Create(
size_t size,
wss::IBuffer buffer) {
return std::shared_ptr<TensorBuffer>(new TensorBuffer(size, buffer));
}
static auto Create(
size_t size,
wfc::IIterable<wss::IBuffer> const& buffers) {
return std::shared_ptr<TensorBuffer>(new TensorBuffer(size, buffers));
}
auto NumElements() {
return size_;
}
auto SizeInBytes() {
return size_ * sizeof(T);
}
auto NumBuffers() {
return buffers_.size();
}
auto& Buffers() {
return buffers_;
}
auto Buffer(bool should_sync_buffer) {
if (buffers_.size() == 1) {
// Single buffer optimization to not create a temporary buffer that concatenates disjoint buffers into one.
return BufferAt(0);
}
auto span = CombinedBuffer();
if (should_sync_buffer) {
_winml::LoadOrStoreDisjointBuffers(
true /*load buffer*/,
buffers_.size(),
[this](size_t i) { return BufferAt(i); },
span);
}
return span;
}
auto Flush() {
auto should_flush = buffers_.size() != 1;
if (should_flush) {
auto span = CombinedBuffer();
_winml::LoadOrStoreDisjointBuffers(
false /*store buffer*/,
buffers_.size(),
[this](size_t i) { return BufferAt(i); },
span);
}
return should_flush;
}
auto Set(size_t size_in_bytes, const T* data) {
WINML_THROW_HR_IF_FALSE_MSG(
E_INVALIDARG,
size_in_bytes <= (size_ * sizeof(T)),
"Argument size (%llu) exceeds the tensor size (%llu).",
static_cast<uint64_t>(size_in_bytes),
static_cast<uint64_t>(size_ * sizeof(T)));
gsl::span<byte> span(reinterpret_cast<byte*>(const_cast<T*>(data)), size_in_bytes);
_winml::LoadOrStoreDisjointBuffers(
false /*store buffer*/,
buffers_.size(),
[this](size_t i) { return BufferAt(i); },
span);
}
auto Set(std::vector<T>&& moveableData) {
Set(moveableData.size() * sizeof(T), moveableData.data());
}
private:
auto BufferFrom(wss::IBuffer buffer) {
byte* current_data = nullptr;
auto bufferByteAccess = buffer.as<Windows::Storage::Streams::IBufferByteAccess>();
bufferByteAccess->Buffer(&current_data);
return gsl::span<byte>(
current_data,
static_cast<size_t>(buffer.Capacity()));
}
auto BufferAt(size_t index) {
return BufferFrom(buffers_[index]);
}
};
// Specialization for string tensors. String data cannot be exposed through
// IBuffer (std::string is not POD), so the backing store is a plain
// std::vector<std::string> and only a single logical buffer is supported.
template <>
class TensorBuffer<std::string> {
  std::vector<std::string> buffer_;

  TensorBuffer(size_t size) : buffer_(size) {}

 public:
  static auto Create(size_t size) {
    return std::shared_ptr<TensorBuffer>(new TensorBuffer(size));
  }

  auto NumElements() {
    return buffer_.size();
  }

  // NOTE(review): returns the element count, not a byte size — std::string
  // elements have no fixed wire size. Callers must not treat this as bytes.
  auto SizeInBytes() {
    return buffer_.size();
  }

  // String tensors are always backed by exactly one logical buffer.
  auto NumBuffers() {
    return 1;
  }

  // Nothing to sync: there is never a disjoint set of backing buffers.
  auto Flush() {
    return false;
  }

  // IBuffer access is not supported for string tensors.
  auto Buffers() -> std::vector<wss::IBuffer>& {
    WINML_THROW_HR(E_UNEXPECTED);
  }

  // Span over the std::string objects themselves (object storage, not the
  // character data they own); span length is the element count, matching the
  // SizeInBytes() convention above.
  auto BufferAt(size_t index) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        index == 0,
        "TensorString can only be backed by a single buffer!");
    return gsl::span<byte>(reinterpret_cast<byte*>(buffer_.data()), buffer_.size());
  }

  auto Buffer(bool /*should_sync_buffer*/) {
    return BufferAt(0);
  }

  // Copy `size` string_views into the backing strings
  // (non-destructive to the caller's data).
  auto Set(size_t size, std::string_view* data) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        size <= buffer_.size(),
        "Argument size (%d) exceeds the tensor size (%d).",
        static_cast<int>(size),
        static_cast<int>(buffer_.size()));

    // Copy
    std::copy(data, data + size, buffer_.begin());
  }
};
} // namespace _winml

View file

@ -29,12 +29,12 @@ struct TensorResources {
*capacity = 0;
// Lazily allocate the cpu resource on call to GetBuffer
if (CpuResource == nullptr) {
CpuResource = std::make_shared<_winml::Tensor<T>>(shape);
if (cpu_resource_ == nullptr) {
cpu_resource_ = std::make_shared<_winml::Tensor<T>>(shape);
}
// Get the data pointer and size
auto buffer = CpuResource->buffer();
auto buffer = cpu_resource_->buffer();
// Set out parameters
*capacity = static_cast<uint32_t>(buffer.size_bytes());
@ -45,8 +45,8 @@ struct TensorResources {
}
// Theses are access directly by TensorMemoryBufferReference<T> and TensorBase
std::shared_ptr<_winml::Tensor<T>> CpuResource;
winrt::com_ptr<ID3D12Resource> GpuResource;
std::shared_ptr<_winml::Tensor<T>> cpu_resource_;
winrt::com_ptr<ID3D12Resource> gpu_resource_;
};
// This class holds onto the lifetime of TensorResources<T> so that they can be kept alive by TensorBase AND its active MBRs.