Refactor implementation of Tensor<T> and underlying buffer stores to improve binary size and maintainability (#5836)

* refactor tensor buffers to make cleaner

* refactor to make tensor backing buffer implementation smaller and cleaner

* missed virtual on destructor

* remove unnecessary static_pointer_cast

* add string vector accessor

Co-authored-by: Sheil Kumar <sheilk@microsoft.com>
This commit is contained in:
Sheil Kumar 2020-11-18 14:56:47 -08:00 committed by GitHub
parent 85f945a875
commit 84c1340f9b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 548 additions and 401 deletions

View file

@ -400,15 +400,18 @@ endif(onnxruntime_USE_DML)
# Add static library that will be archived/linked for both static/dynamic library
add_library(winml_lib_api STATIC
${winml_lib_api_dir}/impl/FeatureCompatibility.h
${winml_lib_api_dir}/impl/IData.h
${winml_lib_api_dir}/impl/IMapFeatureValue.h
${winml_lib_api_dir}/impl/ISequenceFeatureValue.h
${winml_lib_api_dir}/impl/MapBase.h
${winml_lib_api_dir}/impl/NumericData.h
${winml_lib_api_dir}/impl/SequenceBase.h
${winml_lib_api_dir}/impl/StringData.h
${winml_lib_api_dir}/impl/Tensor.h
${winml_lib_api_dir}/impl/TensorBase.h
${winml_lib_api_dir}/impl/TensorBuffer.h
${winml_lib_api_dir}/impl/TensorKindFrom.h
${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
${winml_lib_api_dir}/NumericData.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
${winml_lib_api_dir}/ImageFeatureDescriptor.h
${winml_lib_api_dir}/ImageFeatureValue.cpp
@ -429,8 +432,11 @@ add_library(winml_lib_api STATIC
${winml_lib_api_dir}/MapFeatureDescriptor.h
${winml_lib_api_dir}/SequenceFeatureDescriptor.cpp
${winml_lib_api_dir}/SequenceFeatureDescriptor.h
${winml_lib_api_dir}/StringData.cpp
${winml_lib_api_dir}/TensorFeatureDescriptor.cpp
${winml_lib_api_dir}/TensorFeatureDescriptor.h
${winml_lib_api_dir}/VectorBackedBuffer.h
${winml_lib_api_dir}/VectorBackedBuffer.cpp
${winml_lib_api_dir}/pch/pch.h
)

View file

@ -3,7 +3,7 @@
namespace _winml {
void LoadOrStoreDisjointBuffers(
static void LoadOrStoreDisjointBuffers(
bool should_load_buffer,
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
@ -31,4 +31,18 @@ void LoadOrStoreDisjointBuffers(
}
}
void LoadSpanFromDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span) {
LoadOrStoreDisjointBuffers(true /*load into the span*/, num_buffers, get_buffer, buffer_span);
}
void StoreSpanIntoDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span) {
LoadOrStoreDisjointBuffers(false /*store into buffers*/, num_buffers, get_buffer, buffer_span);
}
} // namespace _winml

View file

@ -630,8 +630,7 @@ void TensorToVideoFrameConverter::ConvertBatchedDX12TensorToBuffers(
byte* readback_buffer = nullptr;
WINML_THROW_IF_FAILED(readback_heap_->Map(0, &CD3DX12_RANGE(0, buffer_size_in_bytes), reinterpret_cast<void**>(&readback_buffer)));
auto readback_buffer_span = gsl::span<byte>(readback_buffer, buffer_size_in_bytes);
_winml::LoadOrStoreDisjointBuffers(
false /*load disjoint buffers into*/,
_winml::StoreSpanIntoDisjointBuffers(
buffers.size(),
[&](size_t i) {
byte* buffer_start = nullptr;

View file

@ -559,8 +559,7 @@ void VideoFrameToTensorConverter::ConvertBuffersToBatchedGPUTensor(
WINML_THROW_IF_FAILED(upload_heap_->Map(0, &CD3DX12_RANGE(0, 0), reinterpret_cast<void**>(&gpu_buffer)));
auto gpu_buffer_span = gsl::span<byte>(gpu_buffer, buffer_size_in_bytes);
_winml::LoadOrStoreDisjointBuffers(
true /*load disjoint buffers into*/,
_winml::LoadSpanFromDisjointBuffers(
buffers.size(),
[&](size_t i) {
byte* buffer_start = nullptr;

View file

@ -7,8 +7,12 @@
namespace _winml {
void LoadOrStoreDisjointBuffers(
bool should_load_buffer,
void LoadSpanFromDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span);
void StoreSpanIntoDisjointBuffers(
size_t num_buffers,
std::function<gsl::span<byte>(size_t)> get_buffer,
gsl::span<byte>& buffer_span);

View file

@ -0,0 +1,129 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include "impl/NumericData.h"
#include "VectorBackedBuffer.h"
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
#include "DisjointBufferHelpers.h"
namespace _winml {

// Factory for numeric tensor backing data. Instances are always owned by a
// shared_ptr, matching the idata contract used by Tensor<T>.
std::shared_ptr<_winml::idata> numeric_data::create(
    size_t num_elements,
    size_t element_size_in_bytes,
    wfc::IIterable<wss::IBuffer> const& buffers) {
  return std::make_shared<numeric_data>(num_elements, element_size_in_bytes, buffers);
}

// Fix: the member-initializer list is ordered to match the declaration order in
// NumericData.h (combined_buffer_, buffers_, num_elements_, element_size_in_bytes_).
// Members are always initialized in declaration order regardless of list order,
// so the previous list only triggered -Wreorder; the initializers are
// independent and behavior is unchanged.
numeric_data::numeric_data(
    size_t num_elements, size_t element_size_in_bytes, wfc::IIterable<wss::IBuffer> const& buffers) :
        combined_buffer_(nullptr),
        buffers_(),
        num_elements_(num_elements),
        element_size_in_bytes_(element_size_in_bytes) {
  if (buffers != nullptr) {
    buffers_ = {begin(buffers), end(buffers)};
  }

  if (buffers_.size() == 0) {
    // No caller-supplied buffers: allocate a single owned backing buffer.
    combined_buffer_ = winrt::make<vector_backed_buffer>(num_elements * element_size_in_bytes);
    buffers_ = {combined_buffer_};
    auto buffer = buffer_at(0);

    // The initial release of WinML (RS5) shipped with behavior that would
    // zero-initialize uninitialized tensors. After measuring, the performance impact
    // of memsetting the memory buffer is quite small (<1ms for 3channel 720x720 TensorFloats).
    // To maintain parity with RS5 behavior, we always zero out the memory buffer.
    memset(buffer.data(), 0, buffer.size_bytes());
  } else if (buffers_.size() == 1) {
    // A single caller-supplied buffer doubles as the combined buffer.
    combined_buffer_ = buffers_[0];
  } else {
    // If there are many buffers, then the combined buffer will be a separately allocated value that combines all of the buffers.
    // This needs to be lazily done however, as the extra memory should not be allocated when not needed (GPU).
  }
}

// Number of elements (not bytes) in the tensor.
size_t numeric_data::num_elements() {
  return num_elements_;
}

// Total tensor size in bytes.
size_t numeric_data::size_in_bytes() {
  return num_elements_ * element_size_in_bytes_;
}

// Number of backing buffers (1 unless the caller supplied a disjoint set).
size_t numeric_data::num_buffers() {
  return buffers_.size();
}

// Direct access to the backing buffers.
std::vector<wss::IBuffer>& numeric_data::buffers() {
  return buffers_;
}

// Returns one contiguous span over the tensor data. When the data is split
// across multiple buffers, the combined buffer is (lazily) allocated and, if
// should_sync_buffer is set, refreshed from the disjoint backing buffers.
gsl::span<byte> numeric_data::buffer(bool should_sync_buffer) {
  if (buffers_.size() == 1) {
    // Single buffer optimization to not create a temporary buffer that concatenates disjoint buffers into one.
    return buffer_at(0);
  }

  auto span = combined_buffer();
  if (should_sync_buffer) {
    _winml::LoadSpanFromDisjointBuffers(
        buffers_.size(),
        [this](size_t i) { return buffer_at(i); },
        span);
  }
  return span;
}

// Writes the combined buffer's contents back into the disjoint backing buffers.
// Returns true if a flush was actually performed (more than one buffer).
bool numeric_data::flush() {
  auto should_flush = buffers_.size() != 1;
  if (should_flush) {
    auto span = combined_buffer();
    _winml::StoreSpanIntoDisjointBuffers(
        buffers_.size(),
        [this](size_t i) { return buffer_at(i); },
        span);
  }
  return should_flush;
}

// Copies data_size bytes of caller data into the backing buffers.
// Throws E_INVALIDARG when the data is larger than the tensor.
void numeric_data::set(size_t data_size, const byte* data) {
  WINML_THROW_HR_IF_FALSE_MSG(
      E_INVALIDARG,
      data_size <= (num_elements_ * element_size_in_bytes_),
      "Argument size (%llu) exceeds the tensor size (%llu).",
      static_cast<uint64_t>(data_size),
      static_cast<uint64_t>(num_elements_ * element_size_in_bytes_));

  // const_cast is acceptable here: the store helper only reads from the
  // source span and writes into the backing buffers.
  gsl::span<byte> span(const_cast<byte*>(data), data_size);
  _winml::StoreSpanIntoDisjointBuffers(
      buffers_.size(),
      [this](size_t i) { return buffer_at(i); },
      span);
}

// Maps an IBuffer's raw memory as a byte span via IBufferByteAccess.
static gsl::span<byte> get_span_from_ibuffer(wss::IBuffer buffer) {
  byte* current_data = nullptr;
  auto bufferByteAccess = buffer.as<Windows::Storage::Streams::IBufferByteAccess>();
  bufferByteAccess->Buffer(&current_data);
  return gsl::span<byte>(
      current_data,
      static_cast<size_t>(buffer.Capacity()));
}

// Span over the index-th backing buffer.
gsl::span<byte> numeric_data::buffer_at(size_t index) {
  return get_span_from_ibuffer(buffers_[index]);
}

// Span over the combined buffer, lazily allocating it on first use.
gsl::span<byte> numeric_data::combined_buffer() {
  if (combined_buffer_ == nullptr) {
    combined_buffer_ = winrt::make<vector_backed_buffer>(num_elements_ * element_size_in_bytes_);
  }
  return get_span_from_ibuffer(combined_buffer_);
}

}  // namespace _winml

View file

@ -0,0 +1,62 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include "impl/StringData.h"
namespace _winml {

// string_data backs TensorString values with a std::vector<std::string>.
// Strings are variable-width and CPU-only, so the byte-oriented idata members
// (size_in_bytes, buffers, byte-wise set) intentionally throw E_UNEXPECTED.

// Constructs storage for `size` default-empty strings.
string_data::string_data(size_t size) :
    buffer_(size) {}

// Factory: instances are always owned by a shared_ptr, per the idata contract.
std::shared_ptr<_winml::idata> string_data::create(size_t size) {
  return std::make_shared<string_data>(size);
}

// Number of strings in the tensor.
size_t string_data::num_elements() {
  return buffer_.size();
}

// Not meaningful for variable-width string storage; always throws.
size_t string_data::size_in_bytes() {
  WINML_THROW_HR(E_UNEXPECTED);
}

size_t string_data::num_buffers() {
  return 1;
}

// Nothing to write back: the std::string vector is the only storage.
bool string_data::flush() {
  // Vacuously true
  return true;
}

// String data has no IBuffer backing; always throws.
std::vector<wss::IBuffer>& string_data::buffers() {
  WINML_THROW_HR(E_UNEXPECTED);
}

// Exposes the std::string array as raw bytes.
// NOTE(review): the span length is the element count, not the byte size of
// the array — confirm consumers only use .data() plus num_elements() here.
gsl::span<byte> string_data::buffer(bool /*should_sync_buffer*/) {
  return gsl::span<byte>(reinterpret_cast<byte*>(buffer_.data()), buffer_.size());
}

// Copies num_elements string_views into the backing strings.
// Throws E_INVALIDARG when num_elements exceeds the tensor size.
void string_data::set(size_t num_elements, const std::string_view* data) {
  WINML_THROW_HR_IF_FALSE_MSG(
      E_INVALIDARG,
      num_elements <= buffer_.size(),
      "Argument size (%d) exceeds the tensor size (%d).",
      static_cast<int>(num_elements),
      static_cast<int>(buffer_.size()));

  // Copy
  std::copy(data, data + num_elements, buffer_.begin());
}

// Byte-wise set is unsupported for strings; always throws.
void string_data::set(size_t /*data_size*/, const byte* /*data*/) {
  WINML_THROW_HR(E_UNEXPECTED);
}

// Direct access to the underlying strings (used by string tensor accessors).
std::vector<std::string>& string_data::get_backing_vector() {
  return buffer_;
}

}  // namespace _winml

View file

@ -0,0 +1,29 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "pch.h"
#include "VectorBackedBuffer.h"
namespace _winml {

// Allocates `size` value-initialized (zeroed) bytes.
vector_backed_buffer::vector_backed_buffer(size_t size) : buffer_(size) {}

// Capacity of the underlying vector, in bytes.
uint32_t vector_backed_buffer::Capacity() const {
  return static_cast<uint32_t>(buffer_.size());
}

// Length is unimplemented for this internal buffer type (throws E_NOTIMPL);
// presumably callers only use Capacity() and IBufferByteAccess — confirm.
uint32_t vector_backed_buffer::Length() const {
  throw winrt::hresult_error(E_NOTIMPL);
}

void vector_backed_buffer::Length(uint32_t /*value*/) {
  throw winrt::hresult_error(E_NOTIMPL);
}

// IBufferByteAccess::Buffer — hands out the raw pointer to the vector storage.
STDMETHODIMP vector_backed_buffer::Buffer(uint8_t** value) {
  RETURN_HR_IF_NULL(E_POINTER, value);
  *value = buffer_.data();
  return S_OK;
}

}  // namespace _winml

View file

@ -0,0 +1,28 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
namespace _winml {
// An IBuffer implementation backed by a std::vector<BYTE>, used to allocate
// CPU tensor memory that is handed out through the WinRT buffer interfaces.
class vector_backed_buffer : public winrt::implements<
                                 vector_backed_buffer,
                                 wss::IBuffer,
                                 Windows::Storage::Streams::IBufferByteAccess> {
 public:
  // Allocates a buffer of `size` bytes. Marked explicit to prevent accidental
  // implicit conversion from size_t; interface-compatible, since call sites
  // construct it directly via winrt::make.
  explicit vector_backed_buffer(size_t size);

  // Total capacity in bytes.
  uint32_t Capacity() const;
  // Length accessors are not implemented for this internal buffer type.
  uint32_t Length() const;
  void Length(uint32_t /*value*/);

  // IBufferByteAccess: returns the raw byte pointer to the buffer contents.
  STDMETHOD(Buffer)(uint8_t** value);

 private:
  std::vector<BYTE> buffer_;
};
} // namespace _winml

View file

@ -0,0 +1,25 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "IEngine.h"
// idata is the abstract interface over the backing stores (numeric or string)
// that hold a tensor's CPU-side data; Tensor<T> accesses its storage through it.
namespace _winml {
// Abstract backing store for tensor data. Implemented by numeric_data (raw,
// byte-addressable, possibly split across several buffers) and string_data
// (CPU-only std::string storage, which throws E_UNEXPECTED from the members
// that do not apply to strings).
struct idata {
  // Defaulted virtual destructor so implementations destroy correctly when
  // deleted through an idata pointer.
  virtual ~idata() = default;

  // Number of elements (not bytes) in the tensor.
  virtual size_t num_elements() = 0;
  // Total size in bytes.
  virtual size_t size_in_bytes() = 0;
  // Number of backing buffers.
  virtual size_t num_buffers() = 0;
  // Direct access to the backing buffers.
  virtual std::vector<wss::IBuffer>& buffers() = 0;
  // Single contiguous span over the data; should_sync_buffer requests a
  // refresh from the disjoint backing buffers first.
  virtual gsl::span<byte> buffer(bool should_sync_buffer) = 0;
  // Write the combined view back into the backing buffers; returns whether a
  // flush was performed.
  virtual bool flush() = 0;
  // Copy raw caller bytes into the store.
  virtual void set(size_t data_size, const byte* data) = 0;
};
} // namespace _winml

View file

@ -0,0 +1,46 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "IData.h"
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
namespace _winml {
class numeric_data : public _winml::idata {
 public:
  // Factory: creates the store as a shared_ptr (the only supported ownership).
  static std::shared_ptr<_winml::idata> create(
      size_t num_elements,
      size_t element_size_in_bytes,
      wfc::IIterable<wss::IBuffer> const& buffers);

  // Public so std::make_shared can construct it; prefer create() so instances
  // are always owned by a shared_ptr.
  numeric_data(size_t num_elements, size_t element_size_in_bytes, wfc::IIterable<wss::IBuffer> const& buffers);

  // Span over the index-th backing buffer.
  gsl::span<byte> buffer_at(size_t index);
  // Span over the combined (concatenated) buffer, lazily allocated when the
  // data is split across multiple buffers.
  gsl::span<byte> combined_buffer();

 public:
  size_t num_elements() override;
  size_t size_in_bytes() override;
  size_t num_buffers() override;

  // Buffer accessors
  std::vector<wss::IBuffer>& buffers() override;
  gsl::span<byte> buffer(bool should_sync_buffer) override;

  // Flush to buffers API
  bool flush() override;

  // Set APIs
  void set(size_t data_size, const byte* data) override;

 private:
  wss::IBuffer combined_buffer_;
  std::vector<wss::IBuffer> buffers_;
  size_t num_elements_;
  size_t element_size_in_bytes_;
};
} // namespace _winml

View file

@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "IData.h"
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
namespace _winml {
class string_data : public _winml::idata {
 public:
  // Factory: creates storage for `size` strings, owned by a shared_ptr.
  static std::shared_ptr<_winml::idata> create(size_t size);
  string_data(size_t size);

  size_t num_elements() override;
  // Throws E_UNEXPECTED: strings are variable-width, so a byte size is undefined.
  size_t size_in_bytes() override;
  size_t num_buffers() override;

  // Buffer accessors
  // buffers() throws E_UNEXPECTED: string data has no IBuffer backing.
  std::vector<wss::IBuffer>& buffers() override;
  gsl::span<byte> buffer(bool should_sync_buffer) override;

  // Flush to buffers API
  bool flush() override;

  // Set APIs
  // Byte-wise set throws E_UNEXPECTED; use the string_view overload below.
  void set(size_t data_size, const byte* data) override;

 public:
  // Copies num_elements string_views into the backing strings.
  void set(size_t num_elements, const std::string_view* data);
  // Direct access to the underlying strings (used by string tensor accessors).
  std::vector<std::string>& get_backing_vector();

 private:
  std::vector<std::string> buffer_;
};
} // namespace _winml

View file

@ -3,7 +3,8 @@
#pragma once
#include "TensorBuffer.h"
#include "NumericData.h"
#include "StringData.h"
//
// the Tensor class is the actual object for CPU memory buffers.
@ -12,81 +13,83 @@
//
namespace _winml {
// Returns the element count implied by a tensor shape: the product of all
// dimensions. An empty shape (a scalar) yields 1, the product identity.
inline size_t compute_size_of_shape(const std::vector<int64_t>& shape) {
  int64_t element_count = 1;
  for (const auto dimension : shape) {
    element_count *= dimension;
  }
  return static_cast<size_t>(element_count);
}
// Creates the backing data store for a numeric tensor of element type T.
// The element count is derived from the shape; buffers may be null, in which
// case the store allocates (and zeroes) its own memory.
template <typename T>
inline auto create_data(
    const std::vector<int64_t>& shape,
    const wfc::IIterable<wss::IBuffer>& buffers) {
  return _winml::numeric_data::create(compute_size_of_shape(shape), sizeof(T), buffers);
}
// Specialization for string tensors: string data owns its std::vector<std::string>
// storage and is CPU-only, so any caller-supplied buffers are ignored.
template <>
inline auto create_data<std::string>(
    const std::vector<int64_t>& shape,
    const wfc::IIterable<wss::IBuffer>& /*buffers*/) {
  return _winml::string_data::create(compute_size_of_shape(shape));
}
template <typename T>
class Tensor {
private:
std::shared_ptr<TensorBuffer<T>> buffer_;
std::shared_ptr<_winml::idata> data_;
std::vector<int64_t> shape_;
public:
private:
Tensor() = delete;
Tensor(
std::vector<int64_t> const& shape,
wfc::IIterable<wss::IBuffer> const& buffers) :
shape_(shape),
buffer_(TensorBuffer<T>::Create(
static_cast<size_t>(std::accumulate(
std::begin(shape), std::end(shape),
static_cast<int64_t>(1), std::multiplies<int64_t>())),
buffers)) {}
public:
Tensor(const std::vector<int64_t>& shape) :
shape_(shape),
data_(create_data<T>(shape, nullptr)) {}
Tensor(
std::vector<int64_t> const& shape) : shape_(shape),
buffer_(TensorBuffer<T>::Create(
static_cast<size_t>(std::accumulate(
std::begin(shape), std::end(shape),
static_cast<int64_t>(1),
std::multiplies<int64_t>())))) {}
Tensor(
std::vector<int64_t> const&& shape) : shape_(std::move(shape)),
buffer_(TensorBuffer<T>::Create(
static_cast<size_t>(std::accumulate(
std::begin(shape), std::end(shape),
static_cast<int64_t>(1),
std::multiplies<int64_t>())))) {
}
auto number_of_elements() const {
return buffer_->NumElements();
}
const std::vector<int64_t>& shape,
const wfc::IIterable<wss::IBuffer>& buffers) :
shape_(shape),
data_(create_data<T>(shape, buffers)) {}
auto size_in_bytes() const {
return buffer_->SizeInBytes();
return data_->size_in_bytes();
}
auto num_buffers() {
return buffer_->NumBuffers();
return data_->num_buffers();
}
auto& buffers() {
return buffer_->Buffers();
return data_->buffers();
}
auto buffer(bool should_sync_buffer = true) {
auto span = buffer_->Buffer(should_sync_buffer);
return gsl::span<T>(reinterpret_cast<T*>(span.data()), buffer_->NumElements());
gsl::span<T> buffer(bool should_sync_buffer = true) {
auto span = data_->buffer(should_sync_buffer);
return gsl::span<T>(reinterpret_cast<T*>(span.data()), data_->num_elements());
}
auto flush() {
return buffer_->Flush();
return data_->flush();
}
void set(size_t size, const T* pData) {
buffer_->Set(size * sizeof(T), pData);
}
void set(std::vector<T>&& other) {
buffer_->Set(other);
void set(size_t size, const T* data) {
auto size_in_bytes = size * sizeof(T);
data_->set(size_in_bytes, reinterpret_cast<const byte*>(data));
}
const std::vector<int64_t>& shape() const {
return shape_;
}
auto get_tensor_buffer() {
return buffer_;
auto get_data() {
return data_;
}
};
} // namespace _winml

View file

@ -74,28 +74,28 @@ struct TensorBase : TBase {
/// b) TensorBase(winrt::Windows::Foundation::Collections::IIterable<int64_t> const& shape)
/// 3) use provided backing gpu memory
/// a) TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* pResource)
TensorBase() : m_resources(std::make_shared<TensorResources<T>>()) {
TensorBase() : resources_(std::make_shared<TensorResources<T>>()) {
}
TensorBase(wfc::IIterable<int64_t> const& shape) : shape_(begin(shape), end(shape)),
m_resources(std::make_shared<TensorResources<T>>()) {
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
resources_(std::make_shared<TensorResources<T>>()) {
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
}
TensorBase(std::vector<int64_t> const& shape) : shape_(shape),
m_resources(std::make_shared<TensorResources<T>>()) {
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
resources_(std::make_shared<TensorResources<T>>()) {
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
}
TensorBase(std::vector<int64_t> const& shape, ID3D12Resource* resource) : shape_(shape),
m_resources(std::make_shared<TensorResources<T>>()) {
resources_(std::make_shared<TensorResources<T>>()) {
// This Api is not supported for TensorString
WINML_THROW_HR_IF_TRUE_MSG(
E_ILLEGAL_METHOD_CALL,
(std::is_same<T, std::string>::value),
"TensorString objects cannot be created from a ID3D12Resource!");
GetGpuResource().copy_from(resource);
GpuTensor().copy_from(resource);
}
HRESULT CreateGPUMLValue(ID3D12Resource* resource, BindingContext& context, IValue** out) {
@ -117,21 +117,21 @@ struct TensorBase : TBase {
auto engine = session->GetEngine();
auto should_sync_buffer = context.type == _winml::BindingType::kInput;
if (GetCpuResource() != nullptr) {
if (CpuTensor() != nullptr) {
return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out);
}
// If there is no matching cpu resource, then fallback to a gpu resource
if (GetGpuResource() != nullptr) {
return CreateGPUMLValue(GetGpuResource().get(), context, out);
if (GpuTensor() != nullptr) {
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
WINML_THROW_HR(WINML_ERR_INVALID_BINDING);
}
HRESULT GPUTensorize(_winml::BindingContext& context, IValue** out) {
if (GetGpuResource() != nullptr) {
return CreateGPUMLValue(GetGpuResource().get(), context, out);
if (GpuTensor() != nullptr) {
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
// Get engine
@ -142,8 +142,8 @@ struct TensorBase : TBase {
auto should_sync_buffer = context.type == _winml::BindingType::kInput;
// If there is no matching gpu resource, then fallback to a cpu resource
if (GetCpuResource() != nullptr) {
auto num_backing_buffers = GetCpuResource()->num_buffers();
if (CpuTensor() != nullptr) {
auto num_backing_buffers = CpuTensor()->num_buffers();
if (num_backing_buffers == 1) {
// If we have a single backing cpu buffer, there is no need to create GPU resources.
// The engine will use the buffer provided, and perform the needed copies into the GPU context as needed.
@ -154,24 +154,24 @@ struct TensorBase : TBase {
// If we are binding inputs, then a GPU resource needs to be allocated, and individual buffer contents need
// to be copied directly into a gpu resource.
if (GetGpuResource() == nullptr) {
GetGpuResource() = CreateD3D12Resource(session);
if (GpuTensor() == nullptr) {
GpuTensor() = CreateD3D12Resource(session);
}
_winml::ConverterResourceDescription descriptor = {};
descriptor.pixel_format = static_cast<DWORD>(wgdx::DirectXPixelFormat::Unknown);
descriptor.width = static_cast<int>(GetCpuResource()->size_in_bytes());
descriptor.width = static_cast<int>(CpuTensor()->size_in_bytes());
descriptor.height = static_cast<int>(1);
descriptor.luid = device->GetD3DDevice()->GetAdapterLuid(); // Converted image on GPU
context.converter = _winml::PoolObjectWrapper::Create(device->TensorizerStore()->Fetch(descriptor));
context.converter->Get()->Tensorizer->ConvertBuffersToBatchedGPUTensor(
GetCpuResource()->buffers(),
GetCpuResource()->size_in_bytes(),
CpuTensor()->buffers(),
CpuTensor()->size_in_bytes(),
*device->GetD3DDeviceCache(),
GetGpuResource().get());
GpuTensor().get());
return CreateGPUMLValue(GetGpuResource().get(), context, out);
return CreateGPUMLValue(GpuTensor().get(), context, out);
} else if (context.type == _winml::BindingType::kOutput) {
// If we are binding outputs, then the buffers do not need to bound. If the engine produces a output on the gpu
@ -179,8 +179,8 @@ struct TensorBase : TBase {
// into the output buffers without temporary intermediary buffers! No binding here is necessary.
// If the output produces a cpu buffer (even in the GPU case), we will already have a cpu buffer, and just need
// to copy back to the output buffers, no binding is necessary.
GetGpuResource() = CreateD3D12Resource(session);
return CreateGPUMLValue(GetGpuResource().get(), context, out);
GpuTensor() = CreateD3D12Resource(session);
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
}
}
@ -188,11 +188,11 @@ struct TensorBase : TBase {
if (TensorKind() == winml::TensorKind::String) {
// Lazily allocate the cpu TensorString resource
// TensorStrings are CPU only, and so a gpu resource cannot be allocated for them.
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
return CreateTensorValueFromExternalBuffer(engine, should_sync_buffer, out);
} else {
GetGpuResource() = CreateD3D12Resource(session);
return CreateGPUMLValue(GetGpuResource().get(), context, out);
GpuTensor() = CreateD3D12Resource(session);
return CreateGPUMLValue(GpuTensor().get(), context, out);
}
}
@ -242,8 +242,8 @@ struct TensorBase : TBase {
void EnsureBufferNotInUse() {
auto isBufferInUse =
std::any_of(
m_outstandingReferences.begin(),
m_outstandingReferences.end(),
outstanding_references_.begin(),
outstanding_references_.end(),
[](auto weakRef) { return weakRef.get() != nullptr; });
WINML_THROW_HR_IF_TRUE_MSG(WINML_ERR_INVALID_BINDING, isBufferInUse, "The tensor has outstanding memory buffer references that must be closed prior to evaluation!");
@ -254,7 +254,7 @@ struct TensorBase : TBase {
(_winml::BindingContext& context, IValue** out) {
RETURN_HR_IF_NULL_MSG(
WINML_ERR_INVALID_BINDING,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
EnsureBufferNotInUse();
@ -289,7 +289,7 @@ struct TensorBase : TBase {
// the conditions of ASSERT_TEMPLATE_PARAMETERS_EXACT() are met.
ASSERT_TEMPLATE_PARAMETERS<ElementType, ElementViewType>();
GetCpuResource()->set(size, reinterpret_cast<ElementType*>(data));
CpuTensor()->set(size, reinterpret_cast<ElementType*>(data));
}
template <>
@ -297,7 +297,8 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<std::string, winrt::hstring>();
GetCpuResource()->get_tensor_buffer()->Set(size, reinterpret_cast<std::string_view*>(data));
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
string_data->set(size, reinterpret_cast<std::string_view*>(data));
}
template <typename ElementType = T, typename ElementViewType = ViewT>
@ -307,8 +308,8 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS<ElementType, ElementViewType>();
RETURN_IF_FAILED_MSG(engine->CreateTensorValueFromExternalBuffer(
GetCpuResource()->buffer(sync_buffer).data(), GetCpuResource()->size_in_bytes(), GetCpuResource()->shape().data(),
GetCpuResource()->shape().size(), TensorKind(), value),
CpuTensor()->buffer(sync_buffer).data(), CpuTensor()->size_in_bytes(), CpuTensor()->shape().data(),
CpuTensor()->shape().size(), TensorKind(), value),
"Failed to prepare buffer for copy back from device resource.");
return S_OK;
}
@ -318,17 +319,19 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<std::string, winrt::hstring>();
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
std::vector<const char*> raw_values;
auto string_array = static_cast<std::string*>(GetCpuResource()->buffer().data());
std::transform(
string_array,
string_array + GetCpuResource()->number_of_elements(),
std::begin(string_vector),
std::end(string_vector),
std::back_inserter(raw_values),
[&](auto& str) { return str.c_str(); });
RETURN_IF_FAILED_MSG(engine->CreateStringTensorValueFromDataWithCopy(
raw_values.data(), raw_values.size(), GetCpuResource()->shape().data(),
GetCpuResource()->shape().size(), value),
raw_values.data(), raw_values.size(), CpuTensor()->shape().data(),
CpuTensor()->shape().size(), value),
"Failed to prepare buffer for copy back from device resource.");
return S_OK;
}
@ -338,7 +341,7 @@ struct TensorBase : TBase {
(BindingContext& context, IValue* value) {
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached during evaluation!");
_winml::Resource updated_resource;
@ -348,14 +351,14 @@ struct TensorBase : TBase {
RETURN_IF_FAILED_MSG(value->GetTensorShape(shape_), "Failed to get the tensor shape from resource!");
// make sure we always have a CPU resource
if (GetCpuResource() == nullptr) {
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape_);
if (CpuTensor() == nullptr) {
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape_);
}
bool is_cpu;
if (SUCCEEDED(value->IsCpu(&is_cpu)) && is_cpu) {
// Get the data pointer and size
auto buffer = GetCpuResource()->buffer(false);
auto buffer = CpuTensor()->buffer(false);
if (updated_resource.get() != reinterpret_cast<void*>(buffer.data())) {
// Only copy the data if the source and destination are not the same!
@ -366,7 +369,7 @@ struct TensorBase : TBase {
} else {
// If the engine wrote to the data directly, it is possible that the underlying data was held by many buffers
// In that case the underlying buffers will not match the engine output, and they need to be flushed.
GetCpuResource()->flush();
CpuTensor()->flush();
}
} else {
// If we got a gpu resource, we should move the data to the cpu so accessors can retrieve the data.
@ -377,7 +380,7 @@ struct TensorBase : TBase {
auto device = session->Device().as<winmlp::LearningModelDevice>();
auto engine = session->GetEngine();
if (GetCpuResource()->num_buffers() == 1) {
if (CpuTensor()->num_buffers() == 1) {
winrt::com_ptr<IValue> dest;
RETURN_IF_FAILED_MSG(CreateTensorValueFromExternalBuffer(engine, false, dest.put()),
"Failed to prepare buffer for copy back from device resource.");
@ -395,7 +398,7 @@ struct TensorBase : TBase {
d3dResource,
buffer_size_in_bytes,
*device->GetD3DDeviceCache(),
GetCpuResource()->buffers());
CpuTensor()->buffers());
// Reset the Allocator before return to the Cache. Must Sync this background thread to that completion before we do.
device->GetD3DDeviceCache()->SyncD3D12ToCPU();
@ -615,7 +618,7 @@ struct TensorBase : TBase {
// Ensure that CreateReference is only called when there is 1 buffer.
WINML_THROW_HR_IF_TRUE_MSG(
E_ILLEGAL_METHOD_CALL,
GetCpuResource() != nullptr && GetCpuResource()->num_buffers() != 1, "A single buffer reference cannot be retrieved when the tensor is backed by multiple buffers!");
CpuTensor() != nullptr && CpuTensor()->num_buffers() != 1, "A single buffer reference cannot be retrieved when the tensor is backed by multiple buffers!");
// Create a TensorMemoryBufferReference<T>
@ -624,11 +627,11 @@ struct TensorBase : TBase {
// "has been closed. In that case, the returned IMemoryBufferReference is already closed."
// Creating a TensorMemoryBufferReference<T> with a null pointer is equivalent to creating it as closed.
auto memoryBufferReference = winrt::make<TensorMemoryBufferReference<T>>(shape_, m_resources);
auto memoryBufferReference = winrt::make<TensorMemoryBufferReference<T>>(shape_, resources_);
// Create and cache a weak reference to the TensorMemoryBufferReference<T>
winrt::weak_ref<TensorMemoryBufferReference<T>> weak(memoryBufferReference.as<TensorMemoryBufferReference<T>>());
m_outstandingReferences.push_back(weak);
outstanding_references_.push_back(weak);
// Return the strong ref to the caller
return memoryBufferReference;
@ -638,7 +641,7 @@ struct TensorBase : TBase {
// IMemoryBuffer::Close
void Close() try {
// Let go of the lifetime of the resources, this is will indicate that the memorybuffer is closed
m_resources = nullptr;
resources_ = nullptr;
}
WINML_CATCH_ALL
@ -653,10 +656,10 @@ struct TensorBase : TBase {
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
return m_resources->GetBuffer(shape_, value, capacity);
return resources_->GetBuffer(shape_, value, capacity);
}
// ITensorNative::GetD3D12Resource
@ -667,10 +670,10 @@ struct TensorBase : TBase {
RETURN_HR_IF(ERROR_INVALID_FUNCTION, (std::is_same<T, std::string>::value));
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
GetGpuResource().copy_to(ppResource);
GpuTensor().copy_to(ppResource);
return S_OK;
}
WINML_CATCH_ALL_COM
@ -689,12 +692,11 @@ struct TensorBase : TBase {
// owned IVectorView object.
// Get the raw buffer pointer from the native tensor implementation.
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<ElementType*>(buffer.data());
// Copy data that will be passed back to caller.
auto copy = std::vector<ElementType>(element_data, element_data + number_of_elements);
auto copy = std::vector<ElementType>(element_data, element_data + buffer.size());
// Create IVectorView from copied data.
return winrt::single_threaded_vector<ElementViewType>(std::move(copy)).GetView();
@ -707,18 +709,17 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<_winml::Half, float>();
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<_winml::Half*>(buffer.data());
// Copy the HALFs to floats
std::vector<float> float_value(number_of_elements);
std::vector<float> float_value(buffer.size());
DirectX::PackedVector::XMConvertHalfToFloatStream(
float_value.data(),
sizeof(float) /* output stride */,
reinterpret_cast<DirectX::PackedVector::HALF*>(element_data),
sizeof(_winml::Half) /* input stride */,
number_of_elements);
buffer.size());
// Create IVectorView from copied data.
return winrt::single_threaded_vector<float>(std::move(float_value)).GetView();
@ -731,16 +732,15 @@ struct TensorBase : TBase {
// Ensure that this call is being called with the correct template parameters
ASSERT_TEMPLATE_PARAMETERS<std::string, winrt::hstring>();
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto element_data = static_cast<std::string*>(buffer.data());
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
auto copy = std::vector<winrt::hstring>(number_of_elements, L"");
auto copy = std::vector<winrt::hstring>(string_vector.size(), L"");
std::generate(
copy.begin(),
copy.end(),
[n = 0, &element_data]() mutable {
return _winml::Strings::HStringFromUTF8(element_data[n++]);
[n = 0, &string_vector]() mutable {
return _winml::Strings::HStringFromUTF8(string_vector[n++]);
});
return winrt::single_threaded_vector<winrt::hstring>(std::move(copy)).GetView();
@ -752,14 +752,13 @@ struct TensorBase : TBase {
wfc::IVectorView<uint8_t> GetAsVectorView<int8_t, uint8_t>() try {
ASSERT_TEMPLATE_PARAMETERS<int8_t, uint8_t>();
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<int8_t*>(buffer.data());
// Copy data that will be passed back to caller.
gsl::span<uint8_t> span(reinterpret_cast<uint8_t*>(element_data), number_of_elements);
std::vector<uint8_t> copy(span.begin(), span.begin() + number_of_elements);
gsl::span<uint8_t> span(reinterpret_cast<uint8_t*>(element_data), buffer.size());
std::vector<uint8_t> copy(span.begin(), span.begin() + buffer.size());
// Create IVectorView from copied data.
return winrt::single_threaded_vector<uint8_t>(std::move(copy)).GetView();
@ -809,10 +808,10 @@ struct TensorBase : TBase {
RETURN_HR_IF_NULL(E_POINTER, pIsPlaceHolder);
RETURN_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources have been detached!");
*pIsPlaceHolder = GetCpuResource() == nullptr && GetGpuResource() == nullptr;
*pIsPlaceHolder = CpuTensor() == nullptr && GpuTensor() == nullptr;
return S_OK;
}
@ -827,7 +826,7 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS_EXACT<ElementType, ElementViewType>();
shape_ = shape;
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
}
template <>
@ -837,7 +836,7 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS<_winml::Half, float>();
shape_ = shape;
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
}
template <>
@ -847,7 +846,7 @@ struct TensorBase : TBase {
ASSERT_TEMPLATE_PARAMETERS<int8_t, uint8_t>();
shape_ = shape;
GetCpuResource() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
CpuTensor() = std::make_shared<_winml::Tensor<T>>(shape, buffers);
}
// Specialized version to convert hstring to string
@ -875,12 +874,12 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
// This method accepts data as an array, T[], from the caller.
// This is a non-destructive API, so the caller data is
// left untouched, and the data is copied into internal buffers.
GetCpuResource()->set(data.size(), data.data());
CpuTensor()->set(data.size(), data.data());
}
// Specialized version to convert floats to float16
@ -892,13 +891,12 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<_winml::Half*>(buffer.data());
THROW_HR_IF(E_UNEXPECTED, data.size() != number_of_elements);
THROW_HR_IF(E_UNEXPECTED, data.size() != buffer.size());
DirectX::PackedVector::XMConvertFloatToHalfStream(
reinterpret_cast<DirectX::PackedVector::HALF*>(element_data),
sizeof(_winml::Half) /* output stride */,
@ -916,12 +914,12 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto size = data.size();
auto pData = data.data();
GetCpuResource()->set(size, reinterpret_cast<int8_t*>(const_cast<uint8_t*>(pData)));
CpuTensor()->set(size, reinterpret_cast<int8_t*>(const_cast<uint8_t*>(pData)));
}
// Specialized version to convert hstring to string
@ -933,17 +931,16 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto number_of_elements = GetCpuResource()->number_of_elements();
auto buffer = GetCpuResource()->buffer();
THROW_HR_IF(E_UNEXPECTED, data.size() > number_of_elements);
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
auto element_data = static_cast<std::string*>(buffer.data());
THROW_HR_IF(E_UNEXPECTED, data.size() > string_vector.size());
// Convert and copy into the underlying buffer
std::transform(
data.begin(), data.end(), element_data,
data.begin(), data.end(), std::begin(string_vector),
[](auto& element) mutable {
return _winml::Strings::UTF8FromHString(element);
});
@ -962,9 +959,9 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<ElementType*>(buffer.data());
// This method accepts data as an IVectorView<T>.
@ -983,9 +980,9 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<_winml::Half*>(buffer.data());
// Now that we take in IIterables and not vector views
@ -1009,9 +1006,9 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto buffer = CpuTensor()->buffer();
auto element_data = static_cast<int8_t*>(buffer.data());
std::transform(begin(data), end(data), element_data, [](auto element) { return static_cast<int8_t>(element); });
}
@ -1026,39 +1023,39 @@ struct TensorBase : TBase {
// Ensure that the Set APIs are only called when there is 1 buffer.
// These APIs are only called when the tensor is being constructed from various collection and pointer public APIs.
// They should always be backed by a single underlying buffer.
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, GetCpuResource()->num_buffers() != 1);
FAIL_FAST_HR_IF(E_ILLEGAL_METHOD_CALL, CpuTensor()->num_buffers() != 1);
auto buffer = GetCpuResource()->buffer();
auto element_data = static_cast<std::string*>(buffer.data());
auto string_data = std::static_pointer_cast<_winml::string_data>(CpuTensor()->get_data());
auto& string_vector = string_data->get_backing_vector();
// Convert and copy into the underlying buffer
std::transform(begin(data), end(data), element_data, [](const auto& element) {
std::transform(begin(data), end(data), std::begin(string_vector), [](const auto& element) {
return _winml::Strings::UTF8FromHString(element);
});
}
std::shared_ptr<_winml::Tensor<T>>& GetCpuResource() {
std::shared_ptr<_winml::Tensor<T>>& CpuTensor() {
WINML_THROW_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources are detached!");
return m_resources->CpuResource;
return resources_->cpu_resource_;
}
winrt::com_ptr<ID3D12Resource>& GetGpuResource() {
winrt::com_ptr<ID3D12Resource>& GpuTensor() {
WINML_THROW_HR_IF_NULL_MSG(
E_ILLEGAL_METHOD_CALL,
m_resources,
resources_,
"The tensor has been closed and its resources are detached!");
return m_resources->GpuResource;
return resources_->gpu_resource_;
}
private:
std::vector<int64_t> shape_;
std::shared_ptr<TensorResources<T>> m_resources;
std::vector<winrt::weak_ref<TensorMemoryBufferReference<T>>> m_outstandingReferences;
std::shared_ptr<TensorResources<T>> resources_;
std::vector<winrt::weak_ref<TensorMemoryBufferReference<T>>> outstanding_references_;
bool m_isClosed = false;
};

View file

@ -1,234 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "robuffer.h"
#include "winrt/Windows.Storage.Streams.h"
#include "DisjointBufferHelpers.h"
namespace _winml {
// IBuffer implementation backed by a std::vector<BYTE>.
// Serves as the internally allocated storage for tensors created without a
// caller-provided buffer, and as the lazily created "combined" staging buffer
// when a tensor is backed by multiple disjoint buffers (see TensorBuffer<T>).
class VectorBuffer : public winrt::implements<
                         VectorBuffer,
                         wss::IBuffer,
                         Windows::Storage::Streams::IBufferByteAccess> {
 public:
  // Allocates a backing vector of `size` bytes (value-initialized by std::vector).
  VectorBuffer(size_t size) : buffer_(size) {}

  // Total number of bytes in the backing store.
  uint32_t Capacity() const {
    return static_cast<uint32_t>(buffer_.size());
  }

  // Length is not tracked separately from Capacity; consumers of this type go
  // through IBufferByteAccess/Capacity instead, so both Length accessors throw.
  uint32_t Length() const {
    throw winrt::hresult_error(E_NOTIMPL);
  }

  void Length(uint32_t /*value*/) {
    throw winrt::hresult_error(E_NOTIMPL);
  }

  // IBufferByteAccess: exposes the raw byte pointer of the backing vector.
  STDMETHOD(Buffer)
  (uint8_t** value) {
    RETURN_HR_IF_NULL(E_POINTER, value);
    *value = buffer_.data();
    return S_OK;
  }

 private:
  std::vector<BYTE> buffer_;
};
template <typename T>
class TensorBuffer {
wss::IBuffer combined_buffer_;
std::vector<wss::IBuffer> buffers_;
size_t size_;
TensorBuffer(size_t size) :
size_(size),
combined_buffer_(winrt::make<VectorBuffer>(size * sizeof(T))),
buffers_ { combined_buffer_ } {
auto buffer = BufferAt(0);
// The initial release of WinML (RS5) shipped with behavior that would
// zero-initialize uninitialized tensors. After measuring, the performance impact
// of memsetting the memory buffer is quite small (<1ms for 3channel 720x720 TensorFloats).
// To maintain parity with RS5 behavior, we always zero out the memory buffer.
memset(buffer.data(), 0, buffer.size_bytes());
}
TensorBuffer(
size_t size,
wfc::IIterable<wss::IBuffer> const& buffers) : size_(size),
combined_buffer_(nullptr),
buffers_(begin(buffers), end(buffers)) {
if (buffers_.size() == 1) {
combined_buffer_ = buffers_[0];
} else {
// If there are many buffers, then the combined buffer will be a separately allocated value that combines all of the buffers.
// This needs to be lazily done however, as the extra memory should not be allocated when not needed (GPU).
}
}
auto CombinedBuffer() {
if (combined_buffer_ == nullptr) {
combined_buffer_ = winrt::make<VectorBuffer>(size_ * sizeof(T));
}
return BufferFrom(combined_buffer_);
}
public:
static auto Create(size_t size) {
return std::shared_ptr<TensorBuffer>(new TensorBuffer(size));
}
static auto Create(
size_t size,
wss::IBuffer buffer) {
return std::shared_ptr<TensorBuffer>(new TensorBuffer(size, buffer));
}
static auto Create(
size_t size,
wfc::IIterable<wss::IBuffer> const& buffers) {
return std::shared_ptr<TensorBuffer>(new TensorBuffer(size, buffers));
}
auto NumElements() {
return size_;
}
auto SizeInBytes() {
return size_ * sizeof(T);
}
auto NumBuffers() {
return buffers_.size();
}
auto& Buffers() {
return buffers_;
}
auto Buffer(bool should_sync_buffer) {
if (buffers_.size() == 1) {
// Single buffer optimization to not create a temporary buffer that concatenates disjoint buffers into one.
return BufferAt(0);
}
auto span = CombinedBuffer();
if (should_sync_buffer) {
_winml::LoadOrStoreDisjointBuffers(
true /*load buffer*/,
buffers_.size(),
[this](size_t i) { return BufferAt(i); },
span);
}
return span;
}
auto Flush() {
auto should_flush = buffers_.size() != 1;
if (should_flush) {
auto span = CombinedBuffer();
_winml::LoadOrStoreDisjointBuffers(
false /*store buffer*/,
buffers_.size(),
[this](size_t i) { return BufferAt(i); },
span);
}
return should_flush;
}
auto Set(size_t size_in_bytes, const T* data) {
WINML_THROW_HR_IF_FALSE_MSG(
E_INVALIDARG,
size_in_bytes <= (size_ * sizeof(T)),
"Argument size (%llu) exceeds the tensor size (%llu).",
static_cast<uint64_t>(size_in_bytes),
static_cast<uint64_t>(size_ * sizeof(T)));
gsl::span<byte> span(reinterpret_cast<byte*>(const_cast<T*>(data)), size_in_bytes);
_winml::LoadOrStoreDisjointBuffers(
false /*store buffer*/,
buffers_.size(),
[this](size_t i) { return BufferAt(i); },
span);
}
auto Set(std::vector<T>&& moveableData) {
Set(moveableData.size() * sizeof(T), moveableData.data());
}
private:
auto BufferFrom(wss::IBuffer buffer) {
byte* current_data = nullptr;
auto bufferByteAccess = buffer.as<Windows::Storage::Streams::IBufferByteAccess>();
bufferByteAccess->Buffer(&current_data);
return gsl::span<byte>(
current_data,
static_cast<size_t>(buffer.Capacity()));
}
auto BufferAt(size_t index) {
return BufferFrom(buffers_[index]);
}
};
// Specialization for string tensors. String data cannot be exposed through
// IBuffer (std::string is not POD), so the backing store is a plain
// std::vector<std::string> and only a single logical buffer is supported.
template <>
class TensorBuffer<std::string> {
  std::vector<std::string> buffer_;

  TensorBuffer(size_t size) : buffer_(size) {}

 public:
  static auto Create(size_t size) {
    return std::shared_ptr<TensorBuffer>(new TensorBuffer(size));
  }

  auto NumElements() {
    return buffer_.size();
  }

  // NOTE(review): returns the element count, not a byte size — std::string
  // elements have no fixed wire size. Callers must not treat this as bytes.
  auto SizeInBytes() {
    return buffer_.size();
  }

  // String tensors are always backed by exactly one logical buffer.
  auto NumBuffers() {
    return 1;
  }

  // Nothing to sync: there is never a disjoint set of backing buffers.
  auto Flush() {
    return false;
  }

  // IBuffer access is not supported for string tensors.
  auto Buffers() -> std::vector<wss::IBuffer>& {
    WINML_THROW_HR(E_UNEXPECTED);
  }

  // Span over the std::string objects themselves (object storage, not the
  // character data they own); span length is the element count, matching the
  // SizeInBytes() convention above.
  auto BufferAt(size_t index) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        index == 0,
        "TensorString can only be backed by a single buffer!");
    return gsl::span<byte>(reinterpret_cast<byte*>(buffer_.data()), buffer_.size());
  }

  auto Buffer(bool /*should_sync_buffer*/) {
    return BufferAt(0);
  }

  // Copy `size` string_views into the backing strings
  // (non-destructive to the caller's data).
  auto Set(size_t size, std::string_view* data) {
    WINML_THROW_HR_IF_FALSE_MSG(
        E_INVALIDARG,
        size <= buffer_.size(),
        "Argument size (%d) exceeds the tensor size (%d).",
        static_cast<int>(size),
        static_cast<int>(buffer_.size()));

    // Copy
    std::copy(data, data + size, buffer_.begin());
  }
};
} // namespace _winml

View file

@ -29,12 +29,12 @@ struct TensorResources {
*capacity = 0;
// Lazily allocate the cpu resource on call to GetBuffer
if (CpuResource == nullptr) {
CpuResource = std::make_shared<_winml::Tensor<T>>(shape);
if (cpu_resource_ == nullptr) {
cpu_resource_ = std::make_shared<_winml::Tensor<T>>(shape);
}
// Get the data pointer and size
auto buffer = CpuResource->buffer();
auto buffer = cpu_resource_->buffer();
// Set out parameters
*capacity = static_cast<uint32_t>(buffer.size_bytes());
@ -45,8 +45,8 @@ struct TensorResources {
}
// Theses are access directly by TensorMemoryBufferReference<T> and TensorBase
std::shared_ptr<_winml::Tensor<T>> CpuResource;
winrt::com_ptr<ID3D12Resource> GpuResource;
std::shared_ptr<_winml::Tensor<T>> cpu_resource_;
winrt::com_ptr<ID3D12Resource> gpu_resource_;
};
// This class holds onto the lifetime of TensorResources<T> so that they can be kept alive by TensorBase AND its active MBRs.