From b35468289ae1c78006aee4d622fdff80efbe0538 Mon Sep 17 00:00:00 2001
From: ytaous <4484531+ytaous@users.noreply.github.com>
Date: Tue, 7 Apr 2020 13:07:11 -0700
Subject: [PATCH] View Op - new unit tests and add support for tensor memcpy by
 offset/size (#3439)

* view ops UTs

* update per comments

* PR comments - code clean up

* code clean up per comments

Co-authored-by: Ethan Tao <ettao@microsoft.com>
---
 include/onnxruntime/core/framework/tensor.h   |   1 +
 onnxruntime/core/framework/allocatormgr.cc    |   8 +-
 .../test/training_ops/cuda/view_op_test.cc    | 195 ++++++++++++++++++
 .../training_ops/cuda/tensor/view.cc          |   8 +-
 4 files changed, 203 insertions(+), 9 deletions(-)
 create mode 100644 orttraining/orttraining/test/training_ops/cuda/view_op_test.cc
diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h
index 447e6cff75..ac31611c42 100644
--- a/include/onnxruntime/core/framework/tensor.h
+++ b/include/onnxruntime/core/framework/tensor.h
@@ -193,6 +193,7 @@ class Tensor final {
   /**
    * Get the byte offset with respect to the p_data
    * @warning this is a temporary solution for reusing the buffer bigger than needed.
+   * @warning use with caution - make sure you do boundary check before calling this method (see view.cc)
    */
   inline ptrdiff_t ByteOffset() const {
     return byte_offset_;
diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc
index 35a4cb6db6..8ece8b06d5 100644
--- a/onnxruntime/core/framework/allocatormgr.cc
+++ b/onnxruntime/core/framework/allocatormgr.cc
@@ -11,12 +11,6 @@
 
 namespace onnxruntime {
 
-#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
-  using TArenaAllocator = MiMallocArena;
-#else
-  using TArenaAllocator = BFCArena;
-#endif
-
 using namespace ::onnxruntime::common;
 
 AllocatorPtr CreateAllocator(DeviceAllocatorRegistrationInfo info, OrtDevice::DeviceId device_id) {
@@ -24,7 +18,7 @@ AllocatorPtr CreateAllocator(DeviceAllocatorRegistrationInfo info, OrtDevice::De
   if (device_allocator->AllowsArena()) {
 #ifdef USE_MIMALLOC
     return std::shared_ptr<IArenaAllocator>(
-          onnxruntime::make_unique<MiMallocArena>(std::move(device_allocator), info.max_mem, arena_extend_strategy));
+          onnxruntime::make_unique<MiMallocArena>(std::move(device_allocator), info.max_mem));
 #else
     return std::shared_ptr<IArenaAllocator>(
           onnxruntime::make_unique<BFCArena>(std::move(device_allocator), info.max_mem, info.arena_extend_strategy));
diff --git a/orttraining/orttraining/test/training_ops/cuda/view_op_test.cc b/orttraining/orttraining/test/training_ops/cuda/view_op_test.cc
new file mode 100644
index 0000000000..7918ee6337
--- /dev/null
+++ b/orttraining/orttraining/test/training_ops/cuda/view_op_test.cc
@@ -0,0 +1,195 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "gtest/gtest.h"
+#include "test/providers/provider_test_utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+template <class T>
+using ShapeAndData = std::pair<const std::vector<int64_t>, const std::vector<T>>;
+
+using ShapeAndFloatData = ShapeAndData<float>;
+using ShapeAndDoubleData = ShapeAndData<double>;
+using ShapeAndHalfData = ShapeAndData<MLFloat16>;
+using ShapeData = ShapeAndData<int64_t>;
+using ExpectResult = OpTester::ExpectResult;
+
+template <typename T>
+void RunTest(const ShapeAndData<T>& input,
+             const std::vector<ShapeData>& shapes,
+             const std::vector<ShapeAndData<T>>& outputs,
+             bool expect_failure = false,
+             const std::string& err_msg = {}) {
+  OpTester test("View", 1, onnxruntime::kMSDomain);
+
+  test.AddInput<T>("input0", input.first, input.second);
+
+  int i = 1;
+  for (auto& s : shapes) {
+    auto& shape = s.first;
+    auto& data = s.second;
+    std::ostringstream oss;
+    oss << "input" << i++;
+    test.AddInput<int64_t>(oss.str().c_str(), shape, data);
+  }
+
+  i = 0;
+  for (auto& output : outputs) {
+    auto& shape = output.first;
+    auto& data = output.second;
+    std::ostringstream oss;
+    oss << "output" << i++;
+    test.AddOutput<T>(oss.str().c_str(), shape, data);
+  }
+
+  std::unordered_set<std::string> excluded_providers;
+
+  test.Run(expect_failure ? ExpectResult::kExpectFailure : ExpectResult::kExpectSuccess, err_msg, excluded_providers);
+}
+
+TEST(ViewOperatorTest, TwoViewFloat_1) {
+  std::vector<ShapeData> shapes;
+  std::vector<ShapeAndFloatData> outputs;
+
+  // input shape and data
+  ShapeAndFloatData input = {{4, 2},
+                             {1.f, 2.f,
+                              3.f, 4.f,
+                              5.f, 6.f,
+                              7.f, 8.f}};
+
+  shapes.push_back({{2}, std::vector<int64_t>(2, 2)});  
+  shapes.push_back({{2}, std::vector<int64_t>(2, 2)});
+
+  outputs.push_back({{2, 2},
+                     {1.f, 2.f,
+                      3.f, 4.f}});
+  outputs.push_back({{2, 2},
+                     {5.f, 6.f,
+                      7.f, 8.f}});
+
+  RunTest<float>(input, shapes, outputs);
+}
+
+TEST(ViewOperatorTest, TwoViewFloat_2) {
+  std::vector<ShapeData> shapes;
+  std::vector<ShapeAndFloatData> outputs;
+
+  // input shape and data
+  ShapeAndFloatData input = {{4, 2},
+                             {1.f, 2.f,
+                              3.f, 4.f,
+                              5.f, 6.f,
+                              7.f, 8.f}};
+
+  shapes.push_back({{2}, {1, 2}});
+  shapes.push_back({{2}, {3, 2}});
+
+  outputs.push_back({{1, 2}, {1.f, 2.f}});
+  outputs.push_back({{3, 2},
+                     {3.f, 4.f,
+                      5.f, 6.f,
+                      7.f, 8.f}});
+
+  RunTest<float>(input, shapes, outputs);
+}
+
+TEST(ViewOperatorTest, TwoViewFloat_3) {
+  std::vector<ShapeData> shapes;
+  std::vector<ShapeAndFloatData> outputs;
+
+  // input shape and data
+  ShapeAndFloatData input = {{4, 2},
+                             {1.f, 2.f,
+                              3.f, 4.f,
+                              5.f, 6.f,
+                              7.f, 8.f}};
+
+  shapes.push_back({{2}, {1, 2}});
+  shapes.push_back({{3}, {1, 3, 2}});
+
+  outputs.push_back({{1, 2}, {1.f, 2.f}});
+  outputs.push_back({{1, 3, 2},
+                     {3.f, 4.f,
+                      5.f, 6.f,
+                      7.f, 8.f}});
+
+  RunTest<float>(input, shapes, outputs);
+}
+
+TEST(ViewOperatorTest, ThreeViewFloat) {
+  std::vector<ShapeData> shapes;
+  std::vector<ShapeAndFloatData> outputs;
+
+  // input shape and data
+  ShapeAndFloatData input = {{4, 3},
+                             {1.f, 2.f, 3.f, 4.f, 5.f, 6.f,
+                              7.f, 8.f, 9.f, 10.f, 11.f, 12.f}};
+                       
+  shapes.push_back({{2}, {1, 2}});
+  shapes.push_back({{3}, {1, 3, 2}});
+  shapes.push_back({{2}, {4, 1}});
+
+  outputs.push_back({{1, 2}, {1.f, 2.f}});
+  outputs.push_back({{1, 3, 2},
+                     {3.f, 4.f, 5.f, 6.f, 7.f, 8.f}});
+  outputs.push_back({{4, 1},
+                     {9.f, 10.f, 11.f, 12.f}});
+
+  RunTest<float>(input, shapes, outputs);
+}
+
+TEST(ViewOperatorTest, TwoViewDouble) {
+  std::vector<ShapeData> shapes;
+  std::vector<ShapeAndDoubleData> outputs;
+
+  // input shape and data
+  ShapeAndDoubleData input = {{3, 2},  
+                              {1.f, 2.f,
+                               3.f, 4.f,
+                               5.f, 6.f}};
+
+  shapes.push_back({{2}, {2, 1}});
+  shapes.push_back({{3}, {1, 2, 2}});
+
+  outputs.push_back({{2, 1},
+                     {1.f, 2.f}});
+  outputs.push_back({{1, 2, 2},
+                     {3.f, 4.f, 5.f, 6.f}});
+
+  RunTest<double>(input, shapes, outputs);  
+
+}
+
+TEST(ViewOperatorTest, TwoViewHalf) {
+  std::vector<ShapeData> shapes;
+  std::vector<ShapeAndHalfData> outputs;
+
+  std::vector<float> data = {1.0f, 2.0f,
+                             3.0f, 4.0f,
+                             5.0f, 6.0f};
+  std::vector<MLFloat16> data_half(6);
+  ConvertFloatToMLFloat16(data.data(), data_half.data(), 6);
+  // input shape and data
+  ShapeAndHalfData input = {{3, 2}, data_half};
+
+  shapes.push_back({{2}, {2, 1}});
+  shapes.push_back({{3}, {1, 2, 2}});
+
+  std::vector<float> data1 = {1.0f, 2.0f};
+  std::vector<MLFloat16> data_half1(2);
+  ConvertFloatToMLFloat16(data1.data(), data_half1.data(), 2);
+  outputs.push_back({{2, 1}, data_half1});
+
+  std::vector<float> data2 = {3.f, 4.f, 5.f, 6.f};
+  std::vector<MLFloat16> data_half2(4);
+  ConvertFloatToMLFloat16(data2.data(), data_half2.data(), 4);
+  outputs.push_back({{1, 2, 2}, data_half2});
+
+  RunTest<MLFloat16>(input, shapes, outputs);
+}
+
+}  // namespace test
+}  // namespace onnxruntime
diff --git a/orttraining/orttraining/training_ops/cuda/tensor/view.cc b/orttraining/orttraining/training_ops/cuda/tensor/view.cc
index 5507d263f6..af6c140101 100644
--- a/orttraining/orttraining/training_ops/cuda/tensor/view.cc
+++ b/orttraining/orttraining/training_ops/cuda/tensor/view.cc
@@ -73,9 +73,13 @@ Status View::ComputeInternal(OpKernelContext* context) const {
     Tensor* Y = context->Output(i, y_shapes[i]);
     if (Y != nullptr) {
       if (X_data != Y->MutableDataRaw()) {
-        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "View output is not sharing the underlaying buffer of input");
+        // View output is not sharing the underlaying buffer of input, copy instead
+        const void* source = static_cast<const char*>(X_data) + y_byte_offsets[i];
+        void* target = Y->MutableDataRaw();
+        CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, Y->SizeInBytes(), cudaMemcpyDeviceToDevice));
+      } else {
+        Y->SetByteOffset(y_byte_offsets[i]);
       }
-      Y->SetByteOffset(y_byte_offsets[i]);
     }
   }