From b35468289ae1c78006aee4d622fdff80efbe0538 Mon Sep 17 00:00:00 2001 From: ytaous <4484531+ytaous@users.noreply.github.com> Date: Tue, 7 Apr 2020 13:07:11 -0700 Subject: [PATCH] View Op - new unit tests and add support for tensor memcpy by offset/size (#3439) * view ops UTs * update per comments * PR comments - code clean up * code clean up per comments Co-authored-by: Ethan Tao --- include/onnxruntime/core/framework/tensor.h | 1 + onnxruntime/core/framework/allocatormgr.cc | 8 +- .../test/training_ops/cuda/view_op_test.cc | 195 ++++++++++++++++++ .../training_ops/cuda/tensor/view.cc | 8 +- 4 files changed, 203 insertions(+), 9 deletions(-) create mode 100644 orttraining/orttraining/test/training_ops/cuda/view_op_test.cc diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index 447e6cff75..ac31611c42 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -193,6 +193,7 @@ class Tensor final { /** * Get the byte offset with respect to the p_data * @warning this is a temporary solution for reusing the buffer bigger than needed. + * @warning use with caution - make sure you do boundary check before calling this method (see view.cc) */ inline ptrdiff_t ByteOffset() const { return byte_offset_; diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc index 35a4cb6db6..8ece8b06d5 100644 --- a/onnxruntime/core/framework/allocatormgr.cc +++ b/onnxruntime/core/framework/allocatormgr.cc @@ -11,12 +11,6 @@ namespace onnxruntime { -#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) - using TArenaAllocator = MiMallocArena; -#else - using TArenaAllocator = BFCArena; -#endif - using namespace ::onnxruntime::common; AllocatorPtr CreateAllocator(DeviceAllocatorRegistrationInfo info, OrtDevice::DeviceId device_id) { @@ -24,7 +18,7 @@ AllocatorPtr CreateAllocator(DeviceAllocatorRegistrationInfo info, OrtDevice::De if (device_allocator->AllowsArena()) { #ifdef USE_MIMALLOC return std::shared_ptr( - onnxruntime::make_unique(std::move(device_allocator), info.max_mem, arena_extend_strategy)); + onnxruntime::make_unique(std::move(device_allocator), info.max_mem)); #else return std::shared_ptr( onnxruntime::make_unique(std::move(device_allocator), info.max_mem, info.arena_extend_strategy)); diff --git a/orttraining/orttraining/test/training_ops/cuda/view_op_test.cc b/orttraining/orttraining/test/training_ops/cuda/view_op_test.cc new file mode 100644 index 0000000000..7918ee6337 --- /dev/null +++ b/orttraining/orttraining/test/training_ops/cuda/view_op_test.cc @@ -0,0 +1,195 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "gtest/gtest.h" +#include "test/providers/provider_test_utils.h" + +namespace onnxruntime { +namespace test { + +template +using ShapeAndData = std::pair, const std::vector>; + +using ShapeAndFloatData = ShapeAndData; +using ShapeAndDoubleData = ShapeAndData; +using ShapeAndHalfData = ShapeAndData; +using ShapeData = ShapeAndData; +using ExpectResult = OpTester::ExpectResult; + +template +void RunTest(const ShapeAndData& input, + const std::vector& shapes, + const std::vector>& outputs, + bool expect_failure = false, + const std::string& err_msg = {}) { + OpTester test("View", 1, onnxruntime::kMSDomain); + + test.AddInput("input0", input.first, input.second); + + int i = 1; + for (auto& s : shapes) { + auto& shape = s.first; + auto& data = s.second; + std::ostringstream oss; + oss << "input" << i++; + test.AddInput(oss.str().c_str(), shape, data); + } + + i = 0; + for (auto& output : outputs) { + auto& shape = output.first; + auto& data = output.second; + std::ostringstream oss; + oss << "output" << i++; + test.AddOutput(oss.str().c_str(), shape, data); + } + + std::unordered_set excluded_providers; + + test.Run(expect_failure ? ExpectResult::kExpectFailure : ExpectResult::kExpectSuccess, err_msg, excluded_providers); +} + +TEST(ViewOperatorTest, TwoViewFloat_1) { + std::vector shapes; + std::vector outputs; + + // input shape and data + ShapeAndFloatData input = {{4, 2}, + {1.f, 2.f, + 3.f, 4.f, + 5.f, 6.f, + 7.f, 8.f}}; + + shapes.push_back({{2}, std::vector(2, 2)}); + shapes.push_back({{2}, std::vector(2, 2)}); + + outputs.push_back({{2, 2}, + {1.f, 2.f, + 3.f, 4.f}}); + outputs.push_back({{2, 2}, + {5.f, 6.f, + 7.f, 8.f}}); + + RunTest(input, shapes, outputs); +} + +TEST(ViewOperatorTest, TwoViewFloat_2) { + std::vector shapes; + std::vector outputs; + + // input shape and data + ShapeAndFloatData input = {{4, 2}, + {1.f, 2.f, + 3.f, 4.f, + 5.f, 6.f, + 7.f, 8.f}}; + + shapes.push_back({{2}, {1, 2}}); + shapes.push_back({{2}, {3, 2}}); + + outputs.push_back({{1, 2}, {1.f, 2.f}}); + outputs.push_back({{3, 2}, + {3.f, 4.f, + 5.f, 6.f, + 7.f, 8.f}}); + + RunTest(input, shapes, outputs); +} + +TEST(ViewOperatorTest, TwoViewFloat_3) { + std::vector shapes; + std::vector outputs; + + // input shape and data + ShapeAndFloatData input = {{4, 2}, + {1.f, 2.f, + 3.f, 4.f, + 5.f, 6.f, + 7.f, 8.f}}; + + shapes.push_back({{2}, {1, 2}}); + shapes.push_back({{3}, {1, 3, 2}}); + + outputs.push_back({{1, 2}, {1.f, 2.f}}); + outputs.push_back({{1, 3, 2}, + {3.f, 4.f, + 5.f, 6.f, + 7.f, 8.f}}); + + RunTest(input, shapes, outputs); +} + +TEST(ViewOperatorTest, ThreeViewFloat) { + std::vector shapes; + std::vector outputs; + + // input shape and data + ShapeAndFloatData input = {{4, 3}, + {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, + 7.f, 8.f, 9.f, 10.f, 11.f, 12.f}}; + + shapes.push_back({{2}, {1, 2}}); + shapes.push_back({{3}, {1, 3, 2}}); + shapes.push_back({{2}, {4, 1}}); + + outputs.push_back({{1, 2}, {1.f, 2.f}}); + outputs.push_back({{1, 3, 2}, + {3.f, 4.f, 5.f, 6.f, 7.f, 8.f}}); + outputs.push_back({{4, 1}, + {9.f, 10.f, 11.f, 12.f}}); + + RunTest(input, shapes, outputs); +} + +TEST(ViewOperatorTest, TwoViewDouble) { + std::vector shapes; + std::vector outputs; + + // input shape and data + ShapeAndDoubleData input = {{3, 2}, + {1.f, 2.f, + 3.f, 4.f, + 5.f, 6.f}}; + + shapes.push_back({{2}, {2, 1}}); + shapes.push_back({{3}, {1, 2, 2}}); + + outputs.push_back({{2, 1}, + {1.f, 2.f}}); + outputs.push_back({{1, 2, 2}, + {3.f, 4.f, 5.f, 6.f}}); + + RunTest(input, shapes, outputs); + +} + +TEST(ViewOperatorTest, TwoViewHalf) { + std::vector shapes; + std::vector outputs; + + std::vector data = {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f}; + std::vector data_half(6); + ConvertFloatToMLFloat16(data.data(), data_half.data(), 6); + // input shape and data + ShapeAndHalfData input = {{3, 2}, data_half}; + + shapes.push_back({{2}, {2, 1}}); + shapes.push_back({{3}, {1, 2, 2}}); + + std::vector data1 = {1.0f, 2.0f}; + std::vector data_half1(2); + ConvertFloatToMLFloat16(data1.data(), data_half1.data(), 2); + outputs.push_back({{2, 1}, data_half1}); + + std::vector data2 = {3.f, 4.f, 5.f, 6.f}; + std::vector data_half2(4); + ConvertFloatToMLFloat16(data2.data(), data_half2.data(), 4); + outputs.push_back({{1, 2, 2}, data_half2}); + + RunTest(input, shapes, outputs); +} + +} // namespace test +} // namespace onnxruntime diff --git a/orttraining/orttraining/training_ops/cuda/tensor/view.cc b/orttraining/orttraining/training_ops/cuda/tensor/view.cc index 5507d263f6..af6c140101 100644 --- a/orttraining/orttraining/training_ops/cuda/tensor/view.cc +++ b/orttraining/orttraining/training_ops/cuda/tensor/view.cc @@ -73,9 +73,13 @@ Status View::ComputeInternal(OpKernelContext* context) const { Tensor* Y = context->Output(i, y_shapes[i]); if (Y != nullptr) { if (X_data != Y->MutableDataRaw()) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "View output is not sharing the underlaying buffer of input"); + // View output is not sharing the underlaying buffer of input, copy instead + const void* source = static_cast(X_data) + y_byte_offsets[i]; + void* target = Y->MutableDataRaw(); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target, source, Y->SizeInBytes(), cudaMemcpyDeviceToDevice)); + } else { + Y->SetByteOffset(y_byte_offsets[i]); } - Y->SetByteOffset(y_byte_offsets[i]); } }