From eb5c1f0fcc6038f2743e7f5d2c1c43a014bdfdcb Mon Sep 17 00:00:00 2001 From: Sherlock Date: Mon, 14 Dec 2020 13:13:41 -0800 Subject: [PATCH] Unify activation and initializer alignment value (#6109) * Unify activation and initializer alignment value * Fix VerifyInputTensorsAllocatedContiguously --- .../onnxruntime/core/framework/allocator.h | 2 ++ .../onnxruntime/core/framework/op_kernel.h | 10 +++++++++ onnxruntime/core/framework/execution_frame.cc | 2 +- onnxruntime/core/framework/op_kernel.cc | 8 +++++++ .../core/framework/parallel_executor.cc | 5 +++-- .../core/framework/sequential_executor.cc | 5 +++-- onnxruntime/core/framework/session_state.cc | 4 ++-- .../core/framework/simple_tensor_allocator.cc | 3 +-- .../tensor_allocator_with_mem_pattern.h | 3 +-- .../core/framework/tensorprotoutils.cc | 18 +++++++--------- onnxruntime/core/framework/utils.cc | 21 ++++++++++++------- onnxruntime/test/framework/allocator_test.cc | 12 +++++------ .../test/framework/execution_frame_test.cc | 4 ++-- onnxruntime/test/onnx/tensorprotoutils.cc | 3 ++- 14 files changed, 62 insertions(+), 38 deletions(-) diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index c86ed44f49..b66548fee6 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -25,6 +25,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned"; constexpr const char* MIGRAPHX = "MIGraphX"; constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned"; +constexpr size_t kAllocAlignment = 256; + // forward declaration class SessionState; diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h index 7dd02e5e0c..2e3937a119 100644 --- a/include/onnxruntime/core/framework/op_kernel.h +++ b/include/onnxruntime/core/framework/op_kernel.h @@ -221,6 +221,16 @@ class OpKernelContext { **/ const std::string& GetOpDomain() const; + /** + Returns the optype of the underlying kernel + **/ + const std::string& GetOpType() const; + + /** + Returns the node name of the underlying kernel + **/ + const std::string& GetNodeName() const; + /** Returns the intra-op threadpool, if available. */ diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index 765b7f110e..d6f67478bd 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -329,7 +329,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va if (static_cast(len) > std::numeric_limits::max()) { return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Tensor shape is too large"); } - if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(static_cast(len), element_type->Size(), &size)) { + if (!IAllocator::CalcMemSizeForArrayWithAlignment(static_cast(len), element_type->Size(), &size)) { return Status(ONNXRUNTIME, FAIL, "size overflow"); } diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc index b3c05bb4e5..6c7ddf1abf 100644 --- a/onnxruntime/core/framework/op_kernel.cc +++ b/onnxruntime/core/framework/op_kernel.cc @@ -139,10 +139,18 @@ onnxruntime::NodeIndex OpKernelContext::GetNodeIndex() const { return kernel_->Node().Index(); } +const std::string& OpKernelContext::GetNodeName() const { + return kernel_->Node().Name(); +} + const std::string& OpKernelContext::GetOpDomain() const { return kernel_->KernelDef().Domain(); } +const std::string& OpKernelContext::GetOpType() const { + return kernel_->Node().OpType(); +} + const OrtValue* OpKernelContext::GetInputMLValue(int index) const { if (index < 0 || index >= InputCount()) return nullptr; diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc index 9c0c70770c..371758426c 100644 --- a/onnxruntime/core/framework/parallel_executor.cc +++ b/onnxruntime/core/framework/parallel_executor.cc @@ -192,8 +192,9 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index, // Execute the kernel. ORT_TRY { #ifdef ENABLE_TRAINING - if (p_op_kernel->KernelDef().AllocateInputsContiguously()) - utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context); + if (p_op_kernel->KernelDef().AllocateInputsContiguously()) { + ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context)); + } #endif status = p_op_kernel->Compute(&op_kernel_context); diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index e3806a8091..1e5e688146 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -308,8 +308,9 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std: #endif ORT_TRY { #ifdef ENABLE_TRAINING - if (p_op_kernel->KernelDef().AllocateInputsContiguously()) - utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context); + if (p_op_kernel->KernelDef().AllocateInputsContiguously()) { + ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context)); + } #endif compute_status = p_op_kernel->Compute(&op_kernel_context); diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc index 8a971ba539..cb79e6e631 100644 --- a/onnxruntime/core/framework/session_state.cc +++ b/onnxruntime/core/framework/session_state.cc @@ -453,7 +453,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector(size, ml_data_type->Size(), &size)) { + if (!IAllocator::CalcMemSizeForArrayWithAlignment(size, ml_data_type->Size(), &size)) { return Status(ONNXRUNTIME, FAIL, "Size overflow"); } @@ -489,7 +489,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vectorallocation_plan[ml_value_idx].alloc_kind == AllocKind::kAllocate && ml_data_type != DataTypeImpl::GetType() && size != 0) { size_t aligned_size = 0; - if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &aligned_size)) { + if (!IAllocator::CalcMemSizeForArrayWithAlignment(size, ml_data_type->Size(), &aligned_size)) { return Status(ONNXRUNTIME, FAIL, "Size overflow"); } diff --git a/onnxruntime/core/framework/simple_tensor_allocator.cc b/onnxruntime/core/framework/simple_tensor_allocator.cc index 1b1839162a..360e052963 100644 --- a/onnxruntime/core/framework/simple_tensor_allocator.cc +++ b/onnxruntime/core/framework/simple_tensor_allocator.cc @@ -18,8 +18,7 @@ common::Status SimpleTensorAllocator::GetPreallocatedBuffer(int ort_value_index, } size_t len = 0; - static constexpr int alignment = 256; - ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto(*iter->second, &len)); + ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto(*iter->second, &len)); const struct OrtMemoryInfo& location = seq_plan_.GetLocation(ort_value_index); if (len == 0) { out = onnxruntime::make_unique(nullptr, 0, location); diff --git a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h index 179c19e1d0..f92b444579 100644 --- a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h +++ b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h @@ -108,8 +108,7 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error."); } size_t len = 0; - static constexpr int alignment = 256; - ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto(*value, &len)); + ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto(*value, &len)); ORT_RETURN_IF_ERROR(planner_.TraceAllocation(id, len)); return Status::OK(); } diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index 9f5e7583ea..3605487ba5 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -810,8 +810,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT return status; } - -#if !defined (ORT_MINIMAL_BUILD) +#if !defined(ORT_MINIMAL_BUILD) // Determines if this is a type specific zero using IsZeroFunc = bool (*)(const void*); // Copy element @@ -820,7 +819,6 @@ using CopyElementFunc = void (*)(void* dest, const void* src, int64_t dest_index static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size, IsZeroFunc is_zero, CopyElementFunc copy, TensorProto& values, TensorProto& indices) { - auto advance = [element_size](const void* start, size_t elements) -> const void* { return (reinterpret_cast(start) + elements * element_size); }; @@ -834,7 +832,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, indices_data.Add(index); } ++index; - cbegin = advance(cbegin, 1U); + cbegin = advance(cbegin, 1U); } auto& raw_data = *values.mutable_raw_data(); @@ -859,7 +857,7 @@ void CopyElement(void* dst, const void* src, int64_t dst_index, int64_t src_inde } common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense_proto, - ONNX_NAMESPACE::SparseTensorProto& result) { + ONNX_NAMESPACE::SparseTensorProto& result) { ORT_ENFORCE(HasDataType(dense_proto), "Must have a valid data type"); const bool is_string_data = dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING; @@ -890,13 +888,13 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& switch (element_size) { case 1: { - // bytes + // bytes SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size, IsZero, CopyElement, values, indices); break; } case 4: { - // float + // float SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size, IsZero, CopyElement, values, indices); break; @@ -916,10 +914,10 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& return Status::OK(); } -#endif // !ORT_MINIMAL_BUILD +#endif // !ORT_MINIMAL_BUILD -template common::Status GetSizeInBytesFromTensorProto<256>(const ONNX_NAMESPACE::TensorProto& tensor_proto, - size_t* out); +template common::Status GetSizeInBytesFromTensorProto(const ONNX_NAMESPACE::TensorProto& tensor_proto, + size_t* out); template common::Status GetSizeInBytesFromTensorProto<0>(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out); #define CASE_UNPACK(TYPE, ELEMENT_TYPE, DATA_SIZE) \ diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 575c7e2fb4..c049760b66 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -575,16 +575,21 @@ common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context) ORT_ENFORCE(prev_input->Shape().Size() >= 0); - size_t input_element_count = static_cast(prev_input->Shape().Size()); - size_t input_element_size = prev_input->DataType()->Size(); - size_t input_aligned_bytes = 0; + const void* curr_address = curr_input->DataRaw(); + const void* prev_address = prev_input->DataRaw(); + const void* prev_end_address = reinterpret_cast(prev_address) + prev_input->SizeInBytes(); - ORT_RETURN_IF_NOT(IAllocator::CalcMemSizeForArrayWithAlignment<256>(input_element_count, input_element_size, - &input_aligned_bytes)); + void* aligned_address = const_cast(prev_end_address); + size_t dummy_space = kAllocAlignment * 2; + std::align(kAllocAlignment, 1, aligned_address, dummy_space); - ORT_RETURN_IF_NOT( - curr_input->DataRaw() == static_cast(prev_input->DataRaw()) + input_aligned_bytes || - curr_input->DataRaw() == static_cast(prev_input->DataRaw()) + prev_input->SizeInBytes()); + if (!(curr_address == prev_end_address || curr_address == aligned_address)) { + const std::string node = context->GetNodeName().empty() ? context->GetOpType() : context->GetNodeName(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Contiguous memory checking failed on node ", node, ": ", + "input #", i - 1, " address is ", prev_address, " and #bytes = ", prev_input->SizeInBytes(), + ", input #", i, " address is ", curr_address); + } prev_input = curr_input; } diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc index 6034c9b92a..0b197af0fe 100644 --- a/onnxruntime/test/framework/allocator_test.cc +++ b/onnxruntime/test/framework/allocator_test.cc @@ -87,9 +87,9 @@ TEST(AllocatorTest, TestOverflowChecks) { EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size)); EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size)); - // we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max - EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements - (64 / element_size), element_size, &size)); - EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size)); + // we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max + EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment(num_elements - (kAllocAlignment / element_size), element_size, &size)); + EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment(num_elements, element_size, &size)); element_size = std::numeric_limits::max() / 8; num_elements = 8; @@ -97,9 +97,9 @@ TEST(AllocatorTest, TestOverflowChecks) { EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size)); EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size)); - // we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max - EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size - (64 / num_elements), &size)); - EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size)); + // we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max + EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment(num_elements, element_size - (kAllocAlignment / num_elements), &size)); + EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment(num_elements, element_size, &size)); } } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index f6677f9041..e6308b0e72 100644 --- a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -249,9 +249,9 @@ TEST_F(ExecutionFrameTest, MemPatternTest) { ASSERT_EQ(pattern.patterns.size(), pattern.locations.size()); ASSERT_EQ(pattern.patterns.size(), 1u); auto p = pattern.GetPatterns(cpu_allocator->Info()); - ASSERT_EQ(p->PeakSize(), 2u * 64u); // each allocation is 64-byte aligned + ASSERT_EQ(p->PeakSize(), 2u * kAllocAlignment); // each allocation is kAllocAlignment-byte aligned ASSERT_EQ(p->GetBlock(3)->offset_, 0u); - ASSERT_EQ(p->GetBlock(4)->offset_, 64u); + ASSERT_EQ(p->GetBlock(4)->offset_, kAllocAlignment); } TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) { diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc index 2a86e76786..f9369e9be3 100644 --- a/onnxruntime/test/onnx/tensorprotoutils.cc +++ b/onnxruntime/test/onnx/tensorprotoutils.cc @@ -13,6 +13,7 @@ #include "core/common/status.h" #include "core/framework/data_types.h" #include "core/framework/endian.h" +#include "core/framework/allocator.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/graph/onnx_protobuf.h" #include "callback.h" @@ -457,7 +458,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff return Status::OK(); } -template Status GetSizeInBytesFromTensorProto<256>(const onnx::TensorProto& tensor_proto, size_t* out); +template Status GetSizeInBytesFromTensorProto(const onnx::TensorProto& tensor_proto, size_t* out); template Status GetSizeInBytesFromTensorProto<0>(const onnx::TensorProto& tensor_proto, size_t* out); } // namespace test } // namespace onnxruntime