From eb5c1f0fcc6038f2743e7f5d2c1c43a014bdfdcb Mon Sep 17 00:00:00 2001
From: Sherlock <baihan.huang@gmail.com>
Date: Mon, 14 Dec 2020 13:13:41 -0800
Subject: [PATCH] Unify activation and initializer alignment value (#6109)

* Unify activation and initializer alignment value

* Fix VerifyInputTensorsAllocatedContiguously
---
 .../onnxruntime/core/framework/allocator.h    |  2 ++
 .../onnxruntime/core/framework/op_kernel.h    | 10 +++++++++
 onnxruntime/core/framework/execution_frame.cc |  2 +-
 onnxruntime/core/framework/op_kernel.cc       |  8 +++++++
 .../core/framework/parallel_executor.cc       |  5 +++--
 .../core/framework/sequential_executor.cc     |  5 +++--
 onnxruntime/core/framework/session_state.cc   |  4 ++--
 .../core/framework/simple_tensor_allocator.cc |  3 +--
 .../tensor_allocator_with_mem_pattern.h       |  3 +--
 .../core/framework/tensorprotoutils.cc        | 18 +++++++---------
 onnxruntime/core/framework/utils.cc           | 21 ++++++++++++-------
 onnxruntime/test/framework/allocator_test.cc  | 12 +++++------
 .../test/framework/execution_frame_test.cc    |  4 ++--
 onnxruntime/test/onnx/tensorprotoutils.cc     |  3 ++-
 14 files changed, 62 insertions(+), 38 deletions(-)
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index c86ed44f49..b66548fee6 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -25,6 +25,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
 constexpr const char* MIGRAPHX = "MIGraphX";
 constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";
 
+constexpr size_t kAllocAlignment = 256;
+
 // forward declaration
 class SessionState;
 
diff --git a/include/onnxruntime/core/framework/op_kernel.h b/include/onnxruntime/core/framework/op_kernel.h
index 7dd02e5e0c..2e3937a119 100644
--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@@ -221,6 +221,16 @@ class OpKernelContext {
   **/
   const std::string& GetOpDomain() const;
 
+  /**
+  Returns the optype of the underlying kernel
+  **/
+  const std::string& GetOpType() const;
+
+  /**
+  Returns the node name of the underlying kernel
+  **/
+  const std::string& GetNodeName() const;
+
   /**
   Returns the intra-op threadpool, if available.
   */
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 765b7f110e..d6f67478bd 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -329,7 +329,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
   if (static_cast<uint64_t>(len) > std::numeric_limits<size_t>::max()) {
     return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Tensor shape is too large");
   }
-  if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(static_cast<size_t>(len), element_type->Size(), &size)) {
+  if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(static_cast<size_t>(len), element_type->Size(), &size)) {
     return Status(ONNXRUNTIME, FAIL, "size overflow");
   }
 
diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc
index b3c05bb4e5..6c7ddf1abf 100644
--- a/onnxruntime/core/framework/op_kernel.cc
+++ b/onnxruntime/core/framework/op_kernel.cc
@@ -139,10 +139,18 @@ onnxruntime::NodeIndex OpKernelContext::GetNodeIndex() const {
   return kernel_->Node().Index();
 }
 
+const std::string& OpKernelContext::GetNodeName() const {
+  return kernel_->Node().Name();
+}
+
 const std::string& OpKernelContext::GetOpDomain() const {
   return kernel_->KernelDef().Domain();
 }
 
+const std::string& OpKernelContext::GetOpType() const {
+  return kernel_->Node().OpType();
+}
+
 const OrtValue* OpKernelContext::GetInputMLValue(int index) const {
   if (index < 0 || index >= InputCount())
     return nullptr;
diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc
index 9c0c70770c..371758426c 100644
--- a/onnxruntime/core/framework/parallel_executor.cc
+++ b/onnxruntime/core/framework/parallel_executor.cc
@@ -192,8 +192,9 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
     // Execute the kernel.
     ORT_TRY {
 #ifdef ENABLE_TRAINING
-      if (p_op_kernel->KernelDef().AllocateInputsContiguously())
-        utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
+      if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
+        ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
+      }
 #endif
 
       status = p_op_kernel->Compute(&op_kernel_context);
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index e3806a8091..1e5e688146 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -308,8 +308,9 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
 #endif
       ORT_TRY {
 #ifdef ENABLE_TRAINING
-        if (p_op_kernel->KernelDef().AllocateInputsContiguously())
-          utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
+        if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
+          ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
+        }
 #endif
 
         compute_status = p_op_kernel->Compute(&op_kernel_context);
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 8a971ba539..cb79e6e631 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -453,7 +453,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
         return Status(ONNXRUNTIME, FAIL, "Unknown shape found in memory pattern compute, node name is : " + node_name);
       }
 
-      if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &size)) {
+      if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &size)) {
         return Status(ONNXRUNTIME, FAIL, "Size overflow");
       }
 
@@ -489,7 +489,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
       if (exe_plan->allocation_plan[ml_value_idx].alloc_kind == AllocKind::kAllocate &&
           ml_data_type != DataTypeImpl::GetType<std::string>() && size != 0) {
         size_t aligned_size = 0;
-        if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &aligned_size)) {
+        if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &aligned_size)) {
           return Status(ONNXRUNTIME, FAIL, "Size overflow");
         }
 
diff --git a/onnxruntime/core/framework/simple_tensor_allocator.cc b/onnxruntime/core/framework/simple_tensor_allocator.cc
index 1b1839162a..360e052963 100644
--- a/onnxruntime/core/framework/simple_tensor_allocator.cc
+++ b/onnxruntime/core/framework/simple_tensor_allocator.cc
@@ -18,8 +18,7 @@ common::Status SimpleTensorAllocator::GetPreallocatedBuffer(int ort_value_index,
   }
 
   size_t len = 0;
-  static constexpr int alignment = 256;
-  ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*iter->second, &len));
+  ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*iter->second, &len));
   const struct OrtMemoryInfo& location = seq_plan_.GetLocation(ort_value_index);
   if (len == 0) {
     out = onnxruntime::make_unique<MemBuffer>(nullptr, 0, location);
diff --git a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
index 179c19e1d0..f92b444579 100644
--- a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
+++ b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
@@ -108,8 +108,7 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
       return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error.");
     }
     size_t len = 0;
-    static constexpr int alignment = 256;
-    ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*value, &len));
+    ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*value, &len));
     ORT_RETURN_IF_ERROR(planner_.TraceAllocation(id, len));
     return Status::OK();
   }
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index 9f5e7583ea..3605487ba5 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -810,8 +810,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
   return status;
 }
 
-
-#if !defined (ORT_MINIMAL_BUILD)
+#if !defined(ORT_MINIMAL_BUILD)
 // Determines if this is a type specific zero
 using IsZeroFunc = bool (*)(const void*);
 // Copy element
@@ -820,7 +819,6 @@ using CopyElementFunc = void (*)(void* dest, const void* src, int64_t dest_index
 static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size,
                             IsZeroFunc is_zero, CopyElementFunc copy,
                             TensorProto& values, TensorProto& indices) {
-
   auto advance = [element_size](const void* start, size_t elements) -> const void* {
     return (reinterpret_cast<const uint8_t*>(start) + elements * element_size);
   };
@@ -834,7 +832,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements,
       indices_data.Add(index);
     }
     ++index;
-    cbegin = advance(cbegin, 1U); 
+    cbegin = advance(cbegin, 1U);
   }
 
   auto& raw_data = *values.mutable_raw_data();
@@ -859,7 +857,7 @@ void CopyElement(void* dst, const void* src, int64_t dst_index, int64_t src_inde
 }
 
 common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense_proto,
-                                                  ONNX_NAMESPACE::SparseTensorProto& result) {
+                                              ONNX_NAMESPACE::SparseTensorProto& result) {
   ORT_ENFORCE(HasDataType(dense_proto), "Must have a valid data type");
 
   const bool is_string_data = dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING;
@@ -890,13 +888,13 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
 
   switch (element_size) {
     case 1: {
-    // bytes
+      // bytes
       SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
                       IsZero<uint8_t>, CopyElement<uint8_t>, values, indices);
       break;
     }
     case 4: {
-    // float
+      // float
       SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
                       IsZero<uint32_t>, CopyElement<uint32_t>, values, indices);
       break;
@@ -916,10 +914,10 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
   return Status::OK();
 }
 
-#endif // !ORT_MINIMAL_BUILD
+#endif  // !ORT_MINIMAL_BUILD
 
-template common::Status GetSizeInBytesFromTensorProto<256>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
-                                                           size_t* out);
+template common::Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
+                                                                       size_t* out);
 template common::Status GetSizeInBytesFromTensorProto<0>(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out);
 
 #define CASE_UNPACK(TYPE, ELEMENT_TYPE, DATA_SIZE)                              \
diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
index 575c7e2fb4..c049760b66 100644
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@@ -575,16 +575,21 @@ common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context)
 
     ORT_ENFORCE(prev_input->Shape().Size() >= 0);
 
-    size_t input_element_count = static_cast<size_t>(prev_input->Shape().Size());
-    size_t input_element_size = prev_input->DataType()->Size();
-    size_t input_aligned_bytes = 0;
+    const void* curr_address = curr_input->DataRaw();
+    const void* prev_address = prev_input->DataRaw();
+    const void* prev_end_address = reinterpret_cast<const char*>(prev_address) + prev_input->SizeInBytes();
 
-    ORT_RETURN_IF_NOT(IAllocator::CalcMemSizeForArrayWithAlignment<256>(input_element_count, input_element_size,
-                                                                        &input_aligned_bytes));
+    void* aligned_address = const_cast<void*>(prev_end_address);
+    size_t dummy_space = kAllocAlignment * 2;
+    std::align(kAllocAlignment, 1, aligned_address, dummy_space);
 
-    ORT_RETURN_IF_NOT(
-        curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + input_aligned_bytes ||
-        curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + prev_input->SizeInBytes());
+    if (!(curr_address == prev_end_address || curr_address == aligned_address)) {
+      const std::string node = context->GetNodeName().empty() ? context->GetOpType() : context->GetNodeName();
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                             "Contiguous memory checking failed on node ", node, ": ",
+                             "input #", i - 1, " address is ", prev_address, " and #bytes = ", prev_input->SizeInBytes(),
+                             ", input #", i, " address is ", curr_address);
+    }
 
     prev_input = curr_input;
   }
diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc
index 6034c9b92a..0b197af0fe 100644
--- a/onnxruntime/test/framework/allocator_test.cc
+++ b/onnxruntime/test/framework/allocator_test.cc
@@ -87,9 +87,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
   EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
   EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));
 
-  // we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
-  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements - (64 / element_size), element_size, &size));
-  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
+  // we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
+  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements - (kAllocAlignment / element_size), element_size, &size));
+  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
 
   element_size = std::numeric_limits<size_t>::max() / 8;
   num_elements = 8;
@@ -97,9 +97,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
   EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
   EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));
 
-  // we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
-  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size - (64 / num_elements), &size));
-  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
+  // we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
+  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size - (kAllocAlignment / num_elements), &size));
+  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
 }
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc
index f6677f9041..e6308b0e72 100644
--- a/onnxruntime/test/framework/execution_frame_test.cc
+++ b/onnxruntime/test/framework/execution_frame_test.cc
@@ -249,9 +249,9 @@ TEST_F(ExecutionFrameTest, MemPatternTest) {
   ASSERT_EQ(pattern.patterns.size(), pattern.locations.size());
   ASSERT_EQ(pattern.patterns.size(), 1u);
   auto p = pattern.GetPatterns(cpu_allocator->Info());
-  ASSERT_EQ(p->PeakSize(), 2u * 64u);  // each allocation is 64-byte aligned
+  ASSERT_EQ(p->PeakSize(), 2u * kAllocAlignment);  // each allocation is kAllocAlignment-byte aligned
   ASSERT_EQ(p->GetBlock(3)->offset_, 0u);
-  ASSERT_EQ(p->GetBlock(4)->offset_, 64u);
+  ASSERT_EQ(p->GetBlock(4)->offset_, kAllocAlignment);
 }
 
 TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc
index 2a86e76786..f9369e9be3 100644
--- a/onnxruntime/test/onnx/tensorprotoutils.cc
+++ b/onnxruntime/test/onnx/tensorprotoutils.cc
@@ -13,6 +13,7 @@
 #include "core/common/status.h"
 #include "core/framework/data_types.h"
 #include "core/framework/endian.h"
+#include "core/framework/allocator.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/graph/onnx_protobuf.h"
 #include "callback.h"
@@ -457,7 +458,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff
   return Status::OK();
 }
 
-template Status GetSizeInBytesFromTensorProto<256>(const onnx::TensorProto& tensor_proto, size_t* out);
+template Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const onnx::TensorProto& tensor_proto, size_t* out);
 template Status GetSizeInBytesFromTensorProto<0>(const onnx::TensorProto& tensor_proto, size_t* out);
 }  // namespace test
 }  // namespace onnxruntime