Unify activation and initializer alignment value (#6109)

* Unify activation and initializer alignment value * Fix VerifyInputTensorsAllocatedContiguously
2026-07-15 18:23:41 +00:00 · 2020-12-14 13:13:41 -08:00 · 2020-12-14 13:13:41 -08:00 · eb5c1f0fcc
commit eb5c1f0fcc
parent cde723a136
14 changed files with 62 additions and 38 deletions
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@ -25,6 +25,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
 constexpr const char* MIGRAPHX = "MIGraphX";
 constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";

+constexpr size_t kAllocAlignment = 256;
+
 // forward declaration
 class SessionState;

--- a/include/onnxruntime/core/framework/op_kernel.h
+++ b/include/onnxruntime/core/framework/op_kernel.h
@ -221,6 +221,16 @@ class OpKernelContext {
  **/
  const std::string& GetOpDomain() const;

+  /**
+  Returns the optype of the underlying kernel
+  **/
+  const std::string& GetOpType() const;
+
+  /**
+  Returns the node name of the underlying kernel
+  **/
+  const std::string& GetNodeName() const;
+
  /**
  Returns the intra-op threadpool, if available.
  */
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@ -329,7 +329,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
  if (static_cast<uint64_t>(len) > std::numeric_limits<size_t>::max()) {
    return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Tensor shape is too large");
  }
-  if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(static_cast<size_t>(len), element_type->Size(), &size)) {
+  if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(static_cast<size_t>(len), element_type->Size(), &size)) {
    return Status(ONNXRUNTIME, FAIL, "size overflow");
  }

--- a/onnxruntime/core/framework/op_kernel.cc
+++ b/onnxruntime/core/framework/op_kernel.cc
@ -139,10 +139,18 @@ onnxruntime::NodeIndex OpKernelContext::GetNodeIndex() const {
  return kernel_->Node().Index();
 }

+const std::string& OpKernelContext::GetNodeName() const {
+  return kernel_->Node().Name();
+}
+
 const std::string& OpKernelContext::GetOpDomain() const {
  return kernel_->KernelDef().Domain();
 }

+const std::string& OpKernelContext::GetOpType() const {
+  return kernel_->Node().OpType();
+}
+
 const OrtValue* OpKernelContext::GetInputMLValue(int index) const {
  if (index < 0 || index >= InputCount())
    return nullptr;
--- a/onnxruntime/core/framework/parallel_executor.cc
+++ b/onnxruntime/core/framework/parallel_executor.cc
@ -192,8 +192,9 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
    // Execute the kernel.
    ORT_TRY {
 #ifdef ENABLE_TRAINING
-      if (p_op_kernel->KernelDef().AllocateInputsContiguously())
-        utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
+      if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
+        ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
+      }
 #endif

      status = p_op_kernel->Compute(&op_kernel_context);
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@ -308,8 +308,9 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
 #endif
      ORT_TRY {
 #ifdef ENABLE_TRAINING
-        if (p_op_kernel->KernelDef().AllocateInputsContiguously())
-          utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
+        if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
+          ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
+        }
 #endif

        compute_status = p_op_kernel->Compute(&op_kernel_context);
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@ -453,7 +453,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
        return Status(ONNXRUNTIME, FAIL, "Unknown shape found in memory pattern compute, node name is : " + node_name);
      }

-      if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &size)) {
+      if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &size)) {
        return Status(ONNXRUNTIME, FAIL, "Size overflow");
      }

@ -489,7 +489,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
      if (exe_plan->allocation_plan[ml_value_idx].alloc_kind == AllocKind::kAllocate &&
          ml_data_type != DataTypeImpl::GetType<std::string>() && size != 0) {
        size_t aligned_size = 0;
-        if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &aligned_size)) {
+        if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &aligned_size)) {
          return Status(ONNXRUNTIME, FAIL, "Size overflow");
        }

--- a/onnxruntime/core/framework/simple_tensor_allocator.cc
+++ b/onnxruntime/core/framework/simple_tensor_allocator.cc
@ -18,8 +18,7 @@ common::Status SimpleTensorAllocator::GetPreallocatedBuffer(int ort_value_index,
  }

  size_t len = 0;
-  static constexpr int alignment = 256;
-  ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*iter->second, &len));
+  ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*iter->second, &len));
  const struct OrtMemoryInfo& location = seq_plan_.GetLocation(ort_value_index);
  if (len == 0) {
    out = onnxruntime::make_unique<MemBuffer>(nullptr, 0, location);
--- a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
+++ b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
@ -108,8 +108,7 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error.");
    }
    size_t len = 0;
-    static constexpr int alignment = 256;
-    ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*value, &len));
+    ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*value, &len));
    ORT_RETURN_IF_ERROR(planner_.TraceAllocation(id, len));
    return Status::OK();
  }
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@ -810,8 +810,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
  return status;
 }

-
-#if !defined (ORT_MINIMAL_BUILD)
+#if !defined(ORT_MINIMAL_BUILD)
 // Determines if this is a type specific zero
 using IsZeroFunc = bool (*)(const void*);
 // Copy element
@ -820,7 +819,6 @@ using CopyElementFunc = void (*)(void* dest, const void* src, int64_t dest_index
 static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size,
                            IsZeroFunc is_zero, CopyElementFunc copy,
                            TensorProto& values, TensorProto& indices) {
-
  auto advance = [element_size](const void* start, size_t elements) -> const void* {
    return (reinterpret_cast<const uint8_t*>(start) + elements * element_size);
  };
@ -834,7 +832,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements,
      indices_data.Add(index);
    }
    ++index;
-    cbegin = advance(cbegin, 1U); 
+    cbegin = advance(cbegin, 1U);
  }

  auto& raw_data = *values.mutable_raw_data();
@ -859,7 +857,7 @@ void CopyElement(void* dst, const void* src, int64_t dst_index, int64_t src_inde
 }

 common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense_proto,
-                                                  ONNX_NAMESPACE::SparseTensorProto& result) {
+                                              ONNX_NAMESPACE::SparseTensorProto& result) {
  ORT_ENFORCE(HasDataType(dense_proto), "Must have a valid data type");

  const bool is_string_data = dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING;
@ -890,13 +888,13 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&

  switch (element_size) {
    case 1: {
-    // bytes
+      // bytes
      SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
                      IsZero<uint8_t>, CopyElement<uint8_t>, values, indices);
      break;
    }
    case 4: {
-    // float
+      // float
      SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
                      IsZero<uint32_t>, CopyElement<uint32_t>, values, indices);
      break;
@ -916,10 +914,10 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
  return Status::OK();
 }

-#endif // !ORT_MINIMAL_BUILD
+#endif  // !ORT_MINIMAL_BUILD

-template common::Status GetSizeInBytesFromTensorProto<256>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
-                                                           size_t* out);
+template common::Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
+                                                                       size_t* out);
 template common::Status GetSizeInBytesFromTensorProto<0>(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out);

 #define CASE_UNPACK(TYPE, ELEMENT_TYPE, DATA_SIZE)                              \
--- a/onnxruntime/core/framework/utils.cc
+++ b/onnxruntime/core/framework/utils.cc
@ -575,16 +575,21 @@ common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context)

    ORT_ENFORCE(prev_input->Shape().Size() >= 0);

-    size_t input_element_count = static_cast<size_t>(prev_input->Shape().Size());
-    size_t input_element_size = prev_input->DataType()->Size();
-    size_t input_aligned_bytes = 0;
+    const void* curr_address = curr_input->DataRaw();
+    const void* prev_address = prev_input->DataRaw();
+    const void* prev_end_address = reinterpret_cast<const char*>(prev_address) + prev_input->SizeInBytes();

-    ORT_RETURN_IF_NOT(IAllocator::CalcMemSizeForArrayWithAlignment<256>(input_element_count, input_element_size,
-                                                                        &input_aligned_bytes));
+    void* aligned_address = const_cast<void*>(prev_end_address);
+    size_t dummy_space = kAllocAlignment * 2;
+    std::align(kAllocAlignment, 1, aligned_address, dummy_space);

-    ORT_RETURN_IF_NOT(
-        curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + input_aligned_bytes ||
-        curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + prev_input->SizeInBytes());
+    if (!(curr_address == prev_end_address || curr_address == aligned_address)) {
+      const std::string node = context->GetNodeName().empty() ? context->GetOpType() : context->GetNodeName();
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                             "Contiguous memory checking failed on node ", node, ": ",
+                             "input #", i - 1, " address is ", prev_address, " and #bytes = ", prev_input->SizeInBytes(),
+                             ", input #", i, " address is ", curr_address);
+    }

    prev_input = curr_input;
  }
--- a/onnxruntime/test/framework/allocator_test.cc
+++ b/onnxruntime/test/framework/allocator_test.cc
@ -87,9 +87,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));

-  // we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
-  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements - (64 / element_size), element_size, &size));
-  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
+  // we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
+  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements - (kAllocAlignment / element_size), element_size, &size));
+  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));

  element_size = std::numeric_limits<size_t>::max() / 8;
  num_elements = 8;
@ -97,9 +97,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));

-  // we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
-  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size - (64 / num_elements), &size));
-  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
+  // we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
+  EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size - (kAllocAlignment / num_elements), &size));
+  EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
 }
 }  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/framework/execution_frame_test.cc
+++ b/onnxruntime/test/framework/execution_frame_test.cc
@ -249,9 +249,9 @@ TEST_F(ExecutionFrameTest, MemPatternTest) {
  ASSERT_EQ(pattern.patterns.size(), pattern.locations.size());
  ASSERT_EQ(pattern.patterns.size(), 1u);
  auto p = pattern.GetPatterns(cpu_allocator->Info());
-  ASSERT_EQ(p->PeakSize(), 2u * 64u);  // each allocation is 64-byte aligned
+  ASSERT_EQ(p->PeakSize(), 2u * kAllocAlignment);  // each allocation is kAllocAlignment-byte aligned
  ASSERT_EQ(p->GetBlock(3)->offset_, 0u);
-  ASSERT_EQ(p->GetBlock(4)->offset_, 64u);
+  ASSERT_EQ(p->GetBlock(4)->offset_, kAllocAlignment);
 }

 TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
--- a/onnxruntime/test/onnx/tensorprotoutils.cc
+++ b/onnxruntime/test/onnx/tensorprotoutils.cc
@ -13,6 +13,7 @@
 #include "core/common/status.h"
 #include "core/framework/data_types.h"
 #include "core/framework/endian.h"
+#include "core/framework/allocator.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/graph/onnx_protobuf.h"
 #include "callback.h"
@ -457,7 +458,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff
  return Status::OK();
 }

-template Status GetSizeInBytesFromTensorProto<256>(const onnx::TensorProto& tensor_proto, size_t* out);
+template Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const onnx::TensorProto& tensor_proto, size_t* out);
 template Status GetSizeInBytesFromTensorProto<0>(const onnx::TensorProto& tensor_proto, size_t* out);
 }  // namespace test
 }  // namespace onnxruntime