mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-29 23:06:41 +00:00
Unify activation and initializer alignment value (#6109)
* Unify activation and initializer alignment value * Fix VerifyInputTensorsAllocatedContiguously
This commit is contained in:
parent
cde723a136
commit
eb5c1f0fcc
14 changed files with 62 additions and 38 deletions
|
|
@ -25,6 +25,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
|
|||
constexpr const char* MIGRAPHX = "MIGraphX";
|
||||
constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";
|
||||
|
||||
constexpr size_t kAllocAlignment = 256;
|
||||
|
||||
// forward declaration
|
||||
class SessionState;
|
||||
|
||||
|
|
|
|||
|
|
@ -221,6 +221,16 @@ class OpKernelContext {
|
|||
**/
|
||||
const std::string& GetOpDomain() const;
|
||||
|
||||
/**
|
||||
Returns the optype of the underlying kernel
|
||||
**/
|
||||
const std::string& GetOpType() const;
|
||||
|
||||
/**
|
||||
Returns the node name of the underlying kernel
|
||||
**/
|
||||
const std::string& GetNodeName() const;
|
||||
|
||||
/**
|
||||
Returns the intra-op threadpool, if available.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -329,7 +329,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
|
|||
if (static_cast<uint64_t>(len) > std::numeric_limits<size_t>::max()) {
|
||||
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Tensor shape is too large");
|
||||
}
|
||||
if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(static_cast<size_t>(len), element_type->Size(), &size)) {
|
||||
if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(static_cast<size_t>(len), element_type->Size(), &size)) {
|
||||
return Status(ONNXRUNTIME, FAIL, "size overflow");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -139,10 +139,18 @@ onnxruntime::NodeIndex OpKernelContext::GetNodeIndex() const {
|
|||
return kernel_->Node().Index();
|
||||
}
|
||||
|
||||
const std::string& OpKernelContext::GetNodeName() const {
|
||||
return kernel_->Node().Name();
|
||||
}
|
||||
|
||||
const std::string& OpKernelContext::GetOpDomain() const {
|
||||
return kernel_->KernelDef().Domain();
|
||||
}
|
||||
|
||||
const std::string& OpKernelContext::GetOpType() const {
|
||||
return kernel_->Node().OpType();
|
||||
}
|
||||
|
||||
const OrtValue* OpKernelContext::GetInputMLValue(int index) const {
|
||||
if (index < 0 || index >= InputCount())
|
||||
return nullptr;
|
||||
|
|
|
|||
|
|
@ -192,8 +192,9 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
|
|||
// Execute the kernel.
|
||||
ORT_TRY {
|
||||
#ifdef ENABLE_TRAINING
|
||||
if (p_op_kernel->KernelDef().AllocateInputsContiguously())
|
||||
utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
|
||||
if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
|
||||
ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
|
||||
}
|
||||
#endif
|
||||
|
||||
status = p_op_kernel->Compute(&op_kernel_context);
|
||||
|
|
|
|||
|
|
@ -308,8 +308,9 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
|
|||
#endif
|
||||
ORT_TRY {
|
||||
#ifdef ENABLE_TRAINING
|
||||
if (p_op_kernel->KernelDef().AllocateInputsContiguously())
|
||||
utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
|
||||
if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
|
||||
ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
|
||||
}
|
||||
#endif
|
||||
|
||||
compute_status = p_op_kernel->Compute(&op_kernel_context);
|
||||
|
|
|
|||
|
|
@ -453,7 +453,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
|
|||
return Status(ONNXRUNTIME, FAIL, "Unknown shape found in memory pattern compute, node name is : " + node_name);
|
||||
}
|
||||
|
||||
if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &size)) {
|
||||
if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &size)) {
|
||||
return Status(ONNXRUNTIME, FAIL, "Size overflow");
|
||||
}
|
||||
|
||||
|
|
@ -489,7 +489,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
|
|||
if (exe_plan->allocation_plan[ml_value_idx].alloc_kind == AllocKind::kAllocate &&
|
||||
ml_data_type != DataTypeImpl::GetType<std::string>() && size != 0) {
|
||||
size_t aligned_size = 0;
|
||||
if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &aligned_size)) {
|
||||
if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &aligned_size)) {
|
||||
return Status(ONNXRUNTIME, FAIL, "Size overflow");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,8 +18,7 @@ common::Status SimpleTensorAllocator::GetPreallocatedBuffer(int ort_value_index,
|
|||
}
|
||||
|
||||
size_t len = 0;
|
||||
static constexpr int alignment = 256;
|
||||
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*iter->second, &len));
|
||||
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*iter->second, &len));
|
||||
const struct OrtMemoryInfo& location = seq_plan_.GetLocation(ort_value_index);
|
||||
if (len == 0) {
|
||||
out = onnxruntime::make_unique<MemBuffer>(nullptr, 0, location);
|
||||
|
|
|
|||
|
|
@ -108,8 +108,7 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
|
|||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error.");
|
||||
}
|
||||
size_t len = 0;
|
||||
static constexpr int alignment = 256;
|
||||
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*value, &len));
|
||||
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*value, &len));
|
||||
ORT_RETURN_IF_ERROR(planner_.TraceAllocation(id, len));
|
||||
return Status::OK();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -810,8 +810,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
|
|||
return status;
|
||||
}
|
||||
|
||||
|
||||
#if !defined (ORT_MINIMAL_BUILD)
|
||||
#if !defined(ORT_MINIMAL_BUILD)
|
||||
// Determines if this is a type specific zero
|
||||
using IsZeroFunc = bool (*)(const void*);
|
||||
// Copy element
|
||||
|
|
@ -820,7 +819,6 @@ using CopyElementFunc = void (*)(void* dest, const void* src, int64_t dest_index
|
|||
static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size,
|
||||
IsZeroFunc is_zero, CopyElementFunc copy,
|
||||
TensorProto& values, TensorProto& indices) {
|
||||
|
||||
auto advance = [element_size](const void* start, size_t elements) -> const void* {
|
||||
return (reinterpret_cast<const uint8_t*>(start) + elements * element_size);
|
||||
};
|
||||
|
|
@ -834,7 +832,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements,
|
|||
indices_data.Add(index);
|
||||
}
|
||||
++index;
|
||||
cbegin = advance(cbegin, 1U);
|
||||
cbegin = advance(cbegin, 1U);
|
||||
}
|
||||
|
||||
auto& raw_data = *values.mutable_raw_data();
|
||||
|
|
@ -859,7 +857,7 @@ void CopyElement(void* dst, const void* src, int64_t dst_index, int64_t src_inde
|
|||
}
|
||||
|
||||
common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense_proto,
|
||||
ONNX_NAMESPACE::SparseTensorProto& result) {
|
||||
ONNX_NAMESPACE::SparseTensorProto& result) {
|
||||
ORT_ENFORCE(HasDataType(dense_proto), "Must have a valid data type");
|
||||
|
||||
const bool is_string_data = dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING;
|
||||
|
|
@ -890,13 +888,13 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
|
|||
|
||||
switch (element_size) {
|
||||
case 1: {
|
||||
// bytes
|
||||
// bytes
|
||||
SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
|
||||
IsZero<uint8_t>, CopyElement<uint8_t>, values, indices);
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
// float
|
||||
// float
|
||||
SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
|
||||
IsZero<uint32_t>, CopyElement<uint32_t>, values, indices);
|
||||
break;
|
||||
|
|
@ -916,10 +914,10 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
#endif // !ORT_MINIMAL_BUILD
|
||||
#endif // !ORT_MINIMAL_BUILD
|
||||
|
||||
template common::Status GetSizeInBytesFromTensorProto<256>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
|
||||
size_t* out);
|
||||
template common::Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
|
||||
size_t* out);
|
||||
template common::Status GetSizeInBytesFromTensorProto<0>(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out);
|
||||
|
||||
#define CASE_UNPACK(TYPE, ELEMENT_TYPE, DATA_SIZE) \
|
||||
|
|
|
|||
|
|
@ -575,16 +575,21 @@ common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context)
|
|||
|
||||
ORT_ENFORCE(prev_input->Shape().Size() >= 0);
|
||||
|
||||
size_t input_element_count = static_cast<size_t>(prev_input->Shape().Size());
|
||||
size_t input_element_size = prev_input->DataType()->Size();
|
||||
size_t input_aligned_bytes = 0;
|
||||
const void* curr_address = curr_input->DataRaw();
|
||||
const void* prev_address = prev_input->DataRaw();
|
||||
const void* prev_end_address = reinterpret_cast<const char*>(prev_address) + prev_input->SizeInBytes();
|
||||
|
||||
ORT_RETURN_IF_NOT(IAllocator::CalcMemSizeForArrayWithAlignment<256>(input_element_count, input_element_size,
|
||||
&input_aligned_bytes));
|
||||
void* aligned_address = const_cast<void*>(prev_end_address);
|
||||
size_t dummy_space = kAllocAlignment * 2;
|
||||
std::align(kAllocAlignment, 1, aligned_address, dummy_space);
|
||||
|
||||
ORT_RETURN_IF_NOT(
|
||||
curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + input_aligned_bytes ||
|
||||
curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + prev_input->SizeInBytes());
|
||||
if (!(curr_address == prev_end_address || curr_address == aligned_address)) {
|
||||
const std::string node = context->GetNodeName().empty() ? context->GetOpType() : context->GetNodeName();
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
|
||||
"Contiguous memory checking failed on node ", node, ": ",
|
||||
"input #", i - 1, " address is ", prev_address, " and #bytes = ", prev_input->SizeInBytes(),
|
||||
", input #", i, " address is ", curr_address);
|
||||
}
|
||||
|
||||
prev_input = curr_input;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,9 +87,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
|
|||
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
|
||||
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));
|
||||
|
||||
// we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
|
||||
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements - (64 / element_size), element_size, &size));
|
||||
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
|
||||
// we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
|
||||
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements - (kAllocAlignment / element_size), element_size, &size));
|
||||
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
|
||||
|
||||
element_size = std::numeric_limits<size_t>::max() / 8;
|
||||
num_elements = 8;
|
||||
|
|
@ -97,9 +97,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
|
|||
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
|
||||
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));
|
||||
|
||||
// we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
|
||||
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size - (64 / num_elements), &size));
|
||||
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
|
||||
// we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
|
||||
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size - (kAllocAlignment / num_elements), &size));
|
||||
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
|
||||
}
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -249,9 +249,9 @@ TEST_F(ExecutionFrameTest, MemPatternTest) {
|
|||
ASSERT_EQ(pattern.patterns.size(), pattern.locations.size());
|
||||
ASSERT_EQ(pattern.patterns.size(), 1u);
|
||||
auto p = pattern.GetPatterns(cpu_allocator->Info());
|
||||
ASSERT_EQ(p->PeakSize(), 2u * 64u); // each allocation is 64-byte aligned
|
||||
ASSERT_EQ(p->PeakSize(), 2u * kAllocAlignment); // each allocation is kAllocAlignment-byte aligned
|
||||
ASSERT_EQ(p->GetBlock(3)->offset_, 0u);
|
||||
ASSERT_EQ(p->GetBlock(4)->offset_, 64u);
|
||||
ASSERT_EQ(p->GetBlock(4)->offset_, kAllocAlignment);
|
||||
}
|
||||
|
||||
TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@
|
|||
#include "core/common/status.h"
|
||||
#include "core/framework/data_types.h"
|
||||
#include "core/framework/endian.h"
|
||||
#include "core/framework/allocator.h"
|
||||
#include "core/session/onnxruntime_cxx_api.h"
|
||||
#include "core/graph/onnx_protobuf.h"
|
||||
#include "callback.h"
|
||||
|
|
@ -457,7 +458,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
template Status GetSizeInBytesFromTensorProto<256>(const onnx::TensorProto& tensor_proto, size_t* out);
|
||||
template Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const onnx::TensorProto& tensor_proto, size_t* out);
|
||||
template Status GetSizeInBytesFromTensorProto<0>(const onnx::TensorProto& tensor_proto, size_t* out);
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
Loading…
Reference in a new issue