Unify activation and initializer alignment value (#6109)

* Unify activation and initializer alignment value

* Fix VerifyInputTensorsAllocatedContiguously
This commit is contained in:
Sherlock 2020-12-14 13:13:41 -08:00 committed by GitHub
parent cde723a136
commit eb5c1f0fcc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 62 additions and 38 deletions

View file

@ -25,6 +25,8 @@ constexpr const char* CUDA_PINNED = "CudaPinned";
constexpr const char* MIGRAPHX = "MIGraphX";
constexpr const char* MIGRAPHX_PINNED = "MIGraphXPinned";
constexpr size_t kAllocAlignment = 256;
// forward declaration
class SessionState;

View file

@ -221,6 +221,16 @@ class OpKernelContext {
**/
const std::string& GetOpDomain() const;
/**
Returns the optype of the underlying kernel
**/
const std::string& GetOpType() const;
/**
Returns the node name of the underlying kernel
**/
const std::string& GetNodeName() const;
/**
Returns the intra-op threadpool, if available.
*/

View file

@ -329,7 +329,7 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
if (static_cast<uint64_t>(len) > std::numeric_limits<size_t>::max()) {
return Status(ONNXRUNTIME, INVALID_ARGUMENT, "Tensor shape is too large");
}
if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(static_cast<size_t>(len), element_type->Size(), &size)) {
if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(static_cast<size_t>(len), element_type->Size(), &size)) {
return Status(ONNXRUNTIME, FAIL, "size overflow");
}

View file

@ -139,10 +139,18 @@ onnxruntime::NodeIndex OpKernelContext::GetNodeIndex() const {
return kernel_->Node().Index();
}
const std::string& OpKernelContext::GetNodeName() const {
return kernel_->Node().Name();
}
const std::string& OpKernelContext::GetOpDomain() const {
return kernel_->KernelDef().Domain();
}
const std::string& OpKernelContext::GetOpType() const {
return kernel_->Node().OpType();
}
const OrtValue* OpKernelContext::GetInputMLValue(int index) const {
if (index < 0 || index >= InputCount())
return nullptr;

View file

@ -192,8 +192,9 @@ Status ParallelExecutor::RunNodeAsync(size_t p_node_index,
// Execute the kernel.
ORT_TRY {
#ifdef ENABLE_TRAINING
if (p_op_kernel->KernelDef().AllocateInputsContiguously())
utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
}
#endif
status = p_op_kernel->Compute(&op_kernel_context);

View file

@ -308,8 +308,9 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
#endif
ORT_TRY {
#ifdef ENABLE_TRAINING
if (p_op_kernel->KernelDef().AllocateInputsContiguously())
utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context);
if (p_op_kernel->KernelDef().AllocateInputsContiguously()) {
ORT_RETURN_IF_ERROR(utils::VerifyInputTensorsAllocatedContiguously(&op_kernel_context));
}
#endif
compute_status = p_op_kernel->Compute(&op_kernel_context);

View file

@ -453,7 +453,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
return Status(ONNXRUNTIME, FAIL, "Unknown shape found in memory pattern compute, node name is : " + node_name);
}
if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &size)) {
if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &size)) {
return Status(ONNXRUNTIME, FAIL, "Size overflow");
}
@ -489,7 +489,7 @@ Status SessionState::GeneratePatternGroupCache(const std::vector<std::reference_
if (exe_plan->allocation_plan[ml_value_idx].alloc_kind == AllocKind::kAllocate &&
ml_data_type != DataTypeImpl::GetType<std::string>() && size != 0) {
size_t aligned_size = 0;
if (!IAllocator::CalcMemSizeForArrayWithAlignment<64>(size, ml_data_type->Size(), &aligned_size)) {
if (!IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(size, ml_data_type->Size(), &aligned_size)) {
return Status(ONNXRUNTIME, FAIL, "Size overflow");
}

View file

@ -18,8 +18,7 @@ common::Status SimpleTensorAllocator::GetPreallocatedBuffer(int ort_value_index,
}
size_t len = 0;
static constexpr int alignment = 256;
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*iter->second, &len));
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*iter->second, &len));
const struct OrtMemoryInfo& location = seq_plan_.GetLocation(ort_value_index);
if (len == 0) {
out = onnxruntime::make_unique<MemBuffer>(nullptr, 0, location);

View file

@ -108,8 +108,7 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error.");
}
size_t len = 0;
static constexpr int alignment = 256;
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<alignment>(*value, &len));
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<kAllocAlignment>(*value, &len));
ORT_RETURN_IF_ERROR(planner_.TraceAllocation(id, len));
return Status::OK();
}

View file

@ -810,8 +810,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT
return status;
}
#if !defined (ORT_MINIMAL_BUILD)
#if !defined(ORT_MINIMAL_BUILD)
// Determines if this is a type specific zero
using IsZeroFunc = bool (*)(const void*);
// Copy element
@ -820,7 +819,6 @@ using CopyElementFunc = void (*)(void* dest, const void* src, int64_t dest_index
static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, size_t element_size,
IsZeroFunc is_zero, CopyElementFunc copy,
TensorProto& values, TensorProto& indices) {
auto advance = [element_size](const void* start, size_t elements) -> const void* {
return (reinterpret_cast<const uint8_t*>(start) + elements * element_size);
};
@ -834,7 +832,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements,
indices_data.Add(index);
}
++index;
cbegin = advance(cbegin, 1U);
cbegin = advance(cbegin, 1U);
}
auto& raw_data = *values.mutable_raw_data();
@ -859,7 +857,7 @@ void CopyElement(void* dst, const void* src, int64_t dst_index, int64_t src_inde
}
common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto& dense_proto,
ONNX_NAMESPACE::SparseTensorProto& result) {
ONNX_NAMESPACE::SparseTensorProto& result) {
ORT_ENFORCE(HasDataType(dense_proto), "Must have a valid data type");
const bool is_string_data = dense_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING;
@ -890,13 +888,13 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
switch (element_size) {
case 1: {
// bytes
// bytes
SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
IsZero<uint8_t>, CopyElement<uint8_t>, values, indices);
break;
}
case 4: {
// float
// float
SparsifyGeneric(dense_raw_data.get(), n_dense_elements, element_size,
IsZero<uint32_t>, CopyElement<uint32_t>, values, indices);
break;
@ -916,10 +914,10 @@ common::Status DenseTensorToSparseTensorProto(const ONNX_NAMESPACE::TensorProto&
return Status::OK();
}
#endif // !ORT_MINIMAL_BUILD
#endif // !ORT_MINIMAL_BUILD
template common::Status GetSizeInBytesFromTensorProto<256>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
size_t* out);
template common::Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const ONNX_NAMESPACE::TensorProto& tensor_proto,
size_t* out);
template common::Status GetSizeInBytesFromTensorProto<0>(const ONNX_NAMESPACE::TensorProto& tensor_proto, size_t* out);
#define CASE_UNPACK(TYPE, ELEMENT_TYPE, DATA_SIZE) \

View file

@ -575,16 +575,21 @@ common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context)
ORT_ENFORCE(prev_input->Shape().Size() >= 0);
size_t input_element_count = static_cast<size_t>(prev_input->Shape().Size());
size_t input_element_size = prev_input->DataType()->Size();
size_t input_aligned_bytes = 0;
const void* curr_address = curr_input->DataRaw();
const void* prev_address = prev_input->DataRaw();
const void* prev_end_address = reinterpret_cast<const char*>(prev_address) + prev_input->SizeInBytes();
ORT_RETURN_IF_NOT(IAllocator::CalcMemSizeForArrayWithAlignment<256>(input_element_count, input_element_size,
&input_aligned_bytes));
void* aligned_address = const_cast<void*>(prev_end_address);
size_t dummy_space = kAllocAlignment * 2;
std::align(kAllocAlignment, 1, aligned_address, dummy_space);
ORT_RETURN_IF_NOT(
curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + input_aligned_bytes ||
curr_input->DataRaw() == static_cast<const int8_t*>(prev_input->DataRaw()) + prev_input->SizeInBytes());
if (!(curr_address == prev_end_address || curr_address == aligned_address)) {
const std::string node = context->GetNodeName().empty() ? context->GetOpType() : context->GetNodeName();
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
"Contiguous memory checking failed on node ", node, ": ",
"input #", i - 1, " address is ", prev_address, " and #bytes = ", prev_input->SizeInBytes(),
", input #", i, " address is ", curr_address);
}
prev_input = curr_input;
}

View file

@ -87,9 +87,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));
// we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements - (64 / element_size), element_size, &size));
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
// we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements - (kAllocAlignment / element_size), element_size, &size));
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
element_size = std::numeric_limits<size_t>::max() / 8;
num_elements = 8;
@ -97,9 +97,9 @@ TEST(AllocatorTest, TestOverflowChecks) {
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements, element_size, &size));
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<0>(num_elements + 1, element_size, &size));
// we need to add 63 to apply the alignment mask, so num_elements * element_size must be 64 short of the max
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size - (64 / num_elements), &size));
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<64>(num_elements, element_size, &size));
// we need to add kAllocAlignment-1 bytes to apply the alignment mask, so num_elements * element_size must be kAllocAlignment-bytes short of the max
EXPECT_TRUE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size - (kAllocAlignment / num_elements), &size));
EXPECT_FALSE(IAllocator::CalcMemSizeForArrayWithAlignment<kAllocAlignment>(num_elements, element_size, &size));
}
} // namespace test
} // namespace onnxruntime

View file

@ -249,9 +249,9 @@ TEST_F(ExecutionFrameTest, MemPatternTest) {
ASSERT_EQ(pattern.patterns.size(), pattern.locations.size());
ASSERT_EQ(pattern.patterns.size(), 1u);
auto p = pattern.GetPatterns(cpu_allocator->Info());
ASSERT_EQ(p->PeakSize(), 2u * 64u); // each allocation is 64-byte aligned
ASSERT_EQ(p->PeakSize(), 2u * kAllocAlignment); // each allocation is kAllocAlignment-byte aligned
ASSERT_EQ(p->GetBlock(3)->offset_, 0u);
ASSERT_EQ(p->GetBlock(4)->offset_, 64u);
ASSERT_EQ(p->GetBlock(4)->offset_, kAllocAlignment);
}
TEST(ExecutionFrameTestWithoutSessionState, BadModelInvalidDimParamUsage) {

View file

@ -13,6 +13,7 @@
#include "core/common/status.h"
#include "core/framework/data_types.h"
#include "core/framework/endian.h"
#include "core/framework/allocator.h"
#include "core/session/onnxruntime_cxx_api.h"
#include "core/graph/onnx_protobuf.h"
#include "callback.h"
@ -457,7 +458,7 @@ Status TensorProtoToMLValue(const onnx::TensorProto& tensor_proto, const MemBuff
return Status::OK();
}
template Status GetSizeInBytesFromTensorProto<256>(const onnx::TensorProto& tensor_proto, size_t* out);
template Status GetSizeInBytesFromTensorProto<kAllocAlignment>(const onnx::TensorProto& tensor_proto, size_t* out);
template Status GetSizeInBytesFromTensorProto<0>(const onnx::TensorProto& tensor_proto, size_t* out);
} // namespace test
} // namespace onnxruntime