Use a custom allocator for temporary buffers in reduction_ops.cc (#2775)

* port the mimalloc allocator

* hook mimalloc opt into common.h and reduction ops

* repurpose USE_MIMALLOC to denote only the substitution of mimalloc for the default allocator, plus some refactoring

* fix unintended cherry pick diffs

* polish allocator_mimalloc

* explicitly disable mimalloc where it already had been disabled

* update mimalloc to pull in stl allocator

* switch mimalloc stl allocator to use mimalloc library version

* turn mimalloc on by default (only the stl changes are enabled, the python interacting ones are off already and shall remain so)

* move FastAllocVector into cpu specific code

* separate out defines into arena and stl changes

* the rest of the define renames

* bfc arena allocator

* some typos and rename the bfc arena allocator to fit existing class naming conventions

* adjustments in response to comments

* different template instantiations are friends
This commit is contained in:
kile0 2020-02-22 22:04:30 -08:00 committed by GitHub
parent 61ae134469
commit f367fd921c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 129 additions and 32 deletions

View file

@ -611,7 +611,7 @@ else()
endif()
set(onnxruntime_DELAYLOAD_FLAGS "")
if (onnxruntime_USE_JEMALLOC)
if (onnxruntime_USE_MIMALLOC)
if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
message( FATAL_ERROR "You cannot specify both jemalloc and mimalloc." )
endif()

@ -1 +1 @@
Subproject commit e2202f6bbe4e2051014ac20c38b3cf88492e9d2f
Subproject commit 2d54553b7a78c7c35620b827e7e5ab2228ecb495

View file

@ -1,7 +1,11 @@
set(mimalloc_root_dir ${PROJECT_SOURCE_DIR}/external/mimalloc)
add_definitions(-DUSE_MIMALLOC) # used in ONNXRuntime
if(onnxruntime_USE_MIMALLOC_STL_ALLOCATOR)
add_definitions(-DUSE_MIMALLOC_STL_ALLOCATOR) # used in ONNXRuntime
endif()
if(onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
add_definitions(-DUSE_MIMALLOC_ARENA_ALLOCATOR) # used in ONNXRuntime
endif()
include_directories(${mimalloc_root_dir}/include)
option(MI_OVERRIDE "" OFF)

View file

@ -72,7 +72,7 @@ if (onnxruntime_USE_TELEMETRY)
set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
endif()
if (onnxruntime_USE_MIMALLOC)
if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
if(onnxruntime_USE_CUDA OR onnxruntime_USE_OPENVINO)
message(WARNING "Ignoring directive to use mimalloc on unimplemented targets")
elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")

View file

@ -36,7 +36,7 @@
#include "core/common/make_unique.h"
#include "core/common/status.h"
#ifdef USE_MIMALLOC
#ifdef USE_MIMALLOC_ARENA_ALLOCATOR
#include <mimalloc.h>
#endif

View file

@ -280,7 +280,7 @@ class CPUAllocator : public IDeviceAllocator {
std::unique_ptr<OrtMemoryInfo> memory_info_;
};
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
class MiMallocAllocator : public IDeviceAllocator {
public:
explicit MiMallocAllocator(std::unique_ptr<OrtMemoryInfo> memory_info) {
@ -302,10 +302,10 @@ class MiMallocAllocator : public IDeviceAllocator {
#endif
#ifdef USE_MIMALLOC
using TAllocator = MiMallocAllocator;
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
using TAllocator = MiMallocAllocator;
#else
using TAllocator = CPUAllocator;
using TAllocator = CPUAllocator;
#endif
using AllocatorPtr = std::shared_ptr<IAllocator>;

View file

@ -32,7 +32,7 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz
return ok;
}
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
void* MiMallocAllocator::Alloc(size_t size) {
return mi_malloc(size);
}

View file

@ -11,7 +11,7 @@
namespace onnxruntime {
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
using TArenaAllocator = MiMallocArena;
#else
using TArenaAllocator = BFCArena;

View file

@ -1,4 +1,4 @@
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
#include "mimalloc.h"
#include "core/framework/mimalloc_arena.h"

View file

@ -1,4 +1,4 @@
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
#include "core/common/common.h"
#include "core/framework/arena.h"
#include "onnxruntime_config.h"

View file

@ -0,0 +1,51 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
namespace onnxruntime {
// An STL wrapper for ORT allocators. This enables overriding the
// std::allocator used in STL containers for better memory performance.
template <class T>
class OrtStlAllocator {
template <class U> friend class OrtStlAllocator;
AllocatorPtr allocator_;
public:
typedef T value_type;
using propagate_on_container_copy_assignment = std::true_type;
using propagate_on_container_move_assignment = std::true_type;
using propagate_on_container_swap = std::true_type;
using is_always_equal = std::true_type;
OrtStlAllocator(const AllocatorPtr& a) noexcept {
allocator_ = a;
}
OrtStlAllocator(const OrtStlAllocator& other) noexcept {
allocator_ = other.allocator_;
}
template <class U>
OrtStlAllocator(const OrtStlAllocator<U>& other) noexcept {
allocator_ = other.allocator_;
}
T* allocate(size_t n, const void* hint = 0) {
ORT_UNUSED_PARAMETER(hint);
return reinterpret_cast<T*>(allocator_->Alloc(n * sizeof(T)));
}
void deallocate(T* p, size_t n) {
ORT_UNUSED_PARAMETER(n);
allocator_->Free(p);
}
};
template <class T1, class T2>
bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
return lhs.allocator_ == rhs.allocator_;
}
template <class T1, class T2>
bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
return lhs.allocator_ != rhs.allocator_;
}
} // namespace onnxruntime

View file

@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#if defined(USE_MIMALLOC_STL_ALLOCATOR)
#include <mimalloc.h>
#else
#include "core/framework/ort_stl_allocator.h"
#endif
namespace onnxruntime {

#if defined(USE_MIMALLOC_STL_ALLOCATOR)

// mimalloc build: hand out a default-constructed mi_stl_allocator.
// The kernel context is accepted only to keep the signature identical to the
// non-mimalloc variant below; it is not used.
template <typename T>
mi_stl_allocator<T> GetAllocator(const OpKernelContext& /*context*/) {
  return mi_stl_allocator<T>();
}

// std::vector whose storage is obtained through mi_stl_allocator.
template <typename T>
using FastAllocVector = std::vector<T, mi_stl_allocator<T>>;

#else

// Default build: wrap the allocator reported by
// OpKernelContext::GetTempSpaceAllocator in an OrtStlAllocator.
// ORT_ENFORCE fires when the context fails to provide that allocator.
template <typename T>
OrtStlAllocator<T> GetAllocator(const OpKernelContext& context) {
  AllocatorPtr temp_space_allocator;
  auto status = context.GetTempSpaceAllocator(&temp_space_allocator);
  ORT_ENFORCE(status.IsOK());
  return OrtStlAllocator<T>(temp_space_allocator);
}

// std::vector whose storage is obtained through OrtStlAllocator.
template <typename T>
using FastAllocVector = std::vector<T, OrtStlAllocator<T>>;

#endif

}  // namespace onnxruntime

View file

@ -32,7 +32,7 @@ class CPUExecutionProvider : public IExecutionProvider {
std::numeric_limits<size_t>::max()};
#ifdef USE_JEMALLOC
#if defined(USE_MIMALLOC)
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) || defined(USE_MIMALLOC_STL_ALLOCATOR)
#error jemalloc and mimalloc should not both be enabled
#endif

View file

@ -4,6 +4,7 @@
#include "core/providers/cpu/reduction/reduction_ops.h"
#include "core/providers/common.h"
#include "core/util/math_cpuonly.h"
#include "core/providers/cpu/containers.h"
using namespace std;
namespace onnxruntime {
@ -125,7 +126,7 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 11);
// size of each reduce.
template <typename T>
bool PrepareForReduce(OpKernelContext* ctx,
std::vector<T>& transposedInputData,
FastAllocVector<T>& transposedInputData,
Tensor** reducedTensor,
int64_t& block_size,
int64_t& blocks,
@ -307,7 +308,7 @@ bool PrepareForReduce(OpKernelContext* ctx,
template <typename T>
Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -323,7 +324,7 @@ Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -339,7 +340,7 @@ Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -359,7 +360,7 @@ Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -383,7 +384,7 @@ Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -407,7 +408,7 @@ Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -434,7 +435,7 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -458,7 +459,7 @@ Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -474,7 +475,7 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -501,7 +502,7 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -517,7 +518,7 @@ Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;
@ -537,7 +538,7 @@ Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
template <typename T>
Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
std::vector<T> transposedInputData;
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
int64_t block_size;
int64_t blocks;
Tensor* reduced;

View file

@ -811,7 +811,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
});
}
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
static struct {
PyMemAllocatorEx mem;
PyMemAllocatorEx raw;
@ -823,7 +823,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
m.doc() = "pybind11 stateful interface to ONNX runtime";
RegisterExceptions(m);
#ifdef USE_MIMALLOC
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
PyMemAllocatorEx alloc;
alloc.malloc = [](void* ctx, size_t size) {
ORT_UNUSED_PARAMETER(ctx);

View file

@ -128,7 +128,7 @@ Use the individual flags to only run the specified stages.
parser.add_argument("--skip_submodule_sync", action='store_true', help="Don't do a 'git submodule update'. Makes the Update phase faster.")
parser.add_argument("--use_vstest", action='store_true', help="Use use_vstest for running unitests.")
parser.add_argument("--use_jemalloc", action='store_true', help="Use jemalloc.")
parser.add_argument("--use_mimalloc", action='store_true', help="Use mimalloc.")
parser.add_argument("--use_mimalloc", default=['none'], choices=['none', 'stl', 'arena', 'all'], help="Use mimalloc.")
parser.add_argument("--use_openblas", action='store_true', help="Build with OpenBLAS.")
parser.add_argument("--use_dnnl", action='store_true', help="Build with DNNL.")
parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.")
@ -306,7 +306,8 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
"-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"),
"-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""),
"-Donnxruntime_USE_JEMALLOC=" + ("ON" if args.use_jemalloc else "OFF"),
"-Donnxruntime_USE_MIMALLOC=" + ("ON" if args.use_mimalloc else "OFF"),
"-Donnxruntime_USE_MIMALLOC_STL_ALLOCATOR=" + ("ON" if args.use_mimalloc == "stl" or args.use_mimalloc == "all" else "OFF"),
"-Donnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR=" + ("ON" if args.use_mimalloc == "arena" or args.use_mimalloc == "all" else "OFF"),
"-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"),
"-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"),
"-Donnxruntime_BUILD_JAVA=" + ("ON" if args.build_java else "OFF"),