mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-02 03:55:34 +00:00
Use a custom allocator for temporary buffers in reduction_ops.cc (#2775)
* port the mimalloc allocator * hook mimalloc opt into common.h and reduction ops * repurpose USE_MIMALLOC to only denote subbing in of default allocator with mimalloc and some refactoring * fix unintended cherry pick diffs * polish alloctor_mimalloc * explicitly disable mimalloc where it already had been disabled * update mimalloc to pull in stl allocator * switch mimalloc stl allocator to use mimalloc library version * turn mimalloc on by default (only the stl changes are enabled, the python interacting ones are off already and shall remain so) * move FastAllocVector into cpu specific code * separate out defines into arena and stl changes * the rest of the define renames * bfc arena allocator * some typos and rename the bfc arena allocator to fit existing class naming conventions * adjustments in response to comments * different template instantiations are friends
This commit is contained in:
parent
61ae134469
commit
f367fd921c
16 changed files with 129 additions and 32 deletions
|
|
@ -611,7 +611,7 @@ else()
|
|||
endif()
|
||||
set(onnxruntime_DELAYLOAD_FLAGS "")
|
||||
if (onnxruntime_USE_JEMALLOC)
|
||||
if (onnxruntime_USE_MIMALLOC)
|
||||
if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
message( FATAL_ERROR "You cannot specify both jemalloc and mimalloc." )
|
||||
endif()
|
||||
|
||||
|
|
|
|||
2
cmake/external/mimalloc
vendored
2
cmake/external/mimalloc
vendored
|
|
@ -1 +1 @@
|
|||
Subproject commit e2202f6bbe4e2051014ac20c38b3cf88492e9d2f
|
||||
Subproject commit 2d54553b7a78c7c35620b827e7e5ab2228ecb495
|
||||
8
cmake/external/mimalloc.cmake
vendored
8
cmake/external/mimalloc.cmake
vendored
|
|
@ -1,7 +1,11 @@
|
|||
|
||||
set(mimalloc_root_dir ${PROJECT_SOURCE_DIR}/external/mimalloc)
|
||||
|
||||
add_definitions(-DUSE_MIMALLOC) # used in ONNXRuntime
|
||||
if(onnxruntime_USE_MIMALLOC_STL_ALLOCATOR)
|
||||
add_definitions(-DUSE_MIMALLOC_STL_ALLOCATOR) # used in ONNXRuntime
|
||||
endif()
|
||||
if(onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
add_definitions(-DUSE_MIMALLOC_ARENA_ALLOCATOR) # used in ONNXRuntime
|
||||
endif()
|
||||
include_directories(${mimalloc_root_dir}/include)
|
||||
|
||||
option(MI_OVERRIDE "" OFF)
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ if (onnxruntime_USE_TELEMETRY)
|
|||
set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
|
||||
endif()
|
||||
|
||||
if (onnxruntime_USE_MIMALLOC)
|
||||
if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
if(onnxruntime_USE_CUDA OR onnxruntime_USE_OPENVINO)
|
||||
message(WARNING "Ignoring directive to use mimalloc on unimplemented targets")
|
||||
elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@
|
|||
#include "core/common/make_unique.h"
|
||||
#include "core/common/status.h"
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
#ifdef USE_MIMALLOC_ARENA_ALLOCATOR
|
||||
#include <mimalloc.h>
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -280,7 +280,7 @@ class CPUAllocator : public IDeviceAllocator {
|
|||
std::unique_ptr<OrtMemoryInfo> memory_info_;
|
||||
};
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
class MiMallocAllocator : public IDeviceAllocator {
|
||||
public:
|
||||
explicit MiMallocAllocator(std::unique_ptr<OrtMemoryInfo> memory_info) {
|
||||
|
|
@ -302,10 +302,10 @@ class MiMallocAllocator : public IDeviceAllocator {
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
using TAllocator = MiMallocAllocator;
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
using TAllocator = MiMallocAllocator;
|
||||
#else
|
||||
using TAllocator = CPUAllocator;
|
||||
using TAllocator = CPUAllocator;
|
||||
#endif
|
||||
|
||||
using AllocatorPtr = std::shared_ptr<IAllocator>;
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz
|
|||
return ok;
|
||||
}
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
void* MiMallocAllocator::Alloc(size_t size) {
|
||||
return mi_malloc(size);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
namespace onnxruntime {
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
using TArenaAllocator = MiMallocArena;
|
||||
#else
|
||||
using TArenaAllocator = BFCArena;
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
#include "mimalloc.h"
|
||||
#include "core/framework/mimalloc_arena.h"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/arena.h"
|
||||
#include "onnxruntime_config.h"
|
||||
|
|
|
|||
51
onnxruntime/core/framework/ort_stl_allocator.h
Normal file
51
onnxruntime/core/framework/ort_stl_allocator.h
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
// An STL wrapper for ORT allocators. This enables overriding the
|
||||
// std::allocator used in STL containers for better memory performance.
|
||||
template <class T>
|
||||
class OrtStlAllocator {
|
||||
template <class U> friend class OrtStlAllocator;
|
||||
AllocatorPtr allocator_;
|
||||
|
||||
public:
|
||||
typedef T value_type;
|
||||
using propagate_on_container_copy_assignment = std::true_type;
|
||||
using propagate_on_container_move_assignment = std::true_type;
|
||||
using propagate_on_container_swap = std::true_type;
|
||||
using is_always_equal = std::true_type;
|
||||
|
||||
OrtStlAllocator(const AllocatorPtr& a) noexcept {
|
||||
allocator_ = a;
|
||||
}
|
||||
OrtStlAllocator(const OrtStlAllocator& other) noexcept {
|
||||
allocator_ = other.allocator_;
|
||||
}
|
||||
template <class U>
|
||||
OrtStlAllocator(const OrtStlAllocator<U>& other) noexcept {
|
||||
allocator_ = other.allocator_;
|
||||
}
|
||||
|
||||
T* allocate(size_t n, const void* hint = 0) {
|
||||
ORT_UNUSED_PARAMETER(hint);
|
||||
return reinterpret_cast<T*>(allocator_->Alloc(n * sizeof(T)));
|
||||
}
|
||||
|
||||
void deallocate(T* p, size_t n) {
|
||||
ORT_UNUSED_PARAMETER(n);
|
||||
allocator_->Free(p);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T1, class T2>
|
||||
bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
|
||||
return lhs.allocator_ == rhs.allocator_;
|
||||
}
|
||||
template <class T1, class T2>
|
||||
bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
|
||||
return lhs.allocator_ != rhs.allocator_;
|
||||
}
|
||||
|
||||
} // namespace onnxruntime
|
||||
40
onnxruntime/core/providers/cpu/containers.h
Normal file
40
onnxruntime/core/providers/cpu/containers.h
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#if defined(USE_MIMALLOC_STL_ALLOCATOR)
|
||||
#include <mimalloc.h>
|
||||
#else
|
||||
#include "core/framework/ort_stl_allocator.h"
|
||||
#endif
|
||||
|
||||
namespace onnxruntime {
|
||||
|
||||
#if defined(USE_MIMALLOC_STL_ALLOCATOR)
|
||||
|
||||
template <typename T>
|
||||
mi_stl_allocator<T> GetAllocator(const OpKernelContext& context) {
|
||||
ORT_UNUSED_PARAMETER(context);
|
||||
return mi_stl_allocator<T>();
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
using FastAllocVector = std::vector<T,mi_stl_allocator<T>>;
|
||||
|
||||
#else
|
||||
|
||||
template <typename T>
|
||||
OrtStlAllocator<T> GetAllocator(const OpKernelContext& context) {
|
||||
AllocatorPtr allocator;
|
||||
auto status = context.GetTempSpaceAllocator(&allocator);
|
||||
ORT_ENFORCE(status.IsOK());
|
||||
return OrtStlAllocator<T>(allocator);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
using FastAllocVector = std::vector<T,OrtStlAllocator<T>>;
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -32,7 +32,7 @@ class CPUExecutionProvider : public IExecutionProvider {
|
|||
std::numeric_limits<size_t>::max()};
|
||||
|
||||
#ifdef USE_JEMALLOC
|
||||
#if defined(USE_MIMALLOC)
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) || defined(USE_MIMALLOC_STL_ALLOCATOR)
|
||||
#error jemalloc and mimalloc should not both be enabled
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include "core/providers/cpu/reduction/reduction_ops.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/providers/cpu/containers.h"
|
||||
using namespace std;
|
||||
namespace onnxruntime {
|
||||
|
||||
|
|
@ -125,7 +126,7 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 11);
|
|||
// size of each reduce.
|
||||
template <typename T>
|
||||
bool PrepareForReduce(OpKernelContext* ctx,
|
||||
std::vector<T>& transposedInputData,
|
||||
FastAllocVector<T>& transposedInputData,
|
||||
Tensor** reducedTensor,
|
||||
int64_t& block_size,
|
||||
int64_t& blocks,
|
||||
|
|
@ -307,7 +308,7 @@ bool PrepareForReduce(OpKernelContext* ctx,
|
|||
|
||||
template <typename T>
|
||||
Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -323,7 +324,7 @@ Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -339,7 +340,7 @@ Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -359,7 +360,7 @@ Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -383,7 +384,7 @@ Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -407,7 +408,7 @@ Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -434,7 +435,7 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -458,7 +459,7 @@ Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -474,7 +475,7 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -501,7 +502,7 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -517,7 +518,7 @@ Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
@ -537,7 +538,7 @@ Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
template <typename T>
|
||||
Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
|
||||
std::vector<T> transposedInputData;
|
||||
FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
|
||||
int64_t block_size;
|
||||
int64_t blocks;
|
||||
Tensor* reduced;
|
||||
|
|
|
|||
|
|
@ -811,7 +811,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
|
|||
});
|
||||
}
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
static struct {
|
||||
PyMemAllocatorEx mem;
|
||||
PyMemAllocatorEx raw;
|
||||
|
|
@ -823,7 +823,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
|
|||
m.doc() = "pybind11 stateful interface to ONNX runtime";
|
||||
RegisterExceptions(m);
|
||||
|
||||
#ifdef USE_MIMALLOC
|
||||
#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
|
||||
PyMemAllocatorEx alloc;
|
||||
alloc.malloc = [](void* ctx, size_t size) {
|
||||
ORT_UNUSED_PARAMETER(ctx);
|
||||
|
|
|
|||
|
|
@ -128,7 +128,7 @@ Use the individual flags to only run the specified stages.
|
|||
parser.add_argument("--skip_submodule_sync", action='store_true', help="Don't do a 'git submodule update'. Makes the Update phase faster.")
|
||||
parser.add_argument("--use_vstest", action='store_true', help="Use use_vstest for running unitests.")
|
||||
parser.add_argument("--use_jemalloc", action='store_true', help="Use jemalloc.")
|
||||
parser.add_argument("--use_mimalloc", action='store_true', help="Use mimalloc.")
|
||||
parser.add_argument("--use_mimalloc", default=['none'], choices=['none', 'stl', 'arena', 'all'], help="Use mimalloc.")
|
||||
parser.add_argument("--use_openblas", action='store_true', help="Build with OpenBLAS.")
|
||||
parser.add_argument("--use_dnnl", action='store_true', help="Build with DNNL.")
|
||||
parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.")
|
||||
|
|
@ -306,7 +306,8 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
|
|||
"-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"),
|
||||
"-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""),
|
||||
"-Donnxruntime_USE_JEMALLOC=" + ("ON" if args.use_jemalloc else "OFF"),
|
||||
"-Donnxruntime_USE_MIMALLOC=" + ("ON" if args.use_mimalloc else "OFF"),
|
||||
"-Donnxruntime_USE_MIMALLOC_STL_ALLOCATOR=" + ("ON" if args.use_mimalloc == "stl" or args.use_mimalloc == "all" else "OFF"),
|
||||
"-Donnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR=" + ("ON" if args.use_mimalloc == "arena" or args.use_mimalloc == "all" else "OFF"),
|
||||
"-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"),
|
||||
"-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"),
|
||||
"-Donnxruntime_BUILD_JAVA=" + ("ON" if args.build_java else "OFF"),
|
||||
|
|
|
|||
Loading…
Reference in a new issue