diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index eb4e0841c2..c09029a54b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -611,7 +611,7 @@ else() endif() set(onnxruntime_DELAYLOAD_FLAGS "") if (onnxruntime_USE_JEMALLOC) - if (onnxruntime_USE_MIMALLOC) + if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR) message( FATAL_ERROR "You cannot specify both jemalloc and mimalloc." ) endif() diff --git a/cmake/external/mimalloc b/cmake/external/mimalloc index e2202f6bbe..2d54553b7a 160000 --- a/cmake/external/mimalloc +++ b/cmake/external/mimalloc @@ -1 +1 @@ -Subproject commit e2202f6bbe4e2051014ac20c38b3cf88492e9d2f +Subproject commit 2d54553b7a78c7c35620b827e7e5ab2228ecb495 diff --git a/cmake/external/mimalloc.cmake b/cmake/external/mimalloc.cmake index a7dfd04a04..77ca987b0c 100644 --- a/cmake/external/mimalloc.cmake +++ b/cmake/external/mimalloc.cmake @@ -1,7 +1,11 @@ - set(mimalloc_root_dir ${PROJECT_SOURCE_DIR}/external/mimalloc) -add_definitions(-DUSE_MIMALLOC) # used in ONNXRuntime +if(onnxruntime_USE_MIMALLOC_STL_ALLOCATOR) + add_definitions(-DUSE_MIMALLOC_STL_ALLOCATOR) # used in ONNXRuntime +endif() +if(onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR) + add_definitions(-DUSE_MIMALLOC_ARENA_ALLOCATOR) # used in ONNXRuntime +endif() include_directories(${mimalloc_root_dir}/include) option(MI_OVERRIDE "" OFF) diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake index e688f2d9d7..d33e9e1412 100644 --- a/cmake/onnxruntime_common.cmake +++ b/cmake/onnxruntime_common.cmake @@ -72,7 +72,7 @@ if (onnxruntime_USE_TELEMETRY) set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h") endif() -if (onnxruntime_USE_MIMALLOC) +if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR) if(onnxruntime_USE_CUDA OR onnxruntime_USE_OPENVINO) message(WARNING "Ignoring directive to use 
mimalloc on unimplemented targets") elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU") diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h index ab079b04b9..a0adf066f6 100644 --- a/include/onnxruntime/core/common/common.h +++ b/include/onnxruntime/core/common/common.h @@ -36,7 +36,7 @@ #include "core/common/make_unique.h" #include "core/common/status.h" -#ifdef USE_MIMALLOC +#ifdef USE_MIMALLOC_ARENA_ALLOCATOR #include #endif diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index f3045e9964..91d2401ab1 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -280,7 +280,7 @@ class CPUAllocator : public IDeviceAllocator { std::unique_ptr memory_info_; }; -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) class MiMallocAllocator : public IDeviceAllocator { public: explicit MiMallocAllocator(std::unique_ptr memory_info) { @@ -302,10 +302,10 @@ class MiMallocAllocator : public IDeviceAllocator { #endif -#ifdef USE_MIMALLOC -using TAllocator = MiMallocAllocator; +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) + using TAllocator = MiMallocAllocator; #else -using TAllocator = CPUAllocator; + using TAllocator = CPUAllocator; #endif using AllocatorPtr = std::shared_ptr; diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index f3ada447ae..33fe2e7873 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -32,7 +32,7 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz return ok; } -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) void* MiMallocAllocator::Alloc(size_t size) { return mi_malloc(size); } diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc index 3dec82e3a3..1c585e6a8b 100644 --- 
a/onnxruntime/core/framework/allocatormgr.cc +++ b/onnxruntime/core/framework/allocatormgr.cc @@ -11,7 +11,7 @@ namespace onnxruntime { -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) using TArenaAllocator = MiMallocArena; #else using TArenaAllocator = BFCArena; diff --git a/onnxruntime/core/framework/mimalloc_arena.cc b/onnxruntime/core/framework/mimalloc_arena.cc index e40cd9da5a..fffbe683c6 100644 --- a/onnxruntime/core/framework/mimalloc_arena.cc +++ b/onnxruntime/core/framework/mimalloc_arena.cc @@ -1,4 +1,4 @@ -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) #include "mimalloc.h" #include "core/framework/mimalloc_arena.h" diff --git a/onnxruntime/core/framework/mimalloc_arena.h b/onnxruntime/core/framework/mimalloc_arena.h index 1a06d3a90d..d9c75a29f8 100644 --- a/onnxruntime/core/framework/mimalloc_arena.h +++ b/onnxruntime/core/framework/mimalloc_arena.h @@ -1,4 +1,4 @@ -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) #include "core/common/common.h" #include "core/framework/arena.h" #include "onnxruntime_config.h"
diff --git a/onnxruntime/core/framework/ort_stl_allocator.h b/onnxruntime/core/framework/ort_stl_allocator.h
new file mode 100644
index 0000000000..62c313de94
--- /dev/null
+++ b/onnxruntime/core/framework/ort_stl_allocator.h
@@ -0,0 +1,72 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+#include "core/framework/allocator.h"  // declares AllocatorPtr
+
+namespace onnxruntime {
+
+// An STL wrapper for ORT allocators. This enables overriding the
+// std::allocator used in STL containers for better memory performance.
+//
+// The wrapper is stateful: it holds an AllocatorPtr and forwards every
+// allocate/deallocate call to it. Copies (including rebinding copies to
+// another value type) share the same underlying allocator.
+template <typename T>
+class OrtStlAllocator {
+  template <typename U>
+  friend class OrtStlAllocator;
+  // The comparison operators read allocator_ across specializations, so they
+  // need friend access as well.
+  template <typename T1, typename T2>
+  friend bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept;
+  template <typename T1, typename T2>
+  friend bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept;
+
+  AllocatorPtr allocator_;
+
+ public:
+  typedef T value_type;
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap = std::true_type;
+  // Instances wrapping different allocators are not interchangeable, so the
+  // type must not advertise itself as always-equal.
+  using is_always_equal = std::false_type;
+
+  // Wraps the given ORT allocator.
+  OrtStlAllocator(const AllocatorPtr& a) noexcept : allocator_(a) {}
+  OrtStlAllocator(const OrtStlAllocator& other) noexcept : allocator_(other.allocator_) {}
+  // Rebinding copy: the new instance shares other's underlying allocator.
+  template <typename U>
+  OrtStlAllocator(const OrtStlAllocator<U>& other) noexcept : allocator_(other.allocator_) {}
+
+  // Returns storage for n objects of type T obtained from the wrapped
+  // allocator. The hint is ignored.
+  T* allocate(size_t n, const void* hint = nullptr) {
+    static_cast<void>(hint);
+    return static_cast<T*>(allocator_->Alloc(n * sizeof(T)));
+  }
+
+  // Hands p back to the wrapped allocator. The element count is ignored.
+  void deallocate(T* p, size_t n) {
+    static_cast<void>(n);
+    allocator_->Free(p);
+  }
+};
+
+// Two allocators compare equal iff they wrap the same underlying allocator.
+template <typename T1, typename T2>
+bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
+  return lhs.allocator_ == rhs.allocator_;
+}
+template <typename T1, typename T2>
+bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
+  return lhs.allocator_ != rhs.allocator_;
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/containers.h b/onnxruntime/core/providers/cpu/containers.h new file mode 100644 index 0000000000..7dcdaee1f5 --- /dev/null +++ b/onnxruntime/core/providers/cpu/containers.h @@ -0,0 +1,40 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License.
+
+#pragma once
+
+#include <vector>
+
+#if defined(USE_MIMALLOC_STL_ALLOCATOR)
+#include <mimalloc.h>
+#else
+#include "core/framework/ort_stl_allocator.h"
+#endif
+
+namespace onnxruntime {
+
+// FastAllocVector<T> is a std::vector<T> whose allocator type is selected at build time;
+// GetAllocator<T>(context) returns the allocator instance to construct such a vector with.
+#if defined(USE_MIMALLOC_STL_ALLOCATOR)
+// mimalloc build: the allocator is default-constructed; the kernel context is not consulted.
+template <typename T>
+mi_stl_allocator<T> GetAllocator(const OpKernelContext& context) {
+  ORT_UNUSED_PARAMETER(context);
+  return mi_stl_allocator<T>();
+}
+template <typename T>
+using FastAllocVector = std::vector<T, mi_stl_allocator<T>>;
+#else
+// Default build: wraps the allocator reported by context.GetTempSpaceAllocator() and
+// enforces that the lookup succeeded.
+template <typename T>
+OrtStlAllocator<T> GetAllocator(const OpKernelContext& context) {
+  AllocatorPtr allocator;
+  auto status = context.GetTempSpaceAllocator(&allocator);
+  ORT_ENFORCE(status.IsOK());
+  return OrtStlAllocator<T>(allocator);
+}
+template <typename T>
+using FastAllocVector = std::vector<T, OrtStlAllocator<T>>;
+#endif
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.h b/onnxruntime/core/providers/cpu/cpu_execution_provider.h index 79099e65ba..441e4eb2cf 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h @@ -32,7 +32,7 @@ class CPUExecutionProvider : public IExecutionProvider { std::numeric_limits::max()}; #ifdef USE_JEMALLOC -#if defined(USE_MIMALLOC) +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) || defined(USE_MIMALLOC_STL_ALLOCATOR) #error jemalloc and mimalloc should not both be enabled #endif diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc index cdfab52141..713386600a 100644 --- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc @@ -4,6 +4,7 @@ #include "core/providers/cpu/reduction/reduction_ops.h" #include "core/providers/common.h" #include "core/util/math_cpuonly.h" +#include "core/providers/cpu/containers.h" using namespace std; namespace onnxruntime { @@ -125,7 +126,7 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 11); // size of each reduce.
template bool PrepareForReduce(OpKernelContext* ctx, - std::vector& transposedInputData, + FastAllocVector& transposedInputData, Tensor** reducedTensor, int64_t& block_size, int64_t& blocks, @@ -307,7 +308,7 @@ bool PrepareForReduce(OpKernelContext* ctx, template Status ReduceL1::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -323,7 +324,7 @@ Status ReduceL1::Compute(OpKernelContext* ctx) const { template Status ReduceL2::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -339,7 +340,7 @@ Status ReduceL2::Compute(OpKernelContext* ctx) const { template Status ReduceLogSum::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -359,7 +360,7 @@ Status ReduceLogSum::Compute(OpKernelContext* ctx) const { template Status ReduceLogSumExp::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -383,7 +384,7 @@ Status ReduceLogSumExp::Compute(OpKernelContext* ctx) const { template Status ReduceMax::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -407,7 +408,7 @@ Status ReduceMax::Compute(OpKernelContext* ctx) const { template Status ReduceMean::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -434,7 +435,7 @@ Status ReduceMean::Compute(OpKernelContext* ctx) const { 
template Status ReduceMin::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -458,7 +459,7 @@ Status ReduceMin::Compute(OpKernelContext* ctx) const { template Status ReduceProd::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -474,7 +475,7 @@ Status ReduceProd::Compute(OpKernelContext* ctx) const { template Status ReduceSum::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -501,7 +502,7 @@ Status ReduceSum::Compute(OpKernelContext* ctx) const { template Status ReduceSumSquare::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -517,7 +518,7 @@ Status ReduceSumSquare::Compute(OpKernelContext* ctx) const { template Status ArgMax::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; @@ -537,7 +538,7 @@ Status ArgMax::Compute(OpKernelContext* ctx) const { template Status ArgMin::Compute(OpKernelContext* ctx) const { - std::vector transposedInputData; + FastAllocVector transposedInputData(GetAllocator(*ctx)); int64_t block_size; int64_t blocks; Tensor* reduced; diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 5166bfbc05..d4baca48da 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -811,7 +811,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") 
}); } -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) static struct { PyMemAllocatorEx mem; PyMemAllocatorEx raw; @@ -823,7 +823,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) { m.doc() = "pybind11 stateful interface to ONNX runtime"; RegisterExceptions(m); -#ifdef USE_MIMALLOC +#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) PyMemAllocatorEx alloc; alloc.malloc = [](void* ctx, size_t size) { ORT_UNUSED_PARAMETER(ctx); diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index d3ac8fbb96..f136dd8edd 100755 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -128,7 +128,7 @@ Use the individual flags to only run the specified stages. parser.add_argument("--skip_submodule_sync", action='store_true', help="Don't do a 'git submodule update'. Makes the Update phase faster.") parser.add_argument("--use_vstest", action='store_true', help="Use use_vstest for running unitests.") parser.add_argument("--use_jemalloc", action='store_true', help="Use jemalloc.") - parser.add_argument("--use_mimalloc", action='store_true', help="Use mimalloc.") + parser.add_argument("--use_mimalloc", default='none', choices=['none', 'stl', 'arena', 'all'], help="Use mimalloc.") parser.add_argument("--use_openblas", action='store_true', help="Build with OpenBLAS.") parser.add_argument("--use_dnnl", action='store_true', help="Build with DNNL.") parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.") @@ -306,7 +306,8 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home "-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"), "-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""), "-Donnxruntime_USE_JEMALLOC=" + ("ON" if args.use_jemalloc else "OFF"), - "-Donnxruntime_USE_MIMALLOC=" + ("ON" if args.use_mimalloc else "OFF"), + "-Donnxruntime_USE_MIMALLOC_STL_ALLOCATOR=" + ("ON" if args.use_mimalloc == "stl" or args.use_mimalloc == "all" else "OFF"), +
"-Donnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR=" + ("ON" if args.use_mimalloc == "arena" or args.use_mimalloc == "all" else "OFF"), "-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"), "-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"), "-Donnxruntime_BUILD_JAVA=" + ("ON" if args.build_java else "OFF"),