Use a custom allocator for temporary buffers in reduction_ops.cc (#2775)

* port the mimalloc allocator
* hook mimalloc opt into common.h and reduction ops
* repurpose USE_MIMALLOC to only denote subbing in of default allocator with mimalloc and some refactoring
* fix unintended cherry pick diffs
* polish allocator_mimalloc
* explicitly disable mimalloc where it already had been disabled
* update mimalloc to pull in stl allocator
* switch mimalloc stl allocator to use mimalloc library version
* turn mimalloc on by default (only the stl changes are enabled, the python interacting ones are off already and shall remain so)
* move FastAllocVector into cpu specific code
* separate out defines into arena and stl changes
* the rest of the define renames
* bfc arena allocator
* some typos and rename the bfc arena allocator to fit existing class naming conventions
* adjustments in response to comments
* different template instantiations are friends
2026-07-05 04:17:53 +00:00 · 2020-02-22 22:04:30 -08:00 · 2020-02-22 22:04:30 -08:00 · f367fd921c
commit f367fd921c
parent 61ae134469
16 changed files with 129 additions and 32 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -611,7 +611,7 @@ else()
 endif()
 set(onnxruntime_DELAYLOAD_FLAGS "")
 if (onnxruntime_USE_JEMALLOC)
-  if (onnxruntime_USE_MIMALLOC)
+  if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
    message( FATAL_ERROR "You cannot specify both jemalloc and mimalloc." )
  endif()

--- a/cmake/external/mimalloc
+++ b/cmake/external/mimalloc
@ -1 +1 @@
-Subproject commit e2202f6bbe4e2051014ac20c38b3cf88492e9d2f
+Subproject commit 2d54553b7a78c7c35620b827e7e5ab2228ecb495
--- a/cmake/external/mimalloc.cmake
+++ b/cmake/external/mimalloc.cmake
@ -1,7 +1,11 @@
-
 set(mimalloc_root_dir ${PROJECT_SOURCE_DIR}/external/mimalloc)

-add_definitions(-DUSE_MIMALLOC) # used in ONNXRuntime
+if(onnxruntime_USE_MIMALLOC_STL_ALLOCATOR)
+  add_definitions(-DUSE_MIMALLOC_STL_ALLOCATOR) # used in ONNXRuntime
+endif()
+if(onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
+  add_definitions(-DUSE_MIMALLOC_ARENA_ALLOCATOR) # used in ONNXRuntime
+endif()
 include_directories(${mimalloc_root_dir}/include)

 option(MI_OVERRIDE "" OFF)
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@ -72,7 +72,7 @@ if (onnxruntime_USE_TELEMETRY)
  set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
 endif()

-if (onnxruntime_USE_MIMALLOC)
+if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
    if(onnxruntime_USE_CUDA OR onnxruntime_USE_OPENVINO) 
        message(WARNING "Ignoring directive to use mimalloc on unimplemented targets")
    elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@ -36,7 +36,7 @@
 #include "core/common/make_unique.h"
 #include "core/common/status.h"

-#ifdef USE_MIMALLOC
+#ifdef USE_MIMALLOC_ARENA_ALLOCATOR
 #include <mimalloc.h>
 #endif

--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@ -280,7 +280,7 @@ class CPUAllocator : public IDeviceAllocator {
  std::unique_ptr<OrtMemoryInfo> memory_info_;
 };

-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 class MiMallocAllocator : public IDeviceAllocator {
 public:
  explicit MiMallocAllocator(std::unique_ptr<OrtMemoryInfo> memory_info) {
@ -302,10 +302,10 @@ class MiMallocAllocator : public IDeviceAllocator {

 #endif

-#ifdef USE_MIMALLOC
-using TAllocator = MiMallocAllocator;
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
+  using TAllocator = MiMallocAllocator;
 #else
-using TAllocator = CPUAllocator;
+  using TAllocator = CPUAllocator;
 #endif

 using AllocatorPtr = std::shared_ptr<IAllocator>;
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@ -32,7 +32,7 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz
  return ok;
 }

-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 void* MiMallocAllocator::Alloc(size_t size) {
  return mi_malloc(size);
 }
--- a/onnxruntime/core/framework/allocatormgr.cc
+++ b/onnxruntime/core/framework/allocatormgr.cc
@ -11,7 +11,7 @@

 namespace onnxruntime {

-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
  using TArenaAllocator = MiMallocArena;
 #else
  using TArenaAllocator = BFCArena;
--- a/onnxruntime/core/framework/mimalloc_arena.cc
+++ b/onnxruntime/core/framework/mimalloc_arena.cc
@ -1,4 +1,4 @@
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 #include "mimalloc.h"
 #include "core/framework/mimalloc_arena.h"

--- a/onnxruntime/core/framework/mimalloc_arena.h
+++ b/onnxruntime/core/framework/mimalloc_arena.h
@ -1,4 +1,4 @@
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 #include "core/common/common.h"
 #include "core/framework/arena.h"
 #include "onnxruntime_config.h"
--- a/onnxruntime/core/framework/ort_stl_allocator.h
+++ b/onnxruntime/core/framework/ort_stl_allocator.h
@ -0,0 +1,72 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstddef>
+#include <type_traits>
+
+#include "core/framework/allocator.h"
+
+namespace onnxruntime {
+
+// An STL wrapper for ORT allocators. This enables overriding the
+// std::allocator used in STL containers for better memory performance.
+//
+// Instances are stateful: each one holds the AllocatorPtr it forwards to, and
+// two instances compare equal only when they hold the same ORT allocator.
+template <class T>
+class OrtStlAllocator {
+  // Lets the rebinding constructor read allocator_ of other instantiations.
+  template <class U>
+  friend class OrtStlAllocator;
+  // The comparison operators are defined at namespace scope and read the
+  // private allocator_, so they must be friends as well.
+  template <class T1, class T2>
+  friend bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept;
+  template <class T1, class T2>
+  friend bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept;
+
+  AllocatorPtr allocator_;
+
+ public:
+  using value_type = T;
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap = std::true_type;
+  // Must be false_type: memory obtained through one instance may only be
+  // released through an instance that wraps the same ORT allocator.
+  using is_always_equal = std::false_type;
+
+  // Wraps an ORT allocator. Left implicit so an AllocatorPtr can be passed
+  // wherever an OrtStlAllocator is expected.
+  OrtStlAllocator(const AllocatorPtr& a) noexcept : allocator_(a) {}
+  OrtStlAllocator(const OrtStlAllocator& other) noexcept : allocator_(other.allocator_) {}
+  // Rebinding constructor: shares the ORT allocator held by an
+  // OrtStlAllocator of a different value type.
+  template <class U>
+  OrtStlAllocator(const OrtStlAllocator<U>& other) noexcept : allocator_(other.allocator_) {}
+
+  // Requests n * sizeof(T) bytes from the wrapped allocator and returns them
+  // as a T*. The locality hint is ignored.
+  T* allocate(size_t n, const void* /*hint*/ = nullptr) {
+    return static_cast<T*>(allocator_->Alloc(n * sizeof(T)));
+  }
+
+  // Hands p back to the wrapped allocator. The element count is ignored.
+  void deallocate(T* p, size_t /*n*/) {
+    allocator_->Free(p);
+  }
+};
+
+// True when both allocators forward to the same ORT allocator.
+template <class T1, class T2>
+bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
+  return lhs.allocator_ == rhs.allocator_;
+}
+
+// True when the allocators forward to different ORT allocators.
+template <class T1, class T2>
+bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
+  return lhs.allocator_ != rhs.allocator_;
+}
+
+}  // namespace onnxruntime
--- a/onnxruntime/core/providers/cpu/containers.h
+++ b/onnxruntime/core/providers/cpu/containers.h
@ -0,0 +1,40 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#if defined(USE_MIMALLOC_STL_ALLOCATOR)
+#include <mimalloc.h>
+#else
+#include "core/framework/ort_stl_allocator.h"
+#endif
+
+namespace onnxruntime {
+
+#if defined(USE_MIMALLOC_STL_ALLOCATOR)
+
+// Returns a default-constructed mimalloc STL allocator. The kernel context is
+// accepted only so both build configurations expose the same signature; it is
+// not used here.
+template <typename T>
+mi_stl_allocator<T> GetAllocator(const OpKernelContext& /*context*/) {
+  return mi_stl_allocator<T>{};
+}
+
+// std::vector whose storage comes from mi_stl_allocator.
+template <typename T>
+using FastAllocVector = std::vector<T, mi_stl_allocator<T>>;
+
+#else
+
+// Wraps the temp-space allocator obtained from the kernel context in an
+// OrtStlAllocator. ORT_ENFORCE checks that the lookup reported success.
+template <typename T>
+OrtStlAllocator<T> GetAllocator(const OpKernelContext& context) {
+  AllocatorPtr temp_space_allocator;
+  const auto status = context.GetTempSpaceAllocator(&temp_space_allocator);
+  ORT_ENFORCE(status.IsOK());
+  return OrtStlAllocator<T>{temp_space_allocator};
+}
+
+// std::vector whose storage comes from an OrtStlAllocator.
+template <typename T>
+using FastAllocVector = std::vector<T, OrtStlAllocator<T>>;
+
+#endif 
+
+} // namespace onnxruntime
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
@ -32,7 +32,7 @@ class CPUExecutionProvider : public IExecutionProvider {
                                                std::numeric_limits<size_t>::max()};

 #ifdef USE_JEMALLOC
-#if defined(USE_MIMALLOC)
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) || defined(USE_MIMALLOC_STL_ALLOCATOR)
 #error jemalloc and mimalloc should not both be enabled
 #endif

--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@ -4,6 +4,7 @@
 #include "core/providers/cpu/reduction/reduction_ops.h"
 #include "core/providers/common.h"
 #include "core/util/math_cpuonly.h"
+#include "core/providers/cpu/containers.h"
 using namespace std;
 namespace onnxruntime {

@ -125,7 +126,7 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 11);
 //               size of each reduce.
 template <typename T>
 bool PrepareForReduce(OpKernelContext* ctx,
-                      std::vector<T>& transposedInputData,
+                      FastAllocVector<T>& transposedInputData,
                      Tensor** reducedTensor,
                      int64_t& block_size,
                      int64_t& blocks,
@ -307,7 +308,7 @@ bool PrepareForReduce(OpKernelContext* ctx,

 template <typename T>
 Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -323,7 +324,7 @@ Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -339,7 +340,7 @@ Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -359,7 +360,7 @@ Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -383,7 +384,7 @@ Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -407,7 +408,7 @@ Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -434,7 +435,7 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -458,7 +459,7 @@ Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -474,7 +475,7 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -501,7 +502,7 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -517,7 +518,7 @@ Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
@ -537,7 +538,7 @@ Status ArgMax<T>::Compute(OpKernelContext* ctx) const {

 template <typename T>
 Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
  int64_t block_size;
  int64_t blocks;
  Tensor* reduced;
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@ -811,7 +811,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
      });
 }

-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 static struct {
  PyMemAllocatorEx mem;
  PyMemAllocatorEx raw;
@ -823,7 +823,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
  m.doc() = "pybind11 stateful interface to ONNX runtime";
  RegisterExceptions(m);

-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
  PyMemAllocatorEx alloc;
  alloc.malloc = [](void* ctx, size_t size) {
    ORT_UNUSED_PARAMETER(ctx);
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@ -128,7 +128,7 @@ Use the individual flags to only run the specified stages.
    parser.add_argument("--skip_submodule_sync", action='store_true', help="Don't do a 'git submodule update'. Makes the Update phase faster.")
    parser.add_argument("--use_vstest", action='store_true', help="Use use_vstest for running unitests.")
    parser.add_argument("--use_jemalloc", action='store_true', help="Use jemalloc.")
-    parser.add_argument("--use_mimalloc", action='store_true', help="Use mimalloc.")
+    parser.add_argument("--use_mimalloc", default='none', choices=['none', 'stl', 'arena', 'all'], help="Use mimalloc.")
    parser.add_argument("--use_openblas", action='store_true', help="Build with OpenBLAS.")
    parser.add_argument("--use_dnnl", action='store_true', help="Build with DNNL.")
    parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.")
@ -306,7 +306,8 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
                 "-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"),
                 "-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""),
                 "-Donnxruntime_USE_JEMALLOC=" + ("ON" if args.use_jemalloc else "OFF"),
-                 "-Donnxruntime_USE_MIMALLOC=" + ("ON" if args.use_mimalloc else "OFF"),
+                 "-Donnxruntime_USE_MIMALLOC_STL_ALLOCATOR=" + ("ON" if args.use_mimalloc == "stl" or args.use_mimalloc == "all" else "OFF"),
+                 "-Donnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR=" + ("ON" if args.use_mimalloc == "arena" or args.use_mimalloc == "all" else "OFF"),
                 "-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"),
                 "-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"),
                 "-Donnxruntime_BUILD_JAVA=" + ("ON" if args.build_java else "OFF"),