diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index eb4e0841c2..c09029a54b 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -611,7 +611,7 @@ else()
 endif()
 set(onnxruntime_DELAYLOAD_FLAGS "")
 if (onnxruntime_USE_JEMALLOC)
-  if (onnxruntime_USE_MIMALLOC)
+  if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
     message( FATAL_ERROR "You cannot specify both jemalloc and mimalloc." )
   endif()
 
diff --git a/cmake/external/mimalloc b/cmake/external/mimalloc
index e2202f6bbe..2d54553b7a 160000
--- a/cmake/external/mimalloc
+++ b/cmake/external/mimalloc
@@ -1 +1 @@
-Subproject commit e2202f6bbe4e2051014ac20c38b3cf88492e9d2f
+Subproject commit 2d54553b7a78c7c35620b827e7e5ab2228ecb495
diff --git a/cmake/external/mimalloc.cmake b/cmake/external/mimalloc.cmake
index a7dfd04a04..77ca987b0c 100644
--- a/cmake/external/mimalloc.cmake
+++ b/cmake/external/mimalloc.cmake
@@ -1,7 +1,11 @@
-
 set(mimalloc_root_dir ${PROJECT_SOURCE_DIR}/external/mimalloc)
 
-add_definitions(-DUSE_MIMALLOC) # used in ONNXRuntime
+if(onnxruntime_USE_MIMALLOC_STL_ALLOCATOR)
+  add_definitions(-DUSE_MIMALLOC_STL_ALLOCATOR) # used in ONNXRuntime
+endif()
+if(onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
+  add_definitions(-DUSE_MIMALLOC_ARENA_ALLOCATOR) # used in ONNXRuntime
+endif()
 include_directories(${mimalloc_root_dir}/include)
 
 option(MI_OVERRIDE "" OFF)
diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
index e688f2d9d7..d33e9e1412 100644
--- a/cmake/onnxruntime_common.cmake
+++ b/cmake/onnxruntime_common.cmake
@@ -72,7 +72,7 @@ if (onnxruntime_USE_TELEMETRY)
   set_target_properties(onnxruntime_common PROPERTIES COMPILE_FLAGS "/FI${ONNXRUNTIME_INCLUDE_DIR}/core/platform/windows/TraceLoggingConfigPrivate.h")
 endif()
 
-if (onnxruntime_USE_MIMALLOC)
+if (onnxruntime_USE_MIMALLOC_STL_ALLOCATOR OR onnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR)
     if(onnxruntime_USE_CUDA OR onnxruntime_USE_OPENVINO) 
         message(WARNING "Ignoring directive to use mimalloc on unimplemented targets")
     elseif (${CMAKE_CXX_COMPILER_ID} MATCHES "GNU")
diff --git a/include/onnxruntime/core/common/common.h b/include/onnxruntime/core/common/common.h
index ab079b04b9..a0adf066f6 100644
--- a/include/onnxruntime/core/common/common.h
+++ b/include/onnxruntime/core/common/common.h
@@ -36,7 +36,7 @@
 #include "core/common/make_unique.h"
 #include "core/common/status.h"
 
-#ifdef USE_MIMALLOC
+#ifdef USE_MIMALLOC_ARENA_ALLOCATOR
 #include <mimalloc.h>
 #endif
 
diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h
index f3045e9964..91d2401ab1 100644
--- a/include/onnxruntime/core/framework/allocator.h
+++ b/include/onnxruntime/core/framework/allocator.h
@@ -280,7 +280,7 @@ class CPUAllocator : public IDeviceAllocator {
   std::unique_ptr<OrtMemoryInfo> memory_info_;
 };
 
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 class MiMallocAllocator : public IDeviceAllocator {
  public:
   explicit MiMallocAllocator(std::unique_ptr<OrtMemoryInfo> memory_info) {
@@ -302,10 +302,10 @@ class MiMallocAllocator : public IDeviceAllocator {
 
 #endif
 
-#ifdef USE_MIMALLOC
-using TAllocator = MiMallocAllocator;
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
+  using TAllocator = MiMallocAllocator;
 #else
-using TAllocator = CPUAllocator;
+  using TAllocator = CPUAllocator;
 #endif
 
 using AllocatorPtr = std::shared_ptr<IAllocator>;
diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc
index f3ada447ae..33fe2e7873 100644
--- a/onnxruntime/core/framework/allocator.cc
+++ b/onnxruntime/core/framework/allocator.cc
@@ -32,7 +32,7 @@ bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, siz
   return ok;
 }
 
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 void* MiMallocAllocator::Alloc(size_t size) {
   return mi_malloc(size);
 }
diff --git a/onnxruntime/core/framework/allocatormgr.cc b/onnxruntime/core/framework/allocatormgr.cc
index 3dec82e3a3..1c585e6a8b 100644
--- a/onnxruntime/core/framework/allocatormgr.cc
+++ b/onnxruntime/core/framework/allocatormgr.cc
@@ -11,7 +11,7 @@
 
 namespace onnxruntime {
 
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
   using TArenaAllocator = MiMallocArena;
 #else
   using TArenaAllocator = BFCArena;
diff --git a/onnxruntime/core/framework/mimalloc_arena.cc b/onnxruntime/core/framework/mimalloc_arena.cc
index e40cd9da5a..fffbe683c6 100644
--- a/onnxruntime/core/framework/mimalloc_arena.cc
+++ b/onnxruntime/core/framework/mimalloc_arena.cc
@@ -1,4 +1,4 @@
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 #include "mimalloc.h"
 #include "core/framework/mimalloc_arena.h"
 
diff --git a/onnxruntime/core/framework/mimalloc_arena.h b/onnxruntime/core/framework/mimalloc_arena.h
index 1a06d3a90d..d9c75a29f8 100644
--- a/onnxruntime/core/framework/mimalloc_arena.h
+++ b/onnxruntime/core/framework/mimalloc_arena.h
@@ -1,4 +1,4 @@
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 #include "core/common/common.h"
 #include "core/framework/arena.h"
 #include "onnxruntime_config.h"
diff --git a/onnxruntime/core/framework/ort_stl_allocator.h b/onnxruntime/core/framework/ort_stl_allocator.h
new file mode 100644
index 0000000000..62c313de94
--- /dev/null
+++ b/onnxruntime/core/framework/ort_stl_allocator.h
@@ -0,0 +1,82 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstddef>
+#include <limits>
+#include <new>
+#include <type_traits>
+
+#include "core/common/common.h"
+#include "core/framework/allocator.h"
+
+namespace onnxruntime {
+
+// An STL-compatible adapter over an ORT allocator (AllocatorPtr). Using it as the
+// allocator of an STL container routes that container's storage through the
+// wrapped allocator's Alloc/Free instead of std::allocator.
+//
+// The adapter is stateful: two instances compare equal only when they wrap the
+// same underlying allocator, so is_always_equal is false.
+template <class T>
+class OrtStlAllocator {
+  // The converting constructor reads allocator_ of other specializations.
+  template <class U>
+  friend class OrtStlAllocator;
+
+  // The comparison operators are free templates and read allocator_ of both operands.
+  template <class T1, class T2>
+  friend bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept;
+  template <class T1, class T2>
+  friend bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept;
+
+  AllocatorPtr allocator_;
+
+ public:
+  using value_type = T;
+  using propagate_on_container_copy_assignment = std::true_type;
+  using propagate_on_container_move_assignment = std::true_type;
+  using propagate_on_container_swap = std::true_type;
+  using is_always_equal = std::false_type;
+
+  // Wraps `a`. Deliberately implicit so an AllocatorPtr converts to the adapter.
+  OrtStlAllocator(const AllocatorPtr& a) noexcept : allocator_(a) {}
+
+  // Copying shares the wrapped allocator. Copy operations are declared explicitly so
+  // that no move operations are generated: a "move" copies and leaves the source intact.
+  OrtStlAllocator(const OrtStlAllocator& other) noexcept = default;
+  OrtStlAllocator& operator=(const OrtStlAllocator& other) noexcept = default;
+
+  // Rebinding conversion from an adapter for a different element type.
+  template <class U>
+  OrtStlAllocator(const OrtStlAllocator<U>& other) noexcept : allocator_(other.allocator_) {}
+
+  // Returns storage for `n` objects of type T obtained from the wrapped allocator.
+  // `hint` is ignored. Throws std::bad_alloc if n * sizeof(T) would overflow size_t.
+  T* allocate(size_t n, const void* hint = nullptr) {
+    ORT_UNUSED_PARAMETER(hint);
+    if (n > std::numeric_limits<size_t>::max() / sizeof(T)) {
+      throw std::bad_alloc();
+    }
+    return static_cast<T*>(allocator_->Alloc(n * sizeof(T)));
+  }
+
+  // Returns `p` to the wrapped allocator. `n` is ignored.
+  void deallocate(T* p, size_t n) {
+    ORT_UNUSED_PARAMETER(n);
+    allocator_->Free(p);
+  }
+};
+
+// Two adapters are equal when they wrap the same underlying allocator object.
+template <class T1, class T2>
+bool operator==(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
+  return lhs.allocator_ == rhs.allocator_;
+}
+template <class T1, class T2>
+bool operator!=(const OrtStlAllocator<T1>& lhs, const OrtStlAllocator<T2>& rhs) noexcept {
+  return lhs.allocator_ != rhs.allocator_;
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/containers.h b/onnxruntime/core/providers/cpu/containers.h
new file mode 100644
index 0000000000..7dcdaee1f5
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/containers.h
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <vector>
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+
+#if defined(USE_MIMALLOC_STL_ALLOCATOR)
+#include <mimalloc.h>
+#else
+#include "core/framework/ort_stl_allocator.h"
+#endif
+
+namespace onnxruntime {
+
+#if defined(USE_MIMALLOC_STL_ALLOCATOR)
+
+// Returns a mimalloc-backed STL allocator. `context` is unused in this configuration;
+// the parameter exists so both configurations share one signature.
+template <typename T>
+mi_stl_allocator<T> GetAllocator(const OpKernelContext& context) {
+  ORT_UNUSED_PARAMETER(context);
+  return mi_stl_allocator<T>();
+}
+
+// std::vector whose storage comes from mimalloc.
+template <typename T>
+using FastAllocVector = std::vector<T, mi_stl_allocator<T>>;
+
+#else
+
+// Returns an STL allocator wrapping the temp-space allocator of `context`.
+// Fails via ORT_ENFORCE if the context cannot provide that allocator.
+template <typename T>
+OrtStlAllocator<T> GetAllocator(const OpKernelContext& context) {
+  AllocatorPtr allocator;
+  auto status = context.GetTempSpaceAllocator(&allocator);
+  ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
+  return OrtStlAllocator<T>(allocator);
+}
+
+// std::vector whose storage comes from the kernel context's temp-space allocator.
+template <typename T>
+using FastAllocVector = std::vector<T, OrtStlAllocator<T>>;
+
+#endif
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.h b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
index 79099e65ba..441e4eb2cf 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.h
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.h
@@ -32,7 +32,7 @@ class CPUExecutionProvider : public IExecutionProvider {
                                                 std::numeric_limits<size_t>::max()};
 
 #ifdef USE_JEMALLOC
-#if defined(USE_MIMALLOC)
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR) || defined(USE_MIMALLOC_STL_ALLOCATOR)
 #error jemalloc and mimalloc should not both be enabled
 #endif
 
diff --git a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
index cdfab52141..713386600a 100644
--- a/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/cpu/reduction/reduction_ops.cc
@@ -4,6 +4,7 @@
 #include "core/providers/cpu/reduction/reduction_ops.h"
 #include "core/providers/common.h"
 #include "core/util/math_cpuonly.h"
+#include "core/providers/cpu/containers.h"
 using namespace std;
 namespace onnxruntime {
 
@@ -125,7 +126,7 @@ REGISTER_UNARY_ELEMENTWISE_KERNEL(ArgMin, 11);
 //               size of each reduce.
 template <typename T>
 bool PrepareForReduce(OpKernelContext* ctx,
-                      std::vector<T>& transposedInputData,
+                      FastAllocVector<T>& transposedInputData,
                       Tensor** reducedTensor,
                       int64_t& block_size,
                       int64_t& blocks,
@@ -307,7 +308,7 @@ bool PrepareForReduce(OpKernelContext* ctx,
 
 template <typename T>
 Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -323,7 +324,7 @@ Status ReduceL1<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -339,7 +340,7 @@ Status ReduceL2<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -359,7 +360,7 @@ Status ReduceLogSum<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -383,7 +384,7 @@ Status ReduceLogSumExp<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -407,7 +408,7 @@ Status ReduceMax<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -434,7 +435,7 @@ Status ReduceMean<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -458,7 +459,7 @@ Status ReduceMin<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -474,7 +475,7 @@ Status ReduceProd<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -501,7 +502,7 @@ Status ReduceSum<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -517,7 +518,7 @@ Status ReduceSumSquare<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
@@ -537,7 +538,7 @@ Status ArgMax<T>::Compute(OpKernelContext* ctx) const {
 
 template <typename T>
 Status ArgMin<T>::Compute(OpKernelContext* ctx) const {
-  std::vector<T> transposedInputData;
+  FastAllocVector<T> transposedInputData(GetAllocator<T>(*ctx));
   int64_t block_size;
   int64_t blocks;
   Tensor* reduced;
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index 5166bfbc05..d4baca48da 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -811,7 +811,7 @@ including arg name, arg type (contains both type and shape).)pbdoc")
       });
 }
 
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
 static struct {
   PyMemAllocatorEx mem;
   PyMemAllocatorEx raw;
@@ -823,7 +823,7 @@ PYBIND11_MODULE(onnxruntime_pybind11_state, m) {
   m.doc() = "pybind11 stateful interface to ONNX runtime";
   RegisterExceptions(m);
 
-#ifdef USE_MIMALLOC
+#if defined(USE_MIMALLOC_ARENA_ALLOCATOR)
   PyMemAllocatorEx alloc;
   alloc.malloc = [](void* ctx, size_t size) {
     ORT_UNUSED_PARAMETER(ctx);
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index d3ac8fbb96..f136dd8edd 100755
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -128,7 +128,7 @@ Use the individual flags to only run the specified stages.
     parser.add_argument("--skip_submodule_sync", action='store_true', help="Don't do a 'git submodule update'. Makes the Update phase faster.")
     parser.add_argument("--use_vstest", action='store_true', help="Use use_vstest for running unitests.")
     parser.add_argument("--use_jemalloc", action='store_true', help="Use jemalloc.")
-    parser.add_argument("--use_mimalloc", action='store_true', help="Use mimalloc.")
+    parser.add_argument("--use_mimalloc", default='none', choices=['none', 'stl', 'arena', 'all'], help="Use mimalloc.")  # default is a str: the value is compared with == against 'stl'/'arena'/'all'
     parser.add_argument("--use_openblas", action='store_true', help="Build with OpenBLAS.")
     parser.add_argument("--use_dnnl", action='store_true', help="Build with DNNL.")
     parser.add_argument("--use_mklml", action='store_true', help="Build with MKLML.")
@@ -306,7 +306,8 @@ def generate_build_tree(cmake_path, source_dir, build_dir, cuda_home, cudnn_home
                  "-Donnxruntime_USE_FEATURIZERS=" + ("ON" if args.use_featurizers else "OFF"),
                  "-Donnxruntime_CUDA_HOME=" + (cuda_home if args.use_cuda else ""),
                  "-Donnxruntime_USE_JEMALLOC=" + ("ON" if args.use_jemalloc else "OFF"),
-                 "-Donnxruntime_USE_MIMALLOC=" + ("ON" if args.use_mimalloc else "OFF"),
+                 "-Donnxruntime_USE_MIMALLOC_STL_ALLOCATOR=" + ("ON" if args.use_mimalloc == "stl" or args.use_mimalloc == "all" else "OFF"),
+                 "-Donnxruntime_USE_MIMALLOC_ARENA_ALLOCATOR=" + ("ON" if args.use_mimalloc == "arena" or args.use_mimalloc == "all" else "OFF"),
                  "-Donnxruntime_ENABLE_PYTHON=" + ("ON" if args.enable_pybind else "OFF"),
                  "-Donnxruntime_BUILD_CSHARP=" + ("ON" if args.build_csharp else "OFF"),
                  "-Donnxruntime_BUILD_JAVA=" + ("ON" if args.build_java else "OFF"),