2022-01-27 07:23:39 +00:00
|
|
|
#include <c10/core/impl/alloc_cpu.h>
|
|
|
|
|
|
|
|
|
|
#include <c10/core/alignment.h>
|
|
|
|
|
#include <c10/util/Flags.h>
|
|
|
|
|
#include <c10/util/Logging.h>
|
2024-10-15 07:47:21 +00:00
|
|
|
#include <c10/util/env.h>
|
2024-11-22 07:02:30 +00:00
|
|
|
#include <c10/util/error.h>
|
2022-01-27 07:23:39 +00:00
|
|
|
#include <c10/util/irange.h>
|
|
|
|
|
#include <c10/util/numa.h>
|
2025-01-30 03:14:27 +00:00
|
|
|
#include <cstring>
|
2022-01-27 07:23:39 +00:00
|
|
|
|
2023-06-27 08:53:23 +00:00
|
|
|
#ifdef USE_MIMALLOC
|
|
|
|
|
#include <mimalloc.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
2023-12-16 18:16:19 +00:00
|
|
|
#ifdef __linux__
|
|
|
|
|
#include <sys/mman.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
2022-01-27 07:23:39 +00:00
|
|
|
// TODO: rename flags to C10
|
|
|
|
|
// Boolean flag, off by default. When it is set, alloc_cpu() in this file
// memset()s every buffer it returns to zero. alloc_cpu() CHECKs that this flag
// and caffe2_cpu_allocator_do_junk_fill are not both set.
C10_DEFINE_bool(
    caffe2_cpu_allocator_do_zero_fill,
    false,
    "If set, do memory zerofilling when allocating on CPU");
|
|
|
|
|
|
|
|
|
|
// Boolean flag, off by default. When it is set (and zero-fill is not),
// alloc_cpu() in this file overwrites every buffer it returns with the
// memset_junk() pattern. alloc_cpu() CHECKs that this flag and
// caffe2_cpu_allocator_do_zero_fill are not both set.
C10_DEFINE_bool(
    caffe2_cpu_allocator_do_junk_fill,
    false,
    "If set, fill memory with deterministic junk when allocating on CPU");
|
|
|
|
|
|
|
|
|
|
namespace c10 {
|
|
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
|
|
// Fill the data memory region of num bytes with a particular garbage pattern.
|
|
|
|
|
// The garbage value is chosen to be NaN if interpreted as floating point value,
|
|
|
|
|
// or a very large integer.
|
|
|
|
|
void memset_junk(void* data, size_t num) {
|
|
|
|
|
// This garbage pattern is NaN when interpreted as floating point values,
|
|
|
|
|
// or as very large integer values.
|
|
|
|
|
static constexpr int32_t kJunkPattern = 0x7fedbeef;
|
|
|
|
|
static constexpr int64_t kJunkPattern64 =
|
|
|
|
|
static_cast<int64_t>(kJunkPattern) << 32 | kJunkPattern;
|
2023-02-01 14:44:13 +00:00
|
|
|
auto int64_count = num / sizeof(kJunkPattern64);
|
|
|
|
|
auto remaining_bytes = num % sizeof(kJunkPattern64);
|
2022-01-27 07:23:39 +00:00
|
|
|
int64_t* data_i64 = reinterpret_cast<int64_t*>(data);
|
|
|
|
|
for (const auto i : c10::irange(int64_count)) {
|
|
|
|
|
data_i64[i] = kJunkPattern64;
|
|
|
|
|
}
|
|
|
|
|
if (remaining_bytes > 0) {
|
|
|
|
|
memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-12-16 18:16:19 +00:00
|
|
|
#if defined(__linux__) && !defined(__ANDROID__)
|
|
|
|
|
static inline bool is_thp_alloc_enabled() {
|
|
|
|
|
static bool value = [&] {
|
2024-10-15 07:47:21 +00:00
|
|
|
auto env = c10::utils::check_env("THP_MEM_ALLOC_ENABLE");
|
|
|
|
|
return env.has_value() ? env.value() : 0;
|
2023-12-16 18:16:19 +00:00
|
|
|
}();
|
|
|
|
|
return value;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline size_t c10_compute_alignment(size_t nbytes) {
|
|
|
|
|
static const auto pagesize = sysconf(_SC_PAGESIZE);
|
|
|
|
|
// for kernels that don't provide page size, default it to 4K
|
|
|
|
|
const size_t thp_alignment = (pagesize < 0 ? gPagesize : pagesize);
|
|
|
|
|
return (is_thp_alloc_enabled() ? thp_alignment : gAlignment);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
inline bool is_thp_alloc(size_t nbytes) {
|
|
|
|
|
// enable thp (transparent huge pages) for larger buffers
|
|
|
|
|
return (is_thp_alloc_enabled() && (nbytes >= gAlloc_threshold_thp));
|
|
|
|
|
}
|
|
|
|
|
#elif !defined(__ANDROID__) && !defined(_MSC_VER)
|
2024-10-19 13:17:43 +00:00
|
|
|
constexpr size_t c10_compute_alignment([[maybe_unused]] size_t nbytes) {
|
2023-12-16 18:16:19 +00:00
|
|
|
return gAlignment;
|
|
|
|
|
}
|
|
|
|
|
|
2024-10-19 13:17:43 +00:00
|
|
|
// Fallback used by builds that are neither Linux, Android nor MSVC: no buffer
// is ever treated as a THP allocation, whatever its size.
constexpr bool is_thp_alloc(size_t /*nbytes*/) {
  return false;
}
|
|
|
|
|
#endif
|
2022-01-27 07:23:39 +00:00
|
|
|
} // namespace
|
|
|
|
|
|
|
|
|
|
void* alloc_cpu(size_t nbytes) {
|
|
|
|
|
if (nbytes == 0) {
|
|
|
|
|
return nullptr;
|
|
|
|
|
}
|
|
|
|
|
// We might have clowny upstream code that tries to alloc a negative number
|
|
|
|
|
// of bytes. Let's catch it early.
|
|
|
|
|
CAFFE_ENFORCE(
|
|
|
|
|
((ptrdiff_t)nbytes) >= 0,
|
|
|
|
|
"alloc_cpu() seems to have been called with negative number: ",
|
|
|
|
|
nbytes);
|
|
|
|
|
|
2024-10-08 19:05:00 +00:00
|
|
|
void* data = nullptr;
|
2022-01-27 07:23:39 +00:00
|
|
|
#ifdef __ANDROID__
|
|
|
|
|
data = memalign(gAlignment, nbytes);
|
2022-06-03 17:38:06 +00:00
|
|
|
CAFFE_ENFORCE(
|
|
|
|
|
data,
|
|
|
|
|
"DefaultCPUAllocator: not enough memory: you tried to allocate ",
|
|
|
|
|
nbytes,
|
|
|
|
|
" bytes.");
|
2022-01-27 07:23:39 +00:00
|
|
|
#elif defined(_MSC_VER)
|
2023-06-27 08:53:23 +00:00
|
|
|
#ifdef USE_MIMALLOC
|
|
|
|
|
data = mi_malloc_aligned(nbytes, gAlignment);
|
|
|
|
|
#else
|
2022-01-27 07:23:39 +00:00
|
|
|
data = _aligned_malloc(nbytes, gAlignment);
|
2023-06-27 08:53:23 +00:00
|
|
|
#endif
|
2022-01-27 07:23:39 +00:00
|
|
|
CAFFE_ENFORCE(
|
|
|
|
|
data,
|
|
|
|
|
"DefaultCPUAllocator: not enough memory: you tried to allocate ",
|
|
|
|
|
nbytes,
|
|
|
|
|
" bytes.");
|
2022-06-03 17:38:06 +00:00
|
|
|
#else
|
2023-12-16 18:16:19 +00:00
|
|
|
int err = posix_memalign(&data, c10_compute_alignment(nbytes), nbytes);
|
2022-06-03 17:38:06 +00:00
|
|
|
CAFFE_ENFORCE(
|
|
|
|
|
err == 0,
|
|
|
|
|
"DefaultCPUAllocator: can't allocate memory: you tried to allocate ",
|
|
|
|
|
nbytes,
|
|
|
|
|
" bytes. Error code ",
|
|
|
|
|
err,
|
|
|
|
|
" (",
|
2024-11-22 07:02:30 +00:00
|
|
|
c10::utils::str_error(err),
|
2022-06-03 17:38:06 +00:00
|
|
|
")");
|
2023-12-16 18:16:19 +00:00
|
|
|
if (is_thp_alloc(nbytes)) {
|
|
|
|
|
#ifdef __linux__
|
|
|
|
|
// MADV_HUGEPAGE advise is available only for linux.
|
|
|
|
|
// general posix compliant systems can check POSIX_MADV_SEQUENTIAL advise.
|
|
|
|
|
int ret = madvise(data, nbytes, MADV_HUGEPAGE);
|
|
|
|
|
if (ret != 0) {
|
2024-11-22 07:02:30 +00:00
|
|
|
TORCH_WARN_ONCE(
|
|
|
|
|
"thp madvise for HUGEPAGE failed with ",
|
|
|
|
|
c10::utils::str_error(errno));
|
2023-12-16 18:16:19 +00:00
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
}
|
2022-06-03 17:38:06 +00:00
|
|
|
#endif
|
2022-01-27 07:23:39 +00:00
|
|
|
|
|
|
|
|
// move data to a thread's NUMA node
|
|
|
|
|
NUMAMove(data, nbytes, GetCurrentNUMANode());
|
|
|
|
|
CHECK(
|
|
|
|
|
!FLAGS_caffe2_cpu_allocator_do_zero_fill ||
|
|
|
|
|
!FLAGS_caffe2_cpu_allocator_do_junk_fill)
|
|
|
|
|
<< "Cannot request both zero-fill and junk-fill at the same time";
|
|
|
|
|
if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
|
|
|
|
|
memset(data, 0, nbytes);
|
|
|
|
|
} else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
|
|
|
|
|
memset_junk(data, nbytes);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return data;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Releases a buffer through the deallocator that pairs with the allocator
// alloc_cpu() selects for the same build configuration: mi_free or
// _aligned_free under MSVC (with or without mimalloc), plain free everywhere
// else.
void free_cpu(void* data) {
#if defined(_MSC_VER) && defined(USE_MIMALLOC)
  mi_free(data);
#elif defined(_MSC_VER)
  _aligned_free(data);
#else
  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
  free(data);
#endif
}
|
|
|
|
|
|
[Windows][cpu] mkl use mimalloc as allocator on Windows (#138419)
We have done a lot of optimization for PyTorch on Windows and have made good progress. However, some models still show a performance gap between PyTorch on Windows and PyTorch on Linux. Ref: https://pytorch.org/blog/performance-boost-windows/#conclusion
From the blog's conclusion, we found that `ResNet50` is a typical case of this.
Let's focus on the `ResNet50`, and collect the profiling log:
```cmd
(nightly) D:\xu_git\dnnl_cb>python test_script_resnet50.py
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
model_inference 3.91% 682.427ms 100.00% 17.448s 17.448s 1
aten::conv2d 0.18% 30.906ms 64.79% 11.305s 2.133ms 5300
aten::convolution 0.45% 78.031ms 64.62% 11.275s 2.127ms 5300
aten::_convolution 0.30% 51.670ms 64.17% 11.196s 2.113ms 5300
aten::mkldnn_convolution 63.58% 11.093s 63.87% 11.145s 2.103ms 5300
aten::batch_norm 0.13% 23.536ms 20.10% 3.506s 661.580us 5300
aten::_batch_norm_impl_index 0.28% 49.486ms 19.96% 3.483s 657.139us 5300
aten::native_batch_norm 19.26% 3.360s 19.64% 3.427s 646.615us 5300
aten::max_pool2d 0.01% 1.038ms 5.84% 1.018s 10.181ms 100
aten::max_pool2d_with_indices 5.83% 1.017s 5.83% 1.017s 10.171ms 100
aten::add_ 3.38% 588.907ms 3.38% 588.907ms 85.349us 6900
aten::relu_ 0.35% 60.358ms 1.67% 292.155ms 59.624us 4900
aten::clamp_min_ 1.33% 231.797ms 1.33% 231.797ms 47.306us 4900
aten::empty 0.46% 80.195ms 0.46% 80.195ms 1.513us 53000
aten::linear 0.01% 927.300us 0.23% 39.353ms 393.532us 100
aten::addmm 0.20% 35.379ms 0.21% 37.016ms 370.155us 100
aten::empty_like 0.12% 20.455ms 0.17% 29.976ms 5.656us 5300
aten::as_strided_ 0.11% 18.830ms 0.11% 18.830ms 3.553us 5300
aten::adaptive_avg_pool2d 0.00% 419.900us 0.08% 14.265ms 142.647us 100
aten::mean 0.01% 1.737ms 0.08% 13.845ms 138.448us 100
aten::sum 0.05% 8.113ms 0.05% 8.648ms 86.479us 100
aten::resize_ 0.03% 5.182ms 0.03% 5.182ms 0.978us 5300
aten::div_ 0.01% 1.445ms 0.02% 3.460ms 34.600us 100
aten::to 0.00% 337.000us 0.01% 2.015ms 20.154us 100
aten::_to_copy 0.01% 977.500us 0.01% 1.678ms 16.784us 100
aten::copy_ 0.01% 1.474ms 0.01% 1.474ms 7.371us 200
aten::t 0.00% 775.900us 0.01% 1.410ms 14.104us 100
aten::flatten 0.00% 420.900us 0.01% 1.311ms 13.106us 100
aten::view 0.01% 889.700us 0.01% 889.700us 8.897us 100
aten::transpose 0.00% 410.700us 0.00% 634.500us 6.345us 100
aten::expand 0.00% 496.800us 0.00% 566.800us 5.668us 100
aten::fill_ 0.00% 534.800us 0.00% 534.800us 5.348us 100
aten::as_strided 0.00% 293.800us 0.00% 293.800us 1.469us 200
aten::empty_strided 0.00% 241.700us 0.00% 241.700us 2.417us 100
aten::resolve_conj 0.00% 54.800us 0.00% 54.800us 0.274us 200
--------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 17.448s
Execution time: 20.02380895614624
```
We found that the kernel consuming most of the CPU time is `aten::mkldnn_convolution`, which is dispatched to `MKLDNN`.
Actually, we had already optimized memory allocation by integrating mimalloc into the PyTorch C10 module. It boosts PyTorch on Windows a lot, but it does not cover the intermediate temporary memory used by `MKL` and `MKLDNN`.
We still have the potential to improve PyTorch Windows performance by optimizing the intermediate temporary memory of `MKL` and `MKLDNN`.
So, I discussed this with the Intel MKL team and got a method to register a high-performance memory allocation API with MKL, which helps MKL improve its memory performance. Please check the online document: https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2023-0/redefining-memory-functions.html
This PR optimizes MKL memory allocation performance on Windows by registering mi_malloc with MKL. PR changes:
1. Add a cmake option, `USE_MIMALLOC_ON_MKL`; it is a sub-option of `USE_MIMALLOC`.
2. Wrap and export the mi_malloc APIs in C10 when `USE_MIMALLOC_ON_MKL` is `ON`.
3. Add MklAllocationHelp.cpp to register the allocation APIs with MKL when `USE_MIMALLOC_ON_MKL` is `ON`.
For `oneDNN`, this is still being tracked in this proposal: https://github.com/oneapi-src/oneDNN/issues/1898
Pull Request resolved: https://github.com/pytorch/pytorch/pull/138419
Approved by: https://github.com/jgong5, https://github.com/ezyang
2024-10-24 05:29:47 +00:00
|
|
|
#ifdef USE_MIMALLOC_ON_MKL
|
|
|
|
|
// Thin forwarding wrappers around the mimalloc API. Each function passes its
// arguments straight to the mimalloc function of the same name (minus the
// `c10_` prefix) and returns its result unchanged. Compiled only when
// USE_MIMALLOC_ON_MKL is defined.
// NOTE(review): presumably these exist so that code outside c10 (the MKL
// allocator registration) can reach mimalloc through c10 -- confirm against
// the header that declares them.
namespace mi_malloc_wrapper {
// Forwards to mi_malloc(size).
void* c10_mi_malloc(size_t size) {
  return mi_malloc(size);
}

// Forwards to mi_calloc(count, size).
void* c10_mi_calloc(size_t count, size_t size) {
  return mi_calloc(count, size);
}

// Forwards to mi_realloc(p, newsize).
void* c10_mi_realloc(void* p, size_t newsize) {
  return mi_realloc(p, newsize);
}

// Forwards to mi_malloc_aligned(size, alignment).
void* c10_mi_malloc_aligned(size_t size, size_t alignment) {
  return mi_malloc_aligned(size, alignment);
}

// Forwards to mi_free(p).
void c10_mi_free(void* p) {
  mi_free(p);
}
} // namespace mi_malloc_wrapper
|
|
|
|
|
#endif
|
2022-01-27 07:23:39 +00:00
|
|
|
} // namespace c10
|