pytorch/c10/core/impl/alloc_cpu.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

194 lines
5.1 KiB
C++
Raw Normal View History

#include <c10/core/impl/alloc_cpu.h>
#include <c10/core/alignment.h>
#include <c10/util/Flags.h>
#include <c10/util/Logging.h>
#include <c10/util/env.h>
#include <c10/util/error.h>
#include <c10/util/irange.h>
#include <c10/util/numa.h>
#include <cstring>
#ifdef USE_MIMALLOC
#include <mimalloc.h>
#endif
#ifdef __linux__
#include <sys/mman.h>
#include <unistd.h>
#endif
// TODO: rename flags to C10
// Debug switches read by alloc_cpu() on every allocation. They are mutually
// exclusive: alloc_cpu() CHECK-fails if both are set at the same time.

// When set, alloc_cpu() memset()s every new buffer to zero before returning.
C10_DEFINE_bool(
caffe2_cpu_allocator_do_zero_fill,
false,
"If set, do memory zerofilling when allocating on CPU");
// When set (and zero-fill is not), alloc_cpu() fills every new buffer with
// the fixed pattern written by memset_junk().
C10_DEFINE_bool(
caffe2_cpu_allocator_do_junk_fill,
false,
"If set, fill memory with deterministic junk when allocating on CPU");
namespace c10 {
namespace {
// Overwrites the `num` bytes starting at `data` with a repeating junk word.
// The 32-bit word 0x7fedbeef decodes as NaN when read as a 32-bit float and
// as a very large value when read as an integer.
//
// `data` is written through an int64_t pointer, so it is expected to be
// suitably aligned for 8-byte stores (alloc_cpu() buffers are).
void memset_junk(void* data, size_t num) {
  // Two copies of the junk word packed into a single 8-byte store unit.
  static constexpr int32_t kJunkWord = 0x7fedbeef;
  static constexpr int64_t kJunkQuad =
      (static_cast<int64_t>(kJunkWord) << 32) | kJunkWord;

  const size_t whole_quads = num / sizeof(kJunkQuad);
  const size_t tail_bytes = num % sizeof(kJunkQuad);

  // Bulk phase: one 8-byte store per complete unit.
  auto* cursor = reinterpret_cast<int64_t*>(data);
  for (int64_t* const stop = cursor + whole_quads; cursor != stop; ++cursor) {
    *cursor = kJunkQuad;
  }

  // Tail phase: copy only the leading bytes of the pattern so nothing past
  // `num` is touched.
  if (tail_bytes != 0) {
    memcpy(cursor, &kJunkQuad, tail_bytes);
  }
}
#if defined(__linux__) && !defined(__ANDROID__)
// Reports whether the THP_MEM_ALLOC_ENABLE environment variable asks for
// transparent-huge-page allocation. The variable is looked up on the first
// call only; the cached answer is returned afterwards. An unset variable
// counts as "disabled".
static inline bool is_thp_alloc_enabled() {
  static const bool enabled = [] {
    // check_env() yields an empty optional when it has no answer for the
    // variable (presumably unset -- confirm against c10/util/env.h).
    const auto setting = c10::utils::check_env("THP_MEM_ALLOC_ENABLE");
    return setting.has_value() && setting.value();
  }();
  return enabled;
}
// Chooses the alignment alloc_cpu() passes to posix_memalign().
//
// With THP allocation enabled (see is_thp_alloc_enabled()) the result is the
// system page size, falling back to gPagesize when sysconf() cannot report a
// usable value; otherwise it is gAlignment.
//
// NOTE(review): `nbytes` is intentionally not consulted, so once THP is
// enabled every allocation -- including ones below gAlloc_threshold_thp that
// never receive madvise(MADV_HUGEPAGE) -- is page-aligned. Confirm this is
// the intended trade-off.
inline size_t c10_compute_alignment([[maybe_unused]] size_t nbytes) {
  // Queried once; sysconf() returns -1 when the value is unavailable.
  static const auto pagesize = sysconf(_SC_PAGESIZE);
  // Treat any non-positive answer as "unknown": a zero alignment would be
  // rejected by posix_memalign().
  const size_t thp_alignment =
      pagesize > 0 ? static_cast<size_t>(pagesize) : gPagesize;
  return is_thp_alloc_enabled() ? thp_alignment : gAlignment;
}
// Decides whether a buffer of `nbytes` should be advised as a transparent
// huge page: THP allocation must be enabled and the buffer must be at least
// gAlloc_threshold_thp bytes.
inline bool is_thp_alloc(size_t nbytes) {
  if (!is_thp_alloc_enabled()) {
    return false;
  }
  // Only larger buffers are worth backing with huge pages.
  return nbytes >= gAlloc_threshold_thp;
}
#elif !defined(__ANDROID__) && !defined(_MSC_VER)
// Fallback for non-Linux, non-Android, non-MSVC builds: the alignment is
// always gAlignment, regardless of the requested size.
constexpr size_t c10_compute_alignment(size_t /*nbytes*/) {
  return gAlignment;
}
// Fallback for non-Linux, non-Android, non-MSVC builds: transparent huge
// pages are never requested, whatever the size.
constexpr bool is_thp_alloc(size_t /*nbytes*/) {
  return false;
}
#endif
} // namespace
// Allocates `nbytes` bytes of aligned CPU memory and returns the pointer.
// The buffer must be released with free_cpu(), which uses the deallocator
// matching the platform branch taken here.
//
// Behavior:
// - nbytes == 0 returns nullptr without allocating.
// - A size that is negative when viewed as ptrdiff_t fails CAFFE_ENFORCE.
// - Allocation failure fails CAFFE_ENFORCE with a message naming the size
//   (presumably by throwing -- confirm against the macro's definition).
// - On the posix_memalign path, buffers for which is_thp_alloc() is true are
//   additionally advised with MADV_HUGEPAGE on Linux; a failed madvise only
//   produces a one-time warning.
// - The buffer is then passed to NUMAMove() for the current NUMA node and
//   optionally zero- or junk-filled according to the
//   caffe2_cpu_allocator_do_* flags.
void* alloc_cpu(size_t nbytes) {
if (nbytes == 0) {
return nullptr;
}
// We might have clowny upstream code that tries to alloc a negative number
// of bytes. Let's catch it early.
CAFFE_ENFORCE(
((ptrdiff_t)nbytes) >= 0,
"alloc_cpu() seems to have been called with negative number: ",
nbytes);
void* data = nullptr;
#ifdef __ANDROID__
// Android: memalign() with the fixed gAlignment.
data = memalign(gAlignment, nbytes);
CAFFE_ENFORCE(
data,
"DefaultCPUAllocator: not enough memory: you tried to allocate ",
nbytes,
" bytes.");
#elif defined(_MSC_VER)
// MSVC: mimalloc's aligned allocator when built with USE_MIMALLOC, otherwise
// the CRT's _aligned_malloc(). free_cpu() mirrors this choice.
#ifdef USE_MIMALLOC
data = mi_malloc_aligned(nbytes, gAlignment);
#else
data = _aligned_malloc(nbytes, gAlignment);
#endif
CAFFE_ENFORCE(
data,
"DefaultCPUAllocator: not enough memory: you tried to allocate ",
nbytes,
" bytes.");
#else
// Everything else: posix_memalign() with the alignment chosen by
// c10_compute_alignment(). posix_memalign() reports failure through its
// return value, which is included in the error message.
int err = posix_memalign(&data, c10_compute_alignment(nbytes), nbytes);
CAFFE_ENFORCE(
err == 0,
"DefaultCPUAllocator: can't allocate memory: you tried to allocate ",
nbytes,
" bytes. Error code ",
err,
" (",
c10::utils::str_error(err),
")");
if (is_thp_alloc(nbytes)) {
#ifdef __linux__
// MADV_HUGEPAGE advise is available only for linux.
// general posix compliant systems can check POSIX_MADV_SEQUENTIAL advise.
int ret = madvise(data, nbytes, MADV_HUGEPAGE);
// Best effort: the allocation itself succeeded, so a failed advise is only
// reported (once) rather than treated as an error.
if (ret != 0) {
TORCH_WARN_ONCE(
"thp madvise for HUGEPAGE failed with ",
c10::utils::str_error(errno));
}
#endif
}
#endif
// move data to a thread's NUMA node
NUMAMove(data, nbytes, GetCurrentNUMANode());
// The two fill modes are mutually exclusive; requesting both is a
// configuration error.
CHECK(
!FLAGS_caffe2_cpu_allocator_do_zero_fill ||
!FLAGS_caffe2_cpu_allocator_do_junk_fill)
<< "Cannot request both zero-fill and junk-fill at the same time";
if (FLAGS_caffe2_cpu_allocator_do_zero_fill) {
memset(data, 0, nbytes);
} else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) {
memset_junk(data, nbytes);
}
return data;
}
// Releases a buffer obtained from alloc_cpu(), using the deallocator that
// pairs with the allocator alloc_cpu() selects on this platform:
//   MSVC + USE_MIMALLOC -> mi_free()
//   MSVC                -> _aligned_free()
//   everything else     -> free()
void free_cpu(void* data) {
#if defined(_MSC_VER) && defined(USE_MIMALLOC)
  mi_free(data);
#elif defined(_MSC_VER)
  _aligned_free(data);
#else
  // NOLINTNEXTLINE(cppcoreguidelines-no-malloc)
  free(data);
#endif
}
[Windows][cpu] mkl use mimalloc as allocator on Windows (#138419) We did a lot of optimization for PyTorch Windows, and we got good progress of it. But still some models have performance gap between PyTorch Windows and PyTorch Linux. Ref: https://pytorch.org/blog/performance-boost-windows/#conclusion From the blog conclusion, we found the `ResNet50` is typical case of it. Let's focus on the `ResNet50`, and collect the profiling log: ```cmd (nightly) D:\xu_git\dnnl_cb>python test_script_resnet50.py --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ model_inference 3.91% 682.427ms 100.00% 17.448s 17.448s 1 aten::conv2d 0.18% 30.906ms 64.79% 11.305s 2.133ms 5300 aten::convolution 0.45% 78.031ms 64.62% 11.275s 2.127ms 5300 aten::_convolution 0.30% 51.670ms 64.17% 11.196s 2.113ms 5300 aten::mkldnn_convolution 63.58% 11.093s 63.87% 11.145s 2.103ms 5300 aten::batch_norm 0.13% 23.536ms 20.10% 3.506s 661.580us 5300 aten::_batch_norm_impl_index 0.28% 49.486ms 19.96% 3.483s 657.139us 5300 aten::native_batch_norm 19.26% 3.360s 19.64% 3.427s 646.615us 5300 aten::max_pool2d 0.01% 1.038ms 5.84% 1.018s 10.181ms 100 aten::max_pool2d_with_indices 5.83% 1.017s 5.83% 1.017s 10.171ms 100 aten::add_ 3.38% 588.907ms 3.38% 588.907ms 85.349us 6900 aten::relu_ 0.35% 60.358ms 1.67% 292.155ms 59.624us 4900 aten::clamp_min_ 1.33% 231.797ms 1.33% 231.797ms 47.306us 4900 aten::empty 0.46% 80.195ms 0.46% 80.195ms 1.513us 53000 aten::linear 0.01% 927.300us 0.23% 39.353ms 393.532us 100 aten::addmm 0.20% 35.379ms 0.21% 37.016ms 370.155us 100 aten::empty_like 0.12% 20.455ms 0.17% 29.976ms 5.656us 5300 aten::as_strided_ 0.11% 18.830ms 0.11% 18.830ms 3.553us 5300 aten::adaptive_avg_pool2d 0.00% 419.900us 0.08% 14.265ms 142.647us 100 aten::mean 0.01% 1.737ms 0.08% 
13.845ms 138.448us 100 aten::sum 0.05% 8.113ms 0.05% 8.648ms 86.479us 100 aten::resize_ 0.03% 5.182ms 0.03% 5.182ms 0.978us 5300 aten::div_ 0.01% 1.445ms 0.02% 3.460ms 34.600us 100 aten::to 0.00% 337.000us 0.01% 2.015ms 20.154us 100 aten::_to_copy 0.01% 977.500us 0.01% 1.678ms 16.784us 100 aten::copy_ 0.01% 1.474ms 0.01% 1.474ms 7.371us 200 aten::t 0.00% 775.900us 0.01% 1.410ms 14.104us 100 aten::flatten 0.00% 420.900us 0.01% 1.311ms 13.106us 100 aten::view 0.01% 889.700us 0.01% 889.700us 8.897us 100 aten::transpose 0.00% 410.700us 0.00% 634.500us 6.345us 100 aten::expand 0.00% 496.800us 0.00% 566.800us 5.668us 100 aten::fill_ 0.00% 534.800us 0.00% 534.800us 5.348us 100 aten::as_strided 0.00% 293.800us 0.00% 293.800us 1.469us 200 aten::empty_strided 0.00% 241.700us 0.00% 241.700us 2.417us 100 aten::resolve_conj 0.00% 54.800us 0.00% 54.800us 0.274us 200 --------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ Self CPU time total: 17.448s Execution time: 20.02380895614624 ``` We found that the kernel consuming the most CPU time is `aten::mkldnn_convolution`, which is dispatched to `MKLDNN`. Actually, we had already optimized memory allocation by integrating mimalloc into the PyTorch C10 module. That gives PyTorch on Windows a large boost, but it does not cover the intermediary temporary memory used by `MKL` and `MKLDNN`. We still have potential to improve PyTorch Windows performance by optimizing `MKL` and `MKLDNN`'s intermediary temporary memory. So I discussed this with the Intel MKL team and obtained a method to register a high-performance memory allocation API with MKL, which helps MKL improve its memory performance. Please check the online document: https://www.intel.com/content/www/us/en/docs/onemkl/developer-guide-windows/2023-0/redefining-memory-functions.html This PR optimizes MKL memory allocation performance on Windows by registering mi_malloc with MKL. PR Changes: 1. Add a CMake option, `USE_MIMALLOC_ON_MKL`; it is a sub-option of `USE_MIMALLOC`. 2. 
Wrap and export mi_malloc APIs in C10, when `USE_MIMALLOC_ON_MKL` is `ON`. 3. Add MklAllocationHelp.cpp to register allocation APIs to MKL, when `USE_MIMALLOC_ON_MKL` is `ON`. For `oneDNN`, this is still being tracked in this proposal: https://github.com/oneapi-src/oneDNN/issues/1898 Pull Request resolved: https://github.com/pytorch/pytorch/pull/138419 Approved by: https://github.com/jgong5, https://github.com/ezyang
2024-10-24 05:29:47 +00:00
#ifdef USE_MIMALLOC_ON_MKL
// Thin c10-named forwarders to the mimalloc allocation API. They are compiled
// only when USE_MIMALLOC_ON_MKL is defined. Each function passes its
// arguments straight through and returns whatever mimalloc returns; no
// additional checking is done here.
namespace mi_malloc_wrapper {
// Forwards to mi_malloc(size).
void* c10_mi_malloc(size_t size) {
return mi_malloc(size);
}
// Forwards to mi_calloc(count, size).
void* c10_mi_calloc(size_t count, size_t size) {
return mi_calloc(count, size);
}
// Forwards to mi_realloc(p, newsize).
void* c10_mi_realloc(void* p, size_t newsize) {
return mi_realloc(p, newsize);
}
// Forwards to mi_malloc_aligned(size, alignment).
void* c10_mi_malloc_aligned(size_t size, size_t alignment) {
return mi_malloc_aligned(size, alignment);
}
// Forwards to mi_free(p).
void c10_mi_free(void* p) {
mi_free(p);
}
} // namespace mi_malloc_wrapper
#endif
} // namespace c10