diff --git a/c10/BUILD.bazel b/c10/BUILD.bazel index dce57475fa5..37ab56a1e76 100644 --- a/c10/BUILD.bazel +++ b/c10/BUILD.bazel @@ -77,6 +77,7 @@ cc_library( deps = [ ":headers", "//c10/core:ScalarType", + "//c10/core:alloc_cpu", "//c10/core:base", "//c10/util:TypeCast", "//c10/util:base", diff --git a/c10/core/CPUAllocator.cpp b/c10/core/CPUAllocator.cpp index 6e1faf08b44..88df9b72069 100644 --- a/c10/core/CPUAllocator.cpp +++ b/c10/core/CPUAllocator.cpp @@ -1,106 +1,18 @@ #include #include #include +#include #include #include -#include -// TODO: rename flags to C10 +// TODO: rename flag to C10 C10_DEFINE_bool( caffe2_report_cpu_memory_usage, false, "If set, print out detailed memory usage"); -C10_DEFINE_bool( - caffe2_cpu_allocator_do_zero_fill, - false, - "If set, do memory zerofilling when allocating on CPU"); - -C10_DEFINE_bool( - caffe2_cpu_allocator_do_junk_fill, - false, - "If set, fill memory with deterministic junk when allocating on CPU"); - namespace c10 { -void memset_junk(void* data, size_t num) { - // This garbage pattern is NaN when interpreted as floating point values, - // or as very large integer values. - static constexpr int32_t kJunkPattern = 0x7fedbeef; - static constexpr int64_t kJunkPattern64 = - static_cast(kJunkPattern) << 32 | kJunkPattern; - int32_t int64_count = num / sizeof(kJunkPattern64); - int32_t remaining_bytes = num % sizeof(kJunkPattern64); - int64_t* data_i64 = reinterpret_cast(data); - for (const auto i : c10::irange(int64_count)) { - data_i64[i] = kJunkPattern64; - } - if (remaining_bytes > 0) { - memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes); - } -} - -void* alloc_cpu(size_t nbytes) { - if (nbytes == 0) { - return nullptr; - } - // We might have clowny upstream code that tries to alloc a negative number - // of bytes. Let's catch it early. - CAFFE_ENFORCE( - ((ptrdiff_t)nbytes) >= 0, - "alloc_cpu() seems to have been called with negative number: ", - nbytes); - - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - void* data; -#ifdef __ANDROID__ - data = memalign(gAlignment, nbytes); -#elif defined(_MSC_VER) - data = _aligned_malloc(nbytes, gAlignment); -#else - int err = posix_memalign(&data, gAlignment, nbytes); - if (err != 0) { - CAFFE_THROW( - "DefaultCPUAllocator: can't allocate memory: you tried to allocate ", - nbytes, - " bytes. Error code ", - err, - " (", - strerror(err), - ")"); - } -#endif - - CAFFE_ENFORCE( - data, - "DefaultCPUAllocator: not enough memory: you tried to allocate ", - nbytes, - " bytes."); - - // move data to a thread's NUMA node - NUMAMove(data, nbytes, GetCurrentNUMANode()); - CHECK( - !FLAGS_caffe2_cpu_allocator_do_zero_fill || - !FLAGS_caffe2_cpu_allocator_do_junk_fill) - << "Cannot request both zero-fill and junk-fill at the same time"; - if (FLAGS_caffe2_cpu_allocator_do_zero_fill) { - memset(data, 0, nbytes); - } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) { - memset_junk(data, nbytes); - } - - return data; -} - -void free_cpu(void* data) { -#ifdef _MSC_VER - _aligned_free(data); -#else - // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) - free(data); -#endif -} - struct C10_API DefaultCPUAllocator final : at::Allocator { DefaultCPUAllocator() = default; at::DataPtr allocate(size_t nbytes) const override { diff --git a/c10/core/CPUAllocator.h b/c10/core/CPUAllocator.h index 56e25ffd2ef..bf94097417a 100644 --- a/c10/core/CPUAllocator.h +++ b/c10/core/CPUAllocator.h @@ -6,12 +6,9 @@ #include #include // legacy, update dependents to include this directly #include -#include // TODO: rename to c10 C10_DECLARE_bool(caffe2_report_cpu_memory_usage); -C10_DECLARE_bool(caffe2_cpu_allocator_do_zero_fill); -C10_DECLARE_bool(caffe2_cpu_allocator_do_junk_fill); namespace c10 { @@ -20,14 +17,6 @@ using MemoryDeleter = void (*)(void*); // A helper function that is basically doing nothing. C10_API void NoDelete(void*); -// Fill the data memory region of num bytes with a particular garbage pattern. -// The garbage value is chosen to be NaN if interpreted as floating point value, -// or a very large integer. -C10_API void memset_junk(void* data, size_t num); - -C10_API void* alloc_cpu(size_t nbytes); -C10_API void free_cpu(void* data); - // A simple struct that is used to report C10's memory allocation and // deallocation status to the profiler class C10_API ProfiledCPUMemoryReporter { diff --git a/c10/core/build.bzl b/c10/core/build.bzl index 1fc096778df..478d07a9c3e 100644 --- a/c10/core/build.bzl +++ b/c10/core/build.bzl @@ -16,6 +16,23 @@ def define_targets(rules): visibility = ["//visibility:public"], ) + rules.cc_library( + name = "alloc_cpu", + srcs = ["impl/alloc_cpu.cpp"], + hdrs = ["impl/alloc_cpu.h"], + # This library defines flags, The use of alwayslink keeps them + # from being stripped. + alwayslink = True, + linkstatic = True, + local_defines = ["C10_BUILD_MAIN_LIB"], + visibility = ["//visibility:public"], + deps = [ + ":alignment", + "//c10/macros", + "//c10/util:base", + ], + ) + rules.cc_library( name = "base", srcs = rules.glob( @@ -25,6 +42,7 @@ def define_targets(rules): ], exclude = [ "CPUAllocator.cpp", + "impl/alloc_cpu.cpp", ], ), hdrs = rules.glob( @@ -34,6 +52,7 @@ def define_targets(rules): ], exclude = [ "CPUAllocator.h", + "impl/alloc_cpu.h", ], ), # This library uses flags and registration. Do not let the diff --git a/c10/core/impl/alloc_cpu.cpp b/c10/core/impl/alloc_cpu.cpp new file mode 100644 index 00000000000..2570316f42c --- /dev/null +++ b/c10/core/impl/alloc_cpu.cpp @@ -0,0 +1,107 @@ +#include + +#include +#include +#include +#include +#include + +// TODO: rename flags to C10 +C10_DEFINE_bool( + caffe2_cpu_allocator_do_zero_fill, + false, + "If set, do memory zerofilling when allocating on CPU"); + +C10_DEFINE_bool( + caffe2_cpu_allocator_do_junk_fill, + false, + "If set, fill memory with deterministic junk when allocating on CPU"); + +namespace c10 { + +namespace { + +// Fill the data memory region of num bytes with a particular garbage pattern. +// The garbage value is chosen to be NaN if interpreted as floating point value, +// or a very large integer. +void memset_junk(void* data, size_t num) { + // This garbage pattern is NaN when interpreted as floating point values, + // or as very large integer values. + static constexpr int32_t kJunkPattern = 0x7fedbeef; + static constexpr int64_t kJunkPattern64 = + static_cast(kJunkPattern) << 32 | kJunkPattern; + int32_t int64_count = num / sizeof(kJunkPattern64); + int32_t remaining_bytes = num % sizeof(kJunkPattern64); + int64_t* data_i64 = reinterpret_cast(data); + for (const auto i : c10::irange(int64_count)) { + data_i64[i] = kJunkPattern64; + } + if (remaining_bytes > 0) { + memcpy(data_i64 + int64_count, &kJunkPattern64, remaining_bytes); + } +} + +} // namespace + +void* alloc_cpu(size_t nbytes) { + if (nbytes == 0) { + return nullptr; + } + // We might have clowny upstream code that tries to alloc a negative number + // of bytes. Let's catch it early. + CAFFE_ENFORCE( + ((ptrdiff_t)nbytes) >= 0, + "alloc_cpu() seems to have been called with negative number: ", + nbytes); + + // NOLINTNEXTLINE(cppcoreguidelines-init-variables) + void* data; +#ifdef __ANDROID__ + data = memalign(gAlignment, nbytes); +#elif defined(_MSC_VER) + data = _aligned_malloc(nbytes, gAlignment); +#else + int err = posix_memalign(&data, gAlignment, nbytes); + if (err != 0) { + CAFFE_THROW( + "DefaultCPUAllocator: can't allocate memory: you tried to allocate ", + nbytes, + " bytes. Error code ", + err, + " (", + strerror(err), + ")"); + } +#endif + + CAFFE_ENFORCE( + data, + "DefaultCPUAllocator: not enough memory: you tried to allocate ", + nbytes, + " bytes."); + + // move data to a thread's NUMA node + NUMAMove(data, nbytes, GetCurrentNUMANode()); + CHECK( + !FLAGS_caffe2_cpu_allocator_do_zero_fill || + !FLAGS_caffe2_cpu_allocator_do_junk_fill) + << "Cannot request both zero-fill and junk-fill at the same time"; + if (FLAGS_caffe2_cpu_allocator_do_zero_fill) { + memset(data, 0, nbytes); + } else if (FLAGS_caffe2_cpu_allocator_do_junk_fill) { + memset_junk(data, nbytes); + } + + return data; +} + +void free_cpu(void* data) { +#ifdef _MSC_VER + _aligned_free(data); +#else + // NOLINTNEXTLINE(cppcoreguidelines-no-malloc) + free(data); +#endif +} + +} // namespace c10 diff --git a/c10/core/impl/alloc_cpu.h b/c10/core/impl/alloc_cpu.h new file mode 100644 index 00000000000..dc0f97f0f3c --- /dev/null +++ b/c10/core/impl/alloc_cpu.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +#include + +namespace c10 { + +C10_API void* alloc_cpu(size_t nbytes); +C10_API void free_cpu(void* data); + +} // namespace c10 diff --git a/c10/mobile/CPUCachingAllocator.cpp b/c10/mobile/CPUCachingAllocator.cpp index 43a721ad240..683cfe14553 100644 --- a/c10/mobile/CPUCachingAllocator.cpp +++ b/c10/mobile/CPUCachingAllocator.cpp @@ -1,5 +1,7 @@ #include +#include + namespace c10 { namespace { diff --git a/c10/mobile/CPUCachingAllocator.h b/c10/mobile/CPUCachingAllocator.h index d5f4303e1b3..3f78c63a3fe 100644 --- a/c10/mobile/CPUCachingAllocator.h +++ b/c10/mobile/CPUCachingAllocator.h @@ -5,7 +5,6 @@ #include #include -#include #include #include #include diff --git a/c10/mobile/CPUProfilingAllocator.cpp b/c10/mobile/CPUProfilingAllocator.cpp index 1861295c10e..7dfa433bbc7 100644 --- a/c10/mobile/CPUProfilingAllocator.cpp +++ b/c10/mobile/CPUProfilingAllocator.cpp @@ -1,8 +1,12 @@ #include +#include #include #include +#include +#include + namespace c10 { namespace { diff --git a/c10/mobile/CPUProfilingAllocator.h b/c10/mobile/CPUProfilingAllocator.h index bb080d9df97..798647e4202 100644 --- a/c10/mobile/CPUProfilingAllocator.h +++ b/c10/mobile/CPUProfilingAllocator.h @@ -5,7 +5,6 @@ #include #include -#include #include #include #include