2022-07-20 16:51:39 +00:00
|
|
|
#include <c10/core/Allocator.h>
|
2019-02-13 05:13:25 +00:00
|
|
|
#include <c10/core/CPUAllocator.h>
|
|
|
|
|
#include <c10/core/DeviceType.h>
|
2022-01-27 07:23:39 +00:00
|
|
|
#include <c10/core/alignment.h>
|
2022-01-27 07:23:39 +00:00
|
|
|
#include <c10/core/impl/alloc_cpu.h>
|
2020-09-29 18:31:16 +00:00
|
|
|
#include <c10/mobile/CPUCachingAllocator.h>
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
#include <c10/mobile/CPUProfilingAllocator.h>
|
2023-02-04 02:15:50 +00:00
|
|
|
#include <c10/util/Logging.h>
|
2019-02-13 05:13:25 +00:00
|
|
|
|
2022-01-27 07:23:39 +00:00
|
|
|
// TODO: rename flag to C10
// Command-line flag: when true, detailed per-allocation CPU memory usage is
// printed (presumably consumed by ProfiledCPUMemoryReporter, whose definition
// is not in this file — confirm against its implementation).
C10_DEFINE_bool(
    caffe2_report_cpu_memory_usage,
    false,
    "If set, print out detailed memory usage");
|
|
|
|
|
|
|
|
|
|
namespace c10 {
|
|
|
|
|
|
|
|
|
|
// Default allocator for CPU memory: routes allocations through
// c10::alloc_cpu / free_cpu and reports every allocation, free, and
// out-of-memory event to the profiling memory reporter.
struct C10_API DefaultCPUAllocator final : at::Allocator {
  DefaultCPUAllocator() = default;

  // Allocates `nbytes` of CPU memory. On failure, reports the OOM to the
  // profiler and propagates the exception; on success, reports the new
  // allocation before returning it.
  at::DataPtr allocate(size_t nbytes) override {
    void* data = nullptr;
    try {
      data = c10::alloc_cpu(nbytes);
    } catch (const c10::Error&) {
      profiledCPUMemoryReporter().OutOfMemory(nbytes);
      // Bare `throw;` rethrows the original exception object; the previous
      // `throw e;` threw a copy, which would also slice any derived type.
      throw;
    }
    profiledCPUMemoryReporter().New(data, nbytes);
    return {data, data, &ReportAndDelete, at::Device(at::DeviceType::CPU)};
  }

  // Deleter installed into the returned DataPtr: reports the free to the
  // profiler, then releases the memory. Null pointers are ignored.
  static void ReportAndDelete(void* ptr) {
    if (!ptr) {
      return;
    }
    profiledCPUMemoryReporter().Delete(ptr);
    free_cpu(ptr);
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &ReportAndDelete;
  }

  void copy_data(void* dest, const void* src, std::size_t count) const final {
    default_copy_data(dest, src, count);
  }
};
|
|
|
|
|
|
Memory profiling (#37775)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37775
Adding memory usage into profiler table output
Test Plan:
BUILD_BINARY=1 USE_BLAS=MKL USE_MKLDNN=0 USE_CUDA=0 python setup.py
develop install --cmake
```
import torch
import torchvision.models as models
model = models.resnet18()
inp = torch.randn(5, 3, 224, 224)
with torch.autograd.profiler.profile(profile_memory=True, record_shapes=True) as prof:
model(inp)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_memory_usage", row_limit=15))
```
```
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CPU Mem Total Number of Calls Input Shapes
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
resize_ 0.37% 577.936us 0.37% 577.936us 9.796us 339.03 Mb 59 [[0]]
empty 0.69% 1.061ms 0.74% 1.139ms 5.556us 47.42 Mb 205 []
stride 0.00% 0.853us 0.00% 0.853us 0.853us 19.53 Kb 1 [[5, 1000]]
empty_strided 0.01% 21.393us 0.02% 26.033us 5.207us 252 b 5 []
is_complex 0.02% 37.425us 0.02% 37.425us 1.291us 208 b 29 [[]]
masked_select 0.04% 55.333us 0.06% 93.616us 46.808us 120 b 2 [[30], [30]]
conv2d 0.01% 18.009us 9.62% 14.902ms 14.902ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
convolution 0.01% 12.436us 9.61% 14.884ms 14.884ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_convolution 0.03% 52.381us 9.60% 14.871ms 14.871ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
size 0.00% 5.429us 0.00% 5.429us 0.339us 0 b 16 [[5, 3, 224, 224]]
contiguous 0.00% 1.934us 0.00% 1.934us 0.967us 0 b 2 [[5, 3, 224, 224]]
_convolution_nogroup 0.02% 27.505us 9.57% 14.814ms 14.814ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_nnpack_available 0.02% 34.267us 0.02% 34.267us 1.713us 0 b 20 []
thnn_conv2d 0.01% 13.274us 9.54% 14.771ms 14.771ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
thnn_conv2d_forward 5.98% 9.264ms 19.02% 29.446ms 14.723ms 0 b 2 [[5, 3, 224, 224], [64, 3, 7, 7], [
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Self CPU time total: 154.855ms
```
Reviewed By: ngimel
Differential Revision: D21384248
Pulled By: ilia-cher
fbshipit-source-id: 31359cce2aa06f6255ed1ad8c60d03cb640bfec3
2020-05-19 22:46:56 +00:00
|
|
|
// Accessor for the process-wide profiling memory reporter shared by the CPU
// allocators in this file.
ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() {
  // Function-local static (Meyers singleton): constructed on first use with
  // thread-safe initialization, and lives for the remainder of the process.
  static ProfiledCPUMemoryReporter instance;
  return instance;
}
|
|
|
|
|
|
2020-04-23 18:00:28 +00:00
|
|
|
// QNNPACK AND XNNPACK may out-of-bound access the input and / or output
|
|
|
|
|
// tensors. This is by-design, and chosen to make the implementation of
|
|
|
|
|
// micro-kernels both simpler and faster as a result of not having to
|
|
|
|
|
// individually handle the corner cases where the number of processed elements
|
|
|
|
|
// is not a multiple of SIMD register width. This behavior will trigger ASAN
|
|
|
|
|
// though, and may result in a segfault if the accessed memory location just so
|
|
|
|
|
// happens to fall on a page the current process has no read access to. Here we
|
|
|
|
|
// define a custom allocator that allocates the extra storage required to keep
|
|
|
|
|
// this behavior safe. This allocator could have been restricted to QNNPACK and
|
|
|
|
|
// XNNPACK only, but that would have negative performance ramifications, as
|
|
|
|
|
// input tensors must now be reallocated, and copied over, if the tensor is not
|
|
|
|
|
// allocated with this allocator to begin with. Making this allocator the
|
|
|
|
|
// default on mobile builds minimizes the probability of unnecessary
|
|
|
|
|
// reallocations and copies, and also enables acceleration of operations where
|
|
|
|
|
// the output tensor is allocated outside of the function doing the
|
|
|
|
|
// implementation, wherein the implementation cannot simply re-allocate the
|
|
|
|
|
// output with the guarding allocator.
|
|
|
|
|
//
|
|
|
|
|
// PreGuardBytes: Number of guard bytes to allocate before the allocation.
|
|
|
|
|
// PostGuardBytes: Number of guard bytes to allocate after the allocation.
|
|
|
|
|
|
|
|
|
|
template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
|
|
|
|
|
class DefaultMobileCPUAllocator final : public at::Allocator {
|
|
|
|
|
public:
|
|
|
|
|
static void deleter(void* const pointer) {
|
|
|
|
|
if (C10_UNLIKELY(!pointer)) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
Memory profiling (#37775)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37775
Adding memory usage into profiler table output
Test Plan:
BUILD_BINARY=1 USE_BLAS=MKL USE_MKLDNN=0 USE_CUDA=0 python setup.py
develop install --cmake
```
import torch
import torchvision.models as models
model = models.resnet18()
inp = torch.randn(5, 3, 224, 224)
with torch.autograd.profiler.profile(profile_memory=True, record_shapes=True) as prof:
model(inp)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_memory_usage", row_limit=15))
```
```
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CPU Mem Total Number of Calls Input Shapes
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
resize_ 0.37% 577.936us 0.37% 577.936us 9.796us 339.03 Mb 59 [[0]]
empty 0.69% 1.061ms 0.74% 1.139ms 5.556us 47.42 Mb 205 []
stride 0.00% 0.853us 0.00% 0.853us 0.853us 19.53 Kb 1 [[5, 1000]]
empty_strided 0.01% 21.393us 0.02% 26.033us 5.207us 252 b 5 []
is_complex 0.02% 37.425us 0.02% 37.425us 1.291us 208 b 29 [[]]
masked_select 0.04% 55.333us 0.06% 93.616us 46.808us 120 b 2 [[30], [30]]
conv2d 0.01% 18.009us 9.62% 14.902ms 14.902ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
convolution 0.01% 12.436us 9.61% 14.884ms 14.884ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_convolution 0.03% 52.381us 9.60% 14.871ms 14.871ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
size 0.00% 5.429us 0.00% 5.429us 0.339us 0 b 16 [[5, 3, 224, 224]]
contiguous 0.00% 1.934us 0.00% 1.934us 0.967us 0 b 2 [[5, 3, 224, 224]]
_convolution_nogroup 0.02% 27.505us 9.57% 14.814ms 14.814ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_nnpack_available 0.02% 34.267us 0.02% 34.267us 1.713us 0 b 20 []
thnn_conv2d 0.01% 13.274us 9.54% 14.771ms 14.771ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
thnn_conv2d_forward 5.98% 9.264ms 19.02% 29.446ms 14.723ms 0 b 2 [[5, 3, 224, 224], [64, 3, 7, 7], [
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Self CPU time total: 154.855ms
```
Reviewed By: ngimel
Differential Revision: D21384248
Pulled By: ilia-cher
fbshipit-source-id: 31359cce2aa06f6255ed1ad8c60d03cb640bfec3
2020-05-19 22:46:56 +00:00
|
|
|
// TODO: enable with better TLS support on mobile
|
|
|
|
|
// profiledCPUMemoryReporter().Delete(pointer);
|
2020-08-22 02:07:27 +00:00
|
|
|
auto allocator_ptr = GetThreadLocalCachingAllocator();
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
|
2020-08-22 02:07:27 +00:00
|
|
|
if (allocator_ptr != nullptr) {
|
|
|
|
|
allocator_ptr->free(pointer);
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
} else if (profiling_allocator_ptr != nullptr) {
|
|
|
|
|
profiling_allocator_ptr->free(pointer);
|
2020-08-22 02:07:27 +00:00
|
|
|
} else {
|
|
|
|
|
c10::free_cpu(pointer);
|
|
|
|
|
// This adds extra cost to freeing memory to the default case when
|
|
|
|
|
// caching allocator is not enabled.
|
2021-04-14 18:16:51 +00:00
|
|
|
// NOLINTNEXTLINE(clang-analyzer-unix.Malloc)
|
2020-08-22 02:07:27 +00:00
|
|
|
CPUCachingAllocator::record_free(pointer);
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
auto allocation_planner = GetThreadLocalAllocationPlanner();
|
|
|
|
|
if (allocation_planner != nullptr) {
|
|
|
|
|
allocation_planner->record_free(pointer);
|
|
|
|
|
}
|
2020-08-22 02:07:27 +00:00
|
|
|
}
|
2020-04-23 18:00:28 +00:00
|
|
|
}
|
|
|
|
|
|
2024-03-05 09:53:01 +00:00
|
|
|
DataPtr allocate(const size_t nbytes) override {
|
2020-04-23 18:00:28 +00:00
|
|
|
if (C10_UNLIKELY(0u == nbytes)) {
|
|
|
|
|
return {
|
|
|
|
|
nullptr,
|
|
|
|
|
nullptr,
|
|
|
|
|
&deleter,
|
|
|
|
|
at::Device(DeviceType::CPU),
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
Memory profiling (#37775)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37775
Adding memory usage into profiler table output
Test Plan:
BUILD_BINARY=1 USE_BLAS=MKL USE_MKLDNN=0 USE_CUDA=0 python setup.py
develop install --cmake
```
import torch
import torchvision.models as models
model = models.resnet18()
inp = torch.randn(5, 3, 224, 224)
with torch.autograd.profiler.profile(profile_memory=True, record_shapes=True) as prof:
model(inp)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_memory_usage", row_limit=15))
```
```
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CPU Mem Total Number of Calls Input Shapes
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
resize_ 0.37% 577.936us 0.37% 577.936us 9.796us 339.03 Mb 59 [[0]]
empty 0.69% 1.061ms 0.74% 1.139ms 5.556us 47.42 Mb 205 []
stride 0.00% 0.853us 0.00% 0.853us 0.853us 19.53 Kb 1 [[5, 1000]]
empty_strided 0.01% 21.393us 0.02% 26.033us 5.207us 252 b 5 []
is_complex 0.02% 37.425us 0.02% 37.425us 1.291us 208 b 29 [[]]
masked_select 0.04% 55.333us 0.06% 93.616us 46.808us 120 b 2 [[30], [30]]
conv2d 0.01% 18.009us 9.62% 14.902ms 14.902ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
convolution 0.01% 12.436us 9.61% 14.884ms 14.884ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_convolution 0.03% 52.381us 9.60% 14.871ms 14.871ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
size 0.00% 5.429us 0.00% 5.429us 0.339us 0 b 16 [[5, 3, 224, 224]]
contiguous 0.00% 1.934us 0.00% 1.934us 0.967us 0 b 2 [[5, 3, 224, 224]]
_convolution_nogroup 0.02% 27.505us 9.57% 14.814ms 14.814ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_nnpack_available 0.02% 34.267us 0.02% 34.267us 1.713us 0 b 20 []
thnn_conv2d 0.01% 13.274us 9.54% 14.771ms 14.771ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
thnn_conv2d_forward 5.98% 9.264ms 19.02% 29.446ms 14.723ms 0 b 2 [[5, 3, 224, 224], [64, 3, 7, 7], [
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Self CPU time total: 154.855ms
```
Reviewed By: ngimel
Differential Revision: D21384248
Pulled By: ilia-cher
fbshipit-source-id: 31359cce2aa06f6255ed1ad8c60d03cb640bfec3
2020-05-19 22:46:56 +00:00
|
|
|
auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
|
2024-10-08 19:05:00 +00:00
|
|
|
void* data = nullptr;
|
2020-08-22 02:07:27 +00:00
|
|
|
auto allocator_ptr = GetThreadLocalCachingAllocator();
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
|
2020-08-22 02:07:27 +00:00
|
|
|
if (allocator_ptr != nullptr) {
|
|
|
|
|
data = allocator_ptr->allocate(alloc_size);
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
} else if (profiling_allocator_ptr != nullptr) {
|
|
|
|
|
data = profiling_allocator_ptr->allocate(alloc_size);
|
2020-08-22 02:07:27 +00:00
|
|
|
} else {
|
2022-07-20 16:51:39 +00:00
|
|
|
try {
|
|
|
|
|
data = c10::alloc_cpu(alloc_size);
|
|
|
|
|
} catch (c10::Error& e) {
|
|
|
|
|
profiledCPUMemoryReporter().OutOfMemory(alloc_size);
|
|
|
|
|
throw e;
|
|
|
|
|
}
|
Profiling allocator for mobile. (#43951)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951
AllocationPlan: Stores the sequence of allocations, their sizes
and liftime of the allocations. Along with this
it also stores the total size of a single memory
blob, total_size, required to satisfy all the allocations.
It also stores the offsets in the blob, of size
total_size, corresponding to each allocation.
Thus allocation plan contains:
- allocation sizes
- allocation lifetimes
- allocation offsets
- total size
AllocationPlaner: Takes a pointer to the allocation plan and fills
it ups with plan, i.e. sizes, lifetimes, offsets,
total size.
This is done via WithProfileAllocationsGuard which
takes in AllocationPlan* and constructs
AllocationPlanner* and set the thread local
allocation_planner to it.
MobileCPUAllocator profiles allocations via
allocation_planner.
In WithValidateAllocationsGuard, allocations profiled
in the allocation plan are validated.
CPUProfilingAllocator:
Application owns CPUProfilingAllocator
Using WithProfilingAllocatorGuard, it passes both CPUProfilingAllocator
and AllocationPlan created earlier. Then CPUProfilingAllocator will
manage allocations and frees according to the plan. Allocations that
are not managed by CPUProfilingAllocator will be routed through
c10::alloc_cpu, c10::free_cpu.
Test Plan:
cpu_profiling_allocator_test on mobile.
Imported from OSS
Reviewed By: dreiss
Differential Revision: D23451019
fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
2020-10-06 16:07:22 +00:00
|
|
|
auto allocation_planner = GetThreadLocalAllocationPlanner();
|
|
|
|
|
if (allocation_planner != nullptr) {
|
|
|
|
|
allocation_planner->record_allocation(alloc_size, data);
|
|
|
|
|
}
|
2020-08-22 02:07:27 +00:00
|
|
|
}
|
2022-07-20 16:51:39 +00:00
|
|
|
profiledCPUMemoryReporter().New(data, alloc_size);
|
2020-04-23 18:00:28 +00:00
|
|
|
return {
|
|
|
|
|
reinterpret_cast<uint8_t*>(data) + PreGuardBytes,
|
|
|
|
|
data,
|
|
|
|
|
&deleter,
|
|
|
|
|
at::Device(DeviceType::CPU),
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-24 06:03:21 +00:00
|
|
|
// Expose the deleter function itself so callers (e.g. DataPtr release
// paths) can compare or reuse it without constructing a DataPtr.
DeleterFnPtr raw_deleter() const override {
  return &deleter;
}
|
2024-01-10 15:34:16 +00:00
|
|
|
|
|
|
|
|
// A DataPtr from this allocator is "simple" when its user-visible data
// pointer sits exactly PreGuardBytes past the start of the underlying
// allocation (the context pointer), i.e. it has not been re-pointed.
bool is_simple_data_ptr(const c10::DataPtr& data_ptr) const final {
  const auto* data = reinterpret_cast<const uint8_t*>(data_ptr.get());
  const auto* ctx = reinterpret_cast<const uint8_t*>(data_ptr.get_context());
  return data == ctx + PreGuardBytes;
}
|
|
|
|
|
|
|
|
|
|
// Copy `count` bytes from `src` to `dest` by delegating to the base
// Allocator's default_copy_data; callers pass the user-visible pointers,
// so the guard regions are not touched here.
void copy_data(void* dest, const void* src, std::size_t count) const final {
  default_copy_data(dest, src, count);
}
|
2020-04-23 18:00:28 +00:00
|
|
|
};
|
|
|
|
|
|
2019-02-13 05:13:25 +00:00
|
|
|
// Deleter that intentionally does nothing — for wrapping memory that the
// caller does not want freed when the owning DataPtr is destroyed.
void NoDelete(void*) {}
|
|
|
|
|
|
|
|
|
|
// Return whichever allocator is currently registered for DeviceType::CPU
// (may be an override installed via SetCPUAllocator).
at::Allocator* GetCPUAllocator() {
  return GetAllocator(DeviceType::CPU);
}
|
|
|
|
|
|
2020-05-07 00:25:07 +00:00
|
|
|
// Register `alloc` as the CPU allocator at the given priority; delegates
// to the per-device allocator registry.
void SetCPUAllocator(at::Allocator* alloc, uint8_t priority) {
  SetAllocator(DeviceType::CPU, alloc, priority);
}
|
|
|
|
|
|
2020-04-23 18:00:28 +00:00
|
|
|
// The Mobile CPU allocator must always be present even on non-mobile builds
// because QNNPACK and XNNPACK are not mobile specific.
//
// Pre-guard: 8 bytes for QNNPACK, but set to gAlignment to ensure SIMD
//            alignment, not on the allocated memory, but memory location
//            returned to the user.
// Post-guard: 16 bytes for XNNPACK.

// NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers,cppcoreguidelines-avoid-non-const-global-variables)
static DefaultMobileCPUAllocator<gAlignment, 16u> g_mobile_cpu_allocator;
|
|
|
|
|
|
|
|
|
|
// Accessor for the process-wide guarded mobile CPU allocator instance.
at::Allocator* GetDefaultMobileCPUAllocator() {
  return &g_mobile_cpu_allocator;
}
|
|
|
|
|
|
|
|
|
|
#ifdef C10_MOBILE

// On mobile builds the guarded mobile allocator is the process default.
at::Allocator* GetDefaultCPUAllocator() {
  return GetDefaultMobileCPUAllocator();
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_mobile_cpu_allocator);

#else

// Global default CPU Allocator
static DefaultCPUAllocator g_cpu_alloc;

at::Allocator* GetDefaultCPUAllocator() {
  return &g_cpu_alloc;
}

REGISTER_ALLOCATOR(DeviceType::CPU, &g_cpu_alloc);

#endif /* C10_MOBILE */
|
|
|
|
|
|
Memory profiling (#37775)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37775
Adding memory usage into profiler table output
Test Plan:
BUILD_BINARY=1 USE_BLAS=MKL USE_MKLDNN=0 USE_CUDA=0 python setup.py
develop install --cmake
```
import torch
import torchvision.models as models
model = models.resnet18()
inp = torch.randn(5, 3, 224, 224)
with torch.autograd.profiler.profile(profile_memory=True, record_shapes=True) as prof:
model(inp)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_memory_usage", row_limit=15))
```
```
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CPU Mem Total Number of Calls Input Shapes
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
resize_ 0.37% 577.936us 0.37% 577.936us 9.796us 339.03 Mb 59 [[0]]
empty 0.69% 1.061ms 0.74% 1.139ms 5.556us 47.42 Mb 205 []
stride 0.00% 0.853us 0.00% 0.853us 0.853us 19.53 Kb 1 [[5, 1000]]
empty_strided 0.01% 21.393us 0.02% 26.033us 5.207us 252 b 5 []
is_complex 0.02% 37.425us 0.02% 37.425us 1.291us 208 b 29 [[]]
masked_select 0.04% 55.333us 0.06% 93.616us 46.808us 120 b 2 [[30], [30]]
conv2d 0.01% 18.009us 9.62% 14.902ms 14.902ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
convolution 0.01% 12.436us 9.61% 14.884ms 14.884ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_convolution 0.03% 52.381us 9.60% 14.871ms 14.871ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
size 0.00% 5.429us 0.00% 5.429us 0.339us 0 b 16 [[5, 3, 224, 224]]
contiguous 0.00% 1.934us 0.00% 1.934us 0.967us 0 b 2 [[5, 3, 224, 224]]
_convolution_nogroup 0.02% 27.505us 9.57% 14.814ms 14.814ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_nnpack_available 0.02% 34.267us 0.02% 34.267us 1.713us 0 b 20 []
thnn_conv2d 0.01% 13.274us 9.54% 14.771ms 14.771ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
thnn_conv2d_forward 5.98% 9.264ms 19.02% 29.446ms 14.723ms 0 b 2 [[5, 3, 224, 224], [64, 3, 7, 7], [
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Self CPU time total: 154.855ms
```
Reviewed By: ngimel
Differential Revision: D21384248
Pulled By: ilia-cher
fbshipit-source-id: 31359cce2aa06f6255ed1ad8c60d03cb640bfec3
2020-05-19 22:46:56 +00:00
|
|
|
// Record a fresh CPU allocation of `nbytes` at `ptr`: update the shared
// size table / running total under the lock, then (outside the lock)
// emit the caffe2 log line and/or the profiler event as configured.
void ProfiledCPUMemoryReporter::New(void* ptr, size_t nbytes) {
  // Zero-byte allocations carry no information; skip all bookkeeping.
  if (nbytes == 0) {
    return;
  }
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  // Only track state when someone is listening (the caffe2 flag or the
  // memory profiler); mutex_ guards size_table_/allocated_, which are
  // shared across threads.
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    size_table_[ptr] = nbytes;
    allocated_ += nbytes;
    // Snapshot the total under the lock; logging/reporting happens after
    // the lock is released.
    allocated = allocated_;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 alloc " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        static_cast<int64_t>(nbytes),
        allocated,
        0, // NOTE(review): presumably "reserved" bytes — CPU reporter passes 0
        c10::Device(c10::DeviceType::CPU));
  }
}
|
|
|
|
|
|
Memory profiling (#37775)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/37775
Adding memory usage into profiler table output
Test Plan:
BUILD_BINARY=1 USE_BLAS=MKL USE_MKLDNN=0 USE_CUDA=0 python setup.py
develop install --cmake
```
import torch
import torchvision.models as models
model = models.resnet18()
inp = torch.randn(5, 3, 224, 224)
with torch.autograd.profiler.profile(profile_memory=True, record_shapes=True) as prof:
model(inp)
print(prof.key_averages(group_by_input_shape=True).table(sort_by="cpu_memory_usage", row_limit=15))
```
```
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg CPU Mem Total Number of Calls Input Shapes
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
resize_ 0.37% 577.936us 0.37% 577.936us 9.796us 339.03 Mb 59 [[0]]
empty 0.69% 1.061ms 0.74% 1.139ms 5.556us 47.42 Mb 205 []
stride 0.00% 0.853us 0.00% 0.853us 0.853us 19.53 Kb 1 [[5, 1000]]
empty_strided 0.01% 21.393us 0.02% 26.033us 5.207us 252 b 5 []
is_complex 0.02% 37.425us 0.02% 37.425us 1.291us 208 b 29 [[]]
masked_select 0.04% 55.333us 0.06% 93.616us 46.808us 120 b 2 [[30], [30]]
conv2d 0.01% 18.009us 9.62% 14.902ms 14.902ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
convolution 0.01% 12.436us 9.61% 14.884ms 14.884ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_convolution 0.03% 52.381us 9.60% 14.871ms 14.871ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
size 0.00% 5.429us 0.00% 5.429us 0.339us 0 b 16 [[5, 3, 224, 224]]
contiguous 0.00% 1.934us 0.00% 1.934us 0.967us 0 b 2 [[5, 3, 224, 224]]
_convolution_nogroup 0.02% 27.505us 9.57% 14.814ms 14.814ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
_nnpack_available 0.02% 34.267us 0.02% 34.267us 1.713us 0 b 20 []
thnn_conv2d 0.01% 13.274us 9.54% 14.771ms 14.771ms 0 b 1 [[5, 3, 224, 224], [64, 3, 7, 7], [
thnn_conv2d_forward 5.98% 9.264ms 19.02% 29.446ms 14.723ms 0 b 2 [[5, 3, 224, 224], [64, 3, 7, 7], [
--------------------------- --------------- --------------- --------------- --------------- --------------- --------------- --------------- -----------------------------------
Self CPU time total: 154.855ms
```
Reviewed By: ngimel
Differential Revision: D21384248
Pulled By: ilia-cher
fbshipit-source-id: 31359cce2aa06f6255ed1ad8c60d03cb640bfec3
2020-05-19 22:46:56 +00:00
|
|
|
// Record a CPU deallocation at `ptr`: look up its size in the shared
// table under the lock, update the running total, then (outside the
// lock) emit the caffe2 log line and/or the profiler event.
void ProfiledCPUMemoryReporter::Delete(void* ptr) {
  size_t nbytes = 0;
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);
    auto it = size_table_.find(ptr);
    if (it != size_table_.end()) {
      allocated_ -= it->second;
      allocated = allocated_;
      nbytes = it->second;
      size_table_.erase(it);
    } else {
      // C10_LOG_EVERY_MS might log every time in some builds,
      // using a simple counter to avoid spammy logs
      if (log_cnt_++ % 1000 == 0) {
        LOG(WARNING) << "Memory block of unknown size was allocated before "
                     << "the profiling started, profiler results will not "
                     << "include the deallocation event";
      }
    }
  }
  // Pointer unknown to the table (allocated before tracking started):
  // nothing meaningful to log or report.
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 deleted " << nbytes << " bytes, total alloc " << allocated
              << " bytes.";
  }
  if (profile_memory) {
    reportMemoryUsageToProfiler(
        ptr,
        // Negative delta marks a deallocation for the profiler.
        -static_cast<int64_t>(nbytes),
        allocated,
        0, // NOTE(review): presumably "reserved" bytes — CPU reporter passes 0
        c10::Device(c10::DeviceType::CPU));
  }
}
|
|
|
|
|
|
2022-07-20 16:51:39 +00:00
|
|
|
// Record a failed CPU allocation of `nbytes`: snapshot the running total
// under the lock, then emit the caffe2 log line and/or the profiler OOM
// event. No table state changes — the allocation never happened.
void ProfiledCPUMemoryReporter::OutOfMemory(size_t nbytes) {
  auto profile_memory = memoryProfilingEnabled();
  size_t allocated = 0;
  if (FLAGS_caffe2_report_cpu_memory_usage || profile_memory) {
    std::lock_guard<std::mutex> guard(mutex_);

    allocated = allocated_;
  }
  // Zero-byte requests cannot meaningfully fail; skip reporting.
  if (nbytes == 0) {
    return;
  }
  if (FLAGS_caffe2_report_cpu_memory_usage) {
    LOG(INFO) << "C10 Out of Memory. Trying to allocate " << nbytes
              << " bytes, total alloc " << allocated << " bytes.";
  }
  if (profile_memory) {
    reportOutOfMemoryToProfiler(
        static_cast<int64_t>(nbytes),
        allocated,
        0, // NOTE(review): presumably "reserved" bytes — CPU reporter passes 0
        c10::Device(c10::DeviceType::CPU));
  }
}
|
|
|
|
|
|
2020-11-30 22:57:56 +00:00
|
|
|
// Process-wide caching allocator override for CPU, installed via
// SetCPUCachingAllocator; nullptr means none is registered.
C10_API at::Allocator* cpu_caching_alloc = nullptr;
// Priority of the current registration; a new registration only takes
// effect when its priority is >= this value (see SetCPUCachingAllocator).
C10_API uint8_t cpu_caching_alloc_priority = 0;
|
|
|
|
|
|
|
|
|
|
// Install `alloc` as the CPU caching allocator. A registration with a
// priority below the current one is ignored; equal-or-higher priority
// replaces the previous allocator and becomes the new threshold.
void SetCPUCachingAllocator(Allocator* alloc, uint8_t priority) {
  if (priority < cpu_caching_alloc_priority) {
    return;
  }
  cpu_caching_alloc = alloc;
  cpu_caching_alloc_priority = priority;
}
|
|
|
|
|
|
|
|
|
|
// Return the registered CPU caching allocator, falling back to whatever
// allocator is registered for DeviceType::CPU when none has been set.
Allocator* GetCPUCachingAllocator() {
  if (cpu_caching_alloc == nullptr) {
    // Fix: log message previously read "There is not caching allocator".
    VLOG(1)
        << "There is no caching allocator registered for CPU, use the default allocator instead.";
    return GetAllocator(DeviceType::CPU);
  }
  return cpu_caching_alloc;
}
|
|
|
|
|
|
|
|
|
|
} // namespace c10
|