[Profiler][Minor] Extend Python bindings (#83622)

Adding some fields which are needed for memory profiling.

Differential Revision: [D38528382](https://our.internmc.facebook.com/intern/diff/D38528382/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83622
Approved by: https://github.com/Gamrix
This commit is contained in:
Taylor Robie 2022-08-26 10:33:15 -07:00 committed by PyTorch MergeBot
parent 681c38704e
commit 014a333df3
4 changed files with 130 additions and 31 deletions

View file

@ -1283,10 +1283,11 @@ class TestTorchTidyProfiler(TestCase):
def test_tensor_properties(self):
x = torch.ones(10, 10).as_strided([4, 4], [12, 3])
y = torch.ones(4, 1)
y = torch.ones(4, 1, requires_grad=True)
with profile(with_stack=True, profile_memory=True, record_shapes=True) as p:
_ = x + y
_ = x * y
nodes = p.profiler.kineto_results.experimental_event_tree()
node = find_node_with_name(nodes, "aten::add")
@ -1306,6 +1307,13 @@ class TestTorchTidyProfiler(TestCase):
self.assertEqual(layout_info, [torch.strided, torch.strided, None])
device_info = [x.device if x else None for x in input_info.tensor_metadata]
self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
self.assertEqual(node.extra_fields.scope, torch.profiler.RecordScope.FUNCTION)
mul_node = find_node_with_name(nodes, "aten::mul")
self.assertIsNotNone(mul_node)
self.assertEqual(
node.extra_fields.sequence_number + 1,
mul_node.extra_fields.sequence_number)
def test_sparse_tensors(self):
i = [[0, 1, 1], [2, 0, 2]]
@ -1363,7 +1371,6 @@ class TestTorchTidyProfiler(TestCase):
device_info = [x.device if x else None for x in input_info.tensor_metadata]
self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
def test_scalar_ins(self):
x = torch.ones(5, 5)
alpha = 0.9
@ -1415,6 +1422,39 @@ class TestTorchTidyProfiler(TestCase):
expected += [(name, val.storage().data_ptr()) for name, val in net.fc2._parameters.items()]
self.assertEqual(expected, params, f"{expected} vs. {params}")
def test_allocations(self):
    """Check that [memory] profiler events expose allocation metadata.

    Allocates a 3x4 fp32 tensor under the profiler, then frees it under a
    second profiler run, and verifies the `_ExtraFields_Allocation` fields
    (ptr, alloc_size, device_type/index, total_allocated, total_reserved)
    on both the allocation and the matching deallocation event.
    """
    # NOTE: indentation restored — the extracted text had lost all structure.
    gc.collect()
    with profile(profile_memory=True) as p:
        x = torch.empty((3, 4))
    nodes = p.profiler.kineto_results.experimental_event_tree()
    node = find_node_with_name(nodes, "[memory]")
    self.assertIsNotNone(node)

    alloc_size = 3 * 4 * 4  # fp32 -> 4 bytes
    ptr = node.extra_fields.ptr
    self.assertGreater(ptr, 0)
    self.assertEqual(node.extra_fields.alloc_size, alloc_size)
    self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
    self.assertEqual(node.extra_fields.device_index, -1)
    total_allocated = node.extra_fields.total_allocated

    # total_reserved is only for CUDACachingAllocator
    self.assertEqual(node.extra_fields.total_reserved, 0)

    # The free must report the same pointer, a negative size of equal
    # magnitude, and a correspondingly reduced running total.
    with profile(profile_memory=True) as p:
        del x
        gc.collect()
    nodes = p.profiler.kineto_results.experimental_event_tree()
    node = find_node_with_name(nodes, "[memory]")
    self.assertIsNotNone(node)

    self.assertEqual(node.extra_fields.ptr, ptr)
    self.assertEqual(node.extra_fields.alloc_size, -alloc_size)
    self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
    self.assertEqual(node.extra_fields.device_index, -1)
    self.assertEqual(node.extra_fields.total_allocated, total_allocated - alloc_size)
@dataclass(frozen=True)

View file

@ -3,6 +3,18 @@ from typing import List, Union
# defined in torch/csrc/profiler/python/init.cpp
class RecordScope(Enum):
    # Stub for the C++ at::RecordScope enum, bound via py::enum_ in
    # torch/csrc/profiler/python/init.cpp. Values are placeholders (...).
    FUNCTION = ...
    BACKWARD_FUNCTION = ...
    TORCHSCRIPT_FUNCTION = ...
    KERNEL_FUNCTION_DTYPE = ...
    CUSTOM_CLASS = ...
    BUILD_FEATURE = ...
    LITE_INTERPRETER = ...
    USER_SCOPE = ...
    STATIC_RUNTIME_OP = ...
    STATIC_RUNTIME_MODEL = ...
class ProfilerState(Enum):
Disable = ...
CPU = ...
@ -12,8 +24,7 @@ class ProfilerState(Enum):
KINETO = ...
KINETO_GPU_FALLBACK = ...
class ActiveProfilerType:
...
class ActiveProfilerType: ...
class ProfilerActivity(Enum):
CPU = ...
@ -50,9 +61,15 @@ class _ProfilerEvent:
duration_time_ns: int
parent: _ProfilerEvent
children: List[_ProfilerEvent]
extra_fields: Union[_ExtraFields_Allocation, _ExtraFields_Backend,
_ExtraFields_PyCall, _ExtraFields_PyCCall,
_ExtraFields_TorchOp]
extra_fields: Union[
_ExtraFields_TorchOp,
_ExtraFields_Backend,
_ExtraFields_Allocation,
_ExtraFields_OutOfMemory,
_ExtraFields_PyCall,
_ExtraFields_PyCCall,
_ExtraFields_Kineto,
]
def name(self) -> str: ...
...
@ -79,11 +96,9 @@ class _ExtraFields_TorchOp:
inputs: _Inputs
...
class _ExtraFields_Backend:
...
class _ExtraFields_Allocation:
...
class _ExtraFields_Backend: ...
class _ExtraFields_Allocation: ...
class _ExtraFields_OutOfMemory: ...
class _ExtraFields_PyCCall:
caller: _PyFrameState
@ -93,5 +108,4 @@ class _ExtraFields_PyCall:
caller: _PyFrameState
...
class _ExtraFields_Kineto:
...
class _ExtraFields_Kineto: ...

View file

@ -1,5 +1,6 @@
#include <torch/csrc/profiler/python/init.h>
#include <ATen/record_function.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <torch/csrc/profiler/collection.h>
@ -14,6 +15,18 @@ void initPythonBindings(PyObject* module) {
using namespace torch::profiler::impl;
py::enum_<at::RecordScope>(m, "RecordScope")
.value("FUNCTION", at::RecordScope::FUNCTION)
.value("BACKWARD_FUNCTION", at::RecordScope::BACKWARD_FUNCTION)
.value("TORCHSCRIPT_FUNCTION", at::RecordScope::TORCHSCRIPT_FUNCTION)
.value("KERNEL_FUNCTION_DTYPE", at::RecordScope::KERNEL_FUNCTION_DTYPE)
.value("CUSTOM_CLASS", at::RecordScope::CUSTOM_CLASS)
.value("BUILD_FEATURE", at::RecordScope::BUILD_FEATURE)
.value("LITE_INTERPRETER", at::RecordScope::LITE_INTERPRETER)
.value("USER_SCOPE", at::RecordScope::USER_SCOPE)
.value("STATIC_RUNTIME_OP", at::RecordScope::STATIC_RUNTIME_OP)
.value("STATIC_RUNTIME_MODEL", at::RecordScope::STATIC_RUNTIME_MODEL);
py::enum_<ProfilerState>(m, "ProfilerState")
.value("Disabled", ProfilerState::Disabled)
.value("CPU", ProfilerState::CPU)
@ -122,15 +135,27 @@ void initPythonBindings(PyObject* module) {
return py::reinterpret_borrow<py::object>(thp_device);
});
py::class_<ExtraFields<EventType::TorchOp>>(m, "_ExtraFields_TorchOp")
.def_readonly("inputs", &ExtraFields<EventType::TorchOp>::inputs_)
.def_readonly(
"allow_tf32_cublas",
&ExtraFields<EventType::TorchOp>::allow_tf32_cublas_);
using torch_op_t = ExtraFields<EventType::TorchOp>;
py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
.def_readonly("inputs", &torch_op_t::inputs_)
.def_readonly("scope", &torch_op_t::scope_)
.def_readonly("sequence_number", &torch_op_t::sequence_number_)
.def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend");
py::class_<ExtraFields<EventType::Allocation>>(m, "_ExtraFields_Allocation");
using allocation_t = ExtraFields<EventType::Allocation>;
py::class_<allocation_t>(m, "_ExtraFields_Allocation")
.def_property_readonly(
"ptr",
[](const allocation_t& a) {
return reinterpret_cast<intptr_t>(a.ptr_);
})
.def_readonly("alloc_size", &allocation_t::alloc_size_)
.def_readonly("device_type", &allocation_t::device_type_)
.def_readonly("device_index", &allocation_t::device_index_)
.def_readonly("total_allocated", &allocation_t::total_allocated_)
.def_readonly("total_reserved", &allocation_t::total_reserved_);
py::class_<NNModuleInfo>(m, "_NNModuleInfo")
.def_property_readonly(
@ -149,7 +174,8 @@ void initPythonBindings(PyObject* module) {
py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall")
.def_readonly("module", &ExtraFields<EventType::PyCall>::module_)
.def_readonly("callsite", &ExtraFields<EventType::PyCall>::callsite_)
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_)
.def_readonly("module", &ExtraFields<EventType::PyCall>::module_);
py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall")
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
@ -162,6 +188,9 @@ void initPythonBindings(PyObject* module) {
return s.funcname_.str();
});
py::class_<ExtraFields<EventType::OutOfMemory>>(
m, "_ExtraFields_OutOfMemory");
py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto");
py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent")

View file

@ -1,4 +1,4 @@
r'''
r"""
PyTorch Profiler is a tool that allows the collection of performance metrics during training and inference.
Profiler's context manager API can be used to better understand what model operators are the most expensive,
examine their input shapes and stack traces, study device kernel activity and visualize the execution trace.
@ -6,16 +6,32 @@ examine their input shapes and stack traces, study device kernel activity and vi
.. note::
An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
'''
from .profiler import profile, _KinetoProfile, \
schedule, supported_activities, tensorboard_trace_handler, ProfilerAction, \
ExecutionGraphObserver
from torch._C._autograd import kineto_available, _supported_activities, DeviceType
from torch._C._profiler import ProfilerActivity, _ExperimentalConfig
"""
from torch._C._autograd import _supported_activities, DeviceType, kineto_available
from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
from torch.autograd.profiler import record_function
__all__ = ['profile', 'schedule', 'supported_activities',
'tensorboard_trace_handler', 'ProfilerAction', 'ProfilerActivity',
'kineto_available', 'DeviceType', 'record_function', 'ExecutionGraphObserver']
from .profiler import (
_KinetoProfile,
ExecutionGraphObserver,
profile,
ProfilerAction,
schedule,
supported_activities,
tensorboard_trace_handler,
)
__all__ = [
"profile",
"schedule",
"supported_activities",
"tensorboard_trace_handler",
"ProfilerAction",
"ProfilerActivity",
"kineto_available",
"DeviceType",
"record_function",
"ExecutionGraphObserver",
]
from . import itt