mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
[Profiler][Minor] Extend Python bindings (#83622)
Adding some fields which are needed for memory profiling. Differential Revision: [D38528382](https://our.internmc.facebook.com/intern/diff/D38528382/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/83622 Approved by: https://github.com/Gamrix
This commit is contained in:
parent
681c38704e
commit
014a333df3
4 changed files with 130 additions and 31 deletions
|
|
@ -1283,10 +1283,11 @@ class TestTorchTidyProfiler(TestCase):
|
|||
|
||||
def test_tensor_properties(self):
|
||||
x = torch.ones(10, 10).as_strided([4, 4], [12, 3])
|
||||
y = torch.ones(4, 1)
|
||||
y = torch.ones(4, 1, requires_grad=True)
|
||||
|
||||
with profile(with_stack=True, profile_memory=True, record_shapes=True) as p:
|
||||
_ = x + y
|
||||
_ = x * y
|
||||
|
||||
nodes = p.profiler.kineto_results.experimental_event_tree()
|
||||
node = find_node_with_name(nodes, "aten::add")
|
||||
|
|
@ -1306,6 +1307,13 @@ class TestTorchTidyProfiler(TestCase):
|
|||
self.assertEqual(layout_info, [torch.strided, torch.strided, None])
|
||||
device_info = [x.device if x else None for x in input_info.tensor_metadata]
|
||||
self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
|
||||
self.assertEqual(node.extra_fields.scope, torch.profiler.RecordScope.FUNCTION)
|
||||
|
||||
mul_node = find_node_with_name(nodes, "aten::mul")
|
||||
self.assertIsNotNone(mul_node)
|
||||
self.assertEqual(
|
||||
node.extra_fields.sequence_number + 1,
|
||||
mul_node.extra_fields.sequence_number)
|
||||
|
||||
def test_sparse_tensors(self):
|
||||
i = [[0, 1, 1], [2, 0, 2]]
|
||||
|
|
@ -1363,7 +1371,6 @@ class TestTorchTidyProfiler(TestCase):
|
|||
device_info = [x.device if x else None for x in input_info.tensor_metadata]
|
||||
self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
|
||||
|
||||
|
||||
def test_scalar_ins(self):
|
||||
x = torch.ones(5, 5)
|
||||
alpha = 0.9
|
||||
|
|
@ -1415,6 +1422,39 @@ class TestTorchTidyProfiler(TestCase):
|
|||
expected += [(name, val.storage().data_ptr()) for name, val in net.fc2._parameters.items()]
|
||||
self.assertEqual(expected, params, f"{expected} vs. {params}")
|
||||
|
||||
def test_allocations(self):
# Checks the fields exposed on a "[memory]" profiler event (ptr, alloc_size,
# device_type, device_index, total_allocated, total_reserved).
# Phase 1 profiles the creation of a 3x4 tensor; phase 2 profiles its
# deletion and expects a matching event with the same ptr and a negated size.
|
||||
# NOTE(review): presumably run up front so unrelated garbage is not freed
# while profiling — confirm.
gc.collect()
|
||||
with profile(profile_memory=True) as p:
|
||||
x = torch.empty((3, 4))
|
||||
|
||||
nodes = p.profiler.kineto_results.experimental_event_tree()
|
||||
# NOTE(review): find_node_with_name is defined elsewhere; presumably it
# returns the first node with this name, or None — verify against the helper.
node = find_node_with_name(nodes, "[memory]")
|
||||
self.assertIsNotNone(node)
|
||||
|
||||
# Expected byte count for 3 * 4 elements; relies on torch.empty producing
# 4-byte (fp32) elements by default, per the inline note.
alloc_size = 3 * 4 * 4 # fp32 -> 4 bytes
|
||||
# Remember the reported pointer so the later free event can be matched to it.
ptr = node.extra_fields.ptr
|
||||
self.assertGreater(ptr, 0)
|
||||
self.assertEqual(node.extra_fields.alloc_size, alloc_size)
|
||||
self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
|
||||
self.assertEqual(node.extra_fields.device_index, -1)
|
||||
# Baseline used below to check the running total after the tensor is freed.
total_allocated = node.extra_fields.total_allocated
|
||||
|
||||
# total_reserved is only for CUDACachingAllocator
|
||||
self.assertEqual(node.extra_fields.total_reserved, 0)
|
||||
|
||||
# Phase 2: profile the release of the tensor created above.
with profile(profile_memory=True) as p:
|
||||
del x
|
||||
gc.collect()
|
||||
|
||||
nodes = p.profiler.kineto_results.experimental_event_tree()
|
||||
node = find_node_with_name(nodes, "[memory]")
|
||||
self.assertIsNotNone(node)
|
||||
|
||||
# The free event is expected to mirror the allocation: same ptr, negated
# alloc_size, and total_allocated reduced by exactly alloc_size.
self.assertEqual(node.extra_fields.ptr, ptr)
|
||||
self.assertEqual(node.extra_fields.alloc_size, -alloc_size)
|
||||
self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
|
||||
self.assertEqual(node.extra_fields.device_index, -1)
|
||||
self.assertEqual(node.extra_fields.total_allocated, total_allocated - alloc_size)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
|
|
|
|||
|
|
@ -3,6 +3,18 @@ from typing import List, Union
|
|||
|
||||
# defined in torch/csrc/profiler/python/init.cpp
|
||||
|
||||
# Type stub for the RecordScope enum exported from the profiler bindings.
# The member names below match the values registered with
# py::enum_<at::RecordScope> in the C++ binding code; the `...` placeholders
# carry no concrete value in the stub.
class RecordScope(Enum):
|
||||
FUNCTION = ...
|
||||
BACKWARD_FUNCTION = ...
|
||||
TORCHSCRIPT_FUNCTION = ...
|
||||
KERNEL_FUNCTION_DTYPE = ...
|
||||
CUSTOM_CLASS = ...
|
||||
BUILD_FEATURE = ...
|
||||
LITE_INTERPRETER = ...
|
||||
USER_SCOPE = ...
|
||||
STATIC_RUNTIME_OP = ...
|
||||
STATIC_RUNTIME_MODEL = ...
|
||||
|
||||
class ProfilerState(Enum):
|
||||
Disable = ...
|
||||
CPU = ...
|
||||
|
|
@ -12,8 +24,7 @@ class ProfilerState(Enum):
|
|||
KINETO = ...
|
||||
KINETO_GPU_FALLBACK = ...
|
||||
|
||||
class ActiveProfilerType:
|
||||
...
|
||||
class ActiveProfilerType: ...
|
||||
|
||||
class ProfilerActivity(Enum):
|
||||
CPU = ...
|
||||
|
|
@ -50,9 +61,15 @@ class _ProfilerEvent:
|
|||
duration_time_ns: int
|
||||
parent: _ProfilerEvent
|
||||
children: List[_ProfilerEvent]
|
||||
extra_fields: Union[_ExtraFields_Allocation, _ExtraFields_Backend,
|
||||
_ExtraFields_PyCall, _ExtraFields_PyCCall,
|
||||
_ExtraFields_TorchOp]
|
||||
extra_fields: Union[
|
||||
_ExtraFields_TorchOp,
|
||||
_ExtraFields_Backend,
|
||||
_ExtraFields_Allocation,
|
||||
_ExtraFields_OutOfMemory,
|
||||
_ExtraFields_PyCall,
|
||||
_ExtraFields_PyCCall,
|
||||
_ExtraFields_Kineto,
|
||||
]
|
||||
def name(self) -> str: ...
|
||||
...
|
||||
|
||||
|
|
@ -79,11 +96,9 @@ class _ExtraFields_TorchOp:
|
|||
inputs: _Inputs
|
||||
...
|
||||
|
||||
class _ExtraFields_Backend:
|
||||
...
|
||||
|
||||
class _ExtraFields_Allocation:
|
||||
...
|
||||
class _ExtraFields_Backend: ...
|
||||
class _ExtraFields_Allocation: ...
|
||||
class _ExtraFields_OutOfMemory: ...
|
||||
|
||||
class _ExtraFields_PyCCall:
|
||||
caller: _PyFrameState
|
||||
|
|
@ -93,5 +108,4 @@ class _ExtraFields_PyCall:
|
|||
caller: _PyFrameState
|
||||
...
|
||||
|
||||
class _ExtraFields_Kineto:
|
||||
...
|
||||
class _ExtraFields_Kineto: ...
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
#include <torch/csrc/profiler/python/init.h>
|
||||
|
||||
#include <ATen/record_function.h>
|
||||
#include <torch/csrc/autograd/utils/wrap_outputs.h>
|
||||
#include <torch/csrc/jit/python/pybind_utils.h>
|
||||
#include <torch/csrc/profiler/collection.h>
|
||||
|
|
@ -14,6 +15,18 @@ void initPythonBindings(PyObject* module) {
|
|||
|
||||
using namespace torch::profiler::impl;
|
||||
|
||||
py::enum_<at::RecordScope>(m, "RecordScope")
|
||||
.value("FUNCTION", at::RecordScope::FUNCTION)
|
||||
.value("BACKWARD_FUNCTION", at::RecordScope::BACKWARD_FUNCTION)
|
||||
.value("TORCHSCRIPT_FUNCTION", at::RecordScope::TORCHSCRIPT_FUNCTION)
|
||||
.value("KERNEL_FUNCTION_DTYPE", at::RecordScope::KERNEL_FUNCTION_DTYPE)
|
||||
.value("CUSTOM_CLASS", at::RecordScope::CUSTOM_CLASS)
|
||||
.value("BUILD_FEATURE", at::RecordScope::BUILD_FEATURE)
|
||||
.value("LITE_INTERPRETER", at::RecordScope::LITE_INTERPRETER)
|
||||
.value("USER_SCOPE", at::RecordScope::USER_SCOPE)
|
||||
.value("STATIC_RUNTIME_OP", at::RecordScope::STATIC_RUNTIME_OP)
|
||||
.value("STATIC_RUNTIME_MODEL", at::RecordScope::STATIC_RUNTIME_MODEL);
|
||||
|
||||
py::enum_<ProfilerState>(m, "ProfilerState")
|
||||
.value("Disabled", ProfilerState::Disabled)
|
||||
.value("CPU", ProfilerState::CPU)
|
||||
|
|
@ -122,15 +135,27 @@ void initPythonBindings(PyObject* module) {
|
|||
return py::reinterpret_borrow<py::object>(thp_device);
|
||||
});
|
||||
|
||||
py::class_<ExtraFields<EventType::TorchOp>>(m, "_ExtraFields_TorchOp")
|
||||
.def_readonly("inputs", &ExtraFields<EventType::TorchOp>::inputs_)
|
||||
.def_readonly(
|
||||
"allow_tf32_cublas",
|
||||
&ExtraFields<EventType::TorchOp>::allow_tf32_cublas_);
|
||||
using torch_op_t = ExtraFields<EventType::TorchOp>;
|
||||
py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
|
||||
.def_readonly("inputs", &torch_op_t::inputs_)
|
||||
.def_readonly("scope", &torch_op_t::scope_)
|
||||
.def_readonly("sequence_number", &torch_op_t::sequence_number_)
|
||||
.def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
|
||||
|
||||
py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend");
|
||||
|
||||
py::class_<ExtraFields<EventType::Allocation>>(m, "_ExtraFields_Allocation");
|
||||
using allocation_t = ExtraFields<EventType::Allocation>;
|
||||
py::class_<allocation_t>(m, "_ExtraFields_Allocation")
|
||||
.def_property_readonly(
|
||||
"ptr",
|
||||
[](const allocation_t& a) {
|
||||
return reinterpret_cast<intptr_t>(a.ptr_);
|
||||
})
|
||||
.def_readonly("alloc_size", &allocation_t::alloc_size_)
|
||||
.def_readonly("device_type", &allocation_t::device_type_)
|
||||
.def_readonly("device_index", &allocation_t::device_index_)
|
||||
.def_readonly("total_allocated", &allocation_t::total_allocated_)
|
||||
.def_readonly("total_reserved", &allocation_t::total_reserved_);
|
||||
|
||||
py::class_<NNModuleInfo>(m, "_NNModuleInfo")
|
||||
.def_property_readonly(
|
||||
|
|
@ -149,7 +174,8 @@ void initPythonBindings(PyObject* module) {
|
|||
py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall")
|
||||
.def_readonly("module", &ExtraFields<EventType::PyCall>::module_)
|
||||
.def_readonly("callsite", &ExtraFields<EventType::PyCall>::callsite_)
|
||||
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
|
||||
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_)
|
||||
.def_readonly("module", &ExtraFields<EventType::PyCall>::module_);
|
||||
|
||||
py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall")
|
||||
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
|
||||
|
|
@ -162,6 +188,9 @@ void initPythonBindings(PyObject* module) {
|
|||
return s.funcname_.str();
|
||||
});
|
||||
|
||||
py::class_<ExtraFields<EventType::OutOfMemory>>(
|
||||
m, "_ExtraFields_OutOfMemory");
|
||||
|
||||
py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto");
|
||||
|
||||
py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent")
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
r'''
|
||||
r"""
|
||||
PyTorch Profiler is a tool that allows the collection of performance metrics during training and inference.
|
||||
Profiler's context manager API can be used to better understand what model operators are the most expensive,
|
||||
examine their input shapes and stack traces, study device kernel activity and visualize the execution trace.
|
||||
|
|
@ -6,16 +6,32 @@ examine their input shapes and stack traces, study device kernel activity and vi
|
|||
.. note::
|
||||
An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
|
||||
|
||||
'''
|
||||
from .profiler import profile, _KinetoProfile, \
|
||||
schedule, supported_activities, tensorboard_trace_handler, ProfilerAction, \
|
||||
ExecutionGraphObserver
|
||||
from torch._C._autograd import kineto_available, _supported_activities, DeviceType
|
||||
from torch._C._profiler import ProfilerActivity, _ExperimentalConfig
|
||||
"""
|
||||
from torch._C._autograd import _supported_activities, DeviceType, kineto_available
|
||||
from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
|
||||
from torch.autograd.profiler import record_function
|
||||
|
||||
__all__ = ['profile', 'schedule', 'supported_activities',
|
||||
'tensorboard_trace_handler', 'ProfilerAction', 'ProfilerActivity',
|
||||
'kineto_available', 'DeviceType', 'record_function', 'ExecutionGraphObserver']
|
||||
from .profiler import (
|
||||
_KinetoProfile,
|
||||
ExecutionGraphObserver,
|
||||
profile,
|
||||
ProfilerAction,
|
||||
schedule,
|
||||
supported_activities,
|
||||
tensorboard_trace_handler,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"profile",
|
||||
"schedule",
|
||||
"supported_activities",
|
||||
"tensorboard_trace_handler",
|
||||
"ProfilerAction",
|
||||
"ProfilerActivity",
|
||||
"kineto_available",
|
||||
"DeviceType",
|
||||
"record_function",
|
||||
"ExecutionGraphObserver",
|
||||
]
|
||||
|
||||
from . import itt
|
||||
|
|
|
|||
Loading…
Reference in a new issue