[Profiler][Minor] Extend Python bindings (#83622)

Adding some fields which are needed for memory profiling.

Differential Revision: [D38528382](https://our.internmc.facebook.com/intern/diff/D38528382/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83622
Approved by: https://github.com/Gamrix
This commit is contained in:
Taylor Robie 2022-08-26 10:33:15 -07:00 committed by PyTorch MergeBot
parent 681c38704e
commit 014a333df3
4 changed files with 130 additions and 31 deletions

View file

@ -1283,10 +1283,11 @@ class TestTorchTidyProfiler(TestCase):
def test_tensor_properties(self):
x = torch.ones(10, 10).as_strided([4, 4], [12, 3])
y = torch.ones(4, 1)
y = torch.ones(4, 1, requires_grad=True)
with profile(with_stack=True, profile_memory=True, record_shapes=True) as p:
_ = x + y
_ = x * y
nodes = p.profiler.kineto_results.experimental_event_tree()
node = find_node_with_name(nodes, "aten::add")
@ -1306,6 +1307,13 @@ class TestTorchTidyProfiler(TestCase):
self.assertEqual(layout_info, [torch.strided, torch.strided, None])
device_info = [x.device if x else None for x in input_info.tensor_metadata]
self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
self.assertEqual(node.extra_fields.scope, torch.profiler.RecordScope.FUNCTION)
mul_node = find_node_with_name(nodes, "aten::mul")
self.assertIsNotNone(mul_node)
self.assertEqual(
node.extra_fields.sequence_number + 1,
mul_node.extra_fields.sequence_number)
def test_sparse_tensors(self):
i = [[0, 1, 1], [2, 0, 2]]
@ -1363,7 +1371,6 @@ class TestTorchTidyProfiler(TestCase):
device_info = [x.device if x else None for x in input_info.tensor_metadata]
self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
def test_scalar_ins(self):
x = torch.ones(5, 5)
alpha = 0.9
@ -1415,6 +1422,39 @@ class TestTorchTidyProfiler(TestCase):
expected += [(name, val.storage().data_ptr()) for name, val in net.fc2._parameters.items()]
self.assertEqual(expected, params, f"{expected} vs. {params}")
def test_allocations(self):
    """Check that [memory] profiler events expose allocation metadata.

    Allocates a 3x4 fp32 tensor under the profiler, then frees it under a
    second profiler run, and verifies the `_ExtraFields_Allocation` fields
    (ptr, alloc_size, device_type/index, total_allocated, total_reserved)
    on both the allocation and the matching deallocation event.
    """
    # NOTE: indentation restored — the extracted text had lost all structure.
    gc.collect()
    with profile(profile_memory=True) as p:
        x = torch.empty((3, 4))
    nodes = p.profiler.kineto_results.experimental_event_tree()
    node = find_node_with_name(nodes, "[memory]")
    self.assertIsNotNone(node)

    alloc_size = 3 * 4 * 4  # fp32 -> 4 bytes
    ptr = node.extra_fields.ptr
    self.assertGreater(ptr, 0)
    self.assertEqual(node.extra_fields.alloc_size, alloc_size)
    self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
    self.assertEqual(node.extra_fields.device_index, -1)
    total_allocated = node.extra_fields.total_allocated

    # total_reserved is only for CUDACachingAllocator
    self.assertEqual(node.extra_fields.total_reserved, 0)

    # The free must report the same pointer, a negative size of equal
    # magnitude, and a correspondingly reduced running total.
    with profile(profile_memory=True) as p:
        del x
        gc.collect()
    nodes = p.profiler.kineto_results.experimental_event_tree()
    node = find_node_with_name(nodes, "[memory]")
    self.assertIsNotNone(node)

    self.assertEqual(node.extra_fields.ptr, ptr)
    self.assertEqual(node.extra_fields.alloc_size, -alloc_size)
    self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
    self.assertEqual(node.extra_fields.device_index, -1)
    self.assertEqual(node.extra_fields.total_allocated, total_allocated - alloc_size)
@dataclass(frozen=True)

View file

@ -3,6 +3,18 @@ from typing import List, Union
# defined in torch/csrc/profiler/python/init.cpp
class RecordScope(Enum):
    # Stub for the C++ at::RecordScope enum, bound via py::enum_ in
    # torch/csrc/profiler/python/init.cpp. Values are placeholders (...).
    FUNCTION = ...
    BACKWARD_FUNCTION = ...
    TORCHSCRIPT_FUNCTION = ...
    KERNEL_FUNCTION_DTYPE = ...
    CUSTOM_CLASS = ...
    BUILD_FEATURE = ...
    LITE_INTERPRETER = ...
    USER_SCOPE = ...
    STATIC_RUNTIME_OP = ...
    STATIC_RUNTIME_MODEL = ...
class ProfilerState(Enum):
Disable = ...
CPU = ...
@ -12,8 +24,7 @@ class ProfilerState(Enum):
KINETO = ...
KINETO_GPU_FALLBACK = ...
class ActiveProfilerType:
...
class ActiveProfilerType: ...
class ProfilerActivity(Enum):
CPU = ...
@ -50,9 +61,15 @@ class _ProfilerEvent:
duration_time_ns: int
parent: _ProfilerEvent
children: List[_ProfilerEvent]
extra_fields: Union[_ExtraFields_Allocation, _ExtraFields_Backend,
_ExtraFields_PyCall, _ExtraFields_PyCCall,
_ExtraFields_TorchOp]
extra_fields: Union[
_ExtraFields_TorchOp,
_ExtraFields_Backend,
_ExtraFields_Allocation,
_ExtraFields_OutOfMemory,
_ExtraFields_PyCall,
_ExtraFields_PyCCall,
_ExtraFields_Kineto,
]
def name(self) -> str: ...
...
@ -79,11 +96,9 @@ class _ExtraFields_TorchOp:
inputs: _Inputs
...
class _ExtraFields_Backend:
...
class _ExtraFields_Allocation:
...
class _ExtraFields_Backend: ...
class _ExtraFields_Allocation: ...
class _ExtraFields_OutOfMemory: ...
class _ExtraFields_PyCCall:
caller: _PyFrameState
@ -93,5 +108,4 @@ class _ExtraFields_PyCall:
caller: _PyFrameState
...
class _ExtraFields_Kineto:
...
class _ExtraFields_Kineto: ...

View file

@ -1,5 +1,6 @@
#include <torch/csrc/profiler/python/init.h>
#include <ATen/record_function.h>
#include <torch/csrc/autograd/utils/wrap_outputs.h>
#include <torch/csrc/jit/python/pybind_utils.h>
#include <torch/csrc/profiler/collection.h>
@ -14,6 +15,18 @@ void initPythonBindings(PyObject* module) {
using namespace torch::profiler::impl;
py::enum_<at::RecordScope>(m, "RecordScope")
.value("FUNCTION", at::RecordScope::FUNCTION)
.value("BACKWARD_FUNCTION", at::RecordScope::BACKWARD_FUNCTION)
.value("TORCHSCRIPT_FUNCTION", at::RecordScope::TORCHSCRIPT_FUNCTION)
.value("KERNEL_FUNCTION_DTYPE", at::RecordScope::KERNEL_FUNCTION_DTYPE)
.value("CUSTOM_CLASS", at::RecordScope::CUSTOM_CLASS)
.value("BUILD_FEATURE", at::RecordScope::BUILD_FEATURE)
.value("LITE_INTERPRETER", at::RecordScope::LITE_INTERPRETER)
.value("USER_SCOPE", at::RecordScope::USER_SCOPE)
.value("STATIC_RUNTIME_OP", at::RecordScope::STATIC_RUNTIME_OP)
.value("STATIC_RUNTIME_MODEL", at::RecordScope::STATIC_RUNTIME_MODEL);
py::enum_<ProfilerState>(m, "ProfilerState")
.value("Disabled", ProfilerState::Disabled)
.value("CPU", ProfilerState::CPU)
@ -122,15 +135,27 @@ void initPythonBindings(PyObject* module) {
return py::reinterpret_borrow<py::object>(thp_device);
});
py::class_<ExtraFields<EventType::TorchOp>>(m, "_ExtraFields_TorchOp")
.def_readonly("inputs", &ExtraFields<EventType::TorchOp>::inputs_)
.def_readonly(
"allow_tf32_cublas",
&ExtraFields<EventType::TorchOp>::allow_tf32_cublas_);
using torch_op_t = ExtraFields<EventType::TorchOp>;
py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
.def_readonly("inputs", &torch_op_t::inputs_)
.def_readonly("scope", &torch_op_t::scope_)
.def_readonly("sequence_number", &torch_op_t::sequence_number_)
.def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend");
py::class_<ExtraFields<EventType::Allocation>>(m, "_ExtraFields_Allocation");
using allocation_t = ExtraFields<EventType::Allocation>;
py::class_<allocation_t>(m, "_ExtraFields_Allocation")
.def_property_readonly(
"ptr",
[](const allocation_t& a) {
return reinterpret_cast<intptr_t>(a.ptr_);
})
.def_readonly("alloc_size", &allocation_t::alloc_size_)
.def_readonly("device_type", &allocation_t::device_type_)
.def_readonly("device_index", &allocation_t::device_index_)
.def_readonly("total_allocated", &allocation_t::total_allocated_)
.def_readonly("total_reserved", &allocation_t::total_reserved_);
py::class_<NNModuleInfo>(m, "_NNModuleInfo")
.def_property_readonly(
@ -149,7 +174,8 @@ void initPythonBindings(PyObject* module) {
py::class_<ExtraFields<EventType::PyCall>>(m, "_ExtraFields_PyCall")
.def_readonly("module", &ExtraFields<EventType::PyCall>::module_)
.def_readonly("callsite", &ExtraFields<EventType::PyCall>::callsite_)
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_)
.def_readonly("module", &ExtraFields<EventType::PyCall>::module_);
py::class_<ExtraFields<EventType::PyCCall>>(m, "_ExtraFields_PyCCall")
.def_readonly("caller", &ExtraFields<EventType::PyCall>::caller_);
@ -162,6 +188,9 @@ void initPythonBindings(PyObject* module) {
return s.funcname_.str();
});
py::class_<ExtraFields<EventType::OutOfMemory>>(
m, "_ExtraFields_OutOfMemory");
py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto");
py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent")

View file

@ -1,4 +1,4 @@
r'''
r"""
PyTorch Profiler is a tool that allows the collection of performance metrics during training and inference.
Profiler's context manager API can be used to better understand what model operators are the most expensive,
examine their input shapes and stack traces, study device kernel activity and visualize the execution trace.
@ -6,16 +6,32 @@ examine their input shapes and stack traces, study device kernel activity and vi
.. note::
An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
'''
from .profiler import profile, _KinetoProfile, \
schedule, supported_activities, tensorboard_trace_handler, ProfilerAction, \
ExecutionGraphObserver
from torch._C._autograd import kineto_available, _supported_activities, DeviceType
from torch._C._profiler import ProfilerActivity, _ExperimentalConfig
"""
from torch._C._autograd import _supported_activities, DeviceType, kineto_available
from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
from torch.autograd.profiler import record_function
__all__ = ['profile', 'schedule', 'supported_activities',
'tensorboard_trace_handler', 'ProfilerAction', 'ProfilerActivity',
'kineto_available', 'DeviceType', 'record_function', 'ExecutionGraphObserver']
from .profiler import (
_KinetoProfile,
ExecutionGraphObserver,
profile,
ProfilerAction,
schedule,
supported_activities,
tensorboard_trace_handler,
)
__all__ = [
"profile",
"schedule",
"supported_activities",
"tensorboard_trace_handler",
"ProfilerAction",
"ProfilerActivity",
"kineto_available",
"DeviceType",
"record_function",
"ExecutionGraphObserver",
]
from . import itt