From 014a333df37ca331d4ae969d200aece76b1d4536 Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@fb.com>
Date: Fri, 26 Aug 2022 10:33:15 -0700
Subject: [PATCH] [Profiler][Minor] Extend Python bindings (#83622)

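Expose additional profiler fields to Python; they are needed for memory
profiling:

* `RecordScope` enum (also re-exported from `torch.profiler`), plus `scope`
  and `sequence_number` on `_ExtraFields_TorchOp`.
* `ptr`, `alloc_size`, `device_type`, `device_index`, `total_allocated` and
  `total_reserved` on `_ExtraFields_Allocation`.
* A (currently field-less) `_ExtraFields_OutOfMemory` binding.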

Differential Revision: [D38528382](https://our.internmc.facebook.com/intern/diff/D38528382/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/83622
Approved by: https://github.com/Gamrix
---
 test/test_profiler.py               | 44 +++++++++++++++++++++++++++--
 torch/_C/_profiler.pyi              | 38 +++++++++++++++++--------
 torch/csrc/profiler/python/init.cpp | 43 +++++++++++++++++++++++-----
 torch/profiler/__init__.py          | 36 ++++++++++++++++-------
 4 files changed, 130 insertions(+), 31 deletions(-)

diff --git a/test/test_profiler.py b/test/test_profiler.py
index 63e370208d7..378afcab310 100644
--- a/test/test_profiler.py
+++ b/test/test_profiler.py
@@ -1283,10 +1283,11 @@ class TestTorchTidyProfiler(TestCase):
 
     def test_tensor_properties(self):
         x = torch.ones(10, 10).as_strided([4, 4], [12, 3])
-        y = torch.ones(4, 1)
+        y = torch.ones(4, 1, requires_grad=True)
 
         with profile(with_stack=True, profile_memory=True, record_shapes=True) as p:
             _ = x + y
+            _ = x * y
 
         nodes = p.profiler.kineto_results.experimental_event_tree()
         node = find_node_with_name(nodes, "aten::add")
@@ -1306,6 +1307,13 @@ class TestTorchTidyProfiler(TestCase):
         self.assertEqual(layout_info, [torch.strided, torch.strided, None])
         device_info = [x.device if x else None for x in input_info.tensor_metadata]
         self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
+        self.assertEqual(node.extra_fields.scope, torch.profiler.RecordScope.FUNCTION)
+
+        mul_node = find_node_with_name(nodes, "aten::mul")
+        self.assertIsNotNone(mul_node)
+        self.assertEqual(
+            node.extra_fields.sequence_number + 1,
+            mul_node.extra_fields.sequence_number)
 
     def test_sparse_tensors(self):
         i = [[0, 1, 1], [2, 0, 2]]
@@ -1363,7 +1371,6 @@ class TestTorchTidyProfiler(TestCase):
         device_info = [x.device if x else None for x in input_info.tensor_metadata]
         self.assertEqual(device_info, [torch.device("cpu"), torch.device("cpu"), None])
 
-
     def test_scalar_ins(self):
         x = torch.ones(5, 5)
         alpha = 0.9
@@ -1415,6 +1422,39 @@ class TestTorchTidyProfiler(TestCase):
         expected += [(name, val.storage().data_ptr()) for name, val in net.fc2._parameters.items()]
         self.assertEqual(expected, params, f"{expected} vs. {params}")
 
+    def test_allocations(self):
+        gc.collect()
+        with profile(profile_memory=True) as p:
+            x = torch.empty((3, 4))
+
+        nodes = p.profiler.kineto_results.experimental_event_tree()
+        node = find_node_with_name(nodes, "[memory]")
+        self.assertIsNotNone(node)
+
+        alloc_size = 3 * 4 * 4  # fp32 -> 4 bytes
+        ptr = node.extra_fields.ptr
+        self.assertGreater(ptr, 0)
+        self.assertEqual(node.extra_fields.alloc_size, alloc_size)
+        self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
+        self.assertEqual(node.extra_fields.device_index, -1)
+        total_allocated = node.extra_fields.total_allocated
+
+        # total_reserved is only for CUDACachingAllocator
+        self.assertEqual(node.extra_fields.total_reserved, 0)
+
+        with profile(profile_memory=True) as p:
+            del x
+            gc.collect()
+
+        nodes = p.profiler.kineto_results.experimental_event_tree()
+        node = find_node_with_name(nodes, "[memory]")
+        self.assertIsNotNone(node)
+
+        self.assertEqual(node.extra_fields.ptr, ptr)
+        self.assertEqual(node.extra_fields.alloc_size, -alloc_size)
+        self.assertEqual(node.extra_fields.device_type, torch._C._autograd.DeviceType.CPU)
+        self.assertEqual(node.extra_fields.device_index, -1)
+        self.assertEqual(node.extra_fields.total_allocated, total_allocated - alloc_size)
 
 
 @dataclass(frozen=True)
diff --git a/torch/_C/_profiler.pyi b/torch/_C/_profiler.pyi
index 1690490604c..06676df7ce4 100644
--- a/torch/_C/_profiler.pyi
+++ b/torch/_C/_profiler.pyi
@@ -3,6 +3,18 @@ from typing import List, Union
 
 # defined in torch/csrc/profiler/python/init.cpp
 
+class RecordScope(Enum):
+    FUNCTION = ...
+    BACKWARD_FUNCTION = ...
+    TORCHSCRIPT_FUNCTION = ...
+    KERNEL_FUNCTION_DTYPE = ...
+    CUSTOM_CLASS = ...
+    BUILD_FEATURE = ...
+    LITE_INTERPRETER = ...
+    USER_SCOPE = ...
+    STATIC_RUNTIME_OP = ...
+    STATIC_RUNTIME_MODEL = ...
+
 class ProfilerState(Enum):
     Disable = ...
     CPU = ...
@@ -12,8 +24,7 @@ class ProfilerState(Enum):
     KINETO = ...
     KINETO_GPU_FALLBACK = ...
 
-class ActiveProfilerType:
-    ...
+class ActiveProfilerType: ...
 
 class ProfilerActivity(Enum):
     CPU = ...
@@ -50,9 +61,15 @@ class _ProfilerEvent:
     duration_time_ns: int
     parent: _ProfilerEvent
     children: List[_ProfilerEvent]
-    extra_fields: Union[_ExtraFields_Allocation, _ExtraFields_Backend,
-                        _ExtraFields_PyCall, _ExtraFields_PyCCall,
-                        _ExtraFields_TorchOp]
+    extra_fields: Union[
+        _ExtraFields_TorchOp,
+        _ExtraFields_Backend,
+        _ExtraFields_Allocation,
+        _ExtraFields_OutOfMemory,
+        _ExtraFields_PyCall,
+        _ExtraFields_PyCCall,
+        _ExtraFields_Kineto,
+    ]
     def name(self) -> str: ...
     ...
 
@@ -79,11 +96,21 @@ class _ExtraFields_TorchOp:
     inputs: _Inputs
+    scope: RecordScope
+    sequence_number: int
     ...
 
-class _ExtraFields_Backend:
-    ...
-
-class _ExtraFields_Allocation:
-    ...
+class _ExtraFields_Backend: ...
+
+class _ExtraFields_Allocation:
+    # Read-only attributes bound in torch/csrc/profiler/python/init.cpp.
+    # NOTE(review): `device_type` is bound too; declare it once DeviceType is importable in this stub.
+    ptr: int
+    alloc_size: int
+    device_index: int
+    total_allocated: int
+    total_reserved: int
+    ...
+
+class _ExtraFields_OutOfMemory: ...
 
 class _ExtraFields_PyCCall:
     caller: _PyFrameState
@@ -93,5 +120,4 @@ class _ExtraFields_PyCall:
     caller: _PyFrameState
     ...
 
-class _ExtraFields_Kineto:
-    ...
+class _ExtraFields_Kineto: ...
diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp
index e4cd54182d5..43153f2c163 100644
--- a/torch/csrc/profiler/python/init.cpp
+++ b/torch/csrc/profiler/python/init.cpp
@@ -1,5 +1,6 @@
 #include <torch/csrc/profiler/python/init.h>
 
+#include <ATen/record_function.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/jit/python/pybind_utils.h>
 #include <torch/csrc/profiler/collection.h>
@@ -14,6 +15,18 @@ void initPythonBindings(PyObject* module) {
 
   using namespace torch::profiler::impl;
 
+  py::enum_<at::RecordScope>(m, "RecordScope")
+      .value("FUNCTION", at::RecordScope::FUNCTION)
+      .value("BACKWARD_FUNCTION", at::RecordScope::BACKWARD_FUNCTION)
+      .value("TORCHSCRIPT_FUNCTION", at::RecordScope::TORCHSCRIPT_FUNCTION)
+      .value("KERNEL_FUNCTION_DTYPE", at::RecordScope::KERNEL_FUNCTION_DTYPE)
+      .value("CUSTOM_CLASS", at::RecordScope::CUSTOM_CLASS)
+      .value("BUILD_FEATURE", at::RecordScope::BUILD_FEATURE)
+      .value("LITE_INTERPRETER", at::RecordScope::LITE_INTERPRETER)
+      .value("USER_SCOPE", at::RecordScope::USER_SCOPE)
+      .value("STATIC_RUNTIME_OP", at::RecordScope::STATIC_RUNTIME_OP)
+      .value("STATIC_RUNTIME_MODEL", at::RecordScope::STATIC_RUNTIME_MODEL);
+
   py::enum_<ProfilerState>(m, "ProfilerState")
       .value("Disabled", ProfilerState::Disabled)
       .value("CPU", ProfilerState::CPU)
@@ -122,15 +135,27 @@ void initPythonBindings(PyObject* module) {
         return py::reinterpret_borrow<py::object>(thp_device);
       });
 
-  py::class_<ExtraFields<EventType::TorchOp>>(m, "_ExtraFields_TorchOp")
-      .def_readonly("inputs", &ExtraFields<EventType::TorchOp>::inputs_)
-      .def_readonly(
-          "allow_tf32_cublas",
-          &ExtraFields<EventType::TorchOp>::allow_tf32_cublas_);
+  using torch_op_t = ExtraFields<EventType::TorchOp>;
+  py::class_<torch_op_t>(m, "_ExtraFields_TorchOp")
+      .def_readonly("inputs", &torch_op_t::inputs_)
+      .def_readonly("scope", &torch_op_t::scope_)
+      .def_readonly("sequence_number", &torch_op_t::sequence_number_)
+      .def_readonly("allow_tf32_cublas", &torch_op_t::allow_tf32_cublas_);
 
   py::class_<ExtraFields<EventType::Backend>>(m, "_ExtraFields_Backend");
 
-  py::class_<ExtraFields<EventType::Allocation>>(m, "_ExtraFields_Allocation");
+  using allocation_t = ExtraFields<EventType::Allocation>;
+  py::class_<allocation_t>(m, "_ExtraFields_Allocation")
+      .def_property_readonly(
+          "ptr",
+          [](const allocation_t& a) {
+            return reinterpret_cast<intptr_t>(a.ptr_);
+          })
+      .def_readonly("alloc_size", &allocation_t::alloc_size_)
+      .def_readonly("device_type", &allocation_t::device_type_)
+      .def_readonly("device_index", &allocation_t::device_index_)
+      .def_readonly("total_allocated", &allocation_t::total_allocated_)
+      .def_readonly("total_reserved", &allocation_t::total_reserved_);
 
   py::class_<NNModuleInfo>(m, "_NNModuleInfo")
       .def_property_readonly(
@@ -162,6 +187,9 @@ void initPythonBindings(PyObject* module) {
         return s.funcname_.str();
       });
 
+  py::class_<ExtraFields<EventType::OutOfMemory>>(
+      m, "_ExtraFields_OutOfMemory");
+
   py::class_<ExtraFields<EventType::Kineto>>(m, "_ExtraFields_Kineto");
 
   py::class_<Result, std::shared_ptr<Result>>(m, "_ProfilerEvent")
diff --git a/torch/profiler/__init__.py b/torch/profiler/__init__.py
index aa6b3b72237..a0185a9799b 100644
--- a/torch/profiler/__init__.py
+++ b/torch/profiler/__init__.py
@@ -1,4 +1,4 @@
-r'''
+r"""
 PyTorch Profiler is a tool that allows the collection of performance metrics during training and inference.
 Profiler's context manager API can be used to better understand what model operators are the most expensive,
 examine their input shapes and stack traces, study device kernel activity and visualize the execution trace.
@@ -6,16 +6,32 @@ examine their input shapes and stack traces, study device kernel activity and vi
 .. note::
     An earlier version of the API in :mod:`torch.autograd` module is considered legacy and will be deprecated.
 
-'''
-from .profiler import profile, _KinetoProfile, \
-    schedule, supported_activities, tensorboard_trace_handler, ProfilerAction, \
-    ExecutionGraphObserver
-from torch._C._autograd import kineto_available, _supported_activities, DeviceType
-from torch._C._profiler import ProfilerActivity, _ExperimentalConfig
+"""
+from torch._C._autograd import _supported_activities, DeviceType, kineto_available
+from torch._C._profiler import _ExperimentalConfig, ProfilerActivity, RecordScope
 from torch.autograd.profiler import record_function
 
-__all__ = ['profile', 'schedule', 'supported_activities',
-           'tensorboard_trace_handler', 'ProfilerAction', 'ProfilerActivity',
-           'kineto_available', 'DeviceType', 'record_function', 'ExecutionGraphObserver']
+from .profiler import (
+    _KinetoProfile,
+    ExecutionGraphObserver,
+    profile,
+    ProfilerAction,
+    schedule,
+    supported_activities,
+    tensorboard_trace_handler,
+)
+
+__all__ = [
+    "profile",
+    "schedule",
+    "supported_activities",
+    "tensorboard_trace_handler",
+    "ProfilerAction",
+    "ProfilerActivity",
+    "kineto_available",
+    "DeviceType",
+    "record_function",
+    "ExecutionGraphObserver",
+]
 
 from . import itt