From 73fbca1ea6ecc08ae4455a12b68fc2ead93a088c Mon Sep 17 00:00:00 2001
From: Sherlock Huang <bahuang@fb.com>
Date: Tue, 20 Sep 2022 20:49:22 +0000
Subject: [PATCH] Improve and expose cpp_backtrace to python binding (#84896)

We can now get the C++ stack trace from Python by calling torch.utils.get_cpp_backtrace().

Sample output when calling from a torch_dispatch stack (with TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO=1 set, so each frame shows file:line):
```
<omitting python frames>
frame #23: torch::handle_torch_function_no_python_arg_parser(c10::ArrayRef<pybind11::handle>, _object*, _object*, char const*, _object*, char const*, torch::TorchFunctionName) (0x7f69330bab90 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/utils/python_arg_parser.cpp:323)
frame #24: <unknown function> (0x7f6932a09e79 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/python_variable.cpp:2252)
frame #25: <unknown function> (0x7f69261aee33 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/PythonFallbackKernel.cpp:56)
frame #26: <unknown function> (0x7f69261afef9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:19)
frame #27: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41)
frame #28: <unknown function> (0x7f6926fae9b9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/boxing.h:227)
frame #29: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&)> const&, c10::DispatchKeySet, at::Tensor const&) const (0x7f6926e821f5 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:106)
frame #30: at::_ops::alias::redispatch(c10::DispatchKeySet, at::Tensor const&) (0x7f6927142c31 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:438)
frame #31: <unknown function> (0x7f692ae4f8be in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:1361)
frame #32: <unknown function> (0x7f692ae4f9b1 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:1362)
frame #33: <unknown function> (0x7f692aef77e9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h:13)
frame #34: <unknown function> (0x7f6926fae7d8 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:50)
frame #35: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&)> const&, c10::DispatchKeySet, at::Tensor const&) const (0x7f6926e821c9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:97)
frame #36: at::_ops::alias::redispatch(c10::DispatchKeySet, at::Tensor const&) (0x7f6927142c31 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:438)
frame #37: <unknown function> (0x7f6929ec654a in /fsx/users/bahuang/repos/pytorch_fsx/build/aten/src/ATen/RedispatchFunctions.h:10697)
frame #38: <unknown function> (0x7f6929d9edae in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/VariableType_1.cpp:2837)
frame #39: <unknown function> (0x7f6929d9f043 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/VariableType_1.cpp:2838)
frame #40: <unknown function> (0x7f6929e7d2f9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h:13)
frame #41: <unknown function> (0x7f6929eb1344 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:478)
frame #42: <unknown function> (0x7f6929ea7b99 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:490)
frame #43: <unknown function> (0x7f6929e7d370 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:563)
frame #44: <unknown function> (0x7f6929e7d43a in /fsx/users/bahuang/repos/pytorch_fsx/c10/util/C++17.h:239)
frame #45: <unknown function> (0x7f6929e7d48c in /fsx/users/bahuang/repos/pytorch_fsx/c10/util/C++17.h:364)
frame #46: <unknown function> (0x7f6929e7d50a in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:554)
frame #47: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41)
frame #48: c10::KernelFunction::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadd26 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:43)
frame #49: c10::Dispatcher::redispatchBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f692603890a in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:652)
frame #50: <unknown function> (0x7f69260387f9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:388)
frame #51: <unknown function> (0x7f69261af0ef in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/PythonFallbackKernel.cpp:96)
frame #52: <unknown function> (0x7f69261aff2b in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:25)
frame #53: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41)
frame #54: c10::KernelFunction::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadd26 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:43)
frame #55: c10::Dispatcher::callBoxed(c10::OperatorHandle const&, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6925fd6ab2 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:628)
frame #56: <unknown function> (0x7f6925fd6690 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:376)
frame #57: <unknown function> (0x7f692bf5b525 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:380)
frame #58: <unknown function> (0x7f692bf59fac in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/runtime/register_c10_ops.cpp:15)
frame #59: <unknown function> (0x7f692bf5af41 in /usr/include/c++/7/bits/std_function.h:316)
frame #60: std::function<void (std::vector<c10::IValue, std::allocator<c10::IValue> >&)>::operator()(std::vector<c10::IValue, std::allocator<c10::IValue> >&) const (0x7f6932ab9a0f in /usr/include/c++/7/bits/std_function.h:706)
frame #61: <unknown function> (0x7f6932aad541 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/stack.h:41)
frame #62: <unknown function> (0x7f6932ab3102 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/pybind_utils.h:1206 (discriminator 1))
frame #63: <unknown function> (0x7f6932ab3943 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/pybind_utils.h:1272)
frame #64: <unknown function> (0x7f6932a46120 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/init.cpp:1767)
frame #65: <unknown function> (0x7f6932a997be in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/cast.h:1441)
frame #66: <unknown function> (0x7f6932a8a985 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/cast.h:1410)
frame #67: <unknown function> (0x7f6932a66e1e in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:249)
frame #68: <unknown function> (0x7f6932a66ec2 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:224)
frame #69: <unknown function> (0x7f6932473111 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:929)
frame #104: __libc_start_main (0x7f693485dc87 in /build/glibc-uZu3wS/glibc-2.27/csu/../csu/libc-start.c:310)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/84896
Approved by: https://github.com/ezyang
---
 CONTRIBUTING.md              |   1 +
 c10/util/Backtrace.cpp       | 100 ++++++++++++++++++++++++++++++++---
 torch/_C/__init__.pyi.in     |   1 +
 torch/csrc/Module.cpp        |  14 +++++
 torch/utils/__init__.py      |   1 +
 torch/utils/cpp_backtrace.py |  11 ++++
 6 files changed, 121 insertions(+), 7 deletions(-)
 create mode 100644 torch/utils/cpp_backtrace.py
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e2101017d99..de599e50138 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -924,6 +924,7 @@ add-auto-load-safe-path /path/to/pytorch/.gdbinit
 
 ### C++ stacktraces
 Set `TORCH_SHOW_CPP_STACKTRACES=1` to get the C++ stacktrace when an error occurs in Python.
+Set `TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO=1` to include the source file and line number of each frame in the C++ stacktrace (resolved by running `addr2line`; not supported on macOS or Android).
 
 ## CUDA development tips
 
diff --git a/c10/util/Backtrace.cpp b/c10/util/Backtrace.cpp
index d19f8d0ba5f..0afd0fdc7b2 100644
--- a/c10/util/Backtrace.cpp
+++ b/c10/util/Backtrace.cpp
@@ -1,6 +1,7 @@
 #include <c10/util/Backtrace.h>
 #include <c10/util/Optional.h>
 #include <c10/util/Type.h>
+#include <c10/util/env.h>
 #include <c10/util/irange.h>
 
 #include <functional>
@@ -21,7 +22,14 @@
 #include <dlfcn.h>
 #include <unwind.h>
 #else
+#include <dlfcn.h>
 #include <execinfo.h>
+
+#ifndef __APPLE__
+// link.h is not available on iOS and macOS builds
+#include <link.h>
+#endif
+
 #endif
 #endif
 
@@ -87,6 +95,46 @@ void dump_stack(
 #if SUPPORTS_BACKTRACE
 namespace {
 
+#if !defined(C10_ANDROID) && !defined(__APPLE__)
+
+// Converts a function's runtime address to its VMA in the ELF file (what
+// addr2line expects). Returns `addr` unchanged if the lookup fails.
+size_t ConvertToVMA(size_t addr) {
+  Dl_info info;
+  link_map* map = nullptr; // stays null if dladdr1 cannot resolve `addr`
+  const int ok = dladdr1((void*)addr, &info, (void**)&map, RTLD_DL_LINKMAP);
+  return (ok != 0 && map != nullptr) ? addr - map->l_addr : addr;
+}
+
+std::string exec(const char* cmd) {
+  // Runs `cmd` via popen and returns everything it wrote to stdout. Returns ""
+  // when the pipe cannot be opened instead of throwing: this runs while a
+  // backtrace is being formatted, so it must not raise an error of its own.
+  std::array<char, 128> chunk;
+  std::string output;
+  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
+  while (pipe && fgets(chunk.data(), chunk.size(), pipe.get()) != nullptr) {
+    output += chunk.data();
+  }
+  return output;
+}
+
+std::string rstrip(const std::string& s) {
+  // Returns `s` without trailing ASCII whitespace ("" if nothing else is left).
+  const auto last = s.find_last_not_of(" \n\r\t\f\v");
+  return last == std::string::npos ? std::string() : s.substr(0, last + 1);
+}
+
+bool use_addr2line() {
+  // Query the environment variable once; the static keeps the answer so later
+  // calls return the cached value without consulting the environment again.
+  static const bool enabled =
+      c10::utils::check_env("TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO") == true;
+  return enabled;
+}
+
+#endif // !defined(C10_ANDROID) && !defined(__APPLE__)
+
 struct FrameInformation {
   /// If available, the demangled name of the function at this frame, else
   /// whatever (possibly mangled) name we got from `backtrace()`.
@@ -99,6 +147,10 @@ struct FrameInformation {
   /// NOTE: In debugger parlance, the "object file" refers to the ELF file that
   /// the symbol originates from, i.e. either an executable or a library.
   std::string object_file;
+  /// "<file>:<line>" as printed by addr2line; empty when it was not resolved.
+  std::string source_file_lineno;
+
+  bool is_python_frame;
 };
 
 #ifndef C10_ANDROID
@@ -108,7 +160,8 @@ bool is_python_frame(const FrameInformation& frame) {
 }
 
 c10::optional<FrameInformation> parse_frame_information(
-    const std::string& frame_string) {
+    const std::string& frame_string,
+    void* frame_pointer) {
   FrameInformation frame;
 
   // This is the function name in the CXX ABI mangled format, e.g. something
@@ -141,6 +194,7 @@ c10::optional<FrameInformation> parse_frame_information(
   frame.object_file = frame_string.substr(0, function_name_start - 1);
   frame.offset_into_function =
       frame_string.substr(offset_start, offset_end - offset_start);
+  frame.is_python_frame = is_python_frame(frame);
 
   // NOTE: We don't need to parse the return address because
   // we already have it from the call to `backtrace()`.
@@ -171,6 +225,30 @@ c10::optional<FrameInformation> parse_frame_information(
   }
 
   frame.function_name = demangle(mangled_function_name.c_str());
+
+#if !defined(__APPLE__)
+
+  if (use_addr2line() && !frame.is_python_frame) {
+    Dl_info info;
+    if (dladdr(frame_pointer, &info)) {
+      char command[256];
+      size_t VMA_addr = ConvertToVMA((size_t)frame_pointer);
+      // Need to decrease the VMA address by 1 to get the correct line number
+      // https://stackoverflow.com/questions/11579509/wrong-line-numbers-from-addr2line/63841497#63841497
+      VMA_addr -= 1;
+      snprintf(
+          command,
+          sizeof(command),
+          "addr2line -e %s -C %zx",
+          info.dli_fname,
+          VMA_addr);
+
+      frame.source_file_lineno = rstrip(exec(command));
+    }
+  }
+
+#endif // !defined(__APPLE__)
+
   return frame;
 }
 #endif /* !defined(C10_ANDROID) */
@@ -283,9 +361,10 @@ std::string get_backtrace(
   bool has_skipped_python_frames = false;
 
   for (const auto frame_number : c10::irange(callstack.size())) {
-    const auto frame = parse_frame_information(symbols[frame_number]);
+    const auto frame =
+        parse_frame_information(symbols[frame_number], callstack[frame_number]);
 
-    if (skip_python_frames && frame && is_python_frame(*frame)) {
+    if (skip_python_frames && frame && frame->is_python_frame) {
       if (!has_skipped_python_frames) {
         stream << "<omitting python frames>\n";
         has_skipped_python_frames = true;
@@ -297,10 +376,17 @@ std::string get_backtrace(
     stream << "frame #" << frame_number << ": ";
 
     if (frame) {
-      // <function_name> + <offset> (<return-address> in <object-file>)
-      stream << frame->function_name << " + " << frame->offset_into_function
-             << " (" << callstack[frame_number] << " in " << frame->object_file
-             << ")\n";
+      if (frame->source_file_lineno.empty()) {
+        // <function_name> + <offset> (<return-address> in <object-file>)
+        stream << frame->function_name << " + " << frame->offset_into_function
+               << " (" << callstack[frame_number] << " in "
+               << frame->object_file << ")\n";
+
+      } else {
+        // <function_name> (<return-address> in <filename>:<line-number>)
+        stream << frame->function_name << " (" << callstack[frame_number]
+               << " in " << frame->source_file_lineno << ")\n";
+      }
     } else {
       // In the edge-case where we couldn't parse the frame string, we can
       // just use it directly (it may have a different format).
diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in
index 02f33ee94e0..9117099cf85 100644
--- a/torch/_C/__init__.pyi.in
+++ b/torch/_C/__init__.pyi.in
@@ -843,6 +843,7 @@ def _remove_meta_from_tls_dispatch_include() -> None: ...
 # https://code.activestate.com/lists/python-dev/139675/
 def _to_dlpack(data: Tensor) -> Any: ...  # THPModule_toDLPack
 def _from_dlpack(data: Any) -> Tensor: ...  # THPModule_fromDLPack
+def _get_cpp_backtrace(frames_to_skip: _int, maximum_number_of_frames: _int) -> str: ...  # THModule_getCppBacktrace
 def set_flush_denormal(arg: _bool) -> _bool: ...  # THPModule_setFlushDenormal
 def get_default_dtype() -> _dtype: ...  # THPModule_getDefaultDtype
 def _get_default_device() -> str: ...  # THPModule_getDefaultDevice
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 5a7263d621a..b2399bf8105 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -425,6 +425,19 @@ PyObject* THPModule_fromDLPack(PyObject* _unused, PyObject* data) {
   END_HANDLE_TH_ERRORS
 }
 
+PyObject* THModule_getCppBacktrace(PyObject* _unused, PyObject* args) {
+  HANDLE_TH_ERRORS
+  // "K" writes an unsigned long long, so the targets must have that exact
+  // type; size_t is not guaranteed to be as wide and is only used afterwards.
+  unsigned long long skip = 0, limit = 0;
+  if (!PyArg_ParseTuple(args, "KK", &skip, &limit)) {
+    return nullptr;
+  }
+  return THPUtils_packString(c10::get_backtrace(
+      static_cast<size_t>(skip), static_cast<size_t>(limit), true));
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject* THPModule_setAllowTF32CuDNN(PyObject* _unused, PyObject* arg) {
   THPUtils_assert(
       PyBool_Check(arg),
@@ -866,6 +879,7 @@ static PyMethodDef TorchMethods[] = {
      nullptr},
     {"_to_dlpack", THPModule_toDLPack, METH_O, nullptr},
     {"_from_dlpack", THPModule_fromDLPack, METH_O, nullptr},
+    {"_get_cpp_backtrace", THModule_getCppBacktrace, METH_VARARGS, nullptr},
     {"set_flush_denormal", THPModule_setFlushDenormal, METH_O, nullptr},
     {"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr},
     {"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr},
diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py
index 2d5649a653c..f05ffc3fc96 100644
--- a/torch/utils/__init__.py
+++ b/torch/utils/__init__.py
@@ -3,6 +3,7 @@ import sys
 
 from .throughput_benchmark import ThroughputBenchmark
 from ._crash_handler import enable_minidumps, disable_minidumps, enable_minidumps_on_exceptions
+from .cpp_backtrace import get_cpp_backtrace
 
 # Set the module for a given object for nicer printing
 def set_module(obj, mod):
diff --git a/torch/utils/cpp_backtrace.py b/torch/utils/cpp_backtrace.py
new file mode 100644
index 00000000000..d45c216adca
--- /dev/null
+++ b/torch/utils/cpp_backtrace.py
@@ -0,0 +1,11 @@
+from torch._C import _get_cpp_backtrace
+
+def get_cpp_backtrace(frames_to_skip: int = 0, maximum_number_of_frames: int = 64) -> str:
+    r"""
+    Return a string containing the C++ stack trace of the current thread.
+
+    Args:
+        frames_to_skip (int): the number of frames to skip from the top of the stack
+        maximum_number_of_frames (int): the maximum number of frames to return
+    """
+    return _get_cpp_backtrace(frames_to_skip, maximum_number_of_frames)