2020-09-25 18:35:39 +00:00
|
|
|
#include <gtest/gtest.h>
|
|
|
|
|
|
2019-08-12 21:48:06 +00:00
|
|
|
#include <ATen/ATen.h>
|
|
|
|
|
#include <ATen/Parallel.h>
|
2020-03-26 18:15:49 +00:00
|
|
|
#include <ATen/core/interned_strings.h>
|
|
|
|
|
#include <ATen/core/ivalue.h>
|
2019-08-12 21:48:06 +00:00
|
|
|
|
2019-03-15 20:53:23 +00:00
|
|
|
#include "test/cpp/jit/test_utils.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
|
2020-02-27 20:18:24 +00:00
|
|
|
#include <torch/csrc/jit/ir/type_hashing.h>
|
2020-03-26 18:15:49 +00:00
|
|
|
#include <torch/csrc/jit/passes/canonicalize.h>
|
2018-12-26 14:52:25 +00:00
|
|
|
#include "torch/csrc/autograd/generated/variable_factories.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
#include "torch/csrc/autograd/variable.h"
|
2020-02-27 20:18:24 +00:00
|
|
|
#include "torch/csrc/jit/codegen/fuser/interface.h"
|
2020-03-26 18:15:49 +00:00
|
|
|
#include "torch/csrc/jit/frontend/code_template.h"
|
|
|
|
|
#include "torch/csrc/jit/frontend/tracer.h"
|
2020-02-27 20:18:24 +00:00
|
|
|
#include "torch/csrc/jit/ir/alias_analysis.h"
|
2020-03-26 18:15:49 +00:00
|
|
|
#include "torch/csrc/jit/ir/attributes.h"
|
|
|
|
|
#include "torch/csrc/jit/ir/irparser.h"
|
|
|
|
|
#include "torch/csrc/jit/ir/scope.h"
|
2020-06-05 20:41:53 +00:00
|
|
|
#include "torch/csrc/jit/jit_log.h"
|
2019-06-10 18:40:49 +00:00
|
|
|
#include "torch/csrc/jit/passes/bailout_graph.h"
|
2018-11-15 01:20:36 +00:00
|
|
|
#include "torch/csrc/jit/passes/common_subexpression_elimination.h"
|
2018-11-07 07:17:01 +00:00
|
|
|
#include "torch/csrc/jit/passes/constant_propagation.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h"
|
|
|
|
|
#include "torch/csrc/jit/passes/dead_code_elimination.h"
|
2018-12-13 15:51:08 +00:00
|
|
|
#include "torch/csrc/jit/passes/graph_fuser.h"
|
2019-06-03 16:36:49 +00:00
|
|
|
#include "torch/csrc/jit/passes/guard_elimination.h"
|
2019-12-10 23:37:39 +00:00
|
|
|
#include "torch/csrc/jit/passes/inline_autodiff_subgraphs.h"
|
2019-05-20 17:37:49 +00:00
|
|
|
#include "torch/csrc/jit/passes/insert_guards.h"
|
2019-06-13 00:19:26 +00:00
|
|
|
#include "torch/csrc/jit/passes/liveness.h"
|
2020-06-10 20:46:11 +00:00
|
|
|
#include "torch/csrc/jit/passes/loop_unrolling.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
#include "torch/csrc/jit/passes/lower_grad_of.h"
|
2018-11-18 17:20:29 +00:00
|
|
|
#include "torch/csrc/jit/passes/lower_tuples.h"
|
2020-03-26 18:15:49 +00:00
|
|
|
#include "torch/csrc/jit/passes/pass_manager.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
#include "torch/csrc/jit/passes/requires_grad_analysis.h"
|
|
|
|
|
#include "torch/csrc/jit/passes/shape_analysis.h"
|
2018-11-15 01:20:36 +00:00
|
|
|
#include "torch/csrc/jit/passes/utils/subgraph_utils.h"
|
2020-03-26 18:15:49 +00:00
|
|
|
#include "torch/csrc/jit/runtime/argument_spec.h"
|
|
|
|
|
#include "torch/csrc/jit/runtime/autodiff.h"
|
|
|
|
|
#include "torch/csrc/jit/runtime/custom_operator.h"
|
|
|
|
|
#include "torch/csrc/jit/runtime/interpreter.h"
|
2020-02-27 20:18:24 +00:00
|
|
|
#include "torch/csrc/jit/runtime/symbolic_script.h"
|
2020-03-26 18:15:49 +00:00
|
|
|
#include "torch/csrc/jit/serialization/import.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
|
|
|
|
|
#include "torch/csrc/autograd/engine.h"
|
|
|
|
|
#include "torch/csrc/autograd/variable.h"
|
|
|
|
|
|
2020-08-29 06:29:27 +00:00
|
|
|
#include <torch/csrc/jit/runtime/graph_executor.h>
|
2019-03-12 18:25:37 +00:00
|
|
|
#include <torch/csrc/jit/testing/file_check.h>
|
2020-02-13 02:38:55 +00:00
|
|
|
#include <torch/script.h>
|
2020-08-29 06:29:27 +00:00
|
|
|
|
2020-02-27 20:18:24 +00:00
|
|
|
#include "torch/csrc/jit/api/module.h"
|
2020-03-26 18:15:49 +00:00
|
|
|
#include "torch/csrc/jit/frontend/ir_emitter.h"
|
|
|
|
|
#include "torch/csrc/jit/runtime/profiling_record.h"
|
2019-04-11 20:30:42 +00:00
|
|
|
#include "torch/jit.h"
|
2018-10-07 05:58:28 +00:00
|
|
|
|
|
|
|
|
#include "onnx/onnx_pb.h"
|
|
|
|
|
|
2018-10-25 19:16:22 +00:00
|
|
|
#include <c10/util/Exception.h>
|
2020-05-12 02:20:52 +00:00
|
|
|
#include <c10/util/ThreadLocalDebugInfo.h>
|
2018-10-25 19:16:22 +00:00
|
|
|
|
2018-10-07 05:58:28 +00:00
|
|
|
#include <algorithm>
|
|
|
|
|
#include <cstddef>
|
|
|
|
|
#include <functional>
|
|
|
|
|
#include <iostream>
|
|
|
|
|
#include <memory>
|
2020-10-29 05:36:13 +00:00
|
|
|
#include <set>
|
2018-10-07 05:58:28 +00:00
|
|
|
#include <stdexcept>
|
|
|
|
|
#include <string>
|
|
|
|
|
#include <tuple>
|
|
|
|
|
#include <unordered_set>
|
|
|
|
|
#include <utility>
|
|
|
|
|
#include <vector>
|
|
|
|
|
|
2020-05-07 21:46:41 +00:00
|
|
|
using namespace torch::autograd::profiler;
|
|
|
|
|
|
2018-10-07 05:58:28 +00:00
|
|
|
namespace torch {
|
|
|
|
|
namespace jit {
|
2020-03-12 03:57:02 +00:00
|
|
|
// Short spelling of c10::AliasAnalysisKind::FROM_SCHEMA for call sites in
// this file.
inline c10::AliasAnalysisKind aliasAnalysisFromSchema() {
  constexpr auto kind = c10::AliasAnalysisKind::FROM_SCHEMA;
  return kind;
}
|
2018-10-07 05:58:28 +00:00
|
|
|
|
|
|
|
|
// Streams a vector as a brace-delimited list with ", " between elements,
// e.g. {1, 2, 3}. Each element is written with its own operator<<; an empty
// vector is written as {}.
template <typename T>
std::ostream& operator<<(std::ostream& out, const std::vector<T>& list) {
  out << "{";
  bool is_first = true;
  for (const auto& item : list) {
    // The separator goes before every element except the first one.
    if (!is_first) {
      out << ", ";
    }
    is_first = false;
    out << item;
  }
  out << "}";
  return out;
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Checks that predefined symbols agree with their string-constructed
// counterparts and that newly interned aten symbols receive consecutive,
// stable ids.
TEST(InternedStringsTest, Basic) {
  ASSERT_EQ(prim::Param, Symbol::prim("Param"));
  ASSERT_EQ(prim::Return, Symbol::prim("Return"));
  ASSERT_EQ(prim::Return.toUnqualString(), std::string("Return"));
  ASSERT_EQ(prim::Return.toQualString(), std::string("prim::Return"));

  Symbol fresh_symbol = Symbol::aten("__NEW_SYMBOL");
  size_t base_id = fresh_symbol;
  ASSERT_EQ(fresh_symbol.toQualString(), std::string("aten::__NEW_SYMBOL"));

  // TODO: This test is a bit too close to the implementation details.
  // The first pass interns "What"/"What2"; the second pass looks the same
  // names up again and must see the same ids.
  for (int pass = 0; pass < 2; ++pass) {
    ASSERT_EQ(Symbol::aten("What"), base_id + 1);
    ASSERT_EQ(Symbol::aten("What2"), base_id + 2);
  }
  ASSERT_EQ(Symbol(base_id + 2).toUnqualString(), std::string("What2"));
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Exercises Symbol::fromQualString: known namespaces, the empty namespace,
// a previously unseen namespace, and rejection of malformed names.
TEST(FromQualStringTest, Basic) {
  // Qualified names in the built-in namespaces match the typed factories.
  ASSERT_EQ(Symbol::fromQualString("prim::Param"), Symbol::prim("Param"));
  ASSERT_EQ(Symbol::fromQualString("aten::mm"), Symbol::aten("mm"));
  ASSERT_EQ(Symbol::fromQualString("onnx::LSTM"), Symbol::onnx("LSTM"));
  ASSERT_EQ(Symbol::fromQualString("attr::value"), Symbol::attr("value"));
  ASSERT_EQ(Symbol::fromQualString("scope::"), Symbol::scope(""));

  // "::" alone: empty unqualified name inside the empty namespace.
  ASSERT_EQ(Symbol::fromQualString("::").toUnqualString(), std::string(""));
  ASSERT_EQ(
      Symbol::fromQualString("::").ns().toQualString(),
      std::string("namespaces::"));

  // A namespace that is not one of the built-ins above.
  ASSERT_EQ(
      Symbol::fromQualString("new_ns::param").toUnqualString(),
      std::string("param"));
  ASSERT_EQ(
      Symbol::fromQualString("new_ns::param").ns().toUnqualString(),
      std::string("new_ns"));
  ASSERT_EQ(
      Symbol::fromQualString("new_ns::param").ns(),
      Symbol::fromQualString("namespaces::new_ns"));

  // Inputs without a "::" separator must be rejected with an exception
  // derived from std::exception. ASSERT_THROW reports what was (not) thrown,
  // unlike a bare ASSERT_TRUE(0) inside a try block.
  for (const char* input : {"scope", ":", ""}) {
    ASSERT_THROW(Symbol::fromQualString(input), std::exception);
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Builds a graph around aten::thnn_conv2d_forward, differentiates it, and
// checks that the outputs and gradients produced by runGradient match the
// results of calling the ATen forward/backward functions directly.
TEST(THNNConvTest, Basic) {
  std::vector<int64_t> input_size{4, 3, 15, 17}; // B x C x H x W
  std::vector<int64_t> kernel_size{3, 5};
  std::vector<int64_t> stride{1, 2};
  std::vector<int64_t> padding{2, 1};
  constexpr int out_channels = 5;

  // make inputs
  at::Tensor input = torch::randn(input_size);
  at::Tensor weight = torch::randn(
      {out_channels, input_size[1], kernel_size[0], kernel_size[1]});
  at::Tensor bias = torch::randn({out_channels});

  // run forward eagerly
  const auto eager_forward = at::thnn_conv2d_forward(
      input, weight, kernel_size, bias, stride, padding);
  at::Tensor output = std::get<0>(eager_forward);
  at::Tensor finput = std::get<1>(eager_forward);
  at::Tensor fgradinput = std::get<2>(eager_forward);

  // make grad_outputs: only `output` receives a non-zero incoming gradient
  at::Tensor grad_output =
      torch::randn_like(output, at::MemoryFormat::Preserve);
  at::Tensor grad_finput =
      torch::zeros_like(finput, at::MemoryFormat::Preserve);
  at::Tensor grad_fgradinput =
      torch::zeros_like(fgradinput, at::MemoryFormat::Preserve);

  // run backward eagerly
  const auto eager_backward = at::thnn_conv2d_backward(
      grad_output,
      input,
      weight,
      kernel_size,
      stride,
      padding,
      finput,
      fgradinput,
      {true, true, true});
  at::Tensor grad_input = std::get<0>(eager_backward);
  at::Tensor grad_weight = std::get<1>(eager_backward);
  at::Tensor grad_bias = std::get<2>(eager_backward);

  // make JIT graph
  auto graph = std::make_shared<Graph>();
  auto kernel_size_v = graph->insertConstant(kernel_size);
  auto stride_v = graph->insertConstant(stride);
  auto padding_v = graph->insertConstant(padding);

  auto input_v = graph->addInput("self");
  auto weight_v = graph->addInput("weight");
  auto bias_v = graph->addInput("bias");

  Value* conv = graph->insert(
      aten::thnn_conv2d_forward,
      {input_v, weight_v, kernel_size_v, bias_v, stride_v, padding_v});
  // Every output of the conv node becomes a graph output.
  for (Value* conv_out : conv->node()->outputs()) {
    graph->registerOutput(conv_out);
  }
  LowerAllTuples(graph);
  graph->lint();

  // differentiate JIT graph
  EliminateDeadCode(graph); // Tracing of some ops depends on the DCE trick
  ConstantPropagation(graph);
  auto grad_spec = differentiate(graph);
  LowerGradOf(*grad_spec.df);

  // prepare JIT inputs / gradients
  tensor_list tensors_in{input, weight, bias};
  tensor_list tensor_grads_in{grad_output, grad_finput, grad_fgradinput};

  // Get outputs from the interpreter
  auto jit_result = runGradient(grad_spec, tensors_in, tensor_grads_in);
  tensor_list tensors_out = std::move(std::get<0>(jit_result));
  tensor_list tensor_grads_out = std::move(std::get<1>(jit_result));

  // prepare expected structs
  tensor_list expected_tensors_out{output, finput, fgradinput};
  tensor_list expected_tensor_grads_out{grad_input, grad_weight, grad_bias};

  // Compare results
  assertAllClose(tensors_out, expected_tensors_out);
  assertAllClose(tensor_grads_out, expected_tensor_grads_out);
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Builds a graph around aten::native_batch_norm, differentiates it, and
// checks that the outputs, gradients and updated running statistics obtained
// through runGradient match the results of the direct ATen calls.
TEST(ATenNativeBatchNormTest, Basic) {
  // aten::native_batch_norm(Tensor input, Tensor weight, Tensor bias, Tensor
  // running_mean, Tensor running_var, bool training, float momentum, float eps)
  // -> (Tensor, Tensor, Tensor)
  std::vector<int64_t> input_size{4, 3, 15, 17}; // B x C x H x W
  bool training = true;
  float momentum = 0.9;
  float eps = 1e-5;

  // make inputs
  at::Tensor input = torch::randn(input_size);
  at::Tensor weight = torch::randn({input_size[1]});
  at::Tensor bias = torch::randn({input_size[1]});
  at::Tensor running_mean = torch::randn({input_size[1]});
  at::Tensor running_var = torch::randn({input_size[1]});

  // running_mean and running_var are changed in-place, so clone and send them
  at::Tensor running_mean_eager = running_mean.clone();
  at::Tensor running_var_eager = running_var.clone();
  at::Tensor running_mean_jit = running_mean.clone();
  at::Tensor running_var_jit = running_var.clone();

  // run forward eagerly
  const auto eager_forward = at::native_batch_norm(
      input,
      weight,
      bias,
      running_mean_eager,
      running_var_eager,
      training,
      momentum,
      eps);
  at::Tensor output = std::get<0>(eager_forward);
  at::Tensor savemean = std::get<1>(eager_forward);
  at::Tensor saveinvstd = std::get<2>(eager_forward);

  // make grad_outputs: only `output` receives a non-zero incoming gradient
  at::Tensor grad_output =
      torch::randn_like(output, at::MemoryFormat::Preserve);
  at::Tensor grad_savemean =
      torch::zeros_like(savemean, at::MemoryFormat::Preserve);
  at::Tensor grad_saveinvstd =
      torch::zeros_like(saveinvstd, at::MemoryFormat::Preserve);

  // run backward eagerly
  // aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor
  // weight, Tensor running_mean, Tensor running_var, Tensor save_mean, Tensor
  // save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor,
  // Tensor, Tensor)
  const auto eager_backward = at::native_batch_norm_backward(
      grad_output,
      input,
      weight,
      running_mean_eager,
      running_var_eager,
      savemean,
      saveinvstd,
      training,
      eps,
      {true, true, true});
  at::Tensor grad_input = std::get<0>(eager_backward);
  at::Tensor grad_weight = std::get<1>(eager_backward);
  at::Tensor grad_bias = std::get<2>(eager_backward);

  // make JIT graph
  auto graph = std::make_shared<Graph>();
  auto training_v = graph->insertConstant(IValue(training));
  auto momentum_v = graph->insertConstant(IValue(momentum));
  auto eps_v = graph->insertConstant(IValue(eps));

  auto input_v = graph->addInput("self");
  auto weight_v = graph->addInput("weight");
  auto bias_v = graph->addInput("bias");
  auto running_mean_v = graph->addInput("running_mean");
  auto running_var_v = graph->addInput("running_var");

  Value* bn = graph->insert(
      aten::native_batch_norm,
      {input_v,
       weight_v,
       bias_v,
       running_mean_v,
       running_var_v,
       training_v,
       momentum_v,
       eps_v});
  // Every output of the batch-norm node becomes a graph output.
  for (Value* bn_out : bn->node()->outputs()) {
    graph->registerOutput(bn_out);
  }
  LowerAllTuples(graph);
  graph->lint();

  // differentiate JIT graph
  EliminateDeadCode(graph); // Tracing of some ops depends on the DCE trick
  ConstantPropagation(graph);
  auto grad_spec = differentiate(graph);
  LowerGradOf(*grad_spec.df);

  // prepare JIT inputs / gradients
  tensor_list tensors_in{
      input, weight, bias, running_mean_jit, running_var_jit};
  tensor_list tensor_grads_in{grad_output, grad_savemean, grad_saveinvstd};

  // Get outputs from the interpreter
  auto jit_result = runGradient(grad_spec, tensors_in, tensor_grads_in);
  tensor_list tensors_out = std::move(std::get<0>(jit_result));
  tensor_list tensor_grads_out = std::move(std::get<1>(jit_result));

  // prepare expected structs
  tensor_list expected_tensors_out{
      output, savemean, saveinvstd, running_mean_eager, running_var_eager};
  tensor_list expected_tensor_grads_out{grad_input, grad_weight, grad_bias};

  // The running statistics handed to the JIT run are compared against the
  // eager ones as two extra "outputs".
  tensors_out.push_back(running_mean_jit);
  tensors_out.push_back(running_var_jit);

  // Compare results
  assertAllClose(tensors_out, expected_tensors_out);
  assertAllClose(tensor_grads_out, expected_tensor_grads_out);
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Runs CustomFuseGraph over two chained aten::mul nodes and checks that a
// prim::FusionGroup node is produced whose subgraph contains exactly two
// nodes.
TEST(CustomFusionTest, Basic) {
  // NOTE: the IR text is laid out with indentation because the IR parser is
  // indentation-sensitive.
  auto graph_string = R"IR(
    graph(%0 : Float(2, 3, 4),
          %1 : Float(2, 3, 4)):
      %2 : Tensor = aten::mul(%0, %1)
      %3 : Tensor = aten::mul(%2, %0)
      return (%3))IR";
  auto g = std::make_shared<Graph>();
  torch::jit::parseIR(graph_string, g.get());

  const auto fusion_kind = Symbol::fromQualString("prim::FusionGroup");
  {
    // CPU fusion is force-enabled only while the pass runs. The guard turns
    // the override back off even if CustomFuseGraph throws, so a failure here
    // cannot leak the setting into tests that run afterwards.
    struct CpuFusionOverride {
      CpuFusionOverride() {
        torch::jit::overrideCanFuseOnCPU(true);
      }
      ~CpuFusionOverride() {
        torch::jit::overrideCanFuseOnCPU(false);
      }
      CpuFusionOverride(const CpuFusionOverride&) = delete;
      CpuFusionOverride& operator=(const CpuFusionOverride&) = delete;
    } cpu_fusion_override;

    CustomFuseGraph(
        g,
        [](Node* n) { return n->kind() != prim::Param; },
        fusion_kind);
  }

  const auto& nodes = g->nodes();
  auto fusion_group = std::find_if(
      nodes.begin(), nodes.end(), [&fusion_kind](const Node* node) {
        return node->kind() == fusion_kind;
      });
  ASSERT_TRUE(fusion_group != nodes.end());

  auto subgraph = fusion_group->g(attr::Subgraph);
  auto hits = 0;
  // two multiplications
  for (const auto& n : subgraph->nodes()) {
    (void)n;
    hits++;
  }
  ASSERT_EQ(hits, 2);
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Runs CustomFuseGraph with a predicate that only accepts aten::mul on a
// graph whose multiplications live inside a prim::If block, and checks that
// a prim::FusionGroup node exists somewhere in the (nested) block tree.
TEST(CustomFusionTest, NestedBlocks) {
  // NOTE: the IR text is laid out with indentation because the IR parser is
  // indentation-sensitive; the nested blocks depend on it.
  auto graph_string = R"IR(
  graph(%0 : Float(2, 3, 4),
        %1 : Float(2, 3, 4),
        %2 : Float(2, 3, 4)):
    %3 : int = prim::Constant[value=1]()
    %4 : Tensor = prim::If(%2)
      block0():
        %5 : Tensor = aten::mul(%0, %2)
        %6 : Tensor = aten::mul(%5, %1)
        -> (%6)
      block1():
        %7 : Tensor = aten::add(%0, %2, %3)
        %8 : Tensor = aten::add(%7, %1, %3)
        -> (%8)
    %9 : Tensor = aten::add(%4, %2, %3)
    return (%4))IR";
  auto g = std::make_shared<Graph>();
  torch::jit::parseIR(graph_string, g.get());

  CustomFuseGraph(
      g,
      [](Node* n) { return n->kind() == aten::mul; },
      Symbol::fromQualString("prim::FusionGroup"));

  // Depth-first search for a node of kind `s` in `b` and all nested blocks.
  // Could be done in more efficient ways, but this is only a test.
  std::function<bool(const Block*, Symbol)> dfs = [&](const Block* b,
                                                      Symbol s) {
    for (auto node : b->nodes()) {
      if (node->kind() == s) {
        return true;
      }
      for (auto nested_b : node->blocks()) {
        if (dfs(nested_b, s)) {
          return true;
        }
      }
    }
    return false;
  };

  // gtest assertion instead of the deprecated AT_ASSERT, matching the other
  // tests in this file.
  ASSERT_TRUE(dfs(g->block(), Symbol::fromQualString("prim::FusionGroup")));
}
|
|
|
|
|
|
2019-03-18 16:56:25 +00:00
|
|
|
// TorchScript source with small control-flow functions (if/else, one-armed
// if, while loop). The text is Python-like, so the indentation inside the
// raw string is significant and must be kept.
static const auto cf_examples = R"JIT(
  def if_test(a, b):
      # FIXME: use 0 instead of a.
      # c = 0
      c = a
      if bool(a < b):
        c = b
      else:
        c = a
      return c
  def if_one(a, b):
    c = b
    if bool(a < b):
      c = a
    return c
  def while_test(a, i):
    while bool(i < 3):
      a *= a
      i += 1
    return a
)JIT";
|
2020-09-25 18:35:39 +00:00
|
|
|
|
|
|
|
|
// Compiles cf_examples and runs its functions through the interpreter with
// integer-valued tensors, checking the value left on top of the stack.
TEST(ControlFlowTest, Basic) {
  auto cu = compile(cf_examples);

  // Wraps both integers in tensors, runs the named function on a fresh
  // interpreter, and returns the first stack entry as an int64_t.
  const auto eval_binary =
      [&](const std::string& fn_name, int64_t lhs, int64_t rhs) -> int64_t {
    std::vector<IValue> stack;
    stack.emplace_back(scalar_to_tensor(at::Scalar(lhs)));
    stack.emplace_back(scalar_to_tensor(at::Scalar(rhs)));

    auto graph = cu->get_function(fn_name).graph();
    Code code(graph, "");
    InterpreterState interpreter(code);
    interpreter.run(stack);

    return std::move(stack[0]).toTensor().item<int64_t>();
  };

  ASSERT_EQ(2, eval_binary("if_test", 1, 2));
  ASSERT_EQ(3, eval_binary("if_test", 3, 2));
  ASSERT_EQ(2, eval_binary("if_one", 2, 3));
  ASSERT_EQ(2, eval_binary("if_one", 3, 2));
  ASSERT_EQ(256, eval_binary("while_test", 2, 0));
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Smoke test: an ONNX ModelProto can be constructed and one of its fields
// set.
TEST(ProtoTest, Basic) {
  ::ONNX_NAMESPACE::ModelProto model;
  model.set_producer_name("foo");
}
|
|
|
|
|
|
2018-10-15 21:49:37 +00:00
|
|
|
// test a few features that are not directly used in schemas yet
|
2020-09-25 18:35:39 +00:00
|
|
|
// Parses schemas whose argument is a list of lists of int, with and without
// a size on the outer list, and checks the innermost element type.
TEST(SchemaParserTest, NestedArrays) {
  // nested arrays, outer list sized 4
  const auto sized = parseSchema("at::what(int[][4] foo) -> ()");
  const auto& sized_arg = sized.arguments().at(0);
  ASSERT_TRUE(sized_arg.N() == 4);
  const auto sized_outer = sized_arg.type()->expect<ListType>();
  const auto sized_inner = sized_outer->getElementType()->expect<ListType>();
  ASSERT_TRUE(IntType::get()->isSubtypeOf(sized_inner->getElementType()));

  // nested arrays, no size
  const auto unsized = parseSchema("at::what(int[][] foo) -> ()");
  const auto unsized_outer =
      unsized.arguments().at(0).type()->expect<ListType>();
  const auto unsized_inner =
      unsized_outer->getElementType()->expect<ListType>();
  ASSERT_TRUE(IntType::get()->isSubtypeOf(unsized_inner->getElementType()));
}
|
2018-10-27 16:58:44 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Checks that return values can carry names and that those names are kept
// in declaration order.
TEST(SchemaParserTest, NamedReturns) {
  // This schema only has to parse; its result is not inspected.
  parseSchema("at::what(Tensor! i_will_be_written_to) -> ()");

  // named returns
  const auto schema =
      parseSchema("at::what() -> (Tensor the_return, Tensor the_return2)");
  const auto& rets = schema.returns();
  ASSERT_TRUE(rets.at(0).name() == "the_return");
  ASSERT_TRUE(rets.at(1).name() == "the_return2");
}
|
2018-10-15 21:49:37 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Checks that a Future(int) argument parses to a FutureType whose element
// type accepts int.
TEST(SchemaParserTest, Futures) {
  // futures
  const auto schema = parseSchema("at::what(Future(int) foo) -> ()");
  const auto future_type =
      schema.arguments().at(0).type()->expect<FutureType>();
  ASSERT_TRUE(IntType::get()->isSubtypeOf(future_type->getElementType()));
}
|
2018-10-27 16:58:44 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Tensors annotated with an alias set, on both the argument and the return.
// Only the parse call itself is exercised; nothing about the resulting schema
// is asserted.
TEST(SchemaParserTest, AnnotatedAliasSets) {
  parseSchema("at::what(Tensor(a) foo) -> (Tensor(a))");
}
|
2018-10-15 21:49:37 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Alias annotations on a list argument: the list carries the written set `a`,
// while its contained tensors carry `b|c` both before and after.
TEST(SchemaParserTest, BeforeAfterSets) {
  const auto schema = parseSchema(
      "at::what(Tensor(b|c)[](a!) list, Tensor(c) element)"
      " -> (Tensor(b|c)[](a!))");

  // Outer annotation: the list itself is in set `a` and is written to.
  const auto& list_alias = *schema.arguments().at(0).alias_info();
  const std::unordered_set<Symbol> list_sets{
      Symbol::fromQualString("alias::a")};
  ASSERT_TRUE(list_alias.beforeSets() == list_sets);
  ASSERT_TRUE(list_alias.isWrite());

  // Inner annotation: the contained tensors are in `b|c` and are read-only.
  ASSERT_TRUE(!list_alias.containedTypes().empty());
  const auto& element_alias = list_alias.containedTypes()[0];
  const std::unordered_set<Symbol> element_sets{
      Symbol::fromQualString("alias::b"),
      Symbol::fromQualString("alias::c"),
  };
  ASSERT_TRUE(element_alias.beforeSets() == element_sets);
  ASSERT_TRUE(element_alias.afterSets() == element_sets);
  ASSERT_FALSE(element_alias.isWrite());
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Alias annotations with distinct before/after sets (`b -> b|c`) on the
// tensors contained in a list argument. The list itself stays in set `a`.
TEST(SchemaParserTest, BeforeAfterSets2) {
  const auto s = parseSchema(
      "at::what(Tensor(b -> b|c)[](a!) list, Tensor(c) element)"
      " -> (Tensor(b|c)[](a!))");

  // The list itself is annotated with `a`
  const auto& aliasInfo = *s.arguments().at(0).alias_info();
  ASSERT_EQ(
      aliasInfo.beforeSets(),
      std::unordered_set<Symbol>{Symbol::fromQualString("alias::a")});
  ASSERT_EQ(
      aliasInfo.afterSets(),
      std::unordered_set<Symbol>{Symbol::fromQualString("alias::a")});
  ASSERT_TRUE(aliasInfo.isWrite());
  // Unsigned literal: size() is unsigned, so comparing against a plain `1`
  // would be a signed/unsigned comparison inside ASSERT_EQ. This assertion
  // also guarantees the `[0]` access below is in range.
  ASSERT_EQ(aliasInfo.containedTypes().size(), 1u);

  // Check the contained types
  const auto& containedAliasInfo = aliasInfo.containedTypes()[0];
  const auto expectedBefore = std::unordered_set<Symbol>{
      Symbol::fromQualString("alias::b"),
  };
  const auto expectedAfter = std::unordered_set<Symbol>{
      Symbol::fromQualString("alias::b"), Symbol::fromQualString("alias::c")};
  ASSERT_TRUE(containedAliasInfo.beforeSets() == expectedBefore);
  ASSERT_TRUE(containedAliasInfo.afterSets() == expectedAfter);
  ASSERT_FALSE(containedAliasInfo.isWrite());
}
|
|
|
|
|
|
|
|
|
|
// Checks Node::isBefore / Node::isAfter for nodes in one block, for nodes in
// nested blocks, and after a node has been destroyed and replaced.
TEST(TopologicalIndexTest, Basic) {
  Graph graph;
  auto* node1 = graph.create(prim::AutogradZero);
  auto* node2 = graph.create(prim::AutogradZero);
  auto* node3 = graph.create(prim::AutogradZero);
  auto* node4 = graph.create(prim::AutogradZero);

  // Insert out of creation order, using each of the insertion entry points.
  graph.appendNode(node4);
  graph.prependNode(node1);
  node2->insertAfter(node1);
  node3->insertBefore(node4);

  // Resulting order should be node1, node2, node3, node4.
  ASSERT_TRUE(node1->isBefore(node2));
  ASSERT_TRUE(node1->isBefore(node3));
  ASSERT_TRUE(node1->isBefore(node4));
  ASSERT_TRUE(node2->isAfter(node1));
  ASSERT_TRUE(node2->isBefore(node3));
  ASSERT_TRUE(node2->isBefore(node4));
  ASSERT_FALSE(node3->isBefore(node1));
  ASSERT_FALSE(node3->isBefore(node2));
  ASSERT_FALSE(node3->isAfter(node4));

  // Build a nested block structure:
  //   node3 owns block1, which holds A followed by B;
  //   B owns block2, which holds C.
  auto* block1 = node3->addBlock();
  auto* A = graph.create(prim::AutogradZero);
  block1->appendNode(A);
  auto* B = graph.create(prim::AutogradZero);
  block1->appendNode(B);
  auto* block2 = B->addBlock();
  auto* C = graph.create(prim::AutogradZero);
  block2->appendNode(C);

  // Ordering queries across different block levels.
  ASSERT_TRUE(node1->isBefore(A));
  ASSERT_TRUE(A->isBefore(B));
  ASSERT_TRUE(A->isBefore(C));

  // Destroying a node and inserting a fresh one in its place must keep the
  // ordering queries working.
  node2->destroy();
  auto* node2p = graph.create(prim::AutogradZero);
  node2p->insertAfter(node1);
  ASSERT_TRUE(node1->isBefore(node2p));
  ASSERT_TRUE(node2p->isBefore(node3));
}
|
|
|
|
|
|
|
|
|
|
// Repeatedly inserts nodes at the same position to induce reindexing, then
// verifies that the relative order of all inserted nodes is still correct.
TEST(TopologicalIndexTest, Reindex) {
  constexpr size_t kNumNodes = 100;

  Graph graph;
  // Dense, index-addressed storage. A vector avoids the accidental
  // default-insertion that map::operator[] performs on lookup, and avoids
  // mixing signed loop counters with unsigned keys.
  std::vector<Node*> nodes;
  nodes.reserve(kNumNodes);

  auto anchor = graph.create(prim::AutogradZero);
  graph.appendNode(anchor);
  // Inserting to the same place a lot will trigger reindexing
  for (size_t i = 0; i < kNumNodes; ++i) {
    auto n = graph.create(prim::AutogradZero);
    n->insertAfter(anchor);
    nodes.push_back(n);
  }

  // Each node was inserted directly after the anchor, so later insertions
  // precede earlier ones: nodes should be in reverse order.
  for (size_t i = 0; i < kNumNodes; ++i) {
    for (size_t j = i + 1; j < kNumNodes; ++j) {
      ASSERT_TRUE(nodes[i]->isAfter(nodes[j]));
    }
  }
}
|
|
|
|
|
|
2019-04-16 03:24:10 +00:00
|
|
|
// Opens a RECORD_FUNCTION scope named "test" with `t` as its recorded input
// and returns `t` squared, computed while that scope is active.
at::Tensor invokeTestRecordFunction(at::Tensor& t) {
  RECORD_FUNCTION("test", std::vector<c10::IValue>({t}));

  return t.pow(2);
}
|
|
|
|
|
|
|
|
|
|
static const auto invokeTestRecordFunction_JIT = R"JIT(
|
2020-03-31 07:31:06 +00:00
|
|
|
def foo(self, t):
|
2019-04-16 03:24:10 +00:00
|
|
|
t2 = t.pow(2)
|
|
|
|
|
return t2
|
2020-03-31 07:31:06 +00:00
|
|
|
|
|
|
|
|
def forward(self, t):
|
|
|
|
|
return self.foo(t)
|
2019-04-16 03:24:10 +00:00
|
|
|
)JIT";
|
|
|
|
|
|
|
|
|
|
// Script-module counterpart of invokeTestRecordFunction: opens a
// RECORD_FUNCTION scope named "test", defines a module from
// invokeTestRecordFunction_JIT and returns the result of its `forward(t)`.
at::Tensor invokeTestRecordFunctionJIT(at::Tensor& t) {
  RECORD_FUNCTION("test", std::vector<c10::IValue>({t}));

  auto compilation_unit = std::make_shared<script::CompilationUnit>();
  const auto module = std::make_shared<script::Module>(
      "RecordFunctionTestModule", std::move(compilation_unit));
  module->define(invokeTestRecordFunction_JIT);
  return module->forward({t}).toTensor();
}
|
|
|
|
|
|
|
|
|
|
// One entry per recorded function call: (function name, list of per-input
// size vectors), i.e. [(fn, [[sizes], [sizes], ...]), ...].
using TracedTestInputs = std::vector<
    std::tuple<std::string, std::vector<std::vector<int64_t>>>>;
|
|
|
|
|
|
|
|
|
|
void checkTracedInputs(const TracedTestInputs& inputs) {
|
|
|
|
|
bool found_test = false;
|
|
|
|
|
bool found_pow = false;
|
|
|
|
|
bool found_mul = false;
|
|
|
|
|
for (const auto& input : inputs) {
|
|
|
|
|
const auto& fn = std::get<0>(input);
|
|
|
|
|
const auto& sizes = std::get<1>(input);
|
2020-07-18 05:18:35 +00:00
|
|
|
|
2019-04-16 03:24:10 +00:00
|
|
|
if (fn == "test") {
|
|
|
|
|
found_test = true;
|
2019-05-15 14:58:48 +00:00
|
|
|
TORCH_CHECK(sizes.size() == 1);
|
|
|
|
|
TORCH_CHECK(sizes[0] == std::vector<int64_t>({1, 2, 3}));
|
2020-07-18 05:18:35 +00:00
|
|
|
} else if (fn == "aten::pow") {
|
2019-04-16 03:24:10 +00:00
|
|
|
found_pow = true;
|
2019-05-15 14:58:48 +00:00
|
|
|
TORCH_CHECK(sizes.size() == 2);
|
|
|
|
|
TORCH_CHECK(sizes[0] == std::vector<int64_t>({1, 2, 3}));
|
|
|
|
|
TORCH_CHECK(sizes[1].empty());
|
2020-07-18 05:18:35 +00:00
|
|
|
} else if (fn == "aten::mul") {
|
2019-04-16 03:24:10 +00:00
|
|
|
found_mul = true;
|
2019-05-15 14:58:48 +00:00
|
|
|
TORCH_CHECK(sizes.size() > 1);
|
|
|
|
|
TORCH_CHECK(sizes[0] == std::vector<int64_t>({1, 2, 3}));
|
2019-04-16 03:24:10 +00:00
|
|
|
}
|
|
|
|
|
}
|
2019-05-15 14:58:48 +00:00
|
|
|
TORCH_CHECK(found_test);
|
|
|
|
|
TORCH_CHECK(found_pow);
|
|
|
|
|
TORCH_CHECK(found_mul);
|
2019-03-29 00:42:47 +00:00
|
|
|
}
|
|
|
|
|
|
2020-03-31 07:31:06 +00:00
|
|
|
void checkScopeCallbacks() {
|
|
|
|
|
bool found_function_scope = false;
|
|
|
|
|
bool found_method_scope = false;
|
|
|
|
|
bool found_user_scope = false;
|
2020-05-07 21:46:41 +00:00
|
|
|
at::addGlobalCallback(at::RecordFunctionCallback(
|
|
|
|
|
[&](const at::RecordFunction& fn) {
|
|
|
|
|
if (fn.scope() == at::RecordScope::FUNCTION &&
|
2020-03-31 07:31:06 +00:00
|
|
|
std::string(fn.name().str()) == "test_function") {
|
|
|
|
|
found_function_scope = true;
|
|
|
|
|
}
|
2020-05-07 21:46:41 +00:00
|
|
|
if (fn.scope() == at::RecordScope::TORCHSCRIPT_FUNCTION &&
|
2020-03-31 07:31:06 +00:00
|
|
|
std::string(fn.name().str()) == "test_method") {
|
|
|
|
|
found_method_scope = true;
|
|
|
|
|
}
|
2020-05-07 21:46:41 +00:00
|
|
|
if (fn.scope() == at::RecordScope::USER_SCOPE &&
|
2020-03-31 07:31:06 +00:00
|
|
|
std::string(fn.name().str()) == "test_user_scope") {
|
|
|
|
|
found_user_scope = true;
|
|
|
|
|
}
|
|
|
|
|
},
|
2020-05-07 21:46:41 +00:00
|
|
|
[](const at::RecordFunction&) {}));
|
2020-03-31 07:31:06 +00:00
|
|
|
|
|
|
|
|
bool bad_scope = false;
|
2020-05-07 21:46:41 +00:00
|
|
|
auto pushScopedCallback = [&](at::RecordScope scope, size_t& cnt) {
|
|
|
|
|
at::addGlobalCallback(
|
|
|
|
|
at::RecordFunctionCallback(
|
|
|
|
|
[&bad_scope, &cnt, scope](const at::RecordFunction& fn) {
|
2020-05-07 21:46:41 +00:00
|
|
|
if (fn.scope() == scope) {
|
|
|
|
|
++cnt;
|
|
|
|
|
} else {
|
|
|
|
|
bad_scope = true;
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
},
|
2020-05-07 21:46:41 +00:00
|
|
|
[](const at::RecordFunction&) {})
|
2020-05-07 21:46:41 +00:00
|
|
|
.scopes({scope}));
|
2020-03-31 07:31:06 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
size_t fun_cnt = 0;
|
2020-05-07 21:46:41 +00:00
|
|
|
pushScopedCallback(at::RecordScope::FUNCTION, fun_cnt);
|
2020-03-31 07:31:06 +00:00
|
|
|
size_t ts_fun_cnt = 0;
|
2020-05-07 21:46:41 +00:00
|
|
|
pushScopedCallback(at::RecordScope::TORCHSCRIPT_FUNCTION, ts_fun_cnt);
|
2020-03-31 07:31:06 +00:00
|
|
|
size_t user_scope_cnt = 0;
|
2020-05-07 21:46:41 +00:00
|
|
|
pushScopedCallback(at::RecordScope::USER_SCOPE, user_scope_cnt);
|
2020-03-31 07:31:06 +00:00
|
|
|
|
2020-05-07 21:46:41 +00:00
|
|
|
TORCH_CHECK(at::hasCallbacks());
|
2020-03-31 07:31:06 +00:00
|
|
|
|
|
|
|
|
{
|
|
|
|
|
RECORD_TORCHSCRIPT_FUNCTION("test_method", {});
|
|
|
|
|
{ RECORD_FUNCTION("test_function", {}); }
|
|
|
|
|
{ RECORD_USER_SCOPE("test_user_scope"); }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
TORCH_CHECK(!bad_scope);
|
|
|
|
|
TORCH_CHECK(fun_cnt == 1);
|
|
|
|
|
TORCH_CHECK(ts_fun_cnt == 1);
|
|
|
|
|
TORCH_CHECK(user_scope_cnt == 1);
|
|
|
|
|
|
|
|
|
|
TORCH_CHECK(found_function_scope);
|
|
|
|
|
TORCH_CHECK(found_method_scope);
|
|
|
|
|
TORCH_CHECK(found_user_scope);
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// End-to-end exercise of the RecordFunction callback API. The sections run in
// sequence and each one clears the registered callbacks when it is done:
//   1. input tracing for eager and TorchScript execution
//   2. sampled callbacks
//   3. callback scopes
//   4. RecordFunctionGuard / DisableRecordFunctionGuard
//   5. adding and removing callbacks by handle
//   6. thread-local vs. global callbacks
//   7. ObserverContext passed from the start to the end callback
//   8. setShouldRun
//   9. propagation of thread-local callbacks through ThreadLocalState
//  10. needsIds
TEST(RecordFunctionTest, Basic) {
  // disabling the inlining of method calls
  GraphOptimizerEnabledGuard opt_guard(false);

  // ---- 1. input tracing ----
  // [(fn, [[sizes], [sizes], ...]), ...]
  TracedTestInputs traced_inputs;
  std::unordered_set<std::string> ts_names;
  addGlobalCallback(
      RecordFunctionCallback(
          [&](const RecordFunction& fn) {
            if (fn.scope() == RecordScope::FUNCTION) {
              auto inputs = fn.inputs();
              std::vector<std::vector<int64_t>> sizes;
              for (const auto& input : inputs) {
                if (input.isTensor()) {
                  sizes.push_back(input.toTensor().sizes().vec());
                } else if (input.isScalar()) {
                  // Scalars are recorded as an empty size vector.
                  sizes.push_back(std::vector<int64_t>());
                }
              }
              traced_inputs.push_back(std::make_tuple(fn.name().str(), sizes));
            } else if (fn.scope() == RecordScope::TORCHSCRIPT_FUNCTION) {
              ts_names.insert(fn.name().str());
            }
          },
          [](const RecordFunction&) {})
          .needsInputs(true));

  TracedTestInputs eager_inputs, jit_inputs;
  {
    auto t = torch::randn({1, 2, 3}, at::kCPU);
    t.set_requires_grad(true);
    auto t2 = invokeTestRecordFunction(t);
    t2.backward(torch::ones_like(t2, at::MemoryFormat::Preserve));
    eager_inputs = traced_inputs;
    traced_inputs.clear();

    // Eager execution must not have produced any TorchScript function record.
    TORCH_CHECK(ts_names.empty());

    t = torch::randn({1, 2, 3}, at::kCPU);
    t.set_requires_grad(true);
    t2 = invokeTestRecordFunctionJIT(t);
    t2.backward(torch::ones_like(t2, at::MemoryFormat::Preserve));
    jit_inputs = traced_inputs;
    traced_inputs.clear();
  }

  TORCH_CHECK(ts_names.find("forward") != ts_names.end());
  TORCH_CHECK(ts_names.find("foo") != ts_names.end());

  checkTracedInputs(eager_inputs);
  checkTracedInputs(jit_inputs);
  at::clearCallbacks();

  // ---- 2. test sampled callbacks ----
  int sampled_cb_ctr = 0;
  auto setup_sampled_callback = [&sampled_cb_ctr](double sampling_prob) {
    return addGlobalCallback(RecordFunctionCallback(
                                 [&sampled_cb_ctr](const RecordFunction& fn) {
                                   if (std::string(fn.name().str()) == "test") {
                                     ++sampled_cb_ctr;
                                   }
                                   return true;
                                 },
                                 [](const RecordFunction&) {})
                                 .samplingProb(sampling_prob));
  };

  int non_sampled_cb_ctr = 0;
  addGlobalCallback(RecordFunctionCallback(
      [&non_sampled_cb_ctr](const RecordFunction& fn) {
        if (std::string(fn.name().str()) == "test") {
          ++non_sampled_cb_ctr;
        }
        return true;
      },
      [](const RecordFunction&) {}));

  auto handle = setup_sampled_callback(0.5);

  // Emits 1000 "test" records.
  auto run_test_function = []() {
    auto t = torch::randn({1, 2, 3}, at::kCPU);
    for (auto k = 0; k < 1000; k++) {
      invokeTestRecordFunction(t);
    }
  };

  run_test_function();
  TORCH_CHECK(non_sampled_cb_ctr == 1000);
  TORCH_CHECK(sampled_cb_ctr > 0 && sampled_cb_ctr < 1000);

  sampled_cb_ctr = 0;
  removeCallback(handle);
  handle = setup_sampled_callback(0.0);
  run_test_function();

  TORCH_CHECK(non_sampled_cb_ctr == 2000);
  TORCH_CHECK(sampled_cb_ctr == 0);

  sampled_cb_ctr = 0;
  removeCallback(handle);
  handle = setup_sampled_callback(1.0);
  run_test_function();

  TORCH_CHECK(non_sampled_cb_ctr == 3000);
  TORCH_CHECK(sampled_cb_ctr == 1000);
  clearCallbacks();

  // ---- 3. test the scope of the callbacks ----
  checkScopeCallbacks();
  clearCallbacks();

  // ---- 4. check record function guard ----
  std::vector<std::string> fn_names;
  std::mutex mtx;
  addGlobalCallback(RecordFunctionCallback(
      [&fn_names, &mtx](const RecordFunction& fn) {
        std::lock_guard<std::mutex> lock(mtx);
        fn_names.push_back(fn.name().str());
        return true;
      },
      [](const RecordFunction&) {}));
  {
    RecordFunctionGuard g1(false);
    {
      RECORD_USER_SCOPE("A");
      {
        RecordFunctionGuard g2(true);
        RECORD_USER_SCOPE("B");
        {
          DisableRecordFunctionGuard g3;
          RECORD_USER_SCOPE("C");
        }
      }
      { RECORD_USER_SCOPE("D"); }
    }
  }
  // Only "B" is emitted while recording is enabled.
  TORCH_CHECK(fn_names.size() == 1);
  TORCH_CHECK(fn_names[0] == "B");
  clearCallbacks();

  // ---- 5. test add/remove ----
  std::vector<size_t> ids;
  auto add_remove_test_add_cb = [&ids](size_t id) {
    return addGlobalCallback(RecordFunctionCallback(
        [&ids, id](const RecordFunction& fn) { ids.push_back(id); },
        [](const RecordFunction&) {}));
  };

  auto h1 = add_remove_test_add_cb(1);
  // h2 is never removed by handle; it stays registered until clearCallbacks().
  auto h2 = add_remove_test_add_cb(2);
  auto h3 = add_remove_test_add_cb(3);

  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(ids.size() == 3);
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 1) != ids.end());
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 2) != ids.end());
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 3) != ids.end());

  ids.clear();
  removeCallback(h1);

  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(ids.size() == 2);
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 2) != ids.end());
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 3) != ids.end());

  ids.clear();
  removeCallback(h3);

  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(ids.size() == 1);
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 2) != ids.end());

  clearCallbacks();

  // ---- 6. thread local / global callbacks ----

  ids.clear();
  addGlobalCallback(RecordFunctionCallback(
      [&ids](const RecordFunction& fn) { ids.push_back(1); },
      [](const RecordFunction&) {}));

  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(ids.size() == 1);
  TORCH_CHECK(ids[0] == 1);
  ids.clear();

  auto th = std::thread([&ids]() {
    addThreadLocalCallback(RecordFunctionCallback(
        [&ids](const RecordFunction& fn) { ids.push_back(2); },
        [](const RecordFunction&) {}));

    { RECORD_USER_SCOPE("test_thread"); }
  });
  th.join();
  // Inside the thread both the global and the thread-local callback ran.
  TORCH_CHECK(ids.size() == 2);
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 1) != ids.end());
  TORCH_CHECK(std::find(ids.begin(), ids.end(), 2) != ids.end());
  ids.clear();

  // Back on this thread only the global callback runs.
  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(ids.size() == 1);
  TORCH_CHECK(ids[0] == 1);
  ids.clear();

  clearCallbacks();

  // ---- 7. START: thread local / global context check callbacks ----
  struct TestContext : public ObserverContext {
    int a{0};
    std::string b;
  };
  ids.clear();
  { // START: global test
    const int test_val = 123;
    const std::string test_str = "test str";
    addGlobalCallback(RecordFunctionCallback(
        [test_val, test_str, &ids](const RecordFunction& /* unused */) {
          auto ctx = std::make_unique<TestContext>();
          ctx->a = test_val;
          ctx->b = test_str;
          ids.push_back(1);
          return ctx;
        },
        [test_val, test_str](
            const RecordFunction& /* unused */, ObserverContext* ctx_ptr) {
          auto ctx = dynamic_cast<TestContext*>(ctx_ptr);
          // Check the result of the cast, not the incoming pointer: `ctx` is
          // what gets dereferenced below, and it is null if the context is
          // missing or of a different type.
          TORCH_CHECK(ctx != nullptr);
          TORCH_CHECK(ctx->a == test_val);
          TORCH_CHECK(ctx->b == test_str);
        }));

    { RECORD_USER_SCOPE("test"); }

    TORCH_CHECK(ids.size() == 1);
    TORCH_CHECK(ids[0] == 1);
    ids.clear();
  } // END: global test
  { // START: thread local test
    auto ctx_th = std::thread([&ids]() {
      const int test_val = 234;
      const std::string test_str = "test thread str";
      addThreadLocalCallback(RecordFunctionCallback(
          [test_val, test_str, &ids](const RecordFunction& /* unused */) {
            auto ctx = std::make_unique<TestContext>();
            ctx->a = test_val;
            ctx->b = test_str;
            ids.push_back(2);
            return ctx;
          },
          [test_val, test_str](
              const RecordFunction& /* unused */, ObserverContext* ctx_ptr) {
            auto ctx = dynamic_cast<TestContext*>(ctx_ptr);
            // As above: validate the pointer that is actually dereferenced.
            TORCH_CHECK(ctx != nullptr);
            TORCH_CHECK(ctx->a == test_val);
            TORCH_CHECK(ctx->b == test_str);
          }));

      // Will call both global and thread local callbacks.
      { RECORD_USER_SCOPE("test_thread"); }
    });
    ctx_th.join();
    TORCH_CHECK(ids.size() == 2);
    TORCH_CHECK(std::find(ids.begin(), ids.end(), 1) != ids.end());
    TORCH_CHECK(std::find(ids.begin(), ids.end(), 2) != ids.end());
    ids.clear();
  } // END: thread local test

  clearCallbacks();

  // ---- 8. test should_run ----

  bool ran = false;
  bool should_run = false;
  addGlobalCallback(
      RecordFunctionCallback(
          [&ran](const RecordFunction& fn) { ran = true; },
          [](const RecordFunction&) {})
          .setShouldRun([&should_run](const RecordFunctionCallback&) {
            return should_run;
          }));

  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(!ran);

  should_run = true;

  { RECORD_USER_SCOPE("test"); }

  TORCH_CHECK(ran);

  clearCallbacks();

  // ---- 9. test propagation of TLS callbacks ----
  std::thread t([]() {
    RecordFunctionGuard enable_rec_fn;
    std::string recorded_op;
    auto handle = addThreadLocalCallback(RecordFunctionCallback(
        [&recorded_op](const RecordFunction& fn) {
          recorded_op = fn.name().str();
        },
        [](const RecordFunction&) {}));
    // Snapshot taken after the callback was added; the child thread installs
    // it through ThreadLocalStateGuard.
    ThreadLocalState state;
    std::thread t_child([state]() {
      ThreadLocalStateGuard g_tls(state);
      RECORD_USER_SCOPE("test_in_thread");
    });
    t_child.join();
    TORCH_CHECK(recorded_op == "test_in_thread");
    removeCallback(handle);
  });
  t.join();
  clearCallbacks();

  // ---- 10. test set ids ----
  bool has_ids = false;
  addGlobalCallback(
      RecordFunctionCallback(
          [&has_ids](const RecordFunction& fn) { has_ids = fn.handle() > 0; },
          [](const RecordFunction&) {})
          .needsIds(true));
  { RECORD_USER_SCOPE("test"); }
  TORCH_CHECK(has_ids);
  clearCallbacks();
  // Without needsIds(true) the handle is expected not to be positive.
  has_ids = false;
  addGlobalCallback(RecordFunctionCallback(
      [&has_ids](const RecordFunction& fn) { has_ids = fn.handle() > 0; },
      [](const RecordFunction&) {}));
  { RECORD_USER_SCOPE("test"); }
  TORCH_CHECK(!has_ids);
  clearCallbacks();
}
|
|
|
|
|
|
2020-10-29 05:36:13 +00:00
|
|
|
// Every FUNCTION-scope record seen while running randn and pow must carry an
// operator name, and the recorded names must include the overload name.
TEST(RecordFunctionTest, OperatorNameOverload) {
  std::set<std::string> operator_names;

  // Collect the operator name of each FUNCTION-scope record, or a sentinel
  // string when the record has none.
  at::addGlobalCallback(
      at::RecordFunctionCallback(
          [&operator_names](const at::RecordFunction& fn) {
            const c10::optional<c10::OperatorName> op_name =
                fn.operator_name();
            operator_names.insert(
                op_name.has_value() ? c10::toString(*op_name)
                                    : std::string("No Operator Name"));
          })
          .scopes({at::RecordScope::FUNCTION}));

  auto t = torch::randn({1, 2, 3}, at::kCPU);
  t.set_requires_grad(false);
  auto t2 = t.pow(2);

  at::clearCallbacks();

  EXPECT_TRUE(operator_names.count("No Operator Name") == 0)
      << "Expected that all traced operators had an associated OperatorName object";
  EXPECT_TRUE(operator_names.count("aten::randn") == 1)
      << "Expected aten::randn to have been called and recorded, but it was not";
  EXPECT_TRUE(operator_names.count("aten::pow.Tensor_Scalar") == 1)
      << "Expected aten::pow.Tensor_Scalar to have been called and recorded, but it was not";
}
|
|
|
|
|
|
2020-05-12 02:20:52 +00:00
|
|
|
class TestThreadLocalDebugInfo : public c10::DebugInfoBase {
|
2019-08-12 21:48:06 +00:00
|
|
|
public:
|
|
|
|
|
int getModelId() const {
|
|
|
|
|
return model_id_;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setModelId(int model_id) {
|
|
|
|
|
model_id_ = model_id;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
virtual ~TestThreadLocalDebugInfo() {}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
int model_id_ = 0;
|
|
|
|
|
};
|
|
|
|
|
|
2020-05-12 02:20:52 +00:00
|
|
|
// Checks, via TORCH_CHECK, that the calling thread has debug info registered
// under `kind`, that it is a TestThreadLocalDebugInfo, and that its model id
// equals `model_id`.
void checkDebugInfo(c10::DebugInfoKind kind, int model_id) {
  const auto debug_info = c10::ThreadLocalDebugInfo::get(kind);
  TORCH_CHECK(debug_info != nullptr);

  // Only const access is needed, so cast to a pointer-to-const.
  const auto* test_debug_info =
      dynamic_cast<const TestThreadLocalDebugInfo*>(debug_info.get());
  TORCH_CHECK(test_debug_info != nullptr);
  TORCH_CHECK(test_debug_info->getModelId() == model_id);
}
|
2019-08-12 21:48:06 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Thread-local debug info must be visible while a DebugInfoGuard is alive,
// absent once it is gone, visible inside work started with at::launch and
// inside RecordFunction callbacks, and must nest correctly across kinds.
TEST(ThreadLocalDebugInfoTest, Basic) {
  // Nothing is registered before the first guard.
  TORCH_CHECK(
      c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr);
  auto debug_info = std::make_shared<TestThreadLocalDebugInfo>();
  debug_info->setModelId(42);
  {
    c10::DebugInfoGuard guard(c10::DebugInfoKind::TEST_INFO, debug_info);
    checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42);
  }

  // check that thread local debug info is propagated through fork calls
  TORCH_CHECK(
      c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr);
  std::atomic<bool> done{false};
  {
    c10::DebugInfoGuard guard(c10::DebugInfoKind::TEST_INFO, debug_info);
    at::launch([&done]() {
      checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42);
      done = true;
    });
  }
  // Spin until the launched task has finished.
  while (!done) {
  }

  // check that thread local debug info is propagated through backward pass
  TORCH_CHECK(
      c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr);
  done = false;
  auto handle = addGlobalCallback(RecordFunctionCallback(
      [&done](const RecordFunction&) {
        checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42);
        done = true;
        return true;
      },
      [](const RecordFunction&) {}));
  {
    c10::DebugInfoGuard guard(c10::DebugInfoKind::TEST_INFO, debug_info);
    auto t = torch::randn({1, 2, 3}, at::kCPU);
    t.set_requires_grad(true);
    auto t2 = t.pow(2);
    t2.backward(torch::ones_like(t2, at::MemoryFormat::Preserve));
  }
  removeCallback(handle);
  TORCH_CHECK(done);

  // check nested debug info
  TORCH_CHECK(
      c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::TEST_INFO) == nullptr);
  {
    c10::DebugInfoGuard outer_guard(c10::DebugInfoKind::TEST_INFO, debug_info);
    checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42);

    // A second kind is stacked on top of the first; both must stay visible.
    // Locals are destroyed in reverse declaration order, so the inner guard
    // is still released before the outer one.
    auto inner_debug_info = std::make_shared<TestThreadLocalDebugInfo>();
    inner_debug_info->setModelId(314);
    c10::DebugInfoGuard inner_guard(
        c10::DebugInfoKind::TEST_INFO_2, inner_debug_info);

    checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42);
    checkDebugInfo(c10::DebugInfoKind::TEST_INFO_2, 314);

    done = false;
    at::launch([&done]() {
      checkDebugInfo(c10::DebugInfoKind::TEST_INFO, 42);
      checkDebugInfo(c10::DebugInfoKind::TEST_INFO_2, 314);
      done = true;
    });
    // Spin until the launched task has finished.
    while (!done) {
    }
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Runs a two-multiply graph through the plain interpreter to obtain a
// reference value, then runs it through a profiling GraphFunction. On the
// last iteration the already-profiled graph body is replaced by a single
// prim::FallbackGraph node and the result is checked against the reference.
TEST(FallbackGraphsTest, Basic) {
  auto x = at::randn({1}, at::kCPU);
  auto y = at::randn({1}, at::kCPU);
  auto stack = createStack({x.clone(), y.clone()});

  auto graph_string = R"IR(
    graph(%0 : Float(1),
          %1 : Float(1)):
      %2 : Tensor = aten::mul(%0, %1)
      %3 : Tensor = aten::mul(%2, %0)
      return (%3))IR";
  auto graph = std::make_shared<Graph>();
  torch::jit::parseIR(graph_string, graph.get());

  // Reference result computed by the interpreter on the unmodified graph.
  {
    Code code(graph, "");
    InterpreterState interpreter{code};
    interpreter.run(stack);
  }
  at::Tensor et;
  pop(stack, et);
  float ef = et.item<float>();
  {
    EnableProfilingGuard epg;
    GraphFunction f("fallbackGraphs", graph, nullptr);
    for (size_t i = 0; i < getNumProfiledRuns() + 1; i++) {
      stack.emplace_back(x.clone());
      stack.emplace_back(y.clone());
      if (i == getNumProfiledRuns()) {
        // we will be modifying a profiled graph
        // before ProfilingGraphExecutor
        // will optimize it in the next iteration
        auto opt_graph = lastExecutedOptimizedGraph();
        // this is safe to do since we are done profiling
        ProfilingRecord::removeProfileCounter(opt_graph->block());
        replaceBlockWithFallbackGraph(opt_graph->block(), opt_graph->inputs());
        // The block must now consist of exactly one FallbackGraph node whose
        // subgraph still holds both multiplications.
        auto it = opt_graph->block()->nodes().begin();
        ASSERT_EQ(it->kind(), prim::FallbackGraph);
        auto fallback = *it++;
        ASSERT_EQ(it, opt_graph->block()->nodes().end());
        ASSERT_TRUE(fallback->hasAttribute(attr::Subgraph));
        testing::FileCheck()
            .check("Tensor = aten::mul")
            ->check("Tensor = aten::mul")
            ->run(*fallback->g(attr::Subgraph));
      }
      f.run(stack);
      // Named `actual` rather than `at` so the local does not shadow the
      // `at` namespace used throughout this test.
      at::Tensor actual;
      pop(stack, actual);
      float af = actual.item<float>();
      ASSERT_EQ(af, ef);
    }

    auto opt_graph = lastExecutedOptimizedGraph();
    testing::FileCheck()
        .check("(Tensor) = prim::CallFunction")
        ->run(*opt_graph);
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// TODO this test wasn't running and is broken.
|
|
|
|
|
// TEST(AutogradProfilerTest, Basic) {
|
|
|
|
|
// constexpr int batch_size = 4;
|
|
|
|
|
// constexpr int input_size = 256;
|
|
|
|
|
// constexpr int seq_len = 32;
|
|
|
|
|
|
|
|
|
|
// int hidden_size = 2 * input_size;
|
|
|
|
|
// auto input = torch::randn({seq_len, batch_size, input_size}, at::kCPU);
|
|
|
|
|
// auto hx = torch::randn({batch_size, hidden_size}, at::kCPU);
|
|
|
|
|
// auto cx = torch::randn({batch_size, hidden_size}, at::kCPU);
|
|
|
|
|
// auto w_ih = t_def(torch::randn({4 * hidden_size, input_size}, at::kCPU));
|
|
|
|
|
// auto w_hh = t_def(torch::randn({4 * hidden_size, hidden_size}, at::kCPU));
|
|
|
|
|
|
|
|
|
|
// std::stringstream ss;
|
|
|
|
|
// {
|
|
|
|
|
// RecordProfile guard(ss);
|
|
|
|
|
// for (size_t i = 0; i < 100; ++i) {
|
|
|
|
|
// std::tie(hx, cx) = lstm(input[0], hx, cx, w_ih, w_hh);
|
|
|
|
|
// }
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
// std::string result = ss.str();
|
|
|
|
|
// size_t count = 0;
|
|
|
|
|
// for (size_t pos = 0; (pos = result.find("tanh", pos)) != std::string::npos;
|
|
|
|
|
// count++, pos++) {
|
|
|
|
|
// }
|
|
|
|
|
// ASSERT_EQ(count, 200);
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
// Registers two throwaway operators, one producing None typed as `int?` and
// one testing an `int?` for None, then checks that constant propagation can
// fold the pair without schema matching failing.
TEST(NoneSchemaMatchTest, Basic) {
  RegisterOperators reg({
      Operator(
          "prim::test_none() -> int?",
          [](Stack* stack) { push(stack, IValue()); },
          aliasAnalysisFromSchema()),
      Operator(
          "prim::is_none(int? a) -> bool",
          [](Stack* stack) {
            const IValue arg = pop(stack);
            push(stack, arg.isNone());
          },
          aliasAnalysisFromSchema()),
  });

  // Constant propagation will run test_none and produce a None,
  // testing that its type is set appropriately and schema matching doesn't
  // fail when running is_none

  auto graph = std::make_shared<Graph>();
  auto none_value =
      graph->insert(Symbol::fromQualString("prim::test_none"), {});
  auto is_none_value =
      graph->insert(Symbol::fromQualString("prim::is_none"), {none_value});
  graph->registerOutput(is_none_value);
  ConstantPropagation(graph);

  // checking that constant propagation ran without failure: exactly one node
  // is left in the block
  auto remaining = graph->block()->nodes();
  AT_ASSERT(std::distance(remaining.begin(), remaining.end()) == 1);
}
|
2019-04-11 20:30:42 +00:00
|
|
|
|
2019-04-12 21:53:17 +00:00
|
|
|
static int testPassValue = 0;
|
|
|
|
|
void fakePass(std::shared_ptr<Graph>& g) {
|
|
|
|
|
testPassValue++;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
RegisterPass p(fakePass);
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Executes an identity graph through GraphExecutor and checks that the
// pass registered via RegisterPass (fakePass) was invoked.
TEST(PassManagementTest, Basic) {
  auto graph = std::make_shared<Graph>();
  parseIR(
      R"IR(
graph(%a):
  return (%a))IR",
      graph.get());

  const std::vector<IValue> inputs = {IValue(torch::randn({22}, at::kCPU))};
  {
    // Run on a private copy so the input stack itself is left unmodified.
    std::vector<IValue> scratch = inputs;
    GraphExecutor executor(graph, "");
    executor.run(scratch);
  }
  // we will not run fusion in simple mode
  if (!getExecutorMode()) {
    AT_ASSERT(testPassValue);
  }
}
|
|
|
|
|
|
2020-08-15 03:16:00 +00:00
|
|
|
// Asserts that `typ` is a TensorType whose sizes are all known and equal to
// `expected`.
//
// A type without fully concrete sizes is reported as an ordinary assertion
// failure instead of going through `.value()` on an empty optional.
static void checkShape(TypePtr typ, std::vector<int64_t> expected) {
  auto ptp = typ->expect<TensorType>();
  const auto sizes = ptp->sizes().concrete_sizes();
  ASSERT_TRUE(sizes.has_value());
  ASSERT_EQ(*sizes, expected);
}
|
|
|
|
|
|
2019-05-20 17:37:49 +00:00
|
|
|
// Checks the output shape of a node against `expected`.
//
// When `prev` is true (the default) the node producing `n`'s first input is
// inspected; otherwise `n` itself is inspected.
static void checkShape(
    Node* n,
    std::vector<int64_t> expected,
    bool prev = true) {
  Node* inspected = n;
  if (prev) {
    inspected = n->inputs().at(0)->node();
  }
  checkShape(inspected->output()->type(), expected);
}
|
|
|
|
|
|
2020-06-10 20:46:11 +00:00
|
|
|
// Adds to `count` the number of nodes in `block`, including nodes in all
// nested blocks, for which `pred` returns true.
void count_(
    Block* block,
    const std::function<bool(Node* n)>& pred,
    size_t& count) {
  for (Node* node : block->nodes()) {
    count += pred(node) ? 1 : 0;

    // Descend into any sub-blocks owned by this node.
    for (Block* nested : node->blocks()) {
      count_(nested, pred, count);
    }
  }
}
|
|
|
|
|
|
|
|
|
|
size_t countNodes(
|
|
|
|
|
const std::shared_ptr<Graph>& graph,
|
|
|
|
|
const std::function<bool(Node* n)>& pred) {
|
|
|
|
|
size_t count = 0;
|
|
|
|
|
count_(graph->block(), pred, count);
|
|
|
|
|
return count;
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
bool true_pred(Node* n) {
|
|
|
|
|
return true;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
// Predicate that is true exactly for prim::Loop nodes.
bool is_loop(Node* n) {
  return n->kind() == prim::Loop;
}
|
2020-06-10 20:46:11 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Peels a loop whose body never reads the induction variable and checks
// both the resulting number of top-level loops and the computed value.
TEST(LoopPeelerTest, NoInductionVariableUse) {
  // do not use an induction variable explicitly
  static const auto str_func_def = R"JIT(
    def test_peel_n_times():
      sum = 0
      for i in range(10):
        sum += 2
      return sum
    )JIT";

  auto cu = compile(str_func_def);
  auto& f = cu->get_function("test_peel_n_times");
  auto stack = createStack({});

  // First peel the loop once, then peel more than one iteration. Each round
  // works on a fresh copy of the graph and pushes its result on `stack`.
  constexpr size_t kPeelCounts[] = {1, 3};
  for (const size_t peel_count : kPeelCounts) {
    LoopsPeeler peeler(true_pred, peel_count);
    auto copy = f.graph()->copy();
    peeler.run(copy);
    int num_loops =
        std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop);
    ASSERT_EQ(num_loops, 2);
    Code code(copy, "");
    InterpreterState interpreter{code};
    interpreter.run(stack);
    ASSERT_EQ(stack.back().toInt(), 20);
  }
}
|
2020-06-10 20:46:11 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Peels a loop whose body reads the induction variable and checks both the
// resulting number of top-level loops and the computed value.
TEST(LoopPeelerTest, YesInductionVariableUse) {
  // uses the induction variable
  static const auto str_func_def = R"JIT(
    def test_peel_n_times():
      sum = 0
      for i in range(10):
        sum += i
      return sum
    )JIT";

  auto cu = compile(str_func_def);
  auto& f = cu->get_function("test_peel_n_times");
  auto stack = createStack({});

  // First peel the loop once, then peel more than one iteration. Each round
  // works on a fresh copy of the graph and pushes its result on `stack`.
  constexpr size_t kPeelCounts[] = {1, 3};
  for (const size_t peel_count : kPeelCounts) {
    LoopsPeeler peeler(true_pred, peel_count);
    auto copy = f.graph()->copy();
    peeler.run(copy);
    int num_loops =
        std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop);
    ASSERT_EQ(num_loops, 2);
    Code code(copy, "");
    InterpreterState interpreter{code};
    interpreter.run(stack);
    ASSERT_EQ(stack.back().toInt(), 45);
  }
}
|
2020-06-10 20:46:11 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Peels a `while` loop with an explicit termination condition and checks
// both the resulting number of top-level loops and the computed value.
TEST(LoopPeelerTest, LoopWithTerminationCondition) {
  // tests with explicit termination conditions
  static const auto str_func_def = R"JIT(
    def test_with_cond_times():
      sum = 0
      i = 0
      while (sum < 2):
        sum += i
        i += 1
      return sum
    )JIT";

  auto cu = compile(str_func_def);
  auto& f = cu->get_function("test_with_cond_times");
  auto stack = createStack({});

  // Peeling 5 iterations should update the termination condition to false,
  // so the original loop doesn't run; with a single peeled iteration the
  // termination condition remains true.
  constexpr size_t kPeelCounts[] = {5, 1};
  for (const size_t peel_count : kPeelCounts) {
    LoopsPeeler peeler(true_pred, peel_count);
    auto copy = f.graph()->copy();
    peeler.run(copy);
    int num_loops =
        std::count_if(copy->nodes().begin(), copy->nodes().end(), is_loop);
    ASSERT_EQ(num_loops, 2);
    Code code(copy, "");
    InterpreterState interpreter{code};
    interpreter.run(stack);
    ASSERT_EQ(stack.back().toInt(), 3);
  }
}
|
|
|
|
|
|
|
|
|
|
// tests simple nested loops
|
|
|
|
|
// Peels two nested `for` loops and checks the total loop count (nested
// blocks included) and the computed value.
TEST(LoopPeelerTest, SimpleNestedLoops) {
  static const auto str_func_def = R"JIT(
    def test_nested_loops():
      sum = 0
      i = 0
      for i in range(10):
        for j in range(10):
          sum += i + j
      return sum
    )JIT";

  auto cu = compile(str_func_def);
  auto& f = cu->get_function("test_nested_loops");
  auto stack = createStack({});

  // Each round peels a fresh copy of the graph and pushes its result on
  // `stack`.
  constexpr size_t kPeelCounts[] = {1, 5};
  for (const size_t peel_count : kPeelCounts) {
    LoopsPeeler peeler(true_pred, peel_count);
    auto copy = f.graph()->copy();
    peeler.run(copy);
    ASSERT_EQ(countNodes(copy, is_loop), 5);
    Code code(copy, "");
    InterpreterState interpreter{code};
    interpreter.run(stack);
    ASSERT_EQ(stack.back().toInt(), 900);
  }
}
|
|
|
|
|
|
|
|
|
|
// Peels a `while` loop nested inside a `for` loop and checks the total loop
// count (nested blocks included) and the computed value.
TEST(LoopPeelerTest, SimpleNestedLoops2) {
  static const auto str_func_def = R"JIT(
    def test_nested_loops():
      sum = 0
      i = 0
      for i in range(10):
        j = 0
        while sum < 2:
          sum += i + j
          j += 1
      return sum
    )JIT";

  auto cu = compile(str_func_def);
  auto& f = cu->get_function("test_nested_loops");
  auto stack = createStack({});

  // Each round peels a fresh copy of the graph and pushes its result on
  // `stack`.
  constexpr size_t kPeelCounts[] = {1, 5};
  for (const size_t peel_count : kPeelCounts) {
    LoopsPeeler peeler(true_pred, peel_count);
    auto copy = f.graph()->copy();
    peeler.run(copy);
    ASSERT_EQ(countNodes(copy, is_loop), 5);
    Code code(copy, "");
    InterpreterState interpreter{code};
    interpreter.run(stack);
    ASSERT_EQ(stack.back().toInt(), 3);
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Profiles a small arithmetic function, inserts guards into a copy of the
// profiled graph, and checks the guard count before and after
// EliminateRedundantGuards.
TEST(InsertAndEliminateRedundantGuardsTest, Basic) {
  static const auto basic_example = R"JIT(
  def basic(x, y):
    a = x + y
    b = x * y
    c = x + 1
    d = a - c
    e = b - c
    return d + e
  )JIT";

  auto cu = compile(basic_example);
  auto& fun = cu->get_function("basic");
  auto pr = ProfilingRecord::instrumentGraph(fun.graph());
  auto x = at::randn({2, 3}, at::kCPU);
  auto y = at::randn({2, 3}, at::kCPU);
  auto stack = createStack({x, y});

  // introduce some profiling information
  Code cd(pr->profiled_graph_, "");
  InterpreterState is{cd};
  is.run(stack);

  auto copy = pr->profiled_graph_->copy();
  ProfilingRecord::removeProfileCounter(copy->block());
  InsertGuards(copy);

  auto nodes = copy->block()->nodes();
  const auto is_guard = [](Node* n) { return n->kind() == prim::Guard; };
  const auto count_guards = [&nodes, &is_guard]() -> int {
    return std::count_if(nodes.begin(), nodes.end(), is_guard);
  };

  // The first guard found: its input type carries no size information while
  // the guard's own output has the profiled {2, 3} shape.
  auto guard = std::find_if(nodes.begin(), nodes.end(), is_guard);
  ASSERT_NE(guard, nodes.end());
  ASSERT_EQ(
      guard->input()->type()->expect<TensorType>()->sizes().size(),
      c10::nullopt);
  checkShape(*guard, {2, 3}, false);

  int num_guards = count_guards();
  ASSERT_EQ(num_guards, 12);

  // now eliminate as many guards as possible
  // we should be left with two guards on x and y's defs
  EliminateRedundantGuards(copy);
  num_guards = count_guards();
  ASSERT_EQ(num_guards, 2);
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Profiles a function containing a loop, inserts and prunes guards, then
// checks that InsertBailOuts produces one prim::BailOut per remaining
// top-level guard and that each one is fed by a prim::BailoutTemplate.
TEST(InsertBailOutsTest, Basic) {
  static const auto basic_example = R"JIT(
  def basic_loop(x, y):

      a = x + 1
      b = y + 2
      c = x + y + 3

      for i in range(10):
          a = a + b
          # invariant
          d = b * c
          #
          a = a - d

      e = a + 4
      return e
  )JIT";

  auto cu = compile(basic_example);
  auto& fun = cu->get_function("basic_loop");
  auto pr = ProfilingRecord::instrumentGraph(fun.graph());
  auto x = at::randn({2, 3}, at::kCPU);
  auto y = at::randn({2, 3}, at::kCPU);
  auto stack = createStack({x, y});

  // introduce some profiling information
  Code cd(pr->profiled_graph_, "");
  InterpreterState is{cd};
  is.run(stack);

  auto copy = pr->profiled_graph_->copy();
  ProfilingRecord::removeProfileCounter(copy->block());
  InsertGuards(copy);
  EliminateRedundantGuards(copy);

  auto nodes = copy->block()->nodes();
  const auto is_guard = [](Node* n) { return n->kind() == prim::Guard; };
  const auto is_bailout = [](Node* n) { return n->kind() == prim::BailOut; };

  auto num_guards = std::count_if(nodes.begin(), nodes.end(), is_guard);
  ASSERT_EQ(num_guards, 3);

  InsertBailOuts(copy);
  auto num_bailouts = std::count_if(nodes.begin(), nodes.end(), is_bailout);
  ASSERT_EQ(num_guards, num_bailouts);

  // Every top-level bailout must take a BailoutTemplate as its first input.
  for (Node* candidate : nodes) {
    if (!is_bailout(candidate)) {
      continue;
    }
    ASSERT_EQ(
        candidate->inputs().at(0)->node()->kind(), prim::BailoutTemplate);
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
TEST(ProfilerTest, Basic) {
|
2019-04-17 04:08:38 +00:00
|
|
|
constexpr int batch_size = 4;
|
|
|
|
|
constexpr int input_size = 256;
|
|
|
|
|
|
|
|
|
|
int hidden_size = 2 * input_size;
|
|
|
|
|
|
|
|
|
|
auto input = at::randn({batch_size, input_size}, at::kCPU);
|
|
|
|
|
auto hx = at::randn({batch_size, hidden_size}, at::kCPU);
|
|
|
|
|
auto cx = at::randn({batch_size, hidden_size}, at::kCPU);
|
|
|
|
|
auto w_ih = t_def(at::randn({4 * hidden_size, input_size}, at::kCPU));
|
|
|
|
|
auto w_hh = t_def(at::randn({4 * hidden_size, hidden_size}, at::kCPU));
|
|
|
|
|
|
|
|
|
|
auto g = build_lstm();
|
2019-11-13 15:40:45 +00:00
|
|
|
auto stack = createStack({input, hx, cx, w_ih, w_hh});
|
2019-04-17 04:08:38 +00:00
|
|
|
|
|
|
|
|
auto& opt_graph = *g.get();
|
|
|
|
|
ArgumentSpecCreator arg_spec_creator(opt_graph);
|
|
|
|
|
ArgumentSpec spec =
|
|
|
|
|
arg_spec_creator.create(autograd::GradMode::is_enabled(), stack);
|
Specialize Optional[T] to T (or subtype for Tensor) or None when executing graph (#18407)
Summary:
This patch specializes `Optional[Tensor]` graph inputs to either a `DimensionedTensorType` (if a Tensor is passed) or `NoneType`. Other `Optional[T]` are specialized to `T` or `None`.
- For unwrapping (checked and unchecked) we need to keep the output type, as IR code that follows unwrapping may not work with NoneType (just as it doesn't deal with Optional). While it would not be hit during execution, it will run against the (legitimate) assumptions of the analysis passes.
- Function lookup currently will not match NoneType when it expects optional (I'm not entirely sure why this doesn't lead to unhappyness currently, but hey), I amend this at the level of the function matching code (`operator.cpp`), but see Adam's comments. We would run into trouble if we needed to select between functions whose signature only differs in Optional types with different subtypes, but we would have the same problem when calling them directly, so I would think this is OK.
- It would enable throwing away branches we can't hit. This also reduces the "blockyness" of the graph, so it may be easier to apply optimizations (e.g. fuse things in `if t is None: ...` and outside the `if`.
- Arguments passed into `Optional[Tensor]` arguments will get shape information, which is very handy.
- It get's rid of the problem that tensors passed into Optional arguments get requires_grad set erroneously #18270 (though that also affects lists, which aren't fixed here).
- `Optional[List[int]]` is needed for #18697.
- We're changing typing in a more subtle way than the `TensorType`->`DimensionedTensorType`.
- In particular, specializing to NoneType loses the Type information captured in the `OptionalType` element type.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18407
Reviewed By: zdevito
Differential Revision: D15216808
Pulled By: eellison
fbshipit-source-id: 01f1a7643deaf4962c3f55eff2070d54b0e54b69
2019-05-06 21:54:10 +00:00
|
|
|
arg_spec_creator.specializeTypes(opt_graph, spec);
|
2019-04-17 04:08:38 +00:00
|
|
|
auto pr = ProfilingRecord::instrumentGraph(g);
|
improved TorchScript traceback (#33834)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33834
This changes how we report Tracebacks to make them more clear when
there are both serialized and non-serialized ranges. It now looks like:
```
Traceback (most recent call last):
File "foo.py", line 25, in <module>
s2(a, b)
File "/scratch/zdevito/pytorch/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
File "code/__torch__.py", line 7, in forward
x: Tensor,
y: Tensor) -> Tensor:
return (self).bar(x, y, )
~~~~~~~~~ <--- HERE
def bar(self: __torch__.Moo,
x: Tensor,
File "code/__torch__.py", line 11, in bar
x: Tensor,
y: Tensor) -> Tensor:
_0 = (self).baz(x, y, )
~~~~~~~~~ <--- HERE
_1 = torch.ones([3], dtype=None, layout=None, device=None, pin_memory=None)
return torch.add(_0, _1, alpha=1)
File "code/__torch__.py", line 17, in baz
x: Tensor,
y: Tensor) -> Tensor:
return torch.add(x, y, alpha=1)
~~~~~~~~~ <--- HERE
Traceback of TorchScript, original code (most recent call last):
File "foo.py", line 11, in forward
def forward(self, x, y):
return self.bar(x, y)
~~~~~~~~ <--- HERE
File "foo.py", line 9, in bar
def bar(self, x, y):
return self.baz(x, y) + torch.ones(3)
~~~~~~~~ <--- HERE
File "foo.py", line 7, in baz
def baz(self, x, y):
return x + y
~~~~~ <--- HERE
RuntimeError: The size of tensor a (4) must match the size of tensor b (5) at non-singleton dimension 1
```
It follows Python convension of having the most important information last
and reading from the bottom up.
Changes:
* Moved the error message to the end, to copy Python
* Report original traceback separate from serialized traceback
* Make sure root functions have names in the interpreter trace.
Test Plan: Imported from OSS
Differential Revision: D20126136
Pulled By: zdevito
fbshipit-source-id: fd01f9985e5d74e04c4d064c02e8bc320f4fac13
2020-03-03 20:24:28 +00:00
|
|
|
Code cd(pr->profiled_graph_, "");
|
2019-04-17 04:08:38 +00:00
|
|
|
InterpreterState is{cd};
|
|
|
|
|
is.run(stack);
|
|
|
|
|
|
2020-08-15 03:16:00 +00:00
|
|
|
// profiled types are stored as attributes and show up in the dump, e.g.
|
2020-10-06 22:02:21 +00:00
|
|
|
// Tensor = prim::profile[profiled_type=Double(4, 256, strides=[256, 1],
|
|
|
|
|
// requires_grad=0, device=cpu)
|
2020-08-15 03:16:00 +00:00
|
|
|
testing::FileCheck()
|
|
|
|
|
.check("Tensor = prim::profile[profiled_type")
|
|
|
|
|
->check_same("256")
|
|
|
|
|
->run(*pr->profiled_graph_);
|
|
|
|
|
|
2019-04-17 04:08:38 +00:00
|
|
|
auto begin = pr->profiled_graph_->block()->nodes().begin();
|
|
|
|
|
auto end = pr->profiled_graph_->block()->nodes().end();
|
|
|
|
|
auto mm =
|
2020-06-05 20:41:53 +00:00
|
|
|
std::find_if(begin, end, [](Node* n) { return n->kind() == aten::add; });
|
2019-04-17 04:08:38 +00:00
|
|
|
ASSERT_NE(mm, end);
|
2020-06-05 20:41:53 +00:00
|
|
|
std::vector<int64_t> mm_expected{4, 2048};
|
2019-04-17 04:08:38 +00:00
|
|
|
std::vector<int64_t> eltwise{4, 512};
|
2020-08-15 03:16:00 +00:00
|
|
|
checkShape(mm->inputs().at(0)->node()->ty(attr::profiled_type), mm_expected);
|
2020-06-05 20:41:53 +00:00
|
|
|
auto mul_n =
|
|
|
|
|
std::find_if(begin, end, [](Node* n) { return n->kind() == aten::mul; });
|
|
|
|
|
ASSERT_NE(mul_n, end);
|
2020-08-15 03:16:00 +00:00
|
|
|
checkShape(mul_n->inputs().at(0)->node()->ty(attr::profiled_type), eltwise);
|
2019-04-17 04:08:38 +00:00
|
|
|
auto tanh_n =
|
|
|
|
|
std::find_if(begin, end, [](Node* n) { return n->kind() == aten::tanh; });
|
2020-08-15 03:16:00 +00:00
|
|
|
checkShape(tanh_n->inputs().at(0)->node()->ty(attr::profiled_type), eltwise);
|
2019-04-17 04:08:38 +00:00
|
|
|
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Compiles four small functions where 'foo' calls 'bar' and 'baz', and 'baz'
// calls 'ham', then checks the inlined callstack attached to the integer
// constants that end up in the optimized graphs of 'foo' and 'baz'.
TEST(CallStackTest, Basic) {
  // Function bodies must be indented for the script to parse.
  const auto text = R"(
def ham(x):
    return x/7

def bar(x):
    return x*3

def baz(x):
    return ham(x)*x

def foo(x):
    return bar(x)*baz(x)*11
)";
  auto cu = compile(text);
  const Function& foo = cu->get_function("foo");
  for (Node* n : foo.optimized_graph()->nodes()) {
    // Only integer-valued constants identify their originating function.
    if (n->kind() != prim::Constant || !n->hasAttribute(attr::value) ||
        n->kindOf(attr::value) != AttributeKind::i) {
      continue;
    }
    // Keep the attribute's full width instead of narrowing it to int.
    const int64_t v = n->i(attr::value);
    switch (v) {
      case 3: {
        // Const 3 comes from function 'bar', which gets inlined to 'foo'.
        // The callstack for the corresponding node should contain only the
        // function 'bar'.
        ASSERT_TRUE(n->callstack());
        auto callstack_vector = (*n->callstack())->vec();
        ASSERT_EQ(callstack_vector.size(), 1);
        ASSERT_EQ(callstack_vector[0].first, &cu->get_function("bar"));
        break;
      }
      case 7: {
        // Const 7 comes from function 'ham', which gets inlined to 'baz',
        // which is then inlined to 'foo'. The callstack for the corresponding
        // node should contain these two functions.
        ASSERT_TRUE(n->callstack());
        auto callstack_vector = (*n->callstack())->vec();
        ASSERT_EQ(callstack_vector.size(), 2);
        ASSERT_EQ(callstack_vector[0].first, &cu->get_function("baz"));
        ASSERT_EQ(callstack_vector[1].first, &cu->get_function("ham"));
        break;
      }
      case 11: {
        // Const 11 comes from function 'foo', which is not inlined anywhere
        // and thus it should not have a callstack.
        ASSERT_FALSE(n->callstack());
        break;
      }
      default:
        // Other integer constants are not part of this test.
        break;
    }
  }

  // Check that inlining doesn't corrupt callstack of the callee's nodes.
  const Function& baz = cu->get_function("baz");
  for (Node* n : baz.optimized_graph()->nodes()) {
    if (n->kind() != prim::Constant || !n->hasAttribute(attr::value) ||
        n->kindOf(attr::value) != AttributeKind::i) {
      continue;
    }
    const int64_t v = n->i(attr::value);
    ASSERT_EQ(v, 7);
    // Const 7 comes from function 'ham', which gets inlined to 'baz'. 'baz'
    // was also inlined into 'foo', but when looking at the graph of 'baz' we
    // should only see a callstack of depth 1 (containing only 'ham').
    ASSERT_TRUE(n->callstack());
    auto callstack_vector = (*n->callstack())->vec();
    ASSERT_EQ(callstack_vector.size(), 1);
    ASSERT_EQ(callstack_vector[0].first, &cu->get_function("ham"));
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Compiles a chain of calls c -> b -> a and checks that the two string
// constants originating in 'a' share a single InlinedCallStack object once
// everything has been inlined into 'c'.
TEST(CallStackTest, Caching) {
  // Function bodies must be indented for the script to parse.
  const auto text = R"(

def a(x):
    print("a1")
    print("a2")
    return x

def b(x):
    print("b1")
    print("b2")
    a(x)
    return x

def c(x):
    print("c1")
    print("c2")
    b(x)
    return x
)";
  auto cu = compile(text);
  const Function& c_fn = cu->get_function("c");

  // Maps each string constant's value to the callstack object on its node.
  std::unordered_map<std::string, InlinedCallStack*> callstack_objects;
  for (Node* n : c_fn.optimized_graph()->nodes()) {
    // Only string-valued constants are of interest here.
    if (n->kind() != prim::Constant || !n->hasAttribute(attr::value) ||
        n->kindOf(attr::value) != AttributeKind::s) {
      continue;
    }
    const std::string v = n->s(attr::value);
    if (n->callstack()) {
      callstack_objects[v] = n->callstack()->get();
    }
  }

  // We expect to see nodes prim::Constant[value="a1"] and
  // prim::Constant[value="a2"] inlined to function 'c'. Their callstacks are
  // the same (a->b->c), so we want to make sure we're not creating different
  // callstack entries for them.
  ASSERT_TRUE(
      callstack_objects.count("a1") != 0 && callstack_objects.count("a2") != 0);
  ASSERT_EQ(callstack_objects.at("a1"), callstack_objects.at("a2"));
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Creates a node for each of several qualified symbol names and checks the
// answer canRunWithAutograd() gives for it.
TEST(AutogradSymbolsTest, Basic) {
  // Qualified symbol name paired with the expected canRunWithAutograd result.
  const std::pair<const char*, bool> cases[] = {
      {"aten::test_symbol", true},
      {"prim::test_symbol", true},
      {"prim::FusionGroup", false},
      {"custom::test_symbol", false},
  };

  Graph graph;
  for (const auto& entry : cases) {
    const Symbol sym = Symbol::fromQualString(entry.first);
    auto node = graph.create(sym);
    // A gtest assertion (rather than TORCH_CHECK) reports the failing symbol
    // through the test framework instead of as an uncaught exception.
    ASSERT_EQ(canRunWithAutograd(node), entry.second)
        << "symbol: " << entry.first;
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Checks that compiling a function whose default argument has no type hint
// throws, while the same function with the hint 'y:int=1' compiles.
TEST(DefaultArgTypeHintingTest, Basic) {
  // Function bodies must be indented for the scripts to parse.
  const auto text_non_hinted = R"(

def a(x, y=1):
    print("a1")
    print("a2")
    return x
)";

  const auto text_hinted = R"(

def a(x, y:int=1):
    print("a1")
    print("a2")
    return x
)";

  // The un-hinted default must be rejected with a std::exception-derived
  // error; ASSERT_THROW replaces the manual try / ASSERT_TRUE(0) / catch.
  ASSERT_THROW(compile(text_non_hinted), std::exception);

  // The hinted variant must compile; any exception thrown here fails the test.
  compile(text_hinted);
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Basic set case.
|
|
|
|
|
TEST(FuturesTest, Basic) {
|
|
|
|
|
auto f1 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
ASSERT_FALSE(f1->completed());
|
|
|
|
|
ASSERT_FALSE(f1->hasValue());
|
|
|
|
|
int32_t sat1 = 0;
|
|
|
|
|
int32_t sat2 = 0;
|
|
|
|
|
f1->addCallback([&]() { ++sat1; });
|
|
|
|
|
f1->markCompleted(43);
|
|
|
|
|
ASSERT_TRUE(f1->completed());
|
|
|
|
|
ASSERT_TRUE(f1->hasValue());
|
|
|
|
|
ASSERT_FALSE(f1->hasError());
|
|
|
|
|
ASSERT_EQ(sat1, 1);
|
|
|
|
|
ASSERT_EQ(f1->constValue().toInt(), 43);
|
|
|
|
|
ASSERT_EQ(f1->value().toInt(), 43);
|
|
|
|
|
f1->addCallback([&]() { ++sat2; });
|
|
|
|
|
ASSERT_EQ(sat1, 1);
|
|
|
|
|
ASSERT_EQ(sat2, 1);
|
|
|
|
|
}
|
2020-06-08 12:50:10 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Basic error cases.
|
|
|
|
|
TEST(FuturesTest, Error) {
|
|
|
|
|
auto f1 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
int sat1 = 0;
|
|
|
|
|
int sat2 = 0;
|
|
|
|
|
f1->addCallback([&]() { ++sat1; });
|
|
|
|
|
f1->setError(
|
|
|
|
|
std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed")));
|
|
|
|
|
ASSERT_EQ(sat1, 1);
|
|
|
|
|
ASSERT_TRUE(f1->completed());
|
|
|
|
|
ASSERT_TRUE(f1->hasError());
|
|
|
|
|
ASSERT_FALSE(f1->hasValue());
|
|
|
|
|
try {
|
|
|
|
|
(void)f1->value();
|
|
|
|
|
ASSERT_TRUE(false); // Supposed to throw.
|
|
|
|
|
} catch (const std::exception& e) {
|
|
|
|
|
ASSERT_TRUE(strcmp(e.what(), "Failed") == 0);
|
2020-06-08 12:50:10 +00:00
|
|
|
}
|
2020-09-25 18:35:39 +00:00
|
|
|
f1->addCallback([&]() { ++sat2; });
|
|
|
|
|
ASSERT_EQ(sat1, 1);
|
|
|
|
|
ASSERT_EQ(sat2, 1);
|
|
|
|
|
f1->setErrorIfNeeded(
|
|
|
|
|
std::make_exception_ptr(c10::ivalue::Future::FutureError("Dup")));
|
|
|
|
|
ASSERT_TRUE(strcmp(f1->tryRetrieveErrorMessage().c_str(), "Failed") == 0);
|
|
|
|
|
ASSERT_EQ(sat1, 1);
|
|
|
|
|
ASSERT_EQ(sat2, 1);
|
|
|
|
|
}
|
2020-06-08 12:50:10 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// then
|
|
|
|
|
TEST(FuturesTest, Then) {
|
|
|
|
|
auto f1 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
auto f2 = f1->then(
|
|
|
|
|
[f1]() -> IValue { return f1->constValue().toInt() + 1; },
|
|
|
|
|
IntType::get());
|
|
|
|
|
auto f3 = f2->then(
|
|
|
|
|
[f2]() -> IValue { return f2->constValue().toInt() * 3; },
|
|
|
|
|
IntType::get());
|
|
|
|
|
bool done = false;
|
|
|
|
|
f3->addCallback([f3, &done]() {
|
|
|
|
|
ASSERT_EQ(f3->constValue().toInt(), (42 + 1) * 3);
|
|
|
|
|
done = true;
|
|
|
|
|
});
|
|
|
|
|
ASSERT_FALSE(done);
|
|
|
|
|
f1->markCompleted(42);
|
|
|
|
|
ASSERT_TRUE(done);
|
|
|
|
|
}
|
2020-06-08 12:50:10 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// collectAll()
|
|
|
|
|
TEST(FuturesTest, CollectAll) {
|
|
|
|
|
auto s1 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
auto s2 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
auto s3 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
|
|
|
|
|
// Empty case
|
|
|
|
|
c10::List<intrusive_ptr<ivalue::Future>> futures(
|
|
|
|
|
FutureType::create(IntType::get()));
|
|
|
|
|
auto c1 = collectAll(futures);
|
|
|
|
|
ASSERT_TRUE(c1->completed());
|
|
|
|
|
ASSERT_EQ(c1->value().toList().size(), 0);
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
|
*(c1->value().toList().elementType()) ==
|
|
|
|
|
*FutureType::create(IntType::get()));
|
|
|
|
|
|
|
|
|
|
// 1-element, initially not completed.
|
|
|
|
|
futures.push_back(s1);
|
|
|
|
|
auto c2 = collectAll(futures);
|
|
|
|
|
ASSERT_FALSE(c2->completed());
|
|
|
|
|
s1->markCompleted(5);
|
|
|
|
|
ASSERT_TRUE(c2->completed());
|
|
|
|
|
ASSERT_EQ(c2->value().toList().size(), 1);
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
|
*(c2->value().toList().elementType()) ==
|
|
|
|
|
*FutureType::create(IntType::get()));
|
|
|
|
|
ASSERT_EQ(c2->value().toList().get(0).toFuture()->value().toInt(), 5);
|
|
|
|
|
|
|
|
|
|
// 1-element, already completed
|
|
|
|
|
auto c3 = collectAll(futures);
|
|
|
|
|
ASSERT_TRUE(c3->completed());
|
|
|
|
|
ASSERT_EQ(c3->value().toList().size(), 1);
|
|
|
|
|
ASSERT_EQ(c3->value().toList().get(0).toFuture()->value().toInt(), 5);
|
|
|
|
|
|
|
|
|
|
// 3 elements.
|
|
|
|
|
futures.push_back(s2);
|
|
|
|
|
futures.push_back(s3);
|
|
|
|
|
auto c4 = collectAll(futures);
|
|
|
|
|
ASSERT_FALSE(c4->completed());
|
|
|
|
|
s3->markCompleted(7);
|
|
|
|
|
ASSERT_FALSE(c4->completed());
|
|
|
|
|
s2->markCompleted(6);
|
|
|
|
|
ASSERT_TRUE(c4->completed());
|
|
|
|
|
ASSERT_EQ(c4->value().toList().size(), 3);
|
|
|
|
|
ASSERT_EQ(c4->value().toList().get(0).toFuture()->value().toInt(), 5);
|
|
|
|
|
ASSERT_EQ(c4->value().toList().get(1).toFuture()->value().toInt(), 6);
|
|
|
|
|
ASSERT_EQ(c4->value().toList().get(2).toFuture()->value().toInt(), 7);
|
|
|
|
|
ASSERT_TRUE(
|
|
|
|
|
*(c4->value().toList().elementType()) ==
|
|
|
|
|
*FutureType::create(IntType::get()));
|
|
|
|
|
|
|
|
|
|
// Handle exception in the list.
|
|
|
|
|
auto s4 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
futures.push_back(s4);
|
|
|
|
|
auto c5 = collectAll(futures);
|
|
|
|
|
ASSERT_FALSE(c5->completed());
|
|
|
|
|
s4->setError(
|
|
|
|
|
std::make_exception_ptr(c10::ivalue::Future::FutureError("Failed")));
|
|
|
|
|
ASSERT_TRUE(c5->completed());
|
|
|
|
|
ASSERT_EQ(c5->value().toList().size(), 4);
|
|
|
|
|
try {
|
|
|
|
|
(void)c5->value().toList().get(3).toFuture()->value();
|
|
|
|
|
ASSERT_TRUE(false); // supposed to throw
|
|
|
|
|
} catch (const std::exception& e) {
|
|
|
|
|
ASSERT_EQ(std::string(e.what()), "Failed");
|
2020-06-08 12:50:10 +00:00
|
|
|
}
|
2020-09-25 18:35:39 +00:00
|
|
|
}
|
2020-06-09 23:28:48 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// collectAny()
|
|
|
|
|
TEST(FuturesTest, CollectAny) {
|
|
|
|
|
auto s1 = c10::make_intrusive<Future>(IntType::get());
|
2020-06-09 23:28:48 +00:00
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Empty case
|
|
|
|
|
c10::List<intrusive_ptr<ivalue::Future>> futures(
|
|
|
|
|
FutureType::create(IntType::get()));
|
|
|
|
|
auto c1 = collectAny(futures);
|
|
|
|
|
ASSERT_TRUE(c1->completed());
|
|
|
|
|
|
|
|
|
|
// 1 element, not yet satisfied
|
|
|
|
|
futures.push_back(s1);
|
|
|
|
|
auto c2 = collectAny(futures);
|
|
|
|
|
ASSERT_FALSE(c2->completed());
|
|
|
|
|
s1->markCompleted(5);
|
|
|
|
|
ASSERT_TRUE(c2->completed());
|
|
|
|
|
ASSERT_TRUE(c2->value().isInt());
|
|
|
|
|
ASSERT_EQ(c2->value().toInt(), 5);
|
|
|
|
|
|
|
|
|
|
// 1 element already satisfied.
|
|
|
|
|
auto c3 = collectAny(futures);
|
|
|
|
|
ASSERT_TRUE(c3->completed());
|
|
|
|
|
ASSERT_TRUE(c3->value().isInt());
|
|
|
|
|
ASSERT_EQ(c3->value().toInt(), 5);
|
|
|
|
|
|
|
|
|
|
// 2 elements
|
|
|
|
|
futures.clear();
|
|
|
|
|
auto s2 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
auto s3 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
futures.push_back(s2);
|
|
|
|
|
futures.push_back(s3);
|
|
|
|
|
auto c4 = collectAny(futures);
|
|
|
|
|
ASSERT_FALSE(c4->completed());
|
|
|
|
|
s3->markCompleted(7);
|
|
|
|
|
ASSERT_TRUE(c4->completed());
|
|
|
|
|
ASSERT_EQ(c4->value().toInt(), 7);
|
|
|
|
|
s2->markCompleted(1);
|
|
|
|
|
ASSERT_EQ(c4->value().toInt(), 7);
|
2020-06-08 12:50:10 +00:00
|
|
|
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Callbacks wrapped with wrapPropagateTLSState must observe the profiler as
// enabled even though the future is completed from a different thread.
TEST(TLSFutureCallbacksTest, Basic) {
  namespace profiler = torch::autograd::profiler;

  // cb that verifies the profiler is enabled
  auto profilerEnabledCb = []() { ASSERT_TRUE(profiler::profilerEnabled()); };

  // test running callbacks with propagation of TLS state.
  {
    // Enable the profiler in this thread
    profiler::enableProfiler(
        profiler::ProfilerConfig(profiler::ProfilerState::CPU, false, false));
    auto s1 = c10::make_intrusive<Future>(IntType::get());
    s1->addCallback(wrapPropagateTLSState<void>(profilerEnabledCb));
    std::thread completer([s1 = std::move(s1)]() { s1->markCompleted(); });
    // Since we join here, we can ensure that all callbacks corresponding to
    // markCompleted() have finished.
    completer.join();
    profiler::disableProfiler();
  }

  // then() with TLS State
  {
    // Enable the profiler in this thread
    profiler::enableProfiler(
        profiler::ProfilerConfig(profiler::ProfilerState::CPU, false, false));
    auto s1 = c10::make_intrusive<Future>(IntType::get());
    auto continuation = wrapPropagateTLSState<c10::IValue>([&profilerEnabledCb]() {
      profilerEnabledCb();
      return at::IValue(1);
    });
    auto s2 = s1->then(continuation, IntType::get());
    std::thread completer([s1 = std::move(s1)]() { s1->markCompleted(); });
    completer.join();
    s2->wait();
    profiler::disableProfiler();
  }
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
TEST(ProfilerDisableInCallbackTest, Basic) {
|
2020-09-23 04:13:07 +00:00
|
|
|
// cb that verifies the profiler is enabled
|
|
|
|
|
auto profilerEnabledCb = []() {
|
|
|
|
|
ASSERT_TRUE(torch::autograd::profiler::profilerEnabled());
|
|
|
|
|
};
|
|
|
|
|
torch::autograd::profiler::enableProfiler(
|
|
|
|
|
torch::autograd::profiler::ProfilerConfig(
|
|
|
|
|
torch::autograd::profiler::ProfilerState::CPU, false, false));
|
|
|
|
|
auto s1 = c10::make_intrusive<Future>(IntType::get());
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)

@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
auto verifyProfilerCb = wrapPropagateTLSState<void>([&profilerEnabledCb] {
|
2020-09-23 04:13:07 +00:00
|
|
|
// Ensure the profiler is still enabled in this thread.
|
|
|
|
|
profilerEnabledCb();
|
|
|
|
|
auto t1 = torch::ones({2, 2});
|
|
|
|
|
auto t2 = torch::ones({2, 2});
|
|
|
|
|
torch::add(t1, t2);
|
|
|
|
|
// Don't cleanup TLSState, and just consolidate.
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)

@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
auto opts = torch::autograd::profiler::ProfilerDisableOptions(false, true);
|
2020-09-23 04:13:07 +00:00
|
|
|
auto thread_event_lists =
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)

@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
torch::autograd::profiler::disableProfiler(std::move(opts));
|
2020-09-23 04:13:07 +00:00
|
|
|
// Ensure that the events from this thread are still profiled and we obtain
|
|
|
|
|
// the expected events in our consolidated list when calling
|
|
|
|
|
// disableProfiler().
|
|
|
|
|
bool found_ones = false;
|
|
|
|
|
bool found_add = false;
|
|
|
|
|
for (const auto& li : thread_event_lists) {
|
|
|
|
|
for (const auto& evt : li) {
|
|
|
|
|
if (strcmp(evt.name(), "aten::add") == 0) {
|
|
|
|
|
found_add = true;
|
|
|
|
|
} else if (strcmp(evt.name(), "aten::ones") == 0) {
|
|
|
|
|
found_ones = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)
@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
if (found_add && found_ones) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
2020-09-23 04:13:07 +00:00
|
|
|
}
|
|
|
|
|
ASSERT_TRUE(found_ones);
|
|
|
|
|
ASSERT_TRUE(found_add);
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)
@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
});
|
|
|
|
|
|
|
|
|
|
s1->addCallback(verifyProfilerCb);
|
2020-09-23 04:13:07 +00:00
|
|
|
// Disable the profiler, but do not consolidate results in the main thread.
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)
@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
auto opts = torch::autograd::profiler::ProfilerDisableOptions(true, false);
|
|
|
|
|
torch::autograd::profiler::disableProfiler(std::move(opts));
|
2020-09-23 04:13:07 +00:00
|
|
|
std::thread t([s1 = std::move(s1)]() { s1->markCompleted(at::IValue(1)); });
|
|
|
|
|
t.join();
|
[RPC profiling] Extend RPC profiling to support async function execution over RPC. (#44664)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/44664
Closes https://github.com/pytorch/pytorch/issues/39971. This PR adds support for functions decorated with `rpc.functions.async_execution` to be profiled over RPC as builtins, jit functions, and blocking python UDFs currently can be. The reasoning for this is to provide complete feature support in terms of RPC profiling and the various types of functions users can run.
To enable this, the PR below this enables calling `disableProfiler()` safely from another thread. We use that functionality to defer disabling the profiler on the server until the future corresponding to the RPC request completes (rather than only the blocking `processRPC` call as was done previously). Since when the future completes we've kicked off the async function and the future corresponding to it has completed, we are able to capture any RPCs the function would have called and the actual work done on the other node.
For example, if the following async function is run on a server over RPC:
```
def slow_add(x, y):
    time.sleep(1)
    return torch.add(x, y)
@rpc.functions.async_execution
def slow_async_add(to, x, y):
    return rpc.rpc_async(to, slow_add, args=(x, y))
```
we expect to see the original RPC profiled, the nested RPC profiled, and the actual torch.add() work. All of these events should be recorded with the correct node id. Here is an example profiling output:
```
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Name Self CPU total % Self CPU total CPU total % CPU total CPU time avg Number of Calls Node ID
------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- --------------- rpc_async#slow_async_add(worker1 -> worker2) 0.00% 0.000us 0 1.012s
1.012s 1 1
aten::empty 7.02% 11.519us 7.02% 11.519us 11.519us 1 1
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3) 0.00% 0.000us 0 1.006s
1.006s 1 2 rpc_async#slow_async_add(worker1 -> worker2)#remote_op: aten::empty 7.21% 11.843us 7.21% 11.843us
11.843us 1 2
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::add 71.94% 118.107us 85.77% 140.802us 140.802us 1 3
rpc_async#slow_async_add(worker1 -> worker2)#remote_op: rpc_async#slow_add(worker2 -> worker3)#remote_op: aten::empty 13.82% 22.695us 13.82% 22.695us
22.695us 1 3 ------------------------------------------------------------------------------------------------------------------------- --------------- --------------- --------------- --------
------- --------------- --------------- ---------------
Self CPU time total: 164.164us
```
This PR also moves a bunch of the profiling logic to `rpc/utils.cpp` to declutter `request_callback` code.
ghstack-source-id: 112868470
Test Plan:
```
rvarm1@devbig978:fbcode (52dd34f6)$ buck test mode/no-gpu mode/dev-nosan //caffe2/test/distributed/rpc:process_group_agent -- test_rpc_profiling_async_function --print-passing-details --stress-runs 1
```
Reviewed By: mrshenli
Differential Revision: D23638387
fbshipit-source-id: eedb6d48173a4ecd41d70a9c64048920bd4807c4
2020-09-25 20:17:24 +00:00
|
|
|
|
|
|
|
|
// Similar to above test, but verifies correctness in the case where
|
|
|
|
|
// continuation runs on the main thread.
|
|
|
|
|
torch::autograd::profiler::enableProfiler(
|
|
|
|
|
torch::autograd::profiler::ProfilerConfig(
|
|
|
|
|
torch::autograd::profiler::ProfilerState::CPU, false, false));
|
|
|
|
|
s1 = c10::make_intrusive<Future>(IntType::get());
|
|
|
|
|
s1->addCallback(verifyProfilerCb);
|
|
|
|
|
// Runs callback inline
|
|
|
|
|
s1->markCompleted(at::IValue(1));
|
|
|
|
|
opts = torch::autograd::profiler::ProfilerDisableOptions(true, false);
|
|
|
|
|
torch::autograd::profiler::disableProfiler(std::move(opts));
|
2020-09-23 04:13:07 +00:00
|
|
|
}
|
|
|
|
|
|
2020-09-25 18:35:39 +00:00
|
|
|
// Checks that a script function built from source text can be invoked with
// one argument in the first (positional) initializer list and another
// supplied by name, while a third parameter is left to its declared default.
// NOTE(review): `compile` is a helper defined elsewhere; presumably it returns
// a compilation unit holding `foo` -- confirm against test_utils.
TEST(IValueKWargsTest, Basic) {
|
2020-09-24 07:19:05 +00:00
|
|
|
const auto text = R"(
|
|
|
|
|
def foo(a : int, b : int, c : int = 4):
|
|
|
|
|
return a + 2*b + 3*c
|
|
|
|
|
)";
|
|
|
|
|
auto cu = compile(text);
|
|
|
|
|
// `{1}` carries the leading argument and `{{"b", 3}}` names `b` explicitly;
// `c` is not passed at all.
auto result = cu->get_function("foo")({1}, {{"b", 3}});
|
|
|
|
|
// Consistent with a = 1, b = 3 and c at its default of 4:
// 1 + 2*3 + 3*4 == 19.
ASSERT_EQ(result.toInt(), 19);
|
|
|
|
|
}
|
|
|
|
|
|
2018-10-07 05:58:28 +00:00
|
|
|
} // namespace jit
|
|
|
|
|
} // namespace torch
|