mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/47767 This diff implements the functionality of running benchmark on mobile on top of operator_benchmark framework. It does so through a few steps: 1. create a scripted module from existing benchmark case. 2. run mobile specific optimization pass on the scripted module 3. run the scripted module on AiBench by calling its Python API A small change in the way of writing a benchmark case is introduced so that both local and mobile run can share the same interface. The change is about having inputs as arguments of the `forward` function, so that mobile optimization pass can be run successfully (otherwise everything will be optimized away by constant propagation). Test Plan: ## local op_bench run buck run caffe2/benchmarks/operator_benchmark:benchmark_all_test -- --iterations 1 --warmup_iterations 1 buck run caffe2/benchmarks/operator_benchmark:benchmark_all_test -- --iterations 1 --warmup_iterations 1 --use_jit Exceptions: `py_module` op in `FakeQuantizePerTensorBaseOpBenchmark` and `FakeQuantizePerChannelBaseOpBenchmark` under JIT mode. These tests also failed in the base version ``` RuntimeError: Module 'FakeQuantizePerChannelOpBenchmark' has no attribute 'op_func' (This function exists as an attribute on the Python module, but we failed to compile it to a TorchScript function. 
The error stack is reproduced here: Python builtin <built-in method apply of FunctionMeta object at 0x619000c652a0> is currently not supported in Torchscript: File "/data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/pt/quantization_test#link-tree/quantization_test.py", line 260 quant_min: int, quant_max: int ): return _LearnableFakeQuantizePerChannelOp.apply(input, scale, zero_point, axis, quant_min, quant_max, 1.0) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE : File "/data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/pt/quantization_test#link-tree/quantization_test.py", line 313 axis: int, quant_min: int, quant_max: int ): return self.op_func(input, scale, zero_point, axis, quant_min, quant_max) ~~~~~~~~~~~~ <--- HERE ``` `_consume_op` typing mismatch: chunk, split, qobserver, sort in qunary. These will be fixed in D24774105 ## OSS test python3 -m benchmark_all_test --iterations 1 --warmup_iterations 1 --use_jit python3 -m benchmark_all_test --iterations 1 --warmup_iterations 1 ## saved module graph ``` module __torch__.mobile_benchmark_utils.OpBenchmarkMobile { parameters { } attributes { training = True num_iters = 1 benchmark = <__torch__.pt.add_test.___torch_mangle_4.AddBenchmark object at 0x6070001b8b50> } methods { method forward { graph(%self : __torch__.mobile_benchmark_utils.OpBenchmarkMobile): %12 : None = prim::Constant() # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/mobile_benchmark_utils.py:9:4 %4 : bool = prim::Constant[value=1]() # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/mobile_benchmark_utils.py:10:8 %1 : int = prim::GetAttr[name="num_iters"](%self) = prim::Loop(%1, %4) # 
/data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/mobile_benchmark_utils.py:10:8 block0(%i : int): %6 : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark = prim::GetAttr[name="benchmark"](%self) %7 : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark = prim::GetAttr[name="benchmark"](%self) %self.inputs_tuple : (Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu), Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu)) = prim::Constant[value=({0.48884}, {0.809042})]() %9 : Tensor, %10 : Tensor = prim::TupleUnpack(%self.inputs_tuple) %23 : int = prim::Constant[value=1]() %24 : Tensor = aten::add(%9, %10, %23) # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/pt/add_test.py:39:15 -> (%4) return (%12) } } submodules { module __torch__.pt.add_test.___torch_mangle_4.AddBenchmark { parameters { } attributes { mobile_optimized = True } methods { method forward { graph(%self : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark, %input_one.1 : Tensor, %input_two.1 : Tensor): %3 : int = prim::Constant[value=1]() %4 : Tensor = aten::add(%input_one.1, %input_two.1, %3) # /data/users/wangyang19/fbsource/fbcode/buck-out/dev/gen/caffe2/benchmarks/operator_benchmark/fb/pt/mobile/benchmark_all_test_fbcode#link-tree/pt/add_test.py:39:15 return (%4) } method get_inputs { graph(%self : __torch__.pt.add_test.___torch_mangle_4.AddBenchmark): %self.inputs_tuple : (Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu), Float(1, 1, 1, strides=[1, 1, 1], requires_grad=0, device=cpu)) = prim::Constant[value=({0.48884}, {0.809042})]() return (%self.inputs_tuple) } } submodules { } } } } ``` Reviewed By: kimishpatel Differential Revision: D24322214 fbshipit-source-id: 335317eca4f40c4083883eb41dc47caf25cbdfd1
103 lines
2.6 KiB
Python
import operator_benchmark as op_bench
import torch
"""Microbenchmarks for binary operators."""


# Benchmark ops performance with broadcast: the two inputs have different
# shapes and are broadcast against each other by the op under test.
binary_ops_bcast_list = op_bench.op_list(
    attr_names=['op_name', 'op_func'],
    attrs=[
        ['add', torch.add],
    ],
)
# Configs with broadcast: one shape pair whose dimensions broadcast
# (64x1x64 against 1x64x1), CPU only, float32.
binary_configs_broadcast = op_bench.config_list(
    attr_names=['in_one', 'in_two'],
    attrs=[
        [[64, 1, 64], [1, 64, 1]],
    ],
    cross_product_configs={
        'device': ['cpu'],
        'dtype': [torch.float],
    },
    tags=["short"],
)
class BinaryOpBcastBenchmark(op_bench.TorchBenchmarkBase):
    """Benchmark a two-input op whose operands broadcast against each other.

    `in_one` / `in_two` are the shapes of the two input tensors; `op_func`
    is the callable being measured. Inputs are passed to `forward` as
    arguments so the benchmark also works when scripted for mobile.
    """

    def init(self, in_one, in_two, dtype, device, op_func):
        # Create the tensors in float via randn, then cast to the requested
        # dtype on the requested device.
        self.inputs = {
            "in_one": torch.randn(in_one, device=device).to(dtype=dtype),
            "in_two": torch.randn(in_two, device=device).to(dtype=dtype),
        }
        self.op_func = op_func

    def forward(self, in_one, in_two):
        return self.op_func(in_one, in_two)
# Register the broadcast benchmarks with the operator_benchmark framework.
op_bench.generate_pt_tests_from_op_list(
    binary_ops_bcast_list,
    binary_configs_broadcast,
    BinaryOpBcastBenchmark,
)
def copy(in1, in2):
    """In-place copy of `in2` into `in1`; returns `in1` (Tensor.copy_ semantics).

    Thin wrapper so that the in-place method can be listed next to
    free functions such as torch.add in the op list below.
    """
    return in1.copy_(in2)
# Benchmark ops performance without broadcast: both inputs share one shape.
binary_ops_list = op_bench.op_list(
    attr_names=['op_name', 'op_func'],
    attrs=[
        ['add', torch.add],
        ['copy_', copy],
    ],
)
# Short configs: a few fixed M x N x K shapes, CPU and CUDA, int32 inputs.
binary_short_configs = op_bench.config_list(
    attr_names=['M', 'N', 'K'],
    attrs=[
        [1, 1, 1],
        [64, 64, 64],
        [64, 64, 128],
    ],
    cross_product_configs={
        'device': ['cpu', 'cuda'],
        'dtype_one': [torch.int32],
        'dtype_two': [torch.int32],
    },
    tags=['short'],
)
# Long configs: full cross-product over shapes, devices, and the two input
# dtypes (int8/int32), so mixed-dtype combinations are covered as well.
binary_long_configs = op_bench.cross_product_configs(
    M=[8, 128],
    N=[32, 64],
    K=[256, 512],
    device=['cpu', 'cuda'],
    dtype_one=[torch.int8, torch.int32],
    dtype_two=[torch.int8, torch.int32],
    tags=['long'],
)
class BinaryOpBenchmark(op_bench.TorchBenchmarkBase):
    """Benchmark a two-input op on same-shape (M, N, K) tensors.

    The two inputs may use different dtypes (`dtype_one` / `dtype_two`).
    Inputs are passed to `forward` as arguments so the benchmark also
    works when scripted for mobile.
    """

    def init(self, M, N, K, device, dtype_one, dtype_two, op_func):
        # Create the tensors in float via randn, then cast each input to
        # its own requested dtype.
        self.inputs = {
            "input_one": torch.randn(M, N, K, device=device).to(dtype=dtype_one),
            "input_two": torch.randn(M, N, K, device=device).to(dtype=dtype_two),
        }
        self.op_func = op_func

    def forward(self, input_one, input_two):
        return self.op_func(input_one, input_two)
# Register the no-broadcast benchmarks for both the short and long configs.
op_bench.generate_pt_tests_from_op_list(
    binary_ops_list,
    binary_short_configs + binary_long_configs,
    BinaryOpBenchmark,
)
if __name__ == "__main__":
    # Entry point: parse CLI flags and run the registered benchmarks.
    op_bench.benchmark_runner.main()