pytorch/benchmarks/dynamo/microbenchmarks
Jason Ansel a669319450 [inductor] Faster C++ kernel python bindings (#117500)
Calling C++ from Python via ctypes is notoriously slow.  This switches to generating our own C++ bindings directly, which is a >5x speedup on this kernel-launch-bound microbenchmark:
```python
from ctypes import c_void_p
import torch
from torch import empty
from torch._inductor.codecache import AsyncCompile
from torch._dynamo.testing import rand_strided
from torch._inductor.utils import print_performance
from torch._inductor.wrapper_benchmark import compiled_module_main

async_compile = AsyncCompile()

src = '''
#include "/tmp/torchinductor_jansel/gb/cgbau5vlj6cetmcjbjbtw6x4rrivaln6f45s5d72gy2bfx5foz3k.h"
extern "C" void kernel(const float* in_ptr0,
                       float* out_ptr0)
{
    {
        auto tmp0 = in_ptr0[static_cast<long>(0L)];
        auto tmp1 = static_cast<float>(1.0);
        auto tmp2 = decltype(tmp0)(tmp0 + tmp1);
        out_ptr0[static_cast<long>(0L)] = tmp2;
    }
}
'''

cpp_fused_add_ctypes = async_compile.cpp(src)
cpp_fused_add_cpython = async_compile.cpp_pybinding(["const float*", "float*"], src)

async_compile.wait(globals())
del async_compile

def call(arg0_1):
    buf0 = empty((1,), device='cpu', dtype=torch.float32)
    if use_ctypes:
        for _ in range(100):
            cpp_fused_add_ctypes(c_void_p(arg0_1.data_ptr()), c_void_p(buf0.data_ptr()))
    else:
        for _ in range(100):
            cpp_fused_add_cpython(arg0_1, buf0)
    del arg0_1
    return (buf0,)

def benchmark_compiled_module(times=1000, repeat=100):
    arg0_1 = rand_strided((1,), (1,), device='cpu', dtype=torch.float32)
    return print_performance(lambda: call(arg0_1), times=times, repeat=repeat)

print("old ctypes bindings: ", end='')
use_ctypes = True
compiled_module_main('None', benchmark_compiled_module)
print("new bindings:        ", end='')
use_ctypes = False
compiled_module_main('None', benchmark_compiled_module)
```
Output:
```
old ctypes bindings: 0.000073
new bindings:        0.000013
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/117500
Approved by: https://github.com/desertfire
2024-01-18 16:20:12 +00:00
..
operator_inp_logs
__init__.py
bench_mm_fusion.py
benchmark_helper.py
inductor_bmm.py
inductor_cpu_atomic.py
inductor_mm.py
matmul_relu.py
microbench.py
model.py
operator_inp_utils.py
operatorbench.py
overheads.py [inductor] Faster C++ kernel python bindings (#117500) 2024-01-18 16:20:12 +00:00
tensor_layout_mini_benchmark.py
utils.py