2022-10-13 23:18:06 +00:00
|
|
|
import collections
|
2022-11-29 00:58:46 +00:00
|
|
|
import contextlib
|
2022-10-13 23:18:06 +00:00
|
|
|
import dataclasses
|
|
|
|
|
import functools
|
|
|
|
|
import hashlib
|
|
|
|
|
from itertools import count
|
|
|
|
|
from typing import Any, Dict, List
|
|
|
|
|
|
2023-01-17 20:25:18 +00:00
|
|
|
from torch._dynamo.utils import dynamo_timed
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
from .. import codecache, config, ir
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
from ..codecache import cpp_compile_command, get_code_path
|
2023-01-17 20:25:18 +00:00
|
|
|
from ..utils import cache_on_self, has_triton, sympy_dot, sympy_product
|
2022-10-13 23:18:06 +00:00
|
|
|
from ..virtualized import V
|
|
|
|
|
from .common import CodeGen, DeferredLine, IndentedBuffer, Kernel
|
|
|
|
|
from .triton import texpr
|
|
|
|
|
|
|
|
|
|
pexpr = texpr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def buffer_reuse_key(node: ir.Buffer):
    """Return a hashable key grouping buffers whose storage is interchangeable.

    Two buffers with equal keys live on the same device, share a dtype, hold
    the same (simplified) number of elements, and span the same storage extent
    given their strides — so a freed one can back a new allocation.
    """
    dims = node.get_size()
    strides = node.get_stride()
    # Offset of the last element; distinguishes layouts whose strides leave
    # gaps in the underlying storage.
    max_offset = sympy_dot([d - 1 for d in dims], strides)
    sizevars = V.graph.sizevars
    return (
        node.get_device(),
        node.get_dtype(),
        sizevars.simplify(sympy_product(dims)),
        sizevars.size_hint(max_offset),
    )
|
|
|
|
|
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def make_buffer_reuse(old, new, del_func, declare, ending, as_strided):
    """Emit one wrapper line that repurposes buffer *old* as buffer *new*.

    When the two layouts match exactly the new name is a plain alias;
    otherwise an ``as_strided``-style call reshapes the storage. *del_func*,
    *declare*, *ending* and *as_strided* parameterize the target syntax
    (Python vs C++ wrappers).
    """
    assert old.get_dtype() == new.get_dtype()
    old_name = old.get_name()
    new_name = new.get_name()

    # Graph outputs must stay alive, so only intermediates get a delete tail.
    if old_name in V.graph.get_output_names():
        del_line = ""
    else:
        del_line = del_func(old_name)

    same_layout = (
        old.get_size() == new.get_size() and old.get_stride() == new.get_stride()
    )
    if same_layout:
        return f"{declare}{new_name} = {old_name}{del_line}{ending}"

    shape_str = V.graph.sizevars.codegen_shape_tuple(new.get_size())
    stride_str = V.graph.sizevars.codegen_shape_tuple(new.get_stride())
    return (
        f"{declare}{new_name} = {as_strided}({old_name}, "
        f"{shape_str}, {stride_str}){del_line}{ending}"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def make_buffer_allocation(buffer):
    """Return the Python wrapper line allocating *buffer* via ``empty_strided``."""
    codegen_tuple = V.graph.sizevars.codegen_shape_tuple
    shape_str = codegen_tuple(tuple(buffer.get_size()))
    stride_str = codegen_tuple(tuple(buffer.get_stride()))
    device = buffer.get_device()
    dtype = buffer.get_dtype()
    return (
        f"{buffer.get_name()} = empty_strided("
        f"{shape_str}, {stride_str}, "
        f"device='{device.type}', dtype={dtype})"
    )
|
|
|
|
|
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def make_cpp_buffer_allocation(buffer):
    """Return the C++ wrapper line allocating *buffer* via ``at::empty_strided``."""
    # Imported at call time; presumably avoids an import cycle with .cpp —
    # TODO confirm before hoisting to module level.
    from .cpp import DTYPE_TO_ATEN

    # TODO: map layout and device here
    codegen_tuple = V.graph.sizevars.codegen_shape_tuple
    name = buffer.get_name()
    shape_str = codegen_tuple(tuple(buffer.get_size()))
    stride_str = codegen_tuple(tuple(buffer.get_stride()))
    aten_dtype = DTYPE_TO_ATEN[buffer.get_dtype()]
    return f"auto {name} = at::empty_strided({shape_str}, {stride_str}, {aten_dtype}); "
|
|
|
|
|
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
class MemoryPlanningState:
    """Pool of freed-but-unreused buffers consulted during memory planning.

    Buffers are grouped by their ``buffer_reuse_key``; a later allocation
    with a matching key may take over a previously freed buffer's storage.
    """

    def __init__(self):
        super().__init__()
        # key -> LIFO stack of freed lines whose storage is up for grabs
        self.reuse_pool: Dict[
            Any, List["FreeIfNotReusedLine"]
        ] = collections.defaultdict(list)

    def __contains__(self, key):
        """True when at least one freed buffer is available under *key*."""
        pool = self.reuse_pool.get(key)
        return pool is not None and len(pool) > 0

    def pop(self, key) -> "FreeIfNotReusedLine":
        """Take the most recently freed buffer registered under *key*."""
        entry = self.reuse_pool[key].pop()
        assert not entry.is_reused
        return entry

    def push(self, key, item: "FreeIfNotReusedLine"):
        """Register a freed buffer so a later allocation may claim it."""
        assert not item.is_reused
        self.reuse_pool[key].append(item)
|
|
|
|
|
|
|
|
|
|
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a python device context manages, that is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
@dataclasses.dataclass
class EnterCudaDeviceContextManagerLine:
    """Wrapper-code line opening a CUDA device guard for ``device_idx``."""

    device_idx: int

    def codegen(self, code: IndentedBuffer):
        # torch.cuda._DeviceGuard is cheaper than torch.cuda.device, but it
        # only accepts an integer device index.
        guard = f"with torch.cuda._DeviceGuard({self.device_idx}):"
        code.writeline(guard)
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a python device context manages, that is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class ExitCudaDeviceContextManagerLine:
    """Marker line paired with EnterCudaDeviceContextManagerLine; carries no
    state and emits nothing itself."""
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a python device context manages, that is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
class MemoryPlanningLine:
    """Base class for a deferred line of wrapper code.

    Subclasses participate in two passes: ``plan`` may rewrite the line to
    exploit buffer reuse, and ``codegen`` emits the final text.
    """

    def plan(self, state: MemoryPlanningState) -> "MemoryPlanningLine":
        """First pass to find reuse"""
        return self

    def codegen(self, code: IndentedBuffer):
        """Second pass to output code"""
        pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass
class AllocateLine(MemoryPlanningLine):
    """Deferred allocation of ``node``; may turn into a ReuseLine at plan time."""

    node: ir.Buffer

    def plan(self, state: MemoryPlanningState):
        """Drop the line if the buffer was removed; otherwise claim a freed
        buffer with a compatible layout when one is available."""
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()

        # try to reuse a recently freed buffer
        key = buffer_reuse_key(self.node)
        if key not in state:
            return self
        free_line = state.pop(key)
        free_line.is_reused = True
        return ReuseLine(free_line.node, self.node)

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        code.writeline(make_buffer_allocation(self.node))
|
|
|
|
|
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
@dataclasses.dataclass
class CppAllocateLine(AllocateLine):
    """AllocateLine variant that emits C++ (``at::empty_strided``) instead of
    Python ``empty_strided``.

    Previously duplicated the whole body of ``AllocateLine.plan`` just to
    construct a ``CppReuseLine`` instead of a ``ReuseLine``; now it delegates
    to the base class and converts the result, so the reuse logic lives in
    exactly one place.
    """

    def plan(self, state: MemoryPlanningState):
        """Same planning as AllocateLine, but reuse lines are C++-flavored."""
        line = super().plan(state)
        # Base class builds a plain ReuseLine; swap in the C++ variant so
        # codegen emits C++ syntax. NullLine/self pass through untouched.
        if isinstance(line, ReuseLine) and not isinstance(line, CppReuseLine):
            line = CppReuseLine(line.node, line.reused_as)
        return line

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        code.writeline(make_cpp_buffer_allocation(self.node))
|
|
|
|
|
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
@dataclasses.dataclass
class FreeIfNotReusedLine(MemoryPlanningLine):
    """Frees ``node`` at codegen time unless a later allocation reused it."""

    node: ir.Buffer
    is_reused: bool = False

    def plan(self, state: MemoryPlanningState):
        """Offer this buffer's storage to later AllocateLines via *state*."""
        assert not self.is_reused
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()
        state.push(buffer_reuse_key(self.node), self)
        return self

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        if self.is_reused:
            return
        code.writeline(f"del {self.node.get_name()}")
|
|
|
|
|
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
@dataclasses.dataclass
class CppFreeIfNotReusedLine(FreeIfNotReusedLine):
    """C++-wrapper variant of FreeIfNotReusedLine.

    Releases the tensor with ``.reset();`` (C++ syntax) instead of the
    Python ``del`` statement the base class emits.
    """

    # Buffer whose storage may be freed.
    node: ir.Buffer
    # Set to True by memory planning when a later line reuses this storage.
    is_reused: bool = False

    def codegen(self, code: IndentedBuffer):
        buf_name = self.node.get_name()
        assert buf_name not in V.graph.removed_buffers
        if not self.is_reused:
            code.writeline(f"{buf_name}.reset();")
|
|
|
|
|
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
@dataclasses.dataclass
class ReuseLine(MemoryPlanningLine):
    """Memory-planning line that reuses the storage of ``node`` as ``reused_as``."""

    # Buffer whose storage is being donated.
    node: ir.Buffer
    # Buffer that takes over the donated storage.
    reused_as: ir.Buffer

    def plan(self, state: MemoryPlanningState):
        """Both endpoints must still be live; the line itself is unchanged."""
        assert self.node.get_name() not in V.graph.removed_buffers
        assert self.reused_as.get_name() not in V.graph.removed_buffers
        return self

    def codegen(self, code: IndentedBuffer):
        """Emit the Python reuse expression (``as_strided`` + ``del``)."""
        assert self.node.get_name() not in V.graph.removed_buffers
        assert self.reused_as.get_name() not in V.graph.removed_buffers
        reuse_expr = make_buffer_reuse(
            self.node,
            self.reused_as,
            del_func=lambda name: f"; del {name}",
            declare="",
            ending="",
            as_strided="as_strided",
        )
        code.writeline(reuse_expr + " # reuse")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass
class CppReuseLine(ReuseLine):
    """C++-wrapper variant of ReuseLine.

    Emits C++ syntax: ``auto`` declarations, ``at::as_strided``, and
    ``.reset()`` in place of the Python equivalents.
    """

    node: ir.Buffer
    reused_as: ir.Buffer

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        assert self.reused_as.get_name() not in V.graph.removed_buffers
        reuse_expr = make_buffer_reuse(
            self.node,
            self.reused_as,
            del_func=lambda name: f"; {name}.reset()",
            declare="auto ",
            ending=";",
            as_strided="at::as_strided",
        )
        code.writeline(reuse_expr + " // reuse")
|
2022-10-13 23:18:06 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
@dataclasses.dataclass
class FreeLine(MemoryPlanningLine):
    """Memory-planning line that unconditionally frees ``node`` with ``del``."""

    node: ir.Buffer

    def plan(self, state: MemoryPlanningState):
        """Drop the line entirely when the buffer was optimized away."""
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()
        return self

    def codegen(self, code: IndentedBuffer):
        buf_name = self.node.get_name()
        assert buf_name not in V.graph.removed_buffers
        code.writeline(f"del {buf_name}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NullLine(MemoryPlanningLine):
    """A no-op planning line, substituted when a line is elided during planning."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WrapperCodeGen(CodeGen):
|
|
|
|
|
"""
|
|
|
|
|
The outer wrapper that calls the kernels.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
def __init__(self):
    """Set up codegen state and emit the static header of the generated module.

    Three IndentedBuffers are filled incrementally and concatenated at the
    end: ``header`` (imports + kernel definitions), ``prefix`` (start of the
    generated ``call()``), and ``wrapper_call`` (body of ``call()``).
    """
    super().__init__()
    # Monotonically increasing suffixes for generated kernel names.
    self._names_iter = count()
    self.header = IndentedBuffer()
    self.prefix = IndentedBuffer()
    self.wrapper_call = IndentedBuffer()
    # NOTE(review): kernels presumably maps kernel source -> name (used for
    # de-duplication elsewhere in this class) — confirm against define_kernel.
    self.kernels = {}
    # Deferred MemoryPlanningLine objects / raw strings for the call body.
    self.lines = []
    # Static imports every generated module needs.
    self.header.splice(
        f"""
        from ctypes import c_void_p, c_long
        import torch
        import random
        from torch import empty_strided, as_strided, device
        from {codecache.__name__} import AsyncCompile
        from torch._inductor.select_algorithm import extern_kernels

        aten = torch.ops.aten
        assert_size_stride = torch._C._dynamo.guards.assert_size_stride
        async_compile = AsyncCompile()

        """
    )
    # Triton-specific imports only when a working triton install exists.
    if has_triton():
        self.header.splice(
            f"""
            import triton
            import triton.language as tl
            from {config.inductor_import}.triton_ops.autotune import grid
            from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
            """
        )

        if config.triton.convolution != "aten":
            self.header.splice(
                f"""
                from {config.inductor_import}.triton_ops.conv_perf_model import early_config_prune
                from {config.inductor_import}.triton_ops.conv_perf_model import estimate_conv_time
                from {config.inductor_import}.triton_ops.autotune import conv_heuristics
                """
            )

    self.write_prefix()

    for name, value in V.graph.constants.items():
        # include a hash so our code cache gives different constants different files
        hashed = hashlib.sha256(repr(value).encode("utf-8")).hexdigest()
        self.header.writeline(f"{name} = None # {hashed}")

    # Names already allocated / freed, to avoid duplicate codegen.
    self.allocated = set()
    self.freed = set()
    # Memoize so each stream variable is emitted at most once per index.
    self.write_get_cuda_stream = functools.lru_cache(None)(
        self.write_get_cuda_stream
    )

    # Memoized import writer: each distinct import line lands in the
    # header exactly once, no matter how many call sites request it.
    @functools.lru_cache(None)
    def add_import_once(line):
        self.header.writeline(line)

    self.add_import_once = add_import_once
    # repr(meta) -> emitted variable name; see add_meta_once().
    self._metas = {}
|
|
|
|
|
|
|
|
|
|
def add_meta_once(self, meta):
    """Emit ``meta`` into the header at most once; return its variable name.

    Distinct metas get sequential names ``meta0``, ``meta1``, ...; repeated
    metas (by ``repr``) return the previously assigned name.
    """
    key = repr(meta)
    if key not in self._metas:
        var_name = f"meta{len(self._metas)}"
        self._metas[key] = var_name
        self.header.writeline(f"{var_name} = {key}")
    return self._metas[key]
|
|
|
|
|
|
2022-12-13 09:52:54 +00:00
|
|
|
@cache_on_self
def get_output_refs(self):
    """Codegen reference strings for every graph output (cached per instance)."""
    return [out.codegen_reference() for out in V.graph.graph_outputs]
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def write_prefix(self):
    """Emit the start of the generated ``call()`` entry point.

    Closes out async kernel compilation, unpacks the runtime args, seeds
    randomness buffers, and binds dynamic size variables.
    """
    self.prefix.splice(
        """

        async_compile.wait(globals())
        del async_compile

        def call(args):
        """
    )
    with self.wrapper_call.indent():
        if config.triton.debug_sync_graph:
            self.wrapper_call.writeline("torch.cuda.synchronize()")
        inp_len = len(V.graph.graph_inputs.keys())
        if inp_len != 0:
            # Trailing comma for a single input so tuple-unpacking still works.
            lhs = f"{', '.join(V.graph.graph_inputs.keys())}{'' if inp_len != 1 else ','}"
            self.wrapper_call.writeline(f"{lhs} = args")
            # Drop caller references so inputs can be freed as early as possible.
            self.wrapper_call.writeline("args.clear()")
        for name in V.graph.randomness_seeds:
            self.wrapper_call.writeline(
                f"torch.randint(2**31, size=(), dtype=torch.int64, out={name})"
            )
        # Emit assignments for symbolic size variables derived from inputs.
        V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
|
2022-10-13 23:18:06 +00:00
|
|
|
|
|
|
|
|
def write_get_cuda_stream(self, index):
    """Emit a line fetching the raw CUDA stream for device ``index``.

    Returns the generated variable name. Wrapped in ``lru_cache`` by
    ``__init__`` so each index is emitted only once.
    """
    stream_name = f"stream{index}"
    self.writeline(f"{stream_name} = get_cuda_stream({index})")
    return stream_name
|
|
|
|
|
|
2022-11-10 21:38:04 +00:00
|
|
|
def next_kernel_suffix(self):
    """Return the next kernel-name suffix ("0", "1", ...) as a string."""
    return str(next(self._names_iter))
|
2022-10-13 23:18:06 +00:00
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def write_allocate_line(self, buffer):
    """Queue an AllocateLine for ``buffer``; actual codegen happens after planning."""
    line = AllocateLine(buffer)
    self.writeline(line)
|
|
|
|
|
|
|
|
|
|
def get_deferred_line(self, name, layout):
    """Build a DeferredLine aliasing ``name`` to its view's codegen reference."""
    view_ref = layout.view.codegen_reference()
    return DeferredLine(name, f"{name} = {view_ref} # alias")
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
def codegen_allocation(self, buffer):
    """Emit the line that materializes `buffer`, at most once per name.

    Buffers that were removed from the graph or already emitted are
    skipped, as are buffers whose storage is produced elsewhere
    (extern-kernel allocations, multi-output nodes, mutations).
    Aliased layouts first allocate the underlying storage, then emit a
    deferred reinterpret/view line.
    """
    name = buffer.get_name()
    if name in V.graph.removed_buffers or name in self.allocated:
        return
    # Record before recursing so a buffer is never allocated twice.
    self.allocated.add(name)

    # These IR nodes produce their own output storage; no allocation line.
    if isinstance(buffer, (ir.ExternKernelAlloc, ir.MultiOutput)):
        return

    buf_layout = buffer.get_layout()
    if isinstance(buf_layout, ir.MutationLayout):
        # Mutations write into an existing buffer; nothing to allocate.
        return

    if isinstance(buf_layout, ir.AliasedLayout):
        assert isinstance(buf_layout.view, ir.ReinterpretView)
        if not buf_layout.maybe_guard_aligned():
            V.graph.unaligned_buffers.add(name)
        # The storage being aliased must exist before the view line.
        self.codegen_allocation(buf_layout.view.data)
        self.writeline(self.get_deferred_line(name, buf_layout))
        return

    self.write_allocate_line(buffer)
|
|
|
|
|
|
|
|
|
|
def write_del_line(self, name):
    """Emit a ``del <name>`` statement dropping the Python reference."""
    self.writeline("del " + name)
|
|
|
|
|
|
|
|
|
|
def write_free_if_not_reused_line(self, buffer):
    # Queue a deferred line for `buffer`; by its name, FreeIfNotReusedLine
    # presumably frees the buffer only when no later line reused its
    # storage — confirm against FreeIfNotReusedLine's codegen.
    self.writeline(FreeIfNotReusedLine(buffer))
|
2022-10-13 23:18:06 +00:00
|
|
|
|
|
|
|
|
def codegen_free(self, buffer):
    """Emit the code that releases `buffer` once it is dead.

    Graph inputs are simply ``del``-ed (freed but never reused).  Other
    buffers are recorded in ``self.freed`` and then either ``del``-ed
    (aliased / multi-output layouts) or freed-if-not-reused.
    """
    name = buffer.get_name()

    if isinstance(buffer, ir.InputBuffer):
        # can be freed but not reused
        self.write_del_line(name)
        return

    if not self.can_reuse(buffer):
        return
    self.freed.add(name)

    if isinstance(buffer.get_layout(), (ir.AliasedLayout, ir.MultiOutputLayout)):
        self.write_del_line(name)
        return

    self.write_free_if_not_reused_line(buffer)
|
2022-10-13 23:18:06 +00:00
|
|
|
|
|
|
|
|
def can_reuse(self, buffer):
    """Return True if `buffer`'s storage may be reused for another buffer.

    Removed buffers, graph inputs, constants, and already-freed buffers
    are never eligible.
    """
    name = buffer.get_name()
    ineligible = (
        V.graph.removed_buffers,
        V.graph.graph_inputs,
        V.graph.constants,
        self.freed,
    )
    return not any(name in pool for pool in ineligible)
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def write_reuse_line(self, input_buffer, output_buffer):
    # Queue a deferred ReuseLine that re-binds input_buffer's storage to
    # output_buffer when the wrapper is finally rendered.
    self.writeline(ReuseLine(input_buffer, output_buffer))
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
def codegen_inplace_reuse(self, input_buffer, output_buffer):
    """Take over `input_buffer`'s storage as the allocation of `output_buffer`.

    Both buffers must have identical reuse keys, i.e. be interchangeable
    as storage (checked by the assert below).
    """
    assert buffer_reuse_key(input_buffer) == buffer_reuse_key(output_buffer)
    # The storage being taken over must exist first.
    self.codegen_allocation(input_buffer)
    src_name, dst_name = input_buffer.get_name(), output_buffer.get_name()
    self.freed.add(src_name)
    self.allocated.add(dst_name)
    self.write_reuse_line(input_buffer, output_buffer)
|
|
|
|
|
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a python device context manages, that is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
def codegen_cuda_device_guard_enter(self, device_idx):
    # Queue a deferred line that enters a CUDA device context manager
    # for `device_idx` (text produced by EnterCudaDeviceContextManagerLine).
    self.lines.append(EnterCudaDeviceContextManagerLine(device_idx))
|
|
|
|
|
|
|
|
|
|
def codegen_cuda_device_guard_exit(self):
    # Queue the matching exit for codegen_cuda_device_guard_enter.
    self.lines.append(ExitCudaDeviceContextManagerLine())
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def generate_return(self, output_refs):
    """Write the wrapper's return statement for the given output refs.

    A trailing comma is emitted so a single output still returns a tuple.
    """
    if not output_refs:
        self.wrapper_call.writeline("return ()")
        return
    joined = ", ".join(output_refs)
    self.wrapper_call.writeline("return (" + joined + ", )")
|
|
|
|
|
|
|
|
|
|
def generate_end(self, result):
    # No-op in this wrapper; presumably a hook for other wrapper
    # codegens to append epilogue code to `result` — confirm against
    # subclasses elsewhere in the project.
    return
|
2022-10-13 23:18:06 +00:00
|
|
|
|
2022-12-14 15:43:31 +00:00
|
|
|
def generate_extern_kernel_out(
    self, output_view, codegen_reference, args, kernel, cpp_kernel
):
    """Emit a call to `kernel` writing into an ``out=`` destination.

    The destination is `output_view` (rendered via its own
    codegen_reference) when given, otherwise the pre-rendered
    `codegen_reference` string.  `args` is extended in place.
    (`cpp_kernel` is unused here; kept for interface compatibility.)
    """
    out_ref = (
        output_view.codegen_reference() if output_view else codegen_reference
    )
    args.append(f"out={out_ref}")
    self.writeline(f"{kernel}({', '.join(args)})")
|
|
|
|
|
|
2023-01-17 20:25:18 +00:00
|
|
|
@dynamo_timed
|
2022-10-13 23:18:06 +00:00
|
|
|
def generate(self):
|
|
|
|
|
result = IndentedBuffer()
|
|
|
|
|
result.splice(self.header)
|
|
|
|
|
result.splice(self.prefix)
|
|
|
|
|
|
|
|
|
|
out_names = V.graph.get_output_names()
|
2022-11-29 00:58:46 +00:00
|
|
|
with contextlib.ExitStack() as stack:
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
stack.enter_context(self.wrapper_call.indent())
|
2022-11-29 00:58:46 +00:00
|
|
|
if config.profiler_mark_wrapper_call:
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
self.wrapper_call.writeline(
|
|
|
|
|
"from torch.profiler import record_function"
|
|
|
|
|
)
|
|
|
|
|
self.wrapper_call.writeline(
|
|
|
|
|
"with record_function('inductor_wrapper_call'):"
|
|
|
|
|
)
|
|
|
|
|
stack.enter_context(self.wrapper_call.indent())
|
2022-10-13 23:18:06 +00:00
|
|
|
while (
|
|
|
|
|
self.lines
|
|
|
|
|
and isinstance(self.lines[-1], MemoryPlanningLine)
|
|
|
|
|
and self.lines[-1].node.name not in out_names
|
|
|
|
|
):
|
|
|
|
|
# these lines will be pointless
|
|
|
|
|
self.lines.pop()
|
|
|
|
|
|
|
|
|
|
# codegen allocations in two passes
|
|
|
|
|
planning_state = MemoryPlanningState()
|
|
|
|
|
for i in range(len(self.lines)):
|
|
|
|
|
if isinstance(self.lines[i], MemoryPlanningLine):
|
|
|
|
|
self.lines[i] = self.lines[i].plan(planning_state)
|
|
|
|
|
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a python device context manages, that is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
device_cm_stack = contextlib.ExitStack()
|
2022-10-13 23:18:06 +00:00
|
|
|
for line in self.lines:
|
|
|
|
|
if isinstance(line, MemoryPlanningLine):
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
line.codegen(self.wrapper_call)
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a Python device context manager, which is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
elif isinstance(line, EnterCudaDeviceContextManagerLine):
|
|
|
|
|
line.codegen(self.wrapper_call)
|
|
|
|
|
device_cm_stack.enter_context(self.wrapper_call.indent())
|
2023-01-12 17:23:49 +00:00
|
|
|
self.wrapper_call.writeline(
|
|
|
|
|
f"torch.cuda.set_device({line.device_idx}) # no-op to ensure context"
|
|
|
|
|
)
|
generate device context managers in inductor code (#90934)
Fixes https://github.com/pytorch/torchdynamo/issues/1717, https://github.com/pytorch/torchdynamo/issues/1990
<s>TODO: add test with multiple devices, figure out extra context initialization</s>
Problems:
<s>It still initializes context on 0-th device that it shouldn't, I'll take a look where that happens and fix before landing</s>
It adds a Python device context manager, which is absurdly slow and takes ~2.5 us (should be nanoseconds). That's not a problem for real models, because it'll be called just once, but it is a bit of an inconvenience for microbenchmarking, we should make that context manager more performant (won't fix in this PR)
It still can have bugs for graphs that run on multiple devices and can have buffers incorrectly shared between multiple device by memory reuse, if that happens that'll need to be solved separately.
Generated code:
```
def call(args):
arg0_1, arg1_1 = args
args.clear()
with torch.cuda.device(1):
buf0 = empty_strided((4, ), (1, ), device='cuda', dtype=torch.float32)
stream1 = get_cuda_stream(1)
triton_fused_div_0.run(arg0_1, arg1_1, buf0, 4, grid=grid(4), stream=stream1)
del arg0_1
del arg1_1
return (buf0, )
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/90934
Approved by: https://github.com/wconstab
2022-12-16 18:03:39 +00:00
|
|
|
elif isinstance(line, ExitCudaDeviceContextManagerLine):
|
|
|
|
|
device_cm_stack.close()
|
2022-10-13 23:18:06 +00:00
|
|
|
else:
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
self.wrapper_call.writeline(line)
|
2022-10-13 23:18:06 +00:00
|
|
|
|
2022-12-13 09:52:54 +00:00
|
|
|
output_refs = self.get_output_refs()
|
2022-12-08 16:22:26 +00:00
|
|
|
if config.triton.debug_sync_graph:
|
|
|
|
|
self.wrapper_call.writeline("torch.cuda.synchronize()")
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
self.generate_return(output_refs)
|
|
|
|
|
|
|
|
|
|
with result.indent():
|
|
|
|
|
result.splice(self.wrapper_call)
|
|
|
|
|
|
|
|
|
|
self.generate_end(result)
|
2022-10-13 23:18:06 +00:00
|
|
|
|
|
|
|
|
self.add_benchmark_harness(result)
|
|
|
|
|
|
|
|
|
|
return result.getvalue()
|
|
|
|
|
|
|
|
|
|
def add_benchmark_harness(self, output):
    """
    Append a benchmark harness to generated code for debugging
    """
    if not config.benchmark_harness:
        return

    # Hoist the sizevars handle; it is reused for every constant and input.
    sizevars = V.graph.sizevars

    def emit_fake_input(name, shape, stride, device, dtype):
        # Emit one `<name> = rand_strided(...)` line reconstructing the
        # tensor's shape/stride/device/dtype for the standalone harness.
        shape_str = sizevars.codegen_benchmark_shape_tuple(shape)
        stride_str = sizevars.codegen_benchmark_shape_tuple(stride)
        output.writeline(
            f"{name} = rand_strided({shape_str}, {stride_str}, device='{device}', dtype={dtype})"
        )

    output.writelines(["", "", 'if __name__ == "__main__":'])
    with output.indent():
        output.splice(
            """
            from torch._dynamo.testing import rand_strided
            from torch._inductor.utils import print_performance
            """,
            strip=True,
        )

        # Constants are real tensors, so size/stride/device/dtype are direct.
        for const_name, const in V.graph.constants.items():
            emit_fake_input(
                const_name, const.size(), const.stride(), const.device, const.dtype
            )

        # Graph inputs are IR values; resolve symbolic sizes to hints first.
        for inp_name, inp in V.graph.graph_inputs.items():
            shape = [sizevars.size_hint(s) for s in inp.get_size()]
            stride = [sizevars.size_hint(s) for s in inp.get_stride()]
            emit_fake_input(
                inp_name, shape, stride, inp.get_device(), inp.get_dtype()
            )

        call_args = ", ".join(V.graph.graph_inputs.keys())
        output.writeline(f"print_performance(lambda: call([{call_args}]))")
|
|
|
|
|
|
|
|
|
|
def define_kernel(self, name: str, kernel: str):
    """Register the kernel source under *name* in the module header section."""
    definition = f"\n\n{name} = {kernel}"
    self.header.splice(definition)
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
    """Emit code to load a kernel before calling it.

    No-op in the Python wrapper: kernels are bound by name via the header
    (see ``define_kernel``), so nothing needs to be loaded at call time.
    NOTE(review): presumably overridden by wrapper variants (e.g. the cpp
    wrapper, which dlopens compiled kernels) — confirm against subclasses.
    """
    return
|
|
|
|
|
|
|
|
|
|
def wrap_kernel_call(self, name, call_args):
    """Render a kernel invocation expression: ``name(arg0, arg1, ...)``."""
    joined_args = ", ".join(call_args)
    return f"{name}({joined_args})"
|
|
|
|
|
|
|
|
|
|
def generate_kernel_call(self, name, call_args):
    """Append a line invoking kernel *name* with *call_args* to the wrapper."""
    call_expr = self.wrap_kernel_call(name, call_args)
    self.writeline(call_expr)
|
|
|
|
|
|
2022-10-13 23:18:06 +00:00
|
|
|
def call_kernel(self, name: str, kernel: Kernel):
    """Let *kernel* render its own call into a scratch buffer, then copy
    each non-empty line (stripped of indentation) into this wrapper."""
    scratch = IndentedBuffer()
    kernel.call_kernel(self, scratch, name)
    for raw in scratch.getvalue().split("\n"):
        stripped = raw.strip()
        if stripped:
            self.writeline(stripped)
|
|
|
|
|
|
|
|
|
|
def writeline(self, line):
    """Queue a line for the wrapper body.

    *line* may be a plain string or a deferred line object (e.g. a
    MemoryPlanningLine) that is rendered later during ``generate``.
    """
    self.lines.append(line)
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class CppWrapperCodeGen(WrapperCodeGen):
    """
    The outer wrapper that calls the kernels.

    Emits a C++ ``call_<id>`` entry point (compiled via load_inline) instead
    of the default Python call function.
    """

    # Class-level counter shared by all instances: each instance draws a
    # unique id used to name its generated `call_<id>` function.
    call_func_id = count()
|
|
|
|
|
|
|
|
|
|
def __init__(self):
    # NOTE(review): the id must be assigned before super().__init__() runs —
    # the base constructor appears to build output (the `call_<id>` prefix)
    # that reads self._call_func_id; keep this ordering.
    self._call_func_id = next(CppWrapperCodeGen.call_func_id)
    super().__init__()
|
|
|
|
|
|
2022-12-13 09:52:54 +00:00
|
|
|
@cache_on_self
def get_output_refs(self):
    """Return codegen references for all graph outputs, preferring an
    output's C++-specific reference when it provides a callable one."""
    refs = []
    for out in V.graph.graph_outputs:
        cpp_ref = getattr(out, "cpp_wrapper_codegen_reference", None)
        if callable(cpp_ref):
            refs.append(cpp_ref())
        else:
            refs.append(out.codegen_reference())
    return refs
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
def write_prefix(self):
|
|
|
|
|
self.prefix.splice(
|
|
|
|
|
"""
|
|
|
|
|
async_compile.wait(globals())
|
|
|
|
|
del async_compile
|
|
|
|
|
from torch.utils.cpp_extension import load_inline
|
|
|
|
|
wrapper = (
|
|
|
|
|
'''
|
|
|
|
|
#include <dlfcn.h>
|
|
|
|
|
#include <assert.h>
|
Inductor cpp wrapper: cache the loading of the kernel (#89742)
### Pitch
Cache the loaded kernel to reduce the overhead.
#### Code before:
```cpp
std::vector<at::Tensor> call_0(std::tuple<at::Tensor&, at::Tensor&> args) {
...
auto kernel_cpp_0_lib = dlopen("/tmp/torchinductor_xxx/yr/cyr3uymlc6pgvnimx3fnynaa4t7ldafeqzhe5zpizmvorisx4hb2.so", RTLD_NOW);
assert(kernel_cpp_0_lib != nullptr);
void (*kernel_cpp_0)(const float*,const float*,float*,float*);
*(void **) (&kernel_cpp_0) = dlsym(kernel_cpp_0_lib, "kernel");
kernel_cpp_0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
...
}
```
#### Code after:
```cpp
template <typename KernelFunc>
KernelFunc load_cpp_kernel(const char* so_filename) {
KernelFunc kernel_cpp;
auto kernel_cpp_lib = dlopen(so_filename, RTLD_NOW);
assert(kernel_cpp_lib != nullptr);
*(void **) (&kernel_cpp) = dlsym(kernel_cpp_lib, "kernel");
return kernel_cpp;
}
std::vector<at::Tensor> call_0(std::tuple<at::Tensor&, at::Tensor&> args) {
...
static auto kernel_cpp_0 = load_cpp_kernel<void (*)(const float*,const float*,float*,float*)>("/tmp/torchinductor_xxx/yr/cyr3uymlc6pgvnimx3fnynaa4t7ldafeqzhe5zpizmvorisx4hb2.so");
kernel_cpp_0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
...
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/89742
Approved by: https://github.com/jgong5, https://github.com/desertfire
2023-01-30 07:39:56 +00:00
|
|
|
|
|
|
|
|
template <typename KernelFunc>
|
|
|
|
|
KernelFunc load_cpp_kernel(const char* so_filename) {
|
|
|
|
|
KernelFunc kernel_cpp;
|
|
|
|
|
auto kernel_cpp_lib = dlopen(so_filename, RTLD_NOW);
|
|
|
|
|
assert(kernel_cpp_lib != nullptr);
|
|
|
|
|
*(void **) (&kernel_cpp) = dlsym(kernel_cpp_lib, "kernel");
|
|
|
|
|
return kernel_cpp;
|
|
|
|
|
}
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
"""
|
|
|
|
|
)
|
|
|
|
|
with self.wrapper_call.indent():
|
|
|
|
|
inputs_len = len(V.graph.graph_inputs.keys())
|
2022-12-13 09:52:54 +00:00
|
|
|
output_refs = self.get_output_refs()
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
if output_refs:
|
|
|
|
|
if len(output_refs) == 1:
|
|
|
|
|
output_types = "at::Tensor"
|
|
|
|
|
else:
|
|
|
|
|
output_types = "std::vector<at::Tensor>"
|
|
|
|
|
else:
|
|
|
|
|
output_types = "void"
|
|
|
|
|
|
2022-12-14 15:43:32 +00:00
|
|
|
inputs_types = "std::vector<at::Tensor>"
|
|
|
|
|
self.wrapper_call.writeline(
|
|
|
|
|
f"{output_types} call_{self._call_func_id}({inputs_types} args) {{"
|
|
|
|
|
)
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
if inputs_len != 0:
|
|
|
|
|
inputs_keys_str = ", ".join(V.graph.graph_inputs.keys())
|
|
|
|
|
self.wrapper_call.writeline(f"at::Tensor {inputs_keys_str};")
|
2022-12-14 15:43:32 +00:00
|
|
|
for idx, input_key in enumerate(V.graph.graph_inputs.keys()):
|
|
|
|
|
self.wrapper_call.writeline(f"{input_key} = args[{idx}];")
|
|
|
|
|
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
for name in V.graph.randomness_seeds:
|
|
|
|
|
self.wrapper_call.writeline(f"at::Tensor {name};")
|
|
|
|
|
self.wrapper_call.writeline(
|
|
|
|
|
f"{name} = at::randint(std::pow(2, 31), {{}}, at::ScalarType::Long);"
|
|
|
|
|
)
|
|
|
|
|
V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
|
|
|
|
|
|
|
|
|
|
def write_allocate_line(self, buffer):
    """Queue a deferred C++ allocation statement for *buffer*."""
    alloc_line = CppAllocateLine(buffer)
    self.writeline(alloc_line)
|
|
|
|
|
|
|
|
|
|
def write_del_line(self, name):
    """Emit a C++ statement releasing the tensor named *name*.

    ``at::Tensor::reset()`` drops the storage reference, mirroring
    Python ``del`` in the generated wrapper.  (The original trailing
    bare ``return`` was redundant and has been removed.)
    """
    self.writeline(f"{name}.reset();")
|
|
|
|
|
|
|
|
|
|
def write_free_if_not_reused_line(self, buffer):
    """Queue a deferred C++ free of *buffer*, skipped when the buffer is reused.

    (The original trailing bare ``return`` was redundant and has been
    removed, matching the style of ``write_allocate_line``.)
    """
    self.writeline(CppFreeIfNotReusedLine(buffer))
|
|
|
|
|
|
|
|
|
|
def write_reuse_line(self, input_buffer, output_buffer):
    """Queue a deferred C++ statement reusing *input_buffer*'s storage
    for *output_buffer*."""
    reuse_line = CppReuseLine(input_buffer, output_buffer)
    self.writeline(reuse_line)
|
|
|
|
|
|
|
|
|
|
def get_deferred_line(self, name, layout):
    """Return a DeferredLine that aliases *name* to its view's C++ reference."""
    view_ref = layout.view.codegen_reference()
    return DeferredLine(name, f"auto {name} = {view_ref}; // alias")
|
|
|
|
|
|
|
|
|
|
def get_kernel_path(self, code):
    """Return the path of the compiled ``.so`` for the kernel source *code*.

    The source string is hashed exactly the way ``CodeCache`` hashes it so
    that the shared library produced at compile time is found again here.
    """
    from ..codecache import pick_vec_isa

    vec_isa = pick_vec_isa()
    compile_cmd = cpp_compile_command("i", "o", vec_isa=vec_isa)
    # The leading "\n" is required to match the CodeCache behavior.
    #
    # For reductions, the string from code.getvalue() uses a backslash '\'
    # at the end of lines for readability:
    #   #pragma omp declare reduction(xxx :\
    #       omp_out.value = xxx,\
    # while the string loaded during execution has the backslash escaped:
    #   #pragma omp declare reduction(xxx : omp_out.value = xxx,
    # code.getrawvalue() escapes the backslash so the identical string
    # (and therefore the identical hash) is used for both compilation
    # and execution.
    source_code = "\n" + code.getrawvalue()
    _, _, kernel_path = get_code_path(source_code, "so", compile_cmd)
    return kernel_path
|
|
|
|
|
|
|
|
|
|
def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
    """Emit a ``static`` C++ declaration that dlopens the compiled kernel.

    The ``static`` local ensures each kernel's shared library is opened
    and its symbol resolved only once per process (caching the load).

    NOTE(review): the ``str = None`` defaults are implicitly Optional;
    consider ``Optional[str]`` if the file's typing is tightened later.
    """
    so_path = self.get_kernel_path(kernel)
    decl = f'static auto {name} = load_cpp_kernel<void (*)({arg_types})>("{so_path}");'
    self.writeline(decl)
|
Add a cpp wrapper for Inductor (#88167)
## Description
Implements https://github.com/pytorch/torchdynamo/issues/1556.
This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting:
```python
from torch._inductor import config
config.cpp_wrapper = True
```
### Example
The main part of the generated code:
```python
from torch.utils.cpp_extension import load_inline
wrapper = (
'''
#include <dlfcn.h>
#include <assert.h>
std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) {
at::Tensor arg0_1, arg1_1;
std::tie(arg0_1, arg1_1) = args;
auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float);
auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float);
auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW);
assert(kernel0_lib != nullptr);
void (*kernel0)(const float*,const float*,float*,float*);
*(void **) (&kernel0) = dlsym(kernel0_lib, "kernel");
kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr()));
arg0_1.reset();
arg1_1.reset();
return std::make_tuple(buf0, buf1); }''' )
module = load_inline(
name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu',
cpp_sources=[wrapper],
functions=['call_0'],
extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'],
extra_ldflags=['-shared -lgomp'],
extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m'])
def _wrap_func(f):
def g(args):
return f(args)
return g
call = _wrap_func(module.call_0)
```
### Next steps
The below items will be addressed in upcoming PRs.
- [x] Support Reduction: #88561
- [x] Support None: #88560
- [ ] Support ExternKernel
- [x] ATen GEMM-related OPs: #88667
- [ ] ATen Conv
- [ ] Conv/GEMM fusion OPs
- [x] Cache the kernel loading part: #89742
- [ ] De-allocate input buffers when possible by leveraging CPython APIs
- [ ] Support Constant
Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
|
|
|
|
|
|
|
|
def wrap_kernel_call(self, name, call_args):
    """Render a C++ call statement ``name(arg0, arg1, ...);``."""
    joined = ", ".join(call_args)
    return f"{name}({joined});"
|
|
|
|
|
|
|
|
|
|
def generate_return(self, output_refs):
    """Write the C++ return statement that closes the generated call function.

    The trailing ``}''' )`` simultaneously closes the C++ function body,
    the Python triple-quoted wrapper string, and its enclosing parenthesis.
    """
    if not output_refs:
        self.wrapper_call.writeline("return; }''' )")
    elif len(output_refs) == 1:
        self.wrapper_call.writeline("return " + output_refs[0] + "; }''' )")
    else:
        joined_refs = ", ".join(output_refs)
        self.wrapper_call.writeline(
            "return std::vector<at::Tensor>({" + joined_refs + "}); }''' )"
        )
|
|
|
|
|
|
|
|
|
|
def generate_end(self, result):
    """Append the ``load_inline()`` call and the boxed-call shim to *result*.

    The inline-extension name embeds a hash of the generated wrapper code,
    so a different wrapper body compiles into a differently named module.
    """
    ld_shared = codecache.get_shared()
    warn_flag = codecache.get_warning_all_flag()
    base_cpp_flags = codecache.cpp_flags()
    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths()
    opt_flags = codecache.optimization_flags()
    custom_macros = codecache.use_custom_generated_macros()

    # Flag order and spacing must stay stable: they feed the f-string
    # literals below verbatim.
    extra_cflags = f"{base_cpp_flags} {opt_flags} {warn_flag} {macros} {custom_macros}"
    extra_ldflags = f"{ld_shared} {lpaths} {libs}"
    extra_include_paths = f"{ipaths}"

    # get the hash of the wrapper code to name the extension
    wrapper_call_hash = codecache.code_hash(self.wrapper_call.getvalue())
    result.splice(
        f"""
        module = load_inline(
            name='inline_extension_{wrapper_call_hash}',
            cpp_sources=[wrapper],
            functions=['call_{self._call_func_id}'],
            extra_cflags=['{extra_cflags}'],
            extra_ldflags=['{extra_ldflags}'],
            extra_include_paths=['{extra_include_paths}'])
        """
    )
    # Wrap the func to support setting result._boxed_call = True
    result.splice(
        f"""
        def _wrap_func(f):
            def g(args):
                return f(args)
            return g
        call = _wrap_func(module.call_{self._call_func_id})
        """
    )
|
2022-12-14 15:43:31 +00:00
|
|
|
|
|
|
|
|
def generate_extern_kernel_out(
    self, output_view, codegen_reference, args, kernel, cpp_kernel
):
    """Emit a C++ call to *cpp_kernel* writing into an out= argument.

    When *output_view* is given, an ``_as_strided`` alias of the view is
    declared first and used as the out argument; otherwise
    *codegen_reference* is used directly.  *args* is mutated in place
    (the out argument is prepended).  *kernel* is unused here —
    presumably kept for signature parity with the Python wrapper
    codegen; confirm against callers.
    """
    if output_view:
        strided_expr = f"{output_view.codegen_reference()}"
        strided_name = f"{output_view.get_name()}_as_strided"
        self.writeline(f"auto {strided_name} = {strided_expr};")
        out_arg = strided_name
    else:
        out_arg = f"{codegen_reference}"
    args.insert(0, out_arg)
    self.writeline(f"{cpp_kernel}({', '.join(args)});")
|