pytorch/torch/_inductor/codegen/wrapper.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

785 lines
26 KiB
Python
Raw Normal View History

import collections
import contextlib
import dataclasses
import functools
import hashlib
from itertools import count
from typing import Any, Dict, List
from torch._dynamo.utils import dynamo_timed
from .. import codecache, config, ir
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
from ..codecache import cpp_compile_command, get_code_path
from ..utils import cache_on_self, has_triton, sympy_dot, sympy_product
from ..virtualized import V
from .common import CodeGen, DeferredLine, IndentedBuffer, Kernel
from .triton import texpr
pexpr = texpr
def buffer_reuse_key(node: ir.Buffer):
    """Key used to match a freed buffer with a later allocation for reuse.

    Two buffers may share storage when they are on the same device, have
    the same dtype, the same element count, and the same storage extent
    (the extent accounts for gaps introduced by non-contiguous strides).
    """
    sizes = node.get_size()
    strides = node.get_stride()
    # Offset of the last element; detects stride-induced gaps in storage.
    last_elem = sympy_dot([dim - 1 for dim in sizes], strides)
    return (
        node.get_device(),
        node.get_dtype(),
        V.graph.sizevars.simplify(sympy_product(sizes)),
        V.graph.sizevars.size_hint(last_elem),
    )
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def make_buffer_reuse(old, new, del_func, declare, ending, as_strided):
    """Emit a statement rebinding ``old``'s storage to ``new``'s name.

    ``declare``/``ending``/``as_strided``/``del_func`` parameterize the
    syntax so the same logic serves both the Python and C++ wrappers.
    """
    assert old.get_dtype() == new.get_dtype()
    old_name = old.get_name()
    new_name = new.get_name()
    # Graph outputs must stay alive, so they never get a delete suffix.
    del_line = (
        del_func(old_name) if old_name not in V.graph.get_output_names() else ""
    )
    if old.get_size() == new.get_size() and old.get_stride() == new.get_stride():
        # Identical layout: a plain aliasing assignment suffices.
        return f"{declare}{new_name} = {old_name}{del_line}{ending}"
    # Otherwise reinterpret the old storage with the new size/stride.
    shape_expr = V.graph.sizevars.codegen_shape_tuple(new.get_size())
    stride_expr = V.graph.sizevars.codegen_shape_tuple(new.get_stride())
    return (
        f"{declare}{new_name} = {as_strided}({old_name}, "
        f"{shape_expr}, "
        f"{stride_expr}){del_line}{ending}"
    )
def make_buffer_allocation(buffer):
    """Emit a Python ``empty_strided(...)`` call that allocates ``buffer``."""
    name = buffer.get_name()
    shape_expr = V.graph.sizevars.codegen_shape_tuple(tuple(buffer.get_size()))
    stride_expr = V.graph.sizevars.codegen_shape_tuple(tuple(buffer.get_stride()))
    device = buffer.get_device()
    dtype = buffer.get_dtype()
    return (
        f"{name} = empty_strided("
        f"{shape_expr}, "
        f"{stride_expr}, "
        f"device='{device.type}', dtype={dtype})"
    )
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def make_cpp_buffer_allocation(buffer):
    """Emit a C++ ``at::empty_strided(...)`` call that allocates ``buffer``."""
    from .cpp import DTYPE_TO_ATEN

    # TODO: map layout and device here
    shape_expr = V.graph.sizevars.codegen_shape_tuple(tuple(buffer.get_size()))
    stride_expr = V.graph.sizevars.codegen_shape_tuple(tuple(buffer.get_stride()))
    aten_dtype = DTYPE_TO_ATEN[buffer.get_dtype()]
    return (
        f"auto {buffer.get_name()} = at::empty_strided("
        f"{shape_expr}, "
        f"{stride_expr}, "
        f"{aten_dtype}); "
    )
class MemoryPlanningState:
    """Pool of freed-but-not-yet-released buffers, keyed by reuse key."""

    def __init__(self):
        super().__init__()
        # Maps a buffer_reuse_key to the stack of free lines whose buffers
        # are currently available for reuse.
        self.reuse_pool: Dict[
            Any, List["FreeIfNotReusedLine"]
        ] = collections.defaultdict(list)

    def __contains__(self, key):
        # True when at least one freed buffer matches this key.
        return bool(self.reuse_pool.get(key, None))

    def pop(self, key) -> "FreeIfNotReusedLine":
        candidate = self.reuse_pool[key].pop()
        assert not candidate.is_reused
        return candidate

    def push(self, key, item: "FreeIfNotReusedLine"):
        assert not item.is_reused
        self.reuse_pool[key].append(item)
@dataclasses.dataclass
class EnterCudaDeviceContextManagerLine:
    """Opens a ``with torch.cuda._DeviceGuard(...)`` block in the wrapper."""

    device_idx: int

    def codegen(self, code: IndentedBuffer):
        # _DeviceGuard has less overhead than torch.cuda.device, but it
        # only accepts integer device indices.
        code.writeline(f"with torch.cuda._DeviceGuard({self.device_idx}):")
class ExitCudaDeviceContextManagerLine:
    """Marker signalling the end of a CUDA device context manager block."""
class MemoryPlanningLine:
    """Base class for a deferred line in the wrapper's memory-planning pass.

    Lines are processed in two passes: ``plan`` runs first over all lines
    to discover buffer-reuse opportunities, then ``codegen`` emits the
    final wrapper code.
    """

    def plan(self, state: MemoryPlanningState) -> "MemoryPlanningLine":
        """First pass to find reuse"""
        return self

    def codegen(self, code: IndentedBuffer):
        """Second pass to output code"""
        pass
@dataclasses.dataclass
class AllocateLine(MemoryPlanningLine):
    """Allocates ``node``, preferring to recycle a recently freed buffer."""

    node: ir.Buffer

    def plan(self, state: MemoryPlanningState):
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()
        # Try to reuse a recently freed buffer with a matching layout key.
        key = buffer_reuse_key(self.node)
        if key not in state:
            return self
        free_line = state.pop(key)
        free_line.is_reused = True
        return ReuseLine(free_line.node, self.node)

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        code.writeline(make_buffer_allocation(self.node))
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
@dataclasses.dataclass
class CppAllocateLine(AllocateLine):
    """C++-wrapper variant of ``AllocateLine``."""

    def plan(self, state: MemoryPlanningState):
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()
        # Try to reuse a recently freed buffer with a matching layout key.
        key = buffer_reuse_key(self.node)
        if key not in state:
            return self
        free_line = state.pop(key)
        free_line.is_reused = True
        return CppReuseLine(free_line.node, self.node)

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        code.writeline(make_cpp_buffer_allocation(self.node))
@dataclasses.dataclass
class FreeIfNotReusedLine(MemoryPlanningLine):
    """Frees ``node`` unless a later allocation claims it for reuse."""

    node: ir.Buffer
    is_reused: bool = False

    def plan(self, state: MemoryPlanningState):
        assert not self.is_reused
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()
        # Offer this buffer up for reuse by later AllocateLines.
        state.push(buffer_reuse_key(self.node), self)
        return self

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        if self.is_reused:
            return
        code.writeline(f"del {self.node.get_name()}")
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
@dataclasses.dataclass
class CppFreeIfNotReusedLine(FreeIfNotReusedLine):
    """C++-wrapper variant of ``FreeIfNotReusedLine``."""

    node: ir.Buffer
    is_reused: bool = False

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        if self.is_reused:
            return
        code.writeline(f"{self.node.get_name()}.reset();")
@dataclasses.dataclass
class ReuseLine(MemoryPlanningLine):
    """Rebinds the storage of ``node`` to the newly planned ``reused_as``."""

    node: ir.Buffer
    reused_as: ir.Buffer

    def plan(self, state: MemoryPlanningState):
        assert self.node.get_name() not in V.graph.removed_buffers
        assert self.reused_as.get_name() not in V.graph.removed_buffers
        return self

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        assert self.reused_as.get_name() not in V.graph.removed_buffers
        reuse_stmt = make_buffer_reuse(
            self.node,
            self.reused_as,
            del_func=lambda name: f"; del {name}",
            declare="",
            ending="",
            as_strided="as_strided",
        )
        code.writeline(reuse_stmt + " # reuse")
@dataclasses.dataclass
class CppReuseLine(ReuseLine):
    """C++-wrapper variant of ``ReuseLine``."""

    node: ir.Buffer
    reused_as: ir.Buffer

    def codegen(self, code: IndentedBuffer):
        assert self.node.get_name() not in V.graph.removed_buffers
        assert self.reused_as.get_name() not in V.graph.removed_buffers
        reuse_stmt = make_buffer_reuse(
            self.node,
            self.reused_as,
            del_func=lambda name: f"; {name}.reset()",
            declare="auto ",
            ending=";",
            as_strided="at::as_strided",
        )
        code.writeline(reuse_stmt + " // reuse")
@dataclasses.dataclass
class FreeLine(MemoryPlanningLine):
    """Unconditionally deletes ``node`` (unless it was removed entirely)."""

    node: ir.Buffer

    def plan(self, state: MemoryPlanningState):
        if self.node.get_name() in V.graph.removed_buffers:
            return NullLine()
        return self

    def codegen(self, code: IndentedBuffer):
        name = self.node.get_name()
        assert name not in V.graph.removed_buffers
        code.writeline(f"del {name}")
class NullLine(MemoryPlanningLine):
    """Placeholder for a planning line that was elided entirely."""
class WrapperCodeGen(CodeGen):
"""
The outer wrapper that calls the kernels.
"""
def __init__(self):
    """Set up the output buffers and emit the generated module's header.

    The generated Python module is assembled from three IndentedBuffers:
    ``header`` (imports and kernel definitions), ``prefix`` (the ``call``
    entry point), and ``wrapper_call`` (the body of ``call``).
    """
    super().__init__()
    # Monotonic counter backing next_kernel_suffix().
    self._names_iter = count()
    self.header = IndentedBuffer()
    self.prefix = IndentedBuffer()
    self.wrapper_call = IndentedBuffer()
    self.kernels = {}
    self.lines = []
    self.header.splice(
        f"""
            from ctypes import c_void_p, c_long
            import torch
            import random
            from torch import empty_strided, as_strided, device
            from {codecache.__name__} import AsyncCompile
            from torch._inductor.select_algorithm import extern_kernels

            aten = torch.ops.aten
            assert_size_stride = torch._C._dynamo.guards.assert_size_stride
            async_compile = AsyncCompile()

        """
    )
    if has_triton():
        # Triton imports are only emitted when a GPU backend is available.
        self.header.splice(
            f"""
                import triton
                import triton.language as tl
                from {config.inductor_import}.triton_ops.autotune import grid
                from torch._C import _cuda_getCurrentRawStream as get_cuda_stream
            """
        )

        if config.triton.convolution != "aten":
            self.header.splice(
                f"""
                    from {config.inductor_import}.triton_ops.conv_perf_model import early_config_prune
                    from {config.inductor_import}.triton_ops.conv_perf_model import estimate_conv_time
                    from {config.inductor_import}.triton_ops.autotune import conv_heuristics
                """
            )

    self.write_prefix()

    for name, value in V.graph.constants.items():
        # include a hash so our code cache gives different constants different files
        hashed = hashlib.sha256(repr(value).encode("utf-8")).hexdigest()
        self.header.writeline(f"{name} = None # {hashed}")

    # Track buffer names so allocate/free lines are emitted at most once each.
    self.allocated = set()
    self.freed = set()

    # Memoize per instance: repeated lookups of the same stream index emit
    # a single `streamN = get_cuda_stream(N)` line.
    self.write_get_cuda_stream = functools.lru_cache(None)(
        self.write_get_cuda_stream
    )

    @functools.lru_cache(None)
    def add_import_once(line):
        # Closure (not a method) so the cache lives and dies with this instance.
        self.header.writeline(line)

    self.add_import_once = add_import_once
    # repr(meta) -> header variable name; see add_meta_once().
    self._metas = {}
def add_meta_once(self, meta):
meta = repr(meta)
if meta not in self._metas:
var = f"meta{len(self._metas)}"
self._metas[meta] = var
self.header.writeline(f"{var} = {meta}")
return self._metas[meta]
@cache_on_self
def get_output_refs(self):
    """Codegen references for every graph output (computed once per instance)."""
    return [out.codegen_reference() for out in V.graph.graph_outputs]
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def write_prefix(self):
    """Write the ``call(args)`` entry point: unpack the graph inputs,
    re-seed RNG buffers, and emit size-variable computations."""
    self.prefix.splice(
        """
        async_compile.wait(globals())
        del async_compile
        def call(args):
        """
    )
    with self.wrapper_call.indent():
        if config.triton.debug_sync_graph:
            self.wrapper_call.writeline("torch.cuda.synchronize()")
        inp_len = len(V.graph.graph_inputs.keys())
        if inp_len != 0:
            # The trailing comma makes single-input unpacking valid Python:
            # `x, = args` rather than `x = args`.
            lhs = f"{', '.join(V.graph.graph_inputs.keys())}{'' if inp_len != 1 else ','}"
            self.wrapper_call.writeline(f"{lhs} = args")
            # Drop the caller's references so input buffers can be freed early.
            self.wrapper_call.writeline("args.clear()")
        for name in V.graph.randomness_seeds:
            self.wrapper_call.writeline(
                f"torch.randint(2**31, size=(), dtype=torch.int64, out={name})"
            )
        V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
def write_get_cuda_stream(self, index):
name = f"stream{index}"
self.writeline(f"{name} = get_cuda_stream({index})")
return name
def next_kernel_suffix(self):
return f"{next(self._names_iter)}"
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def write_allocate_line(self, buffer):
    """Queue a deferred allocation line for `buffer` in the memory plan."""
    line = AllocateLine(buffer)
    self.writeline(line)
def get_deferred_line(self, name, layout):
    """Build a DeferredLine that rebinds `name` to its reinterpreted view."""
    alias_expr = layout.view.codegen_reference()
    return DeferredLine(name, f"{name} = {alias_expr} # alias")
def codegen_allocation(self, buffer):
    """Emit the allocation (or aliasing) line for `buffer`, at most once.

    No line is emitted for buffers that were removed, already allocated,
    produced by extern kernels / multi-output nodes, or written in place.
    Aliased buffers first allocate their underlying storage (recursively),
    then emit a deferred alias line.
    """
    name = buffer.get_name()
    if name in V.graph.removed_buffers or name in self.allocated:
        return
    self.allocated.add(name)

    # These buffers are produced by the kernel call itself, not the wrapper.
    if isinstance(
        buffer,
        (ir.ExternKernelAlloc, ir.MultiOutput),
    ):
        return

    layout = buffer.get_layout()
    if isinstance(layout, ir.MutationLayout):
        # In-place mutation reuses the storage of an existing buffer.
        return
    if isinstance(layout, ir.AliasedLayout):
        assert isinstance(layout.view, ir.ReinterpretView)
        if not layout.maybe_guard_aligned():
            V.graph.unaligned_buffers.add(name)
        # Allocate the underlying storage first, then alias into it.
        self.codegen_allocation(layout.view.data)

        allocation = self.get_deferred_line(name, layout)
        self.writeline(allocation)
        return

    self.write_allocate_line(buffer)
def write_del_line(self, name):
self.writeline(f"del {name}")
def write_free_if_not_reused_line(self, buffer):
    """Queue a deferred free-unless-reused line for `buffer`."""
    line = FreeIfNotReusedLine(buffer)
    self.writeline(line)
def codegen_free(self, buffer):
name = buffer.get_name()
# can be freed but not reused
if isinstance(buffer, ir.InputBuffer):
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
self.write_del_line(name)
return
if not self.can_reuse(buffer):
return
self.freed.add(name)
layout = buffer.get_layout()
if isinstance(layout, (ir.AliasedLayout, ir.MultiOutputLayout)):
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
self.write_del_line(name)
return
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
self.write_free_if_not_reused_line(buffer)
def can_reuse(self, buffer):
    """Return True if *buffer*'s storage may be reused for a later allocation.

    Graph-level tensors (removed buffers, graph inputs, constants) are never
    reuse candidates, and a buffer already freed cannot be freed/reused again.
    """
    buf_name = buffer.get_name()
    blocked = (
        buf_name in V.graph.removed_buffers
        or buf_name in V.graph.graph_inputs
        or buf_name in V.graph.constants
        or buf_name in self.freed
    )
    return not blocked
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def write_reuse_line(self, input_buffer, output_buffer):
    """Queue a ReuseLine so the generated code aliases input_buffer's storage for output_buffer."""
    reuse = ReuseLine(input_buffer, output_buffer)
    self.writeline(reuse)
def codegen_inplace_reuse(self, input_buffer, output_buffer):
    """Reuse *input_buffer*'s storage in place for *output_buffer*.

    The two buffers must share the same ``buffer_reuse_key`` (asserted
    below). Marks the input as freed and the output as allocated in the
    planner's bookkeeping, then records a reuse line in the output code.
    """
    assert buffer_reuse_key(input_buffer) == buffer_reuse_key(output_buffer)
    # Ensure the input buffer has been materialized before aliasing it.
    self.codegen_allocation(input_buffer)
    self.freed.add(input_buffer.get_name())
    self.allocated.add(output_buffer.get_name())
    self.write_reuse_line(input_buffer, output_buffer)
def codegen_cuda_device_guard_enter(self, device_idx):
    """Record a line that enters a CUDA device context manager for *device_idx*."""
    enter_line = EnterCudaDeviceContextManagerLine(device_idx)
    self.lines.append(enter_line)
def codegen_cuda_device_guard_exit(self):
    """Record a line that closes the most recently entered CUDA device context."""
    exit_line = ExitCudaDeviceContextManagerLine()
    self.lines.append(exit_line)
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def generate_return(self, output_refs):
    """Emit the wrapper's final ``return`` statement for the given output expressions."""
    if not output_refs:
        self.wrapper_call.writeline("return ()")
        return
    joined = ", ".join(output_refs)
    # Trailing comma kept so a single output still returns a tuple.
    self.wrapper_call.writeline(f"return ({joined}, )")
def generate_end(self, result):
    """No epilogue is needed for the Python wrapper; subclasses may override."""
    return
def generate_extern_kernel_out(
    self, output_view, codegen_reference, args, kernel, cpp_kernel
):
    """Emit a call to an extern kernel that writes into an ``out=`` argument.

    Appends the ``out=`` keyword (preferring the view's reference when an
    output view is given) to *args* and queues the rendered call line.
    """
    out_ref = (
        output_view.codegen_reference() if output_view else codegen_reference
    )
    args.append(f"out={out_ref}")
    self.writeline(f"{kernel}({', '.join(args)})")
@dynamo_timed
def generate(self):
    """Render the complete wrapper source and return it as a string.

    Splices the header and prefix buffers, drops trailing memory-planning
    lines whose buffers are not graph outputs, runs the two-pass memory
    planner over the recorded lines, emits the wrapper-call body (handling
    CUDA device context-manager lines with proper indentation), and finally
    appends the optional benchmark harness.
    """
    result = IndentedBuffer()
    result.splice(self.header)
    result.splice(self.prefix)
    out_names = V.graph.get_output_names()
    with contextlib.ExitStack() as stack:
        stack.enter_context(self.wrapper_call.indent())
        if config.profiler_mark_wrapper_call:
            # Wrap the whole generated call in a profiler range so it is
            # visible in torch.profiler traces.
            self.wrapper_call.writeline(
                "from torch.profiler import record_function"
            )
            self.wrapper_call.writeline(
                "with record_function('inductor_wrapper_call'):"
            )
            stack.enter_context(self.wrapper_call.indent())
        while (
            self.lines
            and isinstance(self.lines[-1], MemoryPlanningLine)
            and self.lines[-1].node.name not in out_names
        ):
            # these lines will be pointless
            self.lines.pop()
        # codegen allocations in two passes
        planning_state = MemoryPlanningState()
        for i in range(len(self.lines)):
            if isinstance(self.lines[i], MemoryPlanningLine):
                self.lines[i] = self.lines[i].plan(planning_state)
        device_cm_stack = contextlib.ExitStack()
        for line in self.lines:
            if isinstance(line, MemoryPlanningLine):
                line.codegen(self.wrapper_call)
            elif isinstance(line, EnterCudaDeviceContextManagerLine):
                line.codegen(self.wrapper_call)
                device_cm_stack.enter_context(self.wrapper_call.indent())
                self.wrapper_call.writeline(
                    f"torch.cuda.set_device({line.device_idx}) # no-op to ensure context"
                )
            elif isinstance(line, ExitCudaDeviceContextManagerLine):
                device_cm_stack.close()
            else:
                self.wrapper_call.writeline(line)
        output_refs = self.get_output_refs()
        if config.triton.debug_sync_graph:
            self.wrapper_call.writeline("torch.cuda.synchronize()")
        self.generate_return(output_refs)
    with result.indent():
        result.splice(self.wrapper_call)
    self.generate_end(result)
    self.add_benchmark_harness(result)
    return result.getvalue()
def add_benchmark_harness(self, output):
    """
    Append a benchmark harness to generated code for debugging.

    Only active when ``config.benchmark_harness`` is set: emits an
    ``if __name__ == "__main__":`` block that fabricates random tensors
    matching each constant's and graph input's layout, then times ``call``.
    """
    if not config.benchmark_harness:
        return

    def add_fake_input(name, shape, stride, device, dtype):
        # Materialize a random tensor with the same layout as the real input.
        output.writeline(
            f"{name} = rand_strided("
            f"{V.graph.sizevars.codegen_benchmark_shape_tuple(shape)}, "
            f"{V.graph.sizevars.codegen_benchmark_shape_tuple(stride)}, "
            f"device='{device}', dtype={dtype})"
        )

    output.writelines(["", "", 'if __name__ == "__main__":'])
    with output.indent():
        output.splice(
            """
            from torch._dynamo.testing import rand_strided
            from torch._inductor.utils import print_performance
            """,
            strip=True,
        )
        for name, value in V.graph.constants.items():
            add_fake_input(
                name, value.size(), value.stride(), value.device, value.dtype
            )
        for name, value in V.graph.graph_inputs.items():
            # Symbolic sizes/strides are resolved to concrete hints.
            shape = [V.graph.sizevars.size_hint(x) for x in value.get_size()]
            stride = [V.graph.sizevars.size_hint(x) for x in value.get_stride()]
            add_fake_input(
                name, shape, stride, value.get_device(), value.get_dtype()
            )
        output.writeline(
            f"print_performance(lambda: call([{', '.join(V.graph.graph_inputs.keys())}]))"
        )
def define_kernel(self, name: str, kernel: str):
    """Bind the generated kernel source *kernel* to *name* in the header section."""
    definition = f"\n\n{name} = {kernel}"
    self.header.splice(definition)
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
    """No-op for this wrapper: kernels are defined in the header, not loaded."""
    return
def wrap_kernel_call(self, name, call_args):
    """Render a kernel invocation expression, e.g. ``kernel0(arg0, arg1)``."""
    joined_args = ", ".join(call_args)
    return f"{name}({joined_args})"
def generate_kernel_call(self, name, call_args):
    """Queue a line that invokes kernel *name* with *call_args*."""
    call_expr = self.wrap_kernel_call(name, call_args)
    self.writeline(call_expr)
def call_kernel(self, name: str, kernel: Kernel):
    """Let *kernel* render its call into a scratch buffer, then copy each
    non-empty line (stripped of surrounding whitespace) into this wrapper."""
    scratch = IndentedBuffer()
    kernel.call_kernel(self, scratch, name)
    for raw_line in scratch.getvalue().split("\n"):
        stripped = raw_line.strip()
        if stripped:
            self.writeline(stripped)
def writeline(self, line):
    # Append one line (a str or a deferred line object) to the buffered
    # wrapper output.
    self.lines.append(line)
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
class CppWrapperCodeGen(WrapperCodeGen):
    """
    The outer wrapper that calls the kernels.

    Unlike the base (Python) wrapper, this emits a C++ ``call_<id>`` entry
    point that is compiled with ``torch.utils.cpp_extension.load_inline``.
    """

    # Monotonically increasing id shared by all instances so every generated
    # C++ entry point gets a unique name (call_0, call_1, ...).
    call_func_id = count()

    def __init__(self):
        # Reserve this instance's entry-point id before base-class setup.
        self._call_func_id = next(CppWrapperCodeGen.call_func_id)
        super().__init__()
@cache_on_self
def get_output_refs(self):
    """Return codegen references for all graph outputs, preferring an
    output's C++-specific ``cpp_wrapper_codegen_reference`` when it exposes
    a callable one, otherwise falling back to ``codegen_reference``."""
    refs = []
    for output in V.graph.graph_outputs:
        cpp_ref = getattr(output, "cpp_wrapper_codegen_reference", None)
        if callable(cpp_ref):
            refs.append(cpp_ref())
        else:
            refs.append(output.codegen_reference())
    return refs
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def write_prefix(self):
    # Finish the Python-side kernel compilation, then open the C++ source
    # string that generate_end() hands to load_inline.  The load_cpp_kernel
    # template dlopen's a compiled kernel .so and resolves its exported
    # "kernel" symbol into a typed function pointer.
    self.prefix.splice(
        """
        async_compile.wait(globals())
        del async_compile
        from torch.utils.cpp_extension import load_inline
        wrapper = (
        '''
        #include <dlfcn.h>
        #include <assert.h>

        template <typename KernelFunc>
        KernelFunc load_cpp_kernel(const char* so_filename) {
            KernelFunc kernel_cpp;
            auto kernel_cpp_lib = dlopen(so_filename, RTLD_NOW);
            assert(kernel_cpp_lib != nullptr);
            *(void **) (&kernel_cpp) = dlsym(kernel_cpp_lib, "kernel");
            return kernel_cpp;
        }
        """
    )
    with self.wrapper_call.indent():
        inputs_len = len(V.graph.graph_inputs.keys())
        output_refs = self.get_output_refs()
        # Pick the C++ return type: a single tensor, a vector of tensors,
        # or void when the graph has no outputs.
        if output_refs:
            if len(output_refs) == 1:
                output_types = "at::Tensor"
            else:
                output_types = "std::vector<at::Tensor>"
        else:
            output_types = "void"
        inputs_types = "std::vector<at::Tensor>"
        # Open the C++ entry point; the matching closing brace is emitted
        # by generate_return().
        self.wrapper_call.writeline(
            f"{output_types} call_{self._call_func_id}({inputs_types} args) {{"
        )
        # Unpack the input vector into individually named at::Tensor locals
        # matching the graph input names.
        if inputs_len != 0:
            inputs_keys_str = ", ".join(V.graph.graph_inputs.keys())
            self.wrapper_call.writeline(f"at::Tensor {inputs_keys_str};")
            for idx, input_key in enumerate(V.graph.graph_inputs.keys()):
                self.wrapper_call.writeline(f"{input_key} = args[{idx}];")
        # Materialize each randomness seed as a Long tensor drawn from
        # [0, 2^31).
        for name in V.graph.randomness_seeds:
            self.wrapper_call.writeline(f"at::Tensor {name};")
            self.wrapper_call.writeline(
                f"{name} = at::randint(std::pow(2, 31), {{}}, at::ScalarType::Long);"
            )
        # Emit size-variable bookkeeping derived from the graph inputs.
        V.graph.sizevars.codegen(self.wrapper_call, V.graph.graph_inputs)
def write_allocate_line(self, buffer):
    # Defer buffer-allocation codegen to the C++-specific line object.
    self.writeline(CppAllocateLine(buffer))
def write_del_line(self, name):
    """Release buffer *name* by emitting C++ ``<name>.reset();`` instead of
    a Python ``del`` statement."""
    reset_stmt = f"{name}.reset();"
    self.writeline(reset_stmt)
def write_free_if_not_reused_line(self, buffer):
    # Defer "free unless reused later" codegen to the C++-specific line
    # object.
    self.writeline(CppFreeIfNotReusedLine(buffer))
    return
def write_reuse_line(self, input_buffer, output_buffer):
    # Defer buffer-reuse codegen to the C++-specific line object.
    self.writeline(CppReuseLine(input_buffer, output_buffer))
def get_deferred_line(self, name, layout):
    # C++ aliasing: bind `name` with `auto` to the view's codegen reference
    # (the Python wrapper's counterpart uses a plain assignment).
    return DeferredLine(
        name, f"auto {name} = {layout.view.codegen_reference()}; // alias"
    )
def get_kernel_path(self, code):
    """Return the path of the compiled kernel .so for *code*.

    The source string, extension, and compile-command "extra" must be
    assembled exactly as CodeCache does so that the content hash — and
    therefore the path — matches the .so produced at compile time.
    """
    from ..codecache import pick_vec_isa

    picked_vec_isa = pick_vec_isa()
    ext = "so"
    extra = cpp_compile_command("i", "o", vec_isa=picked_vec_isa)
    # \n is required to match with the CodeCache behavior
    # For reductions, the code string gotten from code.getvalue() will use backslash '\'
    # at the end of lines for readability purpose:
    #     #pragma omp declare reduction(xxx :\
    #         omp_out.value = xxx,\
    # While the code string loaded during the execution will escape the backslash '\':
    #     #pragma omp declare reduction(xxx : omp_out.value = xxx,
    # Use code.getrawvalue() here to escape the backslash to
    # make sure the same code string is used during compilation and execution,
    # so that the hash value is the same.
    source_code = "\n" + code.getrawvalue()
    _, _, kernel_path = get_code_path(source_code, ext, extra)
    return kernel_path
def load_kernel(self, name: str = None, kernel: str = None, arg_types: List = None):
    # `kernel` is the C++ kernel source; look up its compiled .so path and
    # emit a `static` loader so the dlopen/dlsym cost is paid only once per
    # call site (see the load_cpp_kernel template emitted in write_prefix).
    kernel_path = self.get_kernel_path(kernel)
    self.writeline(
        f'static auto {name} = load_cpp_kernel<void (*)({arg_types})>("{kernel_path}");'
    )
Add a cpp wrapper for Inductor (#88167) ## Description Implements https://github.com/pytorch/torchdynamo/issues/1556. This PR adds a cpp wrapper to invoke the generated kernels. The cpp wrapper is turned off by default and can be turned on by setting: ```python from torch._inductor import config config.cpp_wrapper = True ``` ### Example The main part of the generated code: ```python from torch.utils.cpp_extension import load_inline wrapper = ( ''' #include <dlfcn.h> #include <assert.h> std::tuple<at::Tensor, at::Tensor> call_0(std::tuple<at::Tensor, at::Tensor> args) { at::Tensor arg0_1, arg1_1; std::tie(arg0_1, arg1_1) = args; auto buf0 = at::empty_strided({8, 8}, {8, 1}, at::ScalarType::Float); auto buf1 = at::empty_strided({8, 8}, {1, 8}, at::ScalarType::Float); auto kernel0_lib = dlopen("/tmp/torchinductor_user/kn/ckn7ubcn2qbkme2vx5r6antnh5sv6d3o3t6qwdfgfoupnxty6pnm.so", RTLD_NOW); assert(kernel0_lib != nullptr); void (*kernel0)(const float*,const float*,float*,float*); *(void **) (&kernel0) = dlsym(kernel0_lib, "kernel"); kernel0((float*)(arg0_1.data_ptr()), (float*)(arg1_1.data_ptr()), (float*)(buf0.data_ptr()), (float*)(buf1.data_ptr())); arg0_1.reset(); arg1_1.reset(); return std::make_tuple(buf0, buf1); }''' ) module = load_inline( name='inline_extension_c64wpbccpbre3th2k6oxwrjy5bhvxnmkdxkhcfxlsw7xpsg4eabu', cpp_sources=[wrapper], functions=['call_0'], extra_cflags=['-fPIC -Wall -std=c++14 -Wno-unused-variable -march=native -O3 -ffast-math -fno-finite-math-only -fopenmp'], extra_ldflags=['-shared -lgomp'], extra_include_paths=['-I/home/user/pytorch/torch/include -I/home/user/pytorch/torch/include/torch/csrc/api/include -I/home/user/pytorch/torch/include/TH -I/home/user/pytorch/torch/include/THC -I/home/user/miniconda3/envs/pytorch/include/python3.7m']) def _wrap_func(f): def g(args): return f(args) return g call = _wrap_func(module.call_0) ``` ### Next steps The below items will be addressed in upcoming PRs. 
- [x] Support Reduction: #88561 - [x] Support None: #88560 - [ ] Support ExternKernel - [x] ATen GEMM-related OPs: #88667 - [ ] ATen Conv - [ ] Conv/GEMM fusion OPs - [x] Cache the kernel loading part: #89742 - [ ] De-allocate input buffers when possible by leveraging CPython APIs - [ ] Support Constant Pull Request resolved: https://github.com/pytorch/pytorch/pull/88167 Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/desertfire
2022-11-30 10:35:05 +00:00
def wrap_kernel_call(self, name, call_args):
    """Render a C++ kernel call statement — note the trailing semicolon,
    unlike the Python wrapper's expression form."""
    joined_args = ", ".join(call_args)
    return f"{name}({joined_args});"
def generate_return(self, output_refs):
    """Emit the C++ return statement for the generated ``call_<id>`` and
    close both the function body (``}``) and the Python-side ``wrapper``
    string literal (``''' )``) opened in write_prefix."""
    if not output_refs:
        self.wrapper_call.writeline("return; }''' )")
    elif len(output_refs) == 1:
        self.wrapper_call.writeline("return " + output_refs[0] + "; }''' )")
    else:
        joined_refs = ", ".join(output_refs)
        self.wrapper_call.writeline(
            "return std::vector<at::Tensor>({" + joined_refs + "}); }''' )"
        )
def generate_end(self, result):
    # Gather the same compiler configuration codecache uses so the inline
    # extension is built with flags consistent with the kernel .so files it
    # will dlopen.
    shared = codecache.get_shared()
    warning_all_flag = codecache.get_warning_all_flag()
    cpp_flags = codecache.cpp_flags()
    ipaths, lpaths, libs, macros = codecache.get_include_and_linking_paths()
    optimization_flags = codecache.optimization_flags()
    use_custom_generated_macros = codecache.use_custom_generated_macros()

    extra_cflags = f"{cpp_flags} {optimization_flags} {warning_all_flag} {macros} {use_custom_generated_macros}"
    extra_ldflags = f"{shared} {lpaths} {libs}"
    extra_include_paths = f"{ipaths}"

    # get the hash of the wrapper code to name the extension
    wrapper_call_hash = codecache.code_hash(self.wrapper_call.getvalue())
    # Emit the load_inline call that compiles the `wrapper` C++ string
    # (opened in write_prefix, closed in generate_return) into a module
    # exposing call_<id>.
    result.splice(
        f"""
        module = load_inline(
            name='inline_extension_{wrapper_call_hash}',
            cpp_sources=[wrapper],
            functions=['call_{self._call_func_id}'],
            extra_cflags=['{extra_cflags}'],
            extra_ldflags=['{extra_ldflags}'],
            extra_include_paths=['{extra_include_paths}'])
        """
    )
    # Wrap the func to support setting result._boxed_call = True
    result.splice(
        f"""
        def _wrap_func(f):
            def g(args):
                return f(args)
            return g
        call = _wrap_func(module.call_{self._call_func_id})
        """
    )
def generate_extern_kernel_out(
    self, output_view, codegen_reference, args, kernel, cpp_kernel
):
    # Emit a call to `cpp_kernel` (presumably an ATen out-variant — TODO
    # confirm against callers), inserting the output buffer — or an
    # as-strided view of it — as the first argument.  NOTE: mutates `args`
    # in place.
    if output_view:
        output_as_strided = f"{output_view.codegen_reference()}"
        output_name = f"{output_view.get_name()}_as_strided"
        self.writeline(f"auto {output_name} = {output_as_strided};")
        args.insert(0, output_name)
    else:
        args.insert(0, f"{codegen_reference}")
    self.writeline(f"{cpp_kernel}({', '.join(args)});")