Mirror of https://github.com/saymrwulf/pytorch.git, synced 2026-05-14 20:57:59 +00:00
Summary: This is a copy of https://github.com/pytorch/pytorch/pull/97152 to make the landing easier. This PR implements a two-pass wrapper codegen for the Triton backend to achieve ahead-of-time compilation. In the first pass, the regular python wrapper code will be generated, and then the generated code will be executed to perform Triton compilation and autotuning. After that, the second pass wrapper codegen will generate C++ wrapper with proper CUDA API to load and launch Triton-generated CUDA kernels. Like the AOT mode for the cpp backend, the next step would be to provide a more complete API for AOT. Pull Request resolved: https://github.com/pytorch/pytorch/pull/98214 Approved by: https://github.com/eellison
21 lines · 469 B · Python
import shutil
|
|
|
|
import torch
|
|
import torch._dynamo
|
|
import torch._inductor
|
|
|
|
|
|
class Net(torch.nn.Module):
    """Toy module for the AOT-compile smoke test.

    Adds a fixed all-ones (32, 64) tensor to the input and applies ReLU.
    """

    def __init__(self):
        super().__init__()
        # Plain tensor attribute (not an nn.Parameter) holding the constant
        # all-ones weight added in forward().
        self.weight = torch.ones(32, 64)

    def forward(self, x):
        # Elementwise add of the constant weight, then zero out negatives.
        return (x + self.weight).relu()
# Script entry: export the toy model and ahead-of-time compile it with Inductor.

# Example input; shape (32, 64) matches Net.weight so the add broadcasts trivially.
inp = torch.randn((32, 64), device="cpu")

# Trace Net into an exported graph module with TorchDynamo.
# The second return value (guards) is unused here.
module, _ = torch._dynamo.export(Net(), inp)

# AOT-compile the exported module for the given example inputs; returns the
# path of the compiled shared library produced by Inductor.
lib_path = torch._inductor.aot_compile(module, [inp])

# Copy the compiled artifact to a stable, well-known filename in the
# current working directory.
shutil.copy(lib_path, "aot_inductor_output.so")