mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-14 20:57:59 +00:00
Reland: [TensorExpr] Add TensorExprKernel::runFast method. (#57552)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57552 This method uses `CodeGen::call_raw` instead of `CodeGen::call`. Relanding #57328 (the entire stack) which was reverted because I forgot to guard a new test with `ifdef LLVM`. Test Plan: Imported from OSS Reviewed By: bertmaher Differential Revision: D28195047 Pulled By: ZolotukhinM fbshipit-source-id: bcfd3cb5b4f33a149b7549515ffd705e2c4f208f
This commit is contained in:
parent
0bf69278f7
commit
e686c66fe7
3 changed files with 52 additions and 0 deletions
|
|
@ -1257,5 +1257,36 @@ TEST_F(Kernel, ConstantTensorsNonContiguous) {
|
|||
ASSERT_TRUE(at::allclose(o, ref));
|
||||
}
|
||||
|
||||
TEST_F(Kernel, RunFast) {
#ifdef TORCH_ENABLE_LLVM
  // TODO: Implement call_raw in IREval and remove the ifdef
  KernelScope kernel_scope;

  // Two chained aten::mul ops; the second input is deliberately
  // non-contiguous (strides=[1, 5]) to exercise stride handling.
  const auto graph_string = R"IR(
      graph(%0 : Float(5, 3, strides=[3, 1], device=cpu),
            %1 : Float(5, 3, strides=[1, 5], device=cpu)):
        %2 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %1)
        %3 : Float(5, 3, strides=[3, 1]) = aten::mul(%0, %2)
        return (%3))IR";
  auto graph = std::make_shared<Graph>();
  parseIR(graph_string, &*graph);

  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  auto a = at::rand({5, 3}, TensorOptions(kCPU).dtype(at::kFloat));
  auto b =
      // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
      at::rand({3, 5}, TensorOptions(kCPU).dtype(at::kFloat)).transpose(0, 1);
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  auto o = at::zeros({5, 3}, TensorOptions(kCPU).dtype(at::kFloat));
  auto ref = a * (a * b);
  TensorExprKernel k(graph);

  // runFast takes raw data pointers rather than a Stack of IValues.
  k.runFast({a.data_ptr(), b.data_ptr()}, {o.data_ptr()});
  // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers)
  for (size_t i = 0; i < 5 * 3; i++) {
    CHECK_EQ(((float*)o.data_ptr())[i], ((float*)ref.data_ptr())[i]);
  }
#endif
}
|
||||
} // namespace jit
|
||||
} // namespace torch
|
||||
|
|
|
|||
|
|
@ -3185,3 +3185,21 @@ void TensorExprKernel::runKernel(Stack& stack) {
|
|||
push_one(stack, std::move(o));
|
||||
}
|
||||
}
|
||||
|
||||
void TensorExprKernel::runFast(
|
||||
const std::vector<void*>& inputs,
|
||||
const std::vector<void*>& outputs) {
|
||||
KernelScope kernelScope(&kernelArena_);
|
||||
|
||||
std::vector<void*> args(inputs);
|
||||
args.reserve(inputs.size() + outputs.size() + constants_.size());
|
||||
args.insert(args.end(), outputs.begin(), outputs.end());
|
||||
|
||||
// TODO: we can consider preallocating and pre-filling the args vector.
|
||||
for (auto c : constants_) {
|
||||
args.push_back(c.ptr);
|
||||
}
|
||||
|
||||
// Call the kernel.
|
||||
codegen_->call_raw(args);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -70,6 +70,9 @@ class TORCH_API TensorExprKernel {
|
|||
explicit TensorExprKernel(const std::shared_ptr<Graph>& subgraph);
|
||||
|
||||
void run(Stack& stack);
|
||||
void runFast(
|
||||
const std::vector<void*>& inputs,
|
||||
const std::vector<void*>& outputs);
|
||||
|
||||
void fallback(Stack& stack) {
|
||||
InterpreterState(code_).run(stack);
|
||||
|
|
|
|||
Loading…
Reference in a new issue