pytorch/test/cpp/jit/test_gpu_shift.cpp
jiej 2d110d514f Nvfuser code bump 2_1_2022 (#72127)
Summary:
Things changed in this PR that requires review:
1. aten/src/ATen/core/interned_strings.h
2. torch/csrc/jit/ir/alias_analysis.h : exposing createValue to allow efficient mutation
3. torch/csrc/jit/runtime/symbolic_shape_registry.cpp : added gelu/tanh/erf in registry
4. torch/jit/_script.py : throws an error when scripting a model that uses autocast as a decorator, since that is not supported

nvfuser code update:
1. codegen improvements and performance tuning
2. integration bug fixes for shape expression logic
3. kernel segmentation update to address perf regression from horizontal fusion
4. scalar cpu tensor promotion to support inter-device operation between cpu scalar tensor and cuda tensor

Things reverted from local changes:
aten::gelu with approximation (tracked in PR: https://github.com/pytorch/pytorch/pull/61439)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/72127

Reviewed By: HamidShojanazeri

Differential Revision: D34113233

Pulled By: jbschlosser

fbshipit-source-id: b82cde32b71e324eca0ea57cb8c9f9647278ca74
(cherry picked from commit e009bc5c4e943211c4953e6fdf7c9913fa66b3c9)
2022-02-15 00:43:16 +00:00

5371 lines
151 KiB
C++

#if defined(USE_CUDA)
#include <gtest/gtest.h>
#include <torch/csrc/jit/codegen/cuda/arith.h>
#include <torch/csrc/jit/codegen/cuda/codegen.h>
#include <torch/csrc/jit/codegen/cuda/disjoint_set.h>
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/executor_launch_params.h>
#include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/fusion_segmenter.h>
#include <torch/csrc/jit/codegen/cuda/interface.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_graphviz.h>
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/iter_visitor.h>
#include <torch/csrc/jit/codegen/cuda/kernel_cache.h>
#include <torch/csrc/jit/codegen/cuda/kernel_expr_evaluator.h>
#include <torch/csrc/jit/codegen/cuda/kernel_ir.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/mutator.h>
#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
#include <torch/csrc/jit/codegen/cuda/transform_rfactor.h>
// fuser and IR parser
#include "test_gpu_validator.h"
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAStream.h>
#include <algorithm>
#include <iostream>
// Tests go in torch::jit
namespace torch {
namespace jit {
using namespace torch::jit::fuser::cuda;
using namespace at::indexing;
namespace {
// Make a tensor that is known to be fully contiguous of dimensionality=ndims,
// but unknown sizes
TensorView* makeContigTensor(size_t ndims, DataType dtype = DataType::Float) {
  // All axes are marked contiguous; extents remain symbolic.
  const std::vector<bool> contig_flags(ndims, true);
  return TensorViewBuilder()
      .ndims(ndims)
      .dtype(dtype)
      .contiguity(contig_flags)
      .build();
}
// Make a tensor that is known to be non-contiguous of dimensionality=ndims,
// but unknown sizes
TensorView* makeSymbolicTensor(size_t ndims, DataType dtype = DataType::Float) {
  TensorViewBuilder builder;
  return builder.ndims(ndims).dtype(dtype).build();
}
// Make a non-contiguous tensor of compile-time known sizes
TensorView* makeConcreteTensor(
    std::vector<int64_t> shape,
    DataType dtype = DataType::Float) {
  TensorViewBuilder builder;
  // Setter order is irrelevant for the builder; sizes are concrete here.
  return builder.dtype(dtype).shape(shape).build();
}
// Evaluate `val` (which must be an Int) with the fusion-level expression
// evaluator and assert it evaluates to exactly `expected_value`.
void checkIntValue(
    ExpressionEvaluator& evaluator,
    Val* val,
    Int::ScalarType expected_value) {
  TORCH_CHECK(val->isAnInt());
  const auto evaluated = evaluator.evaluate(val);
  TORCH_CHECK(evaluated.has_value());
  TORCH_CHECK(*evaluated == expected_value);
}
// Kernel-IR flavor of checkIntValue: evaluate `val` with a
// kir::ExpressionEvaluator and assert the result equals `expected_value`.
void checkIntValue(
    kir::ExpressionEvaluator& evaluator,
    const Val* val,
    Int::ScalarType expected_value) {
  const auto evaluated = evaluator.evaluate(val);
  TORCH_CHECK(evaluated.has_value());
  TORCH_CHECK(*evaluated == expected_value);
}
// Used to signify invalid ranges, i.e., values at offset 0 to
// start_offset, and values at offset stop_offset to the end of the
// domain. The reference helpers below (shift/gather) fill out-of-range
// positions with this marker so validation can treat them specially.
static constexpr int invalid_marker = 1;
// ATen version of tensor shifting, used to compute reference outputs for
// the fused kernels.
//
// Each dimension i is shifted by offsets[i] via at::roll. The wrapped-around
// region is zero-filled (emulating zero padding), and positions outside the
// range implied by `padding` are overwritten with invalid_marker. When
// `padding` is omitted, it defaults to abs(offset) per dimension.
auto shift(
    at::Tensor tensor,
    const std::vector<int>& offsets,
    std::vector<int> padding = {}) {
  TORCH_INTERNAL_ASSERT(tensor.ndimension() == offsets.size());
  if (padding.empty()) {
    // Default padding: pad by the magnitude of each shift offset.
    padding = offsets;
    for (auto& p : padding) {
      p = std::abs(p);
    }
  }
  at::Tensor t = tensor;
  for (size_t i = 0; i < offsets.size(); ++i) {
    auto offset = offsets[i];
    t = t.roll(offsets[i], i);
    if (offset == 0) {
      continue;
    }
    // Zero padding
    std::vector<at::indexing::TensorIndex> indices(
        tensor.ndimension(), at::indexing::Slice(0, at::indexing::None));
    if (offset > 0) {
      indices[i] = at::indexing::Slice(0, offset);
    } else {
      indices[i] = at::indexing::Slice(offset, at::indexing::None);
    }
    t.index(indices) = 0;
    // Fill the outside range by the special marker value.
    const auto pad = padding[i];
    if (offset > 0) {
      indices[i] = at::indexing::Slice(0, offset - pad);
    } else {
      offset += pad;
      TORCH_INTERNAL_ASSERT(offset <= 0);
      if (offset == 0) {
        // Fully padded on this side: no invalid region to mark.
        continue;
      }
      indices[i] = at::indexing::Slice(offset, at::indexing::None);
    }
    t.index(indices) = invalid_marker;
  }
  return t;
}
// ATen version of tensor gather, used to compute reference outputs.
//
// For each dimension i, collects window_shape[i] neighboring elements
// (after applying pad_width[i] = {left, right} padding) into a new
// innermost axis, optionally subsampling the output with strides[i].
// Out-of-bounds positions are filled with invalid_marker unless a
// non-unit stride trims them away.
auto gather(
    at::Tensor tensor,
    const std::vector<int>& window_shape,
    const std::vector<std::vector<int>>& pad_width,
    std::vector<int> strides = {}) {
  TORCH_CHECK(
      tensor.ndimension() == window_shape.size(),
      "Invalid window shape: ",
      window_shape,
      ". Size of the window shape is different from the tensor dimension.");
  TORCH_CHECK(
      tensor.ndimension() == pad_width.size(),
      "Invalid pad width: ",
      pad_width,
      ". Size of the pad width is different from the tensor dimension.");
  // Default to unit stride in every dimension.
  if (strides.empty()) {
    strides = std::vector<int>(tensor.ndimension(), 1);
  } else {
    TORCH_CHECK(
        tensor.ndimension() == strides.size(),
        "Invalid strides: ",
        strides,
        ". Size of strides is different from the tensor dimension.");
  }
  at::Tensor t = tensor;
  for (size_t i = 0; i < window_shape.size(); ++i) {
    const auto w_size = window_shape[i];
    TORCH_CHECK(w_size != 0);
    const auto& pad = pad_width[i];
    TORCH_CHECK(pad.size() == 2);
    // Amount (<= 0) by which the valid output extent shrinks given the
    // window size and the padding on both sides.
    const auto out_extent_adj = -w_size + 1 + pad[0] + pad[1];
    TORCH_INTERNAL_ASSERT(out_extent_adj <= 0);
    const auto stride = strides[i];
    TORCH_CHECK(stride >= 1);
    at::Tensor concat_tensor;
    // Build the new window axis one position at a time: each window
    // offset w is a shifted copy of t, stacked along a new last axis.
    for (int w = 0; w < w_size; ++w) {
      std::vector<int> shift_offsets(t.ndimension(), 0);
      shift_offsets[i] = pad[0] - w;
      auto shifted = shift(t, shift_offsets);
      // Apply stride
      if (stride != 1) {
        std::vector<at::indexing::TensorIndex> indices(
            shifted.ndimension(), at::indexing::Slice(0, at::indexing::None));
        if (out_extent_adj == 0) {
          indices[i] = at::indexing::Slice(0, at::indexing::None, strides[i]);
        } else {
          indices[i] = at::indexing::Slice(0, out_extent_adj, strides[i]);
        }
        shifted = shifted.index(indices);
      }
      shifted = shifted.unsqueeze(-1);
      if (w == 0) {
        concat_tensor = shifted;
      } else {
        concat_tensor = at::cat({concat_tensor, shifted}, -1);
      }
    }
    t = concat_tensor;
  }
  // Fill invalid regions with the marker. Note that when non-unit
  // stride is used, it trims invalid regions, so no marking is
  // necessary.
  for (size_t i = 0; i < window_shape.size(); ++i) {
    if (strides[i] != 1) {
      continue;
    }
    const auto out_extent_adj =
        -window_shape[i] + 1 + pad_width[i][0] + pad_width[i][1];
    if (out_extent_adj < 0) {
      std::vector<at::indexing::TensorIndex> indices(
          t.ndimension(), at::indexing::Slice(0, at::indexing::None));
      indices[i] = at::indexing::Slice(out_extent_adj, at::indexing::None);
      t.index(indices) = invalid_marker;
    }
  }
  return t;
}
} // namespace
// Shift an input tensor by several offsets and check each fused output
// against the ATen reference implementation of shift.
TEST_F(NVFuserTest, FusionShift1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // One fusion output per shift offset, added in this order.
  const std::vector<std::vector<int>> shift_offsets = {
      {-1, 0}, {0, 1}, {2, 2}, {-2, -2}};
  for (const auto& offset : shift_offsets) {
    fusion.addOutput(shift(tv0, offset));
  }

  const int numel_x = 9;
  const int numel_y = 11;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Validate each output against the ATen reference shift.
  for (size_t i = 0; i < shift_offsets.size(); ++i) {
    auto ref = shift(t0, shift_offsets[i]);
    TORCH_CHECK(ref.equal(outputs[i]));
  }
}
// Shifts an intermediate tensor
//
// Also verifies that lowering extends the allocations of shifted
// producers: a shift by k along an axis grows that axis' allocation
// by |k| (the halo region).
TEST_F(NVFuserTest, FusionShift2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {-1, 0});
  fusion.addOutput(tv2);
  // make it a little more complex
  auto tv3 = add(tv0, IrBuilder::create<Double>(3));
  auto tv4 = add(tv3, IrBuilder::create<Double>(4));
  auto tv5 = shift(tv4, {-1, 0});
  auto tv6 = shift(tv4, {0, -1});
  auto tv7 = shift(tv4, {1, 0});
  auto tv8 = shift(tv4, {0, 0});
  auto tv9 = add(tv5, tv6);
  auto tv10 = add(tv9, tv7);
  auto tv11 = add(tv10, tv8);
  fusion.addOutput(tv11);
  for (auto tv : {tv1, tv2, tv3, tv4, tv5, tv6, tv7, tv8, tv9, tv10, tv11}) {
    tv->setMemoryType(MemoryType::Global);
  }
  // Expected allocation sizes after lowering:
  // t1 allocation: (t1.size[0] + 1) * (t1.size[1])
  // t3 allocation: (t3.size[0] + 2) * (t3.size[1] + 1)
  // t4 allocation: (t3.size[0] + 2) * (t3.size[1] + 1)
  GpuLower gpulw(&fusion);
  // Walk the lowered kernel and check that each extended allocation
  // extent has the form "<named extent> + <constant>".
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1 || tensor_name == 3 || tensor_name == 4) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          if (tensor_name == 1 && i == 1) {
            // t1 is only shifted along axis 0 ({-1, 0}), so its inner
            // axis is the plain named extent with no constant added.
            TORCH_CHECK(alloc->shape().at(i)->isA<NamedScalar>());
            continue;
          }
          auto def =
              dynamic_cast<BinaryOp*>(alloc->shape().at(i)->definition());
          TORCH_CHECK(
              def != nullptr && def->getBinaryOpType() == BinaryOpType::Add);
          TORCH_CHECK(def->as<BinaryOp>()->lhs()->isA<NamedScalar>());
          auto rhs = dynamic_cast<Int*>(def->as<BinaryOp>()->rhs());
          TORCH_CHECK(rhs != nullptr && rhs->isConst());
          int rhs_value = *rhs->value();
          if (tensor_name == 1) {
            TORCH_CHECK(i == 0);
            TORCH_CHECK(rhs_value == 1);
          } else {
            if (i == 0) {
              TORCH_CHECK(rhs_value == 2);
            } else {
              TORCH_CHECK(rhs_value == 1);
            }
          }
        }
      }
    }
  }
  int numel_x = 9;
  int numel_y = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation mirroring the fusion definition.
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {-1, 0});
  auto t3 = t0 + 3;
  auto t4 = t3 + 4;
  auto t5 = shift(t4, {-1, 0});
  auto t6 = shift(t4, {0, -1});
  auto t7 = shift(t4, {1, 0});
  auto t8 = shift(t4, {0, 0});
  auto t9 = t5 + t6;
  auto t10 = t9 + t7;
  auto t11 = t10 + t8;
  testValidate(&fusion, outputs, inputs, {t2, t11}, __LINE__, __FILE__);
}
// Shift consumed to the right of the computeAt position; the producer
// is placed in global memory so the halo access is valid.
TEST_F(NVFuserTest, FusionShiftRightOfCA_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto incremented = add(tv0, IrBuilder::create<Double>(1));
  auto shifted = shift(incremented, {0, 1});
  fusion.addOutput(shifted);

  tv0->computeAt(shifted, -2);
  incremented->setMemoryType(MemoryType::Global);

  const int numel_x = 100;
  const int numel_y = 101;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference: (t0 + 1) shifted by {0, 1}.
  auto ref = shift(t0 + 1, {0, 1});
  TORCH_CHECK(ref.allclose(outputs[0]));
}
// Negative test: fully inlining (computeAt at -1) a chain that contains
// a shift must be rejected during lowering.
TEST_F(NVFuserTest, FusionShiftLeftOfCA_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto stage1 = add(tv0, IrBuilder::create<Double>(1));
  auto stage2 = add(stage1, IrBuilder::create<Double>(1));
  auto shifted = shift(stage2, {-1, 0});
  auto result = add(shifted, IrBuilder::create<Double>(1));
  fusion.addOutput(result);

  tv0->computeAt(result, -1);

  // Lowering should trigger an assertion failure as a shifted axis is
  // found inside an allocation position.
  ASSERT_ANY_THROW(fusion.printKernel());
}
// Shift after splitting the innermost axis. The producer allocation must
// cover the split factor plus the union of both shift halos:
// 4 (split) + 1 (shift {0, 1}) + 2 (shift {0, -2}) = 7.
TEST_F(NVFuserTest, FusionShiftSplit1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {0, 1});
  auto tv3 = shift(tv1, {0, -2});
  fusion.addOutput(tv2);
  fusion.addOutput(tv3);
  int split_factor = 4;
  tv2->split(-1, split_factor);
  tv3->split(-1, split_factor);
  tv0->computeAt(tv2, -2);
  tv0->computeAt(tv3, -2);
  // t1 allocation: 7
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1) {
        TORCH_CHECK(alloc->shape().size() == 1);
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(
            size != nullptr && size->isConst() && size->value().value() == 7);
      }
    }
  }
  int numel_x = 9;
  int numel_y = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation.
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {0, 1});
  auto t3 = shift(t1, {0, -2});
  testValidate(&fusion, outputs, inputs, {t2, t3}, __LINE__, __FILE__);
}
// Two independent chains with split axes: one with real shifts ({0, -1}
// and {0, 1} => halo of 2), one with a zero shift (no halo). Verifies
// the corresponding allocation sizes.
TEST_F(NVFuserTest, FusionShiftSplit2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  auto tv3 = shift(tv2, {0, -1});
  auto tv4 = shift(tv2, {0, 1});
  auto tv5 = add(tv3, tv4);
  fusion.addOutput(tv5);
  auto tv6 = add(tv0, IrBuilder::create<Double>(1));
  auto tv7 = shift(tv6, {0, 0});
  auto tv8 = add(tv7, IrBuilder::create<Double>(1));
  fusion.addOutput(tv8);
  int split_factor = 4;
  tv5->split(-1, split_factor);
  tv8->split(-1, split_factor);
  tv0->computeAt(tv5, -2);
  tv0->computeAt(tv8, -2);
  // t1 and t2 allocation: 6 (split factor 4 + halo 1 on each side)
  // t4 allocation: 4 (zero shift adds no halo)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1 || tensor_name == 2) {
        TORCH_CHECK(alloc->shape().size() == 1);
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(
            size != nullptr && size->isConst() && size->value().value() == 6);
      } else if (tensor_name == 4) {
        TORCH_CHECK(alloc->shape().size() == 1);
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(size != nullptr && size->isConst());
        int size_value = *size->value();
        TORCH_CHECK(size_value == split_factor);
      }
    }
  }
  int numel_x = 9;
  int numel_y = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation; t7 is a zero shift, i.e., identity.
  auto t1 = t0 + 2;
  auto t3 = shift(t1, {0, -1});
  auto t4 = shift(t1, {0, 1});
  auto t5 = t3 + t4;
  auto t6 = t0 + 1;
  auto t7 = t6;
  auto t8 = t7 + 1;
  testValidate(&fusion, outputs, inputs, {t5, t8}, __LINE__, __FILE__);
}
// Shift with nested splits: the producer is split a second time after
// computeAt. The allocation must still be based on the outer split
// factor plus the halo (8 + 1 = 9).
TEST_F(NVFuserTest, FusionShiftDoubleSplit_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
  auto tv3 = shift(tv2, {0, 1});
  fusion.addOutput(tv3);
  int split_factor1 = 8;
  int split_factor2 = 4;
  tv3->split(-1, split_factor1);
  tv0->computeAt(tv3, -2);
  tv1->split(-1, split_factor2);
  // Resulting leaf domains:
  // t1: [i1, i2/8, 8/4, 4]
  // t2: [i1, i2/8, 8]
  // t3: [i1, i2/8, 8]
  // t1 and t2 allocation: (split_factor1 + 1) = 9
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1 || tensor_name == 2) {
        TORCH_CHECK(alloc->shape().size() == 1);
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(
            size != nullptr && size->isConst() && size->value().value() == 9);
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: +1 then +2 collapses to +3, then shift.
  auto t1 = t0 + 3;
  auto ref = shift(t1, {0, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// 1D 3-point stencil (average of self and both neighbors) with a cached,
// double-buffered input. The cache allocation must include the stencil
// halo: split_factor + 2.
TEST_F(NVFuserTest, FusionShift3ptStencil_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 3-pt stencil
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  std::vector<std::vector<int>> offsets = {{-1}, {1}};
  std::vector<TensorView*> tvs;
  for (const auto& offset : offsets) {
    tvs.push_back(shift(tv0, offset));
  }
  auto tv_out = tv0;
  for (auto tv : tvs) {
    tv_out = add(tv_out, tv);
  }
  tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
  fusion.addOutput(tv_out);
  int split_factor = 4;
  tv_out->split(0, split_factor);
  // This seems fine but not verified yet
  // tv_out->axis(-1)->parallelize(ParallelType::Unswitch);
  auto cache = tv0->cache_after();
  tv0->computeAt(tv_out, 1);
  // Inline completely except for the cache
  for (auto tv : tvs) {
    tv->computeAt(tv_out, -1);
  }
  // cache allocation: (split_factor + 2)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == cache->name()) {
        TORCH_CHECK(alloc->shape().size() == 1);
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(
            size != nullptr && size->isConst() &&
            size->value().value() == split_factor + 2);
      }
    }
  }
  cache->doubleBuffer();
  int numel_x = 99;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: mean of center and both neighbors.
  auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// 2D 5-point stencil with a cached, double-buffered input tiled by
// {4, 8}. Each cache axis allocation must include the halo:
// split_factor[i] + 2.
TEST_F(NVFuserTest, FusionShift5ptStencil_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 5-pt stencil
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
  std::vector<TensorView*> tvs;
  for (const auto& offset : offsets) {
    tvs.push_back(shift(tv0, offset));
  }
  auto tv_out = tv0;
  for (auto tv : tvs) {
    tv_out = add(tv_out, tv);
  }
  tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
  fusion.addOutput(tv_out);
  std::vector<int> split_factor({4, 8});
  tv_out->split(-1, split_factor[1]);
  tv_out->split(0, split_factor[0]);
  tv_out->reorder({{1, 2}, {2, 1}});
  auto cache = tv0->cache_after();
  tv0->computeAt(tv_out, 2);
  // Inline completely except for the cache
  for (auto tv : tvs) {
    tv->computeAt(tv_out, -1);
  }
  // cache allocation: (split_factor + 2) * (split_factor + 2)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == cache->name()) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == split_factor[i] + 2);
        }
      }
    }
  }
  cache->doubleBuffer();
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: mean of center and the four neighbors.
  auto ref = t0;
  for (const auto& offset : offsets) {
    ref = ref + shift(t0, offset);
  }
  ref = ref / int(offsets.size() + 1);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// 2D 9-point stencil (all 8 neighbors plus center) with a cached,
// double-buffered input tiled by {4, 8}. Halo checks as in the 5-pt test.
TEST_F(NVFuserTest, FusionShift9ptStencil_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 9-pt stencil: all offsets in {-1, 0, 1}^2 except {0, 0}.
  std::vector<std::vector<int>> offsets;
  for (int i = -1; i < 2; ++i) {
    for (int j = -1; j < 2; ++j) {
      if (i == 0 && j == 0) {
        continue;
      }
      offsets.push_back({i, j});
    }
  }
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  std::vector<TensorView*> tvs;
  for (const auto& offset : offsets) {
    tvs.push_back(shift(tv0, offset));
  }
  auto tv_out = tv0;
  for (auto tv : tvs) {
    tv_out = add(tv_out, tv);
  }
  tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
  fusion.addOutput(tv_out);
  std::vector<int> split_factor({4, 8});
  tv_out->split(-1, split_factor[1]);
  tv_out->split(0, split_factor[0]);
  tv_out->reorder({{1, 2}, {2, 1}});
  auto cache = tv0->cache_after();
  tv0->computeAt(tv_out, 2);
  // Inline completely except for the cache
  for (auto tv : tvs) {
    tv->computeAt(tv_out, -1);
  }
  // This seems fine but not yet verified
  // tv_out->axis(-1)->parallelize(ParallelType::Unswitch);
  // cache allocation: (split_factor + 2) * (split_factor + 2)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == cache->name()) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == split_factor[i] + 2);
        }
      }
    }
  }
  cache->doubleBuffer();
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: mean of center and the eight neighbors.
  auto ref = t0;
  for (const auto& offset : offsets) {
    ref = ref + shift(t0, offset);
  }
  ref = ref / int(offsets.size() + 1);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift with the producer staged in shared memory and parallelized over
// TIDx. The shared-memory allocation must include the shift halo:
// smem_block_factor + 1.
TEST_F(NVFuserTest, FusionShiftSmemBlocking_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {0, 1});
  fusion.addOutput(tv2);
  int smem_block_factor = 32;
  tv2->split(-1, smem_block_factor);
  tv0->computeAt(tv2, -2);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv1->setMemoryType(MemoryType::Shared);
  // tv1 allocation: (split_factor + 1)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == tv1->name()) {
        TORCH_CHECK(alloc->shape().size() == 1);
        for (int i = 0; i < 1; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == smem_block_factor + 1);
        }
      }
    }
  }
  int numel_x = 100;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation.
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {0, 1});
  auto ref = t2;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// 1D 3-point stencil with a shared-memory, double-buffered input cache
// parallelized over TIDx.
TEST_F(NVFuserTest, FusionShift3ptStencilParallel_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 3-pt stencil
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  std::vector<TensorView*> shifted;
  shifted.push_back(shift(tv0, {-1}));
  shifted.push_back(shift(tv0, {1}));

  // Sum center and neighbors, then average.
  auto out = tv0;
  for (auto s : shifted) {
    out = add(out, s);
  }
  out = div(out, IrBuilder::create<Double>(shifted.size() + 1));
  fusion.addOutput(out);

  const int smem_block_factor = 32;
  out->split(0, smem_block_factor);
  // out->axis(-1)->parallelize(ParallelType::Unswitch);

  auto tv0_cache = tv0->cache_after();
  tv0->computeAt(out, 1);
  for (auto s : shifted) {
    s->computeAt(out, -1);
  }

  tv0_cache->setMemoryType(MemoryType::Shared);
  out->axis(-1)->parallelize(ParallelType::TIDx);
  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
  tv0_cache->doubleBuffer();

  const int numel_x = 99;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = (t0 + shift(t0, {-1}) + shift(t0, {1})) / 3;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// 2D 5-point stencil with a shared-memory input cache, tiled 32x32 and
// parallelized over BIDy/BIDx/TIDy/TIDx.
TEST_F(NVFuserTest, FusionShift5ptStencilParallel_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 5-pt stencil
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
  std::vector<TensorView*> shifted;
  for (const auto& offset : offsets) {
    shifted.push_back(shift(tv0, offset));
  }

  // Sum center and neighbors, then average.
  auto out = tv0;
  for (auto s : shifted) {
    out = add(out, s);
  }
  out = div(out, IrBuilder::create<Double>(shifted.size() + 1));
  fusion.addOutput(out);

  const int smem_block_factor = 32;
  out->split(-1, smem_block_factor);
  out->split(0, smem_block_factor);
  out->reorder({{1, 2}, {2, 1}});

  auto tv0_cache = tv0->cache_after();
  tv0->computeAt(out, 2);
  for (auto s : shifted) {
    s->computeAt(out, -1);
  }

  out->axis(-1)->parallelize(ParallelType::TIDx);
  out->axis(-2)->parallelize(ParallelType::TIDy);
  out->axis(-3)->parallelize(ParallelType::BIDx);
  out->axis(-4)->parallelize(ParallelType::BIDy);

  tv0_cache->setMemoryType(MemoryType::Shared);
  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
  tv0_cache->axis(-2)->parallelize(ParallelType::TIDy);

  const int numel_x = 99;
  const int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // ATen reference: accumulate all shifted copies, then average.
  auto ref = t0;
  for (const auto& offset : offsets) {
    ref = ref + shift(t0, offset);
  }
  ref = ref / int(offsets.size() + 1);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift after tiling and merging the two inner tile axes. Each axis of
// the producer allocation must include the shift halo:
// split_factor + 1.
TEST_F(NVFuserTest, FusionShiftMerge1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {-1, 1});
  fusion.addOutput(tv2);
  int split_factor = 4;
  tv2->split(-1, split_factor);
  tv2->split(0, split_factor);
  tv2->reorder({{1, 2}, {2, 1}});
  tv2->merge(2, 3);
  tv0->computeAt(tv2, 2);
  // t1 allocation: (split_factor + 1) * (split_factor + 1)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == split_factor + 1);
        }
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation.
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {-1, 1});
  auto ref = t2;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Two opposite shifts ({1, -1} and {-1, 1}) consumed after tiling and
// merging. Each producer-allocation axis needs a halo of 2:
// split_factor + 2.
TEST_F(NVFuserTest, FusionShiftMerge2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {1, -1});
  auto tv3 = shift(tv1, {-1, 1});
  auto tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);
  int split_factor = 4;
  tv4->split(-1, split_factor);
  tv4->split(0, split_factor);
  tv4->reorder({{1, 2}, {2, 1}});
  tv4->merge(2, 3);
  tv0->computeAt(tv4, -2);
  // t1 allocation: (split_factor + 2) * (split_factor + 2)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == split_factor + 2);
        }
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation.
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {1, -1});
  auto t3 = shift(t1, {-1, 1});
  auto t4 = t2 + t3;
  TORCH_CHECK(t4.allclose(outputs[0]));
}
// Shifts of a global-memory producer with mismatched splits among
// consumers. The global allocation of t1 grows by 1 in each axis to
// cover the union of the halos ({0, 1} and {-1, 0}).
TEST_F(NVFuserTest, FusionShiftGlobal_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {0, 1});
  auto tv3 = shift(tv1, {-1, 0});
  auto tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);
  // Deliberately different split factors per tensor; no computeAt.
  tv1->split(-1, 4);
  tv2->split(-1, 8);
  tv3->split(-1, 2);
  tv4->split(-1, 3);
  tv1->merge(-2, -1);
  tv1->setMemoryType(MemoryType::Global);
  tv2->setMemoryType(MemoryType::Global);
  tv3->setMemoryType(MemoryType::Global);
  // t1 allocation: (t1.size[0] + 1) * (t1.size[1] + 1)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          // Each extent must be "<named extent> + 1".
          auto def =
              dynamic_cast<BinaryOp*>(alloc->shape().at(i)->definition());
          TORCH_CHECK(
              def != nullptr && def->getBinaryOpType() == BinaryOpType::Add);
          TORCH_CHECK(def->as<BinaryOp>()->lhs()->isA<NamedScalar>());
          auto rhs = dynamic_cast<Int*>(def->as<BinaryOp>()->rhs());
          TORCH_CHECK(rhs != nullptr && rhs->isConst());
          int rhs_value = *rhs->value();
          TORCH_CHECK(rhs_value == 1);
        }
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // ATen reference computation.
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {0, 1});
  auto t3 = shift(t1, {-1, 0});
  auto t4 = t2 + t3;
  auto ref = t4;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift with a producer that is split again and re-merged after
// computeAt. The allocation stays based on the consumer's split factor
// plus the halo: split_factor1 + 1.
TEST_F(NVFuserTest, FusionShiftDoubleSplitMerge1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
  auto tv3 = shift(tv2, {0, 1});
  fusion.addOutput(tv3);
  int split_factor1 = 8;
  int split_factor2 = 4;
  tv3->split(-1, split_factor1);
  tv0->computeAt(tv3, -2);
  tv1->split(-1, split_factor2);
  tv1->merge(-2, -1);
  // t1 and t2 allocation: (split_factor1 + 1)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1 || tensor_name == 2) {
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(
            size != nullptr && size->isConst() &&
            size->value().value() == split_factor1 + 1);
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: +1 then +2 collapses to +3, then shift.
  auto t1 = t0 + 3;
  auto ref = shift(t1, {0, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift scheduled with a two-level split per dimension, a reorder, and
// merges that flatten each tile into a single thread-parallel axis.
// Lowering is checked to allocate the shared-memory intermediates with
// a one-element halo per dimension.
TEST_F(NVFuserTest, FusionShiftDoubleSplitMerge2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
  auto tv3 = shift(tv2, {1, 1});
  fusion.addOutput(tv3);
  auto out = tv3;
  int split_factor1 = 32;
  int split_factor2 = 4;
  // Two-level tiling of both dimensions
  out->split(-1, split_factor1);
  out->split(-1, split_factor2);
  out->split(0, split_factor1);
  out->split(1, split_factor2);
  out->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}});
  // Flatten the four intra-tile axes into one ...
  out->merge(2, 3);
  out->merge(2, 3);
  out->merge(2, 3);
  // ... and the two inter-tile axes into one
  out->merge(0, 1);
  TransformPropagator::from(out);
  tv0->computeAt(out, 1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {tv1, tv2});
  // Keep the intermediates in shared memory
  for (auto tv : {tv1, tv2}) {
    tv->setMemoryType(MemoryType::Shared);
  }
  // t1 and t2 allocation: (split_factor1 + 1) * (split_factor1 + 1)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1 || tensor_name == 2) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == split_factor1 + 1);
        }
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: both adds then the diagonal shift
  auto ref = shift(t0 + 1 + 2, {1, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// 5-point stencil computed with a 1D thread block: the two inner tile
// axes are merged and bound to TIDx, with the input staged through a
// shared-memory cache whose halo-extended allocation is verified.
TEST_F(NVFuserTest, FusionShift5ptStencilParallel1DThreadBlock_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 5-pt stencil
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
  std::vector<TensorView*> tvs;
  for (const auto& offset : offsets) {
    tvs.push_back(shift(tv0, offset));
  }
  auto tv_out = tv0;
  for (auto tv : tvs) {
    tv_out = add(tv_out, tv);
  }
  // Average of the center point and its four neighbors
  tv_out = div(tv_out, IrBuilder::create<Double>(tvs.size() + 1));
  fusion.addOutput(tv_out);
  // Tile sizes: split_factor[0] applies to axis 0, split_factor[1] to
  // the innermost axis
  std::vector<int> split_factor({4, 32});
  tv_out->split(-1, split_factor[1]);
  tv_out->split(0, split_factor[0]);
  tv_out->reorder({{1, 2}, {2, 1}});
  auto tv0_cache = tv0->cache_after();
  // Merge the inner-most two axes and create
  // a 1D thread block of split_factor1*split_factor2 threads
  tv_out->merge(-2, -1);
  tv0->computeAt(tv_out, 2);
  // Inline completely except for the cache
  for (auto tv : tvs) {
    tv->computeAt(tv_out, -1);
  }
  // Apply the same inner merge to the cache tensor
  tv0_cache->merge(-2, -1);
  tv_out->axis(-1)->parallelize(ParallelType::TIDx);
  tv_out->axis(1)->parallelize(ParallelType::BIDx);
  tv_out->axis(0)->parallelize(ParallelType::BIDy);
  tv0_cache->setMemoryType(MemoryType::Shared);
  tv0_cache->axis(-1)->parallelize(ParallelType::TIDx);
  // cache allocation: (split_factor1 + 2) * (split_factor2 + 2)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == tv0_cache->name()) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(
              size != nullptr && size->isConst() &&
              size->value().value() == split_factor[i] + 2);
        }
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: center + 4 shifted neighbors, divided by 5
  auto ref = t0;
  for (const auto& offset : offsets) {
    ref = ref + shift(t0, offset);
  }
  ref = ref / int(offsets.size() + 1);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Two consecutive shifts with the same offset on an inlined producer.
TEST_F(NVFuserTest, FusionShiftChain1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(2);
  fusion.addInput(in);
  auto first = shift(in, {0, 1});
  auto second = shift(first, {0, 1});
  fusion.addOutput(second);

  second->split(-1, 4);
  in->computeAt(second, -2);

  const int nx = 99;
  const int ny = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = shift(shift(input, {0, 1}), {0, 1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Two consecutive shifts with opposite offsets on an inlined producer.
TEST_F(NVFuserTest, FusionShiftChain2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(2);
  fusion.addInput(in);
  auto fwd = shift(in, {0, 1});
  auto bwd = shift(fwd, {0, -1});
  fusion.addOutput(bwd);

  bwd->split(-1, 4);
  in->computeAt(bwd, -2);

  const int nx = 99;
  const int ny = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = shift(shift(input, {0, 1}), {0, -1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// A chain of two shifts with a leading pointwise op. The halo of tv1
// must cover both shifts, while tv2 only needs the halo of the final
// shift.
TEST_F(NVFuserTest, FusionShiftChain3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {0, 1});
  auto tv3 = shift(tv2, {0, 1});
  fusion.addOutput(tv3);

  int split_factor = 4;
  tv3->split(-1, split_factor);
  tv0->computeAt(tv3, -2);

  // Halo size of tv1 is 2 as it needs to account for both of the two
  // shift operations, while that of tv2 is still just 1.
  // tv1: (split_factor + 2)
  // tv2: (split_factor + 1)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == 1 || tensor_name == 2) {
        // Both intermediates are 1-D; check the single allocation
        // dimension directly (the original looped over exactly one
        // index, which was needless indirection).
        TORCH_CHECK(alloc->shape().size() == 1);
        auto size = dynamic_cast<Int*>(alloc->shape().at(0));
        TORCH_CHECK(size != nullptr && size->isConst());
        if (tensor_name == 1) {
          TORCH_CHECK(size->value().value() == split_factor + 2);
        } else {
          TORCH_CHECK(size->value().value() == split_factor + 1);
        }
      }
    }
  }

  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference: add one, then shift twice by one column
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {0, 1});
  auto t3 = shift(t2, {0, 1});
  auto ref = t3;

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// A chain of four shifts with growing offsets. Halo extents accumulate
// toward the producer side:
// tv1: (split_factor + 9) * (split_factor + 9)
// tv2: (split_factor + 7) * (split_factor + 7)
// tv3: (split_factor + 4) * (split_factor + 4)
TEST_F(NVFuserTest, FusionShiftChain4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = shift(tv0, {1, -1});
  auto tv2 = shift(tv1, {2, -2});
  auto tv3 = shift(tv2, {3, -3});
  auto tv4 = shift(tv3, {4, -4});
  auto tv_out = tv4;
  fusion.addOutput(tv_out);

  int split_factor = 4;
  // 2D tiling of the output
  tv_out->split(-1, split_factor);
  tv_out->split(0, split_factor);
  tv_out->reorder({{1, 2}, {2, 1}});

  tv0->computeAt(tv_out, 2);

  // Flatten the intra-tile axes of each intermediate
  tv1->merge(-2, -1);
  tv2->merge(-2, -1);
  tv3->merge(-2, -1);

  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      // Bug fix: tensor 3 was previously excluded from this guard, so
      // the `tensor_name == 3` branch below was dead code and tv3's
      // allocation size was never actually verified.
      if (tensor_name == 1 || tensor_name == 2 || tensor_name == 3) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(size != nullptr && size->isConst());
          auto size_val = size->value().value();
          if (tensor_name == 1) {
            TORCH_CHECK(size_val == split_factor + 9);
          } else if (tensor_name == 2) {
            TORCH_CHECK(size_val == split_factor + 7);
          } else if (tensor_name == 3) {
            TORCH_CHECK(size_val == split_factor + 4);
          }
        }
      }
    }
  }

  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference: apply the same four shifts in order
  auto t1 = shift(t0, {1, -1});
  auto t2 = shift(t1, {2, -2});
  auto t3 = shift(t2, {3, -3});
  auto t4 = shift(t3, {4, -4});
  auto ref = t4;

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Two back-to-back 5-point stencils. The tv0 cache needs a halo of 2
// per side (allocation +4 per dimension) to cover both stencils, while
// the first stencil's output only needs a halo of 1 (+2).
TEST_F(NVFuserTest, FusionShift5ptStencilChain_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
  // First stencil: 5pt stencil
  // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5
  std::vector<TensorView*> tv_stencil1_shifts;
  for (const auto& offset : offsets) {
    tv_stencil1_shifts.push_back(shift(tv0, offset));
  }
  auto tv_stencil1 = tv0;
  for (auto tv : tv_stencil1_shifts) {
    tv_stencil1 = add(tv_stencil1, tv);
  }
  tv_stencil1 = div(
      tv_stencil1, IrBuilder::create<Double>(tv_stencil1_shifts.size() + 1));
  // Second stencil: Same 5pt stencil
  std::vector<TensorView*> tv_stencil2_shifts;
  for (const auto& offset : offsets) {
    tv_stencil2_shifts.push_back(shift(tv_stencil1, offset));
  }
  auto tv_stencil2 = tv_stencil1;
  for (auto tv : tv_stencil2_shifts) {
    tv_stencil2 = add(tv_stencil2, tv);
  }
  tv_stencil2 = div(
      tv_stencil2, IrBuilder::create<Double>(tv_stencil2_shifts.size() + 1));
  auto tv_out = tv_stencil2;
  fusion.addOutput(tv_out);
  auto tv0_cache = tv0->cache_after();
  // 16x16 tiling of the output
  std::vector<int> split_factor({16, 16});
  tv_out->split(-1, split_factor[1]);
  tv_out->split(0, split_factor[0]);
  tv_out->reorder({{1, 2}, {2, 1}});
  tv0->computeAt(tv_out, 2);
  // Inline completely all inputs to the first stencil output, except for the
  // tv0 cache
  for (auto tv : tv_stencil1_shifts) {
    tv->computeAt(tv_stencil1, -1);
  }
  // Inline completely all inputs to the second stencil output, except
  // for the first stencil output
  for (auto tv : tv_stencil2_shifts) {
    tv->computeAt(tv_stencil2, -1);
  }
  tv_out->axis(1)->parallelize(ParallelType::BIDx);
  tv_out->axis(0)->parallelize(ParallelType::BIDy);
  // Thread-parallelize the inner tile axes of every tensor on the
  // input-to-output path
  auto all_values = DependencyCheck::getAllValsBetween(
      {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs());
  for (auto tv : ir_utils::filterByType<TensorView>(all_values)) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
    tv->axis(-2)->parallelize(ParallelType::TIDy);
  }
  tv0_cache->setMemoryType(MemoryType::Shared);
  tv_stencil1->setMemoryType(MemoryType::Shared);
  // tv0_cache: (split_factor + 4) * (split_factor + 4)
  // tv_stencil1: (split_factor + 2) * (split_factor + 2)
  GpuLower gpulw(&fusion);
  for (const auto expr : gpulw.kernel()->unordered_exprs()) {
    if (auto alloc = dynamic_cast<kir::Allocate*>(expr)) {
      auto tensor_name = alloc->buffer()->name();
      if (tensor_name == tv0_cache->name() ||
          tensor_name == tv_stencil1->name()) {
        TORCH_CHECK(alloc->shape().size() == 2);
        for (int i = 0; i < 2; ++i) {
          auto size = dynamic_cast<Int*>(alloc->shape().at(i));
          TORCH_CHECK(size != nullptr && size->isConst());
          if (tensor_name == tv0_cache->name()) {
            TORCH_CHECK(size->value().value() == split_factor[i] + 4);
          } else if (tensor_name == tv_stencil1->name()) {
            TORCH_CHECK(size->value().value() == split_factor[i] + 2);
          }
        }
      }
    }
  }
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: apply the 5-point average twice
  auto stencil1 = t0;
  for (const auto& offset : offsets) {
    stencil1 = stencil1 + shift(t0, offset);
  }
  stencil1 = stencil1 / int(offsets.size() + 1);
  auto stencil2 = stencil1;
  for (const auto& offset : offsets) {
    stencil2 = stencil2 + shift(stencil1, offset);
  }
  stencil2 = stencil2 / int(offsets.size() + 1);
  auto ref = stencil2;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift a reduced tensor
TEST_F(NVFuserTest, FusionShiftReduction1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(2);
  fusion.addInput(in);
  auto plus_one = add(in, IrBuilder::create<Double>(1));
  auto reduced = sum(plus_one, {1});
  auto shifted = shift(reduced, {1});
  fusion.addOutput(shifted);

  shifted->split(0, 4);
  in->computeAt(shifted, 1);
  in->computeAt(reduced, -1);

  const int nx = 9;
  const int ny = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference: add, reduce the inner dimension, then shift.
  auto ref = shift(sum(input + 1, {1}), {1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Parallelized version of FusionShiftReduction1
TEST_F(NVFuserTest, FusionShiftReduction2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(2);
  fusion.addInput(in);
  auto plus_one = add(in, IrBuilder::create<Double>(1));
  auto reduced = sum(plus_one, {1});
  auto shifted = shift(reduced, {1});
  fusion.addOutput(shifted);

  shifted->split(0, 4);
  in->computeAt(shifted, 1);

  reduced->split(-1, 32);
  in->computeAt(reduced, -1);

  // Thread-parallel reduction; its result is kept in shared memory.
  reduced->axis(-1)->parallelize(ParallelType::TIDx);
  reduced->setMemoryType(MemoryType::Shared);

  const int nx = 201;
  const int ny = 301;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = shift(sum(input + 1, {1}), {1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift of a reduction that is rfactored before parallelization.
TEST_F(NVFuserTest, FusionShiftRfactor1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(2);
  fusion.addInput(in);
  auto plus_one = add(in, IrBuilder::create<Double>(1));
  auto reduced = sum(plus_one, {1});
  auto shifted = shift(reduced, {1});
  fusion.addOutput(shifted);

  shifted->split(0, 4);
  in->computeAt(shifted, 1);

  reduced->split(-1, 32);
  auto rf = reduced->rFactor({-2});
  in->computeAt(reduced, -1);
  in->computeAt(rf, -1);

  reduced->axis(-1)->parallelize(ParallelType::TIDx);
  reduced->setMemoryType(MemoryType::Shared);

  const int nx = 201;
  const int ny = 301;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = shift(sum(input + 1, {1}), {1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift along a broadcast dimension.
TEST_F(NVFuserTest, FusionShiftBcast1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in1d = makeSymbolicTensor(1);
  fusion.addInput(in1d);
  auto in2d = makeSymbolicTensor(2);
  fusion.addInput(in2d);

  auto bcast = broadcast(in1d, {false, true});
  auto shifted = shift(bcast, {0, 1});
  auto result = add(shifted, in2d);
  fusion.addOutput(result);

  in1d->computeAt(result, -1);
  in2d->computeAt(result, -1);

  const int nx = 9;
  const int ny = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({nx}, options);
  at::Tensor t1 = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // The shift applies to the broadcast axis, so the reference is just
  // the broadcast add with no shift.
  auto ref = t0.unsqueeze(-1).expand({nx, ny}) + t1;

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Shift along the non-broadcast dimension of a broadcast tensor.
TEST_F(NVFuserTest, FusionShiftBcast2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in1d = makeSymbolicTensor(1);
  fusion.addInput(in1d);
  auto in2d = makeSymbolicTensor(2);
  fusion.addInput(in2d);

  auto bcast = broadcast(in1d, {false, true});
  auto shifted = shift(bcast, {1, 0});
  auto result = add(shifted, in2d);
  fusion.addOutput(result);

  result->split(0, 4);
  in1d->computeAt(result, 1);

  const int nx = 9;
  const int ny = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({nx}, options);
  at::Tensor t1 = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {t0, t1};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto expanded = t0.unsqueeze(-1).expand({nx, ny});
  auto ref = shift(expanded, {1, 0}) + t1;

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Combine ShiftBcast1 and ShiftBcast2 with parallelization
TEST_F(NVFuserTest, FusionShiftBcast3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = makeSymbolicTensor(2);
  fusion.addInput(tv1);
  auto tv2 = broadcast(tv0, {false, true});
  // Three shifts of the broadcast: along the concrete axis, along the
  // broadcast axis, and along both
  auto tv3 = shift(tv2, {1, 0});
  auto tv4 = shift(tv2, {0, 1});
  auto tv5 = shift(tv2, {-1, -1});
  auto tv6 = add(tv3, tv4);
  auto tv7 = add(tv6, tv5);
  auto tv8 = add(tv7, tv1);
  fusion.addOutput(tv8);
  tv8->split(0, 4);
  tv8->split(-1, 4);
  tv0->computeAt(tv8, 1);
  tv8->axis(-1)->parallelize(ParallelType::TIDx);
  for (auto tv : {tv8, tv7, tv6, tv5, tv4, tv3, tv2}) {
    tv->axis(1)->parallelize(ParallelType::TIDy);
  }
  tv2->setMemoryType(MemoryType::Shared);
  const int numel_x = 101;
  const int numel_y = 201;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x}, options);
  at::Tensor t1 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0, t1};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  auto t2 = t0.unsqueeze(-1).expand({numel_x, numel_y});
  auto t3 = shift(t2, {1, 0});
  // Shifting along the broadcast (column) axis is an identity, so tv4's
  // reference is t2 itself, and tv5's {-1, -1} reduces to {-1, 0}.
  auto t4 = t2;
  auto t5 = shift(t2, {-1, 0});
  auto ref = t3 + t4 + t5 + t1;
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// See issue #893
TEST_F(NVFuserTest, FusionShiftSyncPlacement1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(2);
  fusion.addInput(in);
  auto branch_a = add(in, IrBuilder::create<Double>(1));
  auto branch_b = add(in, IrBuilder::create<Double>(2));
  auto merged = add(branch_a, branch_b);
  auto shifted = shift(merged, {0, 1});
  fusion.addOutput(shifted);

  shifted->split(1, 8);
  in->computeAt(shifted, 2);
  branch_b->computeAt(merged, -1);

  branch_a->setMemoryType(MemoryType::Shared);
  merged->setMemoryType(MemoryType::Shared);

  for (auto tv : {branch_a, merged, shifted}) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  const int nx = 99;
  const int ny = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx, ny}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = shift((input + 1) + (input + 2), {0, 1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// See issue #893. Top-level placement.
TEST_F(NVFuserTest, FusionShiftSyncPlacement2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in = makeSymbolicTensor(1);
  fusion.addInput(in);
  auto branch_a = add(in, IrBuilder::create<Double>(1));
  auto branch_b = add(in, IrBuilder::create<Double>(2));
  auto merged = add(branch_a, branch_b);
  auto shifted = shift(merged, {1});
  fusion.addOutput(shifted);

  branch_b->computeAt(merged, -1);

  branch_a->setMemoryType(MemoryType::Shared);
  merged->setMemoryType(MemoryType::Shared);

  for (auto tv : {branch_a, merged, shifted}) {
    tv->axis(-1)->parallelize(ParallelType::TIDx);
  }

  const int nx = 99;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input = at::randn({nx}, options);
  std::vector<IValue> inputs = {input};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto ref = shift((input + 1) + (input + 2), {1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Negative test: this schedule cannot be lowered because a required
// syncthreads has no valid placement (see the inline comments).
TEST_F(NVFuserTest, FusionShiftSyncPlacement3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = add(tv1, IrBuilder::create<Double>(2));
  auto tv3 = shift(tv2, {1});
  fusion.addOutput(tv3);
  // This doesn't work. syncthreads is needed between tv1 and tv2, but
  // both the loop extent of both tv1 and tv2 has halo, so the loop is
  // not eliminated even though it is parallelized. Moving syncthreads
  // out of the loop would make it placed before tv1, which would make
  // it meaningless.
  // Ideally, an exception should be thrown at this computeAt, but at
  // this point, the fusion is not yet parallelized, nor memory type
  // is set, so this computeAt itself is not an error yet.
  tv1->computeAt(tv2, -1);
  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Shared);
  tv1->axis(-1)->parallelize(ParallelType::TIDx);
  tv2->axis(-1)->parallelize(ParallelType::TIDx);
  tv3->axis(-1)->parallelize(ParallelType::TIDx);
  // The error should be detected when the fusion is lowered.
  ASSERT_ANY_THROW(fusion.printKernel());
}
// Based on original CUDA provided by Vishal Mehta.
// Major differences with the original version:
// - The original version uses additional 2 warps to load the halos
// along the Y dimension. The other 10 warps are used to load a 32x10
// tile, and all warps will do coalesced loads. No such optimization
// is done in the fuser version.
TEST_F(NVFuserTest, FusionHdiff_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  auto coeff = makeSymbolicTensor(3);
  fusion.addInput(coeff);
  std::vector<std::vector<int>> offsets{
      {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}};
  // T2, T3, T4, T5
  // NOTE(review): the third shift() argument (false) presumably
  // disables boundary padding; the uncomputed outer rim is trimmed
  // before validation below — TODO confirm against shift()'s signature.
  std::vector<TensorView*> inp_neighbors;
  for (const auto& offset : offsets) {
    inp_neighbors.push_back(shift(inp, offset, false));
  }
  // T8
  TensorView* sum_of_neighbors = nullptr;
  for (auto inp_neighbor : inp_neighbors) {
    if (sum_of_neighbors == nullptr) {
      sum_of_neighbors = inp_neighbor;
    } else {
      sum_of_neighbors = add(sum_of_neighbors, inp_neighbor);
    }
  }
  // T9 = T0 * 4
  // T10 = T9 - T8
  auto lap = sub(mul(inp, IrBuilder::create<Double>(4)), sum_of_neighbors);
  // T11 = shift(T10)
  // T12 = T11 - T10
  auto flx = sub(shift(lap, {0, 0, -1}, false), lap);
  // T14 = T13 - T0
  // T15 = T12 * T14
  // T16 = T15 > 0
  // T17 = T16 ? 0 : T12
  auto flx_cond =
      gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)),
         IrBuilder::create<Double>(0));
  auto flx0 = where(flx_cond, IrBuilder::create<Double>(0), flx);
  // T18 = shift(T10)
  // T19 = T18 - T10
  auto fly = sub(shift(lap, {0, -1, 0}, false), lap);
  // T20 = shift(T0)
  // T21 = T20 - T0
  // T22 = T19 * T21
  // T23 = T22 > 0
  auto fly_cond =
      gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)),
         IrBuilder::create<Double>(0));
  // T24 = T23 ? 0 : T19
  auto fly0 = where(fly_cond, IrBuilder::create<Double>(0), fly);
  // T25 = shift(flx0)
  // T26 = T17 - T25
  // T27 = shift(fly0)
  // T28 = T24 - T27
  // T29 = T26 + T28
  // T30 = T1 * T29
  // T31 = T0 - T30
  auto out =
      sub(inp,
          mul(coeff,
              add(sub(flx0, shift(flx0, {0, 0, 1}, false)),
                  sub(fly0, shift(fly0, {0, 1, 0}, false)))));
  fusion.addOutput(out);
  /////////////////////////////////
  // Scheduling
  /////////////////////////////////
  out->setContiguity(false);
  // Step 1: 2D Tiling
  const int tile_x = 32;
  const int tile_y = 8;
  out->split(-1, tile_x);
  out->split(-3, tile_y);
  out->reorder({{-2, -3}});
  inp->computeAt(out, -3);
  coeff->computeAt(out, -3);
  // Step 2: Inlining
  // Inline inputs to lap
  auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap});
  for (auto val : ir_utils::filterByType<TensorView>(lap_vals)) {
    if (val != lap && val != inp) {
      val->computeAt(lap, -1);
    }
  }
  // Inline inputs to flx0
  auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0});
  for (auto val : ir_utils::filterByType<TensorView>(flx0_vals)) {
    if (val != lap && val != flx0 && val != inp) {
      val->computeAt(flx0, -1);
    }
  }
  // Inline inputs to fly0
  auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0});
  for (auto val : ir_utils::filterByType<TensorView>(flxy_vals)) {
    if (val != lap && val != fly0 && val != inp) {
      val->computeAt(fly0, -1);
    }
  }
  // Inline inputs to out
  auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out});
  for (auto val : ir_utils::filterByType<TensorView>(out_vals)) {
    if (val != flx0 && val != fly0 && val != out) {
      val->computeAt(out, -1);
    }
  }
  // Step 3: Parallelization
  // Block parallelization
  out->axis(0)->parallelize(ParallelType::BIDz);
  out->axis(1)->parallelize(ParallelType::BIDy);
  out->axis(2)->parallelize(ParallelType::BIDx);
  // Thread parallelization
  out->axis(3)->parallelize(ParallelType::TIDy);
  out->axis(4)->parallelize(ParallelType::TIDx);
  // Apply the same parallelization to all other tensors
  scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion));
  // Store intermediate stencil results on smem so that they can be
  // accessed by threads
  for (auto tv : {flx0, fly0, lap}) {
    tv->setMemoryType(MemoryType::Shared);
  }
  /////////////////////////////////
  int numel_x = 101;
  int numel_y = 99;
  int numel_z = 10;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options);
  at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options);
  std::vector<IValue> inputs = {inp_at, coeff_at};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto fuser_output = fe.runFusion(inputs)[0];
  // Trim the outer rim
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(0, at::indexing::None),
      at::indexing::Slice(2, -2),
      at::indexing::Slice(2, -2)};
  fuser_output = fuser_output.index(indices);
  // Reference computed with plain ATen tensor ops, then trimmed the
  // same way before comparison
  {
    at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options);
    auto lap = inp_at * 4 -
        (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) +
         shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1}));
    auto flx = shift(lap, {0, 0, -1}) - lap;
    auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0;
    auto flx0 = at::where(flx_cond, zeros, flx);
    auto fly = shift(lap, {0, -1, 0}) - lap;
    auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0;
    auto fly0 = at::where(fly_cond, zeros, fly);
    auto ref = inp_at -
        coeff_at *
            ((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0})));
    ref = ref.index(indices);
    testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__);
  }
}
// Same horizontal-diffusion stencil as FusionHdiff, but scheduled with
// partial splits and an unswitched vertical (tz) axis.
TEST_F(NVFuserTest, FusionHdiffPartialSplitUnswitch_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  auto coeff = makeSymbolicTensor(3);
  fusion.addInput(coeff);
  std::vector<std::vector<int>> offsets{
      {0, 1, 0}, {0, -1, 0}, {0, 0, 1}, {0, 0, -1}};
  // T2, T3, T4, T5
  std::vector<TensorView*> inp_neighbors;
  for (const auto& offset : offsets) {
    inp_neighbors.push_back(shift(inp, offset, false));
  }
  // T8
  TensorView* sum_of_neighbors = nullptr;
  for (auto inp_neighbor : inp_neighbors) {
    if (sum_of_neighbors == nullptr) {
      sum_of_neighbors = inp_neighbor;
    } else {
      sum_of_neighbors = add(sum_of_neighbors, inp_neighbor);
    }
  }
  // T9 = T0 * 4
  // T10 = T9 - T8
  auto lap = sub(mul(inp, IrBuilder::create<Double>(4)), sum_of_neighbors);
  // T11 = shift(T10)
  // T12 = T11 - T10
  auto flx = sub(shift(lap, {0, 0, -1}, false), lap);
  // T14 = T13 - T0
  // T15 = T12 * T14
  // T16 = T15 > 0
  // T17 = T16 ? 0 : T12
  auto flx_cond =
      gt(mul(flx, sub(shift(inp, {0, 0, -1}, false), inp)),
         IrBuilder::create<Double>(0));
  auto flx0 = where(flx_cond, IrBuilder::create<Double>(0), flx);
  // T18 = shift(T10)
  // T19 = T18 - T10
  auto fly = sub(shift(lap, {0, -1, 0}, false), lap);
  // T20 = shift(T0)
  // T21 = T20 - T0
  // T22 = T19 * T21
  // T23 = T22 > 0
  auto fly_cond =
      gt(mul(fly, sub(shift(inp, {0, -1, 0}, false), inp)),
         IrBuilder::create<Double>(0));
  // T24 = T23 ? 0 : T19
  auto fly0 = where(fly_cond, IrBuilder::create<Double>(0), fly);
  // T25 = shift(flx0)
  // T26 = T17 - T25
  // T27 = shift(fly0)
  // T28 = T24 - T27
  // T29 = T26 + T28
  // T30 = T1 * T29
  // T31 = T0 - T30
  auto out =
      sub(inp,
          mul(coeff,
              add(sub(flx0, shift(flx0, {0, 0, 1}, false)),
                  sub(fly0, shift(fly0, {0, 1, 0}, false)))));
  fusion.addOutput(out);
  out->setContiguity(false);
  /////////////////////////////////
  // Scheduling
  /////////////////////////////////
  const auto all_vals = fusion.usedMathVals();
  const std::vector<TensorView*> all_tensors(
      {ir_utils::filterByType<TensorView>(all_vals).begin(),
       ir_utils::filterByType<TensorView>(all_vals).end()});
  // Step 1: Blocking
  // - Thread block size: (tile_x, tile_y)
  // - Each thread computes a vertical column of length tile_z along the Z
  // axis.
  // - Grid size: (NX / block_x, NY / block_y, NZ / tile_z)
  const int tile_x = 32;
  const int tile_y = 8;
  const int tile_z = 16;
  out->split(0, tile_z);
  // Partial splits (extra bool arguments) on the X and Y axes
  out->split(-1, tile_x, true, true);
  out->split(-3, tile_y, true, true);
  // out: [NZ/tz, tz, NY/by, by, NX/bx, bx]
  out->reorder({{1, 3}, {2, 1}, {3, 4}, {4, 2}});
  // out: [NZ/tz, NY/by, NX/bx, tz, by, bx]
  TransformPropagator::from(out);
  inp->computeAt(out, 4);
  // Step 2: Inlining
  // Inline inputs to lap
  auto lap_vals = DependencyCheck::getAllValsBetween({inp}, {lap});
  for (auto val : ir_utils::filterByType<TensorView>(lap_vals)) {
    if (val != lap && val != inp) {
      val->computeAt(lap, -1);
    }
  }
  // Inline inputs to flx0
  auto flx0_vals = DependencyCheck::getAllValsBetween({lap, inp}, {flx0});
  for (auto val : ir_utils::filterByType<TensorView>(flx0_vals)) {
    if (val != lap && val != flx0 && val != inp) {
      val->computeAt(flx0, -1);
    }
  }
  // Inline inputs to fly0
  auto flxy_vals = DependencyCheck::getAllValsBetween({lap, inp}, {fly0});
  for (auto val : ir_utils::filterByType<TensorView>(flxy_vals)) {
    if (val != lap && val != fly0 && val != inp) {
      val->computeAt(fly0, -1);
    }
  }
  // Inline inputs to out
  auto out_vals = DependencyCheck::getAllValsBetween({flx0, fly0}, {out});
  for (auto val : ir_utils::filterByType<TensorView>(out_vals)) {
    if (val != flx0 && val != fly0 && val != out) {
      val->computeAt(out, -1);
    }
  }
  // Step 3: Parallelization
  // Block parallelization
  out->axis(0)->parallelize(ParallelType::BIDz);
  out->axis(1)->parallelize(ParallelType::BIDy);
  out->axis(2)->parallelize(ParallelType::BIDx);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  // Unswitch at the tz axis
  out->axis(3)->parallelize(ParallelType::Unswitch);
  scheduler_utils::parallelizeAllLike(out, all_tensors);
  // These need to be on smem
  for (auto tv : {flx0, fly0, lap}) {
    tv->setMemoryType(MemoryType::Shared);
  }
  /////////////////////////////////
  // Domain sizes include the 2-element halo rim on X and Y
  const int halo_extent = 2;
  const int numel_x = 64 + halo_extent * 2;
  const int numel_y = 64 + halo_extent * 2;
  const int numel_z = 32;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor inp_at = at::randn({numel_z, numel_y, numel_x}, options);
  at::Tensor coeff_at = at::randn({numel_z, numel_y, numel_x}, options);
  std::vector<IValue> inputs = {inp_at, coeff_at};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto fuser_output = fe.runFusion(inputs)[0];
  // Trim the outer rim
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(0, at::indexing::None),
      at::indexing::Slice(2, -2),
      at::indexing::Slice(2, -2)};
  fuser_output = fuser_output.index(indices);
  // Reference computed with plain ATen tensor ops, then trimmed the
  // same way before comparison
  {
    at::Tensor zeros = at::zeros({numel_z, numel_y, numel_x}, options);
    auto lap = inp_at * 4 -
        (shift(inp_at, {0, 1, 0}) + shift(inp_at, {0, -1, 0}) +
         shift(inp_at, {0, 0, 1}) + shift(inp_at, {0, 0, -1}));
    auto flx = shift(lap, {0, 0, -1}) - lap;
    auto flx_cond = (flx * (shift(inp_at, {0, 0, -1}) - inp_at)) > 0;
    auto flx0 = at::where(flx_cond, zeros, flx);
    auto fly = shift(lap, {0, -1, 0}) - lap;
    auto fly_cond = (fly * (shift(inp_at, {0, -1, 0}) - inp_at)) > 0;
    auto fly0 = at::where(fly_cond, zeros, fly);
    auto ref = inp_at -
        coeff_at *
            ((flx0 - shift(flx0, {0, 0, 1})) + (fly0 - shift(fly0, {0, 1, 0})));
    ref = ref.index(indices);
    testValidate(&fusion, {fuser_output}, inputs, {ref}, __LINE__, __FILE__);
  }
}
// 3x3 max pooling
// Expressed as an element-wise max over the input and its eight shifted
// neighbors, then scheduled with a shared-memory input tile.
TEST_F(NVFuserTest, FusionMaxPooling_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Format: CHW
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // 3x3 pooling of the HW spatial domain
  // Enumerate the 8 non-center offsets of a 3x3 window; the center is
  // covered by the unshifted input itself.
  std::vector<std::vector<int>> offsets;
  for (int i = -1; i <= 1; ++i) {
    for (int j = -1; j <= 1; ++j) {
      if (i == 0 && j == 0) {
        continue;
      }
      offsets.push_back({i, j});
    }
  }
  // Shift only along H and W; prepend 0 so the channel axis is unshifted.
  std::vector<TensorView*> inp_tile({inp});
  for (auto offset : offsets) {
    offset.insert(offset.begin(), 0);
    inp_tile.push_back(shift(inp, offset));
  }
  // Fold the 9 tensors together with a chain of binary max ops.
  TensorView* max_tensor = nullptr;
  for (auto tv : inp_tile) {
    if (max_tensor == nullptr) {
      max_tensor = tv;
    } else {
      max_tensor = binaryOp(BinaryOpType::Max, max_tensor, tv);
    }
  }
  fusion.addOutput(max_tensor);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Tiling the spatial domain
  const int tile_x = 32;
  const int tile_y = 8;
  max_tensor->split(-2, tile_y);
  max_tensor->axis(-2)->parallelize(ParallelType::TIDy);
  max_tensor->split(-1, tile_x);
  max_tensor->axis(-1)->parallelize(ParallelType::TIDx);
  max_tensor->reorder({{-3, -2}});
  // Stage the input tile in shared memory so the shifted reads are
  // served from smem rather than global memory.
  inp_cache->computeAt(max_tensor, 3);
  inp_cache->axis(-2)->parallelize(ParallelType::TIDy);
  inp_cache->axis(-1)->parallelize(ParallelType::TIDx);
  inp_cache->setMemoryType(MemoryType::Shared);
  // Fully inline every intermediate between the smem cache and the output.
  auto max_tensor_dep =
      DependencyCheck::getAllValsBetween({inp_cache}, {max_tensor});
  for (auto tv : ir_utils::filterByType<TensorView>(max_tensor_dep)) {
    if (tv == inp_cache || tv == max_tensor) {
      continue;
    }
    tv->computeAt(max_tensor, -1);
  }
  max_tensor->axis(0)->parallelize(ParallelType::BIDx);
  const int hw = 50;
  const int num_channels = 20;
  const int pooling_window = 3;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options);
  // shift always pads by zero, so if all surrounding values are
  // negative, max pooling would pick a padded value, which isn't the
  // correct behavior. We need to be able to choose the value of
  // padding. In this case, padding by the minimum value would not
  // have this problem. For now, avoid the problem by making sure all
  // values are not negative.
  aten_inp = at::abs(aten_inp);
  std::vector<IValue> inputs = {aten_inp};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference: ATen max_pool2d with stride 1 and padding 1, which matches
  // the zero-padded shift-based pooling for non-negative inputs.
  auto ref = at::max_pool2d(
      aten_inp, {pooling_window, pooling_window}, {1, 1}, {1, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionGather1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 1x3 window with one column of padding on each side of the inner
  // dimension, so the output spatial extents match the input's.
  const std::vector<int> window_shape = {1, 3};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {1, 1}};
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);
  auto gather_tv = gather(in_tv, window_shape, padding_width);
  fusion.addOutput(gather_tv);
  const int dim0 = 11;
  const int dim1 = 13;
  const auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor input_tensor = at::randn({dim0, dim1}, opts);
  // Run the unscheduled fusion and compare bit-exactly against the
  // ATen-side gather reference.
  FusionExecutor executor;
  executor.compileFusion(&fusion, {input_tensor});
  auto cg_outputs = executor.runFusion({input_tensor});
  auto aten_ref = gather(input_tensor, window_shape, padding_width);
  TORCH_CHECK(aten_ref.equal(cg_outputs[0]));
}
// Gather fused with a preceding pointwise add and a following reduction
// over the window axis; the pointwise intermediate is staged in shared
// memory so the gather can read it across threads.
TEST_F(NVFuserTest, FusionGather2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const std::vector<int> window_shape = {1, 3};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {1, 1}};
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = gather(tv1, window_shape, padding_width);
  // Sum over the innermost (window) axis.
  auto tv3 = sum(tv2, {-1});
  fusion.addOutput(tv3);
  // Tile the inner dimension by 32 threads.
  tv3->split(1, 32);
  tv0->computeAt(tv3, 2);
  tv2->computeAt(tv3, -1);
  tv3->axis(0)->parallelize(ParallelType::BIDy);
  tv3->axis(1)->parallelize(ParallelType::BIDx);
  tv3->axis(2)->parallelize(ParallelType::TIDx);
  tv1->axis(2)->parallelize(ParallelType::TIDx);
  // tv1 is read by the gather at neighboring positions, so it must be
  // visible to other threads -> shared memory.
  tv1->setMemoryType(MemoryType::Shared);
  const int s1 = 99;
  const int s2 = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({s1, s2}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Reference computed with the ATen-side gather helper.
  auto t1 = t0 + 1;
  auto t2 = gather(t1, window_shape, padding_width);
  auto ref = sum(t2, {-1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionGather3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 1x3 window with no padding on either dimension.
  const std::vector<int> window_shape = {1, 3};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);
  fusion.addOutput(gather(in_tv, window_shape, padding_width));
  const int dim0 = 11;
  const int dim1 = 13;
  const auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> shape({dim0, dim1});
  at::Tensor input_tensor = at::randn(shape, opts);
  shape.insert(shape.end(), window_shape.begin(), window_shape.end());
  // Use a pre-allocated output tensor filled with 1 so that invalid
  // writes to outside valid ranges can be detected
  at::Tensor preallocated = at::ones(shape, opts);
  FusionExecutor executor;
  executor.compileFusion(&fusion, {input_tensor});
  auto cg_outputs = executor.runFusion({input_tensor}, {preallocated});
  auto aten_ref = gather(input_tensor, window_shape, padding_width);
  TORCH_CHECK(aten_ref.equal(cg_outputs[0]));
}
TEST_F(NVFuserTest, FusionGather4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 3x3 window with no padding on either dimension.
  const std::vector<int> window_shape = {3, 3};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);
  fusion.addOutput(gather(in_tv, window_shape, padding_width));
  const int dim0 = 11;
  const int dim1 = 13;
  const auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> shape({dim0, dim1});
  at::Tensor input_tensor = at::randn(shape, opts);
  shape.insert(shape.end(), window_shape.begin(), window_shape.end());
  // Use a pre-allocated output tensor filled with 1 so that invalid
  // writes to outside valid ranges can be detected
  at::Tensor preallocated = at::ones(shape, opts);
  FusionExecutor executor;
  executor.compileFusion(&fusion, {input_tensor});
  auto cg_outputs = executor.runFusion({input_tensor}, {preallocated});
  auto aten_ref = gather(input_tensor, window_shape, padding_width);
  TORCH_CHECK(aten_ref.equal(cg_outputs[0]));
}
TEST_F(NVFuserTest, FusionGather5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 3x3 window with asymmetric padding: pad only the left side of the
  // first dimension and only the right side of the second.
  const std::vector<int> window_shape = {3, 3};
  const std::vector<std::vector<int>> padding_width = {{1, 0}, {0, 1}};
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);
  fusion.addOutput(gather(in_tv, window_shape, padding_width));
  const int dim0 = 11;
  const int dim1 = 13;
  const auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> shape({dim0, dim1});
  at::Tensor input_tensor = at::randn(shape, opts);
  shape.insert(shape.end(), window_shape.begin(), window_shape.end());
  // Use a pre-allocated output tensor filled with 1 so that invalid
  // writes to outside valid ranges can be detected
  at::Tensor preallocated = at::ones(shape, opts);
  FusionExecutor executor;
  executor.compileFusion(&fusion, {input_tensor});
  auto cg_outputs = executor.runFusion({input_tensor}, {preallocated});
  auto aten_ref = gather(input_tensor, window_shape, padding_width);
  TORCH_CHECK(aten_ref.equal(cg_outputs[0]));
}
// Conv-like pattern with no padding
// Exercises a 3x4 gather window scheduled with spatial blocking, a
// shared-memory input cache, and full block/thread parallelization.
TEST_F(NVFuserTest, FusionGather6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  const std::vector<int> window_shape = {3, 4};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
  auto tv1 = gather(tv0, window_shape, padding_width);
  fusion.addOutput(tv1);
  // Blocking the spatial dimensions
  const int block_x = 16;
  const int block_y = 8;
  auto tv0_cache = tv0->cache_after();
  auto out = tv1;
  // Only the fusion-mutating side effect of cache_before() is needed
  // here; the returned cache tensor is not scheduled explicitly (it is
  // covered by TransformPropagator/parallelizeAllLike below).
  out->cache_before();
  out->split(1, block_x);
  out->split(0, block_y);
  out->reorder({{1, 2}, {2, 1}});
  // Replay the tiling on all producer tensors.
  TransformPropagator::from(out);
  tv0->computeAt(out, 2);
  tv0_cache->setMemoryType(MemoryType::Shared);
  out->axis(0)->parallelize(ParallelType::BIDy);
  out->axis(1)->parallelize(ParallelType::BIDx);
  out->axis(2)->parallelize(ParallelType::TIDy);
  out->axis(3)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion));
  const int s1 = 101;
  const int s2 = 99;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> size({s1, s2});
  at::Tensor t0 = at::randn(size, options);
  size.insert(size.end(), window_shape.begin(), window_shape.end());
  // Use a pre-allocated output tensor filled with 1 so that invalid
  // writes to outside valid ranges can be detected
  at::Tensor output = at::ones(size, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0});
  auto outputs = fe.runFusion({t0}, {output});
  auto ref = gather(t0, window_shape, padding_width);
  TORCH_CHECK(ref.equal(outputs[0]));
}
// Conv-like pattern with irregular padding
// Same schedule as FusionGather6 but with asymmetric padding widths.
TEST_F(NVFuserTest, FusionGather7_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  const std::vector<int> window_shape = {3, 4};
  const std::vector<std::vector<int>> padding_width = {{0, 2}, {2, 1}};
  auto tv1 = gather(tv0, window_shape, padding_width);
  fusion.addOutput(tv1);
  // Blocking the spatial dimensions
  const int block_x = 16;
  const int block_y = 8;
  auto tv0_cache = tv0->cache_after();
  auto out = tv1;
  // Only the fusion-mutating side effect of cache_before() is needed
  // here; the returned cache tensor is not scheduled explicitly (it is
  // covered by TransformPropagator/parallelizeAllLike below).
  out->cache_before();
  out->split(1, block_x);
  out->split(0, block_y);
  out->reorder({{1, 2}, {2, 1}});
  // Replay the tiling on all producer tensors.
  TransformPropagator::from(out);
  tv0->computeAt(out, 2);
  tv0_cache->setMemoryType(MemoryType::Shared);
  out->axis(0)->parallelize(ParallelType::BIDy);
  out->axis(1)->parallelize(ParallelType::BIDx);
  out->axis(2)->parallelize(ParallelType::TIDy);
  out->axis(3)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion));
  const int s1 = 101;
  const int s2 = 99;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> size({s1, s2});
  at::Tensor t0 = at::randn(size, options);
  size.insert(size.end(), window_shape.begin(), window_shape.end());
  // Pre-allocated output initialized to 1 to catch out-of-range writes.
  at::Tensor output = at::ones(size, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0});
  auto outputs = fe.runFusion({t0}, {output});
  auto ref = gather(t0, window_shape, padding_width);
  TORCH_CHECK(ref.equal(outputs[0]));
}
// With no padding but with striding
TEST_F(NVFuserTest, FusionGather8_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // 2x3 window, no padding, stride 3 along both dimensions.
  const std::vector<int> window_shape = {2, 3};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
  const std::vector<int> strides = {3, 3};
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);
  fusion.addOutput(gather(in_tv, window_shape, padding_width, strides));
  const int dim0 = 11;
  const int dim1 = 13;
  const auto opts = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> shape({dim0, dim1});
  at::Tensor input_tensor = at::randn(shape, opts);
  // Derive the strided output extents: valid window positions divided
  // by the stride, rounded up.
  for (size_t i = 0; i < shape.size(); ++i) {
    const auto valid_extent = shape[i] - window_shape[i] + 1 +
        padding_width[i][0] + padding_width[i][1];
    shape[i] = ceilDiv(valid_extent, strides[i]);
  }
  shape.insert(shape.end(), window_shape.begin(), window_shape.end());
  // Use a pre-allocated output tensor filled with 1 so that invalid
  // writes to outside valid ranges can be detected
  at::Tensor preallocated = at::ones(shape, opts);
  FusionExecutor executor;
  executor.compileFusion(&fusion, {input_tensor});
  auto cg_outputs = executor.runFusion({input_tensor}, {preallocated});
  auto aten_ref = gather(input_tensor, window_shape, padding_width, strides);
  TORCH_CHECK(aten_ref.equal(cg_outputs[0]));
}
// Similar to Gather8 but with splitting and parallelization
TEST_F(NVFuserTest, FusionGather9_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  const std::vector<int> window_shape = {3, 4};
  const std::vector<std::vector<int>> padding_width = {{0, 0}, {0, 0}};
  const std::vector<int> strides = {2, 2};
  auto tv1 = gather(tv0, window_shape, padding_width, strides);
  fusion.addOutput(tv1);
  // Blocking the spatial dimensions
  const int block_x = 16;
  const int block_y = 8;
  auto tv0_cache = tv0->cache_after();
  auto out = tv1;
  // Only the fusion-mutating side effect of cache_before() is needed
  // here; the returned cache tensor is not scheduled explicitly (it is
  // covered by TransformPropagator/parallelizeAllLike below).
  out->cache_before();
  out->split(1, block_x);
  out->split(0, block_y);
  out->reorder({{1, 2}, {2, 1}});
  // Replay the tiling on all producer tensors.
  TransformPropagator::from(out);
  tv0->computeAt(out, 2);
  tv0_cache->setMemoryType(MemoryType::Shared);
  out->axis(0)->parallelize(ParallelType::BIDy);
  out->axis(1)->parallelize(ParallelType::BIDx);
  out->axis(2)->parallelize(ParallelType::TIDy);
  out->axis(3)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion));
  const int s1 = 101;
  const int s2 = 99;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  std::vector<int64_t> size({s1, s2});
  at::Tensor t0 = at::randn(size, options);
  // Derive the strided output extents from the valid window positions.
  for (const auto i : c10::irange(size.size())) {
    size[i] = ceilDiv(
        size[i] - window_shape[i] + 1 + padding_width[i][0] +
            padding_width[i][1],
        strides[i]);
  }
  size.insert(size.end(), window_shape.begin(), window_shape.end());
  // Use a pre-allocated output tensor filled with 1 so that invalid
  // writes to outside valid ranges can be detected
  at::Tensor output = at::ones(size, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0});
  auto outputs = fe.runFusion({t0}, {output});
  auto ref = gather(t0, window_shape, padding_width, strides);
  TORCH_CHECK(ref.equal(outputs[0]));
}
// 2D convolution (3x3 kernel, padding 1) expressed as gather + broadcast +
// mul + sum, scheduled with a shared-memory input tile and a channel-split
// rFactor of the reduction.
TEST_F(NVFuserTest, FusionConv2D_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 3, 3]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [3, 3] with padding size of 1 for each
  // side of the spatial dimensions
  auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}});
  // inp_tile: [C, H, W, 1, 3, 3]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking the channel dimension
  const int block_c = 8;
  out->split(2, block_h);
  out->split(4, block_w);
  out->reorder({{3, 4}});
  // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3]
  out->split(1, block_c);
  // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
  // Partial-reduce the outer channel factor and the window axes into a
  // separate tensor; `out` keeps the remaining reduction.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
  // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
  inp_cache->computeAt(out_rf, 5);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1);
  at_out = at_out.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// Same conv2d pattern as FusionConv2D but with no padding: the gather's
// trailing `true` argument trims the output domain to valid positions.
TEST_F(NVFuserTest, FusionConv2DNoPadding_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 3, 3]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [1, 3, 3] with no padding
  auto inp_tile =
      gather(inp, {1, 3, 3}, {{0, 0}, {0, 0}, {0, 0}}, {1, 1, 1}, true);
  // inp_tile: [C, H-2, W-2, 1, 3, 3]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking the channel dimension
  const int block_c = 8;
  out->split(2, block_h);
  out->split(4, block_w);
  out->reorder({{3, 4}});
  // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3]
  out->split(1, block_c);
  // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
  // Partial-reduce the outer channel factor and the window axes.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
  // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
  inp_cache->computeAt(out_rf, 5);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  // Reference: stride 1, zero padding -> matches the trimmed gather domain.
  std::vector<int64_t> stride = {1, 1};
  std::vector<int64_t> padding = {0, 0};
  auto at_out = at::conv2d(at_inp, at_w, {}, stride, padding);
  at_out = at_out.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// Conv2d with a 2x2 kernel, no padding, and stride 2 on both spatial
// dimensions; otherwise the same schedule as FusionConv2D.
TEST_F(NVFuserTest, FusionConv2DNoPaddingStrided_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 3, 3]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [2, 2] with no padding and strides of
  // [2, 2]
  auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 0}, {0, 0}}, {1, 2, 2});
  // inp_tile: [C, H/2, W/2, 1, 2, 2]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking the channel dimension
  const int block_c = 8;
  out->split(2, block_h);
  out->split(4, block_w);
  out->reorder({{3, 4}});
  // out: [K, C, Ho, Wo, Hi, Wi, 1, 3, 3]
  out->split(1, block_c);
  // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
  // Partial-reduce the outer channel factor and the window axes.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 3, 3]
  // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
  inp_cache->computeAt(out_rf, 5);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  // Reference: stride 2, zero padding -> matches the strided gather.
  std::vector<int64_t> stride = {2, 2};
  std::vector<int64_t> padding = {0, 0};
  auto at_out = at::conv2d(at_inp, at_w, {}, stride, padding);
  at_out = at_out.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// 5x5 followed by 3x3
// Two chained convolutions in one fusion; the first conv's result (out1)
// is kept in shared memory and consumed by the second conv's gather.
TEST_F(NVFuserTest, FusionConv2DChain_CUDA) {
  const int dim_w1_h = 5;
  const int dim_w1_w = 5;
  // "Same" padding for each kernel: (k - 1) / 2 per side.
  const int dim_pad1_h = (dim_w1_h - 1) / 2;
  const int dim_pad1_w = (dim_w1_w - 1) / 2;
  const int dim_w2_h = 3;
  const int dim_w2_w = 3;
  const int dim_pad2_h = (dim_w2_h - 1) / 2;
  const int dim_pad2_w = (dim_w2_w - 1) / 2;
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [K1, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K2, K1, S1, T1]
  auto w1 = makeSymbolicTensor(4);
  fusion.addInput(w1);
  // Weights: [K3, K2, S2, T2]
  auto w2 = makeSymbolicTensor(4);
  fusion.addInput(w2);
  // Gather a neighbor tile of [w1_h, w1_w] with padding
  auto inp_tile = gather(
      inp,
      {1, dim_w1_h, dim_w1_w},
      {{0, 0}, {dim_pad1_h, dim_pad1_h}, {dim_pad1_w, dim_pad1_w}});
  // inp_tile: [C, 1, H - w1_h + 1, W - w1_w + 1, w1_h, w1_w]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w1_bc = broadcast(w1, {false, false, true, true, true, false, false});
  auto inp_times_w1 = mul(inp_bc, w1_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out1 = sum(inp_times_w1, {1, 4, 5, 6});
  // Second conv
  auto out1_tile = gather(
      out1,
      {1, dim_w2_h, dim_w2_w},
      {{0, 0}, {dim_pad2_h, dim_pad2_h}, {dim_pad2_w, dim_pad2_w}});
  auto out1_bc =
      broadcast(out1_tile, {true, false, false, false, false, false, false});
  auto w2_bc = broadcast(w2, {false, false, true, true, true, false, false});
  auto out1_times_w2 = mul(out1_bc, w2_bc);
  auto out2 = sum(out1_times_w2, {1, 4, 5, 6});
  fusion.addOutput(out2);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  out2->split(2, block_h);
  out2->split(4, block_w);
  out2->reorder({{3, 4}});
  // out2: [K3, K2, Ho, Wo, Hi, Wi, 1, 3, 3]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out2, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out1->reorder({{5, 3}, {3, 4}, {4, 5}});
  // out1 is consumed by the second gather across thread positions, so it
  // must live in shared memory.
  out1->setMemoryType(MemoryType::Shared);
  inp_cache->computeAt(out1, 4);
  inp_tile->computeAt(out1, -1);
  w1->computeAt(out1, -1);
  out1_tile->computeAt(out2, -1);
  w2->computeAt(out2, -1);
  out2->axis(0)->parallelize(ParallelType::BIDx);
  out2->axis(4)->parallelize(ParallelType::TIDy);
  out2->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out2, {inp_cache, out1});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_k1 = 3;
  const int dim_k2 = 5;
  const int dim_k3 = 7;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_k1, dim_h, dim_w}, options);
  at::Tensor at_w1 = at::randn({dim_k2, dim_k1, dim_w1_h, dim_w1_w}, options);
  at::Tensor at_w2 = at::randn({dim_k3, dim_k2, dim_w2_h, dim_w2_w}, options);
  std::vector<IValue> inputs = {at_inp, at_w1, at_w2};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  // Reference: two chained ATen convs with matching "same" paddings.
  auto at_out1 = at::conv2d(at_inp, at_w1, {}, 1, 2);
  auto at_out2 = at::conv2d(at_out1, at_w2, {}, 1, 1);
  at_out2 = at_out2.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out2}, __LINE__, __FILE__);
}
// Conv2d with an even-sized (2x2) window padded only on the right side so
// that the output extents match the input's; the reference result is
// sliced to line up with the fuser output.
TEST_F(NVFuserTest, FusionConv2DStaticEvenSizedWindow_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 2, 2]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [2, 2] with padding size of 1 only for
  // the right side of the spatial dimensions. The left padding is
  // zero so that the output axis stays the same.
  auto inp_tile = gather(inp, {1, 2, 2}, {{0, 0}, {0, 1}, {0, 1}});
  // inp_tile: [C, H, W, 1, 2, 2]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking the channel dimension
  const int block_c = 8;
  out->split(2, block_h);
  out->split(4, block_w);
  out->reorder({{3, 4}});
  // out: [K, C, Ho, Wo, Hi, Wi, 1, 2, 2]
  out->split(1, block_c);
  // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 2, 2]
  // Partial-reduce the outer channel factor and the window axes.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 2, 2]
  // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
  inp_cache->computeAt(out_rf, 5);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 2, 2}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  auto at_out = at::conv2d(at_inp, at_w, {}, 1, 1);
  at_out = at_out.squeeze(0); // drop the N axis
  // The shape of the spatial domain is (dim_h+1)x(dim_w+1), whereas
  // the fuser output has dim_h*dim_w. Drop the first elements to make
  // it match with the fuser output.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(0, at::indexing::None),
      at::indexing::Slice(1, at::indexing::None),
      at::indexing::Slice(1, at::indexing::None)};
  at_out = at_out.index(indices);
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// Conv2d with a 4x4 window and padding 1 per side (less than the window
// needs), so with trimming enabled the output extent shrinks by one.
TEST_F(NVFuserTest, FusionConv4x4Pad1x1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 4, 4]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [4, 4] with padding size of 1 for both
  // sides of the spatial dimensions. The resulting extent is
  // decreased by one.
  auto inp_tile =
      gather(inp, {1, 4, 4}, {{0, 0}, {1, 1}, {1, 1}}, {1, 1, 1}, true);
  // inp_tile: [C, H-1, W-1, 1, 4, 4]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking the channel dimension
  const int block_c = 8;
  out->split(2, block_h);
  out->split(4, block_w);
  out->reorder({{3, 4}});
  // out: [K, C, Ho, Wo, Hi, Wi, 1, 4, 4]
  out->split(1, block_c);
  // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 4, 4]
  // Partial-reduce the outer channel factor and the window axes.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 4, 4]
  // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
  inp_cache->computeAt(out_rf, 5);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 4, 4}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  // Reference computed in double precision.
  auto at_out =
      at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 1, 1);
  at_out = at_out.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// Conv2d with a non-square 4x5 window and per-dimension paddings of 1 and
// 2; with trimming enabled only the H extent shrinks by one.
TEST_F(NVFuserTest, FusionConv4x5Pad1x2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 4, 4]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [4, 5] with padding size of 1 and 2 for
  // each side of the spatial dimensions.
  auto inp_tile =
      gather(inp, {1, 4, 5}, {{0, 0}, {1, 1}, {2, 2}}, {1, 1, 1}, true);
  // inp_tile: [C, H-1, W, 1, 4, 5]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking the channel dimension
  const int block_c = 8;
  out->split(2, block_h);
  out->split(4, block_w);
  out->reorder({{3, 4}});
  // out: [K, C, Ho, Wo, Hi, Wi, 1, 4, 5]
  out->split(1, block_c);
  // out: [K, Co, Ci, Ho, Wo, Hi, Wi, 1, 4, 5]
  // Partial-reduce the outer channel factor and the window axes.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // out_rf: [K, rCo, Ci, Ho, Wo, Hi, Wi, 1, 4, 5]
  // out_rf: [K, Ci, Ho, Wo, Hi, Wi]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  // inp_cache: [Co, Ho, Wo, Ci, Hi, Wi]
  inp_cache->setMemoryType(MemoryType::Shared);
  // Move Ci forward
  out_rf->reorder({{-4, -6}, {-5, -4}, {-6, -5}});
  inp_cache->computeAt(out_rf, 5);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 4, 5}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  // Reference computed in double precision with padding {1, 2}.
  auto at_out =
      at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 1, {1, 2});
  at_out = at_out.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// Strided 2D convolution via strided gather: 4x4 window, padding 1 on
// each side, stride 4. Validated against at::conv2d with stride 4.
TEST_F(NVFuserTest, FusionConv4x4Pad1x1Stride4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);
  // Weights: [K, C, 4, 4] (4x4 spatial window, matching at_w below)
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);
  // Gather a neighbor tile of [4, 4] with padding size of 1 for both
  // sides of the spatial dimensions. Set the stride width as 4.
  auto inp_tile = gather(inp, {1, 4, 4}, {{0, 0}, {1, 1}, {1, 1}}, {1, 4, 4});
  // inp_tile: [C, H/4, s4, W/4, s4, 1, 4, 4]
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});
  auto inp_times_w = mul(inp_bc, w_bc);
  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);
  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  const int block_c = 2;
  // [K, C, H/s, W/s, 1, 4, 4]
  out->split(2, block_h);
  // [K, C, H/s/block_h, block_h, W/s, 1, 4, 4]
  out->split(4, block_w);
  // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 4, 4]
  out->reorder({{3, 4}});
  // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 4, 4]
  out->split(1, block_c);
  // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 4,
  // 4]
  out->split(4, 1);
  // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
  // 4, 4]
  // rFactor the outer channel factor and window axes; the final
  // reduction only covers the inner channel factor.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
  // 4, 4]
  // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w]
  inp_cache->computeAt(out, 5);
  inp_cache->setMemoryType(MemoryType::Shared);
  // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1,
  // 4, 4]
  // Move C/block_c before block_h/2 and share the domain from
  // inp_cache to out_rf
  out_rf->reorder({{7, 5}, {5, 6}, {6, 7}});
  inp_cache->computeAt(out_rf, 6);
  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);
  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::Unswitch);
  out->axis(5)->parallelize(ParallelType::TIDy);
  out->axis(6)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});
  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 4, 4}, options);
  std::vector<IValue> inputs = {at_inp, at_w};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  // Reference: stride 4, padding {1, 1}, computed in double for accuracy.
  auto at_out =
      at::conv2d(at_inp.to(at::kDouble), at_w.to(at::kDouble), {}, 4, {1, 1});
  at_out = at_out.squeeze(0); // drop the N axis
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// POC implementation of im2col for 3-by-3 kernels: gather the 3x3
// neighborhood of every pixel, then transpose so C becomes the innermost
// non-window axis. Validated against at::im2col.
TEST_F(NVFuserTest, FusionIm2Col_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  // Input: [N, C, H, W]
  auto inp = makeSymbolicTensor(4);
  fusion.addInput(inp);
  // Gather a neighbor tile of [3, 3] with padding size of 1 for each
  // side of the spatial dimensions
  auto inp_tile = gather(inp, {1, 1, 3, 3}, {{0, 0}, {0, 0}, {1, 1}, {1, 1}});
  // inp_tile: [N, C, H, W, 1, 1, 3, 3]
  auto inp_col = transpose(inp_tile, {{1, 3}, {2, 1}, {3, 2}});
  // inp_col: [N, H, W, C, 1, 1, 3, 3]
  fusion.addOutput(inp_col);
  ////////////////////////////////////
  // Cache the input tensor
  auto inp_cache = inp->cache_after();
  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  auto out = inp_col;
  out->split(1, block_h);
  out->split(3, block_w);
  out->reorder({{2, 3}});
  // out: [N, Ho, Wo, Hi, Wi, C, 1, 1, 3, 3]
  // Move the C axis out of Hi*Wi
  out->reorder({{5, 3}, {3, 4}, {4, 5}});
  // out: [N, Ho, Wo, C, Hi, Wi, 1, 1, 3, 3]
  // Create a [block_x, block_y] tile on smem
  inp_cache->computeAt(out, 4);
  inp_cache->setMemoryType(MemoryType::Shared);
  // Fully inline inp_tile
  inp_tile->computeAt(out, -1);
  out->axis(0)->parallelize(ParallelType::BIDz);
  out->axis(1)->parallelize(ParallelType::BIDy);
  out->axis(2)->parallelize(ParallelType::BIDx);
  out->axis(4)->parallelize(ParallelType::TIDy);
  out->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, inp_tile});
  const int dim_h = 31;
  const int dim_w = 33;
  const int dim_c = 5;
  const int dim_n = 3;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_n, dim_c, dim_h, dim_w}, options);
  std::vector<IValue> inputs = {at_inp};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  auto at_out = at::im2col(at_inp, {3, 3}, {1, 1}, {1, 1}, {1, 1});
  // at::im2col outputs [N, C*3*3, H*W]; transpose and reshape it to the
  // fusion output layout [N, H, W, C, 1, 1, 3, 3] for comparison.
  at_out = at::transpose(at_out, 1, 2);
  at_out = at::reshape(at_out, {dim_n, dim_h, dim_w, dim_c, 1, 1, 3, 3});
  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
// Non-padded shift: the opposing shifts {1, -1} and {-1, 1} shrink the
// valid region by one on every side, so the reference reduction only
// covers the interior slice [1:-1, 1:-1].
TEST_F(NVFuserTest, FusionShiftNoPadding1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  // pad == false: out-of-range elements are not filled with zero; the
  // output domain is partial.
  auto tv2 = shift(tv1, {1, -1}, false);
  auto tv3 = shift(tv1, {-1, 1}, false);
  auto tv4 = add(tv2, tv3);
  auto tv5 = sum(tv4, {0, 1});
  fusion.addOutput(tv5);
  // tv1 is consumed at shifted offsets by multiple consumers.
  tv1->setMemoryType(MemoryType::Shared);
  tv5->split(0, 4);
  tv5->split(-1, 8);
  tv5->reorder({{1, 2}});
  TransformPropagator::from(tv5);
  tv2->computeAt(tv5, -1);
  tv3->computeAt(tv5, -1);
  tv5->axis(-1)->parallelize(ParallelType::TIDx);
  tv5->axis(-2)->parallelize(ParallelType::TIDy);
  scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion));
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {1, -1});
  auto t3 = shift(t1, {-1, 1});
  auto t4 = t2 + t3;
  // Only the interior region is valid without padding.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
  t4 = t4.index(indices);
  auto ref = t4.sum(at::ArrayRef<int64_t>{0, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Split and merge: same as FusionShiftNoPadding1 but the two innermost
// axes are merged back together before propagation, exercising
// non-padded shift indexing through a merged domain.
TEST_F(NVFuserTest, FusionShiftNoPadding2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {1, -1}, false);
  auto tv3 = shift(tv1, {-1, 1}, false);
  auto tv4 = add(tv2, tv3);
  auto tv5 = sum(tv4, {0, 1});
  fusion.addOutput(tv5);
  tv1->setMemoryType(MemoryType::Shared);
  tv5->split(0, 4);
  tv5->split(-1, 8);
  tv5->reorder({{1, 2}});
  // Merge the two inner tile axes into one.
  tv5->merge(-2, -1);
  TransformPropagator::from(tv5);
  tv2->computeAt(tv5, -1);
  tv3->computeAt(tv5, -1);
  tv5->axis(-1)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion));
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {1, -1});
  auto t3 = shift(t1, {-1, 1});
  auto t4 = t2 + t3;
  // Only the interior region is valid without padding.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
  t4 = t4.index(indices);
  auto ref = t4.sum(at::ArrayRef<int64_t>{0, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Split and merge, then welford: same shape manipulation as
// FusionShiftNoPadding2 but the final reduction is a Welford
// (mean/variance/count) over the valid interior region.
TEST_F(NVFuserTest, FusionShiftNoPadding3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {1, -1}, false);
  auto tv3 = shift(tv1, {-1, 1}, false);
  auto tv4 = add(tv2, tv3);
  auto tvs = Welford(tv4, {0, 1});
  auto tv_avg = tvs.avg;
  auto tv_M2 = tvs.var_sum;
  auto tv_N = tvs.n;
  fusion.addOutput(tv_avg);
  fusion.addOutput(tv_M2);
  fusion.addOutput(tv_N);
  tv1->setMemoryType(MemoryType::Shared);
  tv_avg->split(0, 4);
  tv_avg->split(-1, 8);
  tv_avg->reorder({{1, 2}});
  tv_avg->merge(-2, -1);
  TransformPropagator::from(tv_avg);
  tv2->computeAt(tv_avg, -1);
  tv3->computeAt(tv_avg, -1);
  tv_avg->axis(-1)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv_avg, ir_utils::allTvs(&fusion));
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Normalize var_sum (M2) by the valid-element count to get variance,
  // so it can be compared against at::var below.
  outputs[1] /= (numel_x - 2) * (numel_y - 2);
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {1, -1});
  auto t3 = shift(t1, {-1, 1});
  auto t4 = t2 + t3;
  // Only the interior region is valid without padding.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
  t4 = t4.index(indices);
  auto ref_avg = t4.mean(at::ArrayRef<int64_t>{0, 1});
  auto ref_M2 = t4.var(at::ArrayRef<int64_t>{0, 1}, false);
  auto ref_N = at::ones({}, options_int) * (numel_x - 2) * (numel_y - 2);
  testValidate(
      &fusion, outputs, inputs, {ref_avg, ref_M2, ref_N}, __LINE__, __FILE__);
}
// Shift indexing and predication with contiguous merge: tv2 is a padded
// shift while tv3 is non-padded, and all three intermediate tensors are
// merged and kept in global memory. Only the interior of the output is
// compared since tv3 leaves the boundary undefined.
TEST_F(NVFuserTest, FusionShiftNoPaddingContigMerge_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  // Padded shift (boundary filled) ...
  auto tv2 = shift(tv1, {1, -1}, true);
  // ... and non-padded shift (partial output domain).
  auto tv3 = shift(tv1, {-1, 1}, false);
  auto tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);
  tv2->merge(0);
  tv3->merge(0);
  tv4->merge(0);
  tv1->setMemoryType(MemoryType::Global);
  tv2->setMemoryType(MemoryType::Global);
  tv3->setMemoryType(MemoryType::Global);
  int numel_x = 9;
  int numel_y = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Compare interiors only; the boundary of the fusion output is not
  // well-defined because of the non-padded shift.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(1, -1), at::indexing::Slice(1, -1)};
  auto fuser_out = outputs[0].index(indices);
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {1, -1});
  auto t3 = shift(t1, {-1, 1});
  auto ref = t2 + t3;
  ref = ref.index(indices);
  testValidate(&fusion, {fuser_out}, inputs, {ref}, __LINE__, __FILE__);
}
// Chain of two non-padded shifts in the same direction. Each {1, -1}
// shift moves the valid region, so after two shifts the valid slice is
// rows [2:] and columns [:-2].
TEST_F(NVFuserTest, FusionShiftNoPaddingChain_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = shift(tv1, {1, -1}, false);
  auto tv3 = shift(tv2, {1, -1}, false);
  auto tv4 = sum(tv3, {0, 1});
  fusion.addOutput(tv4);
  // Both producers are consumed at shifted offsets across threads.
  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Shared);
  tv4->split(0, 4);
  tv4->split(-1, 8);
  tv4->reorder({{1, 2}});
  tv1->computeAt(tv4, 2);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-2)->parallelize(ParallelType::TIDy);
  tv4->axis(0)->parallelize(ParallelType::BIDy);
  tv4->axis(1)->parallelize(ParallelType::BIDx);
  scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {1, -1});
  auto t3 = shift(t2, {1, -1});
  // Valid region after two {1, -1} shifts: rows [2:], columns [:-2].
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(2, at::indexing::None), at::indexing::Slice(0, -2)};
  t3 = t3.index(indices);
  auto ref = t3.sum(at::ArrayRef<int64_t>{0, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// rFactor must be rejected on a reduction whose domain is partial
// (produced by a non-padded shift); this test only checks that the
// rFactor call throws.
TEST_F(NVFuserTest, FusionShiftNoPaddingRfactor_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto input_tv = makeSymbolicTensor(2);
  fusion.addInput(input_tv);
  auto plus_one = add(input_tv, IrBuilder::create<Double>(1));
  // Non-padded shift -> partial output domain.
  auto shifted = shift(plus_one, {1, -1}, false);
  auto reduced = sum(shifted, {0, 1});
  fusion.addOutput(reduced);
  reduced->split(0, 4);
  reduced->split(-1, 8);
  reduced->reorder({{1, 2}});
  // Attempting to rFactor a partial reduction domain must fail.
  ASSERT_ANY_THROW(reduced->rFactor({-2}));
}
// Shift with explicit per-side pad widths: tv2 pads {1, 1} for offsets
// {2, -2} and tv3 pads {2, 2} for offsets {-3, 2}, leaving a valid
// region of rows [1:-1] and columns [0:-1].
TEST_F(NVFuserTest, FusionShiftPadding1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  // shift(tv, offsets, pad_widths): padding narrower than the offset
  // leaves part of the boundary undefined.
  auto tv2 = shift(tv1, {2, -2}, {1, 1});
  auto tv3 = shift(tv1, {-3, 2}, {2, 2});
  auto tv4 = add(tv2, tv3);
  auto tv5 = sum(tv4, {0, 1});
  fusion.addOutput(tv5);
  tv1->setMemoryType(MemoryType::Shared);
  tv5->split(0, 4);
  tv5->split(-1, 8);
  tv5->reorder({{1, 2}});
  TransformPropagator::from(tv5);
  tv2->computeAt(tv5, -1);
  tv3->computeAt(tv5, -1);
  tv5->axis(-1)->parallelize(ParallelType::TIDx);
  tv5->axis(-2)->parallelize(ParallelType::TIDy);
  scheduler_utils::parallelizeAllLike(tv5, ir_utils::allTvs(&fusion));
  int numel_x = 99;
  int numel_y = 101;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  auto t1 = t0 + 1;
  auto t2 = shift(t1, {2, -2});
  auto t3 = shift(t1, {-3, 2});
  auto t4 = t2 + t3;
  // Valid region given the offsets and pad widths above.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(1, -1), at::indexing::Slice(0, -1)};
  t4 = t4.index(indices);
  auto ref = t4.sum(at::ArrayRef<int64_t>{0, 1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Partial split: split only the valid (non-halo) sub-range of a domain
// produced by non-padded shifts, and verify that the outer extent is
// computed from the reduced range rather than the full extent.
TEST_F(NVFuserTest, FusionPartialSplit1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  // [I]
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(0));
  // [I]
  auto tv2 = shift(tv1, {1}, false);
  // [1:I]
  auto tv3 = shift(tv1, {-1}, false);
  // [0:I-1]
  auto tv4 = add(tv2, tv3);
  // [1:I-1]
  fusion.addOutput(tv4);
  // Partial split of tv4. Split only the valid range, which is
  // [1:-1]. The last two arguments request start/stop offsets for the
  // partial split.
  tv4->split(0, 8, true, true);
  // [(I-2)/8, 8]
  // Propagates the partial split back to tv1. This means that all of
  // the other tensors are also shaped as [(I-2)/8, 8], which appears
  // to mean only the sub region of ((I-2)/8 * 8) is
  // computed for tv1, tv2 and tv3. It's fine for the tv2 and tv3
  // tensors as only that sub region is used by tv4. It's also fine
  // for tv1 since it has halo of size one at each side, so the whole
  // region is actually calculated for tv1.
  tv1->computeAt(tv4, 1);
  tv4->axis(-1)->parallelize(ParallelType::TIDx);
  tv4->axis(-2)->parallelize(ParallelType::BIDx);
  scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});
  tv1->setMemoryType(MemoryType::Shared);
  // gridDim.x is ceilDiv(numel_x - 2, 8), not ceilDiv(numel_x, 8),
  // so it's going to be just 2 rather than 3.
  const int numel_x = 18;
  // Verify the outer extent symbolically before running anything.
  ExpressionEvaluator evaluator(&fusion);
  auto root_extent = tv4->getRootDomain()[0]->extent();
  evaluator.bind(root_extent, numel_x);
  auto extent_eval = evaluator.evaluate(tv4->axis(0)->extent());
  TORCH_CHECK(
      extent_eval.has_value(),
      "Invalid evaluation of outer domain extent of partial split");
  TORCH_CHECK(
      extent_eval.value() == (numel_x - 2) / 8,
      "Invalid extent of outer domain of partial split");
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Only the interior [1:-1] of the output is valid.
  std::vector<at::indexing::TensorIndex> indices{at::indexing::Slice(1, -1)};
  outputs[0] = outputs[0].index(indices);
  auto ref = (shift(t0, {1}) + shift(t0, {-1})).index(indices);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Negative test for partial split: a partial split must not be
// propagated to tensors that need to be computed over their entire
// domain. Lowering is expected to reject this fusion.
TEST_F(NVFuserTest, FusionPartialSplit2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto in_tv = makeSymbolicTensor(1);
  fusion.addInput(in_tv);
  auto base = add(in_tv, IrBuilder::create<Double>(0));
  auto shift_pos = shift(base, {1}, false);
  auto shift_neg = shift(base, {-1}, false);
  auto shift_sum = add(shift_pos, shift_neg);
  fusion.addOutput(shift_sum);
  // A second branch off of base that must be computed entirely.
  auto branch_a = add(base, IrBuilder::create<Double>(1));
  auto branch_b = add(branch_a, IrBuilder::create<Double>(1));
  fusion.addOutput(branch_b);
  // Partial split of the shift-sum output.
  shift_sum->split(0, 4, true, true);
  // Propagating the partial offsets through base would also constrain
  // branch_a and branch_b, which require the full domain, so the
  // resulting kernel would be invalid. Lowering must detect this.
  base->computeAt(shift_sum, 1);
  // Validation should throw an error due to branch_a and branch_b.
  ASSERT_ANY_THROW(fusion.printKernel());
}
// 2D version of PartialSplit1: partial splits on both dimensions with
// asymmetric shift offsets, parallelized over a 2D grid and block.
TEST_F(NVFuserTest, FusionPartialSplit3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(0));
  auto tv2 = shift(tv1, {1, 2}, false);
  auto tv3 = shift(tv1, {-2, -1}, false);
  auto tv4 = add(tv2, tv3);
  fusion.addOutput(tv4);
  // Partial splits over the valid sub-ranges of both dimensions.
  tv4->split(1, 8, true, true);
  tv4->split(0, 4, true, true);
  tv4->reorder({{1, 2}, {2, 1}});
  tv1->computeAt(tv4, 2);
  tv4->axis(0)->parallelize(ParallelType::BIDy);
  tv4->axis(1)->parallelize(ParallelType::BIDx);
  tv4->axis(2)->parallelize(ParallelType::TIDy);
  tv4->axis(3)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3});
  tv1->setMemoryType(MemoryType::Shared);
  const int numel_x = 32 + 3;
  const int numel_y = 32 + 3;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto options_int = at::TensorOptions().dtype(at::kLong).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Valid region given shifts {1, 2} and {-2, -1}: rows [1:-2],
  // columns [2:-1].
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(1, -2), at::indexing::Slice(2, -1)};
  outputs[0] = outputs[0].index(indices);
  auto ref = (shift(t0, {1, 2}) + shift(t0, {-2, -1})).index(indices);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Almost same fusion with Shift5ptStencilChain but non-padded shift
// and partial split: two chained 5-point stencils where only the valid
// interior (input minus a halo of 2 per side) is split and computed.
TEST_F(NVFuserTest, FusionPartialSplit4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  std::vector<std::vector<int>> offsets = {{-1, 0}, {1, 0}, {0, -1}, {0, 1}};
  // First stencil: 5pt stencil
  // stencil1 = (tv0 + tv0[+1][0] + tv0[-1][0] + tv0[0][+1] + tv0[0][-1]) / 5
  std::vector<TensorView*> tv_stencil1_shifts;
  for (const auto& offset : offsets) {
    tv_stencil1_shifts.push_back(shift(tv0, offset, false));
  }
  auto tv_stencil1 = tv0;
  for (auto tv : tv_stencil1_shifts) {
    tv_stencil1 = add(tv_stencil1, tv);
  }
  tv_stencil1 = div(
      tv_stencil1, IrBuilder::create<Double>(tv_stencil1_shifts.size() + 1));
  // Second stencil: Same 5pt stencil
  std::vector<TensorView*> tv_stencil2_shifts;
  for (const auto& offset : offsets) {
    tv_stencil2_shifts.push_back(shift(tv_stencil1, offset, false));
  }
  auto tv_stencil2 = tv_stencil1;
  for (auto tv : tv_stencil2_shifts) {
    tv_stencil2 = add(tv_stencil2, tv);
  }
  tv_stencil2 = div(
      tv_stencil2, IrBuilder::create<Double>(tv_stencil2_shifts.size() + 1));
  auto tv_out = tv_stencil2;
  fusion.addOutput(tv_out);
  auto tv0_cache = tv0->cache_after();
  std::vector<int> split_factor({16, 16});
  // Partial splits over the valid interior only.
  tv_out->split(-1, split_factor[1], true, true);
  tv_out->split(0, split_factor[0], true, true);
  tv_out->reorder({{1, 2}, {2, 1}});
  tv0->computeAt(tv_out, 2);
  // Inline completely all inputs to the first stencil output, except for the
  // tv0 cache
  for (auto tv : tv_stencil1_shifts) {
    tv->computeAt(tv_stencil1, -1);
  }
  // Inline completely all inputs to the second stencil output, except
  // for the first stencil output
  for (auto tv : tv_stencil2_shifts) {
    tv->computeAt(tv_stencil2, -1);
  }
  tv_out->axis(0)->parallelize(ParallelType::BIDy);
  tv_out->axis(1)->parallelize(ParallelType::BIDx);
  tv_out->axis(2)->parallelize(ParallelType::TIDy);
  tv_out->axis(3)->parallelize(ParallelType::TIDx);
  // Apply the same parallelization to every tensor on the input->output
  // paths.
  auto all_values = DependencyCheck::getAllValsBetween(
      {fusion.inputs().begin(), fusion.inputs().end()}, fusion.outputs());
  for (auto tv : ir_utils::filterByType<TensorView>(all_values)) {
    scheduler_utils::parallelizeAllLike(tv_out, {tv});
  }
  tv0_cache->setMemoryType(MemoryType::Shared);
  tv_stencil1->setMemoryType(MemoryType::Shared);
  // Input matrix size is 68x68, and the output is 64x64. Both
  // gridDim.x and gridim.y should be ceilDiv(numel - 4,
  // split_factor), which is 4. If full split is used, the grid
  // dimension would be 5.
  const int numel_x = 64 + 4;
  const int numel_y = 64 + 4;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Two chained stencils each shrink the valid region by one per side.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(2, -2), at::indexing::Slice(2, -2)};
  outputs[0] = outputs[0].index(indices);
  auto stencil1 = t0;
  for (const auto& offset : offsets) {
    stencil1 = stencil1 + shift(t0, offset);
  }
  stencil1 = stencil1 / int(offsets.size() + 1);
  auto stencil2 = stencil1;
  for (const auto& offset : offsets) {
    stencil2 = stencil2 + shift(stencil1, offset);
  }
  stencil2 = stencil2 / int(offsets.size() + 1);
  auto ref = stencil2.index(indices);
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Mixing a partial split (consumer) with a full split (producer): the
// producer index must be adjusted for the differing split offsets.
TEST_F(NVFuserTest, FusionPartialSplit5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const int numel_x = 10;
  const int numel_y = 11;
  // auto tv0 = makeSymbolicTensor(2);
  auto tv0 = makeConcreteTensor({numel_x, numel_y});
  fusion.addInput(tv0);
  auto tv1 = shift(tv0, {0, 1}, false);
  auto tv2 = add(tv1, IrBuilder::create<Double>(1));
  fusion.addOutput(tv2);
  // Partially split tv2 but not tv1. Producer indexing with tv2 as a consumer
  // requires adjustment of the index to account for the difference of split
  // offsets.
  tv2->split(1, 4, true, true);
  tv1->split(1, 4);
  tv1->computeAt(tv2, 1);
  tv2->axis(1)->parallelize(ParallelType::TIDx);
  tv1->axis(1)->parallelize(ParallelType::TIDx);
  tv1->setMemoryType(MemoryType::Shared);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Only columns [1:] are valid after the non-padded {0, 1} shift.
  std::vector<at::indexing::TensorIndex> indices{
      at::indexing::Slice(0, at::indexing::None),
      at::indexing::Slice(1, at::indexing::None)};
  outputs[0] = outputs[0].index(indices);
  auto ref = (shift(t0, {0, 1}) + 1).index(indices);
  testValidate(&fusion, outputs, {t0}, {ref}, __LINE__, __FILE__);
}
// Mixes full splits (producer and output) with a partial split on the
// intermediate shifted tensor, checking index adjustment across the
// differing split offsets.
TEST_F(NVFuserTest, FusionPartialSplit6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const int numel_x = 9;
  auto in_tv = makeConcreteTensor({numel_x});
  fusion.addInput(in_tv);
  auto pre_shift = add(in_tv, IrBuilder::create<Double>(1));
  auto shifted = shift(pre_shift, {1}, false);
  auto result = add(shifted, IrBuilder::create<Double>(1));
  fusion.addOutput(result);
  // Full split on the producer and output, partial split on the
  // shifted tensor in between.
  pre_shift->split(0, 4);
  shifted->split(0, 4, true, true);
  result->split(0, 4);
  // Shared memory makes out-of-bounds accesses easy for
  // compute-sanitizer to flag.
  pre_shift->setMemoryType(MemoryType::Shared);
  shifted->setMemoryType(MemoryType::Shared);
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_input = at::randn({numel_x}, options);
  std::vector<IValue> inputs = {at_input};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);
  // Only elements [1:] are valid after the non-padded {1} shift.
  std::vector<at::indexing::TensorIndex> valid_range{
      at::indexing::Slice(1, at::indexing::None)};
  cg_outputs[0] = cg_outputs[0].index(valid_range);
  auto ref = (shift(at_input + 1, {1}) + 1).index(valid_range);
  testValidate(&fusion, cg_outputs, {at_input}, {ref}, __LINE__, __FILE__);
}
// Padded shifts under the Unswitch parallel type, in both dimensions and
// both directions, including a shift of a shared-memory intermediate.
TEST_F(NVFuserTest, FusionShiftUnswitch1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);
  auto tv1 = shift(tv0, {-1, 0});
  fusion.addOutput(tv1);
  auto tv2 = shift(tv0, {0, 1});
  fusion.addOutput(tv2);
  auto tv3 = shift(tv0, {2, 2});
  fusion.addOutput(tv3);
  auto tv4 = shift(tv0, {-2, -2});
  fusion.addOutput(tv4);
  // Shift of a computed (non-input) tensor.
  auto tv5 = add(tv0, IrBuilder::create<Double>(1));
  auto tv6 = shift(tv5, {0, -1});
  fusion.addOutput(tv6);
  tv1->axis(1)->parallelize(ParallelType::Unswitch);
  tv2->axis(1)->parallelize(ParallelType::Unswitch);
  tv3->axis(0)->parallelize(ParallelType::Unswitch);
  tv4->axis(0)->parallelize(ParallelType::Unswitch);
  tv5->axis(1)->parallelize(ParallelType::TIDx);
  tv6->axis(1)->parallelize(ParallelType::TIDx);
  tv5->axis(0)->parallelize(ParallelType::Unswitch);
  tv5->setMemoryType(MemoryType::Shared);
  int numel_x = 9;
  int numel_y = 11;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Padded shifts are exact, so use bitwise equality rather than
  // testValidate's approximate comparison.
  auto t1 = shift(t0, {-1, 0});
  TORCH_CHECK(t1.equal(outputs[0]));
  auto t2 = shift(t0, {0, 1});
  TORCH_CHECK(t2.equal(outputs[1]));
  auto t3 = shift(t0, {2, 2});
  TORCH_CHECK(t3.equal(outputs[2]));
  auto t4 = shift(t0, {-2, -2});
  TORCH_CHECK(t4.equal(outputs[3]));
  auto t6 = shift(t0 + 1, {0, -1});
  TORCH_CHECK(t6.equal(outputs[4]));
}
// Gathers with different window sizes under an Unswitch'ed outer loop,
// mixing runtime-variable and literal (static) window/pad parameters.
TEST_F(NVFuserTest, FusionGatherUnswitch1_CUDA) {
  const int tv1_gather = 3;
  const int tv1_gather_pad = 1;
  const int tv2_gather = 5;
  const int tv2_gather_pad = 2;
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = gather(tv0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}});
  fusion.addOutput(tv1);
  auto tv2 = gather(tv0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}});
  fusion.addOutput(tv2);
  // Static gather
  auto tv3 = gather(tv0, {3}, {{1, 1}});
  fusion.addOutput(tv3);
  // Static gather
  auto tv4 = gather(tv0, {5}, {{2, 2}});
  fusion.addOutput(tv4);
  auto tv0_cache = tv0->cache_after();
  tv0_cache->setMemoryType(MemoryType::Shared);
  tv4->split(0, 32);
  tv0->computeAt(tv4, 1);
  tv4->axis(0)->parallelize(ParallelType::Unswitch);
  tv4->axis(1)->parallelize(ParallelType::TIDx);
  const int numel_x = 100;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  // Gather of padded data is exact; compare bitwise.
  auto t1 = gather(t0, {tv1_gather}, {{tv1_gather_pad, tv1_gather_pad}});
  TORCH_CHECK(t1.equal(outputs[0]));
  auto t2 = gather(t0, {tv2_gather}, {{tv2_gather_pad, tv2_gather_pad}});
  TORCH_CHECK(t2.equal(outputs[1]));
  auto t3 = gather(t0, {3}, {{1, 1}});
  TORCH_CHECK(t3.equal(outputs[2]));
  auto t4 = gather(t0, {5}, {{2, 2}});
  TORCH_CHECK(t4.equal(outputs[3]));
}
// Basic strided gather: verifies the extra stride axis in the
// TensorView domain, the dimensionality and per-dimension extents of
// the resulting ATen tensor, and bitwise equality with the reference
// gather helper.
TEST_F(NVFuserTest, FusionGatherStrided1_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);
  const std::vector<int> win_dims = {1, 3};
  const std::vector<std::vector<int>> pad_widths = {{0, 0}, {1, 1}};
  const std::vector<int> stride_widths = {1, 3};
  auto gather_tv = gather(in_tv, win_dims, pad_widths, stride_widths);
  fusion.addOutput(gather_tv);
  const int size0 = 11;
  const int size1 = 13;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_in = at::randn({size0, size1}, options);
  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_in});
  auto cg_results = fe.runFusion({at_in});
  // The TensorView carries one extra stride axis on top of the
  // input and window axes.
  TORCH_CHECK(gather_tv->nDims() == in_tv->nDims() * 2 + 1);
  // The materialized ATen tensor, however, has exactly twice the
  // input dimensionality (input axes + window axes).
  auto result_tensor = cg_results[0];
  TORCH_CHECK(
      result_tensor.ndimension() == in_tv->nDims() * 2,
      "Invalid dimensionality of output tensor: ",
      result_tensor.ndimension());
  // Every output extent must equal
  // ceilDiv(input_size + pads - window + 1, stride).
  for (const auto dim : c10::irange(win_dims.size())) {
    auto expected_extent = ceilDiv(
        at_in.size(dim) + pad_widths[dim][0] + pad_widths[dim][1] -
            win_dims[dim] + 1,
        stride_widths[dim]);
    auto actual_extent = cg_results[0].size(dim);
    TORCH_CHECK(
        expected_extent == actual_extent,
        "Invalid output size at dimension ",
        dim,
        ". Expected: ",
        expected_extent,
        ", actual: ",
        actual_extent);
  }
  // Gather is exact; compare bitwise against the reference helper.
  auto ref = gather(at_in, win_dims, pad_widths, stride_widths);
  TORCH_CHECK(ref.equal(cg_results[0]));
}
// Split strided domain: split the post-stride domain of a strided
// gather and let computeAt propagate the split (scaled by the stride)
// back to the pre-stride producers.
TEST_F(NVFuserTest, FusionGatherStrided2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  const std::vector<int> window_shape = {3};
  const std::vector<std::vector<int>> padding_width = {{1, 1}};
  const std::vector<int> strides = {3};
  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);
  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = gather(tv1, window_shape, padding_width, strides);
  auto tv3 = sum(tv2, {-1});
  fusion.addOutput(tv3);
  // Split the strided domain
  tv3->split(0, 4);
  // Propagate the split by 4 of the tv3 domain to pre-stride domains,
  // making them split by 4 * 3
  tv0->computeAt(tv3, 1);
  tv2->computeAt(tv3, -1);
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});
  tv1->setMemoryType(MemoryType::Shared);
  const int s1 = 100;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({s1}, options);
  std::vector<IValue> inputs = {t0};
  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);
  auto t1 = t0 + 1;
  auto t2 = gather(t1, window_shape, padding_width, strides);
  auto ref = sum(t2, {-1});
  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Outer split
TEST_F(NVFuserTest, FusionGatherStrided3_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1D gather: window of 3, padding of 1 on each side, stride of 3.
  const std::vector<int> window_shape = {3};
  const std::vector<std::vector<int>> padding_width = {{1, 1}};
  const std::vector<int> strides = {3};

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = gather(tv1, window_shape, padding_width, strides);
  auto tv3 = sum(tv2, {-1});
  fusion.addOutput(tv3);

  // Outer split (the trailing `false` requests an outer split, so the
  // factor 2 becomes the extent of the outer resulting domain).
  tv3->split(0, 2, false);

  tv0->computeAt(tv3, 1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});

  // Both intermediates are staged in shared memory; note tv2 gets no
  // computeAt here, unlike GatherStrided2.
  tv1->setMemoryType(MemoryType::Shared);
  tv2->setMemoryType(MemoryType::Shared);

  const int s1 = 100;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({s1}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference computation on ATen tensors.
  auto t1 = t0 + 1;
  auto t2 = gather(t1, window_shape, padding_width, strides);
  auto ref = sum(t2, {-1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionGatherStrided4_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1D gather: window of 3, padding of 1 on each side, stride of 3.
  const std::vector<int> window_shape = {3};
  const std::vector<std::vector<int>> padding_width = {{1, 1}};
  const std::vector<int> strides = {3};

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));

  // Test propagation of split from one gather output to another.
  // tv2 and tv3 use identical gather parameters.
  auto tv2 = gather(tv1, window_shape, padding_width, strides);
  auto tv3 = gather(tv1, window_shape, padding_width, strides);

  auto tv4 = sum(tv2, {-1});
  fusion.addOutput(tv4);
  auto tv5 = sum(tv3, {-1});
  fusion.addOutput(tv5);

  tv4->split(0, 2);

  // Test forward computeAt propagation from tv1 to tv3
  tv0->computeAt(tv4, 1);

  const int s1 = 101;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({s1}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  auto t1 = t0 + 1;
  auto t2 = gather(t1, window_shape, padding_width, strides);
  auto ref = sum(t2, {-1});

  // Both fusion outputs compute the same value, so each is validated
  // against the same reference.
  testValidate(&fusion, outputs, inputs, {ref, ref}, __LINE__, __FILE__);
}
// Same as GatherStrided1 but with stride != window
TEST_F(NVFuserTest, FusionGatherStrided5_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 2D input gathered with a 1x3 window; the inner dimension is padded
  // by one on each side and strided by 2 (window is 3, so windows
  // overlap).
  auto in_tv = makeSymbolicTensor(2);
  fusion.addInput(in_tv);

  const std::vector<int> window = {1, 3};
  const std::vector<std::vector<int>> pad = {{0, 0}, {1, 1}};
  const std::vector<int> stride = {1, 2};

  auto out_tv = gather(in_tv, window, pad, stride);
  fusion.addOutput(out_tv);

  const int size_outer = 11;
  const int size_inner = 13;
  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_in = at::randn({size_outer, size_inner}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_in});
  auto outputs = fe.runFusion({at_in});

  // No scheduling applied, so the output must match the reference
  // gather bit-for-bit.
  auto at_ref = gather(at_in, window, pad, stride);
  TORCH_CHECK(at_ref.equal(outputs[0]));
}
// Same as GatherStrided2 but with stride != window
TEST_F(NVFuserTest, FusionGatherStrided6_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1D gather: window of 3, padding of 1 on each side, stride of 2
  // (smaller than the window, so consecutive windows overlap).
  const std::vector<int> window_shape = {3};
  const std::vector<std::vector<int>> padding_width = {{1, 1}};
  const std::vector<int> strides = {2};

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = gather(tv1, window_shape, padding_width, strides);
  auto tv3 = sum(tv2, {-1});
  fusion.addOutput(tv3);

  // Split the strided domain
  tv3->split(0, 4);

  // Propagate the split by 4 of the tv3 domain to pre-stride domains,
  // making them split by 4 * 2
  tv0->computeAt(tv3, 1);
  tv2->computeAt(tv3, -1);

  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});

  // tv1 is staged in shared memory so its values are visible across
  // the thread block.
  tv1->setMemoryType(MemoryType::Shared);

  const int s1 = 100;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({s1}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference computation on ATen tensors.
  auto t1 = t0 + 1;
  auto t2 = gather(t1, window_shape, padding_width, strides);
  auto ref = sum(t2, {-1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Same as GatherStrided4 but different strides
TEST_F(NVFuserTest, FusionGatherStrided7_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int> window = {3};
  const std::vector<std::vector<int>> pad = {{1, 1}};

  auto in_tv = makeSymbolicTensor(1);
  fusion.addInput(in_tv);

  auto plus_one = add(in_tv, IrBuilder::create<Double>(1));

  // Two gathers from the same producer but with mismatched stride
  // factors: 3 for the first, 2 for the second.
  auto tile_s3 = gather(plus_one, window, pad, {3});
  auto tile_s2 = gather(plus_one, window, pad, {2});

  auto red_s3 = sum(tile_s3, {-1});
  fusion.addOutput(red_s3);
  auto red_s2 = sum(tile_s2, {-1});
  fusion.addOutput(red_s2);

  red_s3->split(0, 2);

  // Since the second gather has a different stride factor, propagating
  // the computeAt position from the input must fail.
  ASSERT_ANY_THROW(in_tv->computeAt(red_s3, 1));
}
// Same as GatherStrided2 but with unswitch
TEST_F(NVFuserTest, FusionGatherStrided8_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  // 1D gather: window of 3, padding of 1 on each side, stride of 3.
  const std::vector<int> window_shape = {3};
  const std::vector<std::vector<int>> padding_width = {{1, 1}};
  const std::vector<int> strides = {3};

  auto tv0 = makeSymbolicTensor(1);
  fusion.addInput(tv0);

  auto tv1 = add(tv0, IrBuilder::create<Double>(1));
  auto tv2 = gather(tv1, window_shape, padding_width, strides);
  auto tv3 = sum(tv2, {-1});
  fusion.addOutput(tv3);

  const int tidx = 32;

  // Split the strided domain
  tv3->split(0, tidx);
  // Split for unswitch
  tv3->split(0, 1);

  tv0->computeAt(tv3, 2);
  tv2->computeAt(tv3, -1);

  // [BIDx, Unswitch(1), TIDx(tidx)]
  tv3->axis(0)->parallelize(ParallelType::BIDx);
  tv3->axis(1)->parallelize(ParallelType::Unswitch);
  tv3->axis(2)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv3, {tv1, tv2});

  // tv1 is staged in shared memory so its values are visible across
  // the thread block.
  tv1->setMemoryType(MemoryType::Shared);

  // 1023 / 3 = 341 strided elements, which is not a multiple of tidx
  // (32), so the unswitched predicates are exercised.
  const int s1 = 1023;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({s1}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference computation on ATen tensors.
  auto t1 = t0 + 1;
  auto t2 = gather(t1, window_shape, padding_width, strides);
  auto ref = sum(t2, {-1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
// Chained strided gather. Not supported yet.
TEST_F(NVFuserTest, FusionGatherStridedChain_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  const std::vector<int> window = {3};
  const std::vector<std::vector<int>> pad = {{1, 1}};
  const std::vector<int> stride = {3};

  auto in_tv = makeSymbolicTensor(1);
  fusion.addInput(in_tv);

  auto plus_one = add(in_tv, IrBuilder::create<Double>(1));

  // First strided gather, then reduce the gathered window axis.
  auto tile1 = gather(plus_one, window, pad, stride);
  auto red1 = sum(tile1, {-1});

  // Apply a second strided gather to the already-strided result.
  auto tile2 = gather(red1, window, pad, stride);
  auto red2 = sum(tile2, {-1});
  fusion.addOutput(red2);

  // This should throw an error at HaloInfo::build.
  ASSERT_ANY_THROW(GpuLower gpulw(&fusion));
}
TEST_F(NVFuserTest, FusionMaxPoolingStrided_CUDA) {
  // Strided max pooling implemented as a gather followed by a max
  // reduction over the gathered window.
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Input: CHW
  // Pooling window: 3x3
  // Strides: 3
  // Padding: 1 at each end of the inner 2 dimensions

  // [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);

  // [C, H/3, W/3, 1, 3, 3]
  auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3});

  // [C, H/3, W/3]
  // Max-reduce the gathered 1x3x3 window; the init value is the lowest
  // float so any input value wins.
  auto max_tensor = reductionOp(
      BinaryOpType::Max,
      {-3, -2, -1},
      IrBuilder::create<Double>(std::numeric_limits<float>::lowest()),
      inp_tile);
  fusion.addOutput(max_tensor);

  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();

  // Tiling the spatial domain
  const int tile_x = 32;
  const int tile_y = 8;

  max_tensor->split(1, tile_y);
  max_tensor->split(3, tile_x);
  max_tensor->reorder({{2, 3}});
  // [C, H/tile_y, W/tile_x, tile_y, tile_x]

  // Inner split by 1 to create the axis that is unswitched below.
  max_tensor->split(2, 1);
  // [C, H/tile_y, W/tile_x, 1, tile_y, tile_x]

  inp->computeAt(max_tensor, 4);

  max_tensor->axis(0)->parallelize(ParallelType::BIDx);
  max_tensor->axis(3)->parallelize(ParallelType::Unswitch);
  max_tensor->axis(4)->parallelize(ParallelType::TIDy);
  max_tensor->axis(5)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(max_tensor, ir_utils::allTvs(&fusion));

  inp_cache->setMemoryType(MemoryType::Shared);

  const int hw = 50;
  const int num_channels = 20;
  const int pooling_window = 3;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_inp = at::randn({num_channels, hw, hw}, options);
  // We always pad inputs by zero, so if all surrounding values are
  // negative, max pooling would pick a padded value, which isn't the
  // correct behavior. We need to be able to choose the value of
  // padding. In this case, padding by the minimum value would not
  // have this problem. For now, avoid the problem by making sure all
  // values are not negative.
  aten_inp = at::abs(aten_inp);
  std::vector<IValue> inputs = {aten_inp};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // ATen reference: 3x3 window, stride 3, padding 1, matching the
  // gather parameters above.
  auto ref = at::max_pool2d(
      aten_inp, {pooling_window, pooling_window}, {3, 3}, {1, 1});

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionConv2DStaticStrided_CUDA) {
  // 3x3 convolution with a static stride of 3, expressed as
  // gather + broadcast + multiply + sum-reduction.
  Fusion fusion;
  FusionGuard fg(&fusion);

  // Input: [C, H, W]
  auto inp = makeSymbolicTensor(3);
  fusion.addInput(inp);

  // Weights: [K, C, 3, 3]
  auto w = makeSymbolicTensor(4);
  fusion.addInput(w);

  // Gather a neighbor tile of [3, 3] with padding size of 1 for each
  // side of the spatial dimensions
  auto inp_tile = gather(inp, {1, 3, 3}, {{0, 0}, {1, 1}, {1, 1}}, {1, 3, 3});
  // inp_tile: [C, H/3, s3, W/3, s3, 1, 3, 3]

  // Broadcast the input along the output-channel (K) axis and the
  // weights along the spatial axes so they can be multiplied together.
  auto inp_bc =
      broadcast(inp_tile, {true, false, false, false, false, false, false});
  auto w_bc = broadcast(w, {false, false, true, true, true, false, false});

  auto inp_times_w = mul(inp_bc, w_bc);

  // Reduce the channel and neighbor tile dimensions
  auto out = sum(inp_times_w, {1, 4, 5, 6});
  fusion.addOutput(out);

  ////////////////////////////////////
  // Cache the input and weight tensors
  auto inp_cache = inp->cache_after();

  // Blocking the spatial dimensions
  const int block_w = 16;
  const int block_h = 4;
  // Blocking of the channel dimension
  const int block_c = 2;

  // [K, C, H/s, W/s, 1, 3, 3]
  out->split(2, block_h);
  // [K, C, H/s/block_h, block_h, W/s, 1, 3, 3]
  out->split(4, block_w);
  // [K, C, H/s/block_h, block_h, W/s/block_w, block_w, 1, 3, 3]
  out->reorder({{3, 4}});
  // [K, C, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3, 3]
  out->split(1, block_c);
  // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, block_h, block_w, 1, 3,
  // 3]
  out->split(4, 1);
  // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
  // 3, 3]

  // Factor the serial reduction axes (C/block_c and the window axes)
  // out into a partial-reduction tensor.
  auto out_rf = out->rFactor({1, -3, -2, -1});
  // [K, C/block_c, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, 1,
  // 3, 3]
  // out: [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w]

  inp_cache->computeAt(out, 5);
  inp_cache->setMemoryType(MemoryType::Shared);

  // [K, block_c, H/s/block_h, W/s/block_w, 1, block_h, block_w, C/block_c, 1,
  // 3, 3]
  // Move C/block_c before block_h/2 and share the domain from
  // inp_cache to out_rf
  out_rf->reorder({{7, 5}, {5, 6}, {6, 7}});
  inp_cache->computeAt(out_rf, 6);

  inp_tile->computeAt(out_rf, -1);
  w->computeAt(out_rf, -1);

  out->axis(0)->parallelize(ParallelType::BIDx);
  out->axis(1)->parallelize(ParallelType::TIDz);
  out->axis(4)->parallelize(ParallelType::Unswitch);
  out->axis(5)->parallelize(ParallelType::TIDy);
  out->axis(6)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(out, {inp_cache, out_rf});

  const int dim_h = 99;
  const int dim_w = 101;
  const int dim_c = 10;
  const int dim_f = 20;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);
  at::Tensor at_inp = at::randn({dim_c, dim_h, dim_w}, options);
  at::Tensor at_w = at::randn({dim_f, dim_c, 3, 3}, options);
  std::vector<IValue> inputs = {at_inp, at_w};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto cg_outputs = fe.runFusion(inputs);

  // ATen reference: stride 3, padding 1, matching the gather above.
  at_inp = at_inp.unsqueeze(0); // at::conv2d needs the N axis
  auto at_out = at::conv2d(at_inp, at_w, {}, 3, 1);
  at_out = at_out.squeeze(0); // drop the N axis

  testValidate(&fusion, cg_outputs, inputs, {at_out}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionNonDivisibleHalo1_CUDA) {
  // Shift by -1 with a nested split whose inner factor (3) does not
  // evenly divide the split extent (8).
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto in_tv = makeSymbolicTensor(1);
  fusion.addInput(in_tv);
  auto plus_one = add(in_tv, IrBuilder::create<Double>(1));
  auto shifted = shift(plus_one, {-1});
  fusion.addOutput(shifted);

  // [I] -> [I/8, 8] -> [I/8, 3, 3]
  shifted->split(0, 8);
  shifted->split(1, 3);
  in_tv->computeAt(shifted, -2);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor at_in = at::randn({24}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {at_in});
  auto cg_outputs = fe.runFusion({at_in});

  auto ref = shift((at_in + 1), {-1});

  testValidate(&fusion, cg_outputs, {at_in}, {ref}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionNonDivisibleHalo2_CUDA) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  // 9-point stencil: 3x3 gather with 1-element padding on each side,
  // summed over the window, added back to the input, then fully
  // reduced to a scalar.
  auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}});
  auto tv2 = sum(tv1, {-2, -1});
  auto tv3 = add(tv0, tv2);
  auto tv4 = sum(tv3, {0, 1});
  fusion.addOutput(tv4);

  // Tile sizes chosen so splits are non-divisible: 50 % 8 != 0,
  // 50 % 16 != 0, and the input extents (111, 222) are not multiples
  // of 50.
  const int gy = 50;
  const int gx = 50;
  const int by = 8;
  const int bx = 16;

  auto tv5 = tv0->cache_after();

  // [I, J]
  tv4->split(0, gy);
  // [I/gy, gy, J]
  tv4->split(1, by);
  // [I/gy, gy/by, by, J]
  tv4->split(-1, gx);
  // [I/gy, gy/by, by, J/gx, gx]
  tv4->split(-1, bx);
  // [I/gy, gy/by, by, J/gx, gx/bx, bx]
  tv4->reorder({{3, 1}, {1, 2}, {4, 3}, {2, 4}});
  // [I/gy, J/gx, gy/by, gx/bx, by, bx]

  // rFactor axes 2 and 3 (gy/by, gx/bx) into a partial-reduction
  // tensor; tv4 then reduces the remaining axes.
  auto tv6 = tv4->rFactor({2, 3});

  tv0->computeAt(tv6, 4);

  tv4->axis(0)->parallelize(ParallelType::BIDy);
  tv4->axis(1)->parallelize(ParallelType::BIDx);
  tv4->axis(2)->parallelize(ParallelType::TIDy);
  tv4->axis(3)->parallelize(ParallelType::TIDx);
  scheduler_utils::parallelizeAllLike(tv4, {tv1, tv2, tv3, tv5, tv6});

  // The input cache is staged in shared memory.
  tv5->setMemoryType(MemoryType::Shared);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({111, 222}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0});
  auto cg_outputs = fe.runFusion({t0});

  // Reference computation; sum({-2, -1}) is equivalent to sum({0, 1})
  // for the 2D t3.
  auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}});
  auto t2 = t1.sum({-2, -1});
  auto t3 = t0 + t2;
  auto t4 = t3.sum({-2, -1});

  testValidate(&fusion, cg_outputs, {t0}, {t4}, __LINE__, __FILE__);
}
TEST_F(NVFuserTest, FusionGather9ptStencilDoubleBuffering_CUDA) {
  // 9-point box-filter stencil (3x3 gather, sum, divide by 9) with the
  // shared-memory input cache double-buffered.
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeSymbolicTensor(2);
  fusion.addInput(tv0);

  auto tv1 = gather(tv0, {3, 3}, {{1, 1}, {1, 1}});
  auto tv2 = sum(tv1, {-2, -1});
  auto tv3 = div(tv2, IrBuilder::create<Double>(9));
  auto out = tv3;
  fusion.addOutput(out);

  auto tv0_cache = tv0->cache_after();
  tv0_cache->setMemoryType(MemoryType::Shared);

  // Tile the 2D output: after the two splits and the axis swap the
  // domain is [H/4, W/32, 4, 32].
  out->split(-2, 4);
  out->split(-1, 32);
  out->reorder({{1, 2}, {2, 1}});

  TransformPropagator::from(out);

  tv0->computeAt(out, 2);

  // [BIDx, serial, TIDy(4), TIDx(32)]
  out->axis(3)->parallelize(ParallelType::TIDx);
  out->axis(2)->parallelize(ParallelType::TIDy);
  out->axis(0)->parallelize(ParallelType::BIDx);
  scheduler_utils::parallelizeAllLike(out, ir_utils::allTvs(&fusion));

  // Double-buffer the shared-memory cache across iterations of the
  // serial tile loop.
  tv0_cache->doubleBuffer();

  int numel_x = 99;
  int numel_y = 101;

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({numel_x, numel_y}, options);
  std::vector<IValue> inputs = {t0};

  FusionExecutor fe;
  fe.compileFusion(&fusion, inputs);
  auto outputs = fe.runFusion(inputs);

  // Reference computation on ATen tensors.
  auto t1 = gather(t0, {3, 3}, {{1, 1}, {1, 1}});
  auto t2 = sum(t1, {-2, -1});
  auto t3 = t2 / 9;
  auto ref = t3;

  testValidate(&fusion, outputs, inputs, {ref}, __LINE__, __FILE__);
}
} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)