mirror of
https://github.com/saymrwulf/pytorch.git
synced 2026-05-15 21:00:47 +00:00
Updating nvfuser code base. This should fix the indexing issue observed in https://github.com/pytorch/vision/issues/6015. Running tests locally as well. Will update the description here at a later point @bypass-github-export-checks Pull Request resolved: https://github.com/pytorch/pytorch/pull/77471 Approved by: https://github.com/seemethere, https://github.com/eellison
408 lines
13 KiB
C++
408 lines
13 KiB
C++
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
|
#include <torch/csrc/jit/codegen/cuda/executor.h>
|
|
#include <torch/csrc/jit/codegen/cuda/fusion.h>
|
|
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
|
|
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
|
|
|
|
#include <benchmark/benchmark.h>
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <benchmarks/cpp/nvfuser/utils.h>
|
|
|
|
using namespace torch::jit::fuser::cuda;
|
|
|
|
static void setupSBR(Fusion* fusion, DataType dtype) {
|
|
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
|
|
|
|
FusionGuard fg(fusion);
|
|
|
|
const size_t kNumberOfDims = 4;
|
|
|
|
std::vector<int64_t> bcast_shape(kNumberOfDims, 1);
|
|
bcast_shape[bcast_shape.size() - 1] = -1;
|
|
|
|
std::vector<bool> bcast_contig(kNumberOfDims, false);
|
|
bcast_contig[bcast_contig.size() - 1] = true;
|
|
|
|
auto x = makeContigTensor(kNumberOfDims, dtype);
|
|
|
|
auto scale = TensorViewBuilder()
|
|
.contiguity(bcast_contig)
|
|
.shape(bcast_shape)
|
|
.dtype(dtype)
|
|
.build();
|
|
|
|
auto bias = TensorViewBuilder()
|
|
.contiguity(bcast_contig)
|
|
.shape(bcast_shape)
|
|
.dtype(dtype)
|
|
.build();
|
|
|
|
fusion->addInput(x);
|
|
fusion->addInput(scale);
|
|
fusion->addInput(bias);
|
|
|
|
if (dtype == DataType::Half) {
|
|
x = castOp(DataType::Float, x);
|
|
scale = castOp(DataType::Float, scale);
|
|
bias = castOp(DataType::Float, bias);
|
|
}
|
|
|
|
auto scale_bias = add(mul(x, scale), bias);
|
|
auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);
|
|
|
|
if (dtype == DataType::Half) {
|
|
scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
|
|
}
|
|
fusion->addOutput(scale_bias_relu);
|
|
}
|
|
|
|
static void setupSBRNorm(Fusion* fusion, DataType dtype) {
|
|
TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
|
|
FusionGuard fg(fusion);
|
|
|
|
const size_t kNumberOfDims = 4;
|
|
|
|
auto x = makeContigTensor(kNumberOfDims, dtype);
|
|
auto weight = makeContigTensor(1, dtype);
|
|
auto bias = makeContigTensor(1, dtype);
|
|
auto mean = makeContigTensor(1, dtype);
|
|
auto var = makeContigTensor(1, dtype);
|
|
|
|
fusion->addInput(x);
|
|
fusion->addInput(weight);
|
|
fusion->addInput(bias);
|
|
fusion->addInput(mean);
|
|
fusion->addInput(var);
|
|
|
|
std::vector<bool> broadcast_mask(kNumberOfDims, true);
|
|
broadcast_mask[broadcast_mask.size() - 1] = false;
|
|
|
|
if (dtype == DataType::Half) {
|
|
x = castOp(DataType::Float, x);
|
|
weight = castOp(DataType::Float, weight);
|
|
bias = castOp(DataType::Float, bias);
|
|
mean = castOp(DataType::Float, mean);
|
|
var = castOp(DataType::Float, var);
|
|
}
|
|
|
|
auto rsqrt = unaryOp(UnaryOpType::Rsqrt, var);
|
|
auto this_scale = mul(weight, rsqrt);
|
|
auto this_bias = mul(sub(bias, mean), this_scale);
|
|
|
|
auto bcast_scale = broadcast(this_scale, broadcast_mask);
|
|
auto bcast_bias = broadcast(this_bias, broadcast_mask);
|
|
|
|
auto scale_bias = add(mul(x, bcast_scale), bcast_bias);
|
|
auto scale_bias_relu = unaryOp(UnaryOpType::Relu, scale_bias);
|
|
|
|
if (dtype == DataType::Half) {
|
|
scale_bias_relu = castOp(DataType::Half, scale_bias_relu);
|
|
}
|
|
|
|
fusion->addOutput(scale_bias_relu);
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
static void NvFuserScheduler_SBR(
|
|
benchmark::State& benchmark_state,
|
|
FusionExecutorCache* fusion_executor_cache,
|
|
DataType dtype) {
|
|
// N, H, W, C format
|
|
std::vector<int64_t> input_shape{
|
|
benchmark_state.range(0),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(2)};
|
|
std::vector<int64_t> bcast_shape{1, 1, 1, -1};
|
|
|
|
// inputs
|
|
at::manual_seed(0);
|
|
std::vector<int64_t> static_bcast_shape{1, 1, 1, benchmark_state.range(2)};
|
|
auto options =
|
|
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
|
|
at::Tensor at_x = at::randn(input_shape, options);
|
|
at::Tensor at_scale = at::ones(static_bcast_shape, options);
|
|
at::Tensor at_bias = at::zeros(static_bcast_shape, options);
|
|
|
|
// inputs
|
|
std::vector<c10::IValue> aten_inputs = {at_x, at_scale, at_bias};
|
|
|
|
fusion_executor_cache->profile(true);
|
|
fusion_executor_cache->runFusionWithInputs(aten_inputs);
|
|
|
|
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
|
|
auto executor_instance = compile_log.fusion_executor;
|
|
TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
|
|
auto params = toString(compile_log.pointwise_params.value());
|
|
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
|
|
|
|
benchmark_state.SetLabel(params + lparams);
|
|
benchmark_state.SetLabel(lparams);
|
|
|
|
fusion_executor_cache->profile(false);
|
|
executor_instance->setMeasureKernelTimeFlag(true);
|
|
// Sync everything up before we start
|
|
cudaDeviceSynchronize();
|
|
for (auto _ : benchmark_state) {
|
|
clearL2Cache();
|
|
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
|
|
benchmark_state.SetIterationTime(
|
|
executor_instance->kernelTimeMs() / 1000.0);
|
|
}
|
|
// Sync everything up before we're finished, don't want to run ahead on the
|
|
// cpu while benchmarking.
|
|
cudaDeviceSynchronize();
|
|
|
|
const size_t size =
|
|
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
|
|
const size_t channels = input_shape[3];
|
|
benchmark_state.SetBytesProcessed(
|
|
int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
|
|
int64_t(dataTypeSize(dtype)));
|
|
}
|
|
|
|
static void Baseline_SBR(benchmark::State& benchmark_state, DataType dtype) {
|
|
// N, H, W, C format
|
|
std::vector<int64_t> input_shape{
|
|
benchmark_state.range(0),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(2)};
|
|
std::vector<int64_t> bcast_shape{benchmark_state.range(2)};
|
|
|
|
// inputs
|
|
at::manual_seed(0);
|
|
auto options =
|
|
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
|
|
at::Tensor at_x = at::randn(input_shape, options);
|
|
at::Tensor at_y = at::randn(input_shape, options);
|
|
at::Tensor at_scale = at::ones(bcast_shape, options);
|
|
at::Tensor at_bias = at::zeros(bcast_shape, options);
|
|
|
|
clearL2Cache();
|
|
cudaDeviceSynchronize();
|
|
for (auto _ : benchmark_state) {
|
|
CudaKernelTimer timer;
|
|
|
|
auto scale = at::mul(at_x, at_scale);
|
|
auto bias = at::add(scale, at_bias);
|
|
auto output = at::relu(bias);
|
|
|
|
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
|
|
cudaDeviceSynchronize();
|
|
clearL2Cache();
|
|
cudaDeviceSynchronize();
|
|
}
|
|
|
|
const size_t size =
|
|
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
|
|
const size_t channels = input_shape[3];
|
|
benchmark_state.SetBytesProcessed(
|
|
int64_t(benchmark_state.iterations()) * (channels * 2 + size * 2) *
|
|
int64_t(dataTypeSize(dtype)));
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
static void NvFuserScheduler_SBR_Norm(
|
|
benchmark::State& benchmark_state,
|
|
FusionExecutorCache* fusion_executor_cache,
|
|
DataType dtype) {
|
|
// N, H, W, C format
|
|
std::vector<int64_t> input_shape{
|
|
benchmark_state.range(0),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(2)};
|
|
std::vector<int64_t> bcast_shape{benchmark_state.range(2)};
|
|
|
|
// inputs
|
|
at::manual_seed(0);
|
|
auto options =
|
|
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
|
|
at::Tensor at_x = at::randn(input_shape, options);
|
|
at::Tensor at_weight = at::ones(bcast_shape, options);
|
|
at::Tensor at_bias = at::zeros(bcast_shape, options);
|
|
at::Tensor at_mean = at::zeros(bcast_shape, options);
|
|
at::Tensor at_var = at::ones(bcast_shape, options);
|
|
|
|
// inputs
|
|
std::vector<c10::IValue> aten_inputs = {
|
|
at_x, at_weight, at_bias, at_mean, at_var};
|
|
|
|
fusion_executor_cache->profile(true);
|
|
fusion_executor_cache->runFusionWithInputs(aten_inputs);
|
|
|
|
auto compile_log = fusion_executor_cache->getMostRecentExecutorInfo();
|
|
auto executor_instance = compile_log.fusion_executor;
|
|
TORCH_INTERNAL_ASSERT(compile_log.pointwise_params.has_value());
|
|
auto params = toString(compile_log.pointwise_params.value());
|
|
auto lparams = toString(compile_log.fusion_executor->lastLaunchParams());
|
|
|
|
benchmark_state.SetLabel(params + lparams);
|
|
|
|
fusion_executor_cache->profile(false);
|
|
executor_instance->setMeasureKernelTimeFlag(true);
|
|
// Sync everything up before we start
|
|
cudaDeviceSynchronize();
|
|
for (auto _ : benchmark_state) {
|
|
clearL2Cache();
|
|
auto cg_outputs = fusion_executor_cache->runFusionWithInputs(aten_inputs);
|
|
benchmark_state.SetIterationTime(
|
|
executor_instance->kernelTimeMs() / 1000.0);
|
|
}
|
|
|
|
// Sync everything up before we're finished, don't want to run ahead on the
|
|
// cpu while benchmarking.
|
|
cudaDeviceSynchronize();
|
|
|
|
const size_t size =
|
|
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
|
|
const size_t channels = input_shape[3];
|
|
benchmark_state.SetBytesProcessed(
|
|
int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
|
|
int64_t(dataTypeSize(dtype)));
|
|
}
|
|
|
|
static void Baseline_SBR_Norm(
|
|
benchmark::State& benchmark_state,
|
|
DataType dtype) {
|
|
// N, H, W, C format
|
|
std::vector<int64_t> input_shape{
|
|
benchmark_state.range(0),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(1),
|
|
benchmark_state.range(2)};
|
|
std::vector<int64_t> bcast_shape{1, 1, 1, benchmark_state.range(2)};
|
|
|
|
// inputs
|
|
at::manual_seed(0);
|
|
auto options =
|
|
at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
|
|
at::Tensor at_x = at::randn(input_shape, options);
|
|
at::Tensor at_weight = at::ones(bcast_shape, options);
|
|
at::Tensor at_bias = at::zeros(bcast_shape, options);
|
|
at::Tensor at_mean = at::zeros(bcast_shape, options);
|
|
at::Tensor at_var = at::ones(bcast_shape, options);
|
|
|
|
cudaDeviceSynchronize();
|
|
for (auto _ : benchmark_state) {
|
|
CudaKernelTimer timer;
|
|
|
|
auto this_scale = at::mul(at_weight, at::rsqrt(at_var));
|
|
auto this_bias = at::mul(at::sub(at_bias, at_mean), this_scale);
|
|
|
|
auto scale = at::mul(at_x, this_scale);
|
|
auto bias = at::add(scale, this_bias);
|
|
auto output = at::relu(bias);
|
|
|
|
benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
|
|
cudaDeviceSynchronize();
|
|
}
|
|
|
|
const size_t size =
|
|
input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
|
|
const size_t channels = input_shape[3];
|
|
benchmark_state.SetBytesProcessed(
|
|
int64_t(benchmark_state.iterations()) * (channels * 4 + size * 2) *
|
|
int64_t(dataTypeSize(dtype)));
|
|
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
NVFUSER_BENCHMARK_DEFINE(
|
|
NvFuserScheduler_SBR_fp32,
|
|
setupSBR,
|
|
NvFuserScheduler_SBR,
|
|
DataType::Float);
|
|
|
|
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp32)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
NVFUSER_BENCHMARK_DEFINE(
|
|
NvFuserScheduler_SBR_fp16,
|
|
setupSBR,
|
|
NvFuserScheduler_SBR,
|
|
DataType::Half);
|
|
|
|
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_fp16)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
NVFUSER_BENCHMARK_DEFINE(
|
|
NvFuserScheduler_SBR_Norm_fp32,
|
|
setupSBRNorm,
|
|
NvFuserScheduler_SBR_Norm,
|
|
DataType::Float);
|
|
|
|
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp32)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
NVFUSER_BENCHMARK_DEFINE(
|
|
NvFuserScheduler_SBR_Norm_fp16,
|
|
setupSBRNorm,
|
|
NvFuserScheduler_SBR_Norm,
|
|
DataType::Half);
|
|
|
|
NVFUSER_BENCHMARK_RUN(NvFuserScheduler_SBR_Norm_fp16)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
static void Baseline_SBR_fp32(benchmark::State& benchmark_state) {
|
|
Baseline_SBR(benchmark_state, DataType::Float);
|
|
}
|
|
|
|
BENCHMARK(Baseline_SBR_fp32)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
static void Baseline_SBR_fp16(benchmark::State& benchmark_state) {
|
|
Baseline_SBR(benchmark_state, DataType::Half);
|
|
}
|
|
|
|
BENCHMARK(Baseline_SBR_fp16)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
static void Baseline_SBR_Norm_fp32(benchmark::State& benchmark_state) {
|
|
Baseline_SBR_Norm(benchmark_state, DataType::Float);
|
|
}
|
|
|
|
BENCHMARK(Baseline_SBR_Norm_fp32)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|
|
|
|
static void Baseline_SBR_Norm_fp16(benchmark::State& benchmark_state) {
|
|
Baseline_SBR_Norm(benchmark_state, DataType::Half);
|
|
}
|
|
|
|
BENCHMARK(Baseline_SBR_Norm_fp16)
|
|
// ->RangeMultiplier(2)
|
|
->Ranges({{8, 8}, {640, 640}, {64, 128}})
|
|
->Unit(benchmark::kMicrosecond)
|
|
->UseManualTime();
|