// Benchmark: nvfuser shape-inference / heuristics-lookup overhead for
// layer-norm forward and backward fusions.
//
// Mirrored from https://github.com/saymrwulf/pytorch.git (upstream
// https://github.com/pytorch/pytorch/pull/77471). That nvfuser code-base
// update fixed the indexing issue reported in
// https://github.com/pytorch/vision/issues/6015.
#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include <benchmarks/cpp/nvfuser/utils.h>

using namespace torch::jit::fuser::cuda;
// Builds a layer-norm backward fusion, runs it once through a freshly
// created FusionExecutorCache, and returns the kernel runtime chosen for
// that run so callers can query its heuristics.
//
// fusion_ptr:  newly built fusion; ownership moves into `fec`.
// fec:         out-param — receives the executor cache owning the fusion.
// aten_inputs: out-param — receives the concrete inputs used for the run.
// shape:       full input shape (outer dims followed by normalized dims).
// norm_shape:  trailing dims that layer norm normalizes over.
static auto getLayerBackwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr; // was *fusion_ptr.get(); .get() is redundant

  const size_t kM = shape.size();
  const size_t kN = norm_shape.size();
  const size_t kOuterNumDims = kM - kN;

  // mean/rstd keep the outer dims and use size-1 for the normalized dims.
  std::vector<int64_t> outer_shape;
  outer_shape.reserve(kM);
  for (size_t idx = 0; idx < kOuterNumDims; ++idx) {
    outer_shape.push_back(shape[idx]);
  }
  for (size_t idx = kOuterNumDims; idx < kM; ++idx) {
    outer_shape.push_back(1);
  }

  auto grad_out = makeSymbolicTensor(shape.size());
  auto input = makeSymbolicTensor(shape.size());
  auto mean = makeConcreteTensor(outer_shape);
  auto rstd = makeConcreteTensor(outer_shape);
  auto weight = makeSymbolicTensor(norm_shape.size());
  auto bias = makeSymbolicTensor(norm_shape.size());
  fusion.addInput(grad_out);
  fusion.addInput(input);
  fusion.addInput(mean);
  fusion.addInput(rstd);
  fusion.addInput(weight);
  fusion.addInput(bias);

  // Request all three gradients (input, weight, bias).
  auto grads = layer_norm_backward(
      grad_out,
      input,
      norm_shape,
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  fusion.addOutput(grads.grad_input);
  fusion.addOutput(grads.grad_weight);
  fusion.addOutput(grads.grad_bias);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_grad_out = at::randn(shape, options);
  at::Tensor aten_input = at::randn(shape, options);
  at::Tensor aten_weight = at::randn(norm_shape, options);
  at::Tensor aten_bias = at::randn(norm_shape, options);
  auto at_weight = c10::optional<at::Tensor>(aten_weight);
  auto at_bias = c10::optional<at::Tensor>(aten_bias);

  // Run eager-mode layer norm only to obtain matching mean/rstd tensors
  // for the backward inputs; the forward output itself is not needed.
  const float kEps = 1e-5f; // float literal — avoids double->float narrowing
  auto aten_results =
      at::native_layer_norm(aten_input, norm_shape, at_weight, at_bias, kEps);
  auto aten_mean = std::get<1>(aten_results);
  auto aten_rstd = std::get<2>(aten_results);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {
      aten_grad_out, aten_input, aten_mean, aten_rstd, aten_weight, aten_bias};
  // One real run so the cache compiles a kernel and records its runtime.
  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}
// Shared driver for the layer-norm-backward shape-inference benchmarks.
// Builds and compiles the fusion once, disables actual kernel launches,
// and then times only the per-call host-side work (shape inference and
// heuristics lookup). With `disable_launch_parameter_cache` set, the
// launch-parameter cache is bypassed so its cost is included in the loop.
void LayerNormBackward_ShapeInference_Base(
    benchmark::State& benchmark_state,
    bool disable_launch_parameter_cache) {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Pre-allocated slots populated by the setup helper below.
  std::unique_ptr<FusionExecutorCache> fec;
  std::vector<at::IValue> aten_inputs;

  std::vector<int64_t> shape{20, 100, 35, 67};
  std::vector<int64_t> norm_shape{67};

  auto runtime = getLayerBackwardNormRuntime(
      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
  TORCH_INTERNAL_ASSERT(
      runtime->getMaybeHeuristicsFor(aten_inputs).has_value());

  fec->profile(true);
  fec->disableKernelLaunch();
  // Warm-up pass after launches are disabled.
  fec->runFusionWithInputs(aten_inputs);
  if (disable_launch_parameter_cache) {
    fec->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Kernel launches are disabled, so only host-side work is measured.
    fec->runFusionWithInputs(aten_inputs);
  }
}
// Backward variant that also bypasses the launch-parameter cache.
static void LayerNormBackward_ShapeInference(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = true;
  LayerNormBackward_ShapeInference_Base(
      benchmark_state, kDisableLaunchParamCache);
}
// Backward baseline: launch-parameter cache stays enabled.
static void LayerNormBackward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = false;
  LayerNormBackward_ShapeInference_Base(
      benchmark_state, kDisableLaunchParamCache);
}
// Builds a layer-norm forward fusion, runs it once through a freshly
// created FusionExecutorCache, and returns the kernel runtime chosen for
// that run so callers can query its heuristics.
//
// fusion_ptr:  newly built fusion; ownership moves into `fec`.
// fec:         out-param — receives the executor cache owning the fusion.
// aten_inputs: out-param — receives the concrete input used for the run.
// shape:       full input shape.
// norm_shape:  trailing dims that layer norm normalizes over.
static auto getLayerForwardNormRuntime(
    std::unique_ptr<Fusion> fusion_ptr,
    std::unique_ptr<FusionExecutorCache>& fec,
    std::vector<at::IValue>& aten_inputs,
    std::vector<int64_t>& shape,
    std::vector<int64_t>& norm_shape) {
  Fusion& fusion = *fusion_ptr; // was *fusion_ptr.get(); .get() is redundant

  const float kEps = 1e-5f; // float literal — avoids double->float narrowing
  Double* eps_ptr = IrBuilder::create<Double>(kEps);

  auto input = makeSymbolicTensor(shape.size());
  fusion.addInput(input);

  // No affine weight/bias in the forward benchmark.
  auto result = layer_norm(input, norm_shape, nullptr, nullptr, eps_ptr);

  fusion.addOutput(result.output);
  fusion.addOutput(result.mean);
  fusion.addOutput(result.invstd);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::Tensor aten_input = at::randn(shape, options);

  fec = std::make_unique<FusionExecutorCache>(std::move(fusion_ptr));
  aten_inputs = {aten_input};
  // One real run so the cache compiles a kernel and records its runtime.
  auto cg_outputs = fec->runFusionWithInputs(aten_inputs);

  return fec->getMostRecentKernelRuntime();
}
// Shared driver for the layer-norm-forward shape-inference benchmarks.
// Builds and compiles the fusion once, disables actual kernel launches,
// and then times only the per-call host-side work (shape inference and
// heuristics lookup). With `disable_launch_param_cache` set, the
// launch-parameter cache is bypassed so its cost is included in the loop.
void LayerNormForward_ShapeInferenceBase(
    benchmark::State& benchmark_state,
    bool disable_launch_param_cache) {
  auto fusion_ptr = std::make_unique<Fusion>();
  FusionGuard fg(fusion_ptr.get());

  // Pre-allocated slots populated by the setup helper below.
  std::unique_ptr<FusionExecutorCache> fec;
  std::vector<at::IValue> aten_inputs;

  std::vector<int64_t> shape{20, 100, 35, 67};
  std::vector<int64_t> norm_shape{67};

  auto runtime = getLayerForwardNormRuntime(
      std::move(fusion_ptr), fec, aten_inputs, shape, norm_shape);
  TORCH_INTERNAL_ASSERT(
      runtime->getMaybeHeuristicsFor(aten_inputs).has_value());

  fec->profile(true);
  fec->disableKernelLaunch();
  // Warm-up pass after launches are disabled.
  fec->runFusionWithInputs(aten_inputs);
  if (disable_launch_param_cache) {
    fec->disableLaunchParamCache();
  }

  for (auto _ : benchmark_state) {
    // Kernel launches are disabled, so only host-side work is measured.
    fec->runFusionWithInputs(aten_inputs);
  }
}
// Forward baseline: launch-parameter cache stays enabled.
static void LayerNormForward_NoShapeInferenceCachedBaseline(
    benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = false;
  LayerNormForward_ShapeInferenceBase(
      benchmark_state, kDisableLaunchParamCache);
}
// Forward variant that also bypasses the launch-parameter cache.
static void LayerNormForward_ShapeInference(benchmark::State& benchmark_state) {
  constexpr bool kDisableLaunchParamCache = true;
  LayerNormForward_ShapeInferenceBase(
      benchmark_state, kDisableLaunchParamCache);
}
// Register all four shape-inference benchmarks, reported in microseconds.
// Registration order is preserved; Google Benchmark runs them in this order.
BENCHMARK(LayerNormBackward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_ShapeInference)->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormBackward_NoShapeInferenceCachedBaseline)
    ->Unit(benchmark::kMicrosecond);
BENCHMARK(LayerNormForward_NoShapeInferenceCachedBaseline)
    ->Unit(benchmark::kMicrosecond);