#include <torch/csrc/jit/codegen/cuda/executor.h>
#include <torch/csrc/jit/codegen/cuda/fusion.h>
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>

#include <benchmark/benchmark.h>

#include <cuda_runtime.h>

#include "utils.h"

using namespace torch::jit::fuser::cuda;

//------------------------------------------------------------------------------

static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
  FusionGuard fg(fusion);

  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  const int kReductionAxis = 1;

  // setup fusion
  auto grad_out = makeContigTensor(2, dtype);
  auto input = makeContigTensor(2, dtype);
  auto weight = makeContigTensor(1, dtype);
  auto bias = makeContigTensor(1, dtype);
  auto mean = TensorViewBuilder()
                  .contiguity({false, false})
                  .shape({-1, 1})
                  .dtype(dtype)
                  .build();
  auto rstd = TensorViewBuilder()
                  .contiguity({false, false})
                  .shape({-1, 1})
                  .dtype(dtype)
                  .build();

  fusion->addInput(grad_out);
  fusion->addInput(input);
  fusion->addInput(weight);
  fusion->addInput(bias);
  fusion->addInput(mean);
  fusion->addInput(rstd);

  // Compute in fp32 even when the I/O tensors are fp16.
  if (dtype == DataType::Half) {
    grad_out = castOp(DataType::Float, grad_out);
    input = castOp(DataType::Float, input);
    weight = castOp(DataType::Float, weight);
    bias = castOp(DataType::Float, bias);
    mean = castOp(DataType::Float, mean);
    rstd = castOp(DataType::Float, rstd);
  }

  auto layer_norm_results = layer_norm_backward(
      grad_out,
      input,
      {kReductionAxis},
      mean,
      rstd,
      weight,
      bias,
      {true, true, true});

  // Cast the results back to the I/O precision.
  if (dtype != DataType::Float) {
    layer_norm_results.grad_input =
        castOp(dtype, layer_norm_results.grad_input);
    layer_norm_results.grad_bias = castOp(dtype, layer_norm_results.grad_bias);
    layer_norm_results.grad_weight =
        castOp(dtype, layer_norm_results.grad_weight);
  }

  fusion->addOutput(layer_norm_results.grad_input);
  fusion->addOutput(layer_norm_results.grad_bias);
  fusion->addOutput(layer_norm_results.grad_weight);
}

static void NvFuserScheduler_LayerNorm_BWD(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype) {
  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  std::vector<int64_t> input_shape{
      benchmark_state.range(0), benchmark_state.range(1)};

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor grad_out = at::randn(input_shape, options);
  at::Tensor input = at::randn(input_shape, options);
  at::Tensor weight = at::randn({input_shape[1]}, options);
  at::Tensor bias = at::randn({input_shape[1]}, options);
  at::Tensor mean = at::randn({input_shape[0], 1}, options);
  at::Tensor rstd = at::randn({input_shape[0], 1}, options);

  std::vector<c10::IValue> aten_inputs(
      {grad_out, input, weight, bias, mean, rstd});

  runBenchmarkIterations(benchmark_state, fusion_executor_cache, aten_inputs);

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (3 * input.numel() + weight.numel() + bias.numel() + mean.numel() +
       rstd.numel()) *
      int64_t(dataTypeSize(dtype)));
}
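// Note on the SetBytesProcessed() model above (the eager-mode baseline below
// uses the same accounting): with N = range(0) rows and D = range(1) features,
//
//   bytes/iteration = (3 * N * D + 2 * D + 2 * N) * sizeof(dtype)
//
// The 3 * input.numel() term covers grad_out, input, and grad_input, which
// dominate the traffic; weight, bias, mean, and rstd each appear once, so the
// small per-feature outputs grad_weight and grad_bias are not counted
// separately.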
//------------------------------------------------------------------------------

static void Baseline_LayerNorm_BWD(
    benchmark::State& benchmark_state,
    DataType dtype) {
  TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);

  std::vector<int64_t> input_shape{
      benchmark_state.range(0), benchmark_state.range(1)};

  const int kReductionAxis = 1;
  std::vector<int64_t> norm_shape;
  for (size_t idx = kReductionAxis; idx < input_shape.size(); ++idx) {
    norm_shape.push_back(input_shape[idx]);
  }

  // inputs
  at::manual_seed(0);
  auto options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);
  at::Tensor grad_out = at::randn(input_shape, options);
  at::Tensor input = at::randn(input_shape, options);
  at::Tensor weight = at::randn({input_shape[1]}, options);
  at::Tensor bias = at::randn({input_shape[1]}, options);
  at::Tensor mean = at::randn({input_shape[0], 1}, options);
  at::Tensor rstd = at::randn({input_shape[0], 1}, options);
  std::array<bool, 3> output_mask = {true, true, true};

  clearL2Cache();
  cudaDeviceSynchronize();
  for (auto _ : benchmark_state) {
    CudaKernelTimer timer;
    at::native_layer_norm_backward(
        grad_out, input, norm_shape, mean, rstd, weight, bias, output_mask);
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
    cudaDeviceSynchronize();
    clearL2Cache();
    cudaDeviceSynchronize();
  }

  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (3 * input.numel() + weight.numel() + bias.numel() + mean.numel() +
       rstd.numel()) *
      int64_t(dataTypeSize(dtype)));
}

static void Baseline_LayerNorm_BWD_fp32(benchmark::State& benchmark_state) {
  Baseline_LayerNorm_BWD(benchmark_state, DataType::Float);
}

static void Baseline_LayerNorm_BWD_fp16(benchmark::State& benchmark_state) {
  Baseline_LayerNorm_BWD(benchmark_state, DataType::Half);
}

//------------------------------------------------------------------------------

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_LayerNorm_BWD_fp32,
    setupLayerNorm_BWD,
    NvFuserScheduler_LayerNorm_BWD,
    DataType::Float);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{160, 320}, {2, 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_DEFINE(
    NvFuserScheduler_LayerNorm_BWD_fp16,
    setupLayerNorm_BWD,
    NvFuserScheduler_LayerNorm_BWD,
    DataType::Half);

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{160, 320}, {2, 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

NVFUSER_BENCHMARK_RUN(NvFuserScheduler_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

//------------------------------------------------------------------------------
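// The eager-mode baselines below sweep the same {outer, inner} shape ranges
// as the nvFuser benchmarks above, so the two sets of results compare
// one-to-one. With RangeMultiplier left commented out, Google Benchmark's
// default multiplier of 8 chooses the sample points within each range.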
BENCHMARK(Baseline_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{160, 320}, {2, 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 16}, {32768, 16 * 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{32768, 16 * 1024 * 1024}, {2, 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp32)
    // ->RangeMultiplier(2)
    ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{160, 320}, {2, 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 16}, {32768, 32 * 1024 * 1024}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{32768, 32 * 1024 * 1024}, {2, 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_LayerNorm_BWD_fp16)
    // ->RangeMultiplier(2)
    ->Ranges({{128, 1024 * 16}, {128, 1024 * 16}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();
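// Assuming the standard Google Benchmark driver, individual cases can be
// selected at run time with a regex filter, e.g. (the binary name may differ
// per build):
//
//   ./nvfuser_bench --benchmark_filter='LayerNorm_BWD_fp16'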