// NOTE(review): page-scrape metadata converted to a comment so the file
// compiles. Original banner: mirror of https://github.com/saymrwulf/pytorch.git
// (synced 2026-05-15). This file state reverts commit 49c41b87a2
// (https://github.com/pytorch/pytorch/pull/79147), reverted on behalf of
// @janeyx99 because it broke CUDA 11.3 builds on trunk.
// File stats from the page: 483 lines, 17 KiB, C++.
#include <torch/csrc/jit/codegen/cuda/arith.h>
|
|
#include <torch/csrc/jit/codegen/cuda/executor.h>
|
|
#include <torch/csrc/jit/codegen/cuda/fusion.h>
|
|
#include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
|
|
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
|
|
#include <torch/csrc/jit/codegen/cuda/lower2device.h>
|
|
#include <torch/csrc/jit/codegen/cuda/ops/all_ops.h>
|
|
#include <torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h>
|
|
|
|
#include <benchmark/benchmark.h>
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
#include <benchmarks/cpp/nvfuser/utils.h>
|
|
|
|
// Default transpose placement shared by every benchmark below. The four flags
// brace-initialize TransposeConfig in declaration order:
//   {input1_transpose_axes, input2_transpose_axes,
//    intermediate_transpose_axes, output_transpose_axes}
// i.e. only the first input is transposed.
#define TRANSPOSE_CONFIG {true, false, false, false}
|
|
|
|
using namespace torch::jit::fuser::cuda;
|
|
|
|
// Selects which tensors in the fused transpose+add+relu graph get a transpose
// applied over the benchmark's axis pair (see setupTranspose /
// Baseline_Transpose, where each flag gates one optionalTranspose call).
struct TransposeConfig {
  bool input1_transpose_axes = false; // transpose first input before the add
  bool input2_transpose_axes = false; // transpose second input before the add
  bool intermediate_transpose_axes = false; // transpose add result before relu
  bool output_transpose_axes = false; // transpose relu result before output
};
|
|
|
|
// Creates the two random CUDA input tensors for a transpose benchmark.
//
// Tensors flagged *_transpose_axes get the "transposed" shape (perm_size on
// axes.first, innerdim_size on axes.second); the others get the same extents
// with the two axes swapped, so both tensors hold the same element count.
// All remaining dimensions have extent iter_size. When non_vectorize_offset
// is set, every extent is bumped by one to defeat vectorized loads.
// Seeds ATen's RNG so runs are reproducible.
std::vector<at::Tensor> generateInputs(
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    int perm_size,
    int innerdim_size,
    bool input1_transpose_axes,
    bool input2_transpose_axes,
    bool non_vectorize_offset = false,
    int iter_size = 32) {
  at::manual_seed(0);
  const auto tensor_options =
      at::TensorOptions().dtype(data_type_to_aten(dtype)).device(at::kCUDA, 0);

  // Shape for tensors the fusion transposes.
  std::vector<int64_t> shape_transposed(num_dims, iter_size);
  shape_transposed[axes.first] = perm_size;
  shape_transposed[axes.second] = innerdim_size;

  // Shape for tensors left alone: same extents, axis pair swapped.
  std::vector<int64_t> shape_plain(num_dims, iter_size);
  shape_plain[axes.first] = innerdim_size;
  shape_plain[axes.second] = perm_size;

  // TensorType: Concrete, Contig, Symbolic
  // Vectorization | Unroll - Add 1 to sizes
  // Shift axis by 1 to disable vectorize loads
  if (non_vectorize_offset) {
    for (auto& extent : shape_transposed) {
      extent += 1;
    }
    for (auto& extent : shape_plain) {
      extent += 1;
    }
  }

  auto pickShape = [&](bool wants_transpose) -> const std::vector<int64_t>& {
    return wants_transpose ? shape_transposed : shape_plain;
  };

  return {
      at::randn(pickShape(input1_transpose_axes), tensor_options),
      at::randn(pickShape(input2_transpose_axes), tensor_options)};
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// Builds the benchmark fusion: out = relu(input1 + input2), with optional
// transposes over the given axis pair inserted at the points selected by tc
// (inputs, intermediate, and/or output).
//
// Fix: the `dtype` parameter was previously ignored — makeContigTensor() was
// called without it, so fp16 benchmarks registered fusion inputs with the
// default element type, mismatching the at::Half tensors produced by
// generateInputs(). NOTE(review): assumes makeContigTensor takes an optional
// DataType second argument — confirm against benchmarks/cpp/nvfuser/utils.h.
static void setupTranspose(
    Fusion* fusion,
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    TransposeConfig tc) {
  FusionGuard fg(fusion);
  typedef std::pair<int, int> transpose_axes;

  // Axis map that swaps the two benchmark axes in both directions.
  auto getTransposeMap =
      [](const transpose_axes& axes) -> std::unordered_map<int, int> {
    return {{axes.first, axes.second}, {axes.second, axes.first}};
  };

  // Applies the transpose only when the corresponding config flag is set.
  auto optionalTranspose = [&getTransposeMap, axes](
                               TensorView* tv, bool is_transpose) {
    return (is_transpose) ? transpose(tv, getTransposeMap(axes)) : tv;
  };

  auto input1 = makeContigTensor(num_dims, dtype);
  auto input2 = makeContigTensor(num_dims, dtype);
  fusion->addInput(input1);
  fusion->addInput(input2);

  auto ot_input1 = optionalTranspose(input1, tc.input1_transpose_axes);
  auto ot_input2 = optionalTranspose(input2, tc.input2_transpose_axes);
  auto intermediate = add(ot_input1, ot_input2);
  auto ot_intermediate =
      optionalTranspose(intermediate, tc.intermediate_transpose_axes);
  auto output = relu(ot_intermediate);
  auto ot_output = optionalTranspose(output, tc.output_transpose_axes);
  fusion->addOutput(ot_output);
}
|
|
|
|
// Benchmark driver for the nvFuser path: generates inputs sized from the
// benchmark ranges (range(0) = perm_size, range(1) = innerdim_size), runs the
// cached fusion, and reports bytes processed as two tensors read plus one
// written per iteration (all tensors share the same element count).
static void NvFuserScheduler_Transpose(
    benchmark::State& benchmark_state,
    FusionExecutorCache* fusion_executor_cache,
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    TransposeConfig tc) {
  auto inputs = generateInputs(
      dtype,
      num_dims,
      axes,
      benchmark_state.range(0),
      benchmark_state.range(1),
      tc.input1_transpose_axes,
      tc.input2_transpose_axes);

  std::vector<c10::IValue> fuser_inputs = {inputs[0], inputs[1]};
  runBenchmarkIterations(benchmark_state, fusion_executor_cache, fuser_inputs);

  // 3x = two reads + one write of numel elements each.
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      ((inputs[0].numel() * 3) * int64_t(dataTypeSize(dtype))));
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// Defines a fusion/scheduler benchmark pair under TITLE (via
// NVFUSER_BENCHMARK_DEFINE) and registers it over a fixed sweep of
// {perm_size, innerdim_size} argument pairs — range(0)/range(1) consumed by
// NvFuserScheduler_Transpose. NOTE(review): the pairs appear to be layer
// dimensions sampled from common CNN/transformer workloads — provenance not
// documented here. Comments are kept outside the #define because a line
// comment ending in `\` would splice away the next macro line.
#define NVFUSER_TRANSPOSE_SQUARE_RUN(                 \
    TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG)     \
  NVFUSER_BENCHMARK_DEFINE(                           \
      TITLE,                                          \
      setupTranspose,                                 \
      NvFuserScheduler_Transpose,                     \
      DTYPE,                                          \
      NUM_DIMS,                                       \
      {AXIS1, AXIS2},                                 \
      CONFIG);                                        \
                                                      \
  NVFUSER_BENCHMARK_RUN(TITLE)                        \
      ->RangeMultiplier(8)                            \
      ->Args({9, 2408})                               \
      ->Args({16, 512})                               \
      ->Args({18, 96})                                \
      ->Args({24, 96})                                \
      ->Args({24, 256})                               \
      ->Args({24, 512})                               \
      ->Args({32, 27})                                \
      ->Args({32, 96})                                \
      ->Args({32, 288})                               \
      ->Args({32, 864})                               \
      ->Args({40, 120})                               \
      ->Args({48, 128})                               \
      ->Args({48, 256})                               \
      ->Args({49, 512})                               \
      ->Args({49, 1024})                              \
      ->Args({49, 2048})                              \
      ->Args({49, 4608})                              \
      ->Args({64, 64})                                \
      ->Args({64, 96})                                \
      ->Args({64, 128})                               \
      ->Args({64, 147})                               \
      ->Args({64, 192})                               \
      ->Args({64, 256})                               \
      ->Args({64, 288})                               \
      ->Args({64, 512})                               \
      ->Args({80, 64})                                \
      ->Args({81, 1728})                              \
      ->Args({83, 1728})                              \
      ->Args({96, 864})                               \
      ->Args({100, 1280})                             \
      ->Args({100, 4032})                             \
      ->Args({120, 40})                               \
      ->Args({128, 128})                              \
      ->Args({128, 512})                              \
      ->Args({128, 1152})                             \
      ->Args({192, 128})                              \
      ->Args({192, 256})                              \
      ->Args({192, 720})                              \
      ->Args({192, 768})                              \
      ->Args({192, 1120})                             \
      ->Args({192, 1728})                             \
      ->Args({196, 256})                              \
      ->Args({196, 512})                              \
      ->Args({196, 1024})                             \
      ->Args({196, 2304})                             \
      ->Args({256, 256})                              \
      ->Args({256, 1024})                             \
      ->Args({256, 2304})                             \
      ->Args({284, 512})                              \
      ->Args({320, 1280})                             \
      ->Args({320, 1728})                             \
      ->Args({324, 2592})                             \
      ->Args({361, 768})                              \
      ->Args({361, 1120})                             \
      ->Args({384, 2})                                \
      ->Args({384, 32})                               \
      ->Args({384, 128})                              \
      ->Args({384, 256})                              \
      ->Args({384, 512})                              \
      ->Args({384, 1280})                             \
      ->Args({384, 2592})                             \
      ->Args({384, 4032})                             \
      ->Args({448, 1280})                             \
      ->Args({480, 16})                               \
      ->Args({480, 256})                              \
      ->Args({512, 2})                                \
      ->Args({512, 16})                               \
      ->Args({512, 128})                              \
      ->Args({512, 256})                              \
      ->Args({512, 1024})                             \
      ->Args({512, 2048})                             \
      ->Args({512, 3072})                             \
      ->Args({512, 4608})                             \
      ->Args({784, 40})                               \
      ->Args({784, 120})                              \
      ->Args({784, 128})                              \
      ->Args({784, 1152})                             \
      ->Args({1001, 2408})                            \
      ->Args({1024, 16})                              \
      ->Args({1024, 256})                             \
      ->Args({1024, 512})                             \
      ->Args({1024, 1024})                            \
      ->Args({1024, 3072})                            \
      ->Args({1369, 192})                             \
      ->Args({1369, 256})                             \
      ->Args({1369, 288})                             \
      ->Args({2048, 512})                             \
      ->Args({2048, 1024})                            \
      ->Args({2250, 27})                              \
      ->Args({3072, 512})                             \
      ->Args({3072, 1024})                            \
      ->Args({3136, 64})                              \
      ->Args({5329, 720})                             \
      ->Args({5625, 64})                              \
      ->Args({12544, 147})                            \
      ->Args({22201, 288})                            \
      ->Unit(benchmark::kMicrosecond)
|
|
|
|
// Fixed-shape ("square sweep") runs, fp32 then fp16: 2-D inner-axis swap,
// 3-D swaps touching the innermost axis (0-2, 1-2), and a 3-D outer swap
// (0-1). All use TRANSPOSE_CONFIG, i.e. only input1 is transposed.
NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Inner_2D_01_Axis,
    DataType::Float,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Inner_3D_02_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Inner_3D_12_Axis,
    DataType::Float,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp32_Outer_3D_01_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

//------------------------------------------------------------------------------

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Inner_2D_01_Axis,
    DataType::Half,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Inner_3D_02_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Inner_3D_12_Axis,
    DataType::Half,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_SQUARE_RUN(
    NF_Transpose_Random_fp16_Outer_3D_01_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
|
|
// Defines a fusion/scheduler benchmark pair under TITLE and registers it over
// a generated grid: perm_size swept over [2, 256*256] with 8x steps,
// innerdim_size over [160, 320].
//
// Fix: the final macro line previously ended in a stray `\`, continuing the
// macro onto the following (blank) line — any code placed directly after the
// #define would have been silently swallowed into the macro. The SQUARE macro
// above correctly omits the trailing continuation; this now matches it.
#define NVFUSER_TRANSPOSE_RUN(TITLE, DTYPE, NUM_DIMS, AXIS1, AXIS2, CONFIG) \
  NVFUSER_BENCHMARK_DEFINE(                                                 \
      TITLE,                                                                \
      setupTranspose,                                                       \
      NvFuserScheduler_Transpose,                                           \
      DTYPE,                                                                \
      NUM_DIMS,                                                             \
      {AXIS1, AXIS2},                                                       \
      CONFIG);                                                              \
                                                                            \
  NVFUSER_BENCHMARK_RUN(TITLE)                                              \
      ->RangeMultiplier(8)                                                  \
      ->Ranges({{2, 256 * 256}, {160, 320}})                                \
      ->Unit(benchmark::kMicrosecond)
|
|
|
|
// Range-swept runs, fp32 then fp16, covering the same four axis/dimension
// combinations as the square sweeps above. All use TRANSPOSE_CONFIG
// (only input1 transposed).
NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Inner_2D_01_Axis,
    DataType::Float,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Inner_3D_02_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Inner_3D_12_Axis,
    DataType::Float,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp32_Outer_3D_01_Axis,
    DataType::Float,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

//------------------------------------------------------------------------------

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Inner_2D_01_Axis,
    DataType::Half,
    2 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Inner_3D_02_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Inner_3D_12_Axis,
    DataType::Half,
    3 /* num_dims */,
    1 /* axis1 */,
    2 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));

NVFUSER_TRANSPOSE_RUN(
    NF_Transpose_fp16_Outer_3D_01_Axis,
    DataType::Half,
    3 /* num_dims */,
    0 /* axis1 */,
    1 /* axis2 */,
    TransposeConfig(TRANSPOSE_CONFIG));
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// Eager-mode (ATen) baseline for the fused transpose+add+relu benchmark.
// Mirrors NvFuserScheduler_Transpose: builds the same inputs from
// benchmark_state.range(0)/range(1), then times the unfused sequence
// optional-transpose -> add -> optional-transpose -> relu ->
// optional-transpose per iteration. Kept byte-identical (comments only):
// the clearL2Cache/timer/SetIterationTime ordering is timing-sensitive.
static void Baseline_Transpose(
    benchmark::State& benchmark_state,
    DataType dtype,
    int num_dims,
    std::pair<int, int> axes,
    TransposeConfig tc) {
  auto aten_inputs = generateInputs(
      dtype,
      num_dims,
      axes,
      benchmark_state.range(0),
      benchmark_state.range(1),
      tc.input1_transpose_axes,
      tc.input2_transpose_axes);
  auto at_input1 = aten_inputs[0];
  auto at_input2 = aten_inputs[1];

  // Transposes over the benchmark axis pair only when requested by a flag.
  auto optionalTransposeAten = [&axes](at::Tensor at, bool is_transpose) {
    return (is_transpose) ? at::transpose(at, axes.first, axes.second) : at;
  };

  for (auto _ : benchmark_state) {
    // Flush L2 so each iteration pays the same global-memory cost.
    clearL2Cache();
    CudaKernelTimer timer;

    auto at_ot_input1 =
        optionalTransposeAten(at_input1, tc.input1_transpose_axes);
    auto at_ot_input2 =
        optionalTransposeAten(at_input2, tc.input2_transpose_axes);
    auto at_intermediate = add(at_ot_input1, at_ot_input2);
    auto at_ot_intermediate =
        optionalTransposeAten(at_intermediate, tc.intermediate_transpose_axes);
    auto at_output = relu(at_ot_intermediate);
    auto at_ot_output =
        optionalTransposeAten(at_output, tc.output_transpose_axes);

    // NOTE(review): the /1000.0 implies CudaKernelTimer::elapsed() returns
    // milliseconds and SetIterationTime expects seconds — confirm in utils.h.
    benchmark_state.SetIterationTime(timer.elapsed() / 1000.0);
  }
  // Sync everything up before we're finished, don't want to run ahead on the
  // cpu while benchmarking.
  cudaDeviceSynchronize();

  // Same traffic model as the nvFuser path: two reads + one write of
  // numel elements each.
  benchmark_state.SetBytesProcessed(
      int64_t(benchmark_state.iterations()) *
      (at_input1.numel() * 3 * int64_t(dataTypeSize(dtype))));
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// fp32 eager baseline: 2-D tensors, inner-axis (0<->1) swap, input1-only
// transpose placement (TRANSPOSE_CONFIG).
static void Baseline_Transpose_fp32_Inner_2D_01_Axis(
    benchmark::State& benchmark_state) {
  const std::pair<int, int> swap_axes{0, 1};
  Baseline_Transpose(
      benchmark_state,
      DataType::Float,
      /*num_dims=*/2,
      swap_axes,
      TRANSPOSE_CONFIG);
}
|
|
|
|
// fp16 eager baseline: 2-D tensors, inner-axis (0<->1) swap, input1-only
// transpose placement (TRANSPOSE_CONFIG).
static void Baseline_Transpose_fp16_Inner_2D_01_Axis(
    benchmark::State& benchmark_state) {
  const std::pair<int, int> swap_axes{0, 1};
  Baseline_Transpose(
      benchmark_state,
      DataType::Half,
      /*num_dims=*/2,
      swap_axes,
      TRANSPOSE_CONFIG);
}
|
|
|
|
//------------------------------------------------------------------------------
|
|
|
|
// Eager baselines sweep a wider perm_size range ([2, 1024*1024]) than the
// nvFuser runs and use UseManualTime() because iteration time comes from the
// CUDA-event timer inside Baseline_Transpose, not wall-clock.
BENCHMARK(Baseline_Transpose_fp32_Inner_2D_01_Axis)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 1024 * 1024}, {160, 320}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();

BENCHMARK(Baseline_Transpose_fp16_Inner_2D_01_Axis)
    // ->RangeMultiplier(2)
    ->Ranges({{2, 1024 * 1024}, {160, 320}})
    ->Unit(benchmark::kMicrosecond)
    ->UseManualTime();
|
|
|
|
//------------------------------------------------------------------------------
|