diff --git a/benchmarks/cpp/tensorexpr/CMakeLists.txt b/benchmarks/cpp/tensorexpr/CMakeLists.txt
index 85ab72ab358..94e324b18c5 100644
--- a/benchmarks/cpp/tensorexpr/CMakeLists.txt
+++ b/benchmarks/cpp/tensorexpr/CMakeLists.txt
@@ -1,9 +1,18 @@
+find_package(AVX)
+
 add_executable(
   tensorexpr_bench
   bench_approx.cpp
   bench_compile.cpp
   bench_fuser_overhead.cpp
   bench_gemm.cpp
+  bench_reduce.cpp
   main.cpp)
 
+if(C_AVX2_FOUND)
+  message(STATUS "AVX2 compiler support found")
+  target_compile_options(tensorexpr_bench PUBLIC -mavx2)
+  target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2)
+endif()
+
 target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark)
diff --git a/benchmarks/cpp/tensorexpr/bench_reduce.cpp b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
new file mode 100644
index 00000000000..cd467d74162
--- /dev/null
+++ b/benchmarks/cpp/tensorexpr/bench_reduce.cpp
@@ -0,0 +1,442 @@
+#include <benchmark/benchmark.h>
+#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
+#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
+#include <torch/csrc/jit/tensorexpr/loopnest.h>
+#include <torch/csrc/jit/tensorexpr/tensor.h>
+#include <torch/torch.h>
+
+#include <immintrin.h>
+
+namespace te = torch::jit::tensorexpr;
+
+namespace {
+class Reduce1D : public benchmark::Fixture {
+ public:
+  void SetUp(const benchmark::State& state) override {
+    at::set_num_threads(1);
+    torch::manual_seed(0x12345678);
+    M = state.range(0);
+    A = torch::randn({M});
+    B = torch::zeros({});
+  }
+
+  void TearDown(benchmark::State& state) override {
+    state.counters["BYTES"] = benchmark::Counter(
+        uint64_t(state.iterations()) * M * sizeof(float),
+        benchmark::Counter::kIsRate);
+  }
+
+  int M;
+  at::Tensor A;
+  at::Tensor B;
+};
+
+} // namespace
+
+BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) {
+  for (auto _ : state) {
+    B = torch::sum(A, {0});
+  }
+}
+
+BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24});
+
+#define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B))
+
+template <typename Func>
+void ValidateFunc(Func func, const std::string& func_name, at::Tensor& A, at::Tensor& B) {
+  func(A, B);
+  at::Tensor B2 = torch::sum(A, {0});
+  int size = A.numel();
+  // Summing N floats accumulates rounding error on the order of
+  // sqrt(N) * machine epsilon, so scale the tolerance accordingly.
+  float size_sqrt = std::sqrt(size);
+  float natural_noise = size_sqrt * 1e-7;
+  if (!torch::allclose(B, B2, natural_noise)) {
+    std::ostringstream oss;
+    oss << func_name << " failed check: " << std::endl;
+    oss << "value: " << B << std::endl;
+    oss << "reference: " << B2 << std::endl;
+    oss << "threshold: " << natural_noise << std::endl;
+    throw std::runtime_error(oss.str());
+  }
+}
+
+static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
+  float *pA = A.data_ptr<float>();
+  float *pB = B.data_ptr<float>();
+  int size = A.numel();
+  TORCH_CHECK(B.numel() == 1);
+  *pB = 0.;
+  for (int i = 0; i < size; i++) {
+    *pB += pA[i];
+  }
+}
+
+BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) {
+  VALIDATE(reduce1d_naive, A, B);
+  for (auto _ : state) {
+    reduce1d_naive(A, B);
+  }
+}
+
+BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24});
+
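+// reduce1d_naive is latency-bound: each iteration's add depends on the
+// previous one through *pB, so it can retire at most one element per
+// floating-point add latency. The variants below break that dependency
+// chain by accumulating into several independent partial sums.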
+static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
+  float *pA = A.data_ptr<float>();
+  float *pB = B.data_ptr<float>();
+  int size = A.numel();
+  constexpr int kChunkSize = 16;
+  TORCH_CHECK(B.numel() == 1);
+  TORCH_CHECK(size % kChunkSize == 0);
+  *pB = 0.;
+  float temp[kChunkSize];
+  for (int j = 0; j < kChunkSize; j++) {
+    temp[j] = 0;
+  }
+
+  int chunk_count = size / kChunkSize;
+  for (int i = 0; i < chunk_count; i++) {
+    for (int j = 0; j < kChunkSize; j++) {
+      temp[j] += pA[i * kChunkSize + j];
+    }
+  }
+
+  for (int j = 0; j < kChunkSize; j++) {
+    *pB += temp[j];
+  }
+}
+
+BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) {
+  VALIDATE(reduce1d_native_rfactor, A, B);
+  for (auto _ : state) {
+    reduce1d_native_rfactor(A, B);
+  }
+}
+
+BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24});
+
+#ifdef USE_AVX2
+
+// x = ( x7, x6, x5, x4, x3, x2, x1, x0 )
+inline float sum_f32x8(__m256 x) {
+  // hiQuad = ( x7, x6, x5, x4 )
+  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
+  // loQuad = ( x3, x2, x1, x0 )
+  const __m128 loQuad = _mm256_castps256_ps128(x);
+  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
+  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
+  // loDual = ( -, -, x1 + x5, x0 + x4 )
+  const __m128 loDual = sumQuad;
+  // hiDual = ( -, -, x3 + x7, x2 + x6 )
+  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
+  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
+  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
+  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
+  const __m128 lo = sumDual;
+  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
+  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
+  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
+  const __m128 sum = _mm_add_ss(lo, hi);
+  return _mm_cvtss_f32(sum);
+}
+
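+// For reference, sum_f32x8 computes the same result as the scalar loop
+// (sketch, viewing x as float[8]):
+//   float s = 0.f;
+//   for (int j = 0; j < 8; j++) s += x[j];
+// but in three shuffle/add steps instead of a serial chain of seven adds.
+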
= 0"); + __m256 t[kTileSize]; + for (int j = 0; j < kTileSize; j++) { + t[j] = _mm256_setzero_ps(); + } + + int tile_count = size / kChunkSize / kTileSize; + for (int i = 0; i < tile_count; i++) { + #pragma unroll + for (int j = 0; j < kTileSize; j++) { + float *p = pA + (i * kTileSize + j) * kChunkSize; + __m256 data = _mm256_loadu_ps(p); + t[j] = _mm256_add_ps(t[j], data); + } + } + + float result = sum_f32x8(t[0]); + for (int j = 1; j < kTileSize; j++) { + result += sum_f32x8(t[j]); + } + *pB = result; +} + +BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) { + VALIDATE(reduce1d_native_tiled, A, B); + for (auto _ : state) { + reduce1d_native_tiled(A, B); + } +} + +BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24}); + +#endif // USE_AVX2 + +BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) { + te::KernelScope ks; + + int M = A.numel(); + + te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); + te::Tensor* BT = te::Reduce( + "reduce_full", + {{1, "N"}}, + te::Sum(), + [&](const te::ExprHandle& n, const te::ExprHandle& m) { + return AP.load(m); + }, + {{M, "M"}}); + + te::LoopNest loop({BT}); + loop.prepareForCodegen(); + te::Stmt* s = loop.root_stmt(); + s = te::IRSimplifier::simplify(s); + auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); + + auto func = [&](at::Tensor& A, at::Tensor& B) { + cg->call({A.data_ptr(), B.data_ptr()}); + }; + + ValidateFunc(func, "reduce1d_te_naive", A, B); + for (auto _ : state) { + func(A, B); + } +} + +BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24}); + +BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) { + te::KernelScope ks; + + int M = A.numel(); + + te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); + te::Tensor* BT = te::Reduce( + "reduce_full", + {{1, "N"}}, + te::Sum(), + [&](const te::ExprHandle& n, const te::ExprHandle& m) { + return AP.load(m); + }, + {{M, "M"}}); + + te::LoopNest loop({BT}); + const int kChunkSize = 8; + + { + auto const& loops = loop.getLoopStmtsFor(BT); + te::For* m = loops[1]; + te::For* mo; + te::For* mi; + te::For* tail; + loop.splitWithTail(m, kChunkSize, &mo, &mi, &tail); + } + + loop.prepareForCodegen(); + te::Stmt* s = loop.root_stmt(); + s = te::IRSimplifier::simplify(s); + auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); + + auto func = [&](at::Tensor& A, at::Tensor& B) { + cg->call({A.data_ptr(), B.data_ptr()}); + }; + + ValidateFunc(func, "reduce1d_te_naive", A, B); + for (auto _ : state) { + func(A, B); + } +} + +BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24}); + +BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) { + te::KernelScope ks; + + int M = A.numel(); + + te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat)); + te::Tensor* BT = te::Reduce( + "reduce_full", + {{1, "N"}}, + te::Sum(), + [&](const te::ExprHandle& n, const te::ExprHandle& m) { + return AP.load(m); + }, + {{M, "M"}}); + + te::LoopNest loop({BT}); + const int kChunkSize = 8; + + { + auto const& loops = loop.getLoopStmtsFor(BT); + te::For* m = loops[1]; + te::For* mo; + te::For* mi; + loop.splitWithMask(m, kChunkSize, &mo, &mi); + } + + loop.prepareForCodegen(); + te::Stmt* s = loop.root_stmt(); + s = te::IRSimplifier::simplify(s); + auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT}); + + auto func = [&](at::Tensor& A, at::Tensor& B) { + cg->call({A.data_ptr(), B.data_ptr()}); + }; + + ValidateFunc(func, "reduce1d_te_naive", A, B); + for (auto _ : state) { + func(A, B); + } +} + +BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 
+BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
+  te::KernelScope ks;
+
+  int M = A.numel();
+  const int kChunkSize = 8;
+  TORCH_CHECK(M % kChunkSize == 0);
+
+  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
+  te::Tensor* BT = te::Reduce(
+      "reduce_full",
+      {},
+      te::Sum(),
+      [&](const te::ExprHandle& m) {
+        return AP.load(m);
+      },
+      {{M, "M"}});
+
+  te::LoopNest loop({BT});
+
+  {
+    auto const& loops = loop.getLoopStmtsFor(BT);
+    TORCH_CHECK(loops.size() == 1);
+    te::For* m = loops[0];
+    te::For* mo;
+    te::For* mi;
+    loop.splitWithMask(m, kChunkSize, &mo, &mi);
+  }
+
+  {
+    auto const& loops = loop.getLoopStmtsFor(BT);
+    TORCH_CHECK(loops.size() == 2);
+    te::For* mo = loops[0];
+    te::For* mi = loops[1];
+    // TODO: rfactor works on the untransformed var set. This is a problem,
+    // since we need to look for the post-split loop to rfactor.
+    loop.rfactor(BT->body(), mi->var());
+  }
+
+  loop.prepareForCodegen();
+  te::Stmt* s = loop.root_stmt();
+  s = te::IRSimplifier::simplify(s);
+  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
+
+  auto func = [&](at::Tensor& A, at::Tensor& B) {
+    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
+  };
+
+  ValidateFunc(func, "reduce1d_te_rfactor_v1", A, B);
+  for (auto _ : state) {
+    func(A, B);
+  }
+}
+
+// TODO: add this back when the problem above is fixed.
+// BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});
+
+// Similar to TeRfactorV1, but the split is constructed manually in the
+// reduce expression instead of via splitWithMask.
+BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV2)(benchmark::State& state) {
+  te::KernelScope ks;
+
+  int M = A.numel();
+  const int kChunkSize = 8;
+  TORCH_CHECK(M % kChunkSize == 0);
+
+  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
+  te::Tensor* BT = te::Reduce(
+      "reduce_full",
+      {},
+      te::Sum(),
+      [&](const te::ExprHandle& mo, const te::ExprHandle& mi) {
+        return AP.load(mo * kChunkSize + mi);
+      },
+      {{M / kChunkSize, "mo"}, {kChunkSize, "mi"}});
+
+  te::LoopNest loop({BT});
+
+  {
+    auto const& loops = loop.getLoopStmtsFor(BT);
+    TORCH_CHECK(loops.size() == 2);
+    te::For* mo = loops[0];
+    te::For* mi = loops[1];
+    loop.rfactor(BT->body(), mi->var());
+  }
+
+  {
+    // Look for the new loop and vectorize it. rfactor doesn't return the
+    // newly added "For*", so resort to a hack to find it.
+    // TODO: make it easier to find the transformed loop after rfactor.
+    auto loops = te::NodeFinder<te::For>::find(loop.root_stmt());
+    TORCH_CHECK(loops.size() == 4);
+    auto mi = loops[2];
+    loop.vectorize(mi);
+  }
+
+  loop.prepareForCodegen();
+  te::Stmt* s = loop.root_stmt();
+  s = te::IRSimplifier::simplify(s);
+  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});
+
+  auto func = [&](at::Tensor& A, at::Tensor& B) {
+    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
+  };
+
+  ValidateFunc(func, "reduce1d_te_rfactor_v2", A, B);
+  for (auto _ : state) {
+    func(A, B);
+  }
+}
+
+BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV2)->Args({1 << 24});
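+
+// To run only these benchmarks (filter flag from Google Benchmark; the
+// binary name comes from the CMake target above, and its location depends
+// on the build setup):
+//   ./tensorexpr_bench --benchmark_filter='Reduce1D'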