Add CPP Full Reduction Benchmarks. (#50193)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/50193

* Supports aten, a native reference implementation, and NNC TE implementations.
* Supports functionality checks against aten, in addition to performance checks.

Test plan:
* After enabling "BUILD_TENSOREXPR_BENCHMARK" in CMakeLists.txt:
* bin/tensorexpr_bench --benchmark_filter=Reduce1D

Measurements on a Broadwell E5-2686 CPU:

Benchmark                              Time          CPU    Iterations
Reduce1D/Torch/16777216             5638547 ns    5638444 ns    119   BYTES=11.902G/s
Reduce1D/Naive/16777216            19308235 ns   19308184 ns     36   BYTES=3.47567G/s
Reduce1D/NativeRfactor/16777216     8433348 ns    8433038 ns     85   BYTES=7.95785G/s
Reduce1D/NativeVector/16777216      5608836 ns    5608727 ns    124   BYTES=11.9651G/s
Reduce1D/NativeTiled/16777216       5550233 ns    5550221 ns    126   BYTES=12.0912G/s
Reduce1D/TeNaive/16777216          21451047 ns   21450752 ns     33   BYTES=3.12851G/s
Reduce1D/TeSplitTail/16777216      23701732 ns   23701229 ns     30   BYTES=2.83145G/s
Reduce1D/TeSplitMask/16777216      23683589 ns   23682978 ns     30   BYTES=2.83363G/s
Reduce1D/TeRfactorV2/16777216       5378019 ns    5377909 ns    131   BYTES=12.4786G/s

Result summary:
* The single-threaded performance of NNC TeRfactorV2 matches and exceeds both aten and the AVX2 naive counterpart.

Follow-up items:
* rfactor does not work well with split.
* There is no multi-threaded implementation yet.
* The "parallel" scheduling primitive is missing; it is no different from what we need for pointwise ops.

Test Plan: Imported from OSS

Reviewed By: bertmaher

Differential Revision: D25821880

Pulled By: zheng-xq

fbshipit-source-id: 8df3f40d1eed8749c8edcaacae5f0544dbf6bed3
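A minimal build-and-run sketch. Only the flag, target, and binary names are taken from the summary above; the surrounding out-of-source CMake flow is assumed:

    cmake -DBUILD_TENSOREXPR_BENCHMARK=ON ..
    cmake --build . --target tensorexpr_bench
    bin/tensorexpr_bench --benchmark_filter=Reduce1D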
This commit is contained in:
parent
88b36230f5
commit
b96a6516a6
2 changed files with 451 additions and 0 deletions
benchmarks/cpp/tensorexpr/CMakeLists.txt

@@ -1,9 +1,18 @@
find_package(AVX)

add_executable(
  tensorexpr_bench
  bench_approx.cpp
  bench_compile.cpp
  bench_fuser_overhead.cpp
  bench_gemm.cpp
  bench_reduce.cpp
  main.cpp)

if(C_AVX2_FOUND)
  message(STATUS "AVX2 compiler support found")
  target_compile_options(tensorexpr_bench PUBLIC -mavx2)
  target_compile_definitions(tensorexpr_bench PUBLIC USE_AVX2)
endif()

target_link_libraries(tensorexpr_bench PRIVATE torch_library benchmark)
benchmarks/cpp/tensorexpr/bench_reduce.cpp (new file, 442 lines)

@@ -0,0 +1,442 @@
#include <benchmark/benchmark.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

#include <immintrin.h>

#include <cmath>   // std::sqrt
#include <sstream> // std::ostringstream

namespace te = torch::jit::tensorexpr;

namespace {
class Reduce1D : public benchmark::Fixture {
 public:
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(1);
    torch::manual_seed(0x12345678);
    M = state.range(0);
    A = torch::randn({M});
    B = torch::zeros({});
  }

  void TearDown(benchmark::State& state) override {
    state.counters["BYTES"] = benchmark::Counter(
        uint64_t(state.iterations()) * M * sizeof(float),
        benchmark::Counter::kIsRate);
  }

  int M;
  at::Tensor A;
  at::Tensor B;
};

} // namespace

BENCHMARK_DEFINE_F(Reduce1D, Torch)(benchmark::State& state) {
  for (auto _ : state) {
    B = torch::sum(A, {0});
  }
}

BENCHMARK_REGISTER_F(Reduce1D, Torch)->Args({1 << 24});
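
// VALIDATE stringizes its first argument (#F) so that failure messages name
// the implementation that diverged from the aten reference.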
#define VALIDATE(F, A, B) ValidateFunc((F), #F, (A), (B))

template <typename Func>
void ValidateFunc(Func func, const std::string& func_name, at::Tensor& A, at::Tensor& B) {
  func(A, B);
  float* pB = B.data_ptr<float>();
  at::Tensor B2 = torch::sum(A, {0});
  float* pB2 = B2.data_ptr<float>();
  int size = A.numel();
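  // Rounding error in a float sum of N random terms behaves like a random
  // walk, growing roughly as sqrt(N) * eps, so scale the tolerance by
  // sqrt(size).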
  float size_sqrt = std::sqrt(size);
  float natural_noise = size_sqrt * 1e-7;
  if (!torch::allclose(B, B2, natural_noise)) {
    std::ostringstream oss;
    oss << func_name << " failed check: " << std::endl;
    oss << "value: " << B << std::endl;
    oss << "reference: " << B2 << std::endl;
    oss << "threshold: " << natural_noise << std::endl;
    throw std::runtime_error(oss.str());
  }
}

static void reduce1d_naive(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  TORCH_CHECK(B.numel() == 1);
  *pB = 0.;
  for (int i = 0; i < size; i++) {
    *pB += pA[i];
  }
}

BENCHMARK_DEFINE_F(Reduce1D, Naive)(benchmark::State& state) {
  VALIDATE(reduce1d_naive, A, B);
  for (auto _ : state) {
    reduce1d_naive(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, Naive)->Args({1 << 24});
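
// reduce1d_native_rfactor mirrors the rfactor schedule by hand: accumulate
// into kChunkSize independent partial sums, then combine them at the end.
// Breaking the single serial dependence chain of the naive loop is what lets
// the compiler vectorize the inner loop.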
static void reduce1d_native_rfactor(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = 16;
  TORCH_CHECK(B.numel() == 1);
  TORCH_CHECK(size % kChunkSize == 0);
  *pB = 0.;
  float temp[kChunkSize];
  for (int j = 0; j < kChunkSize; j++) {
    temp[j] = 0;
  }

  int chunk_count = size / kChunkSize;
  for (int i = 0; i < chunk_count; i++) {
    for (int j = 0; j < kChunkSize; j++) {
      temp[j] += pA[i * kChunkSize + j];
    }
  }

  for (int j = 0; j < kChunkSize; j++) {
    *pB += temp[j];
  }
}

BENCHMARK_DEFINE_F(Reduce1D, NativeRfactor)(benchmark::State& state) {
  VALIDATE(reduce1d_native_rfactor, A, B);
  for (auto _ : state) {
    reduce1d_native_rfactor(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, NativeRfactor)->Args({1 << 24});

#ifdef USE_AVX2

// x = ( x7, x6, x5, x4, x3, x2, x1, x0 )
inline float sum_f32x8(__m256 x) {
  // hiQuad = ( x7, x6, x5, x4 )
  const __m128 hiQuad = _mm256_extractf128_ps(x, 1);
  // loQuad = ( x3, x2, x1, x0 )
  const __m128 loQuad = _mm256_castps256_ps128(x);
  // sumQuad = ( x3 + x7, x2 + x6, x1 + x5, x0 + x4 )
  const __m128 sumQuad = _mm_add_ps(loQuad, hiQuad);
  // loDual = ( -, -, x1 + x5, x0 + x4 )
  const __m128 loDual = sumQuad;
  // hiDual = ( -, -, x3 + x7, x2 + x6 )
  const __m128 hiDual = _mm_movehl_ps(sumQuad, sumQuad);
  // sumDual = ( -, -, x1 + x3 + x5 + x7, x0 + x2 + x4 + x6 )
  const __m128 sumDual = _mm_add_ps(loDual, hiDual);
  // lo = ( -, -, -, x0 + x2 + x4 + x6 )
  const __m128 lo = sumDual;
  // hi = ( -, -, -, x1 + x3 + x5 + x7 )
  const __m128 hi = _mm_shuffle_ps(sumDual, sumDual, 0x1);
  // sum = ( -, -, -, x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 )
  const __m128 sum = _mm_add_ss(lo, hi);
  return _mm_cvtss_f32(sum);
}
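
// reduce1d_native_vector: a single __m256 accumulator; the eight lanes are
// summed horizontally by sum_f32x8 once the loop is done.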
static void reduce1d_native_vector(at::Tensor& A, at::Tensor& B) {
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
  TORCH_CHECK(B.numel() == 1);
  TORCH_CHECK(size % kChunkSize == 0);
  *pB = 0.;
  __m256 temp;
  temp = _mm256_setzero_ps();

  int tile_count = size / kChunkSize;
  for (int i = 0; i < tile_count; i++) {
    __m256 data = _mm256_load_ps(pA + i * kChunkSize);
    temp = _mm256_add_ps(temp, data);
  }

  float result = sum_f32x8(temp);
  *pB = result;
}

BENCHMARK_DEFINE_F(Reduce1D, NativeVector)(benchmark::State& state) {
  VALIDATE(reduce1d_native_vector, A, B);
  for (auto _ : state) {
    reduce1d_native_vector(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, NativeVector)->Args({1 << 24});
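
// reduce1d_native_tiled keeps kTileSize independent __m256 accumulators so
// that several vector adds are in flight at once, hiding _mm256_add_ps
// latency instead of serializing on a single accumulator.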
static void reduce1d_native_tiled(at::Tensor& A, at::Tensor& B) {
  static constexpr int kTileSize = 4;
  float* pA = A.data_ptr<float>();
  float* pB = B.data_ptr<float>();
  int size = A.numel();
  constexpr int kChunkSize = sizeof(__m256) / sizeof(float);
  TORCH_CHECK(B.numel() == 1, "Invalid size: ", B.numel(), " != 1");
  TORCH_CHECK(size % kChunkSize == 0, "Invalid size: ", size, " % ", kChunkSize, " != 0");
  __m256 t[kTileSize];
  for (int j = 0; j < kTileSize; j++) {
    t[j] = _mm256_setzero_ps();
  }

  int tile_count = size / kChunkSize / kTileSize;
  for (int i = 0; i < tile_count; i++) {
#pragma unroll
    for (int j = 0; j < kTileSize; j++) {
      float* p = pA + (i * kTileSize + j) * kChunkSize;
      __m256 data = _mm256_loadu_ps(p);
      t[j] = _mm256_add_ps(t[j], data);
    }
  }

  float result = sum_f32x8(t[0]);
  for (int j = 1; j < kTileSize; j++) {
    result += sum_f32x8(t[j]);
  }
  *pB = result;
}

BENCHMARK_DEFINE_F(Reduce1D, NativeTiled)(benchmark::State& state) {
  VALIDATE(reduce1d_native_tiled, A, B);
  for (auto _ : state) {
    reduce1d_native_tiled(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, NativeTiled)->Args({1 << 24});

#endif // USE_AVX2
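
// TeNaive builds the same reduction with the NNC tensor-expression IR: a
// single-element output domain ({1, "N"}) reduced over {M, "M"}, compiled
// through the LLVM codegen with no scheduling transforms applied.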
BENCHMARK_DEFINE_F(Reduce1D, TeNaive)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {{1, "N"}},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});
  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_naive", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeNaive)->Args({1 << 24});
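
// TeSplitTail adds one scheduling transform on top of TeNaive: the reduction
// loop is split by kChunkSize, with leftover iterations peeled into an
// explicit tail loop.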
BENCHMARK_DEFINE_F(Reduce1D, TeSplitTail)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {{1, "N"}},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});
  const int kChunkSize = 8;

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    te::For* m = loops[1];
    te::For* mo;
    te::For* mi;
    te::For* tail;
    loop.splitWithTail(m, kChunkSize, &mo, &mi, &tail);
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_split_tail", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeSplitTail)->Args({1 << 24});
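
// TeSplitMask is identical except that splitWithMask replaces splitWithTail:
// remainder iterations are guarded by a mask inside the split body rather
// than peeled into a separate tail loop.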
BENCHMARK_DEFINE_F(Reduce1D, TeSplitMask)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {{1, "N"}},
      te::Sum(),
      [&](const te::ExprHandle& n, const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});
  const int kChunkSize = 8;

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    te::For* m = loops[1];
    te::For* mo;
    te::For* mi;
    loop.splitWithMask(m, kChunkSize, &mo, &mi);
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_split_mask", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeSplitMask)->Args({1 << 24});

BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV1)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();
  const int kChunkSize = 8;
  TORCH_CHECK(M % kChunkSize == 0);

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {},
      te::Sum(),
      [&](const te::ExprHandle& m) {
        return AP.load(m);
      },
      {{M, "M"}});

  te::LoopNest loop({BT});

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    TORCH_CHECK(loops.size() == 1);
    te::For* m = loops[0];
    te::For* mo;
    te::For* mi;
    loop.splitWithMask(m, kChunkSize, &mo, &mi);
  }

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    TORCH_CHECK(loops.size() == 2);
    te::For* mo = loops[0];
    te::For* mi = loops[1];
    // TODO: rfactor works on the untransformed var set. This is a problem,
    // since we need to look for the post-split loop to rfactor.
    loop.rfactor(BT->body(), mi->var());
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_rfactor_v1", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

// TODO: add this back when the problem is fixed.
// BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV1)->Args({1 << 24});

// Similar to TeRfactorV1, but with the split expression constructed manually.
BENCHMARK_DEFINE_F(Reduce1D, TeRfactorV2)(benchmark::State& state) {
  te::KernelScope ks;

  int M = A.numel();
  const int kChunkSize = 8;
  TORCH_CHECK(M % kChunkSize == 0);

  te::Placeholder AP(te::BufHandle("A", {M}, te::kFloat));
  te::Tensor* BT = te::Reduce(
      "reduce_full",
      {},
      te::Sum(),
      [&](const te::ExprHandle& mo, const te::ExprHandle& mi) {
        return AP.load(mo * kChunkSize + mi);
      },
      {{M / kChunkSize, "mo"}, {kChunkSize, "mi"}});

  te::LoopNest loop({BT});

  {
    auto const& loops = loop.getLoopStmtsFor(BT);
    TORCH_CHECK(loops.size() == 2);
    te::For* mo = loops[0];
    te::For* mi = loops[1];
    loop.rfactor(BT->body(), mi->var());
  }

  {
    // Look for the new For and vectorize it. rfactor didn't return the newly
    // added "For*", so resort to a hack to find it.
    // TODO: make it easier to find the transformed loop after rfactor.
    auto loops = te::NodeFinder<te::For>::find(loop.root_stmt());
    TORCH_CHECK(loops.size() == 4);
    auto mi = loops[2];
    loop.vectorize(mi);
  }

  loop.prepareForCodegen();
  te::Stmt* s = loop.root_stmt();
  s = te::IRSimplifier::simplify(s);
  auto cg = CreateCodeGen("llvm_codegen", s, {AP, BT});

  auto func = [&](at::Tensor& A, at::Tensor& B) {
    cg->call({A.data_ptr<float>(), B.data_ptr<float>()});
  };

  ValidateFunc(func, "reduce1d_te_rfactor_v2", A, B);
  for (auto _ : state) {
    func(A, B);
  }
}

BENCHMARK_REGISTER_F(Reduce1D, TeRfactorV2)->Args({1 << 24});