Optimize GatherElements further, add threshold for parallelizing Scaler. (#5579)

* Optimize GatherElements more.

* Optimize GatherElements further, add threshold for parallelizing Scaler.

* Add basic tests to exercise the parallel path
This commit is contained in:
Pranav Sharma 2020-10-24 12:38:31 -07:00 committed by GitHub
parent 3f06286154
commit 3f3b202e36
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 131 additions and 53 deletions

View file

@ -60,6 +60,8 @@ ONNX_CPU_OPERATOR_TYPED_ML_KERNEL(
KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<int32_t>()).MayInplace(0, 0),
ScalerOp<int32_t>);
// Number of input elements at or above which ScalerOp<T>::Compute hands the per-element
// work to concurrency::ThreadPool::TryBatchParallelFor; smaller inputs are processed
// with a plain sequential loop. The value is arbitrary and has not been tuned yet.
static constexpr int kParallelizationThreshold = 10 * 1000;
template <typename T>
ScalerOp<T>::ScalerOp(const OpKernelInfo& info) : OpKernel(info),
scale_(info.GetAttrsOrDefault<float>("scale")),
@ -84,29 +86,27 @@ common::Status ScalerOp<T>::Compute(OpKernelContext* context) const {
size_t x_size = x_shape.Size();
int64_t stride = x_dims.size() == 1 ? x_dims[0] : x_dims[1];
auto* ttp = context->GetOperatorThreadPool();
auto num_threads = std::min<int>(concurrency::ThreadPool::DegreeOfParallelism(ttp), static_cast<int>(x_size));
auto conditional_batch_call = [ttp, x_size](std::function<void(ptrdiff_t)> f) {
if (x_size < kParallelizationThreshold) { // TODO: tune this, arbitrary threshold
for (size_t i = 0; i < x_size; ++i) {
f(i);
}
} else {
concurrency::ThreadPool::TryBatchParallelFor(ttp, x_size, f, 0);
}
};
if (static_cast<int64_t>(offset_.size()) == stride &&
static_cast<int64_t>(scale_.size()) == stride) {
concurrency::ThreadPool::TrySimpleParallelFor(
ttp,
num_threads,
[this, num_threads, y_data, x_data, stride, x_size](ptrdiff_t batch_num) {
auto work = concurrency::ThreadPool::PartitionWork(batch_num, num_threads, x_size);
for (auto i = work.start; i < work.end; ++i) {
y_data[i] = static_cast<float>((x_data[i] - offset_[i % stride]) * scale_[i % stride]);
}
});
auto fn = [this, y_data, x_data, stride](ptrdiff_t i) {
y_data[i] = static_cast<float>((x_data[i] - offset_[i % stride]) * scale_[i % stride]);
};
conditional_batch_call(fn);
} else if (offset_.size() == 1 && scale_.size() == 1) {
concurrency::ThreadPool::TrySimpleParallelFor(
ttp,
num_threads,
[this, num_threads, y_data, x_data, x_size](ptrdiff_t batch_num) {
auto work = concurrency::ThreadPool::PartitionWork(batch_num, num_threads, x_size);
for (auto i = work.start; i < work.end; ++i) {
y_data[i] = static_cast<float>((x_data[i] - offset_[0]) * scale_[0]);
}
});
auto fn = [this, y_data, x_data](ptrdiff_t i) {
y_data[i] = static_cast<float>((x_data[i] - offset_[0]) * scale_[0]);
};
conditional_batch_call(fn);
} else {
std::ostringstream err_msg;
err_msg << "Either both scale and offset can be of feature size (" << stride << ") or 1";

View file

@ -25,6 +25,8 @@ ONNX_CPU_OPERATOR_KERNEL(
DataTypeImpl::GetTensorType<int64_t>()}),
GatherElements);
// Inner-dimension length at or above which core_impl processes a chunk through
// concurrency::ThreadPool::TryBatchParallelFor; shorter inner dimensions are handled
// with a plain sequential loop. The value is arbitrary and has not been tuned yet.
static constexpr int kParallelizationThreshold = 10 * 1000;
// Some helpers needed for GatherElements op -
// The following method computes the offset in the flattened array
@ -87,7 +89,8 @@ static inline void increment_over_inner_dim(std::vector<int64_t>& current_dims,
}
template <typename Tin>
static inline int64_t GetNegativeIndexAdjustedValue(const Tin* indices_data, Tin index, int64_t axis, const TensorShape& input_shape) {
static inline int64_t GetNegativeIndexAdjustedValue(const Tin* indices_data, Tin index, int64_t axis,
const TensorShape& input_shape) {
int64_t retval = -1;
if (indices_data[index] < 0) {
retval = static_cast<int64_t>(indices_data[index] + input_shape[axis]);
@ -105,7 +108,7 @@ static inline int64_t GetNegativeIndexAdjustedValue(const Tin* indices_data, Tin
#endif
template <bool is_string, typename T, typename Tin>
static void core_impl(const Tensor* input_tensor, const Tensor* indices_tensor,
Tensor* output_tensor, int64_t axis) {
Tensor* output_tensor, int64_t axis, concurrency::ThreadPool* ttp) {
// get pointer to input data
// optimizer will remove the redundant if/else block based on 'is_string' template parameter
const T* input_data = nullptr;
@ -135,11 +138,15 @@ static void core_impl(const Tensor* input_tensor, const Tensor* indices_tensor,
auto num_elements = indices_tensor->Shape().Size();
int64_t lower_index_limit = -input_shape[axis];
int64_t upper_index_limit = input_shape[axis] - 1;
for (int64_t i = 0; i < num_elements; ++i) {
auto validation_fn = [indices_data, lower_index_limit, upper_index_limit](ptrdiff_t i) {
auto indices_val = indices_data[i];
if (indices_val < lower_index_limit || indices_val > upper_index_limit)
ORT_THROW("GatherElements op: Value in indices must be within bounds [",
lower_index_limit, " , ", upper_index_limit, "]. Actual value is ", indices_val);
};
for (int64_t i = 0; i < num_elements; ++i) { // TODO: parallelize this? didn't give any benefit in my tests
validation_fn(i);
}
int64_t num_inner_dim = calculate_num_inner_dim(indices_shape);
@ -147,11 +154,21 @@ static void core_impl(const Tensor* input_tensor, const Tensor* indices_tensor,
bool processing_inner_dim = (axis == input_rank - 1) ? true : false;
int64_t base_offset = 0;
Tin indices_counter = -1;
int64_t output_counter = -1;
Tin indices_counter = 0;
size_t element_size = input_tensor->DataType()->Size();
std::vector<int64_t> process_dims(input_rank, 0);
int64_t output_counter = 0;
auto conditional_batch_call = [ttp, inner_dim_size](std::function<void(ptrdiff_t)> f) {
if (inner_dim_size < kParallelizationThreshold) { // TODO: tune this, arbitrary threshold
for (int64_t i = 0; i < inner_dim_size; ++i) {
f(i);
}
} else {
concurrency::ThreadPool::TryBatchParallelFor(ttp, inner_dim_size, f, 0);
}
};
if (!processing_inner_dim) {
while (num_inner_dim-- != 0) {
@ -160,22 +177,27 @@ static void core_impl(const Tensor* input_tensor, const Tensor* indices_tensor,
// process 1 chunk of 'inner dimension' length
// optimizer will remove the redundant if/else block based on 'is_string' template parameter
if (is_string) {
for (int64_t i = 0; i < inner_dim_size; ++i) {
output_data[++output_counter] =
auto fn = [input_data, output_data, base_offset, input_shape_pitches,
indices_data, indices_counter, axis, input_shape, output_counter](ptrdiff_t i) {
output_data[i + output_counter] =
input_data[base_offset +
(GetNegativeIndexAdjustedValue<Tin>(indices_data, ++indices_counter, axis, input_shape) *
(GetNegativeIndexAdjustedValue<Tin>(indices_data, static_cast<Tin>(i) + indices_counter, axis, input_shape) *
input_shape_pitches[axis]) +
i];
}
};
conditional_batch_call(fn);
output_counter += inner_dim_size;
} else {
for (int64_t i = 0; i < inner_dim_size; ++i) {
// optimizer will remove the redundant if/else block based on 'is_string' template parameter
memcpy(output_data,
input_data + (base_offset + (GetNegativeIndexAdjustedValue<Tin>(indices_data, ++indices_counter, axis, input_shape) * input_shape_pitches[axis]) + i) * element_size,
auto fn = [input_data, output_data, base_offset, input_shape_pitches, element_size,
indices_data, indices_counter, axis, input_shape](ptrdiff_t i) {
memcpy(output_data + (i * element_size),
input_data + (base_offset + (GetNegativeIndexAdjustedValue<Tin>(indices_data, static_cast<Tin>(i) + indices_counter, axis, input_shape) * input_shape_pitches[axis]) + i) * element_size,
element_size);
output_data += element_size;
}
};
conditional_batch_call(fn);
output_data += inner_dim_size * element_size;
}
indices_counter += static_cast<Tin>(inner_dim_size);
increment_over_inner_dim(process_dims, indices_shape);
}
}
@ -185,27 +207,31 @@ static void core_impl(const Tensor* input_tensor, const Tensor* indices_tensor,
base_offset = compute_base_offset(process_dims, input_shape_pitches, axis);
// process 1 chunk of 'inner dimension' length
// optimizer will remove the redundant if/else block based on 'is_string' template parameter
if (is_string) {
for (int64_t i = 0; i < inner_dim_size; ++i) {
auto fn = [input_data, output_data, base_offset,
indices_data, indices_counter, axis, input_shape, output_counter](ptrdiff_t i) {
// for innermost axis, input_shape_pitches[axis] = 1 (so no need to multiply)
output_data[++output_counter] =
output_data[i + output_counter] =
input_data[base_offset +
GetNegativeIndexAdjustedValue<Tin>(indices_data, ++indices_counter, axis, input_shape)];
}
GetNegativeIndexAdjustedValue<Tin>(indices_data, static_cast<Tin>(i) + indices_counter, axis, input_shape)];
};
conditional_batch_call(fn);
output_counter += inner_dim_size;
} else {
for (int64_t i = 0; i < inner_dim_size; ++i) {
// for innermost axis, input_shape_pitches[axis] = 1 (so no need to multiply)
// optimizer will remove the redundant if/else block based on 'is_string' template parameter
memcpy(output_data,
// for innermost axis, input_shape_pitches[axis] = 1 (so no need to multiply)
auto fn = [input_data, output_data, base_offset, element_size,
indices_data, indices_counter, axis, input_shape](ptrdiff_t i) {
memcpy(output_data + (i * element_size),
input_data + (base_offset +
GetNegativeIndexAdjustedValue<Tin>(indices_data, ++indices_counter, axis, input_shape)) *
GetNegativeIndexAdjustedValue<Tin>(indices_data, static_cast<Tin>(i) + indices_counter, axis, input_shape)) *
element_size,
element_size);
output_data += element_size;
}
};
conditional_batch_call(fn);
output_data += inner_dim_size * element_size;
}
indices_counter += static_cast<Tin>(inner_dim_size);
increment_over_inner_dim(process_dims, indices_shape);
}
}
@ -272,16 +298,17 @@ Status GatherElements::Compute(OpKernelContext* context) const {
if (indices_shape.Size() == 0)
return Status::OK();
auto* ttp = context->GetOperatorThreadPool();
if (input_tensor->IsDataTypeString()) {
if (indices_tensor->IsDataType<int32_t>())
core_impl<true, std::string, int32_t>(input_tensor, indices_tensor, output_tensor, axis);
core_impl<true, std::string, int32_t>(input_tensor, indices_tensor, output_tensor, axis, ttp);
else
core_impl<true, std::string, int64_t>(input_tensor, indices_tensor, output_tensor, axis);
core_impl<true, std::string, int64_t>(input_tensor, indices_tensor, output_tensor, axis, ttp);
} else {
if (indices_tensor->IsDataType<int32_t>())
core_impl<false, int8_t, int32_t>(input_tensor, indices_tensor, output_tensor, axis);
core_impl<false, int8_t, int32_t>(input_tensor, indices_tensor, output_tensor, axis, ttp);
else
core_impl<false, int8_t, int64_t>(input_tensor, indices_tensor, output_tensor, axis);
core_impl<false, int8_t, int64_t>(input_tensor, indices_tensor, output_tensor, axis, ttp);
}
return Status::OK();

View file

@ -3,19 +3,29 @@
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"
using namespace std;
namespace onnxruntime {
namespace test {
template <typename T>
void TestScalar() {
void TestScalar(bool use_big_input = false) {
OpTester test("Scaler", 1, onnxruntime::kMLDomain);
vector<float> scale{3.f, -4.f, 3.0f};
vector<float> offset{4.8f, -0.5f, 77.0f};
test.AddAttribute("scale", scale);
test.AddAttribute("offset", offset);
vector<T> input{1, -2, 3, 4, 5, -6};
vector<int64_t> dims{2, 3};
vector<T> input;
vector<int64_t> dims;
if (!use_big_input) {
input = vector<T>{1, -2, 3, 4, 5, -6};
dims = {2, 3};
} else {
input.resize(15 * 1000); // must be >= kParallelizationThreshold in scaler.cc
std::iota(std::begin(input), std::end(input), static_cast<T>(1));
dims = {5000, 3};
}
// prepare expected output
vector<float> expected_output;
@ -33,6 +43,7 @@ TEST(MLOpTest, ScalerOp) {
TestScalar<double>();
TestScalar<int64_t>();
TestScalar<int32_t>();
TestScalar<float>(true); // use big input
}
TEST(MLOpTest, ScalerOpScaleOffsetSize1) {
@ -55,5 +66,27 @@ TEST(MLOpTest, ScalerOpScaleOffsetSize1) {
test.Run();
}
// Covers the TryBatchParallelFor path of Scaler with a single scale/offset pair:
// the input holds 15,000 elements, which is large enough to skip the sequential loop.
TEST(MLOpTest, ScalerOpScaleOffsetSize1BigInput) {
  // must be >= kParallelizationThreshold in scaler.cc
  constexpr size_t kInputSize = 15 * 1000;

  const vector<float> scale{3.f};
  const vector<float> offset{4.8f};
  const vector<int64_t> dims{3, 5000};

  // input values are 1, 2, 3, ...
  vector<float> input(kInputSize);
  std::iota(input.begin(), input.end(), 1.0f);

  // reference result: the same (x - offset) * scale applied to every element
  vector<float> expected_output;
  for (const float value : input) {
    expected_output.push_back((value - offset[0]) * scale[0]);
  }

  OpTester test("Scaler", 1, onnxruntime::kMLDomain);
  test.AddAttribute("scale", scale);
  test.AddAttribute("offset", offset);
  test.AddInput<float>("X", dims, input);
  test.AddOutput<float>("Y", dims, expected_output);
  test.Run();
}
} // namespace test
} // namespace onnxruntime

View file

@ -315,5 +315,23 @@ TEST(GatherElementsOpTest, string) {
RunTypedTest<std::string>();
}
TEST(GatherElementsOpTest, BigIndices) {
  // must be >= kParallelizationThreshold in gather_elements.cc
  const int kNumIndices = 10 * 1000;

  // data has shape {2, kNumIndices} and holds 0, 1, 2, ... in row-major order
  std::vector<float> data(2 * kNumIndices);
  std::iota(data.begin(), data.end(), 0.f);

  // int32_t indices - axis 0; every index picks row 0
  const std::vector<int32_t> row_selectors(kNumIndices, 0);

  // gathering row 0 for every column yields the first kNumIndices values of data
  const std::vector<float> expected(data.begin(), data.begin() + kNumIndices);

  OpTester tester("GatherElements", 11);
  tester.AddAttribute<int64_t>("axis", 0LL);
  tester.AddInput<float>("data", {2, kNumIndices}, data);
  tester.AddInput<int32_t>("indices", {1, kNumIndices}, row_selectors);
  tester.AddOutput<float>("output", {1, kNumIndices}, expected);
  tester.Run();
}
} // namespace test
} // namespace onnxruntime