A few performance improvements coming out of ssd_mobilenet and ssd_resnet34 analysis (#1578)

* A few performance improvements: - Make the iteration in NonZero more efficient by using a raw pointer and simplifying the increment logic - add another unit test to check the new logic works with 3 dimensional tensor - gains about 2% for ssd_mobilenet - Avoid floating point operations on each iteration on Concat - about 0.5% for ssd_mobilenet and ssd_resnet34 - Put common case first in ExecutionFrame::AllocateAsPerAllocationPlan to avoid unnecessary call to IsSparseTensor - about 0.05% for ssd_mobilenet - Minor tweak to put some ctors in the TensorShape header so they can be inlined more easily
2026-07-20 19:12:24 +00:00 · 2019-08-08 07:20:00 +10:00 · 2019-08-08 07:20:00 +10:00 · 6e430c0526
commit 6e430c0526
parent a443b013dd
6 changed files with 115 additions and 92 deletions
--- a/include/onnxruntime/core/framework/tensor_shape.h
+++ b/include/onnxruntime/core/framework/tensor_shape.h
@ -34,13 +34,14 @@ class TensorShape : private std::vector<int64_t> {
  TensorShape(TensorShape&& /*other*/) = default;
  TensorShape& operator=(TensorShape&& /*other*/) = default;

+  TensorShape(const std::vector<int64_t>& dims) : std::vector<int64_t>(dims) {}
+
+  TensorShape(std::vector<int64_t>&& dims) : std::vector<int64_t>(std::move(dims)) {}
+
+  TensorShape(const std::initializer_list<int64_t>& dims) : std::vector<int64_t>(dims) {}
+
  TensorShape(const int64_t* dimension_sizes, size_t dimension_count);

-  TensorShape(const std::vector<int64_t>& dims);
-  TensorShape(std::vector<int64_t>&& dims);
-
-  TensorShape(const std::initializer_list<int64_t>& dims);
-
  TensorShape(const std::vector<int64_t>& dims, size_t start, size_t end);

  /**
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@ -402,54 +402,54 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_

  const auto& alloc_info = per_alloc_plan.location;
  const auto* ml_type = per_alloc_plan.value_type;
-  if (ml_type == nullptr)
+  if (ml_type == nullptr) {
    return Status(
        ONNXRUNTIME, INVALID_ARGUMENT,
        "Tried to allocate without valid type information, ort_value index=" + std::to_string(ort_value_index));
+  }

-  if (ml_type->IsSparseTensorType()) {
+  if (ml_type->IsTensorType()) {
+    ORT_ENFORCE(shape, "Allocation of tensor types requires a shape.");
+
+    // tensors
+    const auto* ml_data_type = static_cast<const TensorTypeBase*>(ml_type)->GetElementType();
+
+    AllocKind alloc_kind = per_alloc_plan.alloc_kind;
+    switch (alloc_kind) {
+      // Right now for kAllocate and kAllocateOutput we are using same approach.
+      // In the future we may want to have different way to handle it.
+      case AllocKind::kAllocateOutput:
+      case AllocKind::kAllocate: {
+        ORT_RETURN_IF_ERROR(AllocateMLValueTensorSelfOwnBuffer(ort_value, ort_value_index, ml_data_type, alloc_info,
+                                                               *shape, per_alloc_plan.create_fence_if_async));
+        break;
+      }
+      case AllocKind::kReuse: {
+        int reuse_mlvalue_index = per_alloc_plan.reused_buffer;
+        ORT_RETURN_IF_ERROR(AllocateMLValueTensorPreAllocateBuffer(
+            ort_value, reuse_mlvalue_index, ml_data_type, alloc_info, *shape, per_alloc_plan.create_fence_if_async));
+        break;
+      }
+      case AllocKind::kShare: {
+        int reuse_mlvalue_index = per_alloc_plan.reused_buffer;
+        // copy at the OrtValue level so the shared_ptr for the data is shared between the two OrtValue instances
+        ort_value = GetMutableMLValue(reuse_mlvalue_index);
+        break;
+      }
+      default: {
+        std::ostringstream ostr;
+        ostr << "Invalid allocation kind: " << static_cast<std::underlying_type<AllocKind>::type>(alloc_kind);
+        return Status(ONNXRUNTIME, FAIL, ostr.str());
+      }
+    }
+
+    return Status::OK();
+  } else if (ml_type->IsSparseTensorType()) {
    return AllocateSparseTensor(ort_value, *ml_type, GetAllocator(alloc_info),
                                *shape, nnz, per_alloc_plan.create_fence_if_async, session_state_);
-  }
-  if (!ml_type->IsTensorType()) {
+  } else {
    return AllocateTraditionalMLValue(ort_value, *static_cast<const NonTensorTypeBase*>(ml_type));
  }
-
-  ORT_ENFORCE(shape, "Allocation of tensor types requires a shape.");
-
-  // tensors
-  const auto* ml_data_type = static_cast<const TensorTypeBase*>(ml_type)->GetElementType();
-
-  AllocKind alloc_kind = per_alloc_plan.alloc_kind;
-  switch (alloc_kind) {
-    // Right now for kAllocate and kAllocateOutput we are using same approach.
-    // In the future we may want to have different way to handle it.
-    case AllocKind::kAllocateOutput:
-    case AllocKind::kAllocate: {
-      ORT_RETURN_IF_ERROR(AllocateMLValueTensorSelfOwnBuffer(ort_value, ort_value_index, ml_data_type, alloc_info,
-                                                             *shape, per_alloc_plan.create_fence_if_async));
-      break;
-    }
-    case AllocKind::kReuse: {
-      int reuse_mlvalue_index = per_alloc_plan.reused_buffer;
-      ORT_RETURN_IF_ERROR(AllocateMLValueTensorPreAllocateBuffer(
-          ort_value, reuse_mlvalue_index, ml_data_type, alloc_info, *shape, per_alloc_plan.create_fence_if_async));
-      break;
-    }
-    case AllocKind::kShare: {
-      int reuse_mlvalue_index = per_alloc_plan.reused_buffer;
-      // copy at the OrtValue level so the shared_ptr for the data is shared between the two OrtValue instances
-      ort_value = GetMutableMLValue(reuse_mlvalue_index);
-      break;
-    }
-    default: {
-      std::ostringstream ostr;
-      ostr << "Invalid allocation kind: " << static_cast<std::underlying_type<AllocKind>::type>(alloc_kind);
-      return Status(ONNXRUNTIME, FAIL, ostr.str());
-    }
-  }
-
-  return Status::OK();
 }

 AllocatorPtr ExecutionFrame::GetAllocatorImpl(const OrtAllocatorInfo& info) const {
--- a/onnxruntime/core/framework/tensor_shape.cc
+++ b/onnxruntime/core/framework/tensor_shape.cc
@ -8,16 +8,8 @@

 namespace onnxruntime {

-TensorShape::TensorShape(const std::vector<int64_t>& dims) : std::vector<int64_t>(dims) {
-}
-
-TensorShape::TensorShape(std::vector<int64_t>&& dims) : std::vector<int64_t>(std::move(dims)) {
-}
-
-TensorShape::TensorShape(const std::initializer_list<int64_t>& dims) : std::vector<int64_t>(dims) {
-}
-
-TensorShape::TensorShape(const int64_t* dimension_sizes, size_t dimension_count) : std::vector<int64_t>(dimension_count) {
+TensorShape::TensorShape(const int64_t* dimension_sizes, size_t dimension_count)
+    : std::vector<int64_t>(dimension_count) {
  for (size_t i = 0; i < dimension_count; ++i) {
    (*this)[i] = dimension_sizes[i];
  }
--- a/onnxruntime/core/providers/cpu/tensor/concat.cc
+++ b/onnxruntime/core/providers/cpu/tensor/concat.cc
@ -34,16 +34,17 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
    auto& inputs_n = *tensor_pointer;
    const auto& inputs_n_dims = inputs_n.Shape().GetDims();
    const size_t inputs_n_rank = inputs_n_dims.size();
-    ORT_ENFORCE(inputs_n_rank == inputs_0_rank, "Ranks of input data are different, cannot concatenate them, "
-                "expected rank: ", std::to_string(inputs_0_rank), " got: ", std::to_string(inputs_n_rank));
+    ORT_ENFORCE(inputs_n_rank == inputs_0_rank,
+                "Ranks of input data are different, cannot concatenate them. expected rank: ",
+                inputs_0_rank, " got: ", inputs_n_rank);
    // Ensure all the other (non-concat) axes match
    for (size_t axis_index = 0; axis_index < inputs_0_rank; ++axis_index) {
      num_elements *= inputs_n_dims[axis_index];
      if (axis_index == p.axis)
        continue;
      ORT_RETURN_IF_NOT(inputs_n_dims[axis_index] == inputs_0_dims[axis_index],
-                        "Non concat axis dimensions must match: Axis ", 
-                        axis_index, " has mismatched dimensions of ", inputs_n_dims[axis_index], 
+                        "Non concat axis dimensions must match: Axis ",
+                        axis_index, " has mismatched dimensions of ", inputs_n_dims[axis_index],
                        " and ", inputs_0_dims[axis_index]);
    }
    tensor_num_elements[index] = num_elements;
@ -58,7 +59,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep

  // Calculate the shape of the output tensor
  std::vector<int64_t> dims(inputs_0_rank);
-  size_t num_elements = 1; // cache size of the first input along the way
+  size_t num_elements = 1;  // cache size of the first input along the way
  for (size_t dimension_index = 0; dimension_index < inputs_0_rank; dimension_index++) {
    dims[dimension_index] = inputs_0_dims[dimension_index];
    num_elements *= inputs_0_dims[dimension_index];
@ -66,7 +67,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
  tensor_num_elements[0] = num_elements;
  dims[p.axis] = concat_axis_size;
  TensorShape output_shape(dims);
- 
+
  auto& concat_result = *ctx->Output(0, output_shape);
  p.output_tensor = &concat_result;
  p.output_num_elements = output_shape.Size();
@ -75,7 +76,7 @@ Status ConcatBase::PrepareForCompute(OpKernelContext* ctx, int input_count, Prep
  // there is no need to proceed further
  if (p.output_num_elements == 0)
    return Status::OK();
-    
+
  // The output_axis_pitch is the number of elements to add to move to the next split axis in the output
  p.output_axis_pitch = 1;
  for (size_t i = inputs_0_rank; i-- > p.axis;) p.output_axis_pitch *= dims[i];
@ -110,7 +111,7 @@ Status Concat::Compute(OpKernelContext* ctx) const {

  auto is_string_type = ctx->Input<Tensor>(0)->DataType() == DataTypeImpl::GetType<std::string>();

-  int64_t output_offset = 0;
+  int64_t initial_output_offset = 0;  // initial offset for each input
  auto element_bytes = p.output_tensor->DataType()->Size();
  for (int input_index = 0; input_index < input_count; input_index++) {
    const auto& prep = p.inputs[input_index];
@ -124,19 +125,29 @@ Status Concat::Compute(OpKernelContext* ctx) const {

    // Copy the data across. For every 'input_axis_pitch' values copied, we move over by the 'output_axis_pitch'
    uint8_t* output = static_cast<uint8_t*>(p.output_tensor->MutableDataRaw());
-    for (size_t idxCopy = 0; idxCopy < input_size / input_axis_pitch; ++idxCopy) {
+    int64_t cur_out_offset = 0;
+    int64_t cur_in_offset = 0;
+    for (size_t idx_copy = 0, end = input_size / input_axis_pitch; idx_copy < end; ++idx_copy) {
      if (is_string_type) {
-        for (int idxItem = 0; idxItem < input_axis_pitch; ++idxItem)
-          reinterpret_cast<std::string*>(output)[output_offset + idxCopy * p.output_axis_pitch + idxItem] =
-              reinterpret_cast<const std::string*>(input)[idxCopy * input_axis_pitch + idxItem];
-      } else
+        size_t out = initial_output_offset + cur_out_offset;
+        for (int idx_item = 0; idx_item < input_axis_pitch; ++idx_item) {
+          reinterpret_cast<std::string*>(output)[out + idx_item] =
+              reinterpret_cast<const std::string*>(input)[cur_in_offset + idx_item];
+        }
+      } else {
        memcpy(
-            output + (output_offset + idxCopy * p.output_axis_pitch) * element_bytes,
-            input + idxCopy * input_axis_pitch * element_bytes,
+            output + (initial_output_offset + cur_out_offset) * element_bytes,
+            input + cur_in_offset * element_bytes,
            input_axis_pitch * element_bytes);
+      }
+
+      cur_out_offset += p.output_axis_pitch;
+      cur_in_offset += input_axis_pitch;
    }
-    output_offset += input_axis_pitch;
+
+    initial_output_offset += input_axis_pitch;
  }
+
  return Status::OK();
 }

--- a/onnxruntime/core/providers/cpu/tensor/nonzero_op.cc
+++ b/onnxruntime/core/providers/cpu/tensor/nonzero_op.cc
@ -40,24 +40,6 @@ NONZERO_TYPED_KERNEL(float)
 #undef NONZERO_TYPED_KERNEL_WITH_TYPE_NAME
 #undef NONZERO_TYPED_KERNEL

-namespace {
-void IncrementCoordinate(const TensorShape& shape, std::vector<int64_t>* coordinate) {
-  assert(coordinate->size() == shape.NumDimensions());
-
-  size_t i = 0;
-  const size_t i_end = coordinate->size();
-  for (; i < i_end; ++i) {
-    const size_t i_from_back = i_end - i - 1;
-    if ((*coordinate)[i_from_back] != shape[i_from_back] - 1) break;
-    (*coordinate)[i_from_back] = 0;
-  }
-
-  if (i < i_end) {
-    ++(*coordinate)[i_end - i - 1];
-  }
-}
-}  // namespace
-
 template <typename T>
 Status NonZero<T>::Compute(OpKernelContext* context) const {
  const auto X = context->Input<Tensor>(0);
@ -71,19 +53,37 @@ Status NonZero<T>::Compute(OpKernelContext* context) const {
  // reserve enough space for indices for every element of X
  non_zero_indices_buffer.reserve(X_shape.Size() * coordinate_size);

+  const T* data = X->Data<T>();
+
  if (X_shape.IsScalar()) {
-    const T& value = *(X->Data<T>());
+    const T& value = *data;
    if (value != T{}) {
      non_zero_indices_buffer.push_back(0);
    }
  } else {
    std::vector<int64_t> coordinate(coordinate_size, 0);
-    for (const T& value : X->DataAsSpan<T>()) {
+
+    // as we iterate the entries, increment the coordinate for the current entry
+    // e.g. if shape is {2,2}, we start with 0,0 increment to 0,1 increment to 1,0 and finally 1,1
+    auto increment_coordinate = [&coordinate, &coordinate_size, &X_shape]() {
+      for (int64_t idx = coordinate_size - 1; idx >= 0; --idx) {
+        int64_t& cur_coord = coordinate[idx];
+        if (cur_coord != X_shape[idx] - 1) {
+          ++cur_coord;
+          break;
+        }
+        cur_coord = 0;
+      }
+    };
+
+    for (size_t i = 0, end = X_shape.Size(); i < end; ++i) {
+      const T& value = *data++;
      if (value != T{}) {
        non_zero_indices_buffer.insert(non_zero_indices_buffer.end(),
                                       coordinate.begin(), coordinate.end());
      }
-      IncrementCoordinate(X_shape, &coordinate);
+
+      increment_coordinate();
    }
  }

--- a/onnxruntime/test/providers/cpu/tensor/nonzero_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/nonzero_op_test.cc
@ -48,6 +48,25 @@ TEST(NonZeroOpTest, BasicBool) {
  test.Run();
 }

+TEST(NonZeroOpTest, ThreeDims) {
+  OpTester test{kOpName, kOpVersion};
+
+  std::vector<int64_t> X_dims{2, 2, 2};
+  std::vector<int64_t> X{0, 1,
+                         1, 0,
+
+                         1, 0,
+                         1, 0};
+  test.AddInput<int64_t>("X", X_dims, std::vector<int64_t>{X.begin(), X.end()});
+  test.AddOutput<int64_t>(
+      "Y", {3, 4},
+      {0, 0, 1, 1,
+       0, 1, 0, 1,
+       1, 0, 0, 0});
+
+  test.Run();
+}
+
 TEST(NonZeroOpTest, Scalar) {
  {
    OpTester test{kOpName, kOpVersion};