From 3bf614fd470d6fa37104c0b24014e83540cb8e6e Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 25 Jul 2022 14:14:38 -0700 Subject: [PATCH] Eliminate memory allocations per recent profiling (#12225) * Alloc begin FeedsFetches refactoring Refactor Tensor class Fix buffer deletor Remove new/delete deleted Adjust alloc move Fix up xnnpack provider Clarifying the comment on Create() --- include/onnxruntime/core/common/span_utils.h | 3 + .../core/framework/buffer_deleter.h | 2 +- include/onnxruntime/core/framework/tensor.h | 11 +--- onnxruntime/contrib_ops/cpu/bert/attention.cc | 4 +- .../contrib_ops/cpu/bert/attention_cpu_base.h | 2 +- .../cpu/quantization/attention_quant.cc | 4 +- .../quantization/dynamic_quantize_matmul.cc | 2 +- .../cpu/quantization/nhwc_max_pool.cc | 2 +- .../cpu/quantization/qlinear_pool.cc | 2 +- .../transformers/generation_device_helper.cc | 22 ++++---- .../transformers/generation_device_helper.h | 8 +-- .../cpu/transformers/subgraph_base.cc | 21 +++---- .../cpu/transformers/subgraph_base.h | 6 +- .../transformers/generation_device_helper.cc | 28 +++++----- .../transformers/generation_device_helper.h | 4 +- .../core/framework/feeds_fetches_manager.cc | 25 ++++++--- .../core/framework/feeds_fetches_manager.h | 44 +++++++++++---- onnxruntime/core/framework/iexecutor.h | 10 ++-- .../core/framework/ort_value_name_idx_map.h | 2 +- .../core/framework/ort_value_tensor_slicer.cc | 6 +- .../framework/orttraining_partial_executor.cc | 4 +- .../framework/orttraining_partial_executor.h | 4 +- .../core/framework/parallel_executor.cc | 4 +- .../core/framework/parallel_executor.h | 4 +- .../framework/partial_graph_execution_state.h | 5 +- .../core/framework/sequential_executor.cc | 4 +- .../core/framework/sequential_executor.h | 4 +- onnxruntime/core/framework/utils.cc | 56 ++++++++++--------- onnxruntime/core/framework/utils.h | 12 ++-- .../transpose_optimizer/optimizer_api_impl.cc | 19 +++---- .../core/providers/cpu/generator/random.cc | 2 +- onnxruntime/core/providers/cpu/math/top_k.cc | 19 +++---- onnxruntime/core/providers/cpu/math/top_k.h | 4 +- onnxruntime/core/providers/cpu/nn/conv.cc | 6 +- .../core/providers/cpu/nn/conv_transpose.cc | 6 +- onnxruntime/core/providers/cpu/nn/lrn.cc | 2 +- .../cpu/quantization/conv_integer.cc | 2 +- .../cpu/quantization/matmul_integer_base.h | 2 +- .../quantization/quantize_linear_matmul.cc | 2 +- .../shared_library/provider_interfaces.h | 4 +- .../shared_library/provider_wrappedtypes.h | 7 ++- onnxruntime/core/providers/xnnpack/nn/conv.cc | 17 +++--- onnxruntime/core/providers/xnnpack/nn/conv.h | 2 +- onnxruntime/core/session/inference_session.cc | 28 +++++----- onnxruntime/core/session/inference_session.h | 18 +++--- .../core/session/provider_bridge_ort.cc | 20 ++++++- .../test/framework/execution_frame_test.cc | 11 ++-- onnxruntime/test/framework/function_test.cc | 4 +- .../training_ops/cpu/op_gradients.cc | 28 +++++----- .../training_ops/cpu/optimizer/adamw/adamw.cc | 8 +-- 50 files changed, 289 insertions(+), 227 deletions(-) diff --git a/include/onnxruntime/core/common/span_utils.h b/include/onnxruntime/core/common/span_utils.h index 998fc6e71d..8247cc1394 100644 --- a/include/onnxruntime/core/common/span_utils.h +++ b/include/onnxruntime/core/common/span_utils.h @@ -64,4 +64,7 @@ constexpr auto AsSpan(const T (&arr)[N]) { return details::AsSpanImpl(arr, N); } +template +inline gsl::span EmptySpan() { return gsl::span(); } + } \ No newline at end of file diff --git a/include/onnxruntime/core/framework/buffer_deleter.h b/include/onnxruntime/core/framework/buffer_deleter.h index d8322af77c..e289eaef35 100644 --- a/include/onnxruntime/core/framework/buffer_deleter.h +++ b/include/onnxruntime/core/framework/buffer_deleter.h @@ -12,7 +12,7 @@ class BufferDeleter { public: BufferDeleter() : alloc_(nullptr) {} BufferDeleter(AllocatorPtr alloc) - : alloc_(alloc) {} + : alloc_(std::move(alloc)) {} void operator()(void* p) const { if (alloc_) diff --git a/include/onnxruntime/core/framework/tensor.h b/include/onnxruntime/core/framework/tensor.h index 109b4d5ca2..4ca4777c89 100644 --- a/include/onnxruntime/core/framework/tensor.h +++ b/include/onnxruntime/core/framework/tensor.h @@ -36,15 +36,10 @@ namespace onnxruntime { */ class Tensor final { public: - static std::unique_ptr Create(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator) { - return std::make_unique(p_type, shape, std::move(allocator)); - } - static std::unique_ptr Create(MLDataType p_type, const TensorShape& shape, void* p_data, - const OrtMemoryInfo& alloc, ptrdiff_t offset = 0, - gsl::span strides = {}) { - return std::make_unique(p_type, shape, p_data, alloc, offset, strides); - } + // NB! Removing Create() methods returning unique_ptr. Still available in other EPs that are dynamically linked. + // Strive not to allocate Tensor with new/delete as it is a shallow class and using it by value is just fine. + // Use InitOrtValue() methods to allocate for OrtValue. Tensor() = default; // to allow creating vector to support seq(tensor) diff --git a/onnxruntime/contrib_ops/cpu/bert/attention.cc b/onnxruntime/contrib_ops/cpu/bert/attention.cc index 739360353e..ddf773014c 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention.cc +++ b/onnxruntime/contrib_ops/cpu/bert/attention.cc @@ -302,7 +302,7 @@ bool Attention::IsPackWeightsSuccessful(int qkv_index, // buffer memory and we don not want it uninitialized and generate different hashes // if and when we try to cache this pre-packed buffer for sharing between sessions. memset(packed_weights_data, 0, packed_weights_data_size); - packed_weights_[qkv_index] = BufferUniquePtr(packed_weights_data, BufferDeleter(alloc)); + packed_weights_[qkv_index] = BufferUniquePtr(packed_weights_data, BufferDeleter(std::move(alloc))); packed_weights_size_[qkv_index] = packb_size; for (size_t i = 0; i < loop_len; i++) { @@ -470,7 +470,7 @@ Status Attention::Compute(OpKernelContext* context) const { // D (input_hidden_size) is hidden dimension of input, where D could be larger than any of the hidden_sizes // (NH) when model is pruned. T = H1 + H2 + H3, where H1, H2, H3 are head sizes of Q, K, V respectively auto gemm_data = allocator->Alloc(SafeInt(batch_size) * sequence_length * (q_hidden_size + k_hidden_size + v_hidden_size) * element_size); - BufferUniquePtr gemm_buffer(gemm_data, BufferDeleter(allocator)); + BufferUniquePtr gemm_buffer(gemm_data, BufferDeleter(std::move(allocator))); auto Q = reinterpret_cast(gemm_data); auto K = Q + static_cast(batch_size) * sequence_length * q_hidden_size; diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index db2c93e377..9eaaf124d6 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -83,7 +83,7 @@ class AttentionCPUBase : public AttentionBase { // Compute the attentionScore * Value. It does: out_tmp(B, N, S, H) = attention_probs(B, N, S, S*) x V(B, N, S*, H) auto out_tmp_data = allocator->Alloc(SafeInt(batch_size) * num_heads_ * sequence_length * v_head_size * sizeof(T)); - BufferUniquePtr out_tmp_buffer(out_tmp_data, BufferDeleter(allocator)); + BufferUniquePtr out_tmp_buffer(out_tmp_data, BufferDeleter(std::move(allocator))); ComputeVxAttentionScore(output->template MutableData(), static_cast(out_tmp_data), static_cast(attention_probs), V, batch_size, sequence_length, past_sequence_length, v_head_size, v_hidden_size, diff --git a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc index 18bed37532..40878d7759 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc @@ -96,7 +96,7 @@ Status QAttention::PrePack(const Tensor& weights, int input_idx, AllocatorPtr // if and when we try to cache this pre-packed buffer for sharing between sessions. memset(packed_weights_data, 0, packed_weights_data_size); - packed_weights_ = BufferUniquePtr(packed_weights_data, BufferDeleter(alloc)); + packed_weights_ = BufferUniquePtr(packed_weights_data, BufferDeleter(std::move(alloc))); for (size_t i = 0; i < loop_len; i++) { MlasGemmPackB(head_size, input_hidden_size, weights_data, hidden_size_x3, false /*AIsSigned*/, weights_is_signed_, packed_weights_data); @@ -212,7 +212,7 @@ Status QAttention::Compute(OpKernelContext* context) const { // STEP.1: gemm_data(BS, 3NH) = Scale(input(BS, D) x weights(D, 3NH)) + bias(3NH) // D is hidden dimension of input, where input_hidden_size (D) could be larger than hidden_size (NH) when model is pruned. auto gemm_data = allocator->Alloc(SafeInt(batch_size) * sequence_length * 3 * hidden_size * element_size); - BufferUniquePtr gemm_buffer(gemm_data, BufferDeleter(allocator)); + BufferUniquePtr gemm_buffer(gemm_data, BufferDeleter(std::move(allocator))); auto Q = reinterpret_cast(gemm_data); auto K = Q + static_cast(batch_size) * sequence_length * hidden_size; diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index b0d1c3aec8..574f814f67 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -215,7 +215,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { AllocatorPtr allocator; ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&allocator)); uint8_t* a_data_quant = static_cast(allocator->Alloc(SafeInt(num_of_elements) * sizeof(uint8_t))); - BufferUniquePtr a_buffer_quant_holder(a_data_quant, BufferDeleter(allocator)); + BufferUniquePtr a_buffer_quant_holder(a_data_quant, BufferDeleter(std::move(allocator))); ParQuantizeLinear(a_data, a_data_quant, num_of_elements, a_scale, a_zero_point, ctx->GetOperatorThreadPool()); diff --git a/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc index f4b3898028..71212a53e6 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/nhwc_max_pool.cc @@ -76,7 +76,7 @@ Status NhwcMaxPool::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); int64_t col_buffer_batch_count = std::min(output_image_size, output_batch_count); auto* col_data = alloc->Alloc(SafeInt(sizeof(const T8Bits*)) * kernel_size * col_buffer_batch_count); - BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); + BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc))); std::vector padding_data(static_cast(C), std::numeric_limits::lowest()); const auto* Xdata = X->template Data(); diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc index d60e77f7a1..5be2f19dff 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_pool.cc @@ -583,7 +583,7 @@ Status QLinearAveragePool::ComputeImpl(OpKernelContext* context) const { BufferUniquePtr x_data_fp32_guard; if (kernel_shape.size() <= 3) { x_data_fp32 = (float*)allocator->Alloc(SafeInt(x_shape.Size()) * sizeof(float)); - x_data_fp32_guard = BufferUniquePtr(x_data_fp32, BufferDeleter(allocator)); + x_data_fp32_guard = BufferUniquePtr(x_data_fp32, BufferDeleter(std::move(allocator))); dequantize_array(x_shape.Size(), X_data, x_scale, x_zero_point, x_data_fp32, tp); } diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc index 179f0baf5a..e0e7e4cc39 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.cc @@ -22,8 +22,8 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, AllocatorPtr allocator, void* /*stream*/, onnxruntime::concurrency::ThreadPool* threadpool, - std::unique_ptr& output_values, - std::unique_ptr& output_indices) { + Tensor& output_values, + Tensor& output_indices) { if (input->IsDataType()) { return GetTopK(input, axis, k, largest, sorted, allocator, threadpool, output_values, output_indices); } @@ -343,20 +343,20 @@ Status ProcessLogits(const OrtValue& logits, // constexpr bool largest = true; constexpr bool sorted = true; // results returned in sorted order. - std::unique_ptr topk_scores; - std::unique_ptr topk_indices; + Tensor topk_scores; + Tensor topk_indices; ORT_RETURN_IF_ERROR(TopK(&input, axis, top_k, largest, sorted, allocator, stream, thread_pool, topk_scores, topk_indices)); #ifdef DEBUG_GENERATION - dumper->Print("topk_scores", *(topk_scores.get())); - dumper->Print("topk_indices", *(topk_indices.get())); + dumper->Print("topk_scores", topk_scores); + dumper->Print("topk_indices", topk_indices); #endif // Convert indices in range [0, num_beams * vocab_size) to token ID of range [0, vocab_size) like the following: // next_indices = (next_tokens / vocab_size).long() // next_tokens = next_tokens % vocab_size - gsl::span next_token_indices = topk_indices->DataAsSpan(); + gsl::span next_token_indices = topk_indices.DataAsSpan(); offset = 0; for (int i = 0; i < batch_size; i++) { for (unsigned int j = 0; j < top_k; j++, offset++) { @@ -365,7 +365,7 @@ Status ProcessLogits(const OrtValue& logits, // } } - gsl::span next_scores = topk_scores->DataAsSpan(); + gsl::span next_scores = topk_scores.DataAsSpan(); gsl::span next_tokens(beam_state->next_tokens.data(), beam_state->next_tokens.size()); gsl::span next_indices(beam_state->next_indices.data(), beam_state->next_indices.size()); @@ -453,8 +453,8 @@ Status GreedySearchProcessLogits( constexpr bool largest = true; constexpr bool sorted = false; - std::unique_ptr topk_scores; - std::unique_ptr topk_indices; + Tensor topk_scores; + Tensor topk_indices; ORT_RETURN_IF_ERROR( TopK(&input, axis, @@ -472,7 +472,7 @@ Status GreedySearchProcessLogits( dumper->Print("topk_indices", *(topk_indices.get())); #endif - gsl::span next_token_indices = topk_indices->DataAsSpan(); + gsl::span next_token_indices = topk_indices.DataAsSpan(); gsl::copy(next_token_indices, greedy_state->next_tokens_cpu); #ifdef DEBUG_GENERATION diff --git a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h index ce9195204c..aa445e7c62 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h +++ b/onnxruntime/contrib_ops/cpu/transformers/generation_device_helper.h @@ -37,8 +37,8 @@ using TopkFunc = std::function& output_values, - std::unique_ptr& output_indices)>; + Tensor& output_values, + Tensor& output_indices)>; // Create subgraph inputs: input_ids, position_ids and attention_mask (for GPT-2). using CreateGptInputsFunc = std::function& output_values, - std::unique_ptr& output_indices); + Tensor& output_values, + Tensor& output_indices); Status AddToFeeds( const IExecutionProvider* execution_provider, diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc index f2e5dc23dd..d56745c19f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.cc @@ -55,7 +55,7 @@ Status Subgraph::Setup(const SessionState& session_state, session_state_ = &session_state; subgraph_session_state_ = &subgraph_session_state; - std::vector feed_names; + InlinedVector feed_names; feed_names.reserve(static_cast(num_subgraph_inputs) + static_cast(num_implicit_inputs)); // Use the first output (logits) to find device location. @@ -69,25 +69,24 @@ Status Subgraph::Setup(const SessionState& session_state, feed_names.push_back(entry->Name()); } - std::vector feed_locations; - feed_locations.resize(feed_names.size()); + InlinedVector feed_locations; + feed_locations.reserve(feed_names.size()); for (size_t i = 0, end = feed_names.size(); i < end; ++i) { if (i >= subgraph_input_names.size()) { // Implicit inputs const auto& location = utils::FindMemoryInfoForValue(session_state, feed_names[i]); - feed_locations[i] = location.device; + feed_locations.push_back(location.device); } else { - feed_locations[i] = default_location.device; + feed_locations.push_back(default_location.device); } } - std::unique_ptr ffm; ORT_RETURN_IF_ERROR(FeedsFetchesManager::Create(feed_names, subgraph_output_names, - subgraph_session_state.GetOrtValueNameIdxMap(), ffm)); - ORT_RETURN_IF_ERROR(utils::InitializeFeedFetchCopyInfo(subgraph_session_state, *ffm)); + subgraph_session_state.GetOrtValueNameIdxMap(), feeds_fetches_manager_)); + ORT_RETURN_IF_ERROR(utils::InitializeFeedFetchCopyInfo(subgraph_session_state, *feeds_fetches_manager_)); // Setup the locations where we want the subgraph output to end up on - std::vector fetch_locations; + InlinedVector fetch_locations; fetch_locations.reserve(num_subgraph_outputs); // Past state need to be where we can feed them in to the next iteration, so set the location to match the feed. @@ -95,9 +94,7 @@ Status Subgraph::Setup(const SessionState& session_state, fetch_locations.push_back(&default_location); } - utils::FinalizeFeedFetchCopyInfo(*ffm, feed_locations, fetch_locations); - - feeds_fetches_manager_ = std::move(ffm); + utils::FinalizeFeedFetchCopyInfo(*feeds_fetches_manager_, feed_locations, fetch_locations); // Check subgraph only need once so put in Setup function. auto& inputs = subgraph.GetInputs(); diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h index 96bd8af644..93c133a85d 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_base.h @@ -48,7 +48,9 @@ class Subgraph { Status Setup(const SessionState& session_state, const SessionState& subgraph_session_state); - FeedsFetchesManager* GetFeedsFetchesManager() const { return feeds_fetches_manager_.get(); } + FeedsFetchesManager* GetFeedsFetchesManager() { + return (feeds_fetches_manager_.has_value()) ? &*feeds_fetches_manager_ : nullptr; + } const IExecutionProvider* GetProvider() const; @@ -65,7 +67,7 @@ class Subgraph { AllocatorPtr allocator_; const SessionState* session_state_; const SessionState* subgraph_session_state_; - std::unique_ptr feeds_fetches_manager_; + std::optional feeds_fetches_manager_; bool is_output_float16_; }; diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc index ac788b9754..0432b1a345 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.cc @@ -31,8 +31,8 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, AllocatorPtr allocator, void* stream, onnxruntime::concurrency::ThreadPool* /*threadpool*/, - std::unique_ptr& output_values, - std::unique_ptr& output_indices) { + Tensor& output_values, + Tensor& output_indices) { ORT_ENFORCE(nullptr != input); int32_t rank = static_cast(input->Shape().NumDimensions()); @@ -51,15 +51,15 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, int64_t dimension = input_shape[axis]; int64_t N = elem_nums_cuda[0] / dimension; - output_values = Tensor::Create(input->DataType(), output_shape, allocator); - output_indices = Tensor::Create(DataTypeImpl::GetType(), output_shape, allocator); + output_values = std::move(*Tensor::Create(input->DataType(), output_shape, allocator)); + output_indices = std::move(*Tensor::Create(DataTypeImpl::GetType(), output_shape, std::move(allocator))); if (input->IsDataType()) { return TopKImpl(nullptr, // We limit number of beams in BeamSearchParameters, so K <= 256 and use NULL here reinterpret_cast(stream), input->Data(), - static_cast(output_values->MutableDataRaw()), - static_cast(output_indices->MutableDataRaw()), + static_cast(output_values.MutableDataRaw()), + static_cast(output_indices.MutableDataRaw()), elem_nums_cuda, static_cast(elem_nums_cuda.Size()), static_cast(axis), @@ -72,8 +72,8 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, return TopKImpl(nullptr, reinterpret_cast(stream), input->Data(), - static_cast(output_values->MutableDataRaw()), - static_cast(output_indices->MutableDataRaw()), + static_cast(output_values.MutableDataRaw()), + static_cast(output_indices.MutableDataRaw()), elem_nums_cuda, static_cast(elem_nums_cuda.Size()), static_cast(axis), @@ -350,10 +350,10 @@ Status ProcessLogits(const OrtValue& logits, // constexpr bool largest = true; constexpr bool sorted = true; // results returned in sorted order. - std::unique_ptr topk_scores; - std::unique_ptr topk_indices; + std::unique_ptr topk_scores = Tensor::CreateDefault(); + std::unique_ptr topk_indices = Tensor::CreateDefault(); ORT_RETURN_IF_ERROR(TopK(&input, axis, top_k, largest, sorted, allocator, stream, thread_pool, - topk_scores, topk_indices)); + *topk_scores, *topk_indices)); #ifdef DEBUG_GENERATION dumper->Print("topk_scores", *(topk_scores.get())); @@ -514,10 +514,10 @@ Status GreedySearchProcessLogits( constexpr bool largest = true; constexpr bool sorted = false; - std::unique_ptr topk_scores; - std::unique_ptr topk_indices; + auto topk_scores = Tensor::CreateDefault(); + auto topk_indices = Tensor::CreateDefault(); ORT_RETURN_IF_ERROR(TopK(&input, axis, top_k, largest, sorted, allocator, stream, thread_pool, - topk_scores, topk_indices)); + *topk_scores, *topk_indices)); #ifdef DEBUG_GENERATION dumper->Print("topk_scores", *(topk_scores.get())); diff --git a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h index 1f51f38568..ebd2642554 100644 --- a/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h +++ b/onnxruntime/contrib_ops/cuda/transformers/generation_device_helper.h @@ -25,8 +25,8 @@ Status TopK(const Tensor* input, const int axis, const unsigned k, bool largest, AllocatorPtr allocator, void* stream, onnxruntime::concurrency::ThreadPool* threadpool, - std::unique_ptr& output_values, - std::unique_ptr& output_indices); + Tensor& output_values, + Tensor& output_indices); Status AddToFeeds(const IExecutionProvider* execution_provider, std::initializer_list inputs, diff --git a/onnxruntime/core/framework/feeds_fetches_manager.cc b/onnxruntime/core/framework/feeds_fetches_manager.cc index 6f16f24ccd..09ccbf8d0d 100644 --- a/onnxruntime/core/framework/feeds_fetches_manager.cc +++ b/onnxruntime/core/framework/feeds_fetches_manager.cc @@ -8,9 +8,9 @@ #include "core/framework/utils.h" namespace onnxruntime { -common::Status FeedsFetchesInfo::MapNamesToMLValueIdxs(const std::vector& names, +common::Status FeedsFetchesInfo::MapNamesToMLValueIdxs(gsl::span names, const OrtValueNameIdxMap& ort_value_name_idx_map, - std::vector& ort_value_idxs) { + InlinedVector& ort_value_idxs) { auto status = Status::OK(); ort_value_idxs.reserve(names.size()); @@ -40,8 +40,8 @@ Status FeedsFetchesInfo::SetMLValueIdxs(const OrtValueNameIdxMap& ort_value_name return status; } -Status FeedsFetchesManager::Create(const std::vector& feed_names, - const std::vector& output_names, +Status FeedsFetchesManager::Create(gsl::span feed_names, + gsl::span output_names, const OrtValueNameIdxMap& ort_value_name_idx_map, std::unique_ptr& feed_fetch_manager) { FeedsFetchesInfo info{feed_names, output_names, ort_value_name_idx_map}; @@ -51,11 +51,22 @@ Status FeedsFetchesManager::Create(const std::vector& feed_names, return Status::OK(); } +Status FeedsFetchesManager::Create(gsl::span feed_names, + gsl::span output_names, + const OrtValueNameIdxMap& ort_value_name_idx_map, + std::optional& feed_fetch_manager) { + FeedsFetchesInfo info{feed_names, output_names, ort_value_name_idx_map}; + + feed_fetch_manager.emplace(std::move(info)); + + return Status::OK(); +} + FeedsFetchesManager::FeedsFetchesManager(FeedsFetchesInfo&& info) - : feeds_fetches_info_{info} { + : feeds_fetches_info_(std::move(info)) { // init with default values - feeds_device_copy_info_.resize(info.feed_names.size()); - fetches_device_copy_info_.resize(info.output_names.size()); + feeds_device_copy_info_.resize(feeds_fetches_info_.feed_names.size()); + fetches_device_copy_info_.resize(feeds_fetches_info_.output_names.size()); } void FeedsFetchesManager::SetDeviceCopyChecks(DeviceCopyCheck input_copy_needed, DeviceCopyCheck output_copy_needed) { diff --git a/onnxruntime/core/framework/feeds_fetches_manager.h b/onnxruntime/core/framework/feeds_fetches_manager.h index a15a08508f..6f1cafbd4f 100644 --- a/onnxruntime/core/framework/feeds_fetches_manager.h +++ b/onnxruntime/core/framework/feeds_fetches_manager.h @@ -5,6 +5,8 @@ #include #include +#include +#include "core/common/inlined_containers_fwd.h" #ifndef SHARED_PROVIDER #include "core/framework/ort_value.h" @@ -30,25 +32,43 @@ struct DeviceCopyChecks { struct FeedsFetchesInfo { FeedsFetchesInfo() = default; - FeedsFetchesInfo(const std::vector& feed_names_in, - const std::vector& output_names_in, + FeedsFetchesInfo(gsl::span feed_names_in, + gsl::span output_names_in, const OrtValueNameIdxMap& ort_value_name_idx_map) - : feed_names{feed_names_in}, output_names{output_names_in} { + : feed_names(), + output_names() { + feed_names.reserve(feed_names_in.size()); + feed_names.assign(feed_names_in.begin(), feed_names_in.end()); + output_names.reserve(output_names_in.size()); + output_names.assign(output_names_in.begin(), output_names_in.end()); ORT_THROW_IF_ERROR(SetMLValueIdxs(ort_value_name_idx_map)); } - static Status MapNamesToMLValueIdxs(const std::vector& names, + FeedsFetchesInfo(gsl::span feed_names_in, + gsl::span output_names_in, + const OrtValueNameIdxMap& ort_value_name_idx_map) + : feed_names(), + output_names() { + feed_names.reserve(feed_names_in.size()); + feed_names.assign(feed_names_in.begin(), feed_names_in.end()); + output_names.reserve(output_names_in.size()); + output_names.assign(output_names_in.begin(), output_names_in.end()); + ORT_THROW_IF_ERROR(SetMLValueIdxs(ort_value_name_idx_map)); + } + + + static Status MapNamesToMLValueIdxs(gsl::span names, const OrtValueNameIdxMap& ort_value_name_idx_map, - std::vector& ort_value_idxs); + InlinedVector& ort_value_idxs); // set the ort_value_idxs for the current values in feed_names and output_names Status SetMLValueIdxs(const OrtValueNameIdxMap& ort_value_name_idx_map); - std::vector feed_names; - std::vector output_names; + InlinedVector feed_names; + InlinedVector output_names; - std::vector feeds_mlvalue_idxs; - std::vector fetches_mlvalue_idxs; + InlinedVector feeds_mlvalue_idxs; + InlinedVector fetches_mlvalue_idxs; }; struct MLValueCopyInfo { @@ -58,10 +78,14 @@ struct MLValueCopyInfo { class FeedsFetchesManager { public: - static Status Create(const std::vector& feed_names, const std::vector& output_names, + static Status Create(gsl::span feed_names, gsl::span output_names, const OrtValueNameIdxMap& ort_value_name_idx_map, std::unique_ptr& feeds_fetches_manager); + static Status Create(gsl::span feed_names, gsl::span output_names, + const OrtValueNameIdxMap& ort_value_name_idx_map, + std::optional& feeds_fetches_manager); + FeedsFetchesManager(FeedsFetchesInfo&& info); const FeedsFetchesInfo& GetFeedsFetchesInfo() const { return feeds_fetches_info_; } diff --git a/onnxruntime/core/framework/iexecutor.h b/onnxruntime/core/framework/iexecutor.h index f24094da3f..af55ee13ec 100644 --- a/onnxruntime/core/framework/iexecutor.h +++ b/onnxruntime/core/framework/iexecutor.h @@ -28,9 +28,9 @@ class IExecutor { * The lifetime of 'fetches' is limited by 'session_state' */ common::Status Execute(const SessionState& session_state, - const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, - const std::vector& fetch_mlvalue_idxs, + gsl::span feed_mlvalue_idxs, + gsl::span feeds, + gsl::span fetch_mlvalue_idxs, std::vector& fetches, const logging::Logger& logger) { std::unordered_map fetch_allocators; @@ -38,8 +38,8 @@ class IExecutor { } // TODO: as fetch_allocators is optional, it should be a pointer instead of reference - virtual common::Status Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, + virtual common::Status Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, // optional custom allocators. key is index in fetches const std::unordered_map& fetch_allocators, diff --git a/onnxruntime/core/framework/ort_value_name_idx_map.h b/onnxruntime/core/framework/ort_value_name_idx_map.h index 45a7fabf91..6035dc4e85 100644 --- a/onnxruntime/core/framework/ort_value_name_idx_map.h +++ b/onnxruntime/core/framework/ort_value_name_idx_map.h @@ -30,7 +30,7 @@ class OrtValueNameIdxMap { return p.first->second; } - common::Status GetIdx(const std::string& name, int& idx) const { + common::Status GetIdx(std::string_view name, int& idx) const { idx = -1; auto it = map_.find(name); diff --git a/onnxruntime/core/framework/ort_value_tensor_slicer.cc b/onnxruntime/core/framework/ort_value_tensor_slicer.cc index b916aa90ba..cf4020d830 100644 --- a/onnxruntime/core/framework/ort_value_tensor_slicer.cc +++ b/onnxruntime/core/framework/ort_value_tensor_slicer.cc @@ -80,9 +80,9 @@ void OrtValueTensorSlicer::Iterator::MaterializeMLValue() const { // // TODO: Ideally we could avoid the overhead of creating a new Tensor (mainly cost of copying type and shape info) // and would simply update Tensor::p_data_ given all other info remains constant for each slice. - auto sub_tensor = Tensor::Create(tensor_data_type_, per_iteration_shape_, const_cast(tensor_slice_data_raw), *tensor_location_); - auto ml_tensor = DataTypeImpl::GetType(); - current_ = OrtValue{sub_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc()}; + OrtValue val; + Tensor::InitOrtValue(tensor_data_type_, per_iteration_shape_, const_cast(tensor_slice_data_raw), *tensor_location_, val); + current_ = std::move(val); } template class OrtValueTensorSlicer; diff --git a/onnxruntime/core/framework/orttraining_partial_executor.cc b/onnxruntime/core/framework/orttraining_partial_executor.cc index 9650c99532..5756ddfca0 100644 --- a/onnxruntime/core/framework/orttraining_partial_executor.cc +++ b/onnxruntime/core/framework/orttraining_partial_executor.cc @@ -130,8 +130,8 @@ static Status ReleaseNodeMLValues(ExecutionFrame& frame, return Status::OK(); } -Status PartialExecutor::Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, +Status PartialExecutor::Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) { diff --git a/onnxruntime/core/framework/orttraining_partial_executor.h b/onnxruntime/core/framework/orttraining_partial_executor.h index b34c4e67f2..bd8bd497d2 100644 --- a/onnxruntime/core/framework/orttraining_partial_executor.h +++ b/onnxruntime/core/framework/orttraining_partial_executor.h @@ -28,8 +28,8 @@ class PartialExecutor : public IExecutor { ORT_UNUSED_PARAMETER(partial_graph_index_); } - common::Status Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, + common::Status Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) override; diff --git a/onnxruntime/core/framework/parallel_executor.cc b/onnxruntime/core/framework/parallel_executor.cc index 1d7cf192c8..e1ff88cde9 100644 --- a/onnxruntime/core/framework/parallel_executor.cc +++ b/onnxruntime/core/framework/parallel_executor.cc @@ -27,8 +27,8 @@ ParallelExecutor::ParallelExecutor(const SessionState& session_state, const bool } } -Status ParallelExecutor::Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, +Status ParallelExecutor::Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) { diff --git a/onnxruntime/core/framework/parallel_executor.h b/onnxruntime/core/framework/parallel_executor.h index ef2eebbf51..bdcc66dac2 100644 --- a/onnxruntime/core/framework/parallel_executor.h +++ b/onnxruntime/core/framework/parallel_executor.h @@ -22,8 +22,8 @@ class ParallelExecutor : public IExecutor { public: ParallelExecutor(const SessionState& session_state, const bool& terminate_flag = false); - common::Status Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, + common::Status Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) override; diff --git a/onnxruntime/core/framework/partial_graph_execution_state.h b/onnxruntime/core/framework/partial_graph_execution_state.h index 7ce701cf16..e0634b71e7 100644 --- a/onnxruntime/core/framework/partial_graph_execution_state.h +++ b/onnxruntime/core/framework/partial_graph_execution_state.h @@ -26,8 +26,9 @@ struct PartialGraphExecutionState { size_t GetProgramCounterStart() { return program_counter_start_; } size_t GetProgramCounterEnd() { return program_counter_end_; } - ExecutionFrame& GetExecutionFrame(const std::vector& feed_mlvalue_idxs, const std::vector& feeds, - const std::vector& fetch_mlvalue_idxs, const std::vector& fetches, + ExecutionFrame& GetExecutionFrame(gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, + gsl::span fetches, const std::unordered_map& fetch_allocators, const SessionState& session_state) { if (execution_frame_ == nullptr) { diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc index ed3e3fbaf1..2afe0596c9 100644 --- a/onnxruntime/core/framework/sequential_executor.cc +++ b/onnxruntime/core/framework/sequential_executor.cc @@ -141,8 +141,8 @@ static Status ReleaseNodeMLValues(ExecutionFrame& frame, const SequentialExecutionPlan::NodeExecutionPlan& node_exec_plan, const logging::Logger& logger); -Status SequentialExecutor::Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, +Status SequentialExecutor::Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) { diff --git a/onnxruntime/core/framework/sequential_executor.h b/onnxruntime/core/framework/sequential_executor.h index 463749d747..172de3d41c 100644 --- a/onnxruntime/core/framework/sequential_executor.h +++ b/onnxruntime/core/framework/sequential_executor.h @@ -21,8 +21,8 @@ class SequentialExecutor : public IExecutor { SequentialExecutor(const bool& terminate_flag = false, const bool only_execute_path_to_fetches = false) : terminate_flag_{terminate_flag}, only_execute_path_to_fetches_(only_execute_path_to_fetches) {} - common::Status Execute(const SessionState& session_state, const std::vector& feed_mlvalue_idxs, - const std::vector& feeds, const std::vector& fetch_mlvalue_idxs, + common::Status Execute(const SessionState& session_state, gsl::span feed_mlvalue_idxs, + gsl::span feeds, gsl::span fetch_mlvalue_idxs, std::vector& fetches, const std::unordered_map& fetch_allocators, const logging::Logger& logger) override; diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index ba4e72f54e..3cfd52f63b 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -251,7 +251,7 @@ static bool HaveCpuExecutionProvidersOnly(const ExecutionProviders& execution_pr static const OrtMemoryInfo& FindMemoryInfoForValue(const OrtValueNameIdxMap& map, const SequentialExecutionPlan& plan, - const std::string& name) { + std::string_view name) { int idx = -1; auto status = map.GetIdx(name, idx); ORT_THROW_IF_ERROR(status); @@ -261,7 +261,7 @@ static const OrtMemoryInfo& FindMemoryInfoForValue(const OrtValueNameIdxMap& map } const OrtMemoryInfo& FindMemoryInfoForValue(const SessionState& session_state, - const std::string& name) { + std::string_view name) { const auto* exec_plan_ptr = session_state.GetExecutionPlan(); ORT_ENFORCE(exec_plan_ptr); @@ -304,7 +304,7 @@ static common::Status CalculateStaticCopyInfoForFeed(const SessionState& session } static common::Status CalculateStaticCopyInfoForFeeds(const SessionState& session_state, - const std::vector& feed_names, + gsl::span feed_names, std::vector& copy_info) { for (size_t idx = 0, end = feed_names.size(); idx < end; ++idx) { ORT_RETURN_IF_ERROR(CalculateStaticCopyInfoForFeed(session_state, feed_names[idx], copy_info[idx])); @@ -316,7 +316,7 @@ static common::Status CalculateStaticCopyInfoForFeeds(const SessionState& sessio // get the source device info for the node producing each output that we will return in the fetches. // target device info is not known until runtime. static common::Status CalculateStaticCopyInfoForFetches(const SessionState& session_state, - const std::vector& fetch_names, + gsl::span fetch_names, std::vector& copy_info) { for (size_t idx = 0, end = fetch_names.size(); idx < end; ++idx) { const std::string& output_name = fetch_names[idx]; @@ -362,7 +362,7 @@ common::Status InitializeFeedFetchCopyInfo(const SessionState& session_state, } // update the allocation_provider in the copy info based on the actual feeds -static bool FinalizeCopyInfoForFeeds(const std::vector& feed_locations, +static bool FinalizeCopyInfoForFeeds(gsl::span feed_locations, std::vector& copy_info) { ORT_ENFORCE(feed_locations.size() == copy_info.size()); bool copy_needed = false; @@ -378,7 +378,7 @@ static bool FinalizeCopyInfoForFeeds(const std::vector& feed_location return copy_needed; } -static bool FinalizeCopyInfoForFetches(const std::vector& fetch_alloc_info, +static bool FinalizeCopyInfoForFetches(gsl::span& fetch_alloc_info, std::vector& copy_info) { ORT_ENFORCE(fetch_alloc_info.size() == copy_info.size()); bool copy_needed = false; @@ -402,8 +402,8 @@ static bool FinalizeCopyInfoForFetches(const std::vector& // Finalize the copy info using the OrtDevice and OrtMemoryInfo for the feeds and fetches // This can be used by control flow nodes prior to the execution of the overall graph. void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feed_locations, - const std::vector& fetch_alloc_info) { + gsl::span feed_locations, + gsl::span fetch_alloc_info) { if (feeds_fetches_manager.GetDeviceCopyChecks().status == DeviceCopyCheck::NoCopy) return; @@ -418,7 +418,7 @@ void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager, // Finalize the copy info using the OrtValue instances for the feeds and fetches static void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, + gsl::span feeds, std::vector& fetches) { if (feeds_fetches_manager.GetDeviceCopyChecks().status == DeviceCopyCheck::NoCopy) return; @@ -465,9 +465,9 @@ static void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager } static common::Status CopyInputsAcrossDevices(const SessionState& session_state, - const std::vector& orig_feeds, + gsl::span orig_feeds, std::vector& new_feeds, - const std::vector& copy_info) { + gsl::span copy_info) { size_t num_feeds = orig_feeds.size(); ORT_ENFORCE(copy_info.size() == num_feeds); @@ -560,20 +560,26 @@ static common::Status CopyOutputsAcrossDevices(const SessionState& session_state static common::Status ExecuteGraphImpl(const SessionState& session_state, const FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, const std::unordered_map& fetch_allocators, ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger, const bool only_execute_path_to_fetches = false) { - std::unique_ptr p_exec; + // avoid memory allocations + std::optional seq_executor; + std::optional par_executor; + IExecutor* p_exec = nullptr; if (execution_mode == ExecutionMode::ORT_SEQUENTIAL) { - p_exec = std::make_unique(terminate_flag, only_execute_path_to_fetches); + seq_executor.emplace(terminate_flag, only_execute_path_to_fetches); + p_exec = &seq_executor.value(); } else if (execution_mode == ExecutionMode::ORT_PARALLEL) { auto* p_inter_op_thread_pool = session_state.GetInterOpThreadPool(); if (!p_inter_op_thread_pool) { LOGS(logger, WARNING) << "Only one thread was configured for parallel execution. Hence will use sequential execution."; - p_exec = std::make_unique(terminate_flag, only_execute_path_to_fetches); + seq_executor.emplace(terminate_flag, only_execute_path_to_fetches); + p_exec = &seq_executor.value(); } else { - p_exec = std::make_unique(session_state, terminate_flag); + par_executor.emplace(session_state, terminate_flag); + p_exec = &par_executor.value(); } } @@ -588,7 +594,7 @@ static common::Status ExecuteGraphImpl(const SessionState& session_state, feeds_fetches_info.fetches_mlvalue_idxs, fetches, fetch_allocators, logger)); } else { - const std::vector* p_feeds = &feeds; + auto feeds_to_use = feeds; std::vector* p_fetches = &fetches; std::vector device_feeds; std::vector device_fetches; @@ -596,7 +602,7 @@ static common::Status ExecuteGraphImpl(const SessionState& session_state, if (device_copy_checks.input_copy_needed == DeviceCopyCheck::Copy) { const auto& feed_copy_info = feeds_fetches_manager.GetFeedsDeviceCopyInfo(); ORT_RETURN_IF_ERROR(CopyInputsAcrossDevices(session_state, feeds, device_feeds, feed_copy_info)); - p_feeds = &device_feeds; + feeds_to_use = device_feeds; } auto num_outputs = fetches.size(); @@ -619,7 +625,7 @@ static common::Status ExecuteGraphImpl(const SessionState& session_state, } ORT_RETURN_IF_ERROR(p_exec->Execute(session_state, - feeds_fetches_info.feeds_mlvalue_idxs, *p_feeds, + feeds_fetches_info.feeds_mlvalue_idxs, feeds_to_use, feeds_fetches_info.fetches_mlvalue_idxs, *p_fetches, fetch_allocators, logger)); @@ -633,7 +639,7 @@ static common::Status ExecuteGraphImpl(const SessionState& session_state, common::Status ExecuteGraph(const SessionState& session_state, FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger, bool only_execute_path_to_fetches) { ORT_RETURN_IF_ERROR(utils::InitializeFeedFetchCopyInfo(session_state, feeds_fetches_manager)); @@ -649,7 +655,7 @@ common::Status ExecuteGraph(const SessionState& session_state, #ifdef ENABLE_TRAINING common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, const logging::Logger& logger, PartialGraphExecutionState& state, const OrtValueCachePtr& cache, int32_t partial_graph_index) { @@ -667,7 +673,7 @@ common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetch feeds_fetches_info.fetches_mlvalue_idxs, fetches, {}, logger)); } else { - const std::vector* p_feeds = &feeds; + auto p_feeds = feeds; std::vector* p_fetches = &fetches; std::vector device_feeds; std::vector device_fetches; @@ -675,7 +681,7 @@ common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetch if (device_copy_checks.input_copy_needed == DeviceCopyCheck::Copy) { const auto& feed_copy_info = feeds_fetches_manager.GetFeedsDeviceCopyInfo(); ORT_RETURN_IF_ERROR(CopyInputsAcrossDevices(session_state, feeds, device_feeds, feed_copy_info)); - p_feeds = &device_feeds; + p_feeds = device_feeds; } auto num_outputs = fetches.size(); @@ -698,7 +704,7 @@ common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetch } ORT_RETURN_IF_ERROR(executor.Execute(session_state, - feeds_fetches_info.feeds_mlvalue_idxs, *p_feeds, + feeds_fetches_info.feeds_mlvalue_idxs, p_feeds, feeds_fetches_info.fetches_mlvalue_idxs, *p_fetches, {}, logger)); @@ -712,7 +718,7 @@ common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetch #endif common::Status ExecuteSubgraph(const SessionState& session_state, const FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, const std::unordered_map& fetch_allocators, ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger) { auto status = ExecuteGraphImpl(session_state, feeds_fetches_manager, feeds, fetches, fetch_allocators, diff --git a/onnxruntime/core/framework/utils.h b/onnxruntime/core/framework/utils.h index f320c75f34..84c96757bb 100644 --- a/onnxruntime/core/framework/utils.h +++ b/onnxruntime/core/framework/utils.h @@ -71,7 +71,7 @@ common::Status CopyOneInputAcrossDevices(const SessionState& session_state, cons // Searches the allocation plan from the session_state to find the OrtMemoryInfo for the value 'name'. const OrtMemoryInfo& FindMemoryInfoForValue(const SessionState& session_state, - const std::string& name); + std::string_view name); // Initialize the feed and fetch copy info using session_state. // Determines the device that each graph input that will be fed will be consumed on, @@ -82,18 +82,18 @@ common::Status InitializeFeedFetchCopyInfo(const SessionState& session_state, // Finalize the feed and fetch copy info using session_state and the device and location information from the feeds // and fetches that will be used in graph execution. void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feed_locations, - const std::vector& fetch_alloc_info); + gsl::span feed_locations, + gsl::span fetch_alloc_info); // Execute the main graph. The feed_fetches_manager will be finalized based on the provided feeds and fetches. common::Status ExecuteGraph(const SessionState& session_state, FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger, bool only_execute_path_to_fetches = false); #ifdef ENABLE_TRAINING common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, const logging::Logger& logger, PartialGraphExecutionState& state, const OrtValueCachePtr& cache, int32_t partial_graph_index); @@ -102,7 +102,7 @@ common::Status ExecutePartialGraph(const SessionState& session_state, FeedsFetch // Execute a subgraph. The feeds_fetches_manager should have been finalized prior to calling this function. // See IControlFlowNode::SetupSubgraphExecutionInfo usage in the control flow kernels. common::Status ExecuteSubgraph(const SessionState& session_state, const FeedsFetchesManager& feeds_fetches_manager, - const std::vector& feeds, std::vector& fetches, + gsl::span feeds, std::vector& fetches, const std::unordered_map& fetch_allocators, ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger); diff --git a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc index db88896267..4c882ff1c3 100644 --- a/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimizer/optimizer_api_impl.cc @@ -262,11 +262,11 @@ std::vector ApiTensor::Data() const { const DataTypeImpl* tensor_dtype = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto_.data_type())->GetElementType(); auto tensor_shape_dims = utils::GetTensorShapeFromTensorProto(tensor_proto_); TensorShape tensor_shape{std::move(tensor_shape_dims)}; - auto tensor = onnxruntime::Tensor::Create(tensor_dtype, tensor_shape, cpu_allocator_); + onnxruntime::Tensor tensor(tensor_dtype, tensor_shape, cpu_allocator_); ORT_THROW_IF_ERROR(utils::TensorProtoToTensor(Env::Default(), model_path_.ToPathString().c_str(), - tensor_proto_, *tensor)); - size_t num_bytes = gsl::narrow_cast(tensor->SizeInBytes()); - const uint8_t* data = static_cast(tensor->DataRaw()); + tensor_proto_, tensor)); + size_t num_bytes = gsl::narrow_cast(tensor.SizeInBytes()); + const uint8_t* data = static_cast(tensor.DataRaw()); return std::vector(data, data + num_bytes); } // @@ -515,7 +515,7 @@ void ApiGraph::TransposeInitializer(std::string_view name, const std::vectordata_type())->GetElementType(); auto tensor_shape_dims = utils::GetTensorShapeFromTensorProto(*tensor_proto); TensorShape tensor_shape{tensor_shape_dims}; - std::unique_ptr in_tensor = Tensor::Create(tensor_dtype, tensor_shape, cpu_allocator_); + Tensor in_tensor(tensor_dtype, tensor_shape, cpu_allocator_); std::vector new_tensor_shape_dims; std::vector permutations; @@ -528,13 +528,12 @@ void ApiGraph::TransposeInitializer(std::string_view name, const std::vector out_tensor = Tensor::Create(tensor_dtype, new_tensor_shape, cpu_allocator_); + Tensor out_tensor(tensor_dtype, new_tensor_shape, cpu_allocator_); ORT_THROW_IF_ERROR(utils::TensorProtoToTensor(Env::Default(), graph_.ModelPath().ToPathString().c_str(), - *tensor_proto, *in_tensor)); + *tensor_proto, in_tensor)); - ORT_THROW_IF_ERROR(Transpose::DoTranspose(permutations, *in_tensor, *out_tensor)); + ORT_THROW_IF_ERROR(Transpose::DoTranspose(permutations, in_tensor, out_tensor)); auto& node_arg = *graph_.GetNodeArg(name_str); TensorShapeProto new_shape; @@ -544,7 +543,7 @@ void ApiGraph::TransposeInitializer(std::string_view name, const std::vectorGetTempSpaceAllocator(&alloc)); auto cdf_data = static_cast(alloc->Alloc(SafeInt(sizeof(double)) * num_classes)); - BufferUniquePtr cdf_buffer(cdf_data, BufferDeleter(alloc)); + BufferUniquePtr cdf_buffer(cdf_data, BufferDeleter(std::move(alloc))); Eigen::array cdf_dims = {{num_classes}}; auto cdf = EigenVector(cdf_data, cdf_dims); // END create temporary tensor diff --git a/onnxruntime/core/providers/cpu/math/top_k.cc b/onnxruntime/core/providers/cpu/math/top_k.cc index f9b2c7d736..7010085978 100644 --- a/onnxruntime/core/providers/cpu/math/top_k.cc +++ b/onnxruntime/core/providers/cpu/math/top_k.cc @@ -26,7 +26,6 @@ #include #include -using namespace std; namespace onnxruntime { template @@ -123,7 +122,7 @@ static void HeapifyIthPosition(int64_t* heap, size_t i, size_t k, const HeapCmp& template static void SelectTopK(const Comparator& comparer, int64_t row_offset, int64_t num_blocks, int64_t block_slice, int64_t inter_block_offset, - const unsigned k, bool sort_top_k, vector& data_holder) { + const unsigned k, bool sort_top_k, std::vector& data_holder) { for (int64_t l = 0; l < num_blocks; ++l) { data_holder[l] = (row_offset + (l * block_slice + inter_block_offset)); } @@ -375,8 +374,8 @@ template Status GetTopK(const Tensor* input, const int axis, const unsigned k, bool largest, bool sorted, AllocatorPtr allocator, onnxruntime::concurrency::ThreadPool* threadpool, - std::unique_ptr& output_values, - std::unique_ptr& output_indices) { + Tensor& output_values, + Tensor& output_indices) { const TensorShape& input_shape = input->Shape(); // Will return axis_ as is if positive or fixes it in case it is negative @@ -394,8 +393,8 @@ Status GetTopK(const Tensor* input, const int axis, const unsigned k, bool large TensorShape output_shape = input_shape; output_shape[axis_parsed] = k; - output_values = Tensor::Create(input->DataType(), output_shape, allocator); - output_indices = Tensor::Create(DataTypeImpl::GetType(), output_shape, allocator); + output_values = Tensor(input->DataType(), output_shape, allocator); + output_indices = Tensor(DataTypeImpl::GetType(), output_shape, allocator); // no-op - no output buffers to fill - return silently if (k == 0) { @@ -403,10 +402,10 @@ Status GetTopK(const Tensor* input, const int axis, const unsigned k, bool large } if (largest) { - FindTopKElements>(input, input_shape, output_values.get(), output_indices.get(), output_shape, k, sorted, + FindTopKElements>(input, input_shape, &output_values, &output_indices, output_shape, k, sorted, gsl::narrow_cast(axis_parsed), threadpool); } else { - FindTopKElements>(input, input_shape, output_values.get(), output_indices.get(), output_shape, k, sorted, + FindTopKElements>(input, input_shape, &output_values, &output_indices, output_shape, k, sorted, gsl::narrow_cast(axis_parsed), threadpool); } @@ -417,8 +416,8 @@ Status GetTopK(const Tensor* input, const int axis, const unsigned k, bool large template Status GetTopK(const Tensor* input, const int axis, const unsigned k, bool largest, bool sorted, AllocatorPtr allocator, onnxruntime::concurrency::ThreadPool* threadpool, - std::unique_ptr& output_values, - std::unique_ptr& output_indices); + Tensor& output_values, + Tensor& output_indices); // Opset ver - 1 to 9 diff --git a/onnxruntime/core/providers/cpu/math/top_k.h b/onnxruntime/core/providers/cpu/math/top_k.h index 596f10a8e6..ed19acb097 100644 --- a/onnxruntime/core/providers/cpu/math/top_k.h +++ b/onnxruntime/core/providers/cpu/math/top_k.h @@ -24,6 +24,6 @@ template Status GetTopK(const Tensor* input, const int axis, const unsigned k, bool largest, bool sorted, AllocatorPtr allocator, onnxruntime::concurrency::ThreadPool* threadpool, - std::unique_ptr& output_values, - std::unique_ptr& output_indices); + Tensor& output_values, + Tensor& output_indices); } // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc index b08111bf45..7b1435af54 100644 --- a/onnxruntime/core/providers/cpu/nn/conv.cc +++ b/onnxruntime/core/providers/cpu/nn/conv.cc @@ -80,7 +80,7 @@ Status Conv::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); auto* col_data = alloc->Alloc(SafeInt(sizeof(T)) * col_buffer_size); - col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc)); + col_buffer = BufferUniquePtr(col_data, BufferDeleter(std::move(alloc))); } T* col_buffer_data = static_cast(col_buffer.get()); @@ -234,7 +234,7 @@ Status Conv::Compute(OpKernelContext* context) const { auto* working_data = WorkingBufferSize > 0 ? alloc->Alloc(SafeInt(sizeof(float)) * WorkingBufferSize) : nullptr; - BufferUniquePtr working_buffer(working_data, BufferDeleter(alloc)); + BufferUniquePtr working_buffer(working_data, BufferDeleter(std::move(alloc))); MlasConv(&Parameters, Xdata, @@ -254,7 +254,7 @@ Status Conv::Compute(OpKernelContext* context) const { const int64_t col_buffer_size = kernel_dim * output_image_size; auto* col_data = alloc->Alloc(SafeInt(sizeof(float)) * col_buffer_size); - BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); + BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc))); auto* col_buffer_data = static_cast(col_buffer.get()); for (int image_id = 0; image_id < N; ++image_id) { diff --git a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc index 1e2f4f60d3..9011985191 100644 --- a/onnxruntime/core/providers/cpu/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/cpu/nn/conv_transpose.cc @@ -73,7 +73,7 @@ Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, Alloca // if and when we try to cache this pre-packed buffer for sharing between sessions. memset(packed_filter_data, 0, packed_filter_data_size); - transposed_filter_ = BufferUniquePtr(packed_filter_data, BufferDeleter(alloc)); + transposed_filter_ = BufferUniquePtr(packed_filter_data, BufferDeleter(std::move(alloc))); for (int64_t group_id = 0; group_id < conv_transpose_attrs_.group; ++group_id) { MlasTranspose(tensor.Data() + (group_id * N * K), @@ -146,7 +146,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dynamic_ const int64_t col_buffer_size = kernel_dim * p.input_shape.Size(); auto col_data = alloc->Alloc(SafeInt(sizeof(T)) * col_buffer_size); - BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); + BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc))); T* col_buffer_data = static_cast(col_buffer.get()); const T* Xdata = p.X->template Data(); @@ -246,7 +246,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dyna const int64_t col_buffer_size = kernel_dim * p.input_shape.Size(); auto col_data = alloc->Alloc(SafeInt(sizeof(float)) * col_buffer_size); - BufferUniquePtr col_buffer(col_data, BufferDeleter(alloc)); + BufferUniquePtr col_buffer(col_data, BufferDeleter(std::move(alloc))); float* col_buffer_data = static_cast(col_buffer.get()); const float* Xdata = p.X->template Data(); diff --git a/onnxruntime/core/providers/cpu/nn/lrn.cc b/onnxruntime/core/providers/cpu/nn/lrn.cc index 5111a1479e..e1fac3e9f5 100644 --- a/onnxruntime/core/providers/cpu/nn/lrn.cc +++ b/onnxruntime/core/providers/cpu/nn/lrn.cc @@ -80,7 +80,7 @@ Status LRN::Compute(OpKernelContext* context) const { const size_t padded_square_size = (static_cast(C) + size_ - 1) * H * W; auto psdata = alloc->Alloc(SafeInt(sizeof(float)) * padded_square_size); - BufferUniquePtr padded_square_buffer(psdata, BufferDeleter(alloc)); + BufferUniquePtr padded_square_buffer(psdata, BufferDeleter(std::move(alloc))); auto* padded_square_data = static_cast(padded_square_buffer.get()); math::Set(padded_square_size, 0.0f, padded_square_data, &CPUMathUtil::Instance()); diff --git a/onnxruntime/core/providers/cpu/quantization/conv_integer.cc b/onnxruntime/core/providers/cpu/quantization/conv_integer.cc index 131e9d2ba0..ba2d2b604c 100644 --- a/onnxruntime/core/providers/cpu/quantization/conv_integer.cc +++ b/onnxruntime/core/providers/cpu/quantization/conv_integer.cc @@ -102,7 +102,7 @@ Status ConvInteger::Compute(OpKernelContext* context) const { ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc)); auto* col_data = alloc->Alloc(SafeInt(sizeof(uint8_t)) * col_buffer_size); - col_buffer = BufferUniquePtr(col_data, BufferDeleter(alloc)); + col_buffer = BufferUniquePtr(col_data, BufferDeleter(std::move(alloc))); } auto* col_buffer_data = static_cast(col_buffer.get()); diff --git a/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h b/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h index b824aee812..5a30f18b35 100644 --- a/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h +++ b/onnxruntime/core/providers/cpu/quantization/matmul_integer_base.h @@ -53,7 +53,7 @@ class MatMulIntegerBase : public OpKernel { // if and when we try to cache this pre-packed buffer for sharing between sessions. memset(packed_b_data, 0, packed_b_size); - packed_b_ = BufferUniquePtr(packed_b_data, BufferDeleter(alloc)); + packed_b_ = BufferUniquePtr(packed_b_data, BufferDeleter(std::move(alloc))); MlasGemmPackB(N, K, b_data, N, a_is_signed, b_is_signed_, packed_b_data); bool share_prepacked_weights = (prepacked_weights != nullptr); diff --git a/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc b/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc index f08bade90a..83ec8cc105 100644 --- a/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc +++ b/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc @@ -107,7 +107,7 @@ Status QLinearMatMul::Compute(OpKernelContext* ctx) const { ORT_RETURN_IF_ERROR(ctx->GetTempSpaceAllocator(&alloc)); auto gemm_output_data = alloc->Alloc(SafeInt(gemm_shape.M) * gemm_shape.N * sizeof(int32_t) * num_gemms); - BufferUniquePtr gemm_output_buffer(gemm_output_data, BufferDeleter(alloc)); + BufferUniquePtr gemm_output_buffer(gemm_output_data, BufferDeleter(std::move(alloc))); auto* gemm_output = static_cast(gemm_output_buffer.get()); std::vector gemm_params(num_gemms); diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 1d1e4198be..06403b025f 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -734,7 +734,9 @@ struct ProviderHost { // Tensor virtual std::unique_ptr Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator) = 0; virtual std::unique_ptr Tensor__construct(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset) = 0; - virtual void Tensor__operator_delete(Tensor* p) = 0; + virtual std::unique_ptr Tensor__construct_default() = 0; + virtual void Tensor__move_assign(Tensor& lhs, Tensor&& rhs) noexcept = 0; + virtual void Tensor__operator_delete(Tensor* p) noexcept = 0; virtual void Tensor__InitOrtValue(MLDataType elt_type, const TensorShape& shape, std::shared_ptr allocator, OrtValue& ort_value) = 0; virtual void Tensor__InitOrtValue(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& location, OrtValue& ort_value) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index e4792cbea9..6e43ff20be 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -882,10 +882,11 @@ class SessionState { }; struct Tensor final { + static std::unique_ptr CreateDefault() { return g_host->Tensor__construct_default(); } static std::unique_ptr Create(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator) { return g_host->Tensor__construct(p_type, shape, std::move(allocator)); } static std::unique_ptr Create(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset = 0) { return g_host->Tensor__construct(p_type, shape, p_data, alloc, offset); } - static void operator delete(void* p) { g_host->Tensor__operator_delete(reinterpret_cast(p)); } + static void operator delete(void* p) noexcept { g_host->Tensor__operator_delete(reinterpret_cast(p)); } static void InitOrtValue(MLDataType elt_type, const TensorShape& shape, std::shared_ptr allocator, OrtValue& ort_value) { g_host->Tensor__InitOrtValue(elt_type, shape, std::move(allocator), ort_value); @@ -935,6 +936,10 @@ struct Tensor final { Tensor() = delete; Tensor(const Tensor&) = delete; void operator=(const Tensor&) = delete; + Tensor& operator=(Tensor&& o) noexcept { + g_host->Tensor__move_assign(*this, std::move(o)); + return *this; + } }; template <> diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index d9f6134e91..abafe20290 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "conv.h" +#include "core/common/inlined_containers_fwd.h" #include "core/graph/constants.h" #include "core/graph/graph.h" #include "core/graph/graph_utils.h" @@ -230,22 +231,22 @@ Status Conv::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, // Transpose from {M, C/group, kH, kW} to {M, kH, kW, C/group} auto orig_shape = tensor.Shape(); - std::vector perm{0, 2, 3, 1}; - std::vector new_dims{orig_shape[0], - orig_shape[2], - orig_shape[3], - orig_shape[1]}; + InlinedVector perm{0, 2, 3, 1}; + TensorShapeVector new_dims{orig_shape[0], + orig_shape[2], + orig_shape[3], + orig_shape[1]}; - packed_w_ = Tensor::Create(tensor.DataType(), TensorShape(new_dims), alloc); + packed_w_ = Tensor(tensor.DataType(), TensorShape(new_dims), std::move(alloc)); - SingleAxisTranspose(perm, tensor, *packed_w_, /*from*/ 1, /*to*/ 3); + SingleAxisTranspose(perm, tensor, packed_w_, /*from*/ 1, /*to*/ 3); is_packed = true; // we can create the kernel now struct xnn_operator* p = nullptr; ORT_RETURN_IF_ERROR(CreateXnnpackKernel(conv_attrs_, C_, M_, kernel_shape_, clip_min_max_, IsDepthwise(), - *packed_w_, B_ ? B_->Data() : nullptr, p)); + packed_w_, B_ ? B_->Data() : nullptr, p)); op0_.reset(p); } diff --git a/onnxruntime/core/providers/xnnpack/nn/conv.h b/onnxruntime/core/providers/xnnpack/nn/conv.h index 3777619f3d..c4826e85ca 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.h +++ b/onnxruntime/core/providers/xnnpack/nn/conv.h @@ -38,7 +38,7 @@ class Conv : public OpKernel { TensorShapeVector kernel_shape_; int64_t C_; int64_t M_; - std::unique_ptr packed_w_; + Tensor packed_w_; const Tensor* B_{nullptr}; std::optional> clip_min_max_; diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 1d5a279f95..4f3d92c630 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1636,8 +1636,8 @@ static common::Status CheckTypes(MLDataType actual, MLDataType expected, const s return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, ostr.str()); } -common::Status InferenceSession::ValidateInputs(const std::vector& feed_names, - const std::vector& feeds) const { +common::Status InferenceSession::ValidateInputs(gsl::span feed_names, + gsl::span feeds) const { if (feed_names.size() != feeds.size()) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Size mismatch: feed_names has ", feed_names.size(), "elements, but feeds has ", feeds.size(), " elements."); @@ -1735,7 +1735,7 @@ common::Status InferenceSession::ValidateInputs(const std::vector& return Status::OK(); } -common::Status InferenceSession::ValidateOutputs(const std::vector& output_names, +common::Status InferenceSession::ValidateOutputs(gsl::span output_names, const std::vector* p_fetches) const { if (p_fetches == nullptr) { return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "Output vector pointer is NULL"); @@ -1863,8 +1863,8 @@ struct ThreadPoolSpinningSwitch { } // namespace Status InferenceSession::Run(const RunOptions& run_options, - const std::vector& feed_names, const std::vector& feeds, - const std::vector& output_names, std::vector* p_fetches, + gsl::span feed_names, gsl::span feeds, + gsl::span output_names, std::vector* p_fetches, const std::vector* p_fetches_device_info) { TimePoint tp; if (session_profiler_.IsEnabled()) { @@ -1895,10 +1895,10 @@ Status InferenceSession::Run(const RunOptions& run_options, << " CUDA Graph for this model with tag: " << run_options.run_tag; ORT_RETURN_IF_ERROR_SESSIONID_(cached_execution_provider_for_graph_replay_.ReplayGraph()); } else { - std::vector exec_providers_to_stop; + InlinedVector exec_providers_to_stop; exec_providers_to_stop.reserve(execution_providers_.NumProviders()); - std::vector arenas_to_shrink; + InlinedVector arenas_to_shrink; ORT_TRY { if (!is_inited_) { @@ -2037,17 +2037,17 @@ Status InferenceSession::Run(const RunOptions& run_options, return retval; } -common::Status InferenceSession::Run(const NameMLValMap& feeds, const std::vector& output_names, +common::Status InferenceSession::Run(const NameMLValMap& feeds, gsl::span output_names, std::vector* p_fetches) { return Run(RunOptions(), feeds, output_names, p_fetches); } common::Status InferenceSession::Run(const RunOptions& run_options, const NameMLValMap& feeds_map, - const std::vector& output_names, std::vector* p_fetches) { - std::vector feed_names; - std::vector feeds; + gsl::span output_names, std::vector* p_fetches) { + InlinedVector feed_names; + InlinedVector feeds; - auto num_feeds = feeds_map.size(); + const auto num_feeds = feeds_map.size(); feed_names.reserve(num_feeds); feeds.reserve(num_feeds); @@ -2177,7 +2177,7 @@ AllocatorPtr InferenceSession::GetAllocator(const OrtMemoryInfo& mem_info) const } common::Status InferenceSession::ValidateAndParseShrinkArenaString(const std::string& ort_device_list, - /*out*/ std::vector& arenas_to_shrink) const { + /*out*/ InlinedVector& arenas_to_shrink) const { arenas_to_shrink.reserve(5); // Allocate some memory for the container (we are unlikely to see more than 5 memory arena shrink requests) std::istringstream ss_1(ort_device_list); @@ -2234,7 +2234,7 @@ common::Status InferenceSession::ValidateAndParseShrinkArenaString(const std::st return Status::OK(); } -void InferenceSession::ShrinkMemoryArenas(const std::vector& arenas_to_shrink) { +void InferenceSession::ShrinkMemoryArenas(gsl::span arenas_to_shrink) { for (auto& alloc : arenas_to_shrink) { auto status = static_cast(alloc.get())->Shrink(); diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h index 963eeddb3b..b1cb1a6841 100644 --- a/onnxruntime/core/session/inference_session.h +++ b/onnxruntime/core/session/inference_session.h @@ -300,8 +300,8 @@ class InferenceSession { */ common::Status Initialize() ORT_MUST_USE_RESULT; - common::Status Run(const RunOptions& run_options, const std::vector& feed_names, - const std::vector& feeds, const std::vector& output_names, + common::Status Run(const RunOptions& run_options, gsl::span feed_names, + gsl::span feeds, gsl::span output_names, std::vector* p_fetches, const std::vector* p_fetches_device_info = nullptr) ORT_MUST_USE_RESULT; @@ -315,7 +315,7 @@ class InferenceSession { * This should not be changed during execution of this function. * @return OK if success. */ - common::Status Run(const NameMLValMap& feeds, const std::vector& output_names, + common::Status Run(const NameMLValMap& feeds, gsl::span output_names, std::vector* p_fetches) ORT_MUST_USE_RESULT; /** @@ -324,7 +324,7 @@ class InferenceSession { * @param run_options use this to tune the Run call to your needs. */ common::Status Run(const RunOptions& run_options, const NameMLValMap& feeds, - const std::vector& output_names, + gsl::span output_names, std::vector* p_fetches) ORT_MUST_USE_RESULT; /** @@ -595,10 +595,10 @@ class InferenceSession { common::Status CheckShapes(const std::string& input_name, const TensorShape& input_shape, const TensorShape& expected_shape) const ORT_MUST_USE_RESULT; - common::Status ValidateInputs(const std::vector& feed_names, - const std::vector& feeds) const ORT_MUST_USE_RESULT; + common::Status ValidateInputs(gsl::span feed_names, + gsl::span feeds) const ORT_MUST_USE_RESULT; - common::Status ValidateOutputs(const std::vector& output_names, + common::Status ValidateOutputs(gsl::span output_names, const std::vector* p_fetches) const ORT_MUST_USE_RESULT; common::Status WaitForNotification(Notification* p_executor_done, int64_t timeout_in_ms) ORT_MUST_USE_RESULT; @@ -617,13 +617,13 @@ class InferenceSession { */ common::Status ValidateAndParseShrinkArenaString(const std::string& ort_device_list, - /*out*/ std::vector& arenas_to_shrink) const ORT_MUST_USE_RESULT; + /*out*/ InlinedVector& arenas_to_shrink) const ORT_MUST_USE_RESULT; /* * Performs the shrinkage of arenas requested to be shrunk by the user * The `arenas_to_shrink` parameter is got from ValidateAndParseShrinkArenaString() */ - void ShrinkMemoryArenas(const std::vector& arenas_to_shrink); + void ShrinkMemoryArenas(gsl::span arenas_to_shrink); #if !defined(ORT_MINIMAL_BUILD) virtual common::Status AddPredefinedTransformers( diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 86a2642c29..4ad8c31fea 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -830,9 +830,23 @@ struct ProviderHostImpl : ProviderHost { const DataTransferManager& SessionState__GetDataTransferMgr(const SessionState* p) override { return p->GetDataTransferMgr(); } // Tensor (wrapped) - std::unique_ptr Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator) override { return std::make_unique(p_type, shape, std::move(allocator)); } - std::unique_ptr Tensor__construct(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset) override { return std::make_unique(p_type, shape, p_data, alloc, offset); } - void Tensor__operator_delete(Tensor* p) override { delete p; } + std::unique_ptr Tensor__construct(MLDataType p_type, const TensorShape& shape, std::shared_ptr allocator) override { + return std::make_unique(p_type, shape, std::move(allocator)); + } + + std::unique_ptr Tensor__construct(MLDataType p_type, const TensorShape& shape, void* p_data, const OrtMemoryInfo& alloc, ptrdiff_t offset) override { + return std::make_unique(p_type, shape, p_data, alloc, offset); + } + + std::unique_ptr Tensor__construct_default() override { + return std::make_unique(); + } + + virtual void Tensor__move_assign(Tensor& lhs, Tensor&& rhs) noexcept override { + lhs = std::move(rhs); + }; + + void Tensor__operator_delete(Tensor* p) noexcept override { delete p; } void Tensor__InitOrtValue(MLDataType elt_type, const TensorShape& shape, std::shared_ptr allocator, OrtValue& ort_value) override { Tensor::InitOrtValue(elt_type, shape, std::move(allocator), ort_value); diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc index d3625c3f6b..61d224de40 100644 --- a/onnxruntime/test/framework/execution_frame_test.cc +++ b/onnxruntime/test/framework/execution_frame_test.cc @@ -439,7 +439,8 @@ TEST(ExecutionFrameTestInit, InitializerAsOutput) { const void* orig_buffer = results[0].Get().DataRaw(); RunOptions ro; - ASSERT_STATUS_OK(session.Run(ro, {}, {}, {"values"}, &results, nullptr)); + ASSERT_STATUS_OK(session.Run(ro, EmptySpan(), + EmptySpan(), AsSpan({std::string("values")}), &results, nullptr)); EXPECT_EQ(results[0].Get().DataRaw(), orig_buffer); EXPECT_THAT(results[0].Get().DataAsSpan(), ::testing::ContainerEq(gsl::make_span(expected))); @@ -453,7 +454,8 @@ TEST(ExecutionFrameTestInit, InitializerAsOutput) { std::vector results; RunOptions ro; - ASSERT_STATUS_OK(session.Run(ro, {}, {}, {"values"}, &results, nullptr)); + ASSERT_STATUS_OK(session.Run(ro, EmptySpan(), + EmptySpan(), AsSpan({std::string("values")}), &results, nullptr)); // output buffer should not be the same as the initializer in SessionState const auto& initializers = session.GetSessionState().GetInitializedTensors(); @@ -464,7 +466,6 @@ TEST(ExecutionFrameTestInit, InitializerAsOutput) { #if !defined(DISABLE_SPARSE_TENSORS) TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) { - const std::vector dense_shape{3, 3}; std::vector dense_data = { 0, 0, 1.764052391052246f, @@ -491,7 +492,7 @@ TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) { auto ml_type = DataTypeImpl::GetType(); results[0].Init(p_tensor.release(), ml_type, ml_type->GetDeleteFunc()); RunOptions ro; - ASSERT_STATUS_OK(session.Run(ro, {}, {}, {"values"}, &results, nullptr)); + ASSERT_STATUS_OK(session.Run(ro, EmptySpan(), EmptySpan(), AsSpan({"values"}), &results, nullptr)); ASSERT_TRUE(results[0].IsAllocated()); ASSERT_TRUE(results[0].IsSparseTensor()); @@ -504,7 +505,7 @@ TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) { EXPECT_THAT(coo_view.Indices().DataAsSpan(), ::testing::ContainerEq(gsl::make_span(expected_linear_indices))); } } -#endif // !defined(DISABLE_SPARSE_TENSORS) +#endif // !defined(DISABLE_SPARSE_TENSORS) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/framework/function_test.cc b/onnxruntime/test/framework/function_test.cc index 82fcbe4f5f..d294f6eb56 100644 --- a/onnxruntime/test/framework/function_test.cc +++ b/onnxruntime/test/framework/function_test.cc @@ -5,6 +5,7 @@ #include "onnx/defs/parser.h" +#include "core/common/span_utils.h" #include "core/graph/model.h" #include "core/providers/cpu/cpu_execution_provider.h" #include "core/session/inference_session.h" @@ -13,6 +14,7 @@ #include "test/framework/test_utils.h" #include "test/common/tensor_op_test_utils.h" + // Unit tests to check the implementation of functions, model-local functions, // function-inlining etc. @@ -56,7 +58,7 @@ static void Check(const char* source, std::vector fetches; - status = session_object.Run(run_options, feeds, {output_name}, &fetches); + status = session_object.Run(run_options, feeds, AsSpan({std::string(output_name)}), &fetches); ASSERT_TRUE(status.IsOK()) << "Session Run failed."; auto& tensor = fetches[0].Get(); diff --git a/orttraining/orttraining/training_ops/cpu/op_gradients.cc b/orttraining/orttraining/training_ops/cpu/op_gradients.cc index e25acd6ea0..e14c12bc01 100644 --- a/orttraining/orttraining/training_ops/cpu/op_gradients.cc +++ b/orttraining/orttraining/training_ops/cpu/op_gradients.cc @@ -86,11 +86,11 @@ Status SoftmaxGrad::Compute(OpKernelContext* context) const { bool is_transpose_required = opset_ >= 13 && axis != (rank - 1); - std::unique_ptr transposed_dY; - std::unique_ptr transposed_Y; - std::vector transposed_input_dims; - std::unique_ptr intermediate_output; // output that the softmax implementation will write into while using transposed input - std::vector permutation(rank); + Tensor transposed_dY; + Tensor transposed_Y; + TensorShapeVector transposed_input_dims; + Tensor intermediate_output; // output that the softmax implementation will write into while using transposed input + InlinedVector permutation(rank); if (is_transpose_required) { AllocatorPtr alloc; @@ -112,26 +112,26 @@ Status SoftmaxGrad::Compute(OpKernelContext* context) const { D = TensorShape(transposed_input_dims).SizeFromDimension(rank - 1); // Allocate a temporary tensor to hold transposed input - auto temp_input0 = Tensor::Create(Y.DataType(), TensorShape(transposed_input_dims), alloc); + auto temp_input0 = Tensor(Y.DataType(), TensorShape(transposed_input_dims), alloc); // Perform the transpose - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, Y, *temp_input0)); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, Y, temp_input0)); transposed_Y = std::move(temp_input0); - auto temp_input1 = Tensor::Create(Y.DataType(), TensorShape(transposed_input_dims), alloc); - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, dY, *temp_input1)); + auto temp_input1 = Tensor(Y.DataType(), TensorShape(transposed_input_dims), alloc); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, dY, temp_input1)); transposed_dY = std::move(temp_input1); // Allocate memory for the intermediate output - intermediate_output = Tensor::Create(dX.DataType(), TensorShape(transposed_input_dims), alloc); + intermediate_output = Tensor(dX.DataType(), TensorShape(transposed_input_dims), alloc); } const int n = gsl::narrow_cast(N); const int d = gsl::narrow_cast(D); const int nd = gsl::narrow_cast(N * D); - const float* Ydata = is_transpose_required ? transposed_Y->template Data() : Y.template Data(); - const float* dYdata = is_transpose_required ? transposed_dY->template Data() : dY.template Data(); - float* dXdata = is_transpose_required ? intermediate_output->template MutableData() : dX.template MutableData(); + const float* Ydata = is_transpose_required ? transposed_Y.template Data() : Y.template Data(); + const float* dYdata = is_transpose_required ? transposed_dY.template Data() : dY.template Data(); + float* dXdata = is_transpose_required ? intermediate_output.template MutableData() : dX.template MutableData(); gsl::copy(gsl::make_span(dYdata, nd), gsl::make_span(dXdata, nd)); if (is_logsoftmaxgrad_) { @@ -164,7 +164,7 @@ Status SoftmaxGrad::Compute(OpKernelContext* context) const { } if (is_transpose_required) { // Perform the transpose to get the axes back to the original ordering - ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, *intermediate_output, dX)); + ORT_RETURN_IF_ERROR(Transpose::DoTranspose(permutation, intermediate_output, dX)); } return Status::OK(); diff --git a/orttraining/orttraining/training_ops/cpu/optimizer/adamw/adamw.cc b/orttraining/orttraining/training_ops/cpu/optimizer/adamw/adamw.cc index 4050270968..39550fc28d 100644 --- a/orttraining/orttraining/training_ops/cpu/optimizer/adamw/adamw.cc +++ b/orttraining/orttraining/training_ops/cpu/optimizer/adamw/adamw.cc @@ -87,10 +87,10 @@ Status AdamWOptimizerBase::GenerateOutputs(OpKernelContext* ctx, size_t number_o updated_values->Reserve(number_of_values); for (size_t input_idx = 0; input_idx < number_of_values; ++input_idx) { const Tensor& source_tensor = values->Get(input_idx); - std::unique_ptr target_tensor = Tensor::Create(source_tensor.DataType(), - source_tensor.Shape(), alloc); - ORT_RETURN_IF_ERROR(CopyInputTensorToOutputTensor(source_tensor, *target_tensor)); - updated_values->Add(std::move(*target_tensor)); // Add will check for type consistency + Tensor target_tensor(source_tensor.DataType(), + source_tensor.Shape(), alloc); + ORT_RETURN_IF_ERROR(CopyInputTensorToOutputTensor(source_tensor, target_tensor)); + updated_values->Add(std::move(target_tensor)); // Add will check for type consistency } }