onnxruntime/onnxruntime/core/framework/utils.h

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/graph/basic_types.h"
#include "core/framework/allocator.h"
#include "core/framework/data_types.h"
#include "core/framework/framework_common.h"
#include "core/framework/iexecutor.h"
#include "core/framework/session_state.h"
#include "core/framework/session_options.h"

// Forward declarations of the ONNX_NAMESPACE types that this header streams, plus the stream insertion
// operators for them. The operators are only declared here; their definitions live elsewhere.
// NOTE(review): presumably TensorShapeProto/TensorProto are the ONNX protobuf message types - confirm.
namespace ONNX_NAMESPACE {
class TensorShapeProto;
class TensorProto;
// Writes `shape_proto` to `out` and returns a std::ostream reference.
std::ostream& operator<<(std::ostream& out, const TensorShapeProto& shape_proto);
// Writes `tensor_proto` to `out` and returns a std::ostream reference.
std::ostream& operator<<(std::ostream& out, const TensorProto& tensor_proto);
}  // namespace ONNX_NAMESPACE

namespace onnxruntime {
// Forward declarations of onnxruntime types.
// Of these, only FeedsFetchesManager is referenced by the declarations visible in this header.
// NOTE(review): the others may be relied on by files that include this header - confirm before removing any.
// NOTE(review): OpKernelContext is used by VerifyInputTensorsAllocatedContiguously but is not forward-declared
// here; presumably it is made visible by one of the framework includes above - confirm.
class ExecutionProviders;
struct FeedsFetchesInfo;
class FeedsFetchesManager;
struct MLValueCopyInfo;
class Graph;
class KernelDef;
class KernelRegistryManager;
class IExecutionProvider;
class Node;
class Tensor;

// Forward declaration of logging::Logger, which ExecuteGraph and ExecuteSubgraph take by const reference.
namespace logging {
class Logger;
}

namespace utils {
// Default allocation routine; declared here and defined elsewhere.
// Takes the requested `size` and returns an untyped pointer.
// NOTE(review): presumably allocates `size` bytes that must later be released with DefaultFree; the
// alignment and the behaviour on failure or on size == 0 are not visible from this header - confirm.
void* DefaultAlloc(size_t size);
// Default deallocation routine; declared here and defined elsewhere.
// NOTE(review): presumably `p` must be a pointer previously returned by DefaultAlloc; whether a null `p`
// is accepted is not visible from this header - confirm.
void DefaultFree(void* p);

// Returns, by const reference, the provider type string associated with the node input described by `info`.
// NOTE(review): the returned reference presumably refers to storage that outlives `info`'s use here
// (e.g. owned by the node or its execution provider) - confirm lifetime against the definition.
const std::string& GetNodeInputProviderType(const SessionState::NodeInfo& info);

// Copies the value of a single named input across devices.
//   session_state: session the input belongs to.
//   input_name:    name of the input being copied.
//   orig_mlvalue:  source value (read-only).
//   new_mlvalue:   output parameter that receives the result.
// Returns a common::Status describing the outcome.
// NOTE(review): presumably new_mlvalue simply aliases orig_mlvalue when no device copy is needed - confirm.
common::Status CopyOneInputAcrossDevices(const SessionState& session_state, const std::string& input_name,
                                         const OrtValue& orig_mlvalue, OrtValue& new_mlvalue);

// Searches the allocation plan from the session_state to find the OrtMemoryInfo for the value 'name'.
// The result is returned by const reference, so there is no "not found" return value.
// NOTE(review): behaviour when 'name' is not present in the plan is not visible here - confirm.
const OrtMemoryInfo& FindMemoryInfoForValue(const SessionState& session_state,
                                            const std::string& name);

// Initialize the feed and fetch copy info using session_state.
// Determines the device that each graph input that will be fed will be consumed on,
// and the device that each graph output that will be fetched will be created on.
//   session_state:         session whose information is consulted (read-only).
//   feeds_fetches_manager: taken by non-const reference; receives the initialized copy info.
// Returns a common::Status describing the outcome.
common::Status InitializeFeedFetchCopyInfo(const SessionState& session_state,
                                           FeedsFetchesManager& feeds_fetches_manager);

// Finalize the feed and fetch copy info using the device and location information from the feeds
// and fetches that will be used in graph execution.
//   feeds_fetches_manager: taken by non-const reference; its copy info is finalized in place.
//   feed_locations:        devices of the feeds.
//   fetch_alloc_info:      memory info pointers for the fetches.
// NOTE(review): presumably both vectors are ordered to match the feeds/fetches registered in the manager,
// and whether a null entry in fetch_alloc_info is permitted is not visible here - confirm.
void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager,
                               const std::vector<OrtDevice>& feed_locations,
                               const std::vector<const OrtMemoryInfo*>& fetch_alloc_info);

// Execute the main graph. The feeds_fetches_manager will be finalized based on the provided feeds and fetches.
//   session_state:                session to execute.
//   feeds_fetches_manager:        taken by non-const reference because it is finalized by this call.
//   feeds:                        input values (read-only).
//   fetches:                      output parameter for the results.
//   execution_mode:               execution mode to run with.
//   terminate_flag:               taken by reference rather than by value.
//                                 NOTE(review): presumably so that a change made by the caller while the
//                                 graph is running can be observed - confirm.
//   logger:                       logger used for this execution.
//   only_execute_path_to_fetches: defaults to false.
//                                 NOTE(review): presumably restricts execution to the nodes needed to
//                                 produce the fetches - confirm.
// Returns a common::Status describing the outcome.
common::Status ExecuteGraph(const SessionState& session_state, FeedsFetchesManager& feeds_fetches_manager,
                            const std::vector<OrtValue>& feeds, std::vector<OrtValue>& fetches,
                            ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger,
                            bool only_execute_path_to_fetches = false);

// Execute a subgraph. The feeds_fetches_manager should have been finalized prior to calling this function.
// See IControlFlowNode::SetupSubgraphExecutionInfo usage in the control flow kernels.
//   session_state:         session state to execute with.
//   feeds_fetches_manager: taken by const reference, so it is not modified by this call.
//   feeds:                 input values (read-only).
//   fetches:               output parameter for the results.
//   fetch_allocators:      custom allocators keyed by a size_t.
//                          NOTE(review): presumably the key is the index of the fetch the allocator
//                          applies to - confirm.
//   execution_mode:        execution mode to run with.
//   terminate_flag:        taken by reference rather than by value.
//                          NOTE(review): presumably so that a change made by the caller during execution
//                          can be observed - confirm.
//   logger:                logger used for this execution.
// Returns a common::Status describing the outcome.
common::Status ExecuteSubgraph(const SessionState& session_state, const FeedsFetchesManager& feeds_fetches_manager,
                               const std::vector<OrtValue>& feeds, std::vector<OrtValue>& fetches,
                               const std::unordered_map<size_t, IExecutor::CustomAllocator>& fetch_allocators,
                               ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger);

// Compile-time lookup from a C++ element type to its ONNXTensorElementDataType value, e.g.
// GetONNXTensorElementDataType<float>() yields ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT.
//
// Selection is by exact type: any T that has no explicit specialization below falls through to this
// primary template and yields ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED.
template <class T>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
}

// ---- Boolean and string element types ----

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<bool>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<std::string>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;
}

// ---- Floating-point element types ----

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<float>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<double>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<MLFloat16>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<BFloat16>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16;
}

// ---- Signed integer element types ----

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int8_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int16_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int32_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int64_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;
}

// ---- Unsigned integer element types ----

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint8_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint16_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint32_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32;
}

template <>
constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint64_t>() {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64;
}

// Converts an ONNXTensorElementDataType value to a proto tensor type, returned as an int32_t.
// The parameter is unnamed in this declaration.
// NOTE(review): presumably the result is the matching ONNX TensorProto data type value; how inputs with no
// counterpart are handled is not visible here - confirm.
int32_t ONNXTensorElementDataTypeToProtoTensorType(ONNXTensorElementDataType);

// Verifies that the input tensors of `context` are allocated contiguously and reports the result as a
// common::Status.
// NOTE(review): `context` is a raw pointer and presumably must be non-null; the precise definition of
// "contiguously" (ordering, padding/alignment tolerance) is not visible here - confirm.
common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context);

}  // namespace utils
}  // namespace onnxruntime
Initial bootstrap commit. 2018-11-20 00:48:22 +00:00			`// Copyright (c) Microsoft Corporation. All rights reserved.`
			`// Licensed under the MIT License.`

			`#pragma once`

			`#include "core/graph/basic_types.h"`
			`#include "core/framework/allocator.h"`
Opset 9 Scan implementation (#274) * Templatize Scan as step 1 * Pre-thunderstorm save * Initial v8 and v9 implementations. Need to add transpose to v9 and unit tests. * Make Transpose operator implementation re-usable by Scan. Add transpose logic to Scan. * Rework a bunch of things. First Scan 9 unit test passes * Add more tests. Need to add axis validation and handling of negative values. * Convert remaining Scan 8 tests to also work for Scan 9 if applicable. Add invalid input tests for new Scan 9 attributes. * Add transpose unit test. Some cleanups. * Cleanups * Check number of direction entries for outputs at kernel instantiation. 2019-01-07 22:11:46 +00:00			`#include "core/framework/data_types.h"`
Handle copy to/from non-CPU devices across control flow nodes (#339) 2019-01-17 18:51:23 +00:00			`#include "core/framework/framework_common.h"`
Add the ability to use a custom allocator for fetches to avoid unnecessary copies in control flow operators. (#377) * Add the ability to use a custom allocator for fetches. Allows control flow nodes to forward the allocation to the control flow op and avoid an unnecessary copy when the subgraph output has a symbolic dimension. Update Scan and If to use custom allocators when applicable. * Remove unnecessary forward declaration * Fix Mac build warnings 2019-01-29 09:48:10 +00:00			`#include "core/framework/iexecutor.h"`
Handle copy to/from non-CPU devices across control flow nodes (#339) 2019-01-17 18:51:23 +00:00			`#include "core/framework/session_state.h"`
Introduce execution mode enum for clarity and extensibility; Change Python, C and C# APIs accordingly; Removed EnableSequentialExecution, DisableSequentialExecution in favor of the more general SetExecutionModeAPI. (#2098) * Introduce execution mode for clarity and extensibility; Change Python APIs accordingly; Replace DisableSequentialExecution API with EnableParallelExecution for clarity. * Fix cuda build * Modify the test slightly * Make C and C# APIs consistent with Python. 2019-10-14 16:48:19 +00:00			`#include "core/framework/session_options.h"`
Initial bootstrap commit. 2018-11-20 00:48:22 +00:00
Filter out info from non-const initializers during shape inferencing (#1806) * Don't return shape for non-const initializer in InferenceContextImpl::getInputType Don't return initializer for non-const initializer in InferenceContextImpl::getInputData Update graph_utils to support these scenarios - fix GetConstantInitializer to make sure a name is for an outer scope value before checking a parent graph, as local name could shadow an outer scope initializer. 2019-09-26 03:44:33 +00:00			`namespace ONNX_NAMESPACE {`
			`class TensorShapeProto;`
			`class TensorProto;`
			`std::ostream& operator<<(std::ostream& out, const TensorShapeProto& shape_proto);`
			`std::ostream& operator<<(std::ostream& out, const TensorProto& tensor_proto);`
			`} // namespace ONNX_NAMESPACE`

Initial bootstrap commit. 2018-11-20 00:48:22 +00:00			`namespace onnxruntime {`
			`class ExecutionProviders;`
Rework the feed/fetch copy setup so that it can be calculated prior to subgraph execution (#1761) * Rework the feed/fetch copy setup so that it can be calculated upfront by the control flow nodes. Also simplifies how it all works. Update the control flow nodes to do the calculation prior to graph execution. 2019-09-10 05:46:00 +00:00			`struct FeedsFetchesInfo;`
Various optimizations to reduce the setup and device copying cost outside of the call to ExecuteGraph. (#470) * Various optimizations to reduce the setup and execution cost. Cache information about the feeds and fetches, and any device copies required to execute the graph so we minimize checking for later calls to ExecuteGraph using the same input/output. - enable use of caching in Loop and Scan - make use of caching optional for InferenceSession::Run - handle calls to Run with different feeds and fetches to support scenarios where there may be a truncated sequence in some calls Take the feed names and MLValue instances as vectors so the order is deterministic. Add unit tests Update onnxruntime_perf_test to enable caching. * Couple of tweaks. Fix shared library unit test failure. Attempt to workaround MacOS build failure due to VC++ bug around including reaching scope values in a lambda automatically. * Rework order of init in Run so we get nice error messages about invalid feed/output names. * Refine logic around copying MLValue using execution provider so common code can be used. Simplify the logic due to this change. Split the paths for executing with/without cached info so we can be more const correct with how FeedsFetchesManager is passed in. This makes it clearer when a shared instance can be used due to it being const. Cache the FeedsFetchesManager instances in the control flow nodes. They can be re-used across calls to Compute. * Removed unused local variable to fix some builds. * Fix build issue by cleaning up some more unused params. * Check names when using cache entry from SessionState. Add unit test. 2019-02-20 02:12:17 +00:00			`class FeedsFetchesManager;`
Rework the feed/fetch copy setup so that it can be calculated prior to subgraph execution (#1761) * Rework the feed/fetch copy setup so that it can be calculated upfront by the control flow nodes. Also simplifies how it all works. Update the control flow nodes to do the calculation prior to graph execution. 2019-09-10 05:46:00 +00:00			`struct MLValueCopyInfo;`
Handle copy to/from non-CPU devices across control flow nodes (#339) 2019-01-17 18:51:23 +00:00			`class Graph;`
Initial bootstrap commit. 2018-11-20 00:48:22 +00:00			`class KernelDef;`
			`class KernelRegistryManager;`
Handle copy to/from non-CPU devices across control flow nodes (#339) 2019-01-17 18:51:23 +00:00			`class IExecutionProvider;`
			`class Node;`
			`class Tensor;`
Initial bootstrap commit. 2018-11-20 00:48:22 +00:00
			`namespace logging {`
			`class Logger;`
			`}`

			`namespace utils {`
Share default CPU allocator with Mlas preferred alignment (#1682) Description: make default CPU allocator to use MLAS preferred alignment Motivation and Context This is needed for C API to have an aligned default CPU allocator, the same as the one in CPU provider 2019-08-23 19:06:35 +00:00			`void* DefaultAlloc(size_t size);`
			`void DefaultFree(void* p);`
Initial bootstrap commit. 2018-11-20 00:48:22 +00:00
Handle copy to/from non-CPU devices across control flow nodes (#339) 2019-01-17 18:51:23 +00:00			`const std::string& GetNodeInputProviderType(const SessionState::NodeInfo& info);`

Remove unnecessary casts from OrtValue to MLValue(#1051) 2019-05-17 14:52:59 +00:00			`common::Status CopyOneInputAcrossDevices(const SessionState& session_state, const std::string& input_name,`
			`const OrtValue& orig_mlvalue, OrtValue& new_mlvalue);`
Handle copy to/from non-CPU devices across control flow nodes (#339) 2019-01-17 18:51:23 +00:00
Rework the feed/fetch copy setup so that it can be calculated prior to subgraph execution (#1761) * Rework the feed/fetch copy setup so that it can be calculated upfront by the control flow nodes. Also simplifies how it all works. Update the control flow nodes to do the calculation prior to graph execution. 2019-09-10 05:46:00 +00:00			`// Searches the allocation plan from the session_state to find the OrtMemoryInfo for the value 'name'.`
			`const OrtMemoryInfo& FindMemoryInfoForValue(const SessionState& session_state,`
			`const std::string& name);`

			`// Initialize the feed and fetch copy info using session_state.`
			`// Determines the device that each graph input that will be fed will be consumed on,`
			`// and the device that each graph output that will be fetched will be created on.`
			`common::Status InitializeFeedFetchCopyInfo(const SessionState& session_state,`
			`FeedsFetchesManager& feeds_fetches_manager);`

			`// Finalize the feed and fetch copy info using session_state and the device and location information from the feeds`
			`// and fetches that will be used in graph execution.`
Cleanup SessionState. Move allocator lookup to SessionState. (#4194) * Move allocators to SessionState so they're decoupled from ExecutionProviders - when looking up an allocator it's based on OrtMemoryInfo not the EP so SessionState is a more natural place for that infromation to be stored - add device based lookup - simplifies logic for copying feeds/fetches across devices Cleanup SessionState and SessionStateInitializer - provide more things to SessionState at construction time so we don't construct and instance and immediately after call a bunch of setters - simplify SessionStateInitializer - reduced down to FinalizeSessionState method 2020-06-28 04:55:42 +00:00			`void FinalizeFeedFetchCopyInfo(FeedsFetchesManager& feeds_fetches_manager,`
Rework the feed/fetch copy setup so that it can be calculated prior to subgraph execution (#1761) * Rework the feed/fetch copy setup so that it can be calculated upfront by the control flow nodes. Also simplifies how it all works. Update the control flow nodes to do the calculation prior to graph execution. 2019-09-10 05:46:00 +00:00			`const std::vector<OrtDevice>& feed_locations,`
			`const std::vector<const OrtMemoryInfo*>& fetch_alloc_info);`

			`// Execute the main graph. The feed_fetches_manager will be finalized based on the provided feeds and fetches.`
Remove unnecessary casts from OrtValue to MLValue(#1051) 2019-05-17 14:52:59 +00:00			`common::Status ExecuteGraph(const SessionState& session_state, FeedsFetchesManager& feeds_fetches_manager,`
			`const std::vector<OrtValue>& feeds, std::vector<OrtValue>& fetches,`
Introduce training changes. 2020-03-11 21:25:37 +00:00			`ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger,`
			`bool only_execute_path_to_fetches = false);`
Rework the feed/fetch copy setup so that it can be calculated prior to subgraph execution (#1761) * Rework the feed/fetch copy setup so that it can be calculated upfront by the control flow nodes. Also simplifies how it all works. Update the control flow nodes to do the calculation prior to graph execution. 2019-09-10 05:46:00 +00:00
			`// Execute a subgraph. The feeds_fetches_manager should have been finalized prior to calling this function.`
			`// See IControlFlowNode::SetupSubgraphExecutionInfo usage in the control flow kernels.`
			`common::Status ExecuteSubgraph(const SessionState& session_state, const FeedsFetchesManager& feeds_fetches_manager,`
			`const std::vector<OrtValue>& feeds, std::vector<OrtValue>& fetches,`
			`const std::unordered_map<size_t, IExecutor::CustomAllocator>& fetch_allocators,`
Introduce execution mode enum for clarity and extensibility; Change Python, C and C# APIs accordingly; Removed EnableSequentialExecution, DisableSequentialExecution in favor of the more general SetExecutionModeAPI. (#2098) * Introduce execution mode for clarity and extensibility; Change Python APIs accordingly; Replace DisableSequentialExecution API with EnableParallelExecution for clarity. * Fix cuda build * Modify the test slightly * Make C and C# APIs consistent with Python. 2019-10-14 16:48:19 +00:00			`ExecutionMode execution_mode, const bool& terminate_flag, const logging::Logger& logger);`
Opset 9 Scan implementation (#274) * Templatize Scan as step 1 * Pre-thunderstorm save * Initial v8 and v9 implementations. Need to add transpose to v9 and unit tests. * Make Transpose operator implementation re-usable by Scan. Add transpose logic to Scan. * Rework a bunch of things. First Scan 9 unit test passes * Add more tests. Need to add axis validation and handling of negative values. * Convert remaining Scan 8 tests to also work for Scan 9 if applicable. Add invalid input tests for new Scan 9 attributes. * Add transpose unit test. Some cleanups. * Cleanups * Check number of direction entries for outputs at kernel instantiation. 2019-01-07 22:11:46 +00:00
Introduce PrimitiveType into a Type System along with an integer constant (#2307) Improve perf by avoiding GetType<T>() calls. Introduce MLTypeCallDispatcher to switch on Input Type. Add Tensor IsType<T>() fast method. 2019-11-09 01:47:06 +00:00			`template <typename T>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<bool>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<std::string>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<float>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<double>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<MLFloat16>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<BFloat16>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int8_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint8_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int16_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint16_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int32_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint32_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<int64_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64;`
			`}`

			`template <>`
			`constexpr ONNXTensorElementDataType GetONNXTensorElementDataType<uint64_t>() {`
			`return ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64;`
			`}`
Opset 9 Scan implementation (#274) * Templatize Scan as step 1 * Pre-thunderstorm save * Initial v8 and v9 implementations. Need to add transpose to v9 and unit tests. * Make Transpose operator implementation re-usable by Scan. Add transpose logic to Scan. * Rework a bunch of things. First Scan 9 unit test passes * Add more tests. Need to add axis validation and handling of negative values. * Convert remaining Scan 8 tests to also work for Scan 9 if applicable. Add invalid input tests for new Scan 9 attributes. * Add transpose unit test. Some cleanups. * Cleanups * Check number of direction entries for outputs at kernel instantiation. 2019-01-07 22:11:46 +00:00
Introduce container type runtime checks and other improvements (#2522) Rework TensorSeq in a manner consistent with Tensor and SparseTensor in terms of type system setup. Reduce templating. Introduce helpers to ensure the same data type. Make OrtValue __dtor not virtual. Introduce ContainerChecker 2019-12-05 00:04:17 +00:00			`int32_t ONNXTensorElementDataTypeToProtoTensorType(ONNXTensorElementDataType);`

Memory planner and pattern generation enhancements. (#4443) * static allocation. * chanegs. * contigious dynamic allocation. * contigious dynamic allocation. * fix bugs. * fix bug. * build errors. * PR feedback. * PR feedback. * Update Graph builder for nccl_allreduce, mps. * misc. * fix windows build break. * changes. * fine-grained memory-time scheduling. * merge. * fix misc stuff. * fix windows build. * fix windows build. * fix merge bug. * merge conflicts. * revert onnx-tensorrt submodule commit. * fix submodule commit. * misc. * merge conflicts. * Revert "merge conflicts." This reverts commit 319a071a6e63e79d001afe40faba5adccd137902. * merge conflict. * merge conflict. * merge conflicts. * fixes. * PR feedback. * build break. * build break. * Add asserts. * Add asserts. * asserts. * asserts. * asserts. * asserts. * asserts. * fixes. * fixes. Co-authored-by: Ubuntu <OrtTrainingDev3@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> Co-authored-by: root <root@OrtTrainingDev3.af05slrtruoetgaxwwjv5nsq5e.px.internal.cloudapp.net> 2020-11-02 07:05:46 +00:00			`common::Status VerifyInputTensorsAllocatedContiguously(OpKernelContext* context);`

Initial bootstrap commit. 2018-11-20 00:48:22 +00:00			`} // namespace utils`
			`} // namespace onnxruntime`