diff --git a/docs/ONNX_Runtime_Perf_Tuning.md b/docs/ONNX_Runtime_Perf_Tuning.md index aaab8f8422..6c9d82986a 100644 --- a/docs/ONNX_Runtime_Perf_Tuning.md +++ b/docs/ONNX_Runtime_Perf_Tuning.md @@ -1,10 +1,10 @@ # ONNX Runtime Performance Tuning ## Why do we need to tune performance? -ONNX Runtime is designed to be open and extensible with its concept of "Execution Provider" to represents different execution kernels. See the [design overview](./HighLevelDesign.md). +ONNX Runtime is designed to be open and extensible with its concept of "Execution Provider" to represent different execution kernels. See the [design overview](./HighLevelDesign.md). ONNX Runtime supports a variety of execution providers across CPU and GPU: [see the list here](../README.md#high-performance). -For different models and different hardware, there is no silver bullet which can always perform the best. Even for a single execution provider, often there are several knobs that can be tuned (e.g. thread number, wait policy etc.). +For different models and different hardware, there is no silver bullet that can always perform the best. Even for a single execution provider, often there are several knobs that can be tuned (e.g. thread number, wait policy etc.). This document covers basic tools and knobs that can be leveraged to find the best performance for your model and hardware. @@ -84,7 +84,7 @@ sess_options.graph_optimization_level = rt.GraphOptimizationLevel.ORT_ENABLE_ALL * Thread Count * `sess_options.intra_op_num_threads = 2` controls the number of threads to use to run the model * Sequential vs Parallel Execution - * `sess_options.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL` controls whether then operators in the graph should run sequentially or in parallel. Usually when a model has many branches, setting this option to false will provide better performance. 
+ * `sess_options.execution_mode = rt.ExecutionMode.ORT_SEQUENTIAL` controls whether the operators in the graph run sequentially or in parallel. Usually when a model has many branches, setting this option to `rt.ExecutionMode.ORT_PARALLEL` will provide better performance. * When `sess_options.execution_mode = rt.ExecutionMode.ORT_PARALLEL`, you can set `sess_options.inter_op_num_threads` to control the number of threads used to parallelize the execution of the graph (across nodes). @@ -122,3 +122,9 @@ In both cases, you will get a JSON file which contains the detailed performance * Open chrome browser * Type chrome://tracing in the address bar * Load the generated JSON file + + + +## Model graph is not optimized even with graph_optimization_level set to ORT_ENABLE_ALL? + +Starting from IR_VERSION 4, an ONNX model treats only the initializers that also appear in graph inputs as non-constant. This may prevent some graph optimizations, such as constant folding and operator fusion. Move initializers out of graph inputs if there is no need to override them, by either re-generating the model with the latest exporter/converter or by running the tool tools/python/remove_initializer_from_input.py. diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 2cba64b36a..6d1cf7547c 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -778,6 +778,13 @@ Graph::Graph(const Model& owning_model, if (matching_graph_input == nullptr) { name_to_type_map[tensor.name()] = t; ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(tensor.name(), &t)); + } else { + LOGS(logger_, WARNING) << "Initializer " << tensor.name() + << " appears in graph inputs and will not be treated as constant value/weight. " + << "This may fail some of the graph optimizations, like const folding. 
" + << "Move it out of graph inputs if there is no need to override it, " + << "by either re-generating the model with latest exporter/converter " + << "or with the tool onnxruntime/tools/python/remove_initializer_from_input.py."; } } } diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 42406b1214..5aa30f927f 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -852,11 +852,11 @@ common::Status InferenceSession::Initialize() { if (session_options_.execution_mode == ExecutionMode::ORT_PARALLEL && execution_providers_.Get(onnxruntime::kCudaExecutionProvider)) { - LOGS(*session_logger_, ERROR) << "Parallel execution is currently not supported " - "for the registered CUDA Execution Provider."; + LOGS(*session_logger_, ERROR) << "Parallel execution mode doesn't support " + "CUDA Execution Provider currently."; return common::Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, - "Parallel execution is currently not supported " - "for the registered CUDA Execution Provider."); + "Parallel execution mode doesn't support " + "CUDA Execution Provider currently."); } // add predefined transformers @@ -869,7 +869,7 @@ common::Status InferenceSession::Initialize() { // There are 2 kinds of kernel registries with priority from high to low as below, // 1. Custom execution provider type specific kernel registries. // 2. common execution provider type specific kernel registries. - // The 1st and 2nd ones are shared across sessions. + // Kernel registries are shared across sessions. // The 1st ones should have already been registered via session-level API into KernelRegistryManager. // // Register 2nd registries into KernelRegistryManager. 
diff --git a/tools/python/remove_initializer_from_input.py b/tools/python/remove_initializer_from_input.py new file mode 100644 index 0000000000..6099a0fc35 --- /dev/null +++ b/tools/python/remove_initializer_from_input.py @@ -0,0 +1,37 @@ +import onnx +import sys +import argparse + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="input model") + parser.add_argument("--output", required=True, help="output model") + args = parser.parse_args() + return args + + +def remove_initializer_from_input(): + args = get_args() + + model = onnx.load(args.input) + if model.ir_version < 4: + print( + 'Model with ir_version below 4 requires to include initilizer in graph input' + ) + return + + inputs = model.graph.input + name_to_input = {} + for input in inputs: + name_to_input[input.name] = input + + for initializer in model.graph.initializer: + if initializer.name in name_to_input: + inputs.remove(name_to_input[initializer.name]) + + onnx.save(model, args.output) + + +if __name__ == '__main__': + remove_initializer_from_input()