onnxruntime/cmake/onnxruntime_optimizer.cmake

116 lines
6.6 KiB
CMake
Raw Normal View History

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
set(onnxruntime_optimizer_src_patterns)
if (onnxruntime_MINIMAL_BUILD)
# we include a couple of files so a library is produced and we minimize other changes to the build setup.
# if the transformer base class is unused it will be excluded from the final binary size
list(APPEND onnxruntime_optimizer_src_patterns
"${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/graph_transformer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/graph_transformer.cc"
)
if (onnxruntime_EXTENDED_MINIMAL_BUILD)
list(APPEND onnxruntime_optimizer_src_patterns
"${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/graph_transformer_utils.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/conv_activation_fusion.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/conv_activation_fusion.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/graph_transformer_utils.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/initializer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/initializer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/nhwc_transformer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/nhwc_transformer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_final_cleanup.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_final_cleanup.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_util.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_util.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/actions.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/actions.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/helpers.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/helpers.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/helpers.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/helpers.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/selector_action_transformer_apply_contexts.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/selector_action_transformer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/selector_action_transformer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/optimizer_api_impl.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/optimizer_api.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/optimizer_utils.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/ort_transpose_optimizer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/ort_transpose_optimizer.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/transpose_optimizer.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/utils.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/utils.h"
)
endif()
else()
list(APPEND onnxruntime_optimizer_src_patterns
"${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/*.cc"
Optimize computation orders (#13672) ### Optimize computation orders In `Roberta/Electra`, when `ClassificationHead` is used, there is slicing operation on features on sequence_length dimensions, then loss calculations only depend on this sliced data. This is a slicing at axis 1. Before slicing the shape is [batch, sequence_length, hidden], after slicing, it becomes [batch , hidden_stage] We had opportunities to bring this slicing earlier as much as possible, by passing through simple elementwise ops (like Add/Div), or Layernorm/Softmax(if their reduce axis is after the slicing axis), or even MatMul's the left operand (if only it did not affect the last dims). For operators like Reshape/Transpose, it is special since they have either data specified (after slicing we need update), or they have perm specified, which requires the input rank remain unchanged. So for those kinds of operators, we can remain the original rank, but just leave the sliced dim to be 1, after the compute completed, we do a Squeeze. ``` class RobertaClassificationHead(nn.Module): """Head for sentence-level classification tasks.""" def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) classifier_dropout = ( config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob ) self.dropout = nn.Dropout(classifier_dropout) self.out_proj = nn.Linear(config.hidden_size, config.num_labels) def forward(self, features, **kwargs): x = features[:, 0, :] # take <s> token (equiv. to [CLS]) x = self.dropout(x) x = self.dense(x) x = torch.tanh(x) x = self.dropout(x) x = self.out_proj(x) return x ``` src\transformers\models\roberta\modeling_roberta.py src\transformers\models\electra\modeling_electra.py #### Benchmark A simple benchmark shows Robeta training latency dropped from 208ms ~ 199ms. 4.5+% reduction. More comprehensive tests are on the way. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
2022-12-22 07:12:52 +00:00
"${ONNXRUNTIME_ROOT}/core/optimizer/compute_optimizer/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/compute_optimizer/*.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/*.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/*.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/*.cc"
Add Transpose Optimizer and modify nhwc optimizer to use it. (#9284) * Add Transpose Optimizer and modify nhwc optimizer to use it. * Fix casts * Fix casts2 * Fix move * Add tests * Add headers * Fixes and tests * Remove explicit template instantiation * Fix build warning * Name unit tests * Code review fixes * Add some comments * Fix some casts * Make optimization slightly less agressive * Some unit test fixes * Update Attention pattern to work with transpose optimizer * Update attention fuser * Fix attention fusion python script * Improve transpose optimizer documentation * Create OptimizerCtx struct * Disable Slice handler for testing * Implement Slice int32 * Only push transposes leading up to other transposes * Improve optimization heuristic * Add exemption for MaxPool * Document transpose optimizer api.h * Revert fusion tests to master * Remove temp files * Replace typedef with using * Trim trailing whitespace * Move class declarations from api_impl.h to api_impl.cc * Remove copy constructors and move allocator * Alphabetize headers * Add override keyword * Comments for nhwc_transformer * Rename OrtGraph to ApiGraph, etc. * Wrap line * Remove extra qualifier on ApiGraph * Refector attention fusion * Remove c-style casts from api_impl.cc * Improve documentation * Avoid printing vector in ORT_ENSURES * Revert attention fusion refactor * Remove duplicate cost heuristics and improve documentation * Fix size_t casts * Fixes from Scott's review * Unrevert attention refactor and more updates from Scott's review * Revert api_impl.cc ValueInfo change * only optimize first transpose input * Unrevert api_impl.cc changes * Make vector call reserve * transpose_optimizer.cc update from Scott's comments * Rename api::Graph to api::GraphRef etc. * Consider domains 'onnx.ai' and '' equal * Replace AddInput with SetInput * Improve tests * quantization and heuristic tests * Comments for tests * Replace const string_view with string_view and update tests * Fixes requested by Edward * Fix std::string to string_view conversion * Add <string> to includes * Fix bug for broadcasting ops with unknown rank. Slight safety improvements * Changes requested by Edward * Fix formatting * Improve description of cost metric
2021-10-28 05:10:39 +00:00
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/*.h"
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimizer/*.cc"
)
endif()
if (onnxruntime_ENABLE_TRAINING_APIS)
# we need optimizers for both full build as well as training api only build.
# Using onnxruntime_ENABLE_TRAINING_APIS since it is always ON in a full training build.
list(APPEND onnxruntime_optimizer_src_patterns
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/*.h"
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/*.cc"
)
2020-03-11 21:25:37 +00:00
endif()
file(GLOB onnxruntime_optimizer_srcs CONFIGURE_DEPENDS ${onnxruntime_optimizer_src_patterns})
source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_optimizer_srcs})
if (onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH)
set(onnxruntime_external_transformer_src_patterns)
list(APPEND onnxruntime_external_transformer_src_patterns
"${onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH}/*.cc"
"${onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH}/*.cpp"
)
file(GLOB onnxruntime_external_transformer_src ${onnxruntime_external_transformer_src_patterns})
list(APPEND onnxruntime_optimizer_srcs ${onnxruntime_external_transformer_src})
endif()
onnxruntime_add_static_library(onnxruntime_optimizer ${onnxruntime_optimizer_srcs})
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/optimizer DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core)
onnxruntime_add_include_to_target(onnxruntime_optimizer onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface)
target_include_directories(onnxruntime_optimizer PRIVATE ${ONNXRUNTIME_ROOT})
if (onnxruntime_ENABLE_TRAINING_APIS)
2020-03-11 21:25:37 +00:00
target_include_directories(onnxruntime_optimizer PRIVATE ${ORTTRAINING_ROOT})
endif()
add_dependencies(onnxruntime_optimizer ${onnxruntime_EXTERNAL_DEPENDENCIES})
set_target_properties(onnxruntime_optimizer PROPERTIES FOLDER "ONNXRuntime")
if (NOT onnxruntime_BUILD_SHARED_LIB)
install(TARGETS onnxruntime_optimizer
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()