2019-02-04 23:45:12 +00:00
|
|
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
|
|
|
# Licensed under the MIT License.
|
|
|
|
|
|
2021-08-27 07:16:05 +00:00
|
|
|
# Start from an unset pattern list: set() with no value clears the variable.
# The build-flavour branches below append glob patterns to it, and the list is
# later expanded by file(GLOB) into onnxruntime_optimizer_srcs.
set(onnxruntime_optimizer_src_patterns)
|
|
|
|
|
|
2020-08-21 21:14:53 +00:00
|
|
|
if (onnxruntime_MINIMAL_BUILD)
|
|
|
|
|
# we include a couple of files so a library is produced and we minimize other changes to the build setup.
|
2021-08-27 07:16:05 +00:00
|
|
|
# if the transformer base class is unused it will be excluded from the final binary size
|
|
|
|
|
list(APPEND onnxruntime_optimizer_src_patterns
|
2020-08-21 21:14:53 +00:00
|
|
|
"${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/graph_transformer.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/graph_transformer.cc"
|
|
|
|
|
)
|
2021-08-27 07:16:05 +00:00
|
|
|
|
2022-03-09 00:18:49 +00:00
|
|
|
if (onnxruntime_EXTENDED_MINIMAL_BUILD)
|
2022-01-26 01:13:46 +00:00
|
|
|
# Sources compiled into the optimizer library for an extended minimal build.
# Files are listed explicitly (no wildcards) so that only these optimizers are
# built in this configuration. Each file appears exactly once.
list(APPEND onnxruntime_optimizer_src_patterns
  "${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/graph_transformer_utils.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/conv_activation_fusion.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/conv_activation_fusion.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/graph_transformer_utils.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/initializer.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/initializer.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/matmul_nbits_fusion.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/matmul_nbits_fusion.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/nhwc_transformer.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/nhwc_transformer.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_final_cleanup.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_final_cleanup.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_util.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/qdq_util.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/actions.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/actions.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/helpers.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/helpers.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/selector_action_transformer_apply_contexts.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/selector_action_transformer.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/selector_action_transformer.h"
  # files required for layout transformation
  "${ONNXRUNTIME_ROOT}/core/optimizer/layout_transformation/layout_transformation.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/layout_transformation/layout_transformation.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h"
  # files required for transpose optimization post-layout transformation
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/optimizer_api.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/onnx_transpose_optimization.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/ort_optimizer_utils.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/ort_transpose_optimization.h"
  "${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/ort_transpose_optimization.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/utils.cc"
  "${ONNXRUNTIME_ROOT}/core/optimizer/utils.h"
)
|
|
|
|
|
endif()
|
2020-08-21 21:14:53 +00:00
|
|
|
else()
|
2021-08-27 07:16:05 +00:00
|
|
|
list(APPEND onnxruntime_optimizer_src_patterns
|
2019-10-09 00:37:44 +00:00
|
|
|
"${ONNXRUNTIME_INCLUDE_DIR}/core/optimizer/*.h"
|
2019-02-04 23:45:12 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/*.cc"
|
Optimize computation orders (#13672)
### Optimize computation orders
In `Roberta/Electra`, when `ClassificationHead` is used, there is
slicing operation on features on sequence_length dimensions, then loss
calculations only depend on this sliced data. This is a slicing at axis
1. Before slicing the shape is [batch, sequence_length, hidden], after
slicing, it becomes [batch , hidden_stage]
We have the opportunity to move this slicing as early as possible, by
passing it through simple elementwise ops (like Add/Div),
LayerNorm/Softmax (if their reduce axis is after the slicing axis), or
even MatMul's left operand (as long as the slicing does not affect the
last dims).
Operators like Reshape/Transpose are special: they either have data
specified (which needs to be updated after slicing), or they have perm
specified, which requires the input rank to remain unchanged. For those
kinds of operators, we keep the original rank and leave the sliced dim
as 1; after the compute completes, we do a Squeeze.
```
class RobertaClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
```
src\transformers\models\roberta\modeling_roberta.py
src\transformers\models\electra\modeling_electra.py
#### Benchmark
A simple benchmark shows Roberta training latency dropped from 208ms to
199ms, a 4.5+% reduction.
More comprehensive tests are on the way.
### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
2022-12-22 07:12:52 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/compute_optimizer/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/compute_optimizer/*.cc"
|
2023-07-06 22:24:47 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/layout_transformation/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/layout_transformation/*.cc"
|
2021-03-25 16:17:23 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/*.cc"
|
2021-08-27 07:16:05 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/*.cc"
|
2022-01-12 03:41:45 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc"
|
2021-07-09 06:11:43 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/selectors_actions/*.cc"
|
2023-07-06 22:24:47 +00:00
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/*.h"
|
|
|
|
|
"${ONNXRUNTIME_ROOT}/core/optimizer/transpose_optimization/*.cc"
|
2020-08-21 21:14:53 +00:00
|
|
|
)
|
|
|
|
|
endif()
|
2019-02-04 23:45:12 +00:00
|
|
|
|
2023-06-22 19:27:23 +00:00
|
|
|
if (onnxruntime_ENABLE_TRAINING)
|
2021-08-27 07:16:05 +00:00
|
|
|
list(APPEND onnxruntime_optimizer_src_patterns
|
|
|
|
|
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/*.h"
|
|
|
|
|
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/*.cc"
|
2023-04-13 05:02:12 +00:00
|
|
|
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/compute_optimizer/*.h"
|
|
|
|
|
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/compute_optimizer/*.cc"
|
Memory optimization refactor and refinement (#17481)
### Memory optimization refactor and refinement
Currently the memory optimizer runs graph transformations and prints
recompute opportunities at INFO level, but the ORT backend emits so many
INFO level logs that users have a hard time finding that information. So
we are adding a Python binding API to retrieve the memory optimization
opportunities instead of depending on the MemoryOptimizer's default
logging.
Then we can print ORTModule feature statistics using this information.
Also, with such an API, we can run the analysis after an ORT session has
been created, when the allocation plan is done, so the analysis considers
buffer reuse as well. This avoids suggesting recomputation subgraphs that
are reusing other subgraphs' output buffers.
Check
https://github.com/microsoft/onnxruntime/blob/pengwa/add_devinfo_level/docs/Memory_Optimizer.md
for the new flow using `MemoryOptimizer`.
This pull request makes the following refactoring changes:
1. Print the log in ORTModule Python script, along with ORTModule
feature enabling stats. This is implemented by exposing an API
`get_serialized_ortmodule_memory_stat` to retrieve the memory
optimization opportunities.
2. We are analyzing memory optimization opportunities considering ORT
memory planning. This is done by firstly creating the execution graph
without enabling MemoryOptimizer, then we call
`execution_agent.get_serialized_ortmodule_memory_stat` which internally
will consider the session memory allocation planner when analyzing
memory optimization opportunity. As a direct result, the memory
optimization opportunities can show those stashed activations that are
reusing other buffers.
3. Move recompute analysis logic from memory_optimizer.h/cc to
recompute_analysis.h/cc.
4. Abstract optimization strategies for their own implementation. This
will make introducing new strategies (for example compression and
decompression ) easier.
New logging matrix (INFO Level), in WARNING level, the details will NOT
show.
```
2023-09-13 13:25:09,249 orttraining.rank-0 [WARNING] -
***** ONNX Runtime Training (ORTModule) is accelerating your model *****
ORTModule is enabled with following features ON/OFF for [training] mode:
ATen Executor : ON : Dispatch ATen operators to ORT's ATen executor
Cast Propagation : ON : Level 1 enabled
Custom Function : ON : Support custom torch.autograd.Function export and execution
Memory Optimizer : ON : RecomputeConfig: Reshape+Where+BiasSoftmax+:1:-1,Cast+:1:-1, ProbeLevel: 1, available configs:
Config Freq Saving(B) Saving Symbolic(Bytes)
- Plan 1 : ON : Reshape+Where+BiasSoftmax+:1:-1 5 671,088,640 640.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
- Plan 2 : ON : Cast+:1:-1 6 402,587,648 inputs_input_ids_dim0*inputs_input_ids_dim1*(384.0*inputs_input_ids_dim1 - 64.0)
- Plan 3 : OFF : Reshape+Where+:1:-1 1 134,217,728 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1**2
- Plan 4 : OFF : BiasSoftmax+:1:-1 1 134,086,656 128.0*inputs_input_ids_dim0*inputs_input_ids_dim1*(inputs_input_ids_dim1 - 1)
- Plan 5 : OFF : BiasGelu+:1:-1 6 125,808,640 inputs_input_ids_dim0*(122880.0*inputs_input_ids_dim1 - 20480.0)
- Plan 6 : OFF : FusedMatMul+:1:-1 6 125,808,640 inputs_input_ids_dim0*(122880.0*inputs_input_ids_dim1 - 20480.0)
- Plan 7 : OFF : FusedMatMul+Add+FusedMatMul+Add+Add+Add+:1:-1 5 26,214,400 25600.0*inputs_input_ids_dim0*inputs_input_ids_dim1
- Plan 8 : OFF : Add+:1:-1 1 5,237,760 5120.0*inputs_input_ids_dim0*(inputs_input_ids_dim1 - 1)
- Plan 9 : OFF : Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+:1:-1 1 4,096 4.0*inputs_input_ids_dim0*inputs_input_ids_dim1
- Plan 10 : OFF : Cast+:2:-1 1 2,048 2.0*inputs_input_ids_dim0*inputs_input_ids_dim1
Compute Optimizer : ON : Enable/Disable with env ORTMODULE_ENABLE_COMPUTE_OPTIMIZER=1/0
- FLOPReduction : ON : Reduce FLOPs by upstreaming shrinking-sized ops
Auto Fallback : ON : Fallback to PyTorch when encountering unsupported ops
TritonOp Enabled : OFF : ORT will switch to Triton for executing some ops to further accelerate training.
ZeRO Stage3 Support : OFF : Enable/Disable with env ORTMODULE_ENABLE_ZERO_STAGE3=1/0
Total ORT initialization overhead is 10.73s where export takes 8.39s.
Other overhead details: graph builder init takes 0.06s, runtime detection takes 0.01s, graph building takes 0.31s, session creation takes 1.96s
Versions: ONNX Runtime - 1.16.0+cu118, ONNX - 1.11.0
Note 1: use comma to enable multiple plans at the same time.
export ORTMODULE_MEMORY_OPT_CONFIG=<plan1 config>,<plan2 config>,...
Note 2: saving is calculated based on the 1st batch symbolic dim values:
inputs_input_ids_dim0=1,
inputs_input_ids_dim1=1024,
inputs_attention_mask_dim0=1,
inputs_attention_mask_dim1=1024,
inputs_labels_dim0=1,
inputs_labels_dim1=1024,
************************************************************************
```
If DEVINFO level is enabled, then more details about the memory
optimizations are printed.
```
MemoryInsight Summary - User config: BiasGelu+:1:-1,Cast+:2:-1
==========================================================================================================================================
|Freq | Memory Optimization Opportunities (Clustered by node-level activation patterns) |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|3 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph FusedMatMul+Add+Reshape+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=FusedMatMul+Add+Reshape+:1:-1 |
| | Stashed Activations: |
| | - ReuseFreq : Output 0(3), |
| | - Output 0 : [inputs_input_ids_dim0 x inputs_input_ids_dim1 x 32 x 240 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|2 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Reshape+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Reshape+:1:-1 |
| | Stashed Activations: |
| | - ReuseFreq : Output 0(2), |
| | - Output 0 : [ x 2560 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|2 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph FusedMatMul+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=FusedMatMul+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x inputs_input_ids_dim1 x 10240 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|2 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Cast+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Cast+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 32 x inputs_input_ids_dim1 x inputs_input_ids_dim1 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|2 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Reshape+Where+BiasSoftmax+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Reshape+Where+BiasSoftmax+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 32 x inputs_input_ids_dim1 x inputs_input_ids_dim1 x ], byte/elem: 4, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|2 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph BiasGelu+ |
| | Status : Enabled, requested count=-1, actual applied count=2 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x inputs_input_ids_dim1 x 10240 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|2 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph FusedMatMul+Add+FusedMatMul+Add+Add+Add+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=FusedMatMul+Add+FusedMatMul+Add+Add+Add+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x inputs_input_ids_dim1 x 2560 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Reshape+Where+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Reshape+Where+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 32 x inputs_input_ids_dim1 x inputs_input_ids_dim1 x ], byte/elem: 4, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph FusedMatMul+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=FusedMatMul+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0*(inputs_input_ids_dim1 - 1) x 10240 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Cast+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Cast+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 32 x inputs_input_ids_dim1 - 1 x inputs_input_ids_dim1 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Reshape+Unsqueeze+Unsqueeze+Cast+Sub+Mul+Cast+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 1 x 1 x inputs_input_ids_dim1 x ], byte/elem: 4, 100% saved |
| | |
| |>>Option 2 : RecomputeWithCompromise subgraph Cast+ |
| | Status : Enabled, requested count=-1, actual applied count=1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 1 x 1 x inputs_input_ids_dim1 x ], byte/elem: 4, 50% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph BiasSoftmax+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=BiasSoftmax+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0 x 32 x inputs_input_ids_dim1 - 1 x inputs_input_ids_dim1 x ], byte/elem: 4, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph BiasGelu+ |
| | Status : Enabled, requested count=-1, actual applied count=1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0*(inputs_input_ids_dim1 - 1) x 10240 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
|1 |For each row options are mutually exclusive, only one of them can be enabled. |
| | |
| |>>Option 1 : Recompute subgraph Add+ |
| | Status : Disabled. Enable with export ORTMODULE_MEMORY_OPT_CONFIG=Add+:1:-1 |
| | Stashed Activations: |
| | - Output 0 : [inputs_input_ids_dim0*(inputs_input_ids_dim1 - 1) x 2560 x ], byte/elem: 2, 100% saved |
|_ _ _ _|_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ |
==========================================================================================================================================
Note: use comma as a separator for enabling more than one subgraphs.
************************************************************************
```
### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
2023-11-23 03:39:00 +00:00
|
|
|
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/memory_optimizer/*.h"
|
|
|
|
|
"${ORTTRAINING_SOURCE_DIR}/core/optimizer/memory_optimizer/*.cc"
|
2021-08-27 07:16:05 +00:00
|
|
|
)
|
2020-03-11 21:25:37 +00:00
|
|
|
endif()
|
|
|
|
|
|
2021-08-27 07:16:05 +00:00
|
|
|
# Expand the collected patterns into the concrete source list.
# CONFIGURE_DEPENDS makes the build re-check the globs and re-run CMake when
# the set of matching files changes.
file(GLOB onnxruntime_optimizer_srcs CONFIGURE_DEPENDS ${onnxruntime_optimizer_src_patterns})
|
|
|
|
|
|
2019-02-04 23:45:12 +00:00
|
|
|
source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_optimizer_srcs})
|
|
|
|
|
|
2021-11-15 16:16:20 +00:00
|
|
|
if (onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH)
|
|
|
|
|
set(onnxruntime_external_transformer_src_patterns)
|
|
|
|
|
list(APPEND onnxruntime_external_transformer_src_patterns
|
|
|
|
|
"${onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH}/*.cc"
|
|
|
|
|
"${onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH}/*.cpp"
|
|
|
|
|
)
|
|
|
|
|
# Expand the external transformer patterns. CONFIGURE_DEPENDS matches the main
# optimizer glob so that sources added to or removed from
# onnxruntime_EXTERNAL_TRANSFORMER_SRC_PATH trigger a re-configure.
file(GLOB onnxruntime_external_transformer_src CONFIGURE_DEPENDS ${onnxruntime_external_transformer_src_patterns})
|
|
|
|
|
list(APPEND onnxruntime_optimizer_srcs ${onnxruntime_external_transformer_src})
|
|
|
|
|
endif()
|
|
|
|
|
|
2021-04-29 18:54:57 +00:00
|
|
|
onnxruntime_add_static_library(onnxruntime_optimizer ${onnxruntime_optimizer_srcs})
|
2020-08-21 21:14:53 +00:00
|
|
|
|
2022-12-20 19:44:02 +00:00
|
|
|
onnxruntime_add_include_to_target(onnxruntime_optimizer onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface)
|
2019-02-04 23:45:12 +00:00
|
|
|
target_include_directories(onnxruntime_optimizer PRIVATE ${ONNXRUNTIME_ROOT})
|
2023-06-22 19:27:23 +00:00
|
|
|
if (onnxruntime_ENABLE_TRAINING)
|
2020-03-11 21:25:37 +00:00
|
|
|
target_include_directories(onnxruntime_optimizer PRIVATE ${ORTTRAINING_ROOT})
|
2024-05-21 05:38:19 +00:00
|
|
|
onnxruntime_add_include_to_target(onnxruntime_optimizer nlohmann_json::nlohmann_json)
|
2023-10-11 04:36:45 +00:00
|
|
|
if (onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
|
|
|
|
|
onnxruntime_add_include_to_target(onnxruntime_optimizer Python::Module)
|
|
|
|
|
endif()
|
2020-03-11 21:25:37 +00:00
|
|
|
endif()
|
2023-07-13 10:17:58 +00:00
|
|
|
if (onnxruntime_ENABLE_TRITON)
|
|
|
|
|
target_link_libraries(onnxruntime_optimizer PRIVATE nlohmann_json::nlohmann_json)
|
|
|
|
|
onnxruntime_add_include_to_target(onnxruntime_optimizer Python::Module)
|
|
|
|
|
endif()
|
2019-02-04 23:45:12 +00:00
|
|
|
# Build every target named in onnxruntime_EXTERNAL_DEPENDENCIES before this library.
# NOTE(review): add_dependencies() needs at least one dependency argument, so this
# presumably relies on onnxruntime_EXTERNAL_DEPENDENCIES never being empty - confirm.
add_dependencies(onnxruntime_optimizer ${onnxruntime_EXTERNAL_DEPENDENCIES})
|
|
|
|
|
set_target_properties(onnxruntime_optimizer PROPERTIES FOLDER "ONNXRuntime")
|
2022-04-04 05:37:18 +00:00
|
|
|
|
|
|
|
|
if (NOT onnxruntime_BUILD_SHARED_LIB)
|
2023-06-20 05:20:31 +00:00
|
|
|
install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/optimizer DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core)
|
|
|
|
|
# Install the onnxruntime_optimizer target, one destination per artifact kind.
install(
  TARGETS onnxruntime_optimizer
  ARCHIVE   DESTINATION ${CMAKE_INSTALL_LIBDIR}
  LIBRARY   DESTINATION ${CMAKE_INSTALL_LIBDIR}
  RUNTIME   DESTINATION ${CMAKE_INSTALL_BINDIR}
  FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}
)
|
|
|
|
|
endif()
|
2024-01-10 06:49:19 +00:00
|
|
|
|
|
|
|
|
if (onnxruntime_USE_ROCM)
|
|
|
|
|
# For ROCm builds, make sure the generate_hipified_files target is built
# before the optimizer library.
add_dependencies(onnxruntime_optimizer generate_hipified_files)
|
|
|
|
|
endif()
|