diff --git a/include/onnxruntime/core/optimizer/graph_transformer_config.h b/include/onnxruntime/core/optimizer/graph_transformer_config.h index ac760981b0..c112d9b048 100644 --- a/include/onnxruntime/core/optimizer/graph_transformer_config.h +++ b/include/onnxruntime/core/optimizer/graph_transformer_config.h @@ -11,16 +11,16 @@ struct GraphTransformerConfiguration { struct PropagateCastOpsConfiguration { // Propagate FP16 Cast operations up and FP32 operations down /* - * Cast propagation strategy. - * One strategy is to insert casts around all the nodes with the allowed opcodes - * and reduce, by removing redundent-casts and back-to-back-casts etc., and - * the other is to propagate casts using flood-fill approach, expanding float16 regions in the graph - * traversing the graph up/down. - */ + * Cast propagation strategy. + * One strategy is to insert casts around all the nodes with the allowed opcodes + * and reduce, by removing redundant-casts and back-to-back-casts etc., and + * the other is to propagate casts using flood-fill approach, expanding float16 regions in the graph + * traversing the graph up/down.
+ */ enum class Strategy { None = 0, InsertAndReduce = 1, - FloodFill = 2, /* Propagate FP16 Cast operations up and FP32 operations down */ + FloodFill = 2, /* Propagate FP16 Cast operations up and FP32 operations down */ }; using Strategy_t = std::underlying_type::type; friend constexpr Strategy operator|(const Strategy s1, const Strategy s2) { @@ -54,7 +54,7 @@ struct GraphTransformerConfiguration { 1 => use ORT predefined list of level 1 opcodes in addition to the user specified allow opcodes 2 => use ORT predefined list of level 2 opcodes in addition to the user specified allow opcodes */ - Strategy strategy = Strategy::None; + Strategy strategy = Strategy::FloodFill; // List of allowed opcodes to consider as safe to execute in float16, while moving cast operations std::vector allow; }; diff --git a/onnxruntime/core/optimizer/propagate_cast_ops.cc b/onnxruntime/core/optimizer/propagate_cast_ops.cc index 09711e1a17..740e915f0a 100644 --- a/onnxruntime/core/optimizer/propagate_cast_ops.cc +++ b/onnxruntime/core/optimizer/propagate_cast_ops.cc @@ -15,9 +15,14 @@ using namespace onnxruntime::common; * and 2. Level 2 being the most agressive, may consider moving float operations to float16 which may result in different numerical results * due to loss of precision. The user may choose level 0, whereby the user chooses the opcodes which are "FP16 Safe" instead of a list * predetermined opcodes as in levels 1 and 2. -* Currently two strategies are available, InsertAndReduce and FloodFill. +* Currently three strategies are available, None, InsertAndReduce and FloodFill. +* None: +* Although no new cast operations are inserted or propagated using this strategy some optimizations are performed +* 1. Remove back-to-back casts +* 2. Fuse subgraphs +* 3. Remove unnecessary casts * InsertAndReduce : -* This transformation converts all FP16 operations to float16. The transformation first +* This transformation converts all FP16 operations to float16. 
The transformation first * 1. Inserts float16 cast operation on all the float inputs * 2. Changes all float outputs to float16 * 3. Inserts float cast operations on all float outputs as expected diff --git a/onnxruntime/core/optimizer/propagate_cast_ops.h b/onnxruntime/core/optimizer/propagate_cast_ops.h index f57f392771..86f4b63b5a 100644 --- a/onnxruntime/core/optimizer/propagate_cast_ops.h +++ b/onnxruntime/core/optimizer/propagate_cast_ops.h @@ -16,7 +16,7 @@ Propagate FP16 Cast operations up the graph and FP32 Cast operations down the gr class PropagateCastOps : public GraphTransformer { public: PropagateCastOps(GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy strategy = - GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy::InsertAndReduce, + GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy::FloodFill, size_t level = 0, const std::vector& allow_list = {}, const std::unordered_set& compatible_execution_providers = {}) noexcept; diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc index de7817f41a..05f782aaf1 100644 --- a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -113,7 +113,7 @@ struct TrainingParameters { int propagate_cast_ops_level = 1; std::vector propagate_cast_ops_allow; GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy propagate_cast_ops_strategy = - GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy::None; + GraphTransformerConfiguration::PropagateCastOpsConfiguration::Strategy::FloodFill; bool allow_layer_norm_mod_precision = false; // graph dumping diff --git a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py index 39d24c904e..874b0c605e 100644 --- 
a/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py +++ b/orttraining/orttraining/python/training/ortmodule/_graph_execution_manager.py @@ -112,8 +112,8 @@ class GraphExecutionManager(GraphExecutionInterface): # Graph transformer config # Specify cast propagation strategy. Currently three strategies are available, NONE, INSERT-AND-REDUCE and FLOOD-FILL - # The default is NONE, which implies the transformer does no cast-propagation transformation. - self._propagate_cast_ops_strategy = C.PropagateCastOpsStrategy.NONE + # The default is FLOOD_FILL, which expands FP16 computation regions in the graph using the allowed opcodes for the given level. + self._propagate_cast_ops_strategy = C.PropagateCastOpsStrategy.FLOOD_FILL # Optimize by moving Cast operations if propagate_cast_ops_level is non-negative. # - If the _propagate_cast_ops_level is set to zero, then the transformation considers only the opcodes specified by _propagate_cast_ops_allow # as "FP16 safe", in order to insert/(re)move cast operations before/after to perform such operations in reduced (16-bit) precision. diff --git a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py index 0c6b99bdf9..8c3d6508d5 100644 --- a/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py +++ b/orttraining/orttraining/python/training/ortmodule/experimental/json_config/_load_config_from_json.py @@ -213,7 +213,7 @@ def load_from_json(ortmodule, path=None): { "PropagateCastOps": { - "Strategy": "FLOOD_FILL", # str representing strategy (like "NONE", "FLOOD_FILL"...)
+ "Strategy": "FLOOD_FILL", # str representing strategy ("NONE", "FLOOD_FILL", or "INSERT_AND_REDUCE") "Level": 3, # propagate cast ops level as an int "Allow": ["ABC", "DEF"] # propagate cast ops allow as list of strs }, diff --git a/orttraining/orttraining/python/training/orttrainer_options.py b/orttraining/orttraining/python/training/orttrainer_options.py index 527d865fe9..8b4c6a866e 100644 --- a/orttraining/orttraining/python/training/orttrainer_options.py +++ b/orttraining/orttraining/python/training/orttrainer_options.py @@ -197,7 +197,7 @@ class ORTTrainerOptions(object): 'schema': { 'propagate_cast_ops_strategy': { 'type': 'onnxruntime.training.PropagateCastOpsStrategy', - 'default': PropagateCastOpsStrategy.NONE + 'default': PropagateCastOpsStrategy.FLOOD_FILL }, 'propagate_cast_ops_level': { 'type': 'integer', @@ -374,7 +374,7 @@ class ORTTrainerOptions(object): graph_transformer.transformer_layer_recompute(bool, default False) graph_transformer.number_recompute_layers(bool, default False) graph_transformer.propagate_cast_ops_config (dict): - graph_transformer.propagate_cast_ops_config.strategy(PropagateCastOpsStrategy, default NONE) + graph_transformer.propagate_cast_ops_config.strategy(PropagateCastOpsStrategy, default FLOOD_FILL) Specify the choice of the cast propagation optimization strategy, either, NONE, INSERT_AND_REDUCE or FLOOD_FILL. 
NONE strategy does not perform any cast propagation transformation on the graph, although other optimizations locally change cast operations, for example, in order to fuse Transpose and MatMul nodes, the TransposeMatMulFunsion optimization could @@ -726,12 +726,12 @@ _ORTTRAINER_OPTIONS_SCHEMA = { 'strategy': { 'type': 'propagate_cast_ops_strategy', 'nullable': True, - 'default': PropagateCastOpsStrategy.NONE + 'default': PropagateCastOpsStrategy.FLOOD_FILL }, 'level': { 'type': 'integer', 'min': -1, - 'default': -1 + 'default': 1 }, 'allow': { 'type': 'list', diff --git a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py index 4577de2644..40b3b7ba11 100644 --- a/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py +++ b/orttraining/orttraining/test/python/orttraining_test_orttrainer_frontend.py @@ -80,8 +80,8 @@ def testORTTrainerOptionsDefaultValues(test_input): 'number_recompute_layers': 0, 'allow_layer_norm_mod_precision': False, 'propagate_cast_ops_config': { - 'strategy': PropagateCastOpsStrategy.NONE, - 'level': -1, + 'strategy': PropagateCastOpsStrategy.FLOOD_FILL, + 'level': 1, 'allow': [] } },