onnxruntime/onnxruntime/python/tools/quantization/fusions/fusion_layernorm.py

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from __future__ import annotations

import onnx

from ..onnx_model import ONNXModel
from .fusion import Fusion


class FusionLayerNormalization(Fusion):
    """Fusion pass that collapses a decomposed layer-norm subgraph into one LayerNormalization node.

    The pass is registered with the base ``Fusion`` class using "LayerNormalization" as the fused
    op type and "ReduceMean" as the op type of the nodes handed to :meth:`fuse`.
    """

    def __init__(self, model: ONNXModel):
        super().__init__(model, "LayerNormalization", "ReduceMean")

    def fuse(
        self,
        reduce_mean_node: onnx.NodeProto,
        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
        output_name_to_node: dict[str, onnx.NodeProto],
    ) -> None:
        """
        Interface function that tries to fuse a node sequence containing a ReduceMean node into a single
        LayerNormalization node.

              +----------------------+
              |                      |
              |                      v
          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div --> Mul --> Add
                     (axis=2 or -1)  |      (Y=2)   (axis=2 or -1)  (0 < eps <= 1e-4)  ^
                                     |                                                 |
                                     +-------------------------------------------------+

         It also handles cases of duplicated sub nodes exported from older version of PyTorch:

              +----------------------+
              |                      v
              |           +-------> Sub-----------------------------------------------+
              |           |                                                           |
              |           |                                                           v
          [Root] --> ReduceMean -->  Sub  --> Pow --> ReduceMean --> Add --> Sqrt --> Div  --> Mul --> Add
              |                      ^
              |                      |
              +----------------------+

        An optional Cast between Sub and Pow is also accepted.

        NOTE(review): the ReduceMean axes shown in the diagrams are not verified by this method, and no
        ``axis`` attribute is set on the fused node; confirm callers only run this on last-axis reductions.

        Args:
            reduce_mean_node: The first ReduceMean node of the candidate pattern.
            input_name_to_nodes: Maps a tensor name to the nodes that consume it.
            output_name_to_node: Maps a tensor name to the node that produces it.

        Returns:
            None. On a successful match the matched nodes are appended to ``self.nodes_to_remove`` and the
            new LayerNormalization node to ``self.nodes_to_add``; otherwise nothing is modified.
        """
        children = self.model.get_children(reduce_mean_node, input_name_to_nodes)
        if len(children) == 0 or len(children) > 2:
            return

        root_input = reduce_mean_node.input[0]

        # Every consumer of the mean must be a Sub whose first operand is the root input (x - mean).
        if children[0].op_type != "Sub" or children[0].input[0] != root_input:
            return

        if len(children) == 2:
            if children[1].op_type != "Sub" or children[1].input[0] != root_input:
                return

        # Locate the Div through whichever Sub feeds it directly.
        div_node = None
        for child in children:
            div_node = self.find_first_child_by_type(child, "Div", input_name_to_nodes, recursive=False)
            if div_node is not None:
                break
        if div_node is None:
            return

        # Walk back from the Div's second input to the Sub: Sqrt <- Add <- ReduceMean <- Pow [<- Cast] <- Sub.
        path_id, parent_nodes, _ = self.match_parent_paths(
            div_node,
            [
                (["Sqrt", "Add", "ReduceMean", "Pow", "Sub"], [1, 0, 0, 0, 0]),
                (
                    ["Sqrt", "Add", "ReduceMean", "Pow", "Cast", "Sub"],
                    [1, 0, 0, 0, 0, 0],
                ),
            ],
            output_name_to_node,
        )
        if path_id < 0:
            return

        sub_node = parent_nodes[-1]
        if sub_node not in children:
            return

        # The Add in front of Sqrt carries epsilon as its constant input.
        second_add_node = parent_nodes[1]
        _, add_weight = self.get_constant_input(second_add_node)
        if add_weight is None or add_weight <= 0 or add_weight > 1.0e-4:
            # Skip fusion since epsilon value is not expected.
            return

        # The exponent (second input of Pow) must be the constant 2.
        pow_node = parent_nodes[3]
        if self.find_constant_input(pow_node, 2.0) != 1:
            return

        # The Div output may have no consumer at all (e.g. it is a graph output). In that case the
        # pattern cannot match, so bail out instead of raising KeyError on the lookup.
        div_consumers = input_name_to_nodes.get(div_node.output[0])
        if not div_consumers:
            return
        mul_node = div_consumers[0]
        if mul_node.op_type != "Mul":
            return

        # Same guard for the Mul output before looking for the trailing Add.
        mul_consumers = input_name_to_nodes.get(mul_node.output[0])
        if not mul_consumers:
            return
        last_add_node = mul_consumers[0]
        if last_add_node.op_type != "Add":
            return

        # Collect every node the fused op replaces; the Sub at parent_nodes[-1] is already in children.
        subgraph_nodes = [reduce_mean_node]
        subgraph_nodes.extend(children)
        subgraph_nodes.extend(parent_nodes[:-1])

        subgraph_nodes.extend([last_add_node, mul_node, div_node])
        if not self.is_safe_to_fuse_nodes(
            subgraph_nodes,
            last_add_node.output,
            input_name_to_nodes,
            output_name_to_node,
        ):
            return

        # The Mul operand that is not the Div output is the scale; it must be a rank-1 constant.
        weight_input = mul_node.input[1 - self.input_index(div_node.output[0], mul_node)]
        if not self.is_constant_with_specified_rank(weight_input, 1):
            return

        # The Add operand that is not the Mul output is the bias; it must be a rank-1 constant.
        bias_input = last_add_node.input[1 - self.input_index(mul_node.output[0], last_add_node)]
        if not self.is_constant_with_specified_rank(bias_input, 1):
            return

        self.nodes_to_remove.extend(subgraph_nodes)

        # The fused node reuses the final Add's output name, so downstream consumers need no rewiring.
        normalize_node = onnx.helper.make_node(
            "LayerNormalization",
            name=self.create_unique_node_name(),
            inputs=[reduce_mean_node.input[0], weight_input, bias_input],
            outputs=[last_add_node.output[0]],
        )
        normalize_node.attribute.extend([onnx.helper.make_attribute("epsilon", float(add_weight))])
        self.nodes_to_add.append(normalize_node)