onnxruntime/onnxruntime/python/tools/transformers/fusion_attention.py
Ye Wang 0fa00429d5
[T5 optimization] script fusions and fixes (#14967)
### Description
<!-- Describe your changes. -->

1. added script for t5 encoder self attention and t5 decoder self/cross
attention fusions.
2. added simplified layernorm fusion for --external_data_format senario.
(otherwise relying on ORT optimizer)
3. added rel_pos_bias shape inference code, modified attention/mha shape
inference script.
4. reworked graph_topologic_sort() because the currently implementation
is not functioning correctly. also added an option to topo-sort the
graph in a deterministic way to let tests pass.

note:
1. the t5-beamsearch export code is slightly modified. specifically,
encoder_hidden_states(ehs) is no longer an input to the t5 decoder since
the ehs is not actually used in the graph execution.
2. recent PRs do not add optimizations to t5 on cpu. 
3. the fp32 model(encoder and decoder) for t5-small, t5-base and
t5-large can get a parity of e-5 and the corresponding beam search
models generate same results as pytorch.
4. fp16(mixed-precision) models, however, get a parity around 3e-2 and
some has maximum diff a bit over 3e-2. But the beam search models still
generate same results as pytorch (based on limited input data)
5. mt-5 model has a parity issue at the moment, even before any
optimization. will investigate later.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

---------

Co-authored-by: Ubuntu <wy@v100-2.0cdb2e52twzevn1i4fi45bylyg.jx.internal.cloudapp.net>
2023-03-13 23:35:56 -07:00

652 lines
26 KiB
Python

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from logging import getLogger
from typing import List, Optional, Tuple, Union
import numpy as np
from fusion_base import Fusion
from fusion_options import AttentionMaskFormat
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, TensorProto, helper, numpy_helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class AttentionMask:
"""
Fuse Attention subgraph into one Attention node.
"""
def __init__(self, model: OnnxModel):
self.model = model
# A lookup table with mask input as key, and mask index output as value
self.mask_indice = {}
# A lookup table with mask input as key, and cast (to int32) output as value
self.mask_casted = {}
self.utils = FusionUtils(model)
self.mask_format = AttentionMaskFormat.MaskIndexEnd
def set_mask_format(self, mask_format: AttentionMaskFormat):
self.mask_format = mask_format
def set_mask_indice(self, mask, mask_index):
if mask in self.mask_indice:
assert mask_index == self.mask_indice[mask]
self.mask_indice[mask] = mask_index
def get_first_mask(self):
assert len(self.mask_indice) > 0
return next(iter(self.mask_indice))
def process_mask(self, input: str) -> str:
if self.mask_format == AttentionMaskFormat.NoMask:
return None
if input in self.mask_indice:
return self.mask_indice[input]
# Add cast to convert int64 to int32
if self.model.find_graph_input(input):
casted, input_name = self.utils.cast_graph_input_to_int32(input)
else:
input_name, cast_node = self.utils.cast_input_to_int32(input)
casted = True
if casted:
self.mask_casted[input] = input_name
# Attention supports int32 attention mask (2D) since 1.4.0
if self.mask_format == AttentionMaskFormat.AttentionMask:
self.mask_indice[input] = input_name
return input_name
# Add a mask processing node to convert attention mask to mask index (1D)
output_name = self.model.create_node_name("mask_index")
mask_index_node = helper.make_node(
"ReduceSum",
inputs=[input_name],
outputs=[output_name],
name=self.model.create_node_name("ReduceSum", "MaskReduceSum"),
)
mask_index_node.attribute.extend([helper.make_attribute("axes", [1]), helper.make_attribute("keepdims", 0)])
self.model.add_node(mask_index_node)
self.mask_indice[input] = output_name
return output_name
class FusionAttention(Fusion):
"""
Fuse Attention subgraph into one Attention node.
"""
def __init__(
self,
model: OnnxModel,
hidden_size: int,
num_heads: int,
attention_mask: AttentionMask,
use_multi_head_attention: bool = False,
search_op_types: List[str] = ["SkipLayerNormalization", "LayerNormalization"],
):
attention_op_name = "MultiHeadAttention" if use_multi_head_attention else "Attention"
super().__init__(model, attention_op_name, search_op_types)
self.hidden_size = hidden_size
self.num_heads = num_heads
self.attention_mask = attention_mask
self.use_multi_head_attention = use_multi_head_attention
self.mask_filter_value = None
# Flags to show warning only once
self.num_heads_warning = True
self.hidden_size_warning = True
def get_num_heads_and_hidden_size_from_concat(self, concat: NodeProto) -> Tuple[int, int]:
"""
Detect num_heads and hidden_size from Concat node in the following subgraph:
SkipLayerNormalization or EmbedLayerNormalization
/ |
MatMul Shape
| |
Add Gather(indices=0)
| |
| Unsqueeze
| |
| Concat (*, -1, 12, 64)
| /
Reshape
|
Transpose
"""
if len(concat.input) == 4:
num_heads = self.model.get_constant_value(concat.input[2])
head_size = self.model.get_constant_value(concat.input[3])
if (
isinstance(num_heads, np.ndarray)
and num_heads.size == 1
and isinstance(head_size, np.ndarray)
and head_size.size == 1
):
return num_heads[0], num_heads[0] * head_size[0]
return self.num_heads, self.hidden_size
def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
"""Detect num_heads and hidden_size from a reshape node.
Args:
reshape_q (NodeProto): reshape node for Q
Returns:
Tuple[int, int]: num_heads and hidden_size
"""
# we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size]
q_shape = self.model.get_initializer(reshape_q.input[1])
if q_shape is None:
concat = self.model.get_parent(reshape_q, 1)
if concat is not None and concat.op_type == "Concat":
return self.get_num_heads_and_hidden_size_from_concat(concat)
logger.debug(f"{reshape_q.input[1]} is not initializer.")
return self.num_heads, self.hidden_size # Fall back to user specified value
q_shape_value = NumpyHelper.to_array(q_shape)
if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].")
return self.num_heads, self.hidden_size # Fall back to user specified value
num_heads = q_shape_value[2]
head_size = q_shape_value[3]
hidden_size = num_heads * head_size
if self.num_heads > 0 and num_heads != self.num_heads:
if self.num_heads_warning:
logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
self.num_heads_warning = False # Do not show the warning more than once
if self.hidden_size > 0 and hidden_size != self.hidden_size:
if self.hidden_size_warning:
logger.warning(
f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
)
self.hidden_size_warning = False # Do not show the warning more than once
return num_heads, hidden_size
def get_add_qk_str(self, add_qk: NodeProto):
shape_infer = self.model.infer_runtime_shape(update=True)
if shape_infer is None:
return
input_0_shape = shape_infer.get_edge_shape(add_qk.input[0])
input_1_shape = shape_infer.get_edge_shape(add_qk.input[1])
if input_0_shape is None or input_1_shape is None:
logger.debug(f"one of the inputs of {add_qk} is None")
return None
if input_0_shape != input_1_shape:
logger.debug(f"the shape of two inputs of {add_qk} is not same")
return None
return add_qk.input[1]
def create_attention_node(
self,
mask_index: str,
q_matmul: NodeProto,
k_matmul: NodeProto,
v_matmul: NodeProto,
q_add: NodeProto,
k_add: NodeProto,
v_add: NodeProto,
num_heads: int,
hidden_size: int,
input: str,
output: str,
add_qk_str: str,
scale: Optional[float] = None,
) -> Union[NodeProto, None]:
"""Create an Attention node.
Args:
mask_index (str): mask input
q_matmul (NodeProto): MatMul node in fully connection for Q
k_matmul (NodeProto): MatMul node in fully connection for K
v_matmul (NodeProto): MatMul node in fully connection for V
q_add (NodeProto): Add bias node in fully connection for Q
k_add (NodeProto): Add bias node in fully connection for K
v_add (NodeProto): Add bias node in fully connection for V
num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning.
hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning.
input (str): input name
output (str): output name
Returns:
Union[NodeProto, None]: the node created or None if failed.
"""
assert num_heads > 0
if hidden_size > 0 and (hidden_size % num_heads) != 0:
logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}")
return None
has_bias = True
if q_add is None and k_add is None and v_add is None:
has_bias = False
q_weight = self.model.get_initializer(q_matmul.input[1])
k_weight = self.model.get_initializer(k_matmul.input[1])
v_weight = self.model.get_initializer(v_matmul.input[1])
q_bias, k_bias, v_bias = None, None, None
if has_bias:
q_bias = self.model.get_initializer(q_add.input[1]) or self.model.get_initializer(q_add.input[0])
k_bias = self.model.get_initializer(k_add.input[1]) or self.model.get_initializer(k_add.input[0])
v_bias = self.model.get_initializer(v_add.input[1]) or self.model.get_initializer(v_add.input[0])
if not (k_weight and v_weight and q_bias and k_bias):
return None
if q_weight is None:
print(
f"{q_matmul.input[1]} is not an initializer. "
"Please set do_constant_folding=True in torch.onnx.export to unblock attention fusion"
)
return None
qw = NumpyHelper.to_array(q_weight)
kw = NumpyHelper.to_array(k_weight)
vw = NumpyHelper.to_array(v_weight)
# assert q and k have same shape as expected
assert qw.shape == kw.shape
qw_in_size = qw.shape[0]
kw_in_size = kw.shape[0]
vw_in_size = vw.shape[0]
assert qw_in_size == kw_in_size == vw_in_size
if hidden_size > 0 and hidden_size != qw_in_size:
logger.warning(
f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). "
"Please provide a correct input hidden size or pass in 0"
)
is_qkv_diff_dims = False
if qw.shape != vw.shape:
is_qkv_diff_dims = True
# All the matrices can have the same shape or q, k matrics can have the same shape with v being different
# For 2d weights, the shapes would be [in_size, out_size].
# For 3d weights, shape would be [in_size, a, b] where a*b = out_size
qw_out_size = np.prod(qw.shape[1:])
kw_out_size = np.prod(kw.shape[1:])
vw_out_size = np.prod(vw.shape[1:])
qkv_weight_dim = 0
if is_qkv_diff_dims:
qkv_weight = np.concatenate((qw, kw, vw), axis=1)
qkv_weight_dim = qw_out_size + kw_out_size + vw_out_size
else:
qkv_weight = np.stack((qw, kw, vw), axis=1)
qkv_weight_dim = 3 * qw_out_size
if has_bias:
qb = NumpyHelper.to_array(q_bias)
kb = NumpyHelper.to_array(k_bias)
vb = NumpyHelper.to_array(v_bias)
q_bias_shape = np.prod(qb.shape)
k_bias_shape = np.prod(kb.shape)
v_bias_shape = np.prod(vb.shape)
assert q_bias_shape == k_bias_shape == qw_out_size
assert v_bias_shape == vw_out_size
qkv_bias_dim = 0
if is_qkv_diff_dims:
qkv_bias = np.concatenate((qb, kb, vb), axis=0)
qkv_bias_dim = q_bias_shape + k_bias_shape + v_bias_shape
else:
qkv_bias = np.stack((qb, kb, vb), axis=0)
qkv_bias_dim = 3 * q_bias_shape
attention_node_name = self.model.create_node_name("Attention")
if not self.use_multi_head_attention:
weight = helper.make_tensor(
name=attention_node_name + "_qkv_weight",
data_type=TensorProto.FLOAT,
dims=[qw_in_size, qkv_weight_dim],
vals=qkv_weight.flatten().tolist(),
)
# Sometimes weights and bias are stored in fp16
if q_weight.data_type == 10:
weight.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(weight).astype(np.float16), weight.name))
self.model.add_initializer(weight, self.this_graph_name)
bias = None
if has_bias:
bias = helper.make_tensor(
name=attention_node_name + "_qkv_bias",
data_type=TensorProto.FLOAT,
dims=[qkv_bias_dim],
vals=qkv_bias.flatten().tolist(),
)
if q_bias.data_type == 10:
bias.CopyFrom(numpy_helper.from_array(NumpyHelper.to_array(bias).astype(np.float16), bias.name))
self.model.add_initializer(bias, self.this_graph_name)
# For MultiHeadAttention operator, use separated inputs for query, key and value, and no weights.
if self.use_multi_head_attention:
if add_qk_str is not None:
logger.debug("MultiHeadAttention does not support relative_position_bias: cannot fuse the attention.")
return None
attention_inputs = [
q_matmul.output[0],
k_matmul.output[0],
v_matmul.output[0],
attention_node_name + "_qkv_bias",
]
if mask_index is not None:
attention_inputs.append(mask_index)
attention_node = helper.make_node(
"MultiHeadAttention",
inputs=attention_inputs,
outputs=[output],
name=attention_node_name,
)
else:
attention_inputs = [
input,
attention_node_name + "_qkv_weight",
attention_node_name + "_qkv_bias" if has_bias else "",
]
if mask_index is not None:
attention_inputs.append(mask_index)
else:
attention_inputs.append("")
if add_qk_str is not None:
attention_inputs.append("") # no past
attention_inputs.append(add_qk_str)
attention_node = helper.make_node(
"Attention",
inputs=attention_inputs,
outputs=[output],
name=attention_node_name,
)
attention_node.domain = "com.microsoft"
attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
if scale is not None:
attention_node.attribute.extend([helper.make_attribute("scale", scale)])
if is_qkv_diff_dims:
attention_node.attribute.extend(
[helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
)
if self.mask_filter_value is not None:
attention_node.attribute.extend([helper.make_attribute("mask_filter_value", float(self.mask_filter_value))])
return attention_node
def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
# Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm
# Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern
start_node = normalize_node
if normalize_node.op_type == "LayerNormalization":
add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0)
if add_before_layernorm is not None:
start_node = add_before_layernorm
else:
return
# SkipLayerNormalization has two inputs, and one of them is the root input for attention.
qkv_nodes = self.model.match_parent_path(
start_node,
["Add", "MatMul", "Reshape", "Transpose", "MatMul"],
[None, None, 0, 0, 0],
)
einsum_node = None
if qkv_nodes is not None:
(_, _, reshape_qkv, transpose_qkv, matmul_qkv) = qkv_nodes
else:
# Match Albert
qkv_nodes = self.model.match_parent_path(
start_node, ["Add", "Einsum", "Transpose", "MatMul"], [1, None, 0, 0]
)
if qkv_nodes is not None:
(_, einsum_node, transpose_qkv, matmul_qkv) = qkv_nodes
else:
return
other_inputs = []
for i, input in enumerate(start_node.input):
if input not in output_name_to_node:
continue
if input == qkv_nodes[0].output[0]:
continue
other_inputs.append(input)
if len(other_inputs) != 1:
return
root_input = other_inputs[0]
"""
Match flaubert Mask
|
Mul --> LayerNormalization --> Attention --> MatMul --> Add
| |
| |
+---------------------------------------------------------
"""
mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0)
if mul_before_layernorm is not None:
mul_children = input_name_to_nodes[mul_before_layernorm.output[0]]
if mul_children is not None and len(mul_children) == 2:
layernorm_node = mul_children[1]
if layernorm_node.op_type == "LayerNormalization":
root_input = layernorm_node.output[0]
else:
return
elif mul_children is not None and len(mul_children) == 5:
root_input = mul_before_layernorm.output[0]
else:
return
elif normalize_node.op_type == "LayerNormalization":
children = input_name_to_nodes[root_input]
for child in children:
if child.op_type == "LayerNormalization":
root_input = child.output[0]
children = input_name_to_nodes[root_input]
children_types = [child.op_type for child in children]
if children_types.count("MatMul") != 3:
return
v_nodes = self.model.match_parent_path(matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
if v_nodes is None:
logger.debug("fuse_attention: failed to match v path")
return
(_, _, add_v, matmul_v) = v_nodes
is_distill = False
is_distill_add = False
qk_paths = {
"path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]),
"path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]),
"path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]),
"path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]),
}
qk_nodes = None
for k, v in qk_paths.items():
qk_nodes = self.model.match_parent_path(matmul_qkv, v[0], v[1])
if qk_nodes is None:
continue
if k == "path3":
is_distill = True
if k == "path4":
is_distill_add = True
break
if qk_nodes is None:
logger.debug("fuse_attention: failed to match qk path")
return
add_qk = None
matmul_qk = None
where_qk = None
if is_distill:
(_, where_qk, matmul_qk, _) = qk_nodes
elif is_distill_add:
(_, add_qk, where_qk, matmul_qk) = qk_nodes
else:
(_, add_qk, _, matmul_qk) = qk_nodes
q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None])
if q_nodes is None:
q_nodes = self.model.match_parent_path(
matmul_qk,
["Div", "Transpose", "Reshape", "Add", "MatMul"],
[0, 0, 0, 0, None],
)
if q_nodes is None:
logger.debug("fuse_attention: failed to match q path")
return
reshape_q = q_nodes[-3]
add_q = q_nodes[-2]
matmul_q = q_nodes[-1]
k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None])
if k_nodes is None:
k_nodes = self.model.match_parent_path(
matmul_qk,
["Transpose", "Transpose", "Reshape", "Add", "MatMul"],
[1, 0, 0, 0, None],
)
if k_nodes is None:
logger.debug("fuse_attention: failed to match k path")
return
add_k = k_nodes[-2]
matmul_k = k_nodes[-1]
# Note that Cast might be removed by OnnxRuntime so we match two patterns here.
mask_nodes = None
add_qk_str = None
if is_distill:
_, mask_nodes, _ = self.model.match_parent_paths(
where_qk,
[
(["Expand", "Reshape", "Equal"], [0, 0, 0]),
(["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
(["Cast", "Expand", "Reshape", "Equal"], [0, 0, 0, 0]),
],
output_name_to_node,
)
elif is_distill_add:
_, mask_nodes, _ = self.model.match_parent_paths(
where_qk,
[
(["Cast", "Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0, 0]),
(["Equal", "Unsqueeze", "Unsqueeze"], [0, 0, 0]),
],
output_name_to_node,
)
if add_qk is not None:
add_qk_str = self.get_add_qk_str(add_qk)
if add_qk_str is None:
logger.debug(f"fuse_attention: failed to verify shape inference of {add_qk}")
return
else:
_, mask_nodes, _ = self.model.match_parent_paths(
add_qk,
[
(
["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"],
[None, 0, 1, 0, 0],
),
(["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]),
],
output_name_to_node,
)
if mask_nodes is None:
logger.debug("fuse_attention: failed to match mask path")
return
if len(mask_nodes) > 1 and mask_nodes[0].op_type == "Mul":
_, mul_val = self.model.get_constant_input(mask_nodes[0])
if mul_val != -10000:
self.mask_filter_value = mul_val
if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv
q_num_heads, q_hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
# number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads
# the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately
new_node = self.create_attention_node(
mask_index,
matmul_q,
matmul_k,
matmul_v,
add_q,
add_k,
add_v,
q_num_heads,
q_hidden_size,
root_input,
attention_last_node.output[0],
add_qk_str,
)
if new_node is None:
return
self.nodes_to_add.append(new_node)
self.node_name_to_graph_name[new_node.name] = self.this_graph_name
if einsum_node is not None:
unique_index = einsum_node.input[0]
new_edge = "edge_modified_" + unique_index
shape_tensor = helper.make_tensor(
name="shape_modified_tensor" + unique_index,
data_type=TensorProto.INT64,
dims=[4],
vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]).tobytes(),
raw=True,
)
self.model.add_initializer(shape_tensor, self.this_graph_name)
self.model.add_node(
helper.make_node(
"Reshape",
[attention_last_node.output[0], shape_tensor.name],
[new_edge],
"reshape_modified_" + unique_index,
),
self.this_graph_name,
)
einsum_node.input[0] = new_edge
self.nodes_to_remove.extend([attention_last_node, transpose_qkv, matmul_qkv])
self.nodes_to_remove.extend(qk_nodes)
# For MultiHeadAttention operator, MatMul nodes for Q/K/V projection shall not be fused.
self.nodes_to_remove.extend(q_nodes if not self.use_multi_head_attention else q_nodes[:-1])
self.nodes_to_remove.extend(k_nodes if not self.use_multi_head_attention else k_nodes[:-1])
self.nodes_to_remove.extend(v_nodes if not self.use_multi_head_attention else v_nodes[:-1])
# Use prune graph to remove mask nodes since they are shared by all attention nodes.
self.prune_graph = True