Introduce QDQ transformer fusion tools for ordered quantized ops (#12661)

Author: Hariharan Seshadri, 2022-09-24 23:22:44 -07:00, committed by GitHub
parent 6587a85f8f
commit 19c51376c4
7 changed files with 978 additions and 2 deletions
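
For context, a minimal usage sketch (not part of the diff; file names, num_heads and hidden_size are placeholders): the new fusions are driven through the existing transformer optimizer entry point, and the new enable_qordered_matmul switch is exposed via FusionOptions. The QOrdered* contrib ops produced here are intended for the CUDA execution provider.

from onnxruntime.transformers import optimizer
from onnxruntime.transformers.fusion_options import FusionOptions

opts = FusionOptions("bert")
opts.enable_qordered_matmul = True  # default; set to False to skip QOrderedMatMul fusion

opt_model = optimizer.optimize_model(
    "bert_qdq.onnx", model_type="bert", num_heads=12, hidden_size=768, optimization_options=opts
)
opt_model.save_model_to_file("bert_qdq_optimized.onnx")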

onnxruntime/python/tools/transformers/fusion_options.py

@@ -24,6 +24,7 @@ class FusionOptions:
self.enable_bias_skip_layer_norm = True
self.enable_bias_gelu = True
self.enable_gelu_approximation = False
self.enable_qordered_matmul = True
self.enable_shape_inference = True

onnxruntime/python/tools/transformers/fusion_qordered_attention.py

@@ -0,0 +1,421 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Tuple
import numpy as np
from fusion_attention import AttentionMask
from fusion_base import Fusion
from fusion_utils import FusionUtils, NumpyHelper
from onnx import NodeProto, helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class FusionQOrderedAttention(Fusion):
def __init__(
self,
model: OnnxModel,
hidden_size: int,
num_heads: int,
attention_mask: AttentionMask,
):
self.hidden_size = hidden_size
self.num_heads = num_heads
self.attention_mask = attention_mask
# Show each mismatch warning only once
self.num_heads_warning = True
self.hidden_size_warning = True
super().__init__(model, "QOrderedAttention", "QOrderedLayerNormalization")
def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int]:
"""Detect num_heads and hidden_size from a reshape node.
Args:
reshape_q (NodeProto): reshape node for Q
Returns:
Tuple[int, int]: num_heads and hidden_size
"""
# We assume reshape fusion has already been done, so the shape input is a tensor like [0, 0, num_heads, head_size]
q_shape = self.model.get_initializer(reshape_q.input[1])
if q_shape is None:
logger.debug(f"{reshape_q.input[1]} is not an initializer.")
# Check if the second input to Reshape flows through a Constant node
# TODO: Investigate why FusionAttention doesn't have such logic
constant_node = self.model.match_parent_path(reshape_q, ["Constant"], [1])
if constant_node is None:
return self.num_heads, self.hidden_size # Fall back to user specified value
else:
constant_node = constant_node[0]
if len(constant_node.attribute) != 1:
return self.num_heads, self.hidden_size # Fall back to user specified value
# This is assuming it is a Tensor attribute (this is a safe assumption)
q_shape = constant_node.attribute[0].t
q_shape_value = NumpyHelper.to_array(q_shape)
if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0):
logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].")
return self.num_heads, self.hidden_size # Fall back to user specified value
num_heads = q_shape_value[2]
head_size = q_shape_value[3]
hidden_size = num_heads * head_size
if self.num_heads > 0 and num_heads != self.num_heads:
if self.num_heads_warning:
logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.")
self.num_heads_warning = False # Do not show the warning more than once
if self.hidden_size > 0 and hidden_size != self.hidden_size:
if self.hidden_size_warning:
logger.warning(
f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value."
)
self.hidden_size_warning = False # Do not show the warning more than once
return num_heads, hidden_size
def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node):
add_before_layernorm = self.model.match_parent_path(
normalize_node,
["QuantizeLinear", "Add"],
[0, 0],
)
if add_before_layernorm is not None:
start_node = add_before_layernorm[-1]
else:
return
# Input QDQ nodes
dequantize_input = self.model.match_parent_path(
start_node,
["DequantizeLinear"],
[None],
)
if dequantize_input is None:
logger.debug("fuse_qordered_attention: failed to match input qdq nodes path")
return
dequantize_input = dequantize_input[-1]
# QKV nodes
qkv_nodes = self.model.match_parent_path(
start_node,
["Add", "MatMul", "Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear", "MatMul"],
[None, None, 0, 0, 0, 0, 0],
)
if qkv_nodes is None:
logger.debug("fuse_qordered_attention: failed to match qkv path")
return
(_, projection_matmul, reshape_qkv, transpose_qkv, dequantize_qkv, quantize_qkv, matmul_qkv) = qkv_nodes
# Make sure the Q/DQ has the proper zero points and constant per-tensor scales
if not FusionUtils.check_qdq_node_for_fusion(quantize_qkv, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(dequantize_qkv, self.model):
return
# Identify the root input to the Attention node
other_inputs = []
for input_name in start_node.input:
if input_name not in output_name_to_node:
continue
if input_name == qkv_nodes[0].output[0]:
continue
other_inputs.append(input_name)
if len(other_inputs) != 1:
return
root_input = other_inputs[0]
# V nodes
v_nodes = self.model.match_parent_path(
matmul_qkv,
["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
[1, 0, 0, 0, 0, None],
)
if v_nodes is None:
logger.debug("fuse_qordered_attention: failed to match v path")
return
(_, _, dequantize_v, quantize_v, add_v, matmul_v) = v_nodes
# Make sure the Q/DQ has the proper zero points and constant per-tensor scales
if not FusionUtils.check_qdq_node_for_fusion(quantize_v, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(dequantize_v, self.model):
return
# V MatMul weight
dequantize_v_matmul_weight = self.model.match_parent_path(matmul_v, ["DequantizeLinear"], [1])
if dequantize_v_matmul_weight is None:
logger.debug("fuse_qordered_attention: failed to match v path")
return
dequantize_v_matmul_weight = dequantize_v_matmul_weight[0]
if self.model.get_constant_value(dequantize_v_matmul_weight.input[0]) is None:
return
# Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
# Per-channel scales are supported for weights alone
if not FusionUtils.check_qdq_node_for_fusion(dequantize_v_matmul_weight, self.model, False):
return
# QK nodes
qk_nodes = self.model.match_parent_path(
matmul_qkv,
[
"DequantizeLinear",
"QuantizeLinear",
"Softmax",
"Add",
"Div",
"DequantizeLinear",
"QuantizeLinear",
"MatMul",
],
[0, 0, 0, 0, None, 0, 0, 0],
)
if qk_nodes is None:
logger.debug("fuse_qordered_attention: failed to match qk path")
return
(
dequantize_qk_softmax,
quantize_qk_softmax,
softmax_qk,
add_qk,
div_qk,
dequantize_qk,
quantize_qk,
matmul_qk,
) = qk_nodes
# Make sure the Q/DQ has the proper zero points and constant per-tensor scales
if not FusionUtils.check_qdq_node_for_fusion(quantize_qk_softmax, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk_softmax, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(quantize_qk, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(dequantize_qk, self.model):
return
# Q nodes
q_nodes = self.model.match_parent_path(
matmul_qk,
["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
[0, 0, 0, 0, 0, None],
)
if q_nodes is None:
logger.debug("fuse_qordered_attention: failed to match q path")
return
(_, reshape_q, dequantize_q, quantize_q, add_q, matmul_q) = q_nodes
# Make sure the Q/DQ has the proper zero points and constant per-tensor scales
if not FusionUtils.check_qdq_node_for_fusion(quantize_q, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(dequantize_q, self.model):
return
# Q MatMul weight
dequantize_q_matmul_weight = self.model.match_parent_path(matmul_q, ["DequantizeLinear"], [1])
if dequantize_q_matmul_weight is None:
logger.debug("fuse_qordered_attention: failed to match q path")
return
dequantize_q_matmul_weight = dequantize_q_matmul_weight[0]
if self.model.get_constant_value(dequantize_q_matmul_weight.input[0]) is None:
return
# Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
# Per-channel scales are supported for weights alone
if not FusionUtils.check_qdq_node_for_fusion(dequantize_q_matmul_weight, self.model, False):
return
# K nodes
k_nodes = self.model.match_parent_path(
matmul_qk,
["Transpose", "Reshape", "DequantizeLinear", "QuantizeLinear", "Add", "MatMul"],
[1, 0, 0, 0, 0, None],
)
if k_nodes is None:
logger.debug("fuse_qordered_attention: failed to match k path")
return
(_, _, dequantize_k, quantize_k, add_k, matmul_k) = k_nodes
# Make sure the Q/DQ has the proper zero points and constant per-tensor scales
if not FusionUtils.check_qdq_node_for_fusion(quantize_k, self.model):
return
if not FusionUtils.check_qdq_node_for_fusion(dequantize_k, self.model):
return
# K MatMul weight
dequantize_k_matmul_weight = self.model.match_parent_path(matmul_k, ["DequantizeLinear"], [1])
if dequantize_k_matmul_weight is None:
logger.debug("fuse_qordered_attention: failed to match k path")
return
dequantize_k_matmul_weight = dequantize_k_matmul_weight[0]
if self.model.get_constant_value(dequantize_k_matmul_weight.input[0]) is None:
return
# Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
# Per-channel scales are supported for weights alone
if not FusionUtils.check_qdq_node_for_fusion(dequantize_k_matmul_weight, self.model, False):
return
# Mask nodes
mask_nodes = self.model.match_parent_path(
add_qk, ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0]
)
if mask_nodes is None:
logger.debug("fuse_qordered_attention: failed to match mask_nodes path")
return
# Ascertain `qkv_hidden_sizes` attribute value
q_weight = self.model.get_initializer(dequantize_q_matmul_weight.input[0])
k_weight = self.model.get_initializer(dequantize_k_matmul_weight.input[0])
v_weight = self.model.get_initializer(dequantize_v_matmul_weight.input[0])
qw = NumpyHelper.to_array(q_weight)
kw = NumpyHelper.to_array(k_weight)
vw = NumpyHelper.to_array(v_weight)
qw_out_size = np.prod(qw.shape[1:])
kw_out_size = np.prod(kw.shape[1:])
vw_out_size = np.prod(vw.shape[1:])
# Form QOrderedAttention node
if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input:
mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0])
# Ascertain `num_heads` and `hidden_size`
num_heads, hidden_size = self.get_num_heads_and_hidden_size(reshape_q)
# Formulate the inputs
# Actual quantized input
attention_inputs = [dequantize_input.input[0]]
attention_inputs.append(dequantize_input.input[1])
attention_inputs.append(dequantize_q.input[1])
attention_inputs.append(dequantize_k.input[1])
attention_inputs.append(dequantize_v.input[1])
attention_inputs.append(dequantize_q_matmul_weight.input[0])
attention_inputs.append(dequantize_k_matmul_weight.input[0])
attention_inputs.append(dequantize_v_matmul_weight.input[0])
attention_inputs.append(dequantize_q_matmul_weight.input[1])
attention_inputs.append(dequantize_k_matmul_weight.input[1])
attention_inputs.append(dequantize_v_matmul_weight.input[1])
if self.model.get_initializer(add_q.input[0]):
attention_inputs.append(add_q.input[0])
else: # second input is the constant bias
attention_inputs.append(add_q.input[1])
if self.model.get_initializer(add_k.input[0]):
attention_inputs.append(add_k.input[0])
else: # second input is the constant bias
attention_inputs.append(add_k.input[1])
if self.model.get_initializer(add_v.input[0]):
attention_inputs.append(add_v.input[0])
else: # second input is the constant bias
attention_inputs.append(add_v.input[1])
attention_inputs.append(quantize_qk.input[1])
attention_inputs.append(quantize_qk_softmax.input[1])
attention_inputs.append(dequantize_qkv.input[1])
# Mask input
if mask_index is not None:
attention_inputs.append(mask_index)
else:
attention_inputs.append("")
# The MatMul weight 'B' and 'bias' need some post-processing
# Transpose weight 'B' from order ROW to order COL
# This offline transpose is needed only while using the CUDA EP
# TODO: Make this fusion logic EP-agnostic ?
q_weight_tensor = self.model.get_initializer(dequantize_q_matmul_weight.input[0])
FusionUtils.transpose_2d_int8_tensor(q_weight_tensor)
k_weight_tensor = self.model.get_initializer(dequantize_k_matmul_weight.input[0])
FusionUtils.transpose_2d_int8_tensor(k_weight_tensor)
v_weight_tensor = self.model.get_initializer(dequantize_v_matmul_weight.input[0])
FusionUtils.transpose_2d_int8_tensor(v_weight_tensor)
# Name and create Attention node
attention_node_name = self.model.create_node_name("QOrderedAttention")
attention_node = helper.make_node(
"QOrderedAttention",
inputs=attention_inputs,
outputs=[reshape_qkv.output[0]],
name=attention_node_name,
)
self.model.replace_node_input(dequantize_qkv, dequantize_qkv.input[0], attention_node.output[0])
self.model.replace_node_input(projection_matmul, projection_matmul.input[0], dequantize_qkv.output[0])
attention_node.attribute.extend([helper.make_attribute("num_heads", num_heads)])
attention_node.attribute.extend([helper.make_attribute("order_input", 1)])
attention_node.attribute.extend([helper.make_attribute("order_weight", 0)])
attention_node.attribute.extend([helper.make_attribute("order_output", 1)])
attention_node.attribute.extend(
[helper.make_attribute("qkv_hidden_sizes", [qw_out_size, kw_out_size, vw_out_size])]
)
attention_node.domain = "com.microsoft"
self.nodes_to_add.append(attention_node)
self.node_name_to_graph_name[attention_node.name] = self.this_graph_name
self.nodes_to_remove.extend([reshape_qkv, transpose_qkv, quantize_qkv, matmul_qkv])
self.nodes_to_remove.extend(qk_nodes)
self.nodes_to_remove.extend(q_nodes)
self.nodes_to_remove.extend(k_nodes)
self.nodes_to_remove.extend(v_nodes)
self.nodes_to_remove.extend(
[dequantize_q_matmul_weight, dequantize_k_matmul_weight, dequantize_v_matmul_weight]
)
# Use prune graph to remove mask nodes since they are shared by all attention nodes.
# self.nodes_to_remove.extend(mask_nodes)
self.prune_graph = True
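
For reference, a small sketch (illustrative names and sizes, not part of the diff) of the Reshape `shape` initializer that get_num_heads_and_hidden_size reads, here for 12 heads of size 64:

import numpy as np
from onnx import numpy_helper

# [0, 0, num_heads, head_size] -> num_heads = 12, head_size = 64, hidden_size = 12 * 64 = 768
reshape_q_shape = numpy_helper.from_array(np.array([0, 0, 12, 64], dtype=np.int64), name="reshape_q_shape")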

onnxruntime/python/tools/transformers/fusion_qordered_gelu.py

@@ -0,0 +1,117 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict
from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class FusionQOrderedGelu(Fusion):
def __init__(self, model: OnnxModel):
super().__init__(model, "QOrderedGelu", ["Gelu", "FastGelu"])
def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
"""
Fuse the quantized Gelu subgraph into one QOrderedGelu node.
INPUT PATTERN
-> quantized input -> DQ -> Gelu -> Q ->
(or)
-> quantized input -> DQ -> FastGelu -> Q ->
OUTPUT PATTERN
-> QOrderedGelu ->
"""
gelu_children = self.model.get_children(node, input_name_to_nodes)
# Should only have 1 child - QuantizeLinear (or)
# Should have 2 children - QuantizeLinear + Shape
if not (
(len(gelu_children) == 1 and gelu_children[0].op_type == "QuantizeLinear")
or (
len(gelu_children) == 2
and gelu_children[0].op_type == "QuantizeLinear"
and gelu_children[1].op_type == "Shape"
)
):
return
downstream_quantize_node = gelu_children[0]
downstream_shape_node = None
if len(gelu_children) == 2:
downstream_shape_node = gelu_children[1]
if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model):
return
# The first input to Gelu should flow through a DequantizeLinear node
first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
node,
[(["DequantizeLinear"], [0])],
output_name_to_node,
)
if first_path_id < 0:
return
upstream_dequantize_node = first_input_parent_nodes[0]
if not FusionUtils.check_qdq_node_for_fusion(upstream_dequantize_node, self.model):
return
# Fusion logic
subgraph_nodes = [node] # Gelu/FastGelu
subgraph_nodes.extend([downstream_quantize_node, upstream_dequantize_node]) # Relevant Q, DQ nodes
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
[node.output[0], downstream_quantize_node.output[0]]
if downstream_shape_node is not None
else downstream_quantize_node.output,
input_name_to_nodes,
output_name_to_node,
):
logger.debug(f"It is not safe to fuse QOrderedGelu node. Skip")
return
self.nodes_to_remove.extend(subgraph_nodes)
ordered_gelu_node = helper.make_node(
"QOrderedGelu",
inputs=[
upstream_dequantize_node.input[0],
upstream_dequantize_node.input[1],
downstream_quantize_node.input[1],
],
outputs=[downstream_quantize_node.output[0]],
name=self.model.create_node_name("QOrderedGelu", name_prefix="QOrderedGelu"),
)
# Arrange the downstream Shape's input to be fed from the
# downstream QuantizeLinear node, so that fusion will
# be deemed safe
if downstream_shape_node is not None:
self.model.replace_node_input(
downstream_shape_node, downstream_shape_node.input[0], downstream_quantize_node.output[0]
)
# TODO: We only support the cuBLASLt ORDER_ROW format for now.
# Once other data ordering formats are supported, we will let the
# user configure the data ordering for the op.
ordered_gelu_node.attribute.extend([helper.make_attribute("order_X", 1)])
ordered_gelu_node.attribute.extend([helper.make_attribute("order_Y", 1)])
ordered_gelu_node.domain = "com.microsoft"
self.nodes_to_add.append(ordered_gelu_node)
self.node_name_to_graph_name[ordered_gelu_node.name] = self.this_graph_name
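
A minimal sketch (placeholder tensor and node names, not part of the diff) of the input pattern this fusion matches; `Gelu` here is the contrib op produced by the earlier Gelu fusion:

from onnx import helper

# Two-input Q/DQ nodes have an implicit zero point of 0, so they pass check_qdq_node_for_fusion
dq = helper.make_node("DequantizeLinear", ["x_q", "x_scale"], ["x_fp"], name="dq")
gelu = helper.make_node("Gelu", ["x_fp"], ["gelu_out"], name="gelu", domain="com.microsoft")
q = helper.make_node("QuantizeLinear", ["gelu_out", "y_scale"], ["y_q"], name="q")
# After fusion, these three nodes collapse into a single
# QOrderedGelu(x_q, x_scale, y_scale) -> y_q with order_X = order_Y = 1 (ORDER_ROW).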

onnxruntime/python/tools/transformers/fusion_qordered_layernorm.py

@@ -0,0 +1,121 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict
from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class FusionQOrderedLayerNormalization(Fusion):
def __init__(self, model: OnnxModel):
super().__init__(model, "QOrderedLayerNormalization", "LayerNormalization")
def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
"""
Fuse the quantized LayerNormalization subgraph into one QOrderedLayerNormalization node:
quantized input -> DQ -> LayerNormalization (with weight/bias inputs) -> Q ->
should become
quantized input (+ weight/bias inputs) -> QOrderedLayerNormalization ->
"""
children = self.model.get_children(node, input_name_to_nodes)
# Should only have 1 child - QuantizeLinear (or)
# Should have 2 children - QuantizeLinear + Shape
if not (
(len(children) == 1 and children[0].op_type == "QuantizeLinear")
or (len(children) == 2 and children[0].op_type == "QuantizeLinear" and children[1].op_type == "Shape")
):
return
downstream_quantize_node = children[0]
downstream_shape_node = None
if len(children) == 2:
downstream_shape_node = children[1]
if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model):
return
# The first input to LayerNormalization should flow through a DequantizeLinear node
first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
node,
[(["DequantizeLinear"], [0])],
output_name_to_node,
)
if first_path_id < 0:
return
upstream_dequantize_node = first_input_parent_nodes[0]
if not FusionUtils.check_qdq_node_for_fusion(upstream_dequantize_node, self.model):
return
# Fusion logic
subgraph_nodes = [node] # LayerNormalization
subgraph_nodes.extend([downstream_quantize_node]) # Q node after LayerNormalization
upstream_dequantize_node_children = self.model.get_children(upstream_dequantize_node, input_name_to_nodes)
# In GPT-2, this DQ node also feeds a downstream residual Add, in which
# case we do not want to remove it
if len(upstream_dequantize_node_children) == 1:
subgraph_nodes.extend([upstream_dequantize_node]) # DQ node before LayerNormalization
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes,
[node.output[0], downstream_quantize_node.output[0]]
if downstream_shape_node is not None
else downstream_quantize_node.output,
input_name_to_nodes,
output_name_to_node,
):
logger.debug(f"It is not safe to fuse QOrderedLayerNormalization node. Skip")
return
self.nodes_to_remove.extend(subgraph_nodes)
normalize_node = helper.make_node(
"QOrderedLayerNormalization",
inputs=[
upstream_dequantize_node.input[0],
upstream_dequantize_node.input[1],
node.input[1],
node.input[2],
downstream_quantize_node.input[1],
],
outputs=[downstream_quantize_node.output[0]],
name=self.model.create_node_name("QOrderedLayerNormalization", name_prefix="QOrderedLayerNormalization"),
)
# Arrange the downstream Shape's input to be fed from the
# downstream QuantizeLinear node, so that fusion will
# be deemed safe
if downstream_shape_node is not None:
self.model.replace_node_input(
downstream_shape_node, downstream_shape_node.input[0], downstream_quantize_node.output[0]
)
# TODO: We only support the cuBLASLt ORDER_ROW format for now.
# Once other data ordering formats are supported, we will let the
# user configure the data ordering for the op.
normalize_node.attribute.extend([helper.make_attribute("order_X", 1)])
normalize_node.attribute.extend([helper.make_attribute("order_Y", 1)])
normalize_node.domain = "com.microsoft"
self.nodes_to_add.append(normalize_node)
self.node_name_to_graph_name[normalize_node.name] = self.this_graph_name
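
Analogously, a minimal sketch (placeholder names, not part of the diff) of the LayerNormalization pattern matched here:

from onnx import helper

dq = helper.make_node("DequantizeLinear", ["x_q", "x_scale"], ["x_fp"], name="dq")
ln = helper.make_node("LayerNormalization", ["x_fp", "gamma", "beta"], ["ln_out"], name="ln")
q = helper.make_node("QuantizeLinear", ["ln_out", "y_scale"], ["y_q"], name="q")
# After fusion: QOrderedLayerNormalization(x_q, x_scale, gamma, beta, y_scale) -> y_q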

onnxruntime/python/tools/transformers/fusion_qordered_matmul.py

@@ -0,0 +1,217 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from logging import getLogger
from typing import Dict
from fusion_base import Fusion
from fusion_utils import FusionUtils
from onnx import helper
from onnx_model import OnnxModel
logger = getLogger(__name__)
class FusionQOrderedMatMul(Fusion):
def __init__(self, model: OnnxModel):
super().__init__(model, "QOrderedMatMul", "MatMul")
def fuse(self, node, input_name_to_nodes: Dict, output_name_to_node: Dict):
matmul_children = self.model.get_children(node, input_name_to_nodes)
# Should only have 1 child - Bias Add
if len(matmul_children) != 1 or matmul_children[0].op_type != "Add":
return
bias_add_node = matmul_children[0]
# At least one of the inputs to the Bias Add node must be a constant
bias_add_node_index = 0
if (
self.model.get_constant_value(bias_add_node.input[0]) is None
and self.model.get_constant_value(bias_add_node.input[1]) is None
):
return
if self.model.get_constant_value(bias_add_node.input[0]) is None:
bias_add_node_index = 1
bias_add_children = self.model.get_children(bias_add_node, input_name_to_nodes)
if len(bias_add_children) != 1:
return
bias_add_child = bias_add_children[0]
# Bias Add can have another Add downstream (Residual Add layer)
residual_add_node = None
downstream_quantize_node = None
if bias_add_child.op_type == "Add":
residual_add_node = bias_add_child
residual_add_children = self.model.get_children(residual_add_node, input_name_to_nodes)
if len(residual_add_children) != 1 or residual_add_children[0].op_type != "QuantizeLinear":
return
downstream_quantize_node = residual_add_children[0]
elif bias_add_child.op_type == "QuantizeLinear":
downstream_quantize_node = bias_add_child
else:
return
# Make sure the downstream QuantizeLinear has the proper zero points and scales
if not FusionUtils.check_qdq_node_for_fusion(downstream_quantize_node, self.model):
return
# The first input to MatMul should flow through a DequantizeLinear node
first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
node,
[(["DequantizeLinear"], [0])],
output_name_to_node,
)
# If Attention is not fused, this is the pattern to look for
# leading up to the MatMul
reshape_node_0 = None
transpose_node_0 = None
if first_path_id < 0:
first_path_id, first_input_parent_nodes, _ = self.model.match_parent_paths(
node,
[(["Reshape", "Transpose", "DequantizeLinear", "QuantizeLinear"], [0, 0, 0, 0])],
output_name_to_node,
)
if first_path_id < 0:
return
reshape_node_0 = first_input_parent_nodes[0]
transpose_node_0 = first_input_parent_nodes[1]
dequantize_node_0 = first_input_parent_nodes[2]
else:
dequantize_node_0 = first_input_parent_nodes[0]
# Make sure the upstream DequantizeLinear-0 has the proper zero points and scales
if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_0, self.model):
return
# The second input to MatMul should flow through a DequantizeLinear node
dequantize_node_1 = None
is_weight_transpose_required = True
weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
node,
[(["DequantizeLinear", "QuantizeLinear", "Transpose", "DequantizeLinear"], [1, 0, 0, 0])],
output_name_to_node,
)
if weight_path_id < 0:
weight_path_id, weight_nodes, _ = self.model.match_parent_paths(
node,
[(["DequantizeLinear"], [1])],
output_name_to_node,
)
if weight_path_id < 0:
return
dequantize_node_1 = weight_nodes[0]
else:
is_weight_transpose_required = False
dequantize_node_1 = weight_nodes[3]
# Check if weight 'B' is a constant
if self.model.get_constant_value(dequantize_node_1.input[0]) is None:
return
# Make sure the upstream DequantizeLinear-1 has the proper zero points and scales
# Per-channel scales are supported for weights alone
if not FusionUtils.check_qdq_node_for_fusion(dequantize_node_1, self.model, False):
return
# Make sure the upstream flow into the Residual Add node flows through a DQ node
residual_add_dequantize_node = None
if residual_add_node is not None:
residual_path_id, residual_input_parent_nodes, _ = self.model.match_parent_paths(
residual_add_node,
[
(["DequantizeLinear"], [1]),
],
output_name_to_node,
)
if residual_path_id < 0:
return
residual_add_dequantize_node = residual_input_parent_nodes[0]
# Make sure the upstream DequantizeLinear to the Residual Add has the proper zero points and scales
if residual_add_dequantize_node is not None and not FusionUtils.check_qdq_node_for_fusion(
residual_add_dequantize_node, self.model
):
return
# Subgraph nodes to be fused
subgraph_nodes = [node, bias_add_node] # MatMul + Bias Add
if residual_add_node is not None:
subgraph_nodes.extend([residual_add_node]) # Residual Add
subgraph_nodes.extend(weight_nodes)
subgraph_nodes.extend([downstream_quantize_node]) # Downstream Q node
if not self.model.is_safe_to_fuse_nodes(
subgraph_nodes, downstream_quantize_node.output, input_name_to_nodes, output_name_to_node
):
logger.debug(f"It is not safe to fuse QOrderedMatMul node. Skip")
return
# Deal with the case where-in the Attention subgraph is not fused
if transpose_node_0 is not None:
self.model.replace_node_input(transpose_node_0, transpose_node_0.input[0], dequantize_node_0.input[0])
# Make inputs
fused_node_inputs = [
reshape_node_0.output[0] if reshape_node_0 is not None else dequantize_node_0.input[0],
dequantize_node_0.input[1],
dequantize_node_1.input[0],
dequantize_node_1.input[1],
downstream_quantize_node.input[1],
bias_add_node.input[bias_add_node_index],
]
if residual_add_node is not None:
fused_node_inputs.append(residual_add_dequantize_node.input[0])
fused_node_inputs.append(residual_add_dequantize_node.input[1])
# The MatMul weight 'B' and 'bias' need some post-processing
# Transpose weight 'B' from order ROW to order COL
# This offline transpose is needed only while using the CUDA EP
# TODO: Make this fusion logic EP-agnostic ?
if is_weight_transpose_required:
weight_tensor = self.model.get_initializer(dequantize_node_1.input[0])
FusionUtils.transpose_2d_int8_tensor(weight_tensor)
fused_node = helper.make_node(
"QOrderedMatMul",
inputs=fused_node_inputs,
outputs=[downstream_quantize_node.output[0]],
name=self.model.create_node_name("QOrderedMatMul", name_prefix="QOrderedMatMul"),
)
fused_node.attribute.extend([helper.make_attribute("order_A", 1)])
fused_node.attribute.extend([helper.make_attribute("order_B", 0)])
fused_node.attribute.extend([helper.make_attribute("order_Y", 1)])
fused_node.domain = "com.microsoft"
self.nodes_to_remove.extend(subgraph_nodes)
self.nodes_to_add.append(fused_node)
self.node_name_to_graph_name[fused_node.name] = self.this_graph_name
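
A minimal sketch (placeholder names, no residual Add, not part of the diff) of the simplest MatMul pattern matched here:

from onnx import helper

dq_a = helper.make_node("DequantizeLinear", ["a_q", "a_scale"], ["a_fp"], name="dq_a")
dq_b = helper.make_node("DequantizeLinear", ["b_q", "b_scale"], ["b_fp"], name="dq_b")  # constant INT8 weight
matmul = helper.make_node("MatMul", ["a_fp", "b_fp"], ["mm_out"], name="matmul")
bias_add = helper.make_node("Add", ["mm_out", "bias"], ["add_out"], name="bias_add")
q = helper.make_node("QuantizeLinear", ["add_out", "y_scale"], ["y_q"], name="q")
# After fusion: QOrderedMatMul(a_q, a_scale, b_q, b_scale, y_scale, bias) -> y_q with
# order_A = 1 (ROW), order_B = 0 (COL, after the offline weight transpose) and order_Y = 1.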

onnxruntime/python/tools/transformers/fusion_utils.py

@@ -5,8 +5,10 @@
from logging import getLogger
from typing import Tuple
import numpy
from numpy import array_equal, ndarray
from onnx import NodeProto, TensorProto, helper, numpy_helper
from onnx import onnx_pb as onnx_proto
from onnx_model import OnnxModel
logger = getLogger(__name__)
@@ -83,6 +85,73 @@ class FusionUtils:
else:
return value == expected_value
@staticmethod
def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto):
"""Transpose a 2-D INT8 TensorProto
Args:
tensor (TensorProto): tensor to be transposed
Returns:
tensor (TensorProto): transposed tensor
"""
if not isinstance(tensor, onnx_proto.TensorProto):
raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor))
if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8:
raise ValueError("Only INT8 2-D tensors can be transposed")
if tensor.raw_data:
int8_data = numpy.reshape(numpy.frombuffer(tensor.raw_data, dtype="int8"), tensor.dims)
int8_transposed_data = numpy.transpose(int8_data, [1, 0])
tensor.raw_data = int8_transposed_data.tobytes()
else:
raise ValueError("Only tensors with raw data are supported")
return tensor
@staticmethod
def check_qdq_node_for_fusion(node: NodeProto, model: OnnxModel, allow_per_tensor_quantization_only=True):
"""Verify if a provided QuantizeLinear (Q) / DequantizeLinear (DQ) node is a good candidate for fusion.
It is a good candidate for fusion if:
(1) The Q/DQ node is for per-tensor quantization if allow_per_tensor_quantization_only is `True`
(2) The Q/DQ node should have constant scale
(3) The Q/DQ node should have a zero point of 0
Args:
node (NodeProto): a Q/DQ node to check
model (OnnxModel): the model that contains the node
allow_per_tensor_quantization_only (bool): if True, only per-tensor (not per-channel) quantization is accepted
Returns:
bool: whether the check passed
"""
if node.op_type not in {"QuantizeLinear", "DequantizeLinear"}:
logger.debug(f"Provided node is not a Q/DQ node. Op Type: {node.op_type}")
return False
scale = model.get_constant_value(node.input[1])
# Scale is not constant
if scale is None:
return False
# Not per-tensor quantization
scale_has_single_element = scale.ndim == 0 or (scale.ndim == 1 and scale.shape[0] == 1)
if allow_per_tensor_quantization_only and not scale_has_single_element:
return False
# If the Q/DQ node has no zero point input, it is assumed to be 0 (per ONNX spec)
if len(node.input) == 2:
return True
# Zero point should be constant and should have a value of 0
zero_point = model.get_constant_value(node.input[2])
# Zero point is not constant
if zero_point is None:
return False
# Zero point and scale should have the same number of dims
if scale.ndim != zero_point.ndim:
return False
# Zero point should be 0
return numpy.all(zero_point == 0)
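# Example (illustrative, not part of the diff): a per-tensor QuantizeLinear that passes this
# check has a constant scalar scale and a zero point of 0, e.g.:
#   scale = numpy_helper.from_array(numpy.array(0.02, dtype=numpy.float32), "x_scale")
#   zero_point = numpy_helper.from_array(numpy.array(0, dtype=numpy.int8), "x_zp")
#   q_node = helper.make_node("QuantizeLinear", ["x", "x_scale", "x_zp"], ["x_q"])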
def check_node_input_value(self, node, input_index: int, expected_value):
"""Verify that a node has expected input value

onnxruntime/python/tools/transformers/onnx_model_bert.py

@@ -14,6 +14,10 @@ from fusion_gelu import FusionGelu
from fusion_gelu_approximation import FusionGeluApproximation
from fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
from fusion_options import FusionOptions
from fusion_qordered_attention import FusionQOrderedAttention
from fusion_qordered_gelu import FusionQOrderedGelu
from fusion_qordered_layernorm import FusionQOrderedLayerNormalization
from fusion_qordered_matmul import FusionQOrderedMatMul
from fusion_reshape import FusionReshape
from fusion_shape import FusionShape
from fusion_skiplayernorm import FusionBiasSkipLayerNormalization, FusionSkipLayerNormalization
@@ -49,16 +53,24 @@ class BertOnnxModel(OnnxModel):
self.attention_mask = AttentionMask(self)
self.attention_fusion = FusionAttention(self, self.hidden_size, self.num_heads, self.attention_mask)
self.qordered_attention_fusion = FusionQOrderedAttention(
self, self.hidden_size, self.num_heads, self.attention_mask
)
self.utils = FusionUtils(self)
def fuse_attention(self):
self.attention_fusion.apply()
# Only relevant in models with Q-DQ nodes
self.qordered_attention_fusion.apply()
def fuse_gelu(self):
fusion = FusionGelu(self)
fusion.apply()
fusion = FusionFastGelu(self)
fusion.apply()
# Only relevant in models with Q-DQ nodes
fusion = FusionQOrderedGelu(self)
fusion.apply()
def fuse_bias_gelu(self, is_fastgelu):
fusion = FusionBiasGelu(self, is_fastgelu)
@@ -91,10 +103,19 @@ class BertOnnxModel(OnnxModel):
fusion = FusionLayerNormalizationTF(self)
fusion.apply()
# Only relevant in models with Q-DQ nodes
fusion = FusionQOrderedLayerNormalization(self)
fusion.apply()
def fuse_skip_layer_norm(self):
fusion = FusionSkipLayerNormalization(self)
fusion.apply()
# Only relevant in models with Q-DQ nodes
def fuse_qordered_matmul(self):
fusion = FusionQOrderedMatMul(self)
fusion.apply()
def get_graph_inputs_from_node_type(self, op_type: str, input_indices: List[int], casted: bool):
"""
Get graph inputs that feed into node type (like EmbedLayerNormalization or Attention).
@@ -364,6 +385,11 @@ class BertOnnxModel(OnnxModel):
self.attention_mask.set_mask_format(options.attention_mask_format)
self.fuse_attention()
# Perform the MatMul fusion after the Attention fusion as we do not
# want to fuse the MatMuls inside the Attention subgraphs
if (options is None) or options.enable_qordered_matmul:
self.fuse_qordered_matmul()
self.fuse_shape()
if (options is None) or options.enable_embed_layer_norm:
@@ -403,11 +429,15 @@
ops = [
"EmbedLayerNormalization",
"Attention",
"QOrderedAttention",
"Gelu",
"QOrderedGelu",
"FastGelu",
"BiasGelu",
"LayerNormalization",
"QOrderedLayerNormalization",
"SkipLayerNormalization",
"QOrderedMatMul",
]
for op in ops:
nodes = self.get_nodes_by_op_type(op)
@@ -421,7 +451,7 @@
"""
op_count = self.get_fused_operator_statistics()
embed = op_count["EmbedLayerNormalization"]
attention = op_count["Attention"]
attention = op_count["Attention"] + op_count["QOrderedAttention"]
gelu = op_count["Gelu"] + op_count["BiasGelu"] + op_count["FastGelu"]
layer_norm = op_count["LayerNormalization"] + op_count["SkipLayerNormalization"]
is_perfect = (embed > 0) and (attention > 0) and (attention == gelu) and (layer_norm >= 2 * attention)