Fixes for quantizing a BERT from HuggingFace (#3939)

* Fixes for quantizing a BERT from HuggingFace

* Address CR and some other minor fixes
This commit is contained in:
KeDengMS 2020-05-15 23:41:29 -07:00 committed by GitHub
parent 33208c9f6b
commit 132ce3a561
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 36 additions and 30 deletions

View file

@ -53,7 +53,7 @@ class QuantizeConfig:
('QuantizationType', 'Signed' if self.sign_bit_ else 'Unsigned'),
('ReservedBit', self.reserved_bits_)])
def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg):
def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg, onnx_opset_ver):
assert in_node.op_type == 'MatMul'
# quantize weight
@ -158,7 +158,11 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)])
Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)])
Q_Xf = nf.make_node('Floor', Q_Xf)
Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()})
if onnx_opset_ver < 11:
Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()})
else:
# Starting with opset 11, Clip takes min and max as inputs instead of attributes
Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)])
Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8 : onnx.TensorProto.UINT8,
np.int8 : onnx.TensorProto.INT8,
np.uint16 : onnx.TensorProto.UINT16,
@ -238,7 +242,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
out_mp = onnx.ModelProto()
out_mp.CopyFrom(in_mp)
out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input
ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
ensure_opset(out_mp, 1, 'com.microsoft') # add MS domain for MatMulInteger16
out_mp.graph.ClearField('node')
nf = NodeFactory(out_mp.graph)
@ -249,7 +253,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
continue
if in_n.op_type == 'MatMul' and not only_for_scan:
if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg):
if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
continue
out_n = out_mp.graph.node.add()
@ -262,7 +266,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
subgraph_quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls
for in_sn in in_subgraph.node:
if in_sn.op_type == 'MatMul':
if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg):
if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
continue
if upgrade_op(scan_nf, in_sn):

View file

@ -230,12 +230,16 @@ class SymbolicShapeInference:
if self.auto_merge_:
assert len(dims) == 2 # only allow symbol->int merge in binary ops for now
is_int = [is_literal(d) for d in dims]
assert sum(is_int) == 1
int_dim = is_int.index(1)
if self.verbose_ > 0:
print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim]))
self._check_merged_dims(dims, allow_broadcast=False)
return dims[int_dim]
if sum(is_int) == 1:
int_dim = is_int.index(1)
if self.verbose_ > 0:
print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim]))
self._check_merged_dims(dims, allow_broadcast=False)
return dims[int_dim]
else:
if self.verbose_ > 0:
print('dim {} has been mergd with dim {}'.format(dims[0], dims[1]))
return dims[0]
else:
return None
if all([d == dims[0] for d in dims]):
@ -645,12 +649,10 @@ class SymbolicShapeInference:
def _infer_Expand(self, node):
expand_to_shape = self._try_get_value(node, 1)
if expand_to_shape is not None:
sympy_shape = self._get_sympy_shape(node, 0)
new_sympy_shape = self._broadcast_shapes(sympy_shape, expand_to_shape)
# new_shape's dim can come from 'Expand' computation
self._update_computed_dims(new_sympy_shape)
new_shape = get_shape_from_sympy_shape(new_sympy_shape)
# new_shape's dim can come from shape value
self._update_computed_dims(expand_to_shape)
shape = self._get_shape(node, 0)
new_shape = self._broadcast_shapes(shape, get_shape_from_sympy_shape(expand_to_shape))
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, new_shape))
@ -780,13 +782,13 @@ class SymbolicShapeInference:
rank = len(sympy_shape)
if pads is not None:
assert len(pads) == 2*rank
new_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])]
self._update_computed_dims(new_shape)
new_sympy_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])]
self._update_computed_dims(new_sympy_shape)
else:
# dynamic pads, create new symbolic dimensions
new_shape = self._new_symbolic_shape(rank, node)
new_sympy_shape = self._new_symbolic_shape(rank, node)
output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_shape)))
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape)))
def _infer_Pool(self, node):
sympy_shape = self._compute_conv_pool_shape(node)
@ -804,12 +806,12 @@ class SymbolicShapeInference:
start = as_scalar(input_data[0])
limit = as_scalar(input_data[1])
delta = as_scalar(input_data[2])
new_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)]
new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)]
else:
new_dim = self._new_symbolic_dim_from_output(node)
new_shape = [self.symbolic_dims_[new_dim]]
self._update_computed_dims(new_shape)
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_shape)))
new_sympy_shape = [self.symbolic_dims_[new_dim]]
self._update_computed_dims(new_sympy_shape)
vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_sympy_shape)))
def _infer_ReduceProd(self, node):
axes = get_attribute(node, 'axes')
@ -1042,15 +1044,15 @@ class SymbolicShapeInference:
def _infer_Tile(self, node):
repeats_value = self._get_value(node, 1)
input_sympy_shape = self._get_sympy_shape(node, 0)
new_shape = []
new_sympy_shape = []
for i,d in enumerate(input_sympy_shape):
new_dim = d * repeats_value[i]
new_shape.append(new_dim)
self._update_computed_dims(new_shape)
new_sympy_shape.append(new_dim)
self._update_computed_dims(new_sympy_shape)
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(helper.make_tensor_value_info(node.output[0],
vi.type.tensor_type.elem_type,
get_shape_from_sympy_shape(new_shape)))
get_shape_from_sympy_shape(new_sympy_shape)))
def _infer_TopK(self, node):
rank = self._get_shape_rank(node, 0)
@ -1268,4 +1270,4 @@ if __name__ == '__main__':
print('output model ' + args.output)
print('Doing symbolic shape inference...')
out_mp = SymbolicShapeInference.infer_shapes(args.input, args.output, args.int_max, args.auto_merge, args.guess_output_rank, args.verbose)
print('Done!')
print('Done!')