Fixes for quantizing a BERT from HuggingFace (#3939)

* Fixes for quantizing a BERT from HuggingFace
* Address CR and some other minor fixes
2026-07-20 19:12:24 +00:00 · 2020-05-15 23:41:29 -07:00 · 2020-05-15 23:41:29 -07:00 · 132ce3a561
commit 132ce3a561
parent 33208c9f6b
2 changed files with 36 additions and 30 deletions
--- a/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py
+++ b/onnxruntime/core/providers/nuphar/scripts/model_quantizer.py
@@ -53,7 +53,7 @@ class QuantizeConfig:
                     ('QuantizationType', 'Signed' if self.sign_bit_ else 'Unsigned'),
                     ('ReservedBit', self.reserved_bits_)])

-def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg):
+def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, quantized_inputs, qcfg_dict, update_qcfg_dict, default_qcfg, onnx_opset_ver):
    assert in_node.op_type == 'MatMul'

    # quantize weight
@@ -158,7 +158,11 @@ def quantize_matmul_2d_with_weight(in_node, in_graph, nf, converted_weights, qua
            Q_Xf = nf.make_node('Mul', [norm_X, np.asarray(x_qcfg.q_range()).astype(np.float32)])
            Q_Xf = nf.make_node('Add', [Q_Xf, np.asarray(0.5).astype(np.float32)])
            Q_Xf = nf.make_node('Floor', Q_Xf)
-            Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()})
+            if onnx_opset_ver < 11:
+                Q_Xf = nf.make_node('Clip', Q_Xf, {'max':x_qcfg.q_max(), 'min':x_qcfg.q_min()})
+            else:
+                # Clip changed min max to inputs in opset 11
+                Q_Xf = nf.make_node('Clip', [Q_Xf, np.asarray(x_qcfg.q_min()).astype(np.float32), np.asarray(x_qcfg.q_max()).astype(np.float32)])
            Q_X = nf.make_node('Cast', Q_Xf, {'to':int({np.uint8  : onnx.TensorProto.UINT8,
                                                        np.int8   : onnx.TensorProto.INT8,
                                                        np.uint16 : onnx.TensorProto.UINT16,
@@ -238,7 +242,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
    out_mp = onnx.ModelProto()
    out_mp.CopyFrom(in_mp)
    out_mp.ir_version = 5 # update ir version to avoid requirement of initializer in graph input
-    ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
+    onnx_opset_ver = ensure_opset(out_mp, 10) # bump up to ONNX opset 10, which is required for MatMulInteger
    ensure_opset(out_mp, 1, 'com.microsoft') # add MS domain for MatMulInteger16
    out_mp.graph.ClearField('node')
    nf = NodeFactory(out_mp.graph)
@@ -249,7 +253,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
            continue

        if in_n.op_type == 'MatMul' and not only_for_scan:
-            if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg):
+            if quantize_matmul_2d_with_weight(in_n, in_mp.graph, nf, converted_weights, quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
                continue

        out_n = out_mp.graph.node.add()
@@ -262,7 +266,7 @@ def convert_matmul_model(input_model, output_model, only_for_scan=False, share_i
            subgraph_quantized_inputs = {} if share_input_quantization else None # remember quantized inputs that might be able to share between MatMuls
            for in_sn in in_subgraph.node:
                if in_sn.op_type == 'MatMul':
-                    if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg):
+                    if quantize_matmul_2d_with_weight(in_sn, in_subgraph, scan_nf, converted_weights, subgraph_quantized_inputs, qcfg_dict, export_qcfg_json, default_qcfg, onnx_opset_ver):
                        continue

                if upgrade_op(scan_nf, in_sn):
--- a/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
+++ b/onnxruntime/core/providers/nuphar/scripts/symbolic_shape_infer.py
@@ -230,12 +230,16 @@ class SymbolicShapeInference:
            if self.auto_merge_:
                assert len(dims) == 2 # only allow symbol->int merge in binary ops for now
                is_int = [is_literal(d) for d in dims]
-                assert sum(is_int) == 1
-                int_dim = is_int.index(1)
-                if self.verbose_ > 0:
-                    print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim]))
-                self._check_merged_dims(dims, allow_broadcast=False)
-                return dims[int_dim]
+                if sum(is_int) == 1:
+                  int_dim = is_int.index(1)
+                  if self.verbose_ > 0:
+                      print('dim {} has been merged with value {}'.format(dims[1 - int_dim], dims[int_dim]))
+                  self._check_merged_dims(dims, allow_broadcast=False)
+                  return dims[int_dim]
+                else:
+                  if self.verbose_ > 0:
+                      print('dim {} has been mergd with dim {}'.format(dims[0], dims[1]))
+                  return dims[0]
            else:
                return None
        if all([d == dims[0] for d in dims]):
@@ -645,12 +649,10 @@ class SymbolicShapeInference:
    def _infer_Expand(self, node):
        expand_to_shape = self._try_get_value(node, 1)
        if expand_to_shape is not None:
-            sympy_shape = self._get_sympy_shape(node, 0)
-            new_sympy_shape = self._broadcast_shapes(sympy_shape, expand_to_shape)
-
-            # new_shape's dim can come from 'Expand' computation
-            self._update_computed_dims(new_sympy_shape)
-            new_shape = get_shape_from_sympy_shape(new_sympy_shape)
+            # new_shape's dim can come from shape value
+            self._update_computed_dims(expand_to_shape)
+            shape = self._get_shape(node, 0)
+            new_shape = self._broadcast_shapes(shape, get_shape_from_sympy_shape(expand_to_shape))
            vi = self.known_vi_[node.output[0]]
            vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, new_shape))

@@ -780,13 +782,13 @@ class SymbolicShapeInference:
            rank = len(sympy_shape)
            if pads is not None:
                assert len(pads) == 2*rank
-                new_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])]
-                self._update_computed_dims(new_shape)
+                new_sympy_shape = [d + pad_up + pad_down for d, pad_up, pad_down in zip(sympy_shape, pads[:rank], pads[rank:])]
+                self._update_computed_dims(new_sympy_shape)
            else:
                # dynamic pads, create new symbolic dimensions
-                new_shape = self._new_symbolic_shape(rank, node)
+                new_sympy_shape = self._new_symbolic_shape(rank, node)
            output_tp = self.known_vi_[node.input[0]].type.tensor_type.elem_type
-            vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_shape)))
+            vi.CopyFrom(helper.make_tensor_value_info(node.output[0], output_tp, get_shape_from_sympy_shape(new_sympy_shape)))

    def _infer_Pool(self, node):
        sympy_shape = self._compute_conv_pool_shape(node)
@@ -804,12 +806,12 @@ class SymbolicShapeInference:
            start = as_scalar(input_data[0])
            limit = as_scalar(input_data[1])
            delta = as_scalar(input_data[2])
-            new_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)]
+            new_sympy_shape = [sympy.Max(sympy.ceiling((limit - start)/delta), 0)]
        else:
            new_dim = self._new_symbolic_dim_from_output(node)
-            new_shape = [self.symbolic_dims_[new_dim]]
-        self._update_computed_dims(new_shape)
-        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_shape)))
+            new_sympy_shape = [self.symbolic_dims_[new_dim]]
+        self._update_computed_dims(new_sympy_shape)
+        vi.CopyFrom(helper.make_tensor_value_info(node.output[0], self.known_vi_[node.input[0]].type.tensor_type.elem_type, get_shape_from_sympy_shape(new_sympy_shape)))

    def _infer_ReduceProd(self, node):
        axes = get_attribute(node, 'axes')
@@ -1042,15 +1044,15 @@ class SymbolicShapeInference:
    def _infer_Tile(self, node):
        repeats_value = self._get_value(node, 1)
        input_sympy_shape = self._get_sympy_shape(node, 0)
-        new_shape = []
+        new_sympy_shape = []
        for i,d in enumerate(input_sympy_shape):
            new_dim = d * repeats_value[i]
-            new_shape.append(new_dim)
-        self._update_computed_dims(new_shape)
+            new_sympy_shape.append(new_dim)
+        self._update_computed_dims(new_sympy_shape)
        vi = self.known_vi_[node.output[0]]
        vi.CopyFrom(helper.make_tensor_value_info(node.output[0],
                                                  vi.type.tensor_type.elem_type,
-                                                  get_shape_from_sympy_shape(new_shape)))
+                                                  get_shape_from_sympy_shape(new_sympy_shape)))

    def _infer_TopK(self, node):
        rank = self._get_shape_rank(node, 0)
@@ -1268,4 +1270,4 @@ if __name__ == '__main__':
        print('output model ' + args.output)
    print('Doing symbolic shape inference...')
    out_mp = SymbolicShapeInference.infer_shapes(args.input, args.output, args.int_max, args.auto_merge, args.guess_output_rank, args.verbose)
-    print('Done!')
+    print('Done!')