[python] MinMax calibration per channel (#19285)

### Description Following the issue #19223, introduce `per_channel` attribute in `MinMaxCalibrater` to develop per-channel calibration. If required, this new functionality should be implemented in the other _Calibraters_ (`HistogramCalibrater`, `EntropyCalibrater`, ...). ### Motivation and Context - This is the first part to solve #19223's proposal. - If per channel calibration was allowed, the quantization algorithm could be updated to improve quantization performance, i.e. weights quantization per channel and not per tensor. That is why it would be interesting to have a 'per_channel' option in any 'Calibrater' class to produce a set of calibration vectors instead of a single scalar.
2026-07-12 17:57:38 +00:00 · 2024-04-26 21:40:49 +02:00 · 2024-04-26 21:40:49 +02:00 · 619ceeed9e
commit 619ceeed9e
parent ddd4e8c3e3
2 changed files with 71 additions and 7 deletions
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@ -164,13 +164,15 @@ class CalibraterBase:
        augmented_model_path="augmented_model.onnx",
        symmetric=False,
        use_external_data_format=False,
+        per_channel=False,
    ):
        """
        :param model_path: ONNX model to calibrate. It should be a model file path
        :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
        :param augmented_model_path: save augmented model to this path.
        :param symmetric: make range of tensor symmetric (central point is 0).
-        :param use_external_data_format: use external data format to store model which size is >= 2Gb
+        :param use_external_data_format: use external data format to store model which size is >= 2Gb.
+        :param per_channel: whether to compute ranges per each channel.
        """
        if isinstance(model_path, str):
            self.model = load_model_with_shape_infer(Path(model_path))
@ -183,6 +185,7 @@ class CalibraterBase:
        self.augmented_model_path = augmented_model_path
        self.symmetric = symmetric
        self.use_external_data_format = use_external_data_format
+        self.per_channel = per_channel

        self.augment_model = None
        self.infer_session = None
@ -274,6 +277,7 @@ class MinMaxCalibrater(CalibraterBase):
        moving_average=False,
        averaging_constant=0.01,
        max_intermediate_outputs=None,
+        per_channel=False,
    ):
        """
        :param model_path: ONNX model to calibrate. It is a model path
@ -284,6 +288,7 @@ class MinMaxCalibrater(CalibraterBase):
        :param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
        :param averaging_constant: constant smoothing factor to use when computing the moving average.
        :param max_intermediate_outputs: maximum number of intermediate outputs before an intermediate range is computed.
+        :param per_channel: whether to compute ranges per each channel.
        """
        super().__init__(
            model_path,
@ -291,6 +296,7 @@ class MinMaxCalibrater(CalibraterBase):
            augmented_model_path=augmented_model_path,
            symmetric=symmetric,
            use_external_data_format=use_external_data_format,
+            per_channel=per_channel,
        )
        self.intermediate_outputs = []
        self.calibrate_tensors_range = None
@ -310,9 +316,15 @@ class MinMaxCalibrater(CalibraterBase):
        """
        tensors, _ = self.select_tensors_to_calibrate(self.model)
        reshape_shape_name = str(uuid.uuid4())
-        reshape_shape = numpy_helper.from_array(np.array([1], dtype=np.int64), reshape_shape_name)
+        reshape_shape = numpy_helper.from_array(np.array([-1], dtype=np.int64), reshape_shape_name)
        self.model.graph.initializer.append(reshape_shape)

+        def get_op_version(op_type, model):
+            for opset_import in model.opset_import:
+                if onnx.defs.has(op_type, opset_import.domain):
+                    return opset_import.version
+            raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
+
        def add_reduce_min_max(tensor_name, reduce_op_name):
            # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
            # To make the code simple, we always let keepdims to be 1.
@ -332,7 +344,6 @@ class MinMaxCalibrater(CalibraterBase):
                name=intermediate_output,
            )

-            self.model.graph.node.extend([reduce_node, reshape_node])
            value_infos = {vi.name: vi for vi in self.model.graph.value_info}
            value_infos.update({o.name: o for o in self.model.graph.output})
            value_infos.update({i.name: i for i in self.model.graph.input})
@ -343,7 +354,22 @@ class MinMaxCalibrater(CalibraterBase):
                    f"Unable to guess tensor type for tensor {tensor_name!r}, "
                    f"running shape inference before quantization may resolve this issue."
                )
-            self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [1]))
+
+            # Include axes in reduce_op when per_channel, always keeping axis=1
+            if self.per_channel:
+                tensor_rank = len(value_infos[tensor_name].type.tensor_type.shape.dim)
+                reduced_axes = [0, *range(2, tensor_rank)]
+                # Depending on opset version, axes in ReduceMin/ReduceMax are in attribute or inputs
+                if get_op_version(reduce_op_name, self.model) < 18:
+                    reduce_node.attribute.append(helper.make_attribute("axes", reduced_axes))
+                else:
+                    reduce_axes_name = str(uuid.uuid4())
+                    reduce_axes = numpy_helper.from_array(np.array(reduced_axes, dtype=np.int64), reduce_axes_name)
+                    reduce_node.input.append(reduce_axes_name)
+                    self.model.graph.initializer.append(reduce_axes)
+
+            self.model.graph.node.extend([reduce_node, reshape_node])
+            self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))

        for tensor in tensors:
            add_reduce_min_max(tensor, "ReduceMin")
@ -430,7 +456,7 @@ class MinMaxCalibrater(CalibraterBase):
                max_value_array = np.max(merged_added_output_dict[added_output_names[i + 1]], axis=0)

            if self.symmetric:
-                max_absolute_value = max(np.abs(min_value_array), np.abs(max_value_array))
+                max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
                pairs.append(tuple([-max_absolute_value, max_absolute_value]))
            else:
                pairs.append(tuple([min_value_array, max_value_array]))
@ -1097,6 +1123,7 @@ def create_calibrator(
        moving_average = extra_options.get("moving_average", False)
        averaging_constant = extra_options.get("averaging_constant", 0.01)
        max_intermediate_outputs = extra_options.get("max_intermediate_outputs", None)
+        per_channel = extra_options.get("per_channel", False)
        calibrator = MinMaxCalibrater(
            model,
            op_types_to_calibrate,
@ -1106,6 +1133,7 @@ def create_calibrator(
            moving_average=moving_average,
            averaging_constant=averaging_constant,
            max_intermediate_outputs=max_intermediate_outputs,
+            per_channel=per_channel,
        )
    elif calibrate_method == CalibrationMethod.Entropy:
        # default settings for entropy algorithm
--- a/onnxruntime/test/python/quantization/test_calibration.py
+++ b/onnxruntime/test/python/quantization/test_calibration.py
@ -275,7 +275,7 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
        for output in added_outputs:
            self.assertTrue(output in augmented_model_outputs)

-    def construct_test_compute_data_model(self, test_model_path):
+    def construct_test_compute_data_model(self, test_model_path, opset_version=13):
        #    (input)
        #       |
        #      Relu
@ -320,7 +320,7 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
        graph.initializer.add().CopyFrom(b3)
        graph.initializer.add().CopyFrom(w5)
        graph.initializer.add().CopyFrom(b5)
-        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset_version)])
        onnx.save(model, test_model_path)

    def test_compute_data(self):
@ -456,6 +456,42 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
        for output in added_outputs:
            self.assertTrue(output in augmented_model_outputs)

+    def test_compute_data_per_channel(self):
+        test_model_path = Path(self._tmp_model_dir.name).joinpath("./test_model_6.onnx")
+        self.construct_test_compute_data_model(test_model_path.as_posix(), opset_version=18)
+
+        augmented_model_path = Path(self._tmp_model_dir.name).joinpath("./augmented_test_model_6.onnx")
+        calibrater = create_calibrator(
+            test_model_path, augmented_model_path=augmented_model_path.as_posix(), extra_options={"per_channel": True}
+        )
+        data_reader = TestDataReader()
+        calibrater.collect_data(data_reader)
+        tensors_range = calibrater.compute_data()
+
+        sess_options = onnxruntime.SessionOptions()
+        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+        infer_session = onnxruntime.InferenceSession(
+            test_model_path.as_posix(),
+            sess_options=sess_options,
+            providers=["CPUExecutionProvider"],
+        )
+        data_reader.rewind()
+        rmin = np.array([np.inf, np.inf, np.inf, np.inf, np.inf, np.inf], dtype=np.float32)[:, np.newaxis]
+        rmax = -1.0 * rmin
+        while True:
+            input = data_reader.get_next()
+            if not input:
+                break
+            output = np.asarray(infer_session.run(None, input)).reshape((6, 3, -1))
+            rmin = np.minimum(rmin, np.amin(output, axis=-1))
+            rmax = np.maximum(rmax, np.amax(output, axis=-1))
+
+        min_max_pairs = list(zip(rmin, rmax))
+        output_names = [infer_session.get_outputs()[i].name for i in range(len(infer_session.get_outputs()))]
+        output_min_max_dict = dict(zip(output_names, min_max_pairs))
+        for output_name in output_min_max_dict:
+            np.testing.assert_equal(output_min_max_dict[output_name], tensors_range[output_name].range_value)
+

 if __name__ == "__main__":
    unittest.main()