From 619ceeed9e9be54f4ef5fffbb702fc6476d389d2 Mon Sep 17 00:00:00 2001
From: Johan MEJIA <69996955+Johansmm@users.noreply.github.com>
Date: Fri, 26 Apr 2024 21:40:49 +0200
Subject: [PATCH] [python] MinMax calibration per channel (#19285)

### Description

Following the issue #19223, introduce `per_channel` attribute in
`MinMaxCalibrater` to develop per-channel calibration.

If required, this new functionality should be implemented in the other
_Calibraters_ (`HistogramCalibrater`, `EntropyCalibrater`, ...).

### Motivation and Context
- This is the first part to solve #19223's proposal.
- If per channel calibration was allowed, the quantization algorithm
could be updated to improve quantization performance, i.e. weights
quantization per channel and not per tensor. That is why it would be
interesting to have a 'per_channel' option in any 'Calibrater' class to
produce a set of calibration vectors instead of a single scalar.
---
 .../python/tools/quantization/calibrate.py    | 38 +++++++++++++++---
 .../python/quantization/test_calibration.py   | 40 ++++++++++++++++++-
 2 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py
index fe37cf3c87..3f5e4e6600 100644
--- a/onnxruntime/python/tools/quantization/calibrate.py
+++ b/onnxruntime/python/tools/quantization/calibrate.py
@@ -164,13 +164,15 @@ class CalibraterBase:
         augmented_model_path="augmented_model.onnx",
         symmetric=False,
         use_external_data_format=False,
+        per_channel=False,
     ):
         """
         :param model_path: ONNX model to calibrate. It should be a model file path
         :param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
         :param augmented_model_path: save augmented model to this path.
         :param symmetric: make range of tensor symmetric (central point is 0).
-        :param use_external_data_format: use external data format to store model which size is >= 2Gb
+        :param use_external_data_format: use external data format to store model which size is >= 2Gb.
+        :param per_channel: whether to compute ranges per each channel.
         """
         if isinstance(model_path, str):
             self.model = load_model_with_shape_infer(Path(model_path))
@@ -183,6 +185,7 @@ class CalibraterBase:
         self.augmented_model_path = augmented_model_path
         self.symmetric = symmetric
         self.use_external_data_format = use_external_data_format
+        self.per_channel = per_channel
 
         self.augment_model = None
         self.infer_session = None
@@ -274,6 +277,7 @@ class MinMaxCalibrater(CalibraterBase):
         moving_average=False,
         averaging_constant=0.01,
         max_intermediate_outputs=None,
+        per_channel=False,
     ):
         """
         :param model_path: ONNX model to calibrate. It is a model path
@@ -284,6 +288,7 @@ class MinMaxCalibrater(CalibraterBase):
         :param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
         :param averaging_constant: constant smoothing factor to use when computing the moving average.
         :param max_intermediate_outputs: maximum number of intermediate outputs before an intermediate range is computed.
+        :param per_channel: whether to compute ranges per each channel.
         """
         super().__init__(
             model_path,
@@ -291,6 +296,7 @@ class MinMaxCalibrater(CalibraterBase):
             augmented_model_path=augmented_model_path,
             symmetric=symmetric,
             use_external_data_format=use_external_data_format,
+            per_channel=per_channel,
         )
         self.intermediate_outputs = []
         self.calibrate_tensors_range = None
@@ -310,9 +316,15 @@ class MinMaxCalibrater(CalibraterBase):
         """
         tensors, _ = self.select_tensors_to_calibrate(self.model)
         reshape_shape_name = str(uuid.uuid4())
-        reshape_shape = numpy_helper.from_array(np.array([1], dtype=np.int64), reshape_shape_name)
+        reshape_shape = numpy_helper.from_array(np.array([-1], dtype=np.int64), reshape_shape_name)
         self.model.graph.initializer.append(reshape_shape)
 
+        def get_op_version(op_type, model):
+            for opset_import in model.opset_import:
+                if onnx.defs.has(op_type, opset_import.domain):
+                    return opset_import.version
+            raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
+
         def add_reduce_min_max(tensor_name, reduce_op_name):
             # When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
             # To make the code simple, we always let keepdims to be 1.
@@ -332,7 +344,6 @@ class MinMaxCalibrater(CalibraterBase):
                 name=intermediate_output,
             )
 
-            self.model.graph.node.extend([reduce_node, reshape_node])
             value_infos = {vi.name: vi for vi in self.model.graph.value_info}
             value_infos.update({o.name: o for o in self.model.graph.output})
             value_infos.update({i.name: i for i in self.model.graph.input})
@@ -343,7 +354,22 @@ class MinMaxCalibrater(CalibraterBase):
                     f"Unable to guess tensor type for tensor {tensor_name!r}, "
                     f"running shape inference before quantization may resolve this issue."
                 )
-            self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [1]))
+
+            # Include axes in reduce_op when per_channel, always keeping axis=1
+            if self.per_channel:
+                tensor_rank = len(value_infos[tensor_name].type.tensor_type.shape.dim)
+                reduced_axes = [0, *range(2, tensor_rank)]
+                # Depending on opset version, axes in ReduceMin/ReduceMax are in attribute or inputs
+                if get_op_version(reduce_op_name, self.model) < 18:
+                    reduce_node.attribute.append(helper.make_attribute("axes", reduced_axes))
+                else:
+                    reduce_axes_name = str(uuid.uuid4())
+                    reduce_axes = numpy_helper.from_array(np.array(reduced_axes, dtype=np.int64), reduce_axes_name)
+                    reduce_node.input.append(reduce_axes_name)
+                    self.model.graph.initializer.append(reduce_axes)
+
+            self.model.graph.node.extend([reduce_node, reshape_node])
+            self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
 
         for tensor in tensors:
             add_reduce_min_max(tensor, "ReduceMin")
@@ -430,7 +456,7 @@ class MinMaxCalibrater(CalibraterBase):
                 max_value_array = np.max(merged_added_output_dict[added_output_names[i + 1]], axis=0)
 
             if self.symmetric:
-                max_absolute_value = max(np.abs(min_value_array), np.abs(max_value_array))
+                max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
                 pairs.append(tuple([-max_absolute_value, max_absolute_value]))
             else:
                 pairs.append(tuple([min_value_array, max_value_array]))
@@ -1097,6 +1123,7 @@ def create_calibrator(
         moving_average = extra_options.get("moving_average", False)
         averaging_constant = extra_options.get("averaging_constant", 0.01)
         max_intermediate_outputs = extra_options.get("max_intermediate_outputs", None)
+        per_channel = extra_options.get("per_channel", False)
         calibrator = MinMaxCalibrater(
             model,
             op_types_to_calibrate,
@@ -1106,6 +1133,7 @@ def create_calibrator(
             moving_average=moving_average,
             averaging_constant=averaging_constant,
             max_intermediate_outputs=max_intermediate_outputs,
+            per_channel=per_channel,
         )
     elif calibrate_method == CalibrationMethod.Entropy:
         # default settings for entropy algorithm
diff --git a/onnxruntime/test/python/quantization/test_calibration.py b/onnxruntime/test/python/quantization/test_calibration.py
index 795447e8b7..b36ba45cdb 100644
--- a/onnxruntime/test/python/quantization/test_calibration.py
+++ b/onnxruntime/test/python/quantization/test_calibration.py
@@ -275,7 +275,7 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
         for output in added_outputs:
             self.assertTrue(output in augmented_model_outputs)
 
-    def construct_test_compute_data_model(self, test_model_path):
+    def construct_test_compute_data_model(self, test_model_path, opset_version=13):
         #    (input)
         #       |
         #      Relu
@@ -320,7 +320,7 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
         graph.initializer.add().CopyFrom(b3)
         graph.initializer.add().CopyFrom(w5)
         graph.initializer.add().CopyFrom(b5)
-        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset_version)])
         onnx.save(model, test_model_path)
 
     def test_compute_data(self):
@@ -456,6 +456,42 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
         for output in added_outputs:
             self.assertTrue(output in augmented_model_outputs)
 
+    def test_compute_data_per_channel(self):
+        test_model_path = Path(self._tmp_model_dir.name).joinpath("./test_model_6.onnx")
+        self.construct_test_compute_data_model(test_model_path.as_posix(), opset_version=18)
+
+        augmented_model_path = Path(self._tmp_model_dir.name).joinpath("./augmented_test_model_6.onnx")
+        calibrater = create_calibrator(
+            test_model_path, augmented_model_path=augmented_model_path.as_posix(), extra_options={"per_channel": True}
+        )
+        data_reader = TestDataReader()
+        calibrater.collect_data(data_reader)
+        tensors_range = calibrater.compute_data()
+
+        sess_options = onnxruntime.SessionOptions()
+        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
+        infer_session = onnxruntime.InferenceSession(
+            test_model_path.as_posix(),
+            sess_options=sess_options,
+            providers=["CPUExecutionProvider"],
+        )
+        data_reader.rewind()
+        rmin = np.array([np.inf, np.inf, np.inf, np.inf, np.inf, np.inf], dtype=np.float32)[:, np.newaxis]
+        rmax = -1.0 * rmin
+        while True:
+            input = data_reader.get_next()
+            if not input:
+                break
+            output = np.asarray(infer_session.run(None, input)).reshape((6, 3, -1))
+            rmin = np.minimum(rmin, np.amin(output, axis=-1))
+            rmax = np.maximum(rmax, np.amax(output, axis=-1))
+
+        min_max_pairs = list(zip(rmin, rmax))
+        output_names = [infer_session.get_outputs()[i].name for i in range(len(infer_session.get_outputs()))]
+        output_min_max_dict = dict(zip(output_names, min_max_pairs))
+        for output_name in output_min_max_dict:
+            np.testing.assert_equal(output_min_max_dict[output_name], tensors_range[output_name].range_value)
+
 
 if __name__ == "__main__":
     unittest.main()