[python] MinMax calibration per channel (#19285)

### Description

Following issue #19223, introduce a `per_channel` attribute in
`MinMaxCalibrater` to enable per-channel calibration.

If required, this new functionality should be implemented in the other
_Calibraters_ (`HistogramCalibrater`, `EntropyCalibrater`, ...).

### Motivation and Context
- This is the first step toward implementing the proposal in #19223.
- If per-channel calibration were supported, the quantization algorithm
could be updated to improve quantization performance, i.e. by quantizing
weights per channel rather than per tensor. For that reason, it would be
useful to have a 'per_channel' option in every 'Calibrater' class so that
it produces a set of calibration vectors instead of a single scalar.
This commit is contained in:
Johan MEJIA 2024-04-26 21:40:49 +02:00 committed by GitHub
parent ddd4e8c3e3
commit 619ceeed9e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 71 additions and 7 deletions

View file

@ -164,13 +164,15 @@ class CalibraterBase:
augmented_model_path="augmented_model.onnx",
symmetric=False,
use_external_data_format=False,
per_channel=False,
):
"""
:param model_path: ONNX model to calibrate. It should be a model file path
:param op_types_to_calibrate: operator types to calibrate. By default, calibrate all the float32/float16 tensors.
:param augmented_model_path: save augmented model to this path.
:param symmetric: make range of tensor symmetric (central point is 0).
:param use_external_data_format: use external data format to store model which size is >= 2Gb
:param use_external_data_format: use external data format to store model which size is >= 2Gb.
:param per_channel: whether to compute ranges per each channel.
"""
if isinstance(model_path, str):
self.model = load_model_with_shape_infer(Path(model_path))
@ -183,6 +185,7 @@ class CalibraterBase:
self.augmented_model_path = augmented_model_path
self.symmetric = symmetric
self.use_external_data_format = use_external_data_format
self.per_channel = per_channel
self.augment_model = None
self.infer_session = None
@ -274,6 +277,7 @@ class MinMaxCalibrater(CalibraterBase):
moving_average=False,
averaging_constant=0.01,
max_intermediate_outputs=None,
per_channel=False,
):
"""
:param model_path: ONNX model to calibrate. It is a model path
@ -284,6 +288,7 @@ class MinMaxCalibrater(CalibraterBase):
:param moving_average: compute the moving average of the minimum and maximum values instead of the global minimum and maximum.
:param averaging_constant: constant smoothing factor to use when computing the moving average.
:param max_intermediate_outputs: maximum number of intermediate outputs before an intermediate range is computed.
:param per_channel: whether to compute ranges per each channel.
"""
super().__init__(
model_path,
@ -291,6 +296,7 @@ class MinMaxCalibrater(CalibraterBase):
augmented_model_path=augmented_model_path,
symmetric=symmetric,
use_external_data_format=use_external_data_format,
per_channel=per_channel,
)
self.intermediate_outputs = []
self.calibrate_tensors_range = None
@ -310,9 +316,15 @@ class MinMaxCalibrater(CalibraterBase):
"""
tensors, _ = self.select_tensors_to_calibrate(self.model)
reshape_shape_name = str(uuid.uuid4())
reshape_shape = numpy_helper.from_array(np.array([1], dtype=np.int64), reshape_shape_name)
reshape_shape = numpy_helper.from_array(np.array([-1], dtype=np.int64), reshape_shape_name)
self.model.graph.initializer.append(reshape_shape)
def get_op_version(op_type, model):
    """Look up the opset version under which the model uses ``op_type``.

    The model's opset imports are scanned in order; the version of the first
    import whose domain satisfies ``onnx.defs.has(op_type, domain)`` is returned.

    :param op_type: operator type name, e.g. "ReduceMin".
    :param model: object exposing an iterable ``opset_import`` whose entries
        have ``domain`` and ``version`` attributes.
    :return: the ``version`` of the first matching opset import.
    :raises RuntimeError: if no opset import has a domain defining ``op_type``.
    """
    # Generator keeps the scan lazy: stop at the first matching domain.
    candidate_versions = (
        entry.version for entry in model.opset_import if onnx.defs.has(op_type, entry.domain)
    )
    found_version = next(candidate_versions, None)
    if found_version is None:
        raise RuntimeError(f"Model does not contain a version for '{op_type}'.")
    return found_version
def add_reduce_min_max(tensor_name, reduce_op_name):
# When doing ReduceMax/ReduceMin, ORT can't reduce on dim with value of 0 if 'keepdims' is false.
# To make the code simple, we always let keepdims to be 1.
@ -332,7 +344,6 @@ class MinMaxCalibrater(CalibraterBase):
name=intermediate_output,
)
self.model.graph.node.extend([reduce_node, reshape_node])
value_infos = {vi.name: vi for vi in self.model.graph.value_info}
value_infos.update({o.name: o for o in self.model.graph.output})
value_infos.update({i.name: i for i in self.model.graph.input})
@ -343,7 +354,22 @@ class MinMaxCalibrater(CalibraterBase):
f"Unable to guess tensor type for tensor {tensor_name!r}, "
f"running shape inference before quantization may resolve this issue."
)
self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [1]))
# Include axes in reduce_op when per_channel, always keeping axis=1
if self.per_channel:
tensor_rank = len(value_infos[tensor_name].type.tensor_type.shape.dim)
reduced_axes = [0, *range(2, tensor_rank)]
# Depending on opset version, axes in ReduceMin/ReduceMax are in attribute or inputs
if get_op_version(reduce_op_name, self.model) < 18:
reduce_node.attribute.append(helper.make_attribute("axes", reduced_axes))
else:
reduce_axes_name = str(uuid.uuid4())
reduce_axes = numpy_helper.from_array(np.array(reduced_axes, dtype=np.int64), reduce_axes_name)
reduce_node.input.append(reduce_axes_name)
self.model.graph.initializer.append(reduce_axes)
self.model.graph.node.extend([reduce_node, reshape_node])
self.model.graph.output.append(helper.make_tensor_value_info(reduce_output, onnx_type, [None]))
for tensor in tensors:
add_reduce_min_max(tensor, "ReduceMin")
@ -430,7 +456,7 @@ class MinMaxCalibrater(CalibraterBase):
max_value_array = np.max(merged_added_output_dict[added_output_names[i + 1]], axis=0)
if self.symmetric:
max_absolute_value = max(np.abs(min_value_array), np.abs(max_value_array))
max_absolute_value = np.max([np.abs(min_value_array), np.abs(max_value_array)], axis=0)
pairs.append(tuple([-max_absolute_value, max_absolute_value]))
else:
pairs.append(tuple([min_value_array, max_value_array]))
@ -1097,6 +1123,7 @@ def create_calibrator(
moving_average = extra_options.get("moving_average", False)
averaging_constant = extra_options.get("averaging_constant", 0.01)
max_intermediate_outputs = extra_options.get("max_intermediate_outputs", None)
per_channel = extra_options.get("per_channel", False)
calibrator = MinMaxCalibrater(
model,
op_types_to_calibrate,
@ -1106,6 +1133,7 @@ def create_calibrator(
moving_average=moving_average,
averaging_constant=averaging_constant,
max_intermediate_outputs=max_intermediate_outputs,
per_channel=per_channel,
)
elif calibrate_method == CalibrationMethod.Entropy:
# default settings for entropy algorithm

View file

@ -275,7 +275,7 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
for output in added_outputs:
self.assertTrue(output in augmented_model_outputs)
def construct_test_compute_data_model(self, test_model_path):
def construct_test_compute_data_model(self, test_model_path, opset_version=13):
# (input)
# |
# Relu
@ -320,7 +320,7 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
graph.initializer.add().CopyFrom(b3)
graph.initializer.add().CopyFrom(w5)
graph.initializer.add().CopyFrom(b5)
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset_version)])
onnx.save(model, test_model_path)
def test_compute_data(self):
@ -456,6 +456,42 @@ class TestCalibrateMinMaxCalibrator(unittest.TestCase):
for output in added_outputs:
self.assertTrue(output in augmented_model_outputs)
def test_compute_data_per_channel(self):
    """Check per-channel min/max calibration against a reference computed with ONNX Runtime.

    Builds the test model with opset 18, calibrates it with
    ``extra_options={"per_channel": True}``, then re-runs the un-augmented
    model over the same data and compares the running per-channel min/max of
    every model output with the ranges reported by the calibrater.
    """
    tmp_dir = Path(self._tmp_model_dir.name)
    test_model_path = tmp_dir.joinpath("./test_model_6.onnx")
    self.construct_test_compute_data_model(test_model_path.as_posix(), opset_version=18)

    augmented_model_path = tmp_dir.joinpath("./augmented_test_model_6.onnx")
    calibrater = create_calibrator(
        test_model_path, augmented_model_path=augmented_model_path.as_posix(), extra_options={"per_channel": True}
    )
    data_reader = TestDataReader()
    calibrater.collect_data(data_reader)
    tensors_range = calibrater.compute_data()

    # Reference run: disable graph optimizations so the session executes the model as saved.
    sess_options = onnxruntime.SessionOptions()
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
    infer_session = onnxruntime.InferenceSession(
        test_model_path.as_posix(),
        sess_options=sess_options,
        providers=["CPUExecutionProvider"],
    )
    data_reader.rewind()

    # One row per model output; the trailing axis of length 1 broadcasts across channels.
    rmin = np.full((6, 1), np.inf, dtype=np.float32)
    rmax = -1.0 * rmin
    while True:
        feed = data_reader.get_next()
        if not feed:
            break
        # NOTE(review): the reshape assumes 6 outputs with 3 channels each — confirm against the test model.
        outputs = np.asarray(infer_session.run(None, feed)).reshape((6, 3, -1))
        rmin = np.minimum(rmin, np.amin(outputs, axis=-1))
        rmax = np.maximum(rmax, np.amax(outputs, axis=-1))

    output_names = [output.name for output in infer_session.get_outputs()]
    expected_ranges = dict(zip(output_names, zip(rmin, rmax)))
    for output_name, expected_pair in expected_ranges.items():
        np.testing.assert_equal(expected_pair, tensors_range[output_name].range_value)
if __name__ == "__main__":
unittest.main()