From 643ed14720d4e92e29ec58d277f63ba1e1a2abe9 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Fri, 10 May 2024 17:23:24 -0700 Subject: [PATCH] Quant tool: make removal of Clip/Relu ops configurable (#20616) ### Description Adds the extra option `QDQKeepRemovableActivations` to optionally prevent automatic removal of Clip/Relu ops in QDQ models. The current default behavior, which is to remove Clip/Relu, remains the same if the new option is not enabled. ### Motivation and Context Explicitly representing these Relu/Clip operators in the QDQ model is necessary if optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear operators from the model. --- .../execution_providers/qnn/quant_config.py | 7 + .../quantization/operators/activation.py | 6 +- .../tools/quantization/qdq_quantizer.py | 4 + .../python/tools/quantization/quantize.py | 12 ++ .../test/python/quantization/test_qdq.py | 129 ++++++++++++++++++ 5 files changed, 156 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index 184fe54a3f..3b857c9919 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -50,6 +50,7 @@ def get_qnn_qdq_config( add_qtype_converts: bool = True, activation_symmetric: bool = False, weight_symmetric: bool | None = None, + keep_removable_activations: bool = False, ) -> StaticQuantConfig: """ Returns a static quantization configuration suitable for running QDQ models on QNN EP. @@ -109,6 +110,11 @@ def get_qnn_qdq_config( the zero-point values are 128 and 32,768, respectively. weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default. Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int. + keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not + be removed, and will be explicitly represented in the QDQ model. If false, these activations + are automatically removed if activations are asymmetrically quantized. Keeping these activations + is necessary if optimizations or EP transformations will later remove + QuantizeLinear/DequantizeLinear operators from the model. Returns: A StaticQuantConfig object @@ -160,6 +166,7 @@ def get_qnn_qdq_config( extra_options = { "MinimumRealRange": 0.0001, "DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes + "QDQKeepRemovableActivations": keep_removable_activations, "TensorQuantOverrides": overrides_helper.get_dict(), "ActivationSymmetric": activation_symmetric, "WeightSymmetric": weight_symmetric, diff --git a/onnxruntime/python/tools/quantization/operators/activation.py b/onnxruntime/python/tools/quantization/operators/activation.py index 1119ce00a4..3c80626a14 100644 --- a/onnxruntime/python/tools/quantization/operators/activation.py +++ b/onnxruntime/python/tools/quantization/operators/activation.py @@ -106,8 +106,10 @@ class QDQRemovableActivation(QDQOperatorBase): if not self.quantizer.is_tensor_quantized(node.input[0]): return - if not self.quantizer.is_activation_symmetric and self.quantizer.try_replacing_upstream_output( - node.input[0], node.output[0] + if ( + not self.quantizer.is_activation_symmetric + and not self.quantizer.qdq_keep_removable_activations + and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0]) ): self.quantizer.remove_node(self.node) else: diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 724c3549e2..b7f4b06904 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -187,6 +187,10 @@ class QDQQuantizer(BaseQuantizer): self.qdq_op_domain = ms_domain if extra_options.get("UseQDQContribOps", False) else None + # User can specify if removable activations, like Clip/Relu, should be kept in the graph. + # Used in the QDQRemovableActivation class. + self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False) + # The ONNX spec did not support 16-bit Q/DQ ops before opset 21. # So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types # are 16-bit integers. diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 9ebd7bf3c4..f8b74a7ae4 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -186,6 +186,12 @@ class StaticQuantConfig(QuantConfig): Invalid if also set `scale` or `zero_point`. 'rmin' = Float : Override the minimum real tensor value in calibration data. Invalid if also set `scale` or `zero_point`. + QDQKeepRemovableActivations = True/False: + Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and + will be explicitly represented in the QDQ model. If false, these activations are automatically + removed if activations are asymmetrically quantized. Keeping these activations is necessary if + optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear + operators from the model. execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc. Raises: ValueError: Raise ValueError if execution provider is unknown @@ -423,6 +429,12 @@ def quantize_static( Invalid if also set `scale` or `zero_point`. 'rmin' = Float : Override the minimum real tensor value in calibration data. Invalid if also set `scale` or `zero_point`. + QDQKeepRemovableActivations = True/False: + Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and + will be explicitly represented in the QDQ model. If false, these activations are automatically + removed if activations are asymmetrically quantized. Keeping these activations is necessary if + optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear + operators from the model. """ if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN: if calibrate_method != CalibrationMethod.Distribution: diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index db4ab7e8a4..8a7cfda8f0 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -39,6 +39,18 @@ class TestQDQFormat(unittest.TestCase): class TestQDQExtraOptions(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.extra_options_") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + def test_qdq_extra_options(self): # (input) # | @@ -236,6 +248,123 @@ class TestQDQExtraOptions(unittest.TestCase): }, ) + def test_qdq_keep_removable_activations_option(self): + # + # Create f32 model with Relu and Clip. + # input0 ---> Conv ---> Relu ---> Conv ---> Clip ---> output + # + shape1 = (1, 1, 3, 3) + w_shape1 = (2, 1, 2, 2) + w_shape2 = (2, 2, 2, 2) + shape3 = (1, 2, 1, 1) + + input0 = onnx.helper.make_tensor_value_info("input0", onnx.TensorProto.FLOAT, shape1) + output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape3) + + # Conv1 + weight1_data = np.random.normal(-1.0, 1.0, w_shape1).astype(np.float32) + weight1_const = onnx.numpy_helper.from_array(weight1_data, "weight1_const") + conv1_node = onnx.helper.make_node("Conv", ["input0", "weight1_const"], ["conv1_out"], name="conv1_node") + + # Relu1 + relu1_node = onnx.helper.make_node("Relu", ["conv1_out"], ["relu1_out"], name="relu1_node") + + # Conv2 + weight2_data = np.random.normal(-1.8, 1.8, w_shape2).astype(np.float32) + weight2_const = onnx.numpy_helper.from_array(weight2_data, "weight2_const") + conv2_node = onnx.helper.make_node("Conv", ["relu1_out", "weight2_const"], ["conv2_out"], name="conv2_node") + + # Clip1 + min_const = onnx.numpy_helper.from_array(np.array(0.0, dtype=np.float32), "min_const") + max_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "max_const") + clip1_node = onnx.helper.make_node( + "Clip", ["conv2_out", "min_const", "max_const"], ["output"], name="clip1_node" + ) + + graph = onnx.helper.make_graph( + [conv1_node, relu1_node, conv2_node, clip1_node], + "keep_qdq_activations", + [input0], + [output], + initializer=[weight1_const, weight2_const, min_const, max_const], + ) + opset_imports = [ + onnx.helper.make_opsetid("", 18), + ] + f32_model = onnx.helper.make_model(graph, opset_imports=opset_imports) + f32_model = onnx.shape_inference.infer_shapes(f32_model) + f32_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.onnx") + onnx.save_model(f32_model, f32_model_path) + + # Create a data reader. + input_data_list = [] + for _ in range(5): + inputs = {"input0": np.random.randint(-10, 10, shape1).astype(np.float32)} + input_data_list.extend([inputs]) + data_reader = TestDataFeeds(input_data_list) + + # + # Quantize model with extra option to KEEP removable activations. + # + qdq_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.qdq.onnx") + + # Create u8_act/u8_wgt qdq model + quantize_static( + f32_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QUInt8, + op_types_to_quantize=[node.op_type for node in f32_model.graph.node], + extra_options={"QDQKeepRemovableActivations": True}, + ) + + has_relu = False + has_clip = False + + qdq_model = onnx.load_model(qdq_model_path) + + for node in qdq_model.graph.node: + if node.op_type == "Relu": + has_relu = True + if node.op_type == "Clip": + has_clip = True + + self.assertTrue(has_relu) + self.assertTrue(has_clip) + + # + # Quantize model without extra option. Clip and Relu should be removed by default. + # + qdq_model_path = os.path.join(self._tmp_dir_path, "nokeep.act.model.qdq.onnx") + data_reader.rewind() + + # Create u8_act/u8_wgt qdq model + quantize_static( + f32_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QUInt8, + op_types_to_quantize=[node.op_type for node in f32_model.graph.node], + ) + + has_relu = False + has_clip = False + + qdq_model = onnx.load_model(qdq_model_path) + + for node in qdq_model.graph.node: + if node.op_type == "Relu": + has_relu = True + if node.op_type == "Clip": + has_clip = True + + self.assertFalse(has_relu) + self.assertFalse(has_clip) + class TestQDQFormatConv(TestQDQFormat): def check_per_channel_counts(self, model_path, channel_count: int, axis: int = 0):