Quant tool: make removal of Clip/Relu ops configurable (#20616)

### Description

Adds the extra option `QDQKeepRemovableActivations`, which optionally prevents the automatic removal of Clip/Relu ops in QDQ models. The current default behavior, which is to remove Clip/Relu when activations are asymmetrically quantized, remains the same if the new option is not enabled.

### Motivation and Context

Explicitly representing these Relu/Clip operators in the QDQ model is necessary if optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear operators from the model.
2026-07-19 19:00:47 +00:00 · 2024-05-10 17:23:24 -07:00 · 2024-05-10 17:23:24 -07:00 · 643ed14720
commit 643ed14720
parent 49d197a8e6
5 changed files with 156 additions and 2 deletions
--- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py
+++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py
@ -50,6 +50,7 @@ def get_qnn_qdq_config(
    add_qtype_converts: bool = True,
    activation_symmetric: bool = False,
    weight_symmetric: bool | None = None,
+    keep_removable_activations: bool = False,
 ) -> StaticQuantConfig:
    """
    Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@ -109,6 +110,11 @@ def get_qnn_qdq_config(
            the zero-point values are 128 and 32,768, respectively.
        weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
            Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
+        keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
+                        be removed, and will be explicitly represented in the QDQ model. If false, these activations
+                        are automatically removed if activations are asymmetrically quantized. Keeping these activations
+                        is necessary if optimizations or EP transformations will later remove
+                        QuantizeLinear/DequantizeLinear operators from the model.

    Returns:
        A StaticQuantConfig object
@ -160,6 +166,7 @@ def get_qnn_qdq_config(
    extra_options = {
        "MinimumRealRange": 0.0001,
        "DedicatedQDQPair": False,  # Let ORT optimizer duplicate DQ nodes
+        "QDQKeepRemovableActivations": keep_removable_activations,
        "TensorQuantOverrides": overrides_helper.get_dict(),
        "ActivationSymmetric": activation_symmetric,
        "WeightSymmetric": weight_symmetric,
--- a/onnxruntime/python/tools/quantization/operators/activation.py
+++ b/onnxruntime/python/tools/quantization/operators/activation.py
@ -106,8 +106,10 @@ class QDQRemovableActivation(QDQOperatorBase):
        if not self.quantizer.is_tensor_quantized(node.input[0]):
            return

-        if not self.quantizer.is_activation_symmetric and self.quantizer.try_replacing_upstream_output(
-            node.input[0], node.output[0]
+        if (
+            not self.quantizer.is_activation_symmetric
+            and not self.quantizer.qdq_keep_removable_activations
+            and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
        ):
            self.quantizer.remove_node(self.node)
        else:
--- a/onnxruntime/python/tools/quantization/qdq_quantizer.py
+++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py
@ -187,6 +187,10 @@ class QDQQuantizer(BaseQuantizer):

        self.qdq_op_domain = ms_domain if extra_options.get("UseQDQContribOps", False) else None

+        # User can specify if removable activations, like Clip/Relu, should be kept in the graph.
+        # Used in the QDQRemovableActivation class.
+        self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False)
+
        # The ONNX spec did not support 16-bit Q/DQ ops before opset 21.
        # So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types
        # are 16-bit integers.
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@ -186,6 +186,12 @@ class StaticQuantConfig(QuantConfig):
                                                       Invalid if also set `scale` or `zero_point`.
                            'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                                       Invalid if also set `scale` or `zero_point`.
+                    QDQKeepRemovableActivations = True/False:
+                        Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
+                        will be explicitly represented in the QDQ model. If false, these activations are automatically
+                        removed if activations are asymmetrically quantized. Keeping these activations is necessary if
+                        optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
+                        operators from the model.
            execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc.
        Raises:
            ValueError: Raise ValueError if execution provider is unknown
@ -423,6 +429,12 @@ def quantize_static(
                                                   Invalid if also set `scale` or `zero_point`.
                        'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                                   Invalid if also set `scale` or `zero_point`.
+                QDQKeepRemovableActivations = True/False:
+                    Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
+                    will be explicitly represented in the QDQ model. If false, these activations are automatically
+                    removed if activations are asymmetrically quantized. Keeping these activations is necessary if
+                    optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
+                    operators from the model.
    """
    if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
        if calibrate_method != CalibrationMethod.Distribution:
--- a/onnxruntime/test/python/quantization/test_qdq.py
+++ b/onnxruntime/test/python/quantization/test_qdq.py
@ -39,6 +39,18 @@ class TestQDQFormat(unittest.TestCase):


 class TestQDQExtraOptions(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.extra_options_")
+
+        # Note: swap with the commented line if you want to see the models in local test dir.
+        cls._tmp_dir_path = cls._tmp_model_dir.name
+        # cls._tmp_dir_path = "."
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._tmp_model_dir.cleanup()
+
    def test_qdq_extra_options(self):
        #   (input)
        #      |
@ -236,6 +248,123 @@ class TestQDQExtraOptions(unittest.TestCase):
                    },
                )

+    def test_qdq_keep_removable_activations_option(self):
+        #
+        # Create f32 model with Relu and Clip.
+        # input0 ---> Conv ---> Relu ---> Conv ---> Clip ---> output
+        #
+        shape1 = (1, 1, 3, 3)
+        w_shape1 = (2, 1, 2, 2)
+        w_shape2 = (2, 2, 2, 2)
+        shape3 = (1, 2, 1, 1)
+
+        input0 = onnx.helper.make_tensor_value_info("input0", onnx.TensorProto.FLOAT, shape1)
+        output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape3)
+
+        # Conv1
+        weight1_data = np.random.normal(-1.0, 1.0, w_shape1).astype(np.float32)
+        weight1_const = onnx.numpy_helper.from_array(weight1_data, "weight1_const")
+        conv1_node = onnx.helper.make_node("Conv", ["input0", "weight1_const"], ["conv1_out"], name="conv1_node")
+
+        # Relu1
+        relu1_node = onnx.helper.make_node("Relu", ["conv1_out"], ["relu1_out"], name="relu1_node")
+
+        # Conv2
+        weight2_data = np.random.normal(-1.8, 1.8, w_shape2).astype(np.float32)
+        weight2_const = onnx.numpy_helper.from_array(weight2_data, "weight2_const")
+        conv2_node = onnx.helper.make_node("Conv", ["relu1_out", "weight2_const"], ["conv2_out"], name="conv2_node")
+
+        # Clip1
+        min_const = onnx.numpy_helper.from_array(np.array(0.0, dtype=np.float32), "min_const")
+        max_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "max_const")
+        clip1_node = onnx.helper.make_node(
+            "Clip", ["conv2_out", "min_const", "max_const"], ["output"], name="clip1_node"
+        )
+
+        graph = onnx.helper.make_graph(
+            [conv1_node, relu1_node, conv2_node, clip1_node],
+            "keep_qdq_activations",
+            [input0],
+            [output],
+            initializer=[weight1_const, weight2_const, min_const, max_const],
+        )
+        opset_imports = [
+            onnx.helper.make_opsetid("", 18),
+        ]
+        f32_model = onnx.helper.make_model(graph, opset_imports=opset_imports)
+        f32_model = onnx.shape_inference.infer_shapes(f32_model)
+        f32_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.onnx")
+        onnx.save_model(f32_model, f32_model_path)
+
+        # Create a data reader.
+        input_data_list = []
+        for _ in range(5):
+            inputs = {"input0": np.random.randint(-10, 10, shape1).astype(np.float32)}
+            input_data_list.extend([inputs])
+        data_reader = TestDataFeeds(input_data_list)
+
+        #
+        # Quantize model with extra option to KEEP removable activations.
+        #
+        qdq_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.qdq.onnx")
+
+        # Create u8_act/u8_wgt qdq model
+        quantize_static(
+            f32_model_path,
+            qdq_model_path,
+            data_reader,
+            quant_format=QuantFormat.QDQ,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QUInt8,
+            op_types_to_quantize=[node.op_type for node in f32_model.graph.node],
+            extra_options={"QDQKeepRemovableActivations": True},
+        )
+
+        has_relu = False
+        has_clip = False
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        for node in qdq_model.graph.node:
+            if node.op_type == "Relu":
+                has_relu = True
+            if node.op_type == "Clip":
+                has_clip = True
+
+        self.assertTrue(has_relu)
+        self.assertTrue(has_clip)
+
+        #
+        # Quantize model without extra option. Clip and Relu should be removed by default.
+        #
+        qdq_model_path = os.path.join(self._tmp_dir_path, "nokeep.act.model.qdq.onnx")
+        data_reader.rewind()
+
+        # Create u8_act/u8_wgt qdq model
+        quantize_static(
+            f32_model_path,
+            qdq_model_path,
+            data_reader,
+            quant_format=QuantFormat.QDQ,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QUInt8,
+            op_types_to_quantize=[node.op_type for node in f32_model.graph.node],
+        )
+
+        has_relu = False
+        has_clip = False
+
+        qdq_model = onnx.load_model(qdq_model_path)
+
+        for node in qdq_model.graph.node:
+            if node.op_type == "Relu":
+                has_relu = True
+            if node.op_type == "Clip":
+                has_clip = True
+
+        self.assertFalse(has_relu)
+        self.assertFalse(has_clip)
+

 class TestQDQFormatConv(TestQDQFormat):
    def check_per_channel_counts(self, model_path, channel_count: int, axis: int = 0):