From 643ed14720d4e92e29ec58d277f63ba1e1a2abe9 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga <adlizarraga@microsoft.com>
Date: Fri, 10 May 2024 17:23:24 -0700
Subject: [PATCH] Quant tool: make removal of Clip/Relu ops configurable
 (#20616)

### Description
Adds the extra option `QDQKeepRemovableActivations` to optionally
prevent automatic removal of Clip/Relu ops in QDQ models. The current
default behavior, which is to remove Clip/Relu when activations are
quantized asymmetrically, remains the same if the new option is not
enabled.

### Motivation and Context
Explicitly representing these Relu/Clip operators in the QDQ model is
necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
---
 .../execution_providers/qnn/quant_config.py   |   7 +
 .../quantization/operators/activation.py      |   6 +-
 .../tools/quantization/qdq_quantizer.py       |   4 +
 .../python/tools/quantization/quantize.py     |  12 ++
 .../test/python/quantization/test_qdq.py      | 129 ++++++++++++++++++
 5 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py
index 184fe54a3f..3b857c9919 100644
--- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py
+++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py
@@ -50,6 +50,7 @@ def get_qnn_qdq_config(
     add_qtype_converts: bool = True,
     activation_symmetric: bool = False,
     weight_symmetric: bool | None = None,
+    keep_removable_activations: bool = False,
 ) -> StaticQuantConfig:
     """
     Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@@ -109,6 +110,11 @@ def get_qnn_qdq_config(
             the zero-point values are 128 and 32,768, respectively.
         weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
             Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
+        keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
+            be removed, and will be explicitly represented in the QDQ model. If false, these activations
+            are automatically removed if activations are asymmetrically quantized. Keeping these activations
+            is necessary if optimizations or EP transformations will later remove
+            QuantizeLinear/DequantizeLinear operators from the model.
 
     Returns:
         A StaticQuantConfig object
@@ -160,6 +166,7 @@ def get_qnn_qdq_config(
     extra_options = {
         "MinimumRealRange": 0.0001,
         "DedicatedQDQPair": False,  # Let ORT optimizer duplicate DQ nodes
+        "QDQKeepRemovableActivations": keep_removable_activations,
         "TensorQuantOverrides": overrides_helper.get_dict(),
         "ActivationSymmetric": activation_symmetric,
         "WeightSymmetric": weight_symmetric,
diff --git a/onnxruntime/python/tools/quantization/operators/activation.py b/onnxruntime/python/tools/quantization/operators/activation.py
index 1119ce00a4..3c80626a14 100644
--- a/onnxruntime/python/tools/quantization/operators/activation.py
+++ b/onnxruntime/python/tools/quantization/operators/activation.py
@@ -106,8 +106,10 @@ class QDQRemovableActivation(QDQOperatorBase):
         if not self.quantizer.is_tensor_quantized(node.input[0]):
             return
 
-        if not self.quantizer.is_activation_symmetric and self.quantizer.try_replacing_upstream_output(
-            node.input[0], node.output[0]
+        if (
+            not self.quantizer.is_activation_symmetric
+            and not self.quantizer.qdq_keep_removable_activations
+            and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
         ):
             self.quantizer.remove_node(self.node)
         else:
diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py
index 724c3549e2..b7f4b06904 100644
--- a/onnxruntime/python/tools/quantization/qdq_quantizer.py
+++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py
@@ -187,6 +187,10 @@ class QDQQuantizer(BaseQuantizer):
 
         self.qdq_op_domain = ms_domain if extra_options.get("UseQDQContribOps", False) else None
 
+        # User can specify if removable activations, like Clip/Relu, should be kept in the graph.
+        # Used in the QDQRemovableActivation class.
+        self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False)
+
         # The ONNX spec did not support 16-bit Q/DQ ops before opset 21.
         # So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types
         # are 16-bit integers.
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index 9ebd7bf3c4..f8b74a7ae4 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -186,6 +186,12 @@ class StaticQuantConfig(QuantConfig):
                                                        Invalid if also set `scale` or `zero_point`.
                             'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                                        Invalid if also set `scale` or `zero_point`.
+                    QDQKeepRemovableActivations = True/False:
+                        Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
+                        will be explicitly represented in the QDQ model. If false, these activations are automatically
+                        removed if activations are asymmetrically quantized. Keeping these activations is necessary if
+                        optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
+                        operators from the model.
             execution_provider : A enum indicates the Execution Provider such as: CPU, TRT, NNAPI, SNE, etc.
         Raises:
             ValueError: Raise ValueError if execution provider is unknown
@@ -423,6 +429,12 @@ def quantize_static(
                                                    Invalid if also set `scale` or `zero_point`.
                         'rmin' = Float           : Override the minimum real tensor value in calibration data.
                                                    Invalid if also set `scale` or `zero_point`.
+                QDQKeepRemovableActivations = True/False:
+                    Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
+                    will be explicitly represented in the QDQ model. If false, these activations are automatically
+                    removed if activations are asymmetrically quantized. Keeping these activations is necessary if
+                    optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
+                    operators from the model.
     """
     if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
         if calibrate_method != CalibrationMethod.Distribution:
diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py
index db4ab7e8a4..8a7cfda8f0 100644
--- a/onnxruntime/test/python/quantization/test_qdq.py
+++ b/onnxruntime/test/python/quantization/test_qdq.py
@@ -39,6 +39,18 @@ class TestQDQFormat(unittest.TestCase):
 
 
 class TestQDQExtraOptions(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        # Temporary directory for the models written by tests in this class; tearDownClass removes it.
+        cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.extra_options_")
+        # Note: swap with the commented line if you want to see the models in local test dir.
+        cls._tmp_dir_path = cls._tmp_model_dir.name
+        # cls._tmp_dir_path = "."
+
+    @classmethod
+    def tearDownClass(cls):
+        cls._tmp_model_dir.cleanup()  # Deletes the temporary directory created in setUpClass and its contents.
+
     def test_qdq_extra_options(self):
         #   (input)
         #      |
@@ -236,6 +248,123 @@ class TestQDQExtraOptions(unittest.TestCase):
                     },
                 )
 
+    def test_qdq_keep_removable_activations_option(self):
+        """
+        Checks that the `QDQKeepRemovableActivations` extra option keeps Relu/Clip in the QDQ model.
+
+        The same f32 model is quantized twice with u8 activations and u8 weights:
+          - With `QDQKeepRemovableActivations` set to True: Relu and Clip must still be present.
+          - Without the extra option: Relu and Clip must have been removed (default behavior).
+
+        Only the op types present in each quantized model are inspected; the quantized models are not executed.
+        """
+        # Use a locally seeded generator so that the weights and calibration data are identical on every
+        # run (a failure is then reproducible) without touching NumPy's global random state.
+        rng = np.random.default_rng(0)
+
+        #
+        # Create f32 model with Relu and Clip.
+        # input0 ---> Conv ---> Relu ---> Conv ---> Clip ---> output
+        #
+        shape1 = (1, 1, 3, 3)
+        w_shape1 = (2, 1, 2, 2)
+        w_shape2 = (2, 2, 2, 2)
+        shape3 = (1, 2, 1, 1)
+
+        input0 = onnx.helper.make_tensor_value_info("input0", onnx.TensorProto.FLOAT, shape1)
+        output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape3)
+
+        # Conv1
+        weight1_data = rng.normal(-1.0, 1.0, w_shape1).astype(np.float32)
+        weight1_const = onnx.numpy_helper.from_array(weight1_data, "weight1_const")
+        conv1_node = onnx.helper.make_node("Conv", ["input0", "weight1_const"], ["conv1_out"], name="conv1_node")
+
+        # Relu1
+        relu1_node = onnx.helper.make_node("Relu", ["conv1_out"], ["relu1_out"], name="relu1_node")
+
+        # Conv2
+        weight2_data = rng.normal(-1.8, 1.8, w_shape2).astype(np.float32)
+        weight2_const = onnx.numpy_helper.from_array(weight2_data, "weight2_const")
+        conv2_node = onnx.helper.make_node("Conv", ["relu1_out", "weight2_const"], ["conv2_out"], name="conv2_node")
+
+        # Clip1
+        min_const = onnx.numpy_helper.from_array(np.array(0.0, dtype=np.float32), "min_const")
+        max_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "max_const")
+        clip1_node = onnx.helper.make_node(
+            "Clip", ["conv2_out", "min_const", "max_const"], ["output"], name="clip1_node"
+        )
+
+        graph = onnx.helper.make_graph(
+            [conv1_node, relu1_node, conv2_node, clip1_node],
+            "keep_qdq_activations",
+            [input0],
+            [output],
+            initializer=[weight1_const, weight2_const, min_const, max_const],
+        )
+        opset_imports = [
+            onnx.helper.make_opsetid("", 18),
+        ]
+        f32_model = onnx.helper.make_model(graph, opset_imports=opset_imports)
+        f32_model = onnx.shape_inference.infer_shapes(f32_model)
+        f32_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.onnx")
+        onnx.save_model(f32_model, f32_model_path)
+
+        # Create a data reader.
+        input_data_list = [{"input0": rng.integers(-10, 10, shape1).astype(np.float32)} for _ in range(5)]
+        data_reader = TestDataFeeds(input_data_list)
+
+        # Both quantization runs below quantize every op type that appears in the f32 model.
+        op_types_to_quantize = [node.op_type for node in f32_model.graph.node]
+
+        #
+        # Quantize model with extra option to KEEP removable activations.
+        #
+        qdq_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.qdq.onnx")
+
+        # Create u8_act/u8_wgt qdq model
+        quantize_static(
+            f32_model_path,
+            qdq_model_path,
+            data_reader,
+            quant_format=QuantFormat.QDQ,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QUInt8,
+            op_types_to_quantize=op_types_to_quantize,
+            extra_options={"QDQKeepRemovableActivations": True},
+        )
+
+        # Collect the set of op types present in the quantized model.
+        qdq_model = onnx.load_model(qdq_model_path)
+        qdq_op_types = {node.op_type for node in qdq_model.graph.node}
+
+        self.assertIn("Relu", qdq_op_types)
+        self.assertIn("Clip", qdq_op_types)
+
+        #
+        # Quantize model without extra option. Clip and Relu should be removed by default.
+        #
+        qdq_model_path = os.path.join(self._tmp_dir_path, "nokeep.act.model.qdq.onnx")
+        # Restart the calibration data from the beginning for the second quantization run.
+        data_reader.rewind()
+
+        # Create u8_act/u8_wgt qdq model
+        quantize_static(
+            f32_model_path,
+            qdq_model_path,
+            data_reader,
+            quant_format=QuantFormat.QDQ,
+            activation_type=QuantType.QUInt8,
+            weight_type=QuantType.QUInt8,
+            op_types_to_quantize=op_types_to_quantize,
+        )
+
+        # Collect the set of op types present in the quantized model.
+        qdq_model = onnx.load_model(qdq_model_path)
+        qdq_op_types = {node.op_type for node in qdq_model.graph.node}
+
+        self.assertNotIn("Relu", qdq_op_types)
+        self.assertNotIn("Clip", qdq_op_types)
+
 
 class TestQDQFormatConv(TestQDQFormat):
     def check_per_channel_counts(self, model_path, channel_count: int, axis: int = 0):