Quant tool: make removal of Clip/Relu ops configurable (#20616)

### Description
Adds the extra option `QDQKeepRemovableActivations` to optionally
prevent automatic removal of Clip/Relu ops in QDQ models. The current
default behavior, which is to remove Clip/Relu, remains the same if the
new option is not enabled.

### Motivation and Context
Explicitly representing these Relu/Clip operators in the QDQ model is
necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
This commit is contained in:
Adrian Lizarraga 2024-05-10 17:23:24 -07:00 committed by GitHub
parent 49d197a8e6
commit 643ed14720
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 156 additions and 2 deletions

View file

@ -50,6 +50,7 @@ def get_qnn_qdq_config(
add_qtype_converts: bool = True,
activation_symmetric: bool = False,
weight_symmetric: bool | None = None,
keep_removable_activations: bool = False,
) -> StaticQuantConfig:
"""
Returns a static quantization configuration suitable for running QDQ models on QNN EP.
@ -109,6 +110,11 @@ def get_qnn_qdq_config(
the zero-point values are 128 and 32,768, respectively.
weight_symmetric: True if weights should be quantized symmetrically (i.e., rmax == -rmin) by default.
Defaults to None. If set to None, weight_symmetric is assumed true if the weight_type is a signed int.
keep_removable_activations: Defaults to false. If true, "removable" activations (e.g., Clip or Relu) will not
be removed, and will be explicitly represented in the QDQ model. If false, these activations
are automatically removed if activations are asymmetrically quantized. Keeping these activations
is necessary if optimizations or EP transformations will later remove
QuantizeLinear/DequantizeLinear operators from the model.
Returns:
A StaticQuantConfig object
@ -160,6 +166,7 @@ def get_qnn_qdq_config(
extra_options = {
"MinimumRealRange": 0.0001,
"DedicatedQDQPair": False, # Let ORT optimizer duplicate DQ nodes
"QDQKeepRemovableActivations": keep_removable_activations,
"TensorQuantOverrides": overrides_helper.get_dict(),
"ActivationSymmetric": activation_symmetric,
"WeightSymmetric": weight_symmetric,

View file

@ -106,8 +106,10 @@ class QDQRemovableActivation(QDQOperatorBase):
if not self.quantizer.is_tensor_quantized(node.input[0]):
return
if not self.quantizer.is_activation_symmetric and self.quantizer.try_replacing_upstream_output(
node.input[0], node.output[0]
if (
not self.quantizer.is_activation_symmetric
and not self.quantizer.qdq_keep_removable_activations
and self.quantizer.try_replacing_upstream_output(node.input[0], node.output[0])
):
self.quantizer.remove_node(self.node)
else:

View file

@ -187,6 +187,10 @@ class QDQQuantizer(BaseQuantizer):
self.qdq_op_domain = ms_domain if extra_options.get("UseQDQContribOps", False) else None
# User can specify if removable activations, like Clip/Relu, should be kept in the graph.
# Used in the QDQRemovableActivation class.
self.qdq_keep_removable_activations = extra_options.get("QDQKeepRemovableActivations", False)
# The ONNX spec did not support 16-bit Q/DQ ops before opset 21.
# So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types
# are 16-bit integers.

View file

@ -186,6 +186,12 @@ class StaticQuantConfig(QuantConfig):
Invalid if also set `scale` or `zero_point`.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if also set `scale` or `zero_point`.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
execution_provider : An enum indicating the Execution Provider, such as CPU, TRT, NNAPI, SNE, etc.
Raises:
ValueError: Raise ValueError if execution provider is unknown
@ -423,6 +429,12 @@ def quantize_static(
Invalid if also set `scale` or `zero_point`.
'rmin' = Float : Override the minimum real tensor value in calibration data.
Invalid if also set `scale` or `zero_point`.
QDQKeepRemovableActivations = True/False:
Default is False. If true, "removable" activations (e.g., Clip or Relu) will not be removed, and
will be explicitly represented in the QDQ model. If false, these activations are automatically
removed if activations are asymmetrically quantized. Keeping these activations is necessary if
optimizations or EP transformations will later remove QuantizeLinear/DequantizeLinear
operators from the model.
"""
if activation_type == QuantType.QFLOAT8E4M3FN or weight_type == QuantType.QFLOAT8E4M3FN:
if calibrate_method != CalibrationMethod.Distribution:

View file

@ -39,6 +39,18 @@ class TestQDQFormat(unittest.TestCase):
class TestQDQExtraOptions(unittest.TestCase):
@classmethod
def setUpClass(cls):
    """Create the temporary directory used for the model files written by these tests.

    The TemporaryDirectory object is kept on the class as `_tmp_model_dir`, and its
    path is stored separately as `_tmp_dir_path`.
    """
    scratch_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.extra_options_")
    cls._tmp_model_dir = scratch_dir

    # Note: to see the models in the local test dir, use the commented assignment
    # below in place of the active one.
    # cls._tmp_dir_path = "."
    cls._tmp_dir_path = scratch_dir.name
@classmethod
def tearDownClass(cls):
    """Clean up the temporary directory object stored in `cls._tmp_model_dir`."""
    scratch_dir = cls._tmp_model_dir
    scratch_dir.cleanup()
def test_qdq_extra_options(self):
# (input)
# |
@ -236,6 +248,123 @@ class TestQDQExtraOptions(unittest.TestCase):
},
)
def test_qdq_keep_removable_activations_option(self):
    """Check that the `QDQKeepRemovableActivations` extra option controls Relu/Clip removal.

    A small float32 model (Conv -> Relu -> Conv -> Clip) is quantized twice to a
    QDQ model with uint8 activations and weights:
      1. With `QDQKeepRemovableActivations` set to True: Relu and Clip must remain.
      2. Without the extra option: Relu and Clip must be removed (default behavior).
    """

    def op_types_in(model_path):
        """Return the set of op types of the nodes in the graph of the model at `model_path`."""
        return {node.op_type for node in onnx.load_model(model_path).graph.node}

    #
    # Create f32 model with Relu and Clip.
    #    input0 ---> Conv ---> Relu ---> Conv ---> Clip ---> output
    #
    shape1 = (1, 1, 3, 3)
    w_shape1 = (2, 1, 2, 2)
    w_shape2 = (2, 2, 2, 2)
    shape3 = (1, 2, 1, 1)

    input0 = onnx.helper.make_tensor_value_info("input0", onnx.TensorProto.FLOAT, shape1)
    output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.FLOAT, shape3)

    # Conv1
    weight1_data = np.random.normal(-1.0, 1.0, w_shape1).astype(np.float32)
    weight1_const = onnx.numpy_helper.from_array(weight1_data, "weight1_const")
    conv1_node = onnx.helper.make_node("Conv", ["input0", "weight1_const"], ["conv1_out"], name="conv1_node")

    # Relu1
    relu1_node = onnx.helper.make_node("Relu", ["conv1_out"], ["relu1_out"], name="relu1_node")

    # Conv2
    weight2_data = np.random.normal(-1.8, 1.8, w_shape2).astype(np.float32)
    weight2_const = onnx.numpy_helper.from_array(weight2_data, "weight2_const")
    conv2_node = onnx.helper.make_node("Conv", ["relu1_out", "weight2_const"], ["conv2_out"], name="conv2_node")

    # Clip1
    min_const = onnx.numpy_helper.from_array(np.array(0.0, dtype=np.float32), "min_const")
    max_const = onnx.numpy_helper.from_array(np.array(0.5, dtype=np.float32), "max_const")
    clip1_node = onnx.helper.make_node(
        "Clip", ["conv2_out", "min_const", "max_const"], ["output"], name="clip1_node"
    )

    graph = onnx.helper.make_graph(
        [conv1_node, relu1_node, conv2_node, clip1_node],
        "keep_qdq_activations",
        [input0],
        [output],
        initializer=[weight1_const, weight2_const, min_const, max_const],
    )
    opset_imports = [
        onnx.helper.make_opsetid("", 18),
    ]
    f32_model = onnx.helper.make_model(graph, opset_imports=opset_imports)
    f32_model = onnx.shape_inference.infer_shapes(f32_model)
    f32_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.onnx")
    onnx.save_model(f32_model, f32_model_path)

    # Create a data reader that feeds five random inputs.
    input_data_list = [{"input0": np.random.randint(-10, 10, shape1).astype(np.float32)} for _ in range(5)]
    data_reader = TestDataFeeds(input_data_list)

    # Every op type in the f32 model is a quantization candidate in both runs.
    op_types_to_quantize = [node.op_type for node in f32_model.graph.node]

    #
    # Quantize model with extra option to KEEP removable activations.
    #
    qdq_model_path = os.path.join(self._tmp_dir_path, "keep.act.model.qdq.onnx")

    # Create u8_act/u8_wgt qdq model
    quantize_static(
        f32_model_path,
        qdq_model_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QUInt8,
        op_types_to_quantize=op_types_to_quantize,
        extra_options={"QDQKeepRemovableActivations": True},
    )

    kept_op_types = op_types_in(qdq_model_path)
    self.assertIn("Relu", kept_op_types)
    self.assertIn("Clip", kept_op_types)

    #
    # Quantize model without extra option. Clip and Relu should be removed by default.
    #
    qdq_model_path = os.path.join(self._tmp_dir_path, "nokeep.act.model.qdq.onnx")
    # The first quantization run has already read from the data reader; rewind it before reuse.
    data_reader.rewind()

    # Create u8_act/u8_wgt qdq model
    quantize_static(
        f32_model_path,
        qdq_model_path,
        data_reader,
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QUInt8,
        weight_type=QuantType.QUInt8,
        op_types_to_quantize=op_types_to_quantize,
    )

    default_op_types = op_types_in(qdq_model_path)
    self.assertNotIn("Relu", default_op_types)
    self.assertNotIn("Clip", default_op_types)
class TestQDQFormatConv(TestQDQFormat):
def check_per_channel_counts(self, model_path, channel_count: int, axis: int = 0):