diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index 3c9b319c78..184fe54a3f 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -165,10 +165,13 @@ def get_qnn_qdq_config( "WeightSymmetric": weight_symmetric, } - # TODO: Remove this extra option once ORT uses an ONNX version that supports 16-bit Q/DQ ops. - overrides_have_int16 = any(t in Q16_TYPES for t in overrides_helper.get_quant_types()) - if activation_type in Q16_TYPES or weight_type in Q16_TYPES or overrides_have_int16: - extra_options["UseQDQContribOps"] = True + # ONNX opset < 21 does not support 16-bit quantization, so must use 'com.microsoft' domain + # on Q/DQ operators if using 16-bit quantization. + onnx_opset = next(x for x in model.opset_import if x.domain == "" or x.domain == "ai.onnx") + if onnx_opset.version < 21: + overrides_have_int16 = any(t in Q16_TYPES for t in overrides_helper.get_quant_types()) + if activation_type in Q16_TYPES or weight_type in Q16_TYPES or overrides_have_int16: + extra_options["UseQDQContribOps"] = True return StaticQuantConfig( calibration_data_reader, diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 2416cf970e..724c3549e2 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -187,20 +187,22 @@ class QDQQuantizer(BaseQuantizer): self.qdq_op_domain = ms_domain if extra_options.get("UseQDQContribOps", False) else None - # The ONNX spec does not yet support 16-bit Q/DQ ops. So, must override the Q/DQ op domain to 'com.microsoft' - # if the activation or weight types are 16-bit integers. 
- # TODO: Remove this override (and use only the 'UseQDQContribOps' option) if/when ONNX adds 16-bit support. - int16_types = (TensorProto.UINT16, TensorProto.INT16) - overrides_have_int16 = any(t.tensor_type in int16_types for t in self.tensor_quant_override_qtypes) - if not self.qdq_op_domain and ( - self.activation_qType in int16_types or self.weight_qType in int16_types or overrides_have_int16 - ): - logging.warning( - "ONNX QuantizeLinear and DequantizeLinear operators do not support 16-bit integer quantization types. " - f"The domain of QuantizeLinear and DequantizeLinear operators will be set to '{ms_domain}' to " - "enable support." - ) - self.qdq_op_domain = ms_domain + # The ONNX spec did not support 16-bit Q/DQ ops before opset 21. + # So, may have to override the Q/DQ op domain to 'com.microsoft' if the activation or weight types + # are 16-bit integers. + if self.opset_version < 21: + int16_types = (TensorProto.UINT16, TensorProto.INT16) + overrides_have_int16 = any(t.tensor_type in int16_types for t in self.tensor_quant_override_qtypes) + if not self.qdq_op_domain and ( + self.activation_qType in int16_types or self.weight_qType in int16_types or overrides_have_int16 + ): + logging.warning( + "ONNX QuantizeLinear and DequantizeLinear operators do not support " + "16-bit integer quantization types prior to opset 21. " + f"The domain of QuantizeLinear and DequantizeLinear operators will be set to '{ms_domain}' to " + "enable support." 
+ ) + self.qdq_op_domain = ms_domain self.quantization_params = self.calc_graph_quant_params() diff --git a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py index ff97e04fb7..8691471b04 100644 --- a/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py +++ b/onnxruntime/test/python/quantization/test_tensor_quant_overrides_option.py @@ -52,7 +52,7 @@ class TestTensorQuantOverridesOption(unittest.TestCase): "OUT": (0, np.float32(0.005075461231172085)), } - def build_float32_model(self): + def build_float32_model(self, opset=13): # (input) # | # Sigmoid @@ -72,11 +72,13 @@ class TestTensorQuantOverridesOption(unittest.TestCase): graph = onnx.helper.make_graph( [sigmoid_node, conv_node], "test", [inp], [out], initializer=[wgt_init, bias_init] ) - model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", 13)]) + model = onnx.helper.make_model(graph, opset_imports=[onnx.helper.make_opsetid("", opset)]) onnx.save(model, "model.onnx") - def perform_qdq_quantization(self, output_model_name, extra_options=None, per_channel=False, activation_type=None): - self.build_float32_model() + def perform_qdq_quantization( + self, output_model_name, extra_options=None, per_channel=False, activation_type=None, opset=13 + ): + self.build_float32_model(opset) if activation_type is None: activation_type = self.default_act_qtype @@ -428,8 +430,9 @@ class TestTensorQuantOverridesOption(unittest.TestCase): def test_16bit_overrides_set_ms_domain(self): """ - Test that overriding a tensor to 16bit (when default is 8bit) automatically sets the 'com.microsoft' - domain on DQ and Q ops. + Test that overriding a tensor to 16bit (when default is 8bit) automatically + sets the 'com.microsoft' domain on DQ and Q ops for opset < 21. + Before ONNX 1.16.0, we had to use the 'com.microsoft' domain to be able to use 16-bit quantization. 
""" qdq_model_name = "model_quant_overrides_to_16bit.onnx" inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization( @@ -441,6 +444,7 @@ class TestTensorQuantOverridesOption(unittest.TestCase): "SIG_OUT": [{"quant_type": QuantType.QUInt16}], } }, + opset=19, ) # Input and Sigmoid's output should be overridden to 16bit @@ -456,6 +460,38 @@ class TestTensorQuantOverridesOption(unittest.TestCase): if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: self.assertEqual(node.domain, ms_domain) + def test_16bit_overrides_not_set_ms_domain(self): + """ + Test that overriding a tensor to 16bit (when default is 8bit) no longer automatically + sets the 'com.microsoft' domain on DQ and Q ops for opset >= 21. + Before ONNX 1.16.0, we had to use the 'com.microsoft' domain to be able to use 16-bit quantization. + """ + qdq_model_name = "model_quant_overrides_to_16bit.onnx" + inp_zp, _, sig_out_zp, _, _, _, _, _, out_zp, _ = self.perform_qdq_quantization( + qdq_model_name, + activation_type=onnx.TensorProto.UINT8, # Default to 8bit activations + extra_options={ + "TensorQuantOverrides": { + "INP": [{"quant_type": QuantType.QUInt16}], + "SIG_OUT": [{"quant_type": QuantType.QUInt16}], + } + }, + opset=21, + ) + + # Input and Sigmoid's output should be overridden to 16bit + self.assertEqual(inp_zp.data_type, onnx.TensorProto.UINT16) + self.assertEqual(sig_out_zp.data_type, onnx.TensorProto.UINT16) + + # Output should be the default uint8 type + self.assertEqual(out_zp.data_type, onnx.TensorProto.UINT8) + + # Q/DQ ops should not have the 'com.microsoft' domain + qdq_model = onnx.load_model(qdq_model_name) + for node in qdq_model.graph.node: + if node.op_type in {"QuantizeLinear", "DequantizeLinear"}: + self.assertNotEqual(node.domain, ms_domain) + def test_override_validation_nonexisting_tensor(self): """ Test that specifying a non-existing tensor should fail.