[QNN EP] Add quantization axis checks for Conv/ConvTranspose/Q/DQ ops (#21016)

### Description
Updates QNN EP to reject Conv/ConvTranspose/Q/DQ ops with unsupported
quantization axis values.



### Motivation and Context
Allows QNN EP to reject these unsupported operators so that they can be handled by the CPU EP instead.

Fixes errors like the following:

> Node 'ConvTranspose' OpType:ConvTranspose with
domain:com.ms.internal.nhwc was inserted using the NHWC format as
requested by QNNExecutionProvider, but was not selected by that EP. This
means the graph is now invalid as there will not be an EP able to run
the node. This could be a bug in layout transformer, or in the
GetCapability implementation of the EP.

---------

Signed-off-by: adrianlizarraga <adlizarraga@microsoft.com>
This commit is contained in:
Adrian Lizarraga 2024-06-17 09:46:14 -07:00 committed by GitHub
parent c501c6ffaf
commit a6c18ae9df
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 139 additions and 20 deletions

View file

@ -283,6 +283,7 @@ ProviderType NodeUnit::GetExecutionProviderType() const noexcept { return target
void NodeUnit::InitForSingleNode() {
const auto& input_defs = target_node_.InputDefs();
const auto& output_defs = target_node_.OutputDefs();
const auto& node_attrs = target_node_.GetAttributes();
auto qlinear_type = GetQLinearOpType(target_node_);
if (qlinear_type == QLinearOpType::Unknown || IsVariadicQLinearOp(qlinear_type)) { // TODO, add variadic support
// Not a Qlinear op, add all inputs / outputs
@ -321,19 +322,35 @@ void NodeUnit::InitForSingleNode() {
// DequantizeLinear has 3 inputs
// x, x_scale, x_zp
// output is not quantized
inputs_.push_back(NodeUnitIODef{*input_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3
? input_defs[2]
: nullptr}});
// Get the DQ axis attribute if available.
std::optional<int64_t> axis;
if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) {
axis = entry->second.i();
}
inputs_.push_back(NodeUnitIODef{*input_defs[0],
NodeUnitIODef::QuantParam{*input_defs[1],
input_defs.size() == 3 ? input_defs[2] : nullptr,
axis}});
outputs_.push_back(NodeUnitIODef{*output_defs[0], std::nullopt});
} else if (qlinear_type == QLinearOpType::QuantizeLinear) {
// For QuantizeLinear, the input is not quantized. The op has 3 inputs:
// x, y_scale, y_zp (optional)
// The output is quantized
// Get the Q axis attribute if available.
std::optional<int64_t> axis;
if (auto entry = node_attrs.find("axis"); entry != node_attrs.end()) {
axis = entry->second.i();
}
inputs_.push_back(NodeUnitIODef{*input_defs[0], std::nullopt});
outputs_.push_back(NodeUnitIODef{*output_defs[0], NodeUnitIODef::QuantParam{*input_defs[1], input_defs.size() == 3
? input_defs[2]
: nullptr}});
outputs_.push_back(NodeUnitIODef{*output_defs[0],
NodeUnitIODef::QuantParam{*input_defs[1],
input_defs.size() == 3 ? input_defs[2] : nullptr,
axis}});
} else {
ORT_THROW("The QLinear op [", static_cast<uint8_t>(qlinear_type), "] is not supported");
}

View file

@ -120,7 +120,8 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
if (is_npu_backend) {
const auto& input_1 = inputs[1]; // weight
bool is_per_axis_quant = false;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant));
int64_t quant_axis = 0;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(input_1, is_per_axis_quant, quant_axis));
if (is_per_axis_quant) {
int32_t elem_data_type = 0;
@ -129,6 +130,13 @@ Status ConvOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
const bool is_signed_type = (elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT8) ||
(elem_data_type == ONNX_NAMESPACE::TensorProto_DataType_INT16);
ORT_RETURN_IF_NOT(is_signed_type, "Conv weights must be of a signed quantized type if quantized per-channel");
if (conv_type == OnnxConvType::kConvTranspose) {
ORT_RETURN_IF_NOT(quant_axis == 1,
"ConvTranspose's input[1] must be use axis == 1 for per-channel quantization");
} else {
ORT_RETURN_IF_NOT(quant_axis == 0, "Conv's input[1] must be use axis == 0 for per-channel quantization");
}
}
}

View file

@ -41,7 +41,7 @@ class SimpleOpBuilder : public BaseOpBuilder {
QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT;
private:
Status ExplicitOpCheck(const NodeUnit& node_unit) const;
Status ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit) const;
Status ProcessSigmoidOrTanhOutput(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit,
std::vector<std::string>&& input_names,
@ -138,7 +138,8 @@ Status SimpleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
return Status::OK();
}
Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
Status SimpleOpBuilder::ExplicitOpCheck(QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& node_unit) const {
const std::string& op_type = node_unit.OpType();
if (op_type == "GridSample") {
@ -158,6 +159,20 @@ Status SimpleOpBuilder::ExplicitOpCheck(const NodeUnit& node_unit) const {
"QNN EP only supports Min and Max operators with exactly 2 inputs.");
}
if (op_type == "DequantizeLinear") {
bool is_per_chan_quant = false;
int64_t quant_axis = 0;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Inputs()[0], is_per_chan_quant, quant_axis));
ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone DQ op with per-channel quantization");
}
if (op_type == "QuantizeLinear") {
bool is_per_chan_quant = false;
int64_t quant_axis = 0;
ORT_RETURN_IF_ERROR(qnn_model_wrapper.IsPerChannelQuantized(node_unit.Outputs()[0], is_per_chan_quant, quant_axis));
ORT_RETURN_IF(is_per_chan_quant, "QNN EP does not support a standalone Q op with per-channel quantization");
}
return Status::OK();
}
@ -475,7 +490,7 @@ Status SimpleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_w
const std::string& op_type = node_unit.OpType();
if (do_op_validation) {
ORT_RETURN_IF_ERROR(ExplicitOpCheck(node_unit));
ORT_RETURN_IF_ERROR(ExplicitOpCheck(qnn_model_wrapper, node_unit));
// Skip the op validation for DepthToSpace & SpaceToDepth if it's not NHWC data layout
if (node_unit.Domain() != kMSInternalNHWCDomain && (op_type == "DepthToSpace" || op_type == "SpaceToDepth" || op_type == "GridSample")) {
return Status::OK();

View file

@ -412,9 +412,10 @@ Status QnnModelWrapper::UnpackScales(const std::string& initializer_name, std::v
// Checks if a tensor in the ONNX graph is per-channel quantized.
Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def,
/*out*/ bool& is_per_axis) const {
/*out*/ bool& is_per_channel,
/*out*/ int64_t& axis) const {
if (!io_def.quant_param) {
is_per_axis = false;
is_per_channel = false;
return Status::OK();
}
@ -432,7 +433,12 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef&
const bool is_scalar_or_1_elem_vector = scale_shape.NumDimensions() == 0 ||
(scale_shape.NumDimensions() == 1 && scale_shape.Size() == 1);
is_per_axis = !is_scalar_or_1_elem_vector;
is_per_channel = !is_scalar_or_1_elem_vector;
if (is_per_channel) {
axis = io_def.quant_param->axis.value_or(1); // 1 is default axis for Q/DQ ops.
}
return Status::OK();
}

View file

@ -218,8 +218,10 @@ class QnnModelWrapper {
// Unpack zero-points from initializer and convert to int32_t (1 zero-point for per-tensor, > 1 for per-channel).
Status UnpackZeroPoints(const std::string& initializer_name, std::vector<int32_t>& zero_points) const;
// Checks if a tensor in the ONNX graph is per-axis quantized.
Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def, /*out*/ bool& is_per_axis) const;
// Checks if a tensor in the ONNX graph is per-channel quantized.
Status IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& io_def,
/*out*/ bool& is_per_channel,
/*out*/ int64_t& axis) const;
private:
bool CreateQnnInputOutputTensors(const std::string& qnn_node_name,

View file

@ -154,6 +154,7 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
const TestInputDef<float>& bias_def,
int64_t weight_quant_axis,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
@ -161,8 +162,9 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
const std::string& auto_pad = "NOTSET",
bool use_contrib_qdq = false) {
return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
dilations, group, auto_pad, use_contrib_qdq](ModelTestBuilder& builder,
std::vector<QuantParams<ActivationQType>>& output_qparams) {
dilations, group, auto_pad, use_contrib_qdq,
weight_quant_axis](ModelTestBuilder& builder,
std::vector<QuantParams<ActivationQType>>& output_qparams) {
std::vector<NodeArg*> conv_inputs;
// input -> Q/DQ ->
@ -174,7 +176,6 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
// Quantized(weights) -> DQ ->
ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
int64_t weight_quant_axis = conv_op_type == "Conv" ? 0 : 1; // 0 for Conv, 1 for ConvTranspose
std::vector<float> weight_scales;
std::vector<WeightQType> weight_zero_points;
GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
@ -283,6 +284,7 @@ template <typename ActivationQType, typename WeightQType>
static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
const TestInputDef<float>& bias_def,
int64_t weight_quant_axis,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
@ -303,8 +305,9 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
group, auto_pad);
auto qdq_fn = BuildQDQPerChannelConvTestCase<ActivationQType, WeightQType>(conv_op_type, input_def, weights_def,
bias_def, strides, pads, dilations,
group, auto_pad, use_contrib_qdq);
bias_def, weight_quant_axis, strides,
pads, dilations, group, auto_pad,
use_contrib_qdq);
TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance);
}
@ -713,6 +716,7 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
@ -723,6 +727,34 @@ TEST_F(QnnHTPBackendTests, ConvU8S8S32_PerChannel) {
13); // opset
}
// Verifies that a per-channel QDQ Conv whose weight quantization axis is not 0
// is not assigned to QNN EP (expected assignment: None).
TEST_F(QnnHTPBackendTests, Conv_PerChannel_UnsupportedAxis) {
  const std::vector<int64_t> in_dims{1, 2, 4, 4};
  const std::vector<int64_t> w_dims{3, 2, 3, 3};
  const std::vector<int64_t> b_dims{3};

  // Any axis other than 0 is expected to be rejected for Conv weights.
  constexpr int64_t kUnsupportedWeightAxis = 2;

  // Dynamic input; weights and bias are initializers.
  TestInputDef<float> input_def(in_dims, false,
                                GetFloatDataInRange(-10.0f, 10.0f, TensorShape(in_dims).Size()));
  TestInputDef<float> weight_def(w_dims, true,
                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(w_dims).Size()));
  TestInputDef<float> bias_def(b_dims, true,
                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(b_dims).Size()));

  RunHTPConvOpPerChannelTest<uint8_t, int8_t>("Conv",
                                              input_def,
                                              weight_def,
                                              bias_def,
                                              kUnsupportedWeightAxis,  // weight quant axis
                                              {1, 1},                  // Strides
                                              {0, 0, 0, 0},            // Pads
                                              {1, 1},                  // Dilations
                                              1,                       // default group
                                              "NOTSET",
                                              ExpectedEPNodeAssignment::None,
                                              false,  // use_qdq_contrib_ops
                                              13);    // opset
}
// Test per-channel QDQ Conv. in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8
// \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::QNN_Conv3d_w_scale
// \QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x1a preparation failed with err:-1
@ -748,6 +780,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1, 1}, // Strides
{0, 0, 0, 0, 0, 0}, // Pads
{1, 1, 1}, // Dilations
@ -776,6 +809,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU8S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
@ -811,6 +845,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U8S8S32_PerChannel2) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1, 1}, // Strides
{0, 0, 0, 0, 0, 0}, // Pads
{1, 1, 1}, // Dilations
@ -838,6 +873,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
1, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
@ -848,6 +884,34 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU8S8S32_PerChannel) {
13); // opset
}
// Verifies that a per-channel QDQ ConvTranspose whose weight quantization axis is not 1
// is not assigned to QNN EP (expected assignment: None).
TEST_F(QnnHTPBackendTests, ConvTranspose_PerChannel_UnsupportedAxis) {
  const std::vector<int64_t> in_dims{1, 2, 4, 4};
  const std::vector<int64_t> w_dims{2, 3, 3, 3};
  const std::vector<int64_t> b_dims{3};

  // Any axis other than 1 is expected to be rejected for ConvTranspose weights.
  constexpr int64_t kUnsupportedWeightAxis = 2;

  // Dynamic input; weights and bias are initializers.
  TestInputDef<float> input_def(in_dims, false,
                                GetFloatDataInRange(-10.0f, 10.0f, TensorShape(in_dims).Size()));
  TestInputDef<float> weight_def(w_dims, true,
                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(w_dims).Size()));
  TestInputDef<float> bias_def(b_dims, true,
                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(b_dims).Size()));

  RunHTPConvOpPerChannelTest<uint8_t, int8_t>("ConvTranspose",
                                              input_def,
                                              weight_def,
                                              bias_def,
                                              kUnsupportedWeightAxis,  // weight quant axis
                                              {1, 1},                  // Strides
                                              {0, 0, 0, 0},            // Pads
                                              {1, 1},                  // Dilations
                                              1,                       // default group
                                              "NOTSET",
                                              ExpectedEPNodeAssignment::None,
                                              false,  // use_qdq_contrib_ops
                                              13);    // opset
}
// ConvTranspose3D per-channel
// Disabled for 2.21 since it failed; re-enable it for 2.22
TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) {
@ -866,6 +930,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U8S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
1, // weight quant axis
{1, 1, 1}, // Strides
{0, 0, 0, 0, 0, 0}, // Pads
{1, 1, 1}, // Dilations
@ -893,6 +958,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
@ -928,6 +994,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1, 1}, // Strides
{0, 0, 0, 0, 0, 0}, // Pads
{1, 1, 1}, // Dilations
@ -955,6 +1022,7 @@ TEST_F(QnnHTPBackendTests, ConvTransposeU16S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
1, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
@ -982,6 +1050,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvTranspose3D_U16S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
1, // weight quant axis
{1, 1, 1}, // Strides
{0, 0, 0, 0, 0, 0}, // Pads
{1, 1, 1}, // Dilations
@ -1010,6 +1079,7 @@ TEST_F(QnnHTPBackendTests, ConvDepthwiseU16S8S32_PerChannel) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1}, // Strides
{0, 0, 0, 0}, // Pads
{1, 1}, // Dilations
@ -1045,6 +1115,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_Conv3D_U16S8S32_PerChannel2) {
input_def,
weight_def,
bias_def,
0, // weight quant axis
{1, 1, 1}, // Strides
{0, 0, 0, 0, 0, 0}, // Pads
{1, 1, 1}, // Dilations