ORT 1.19.0 Release: Cherry-Pick Round 0 (#21609)

### Description
<!-- Describe your changes. -->

Critical changes required for an external developer (GeekBench)
 

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

ORT 1.19.0 Release Preparation

---------

Co-authored-by: Adrian Lizarraga <adlizarraga@microsoft.com>
This commit is contained in:
Prathik Rao 2024-08-03 22:04:57 -07:00 committed by GitHub
parent 530a2d7b41
commit ee2fe87e2d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
20 changed files with 1824 additions and 502 deletions

View file

@ -4,6 +4,7 @@
#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
#include "node_unit.h"
#include <utility>
#include "core/graph/graph_viewer.h"
namespace onnxruntime {
@ -272,6 +273,20 @@ NodeUnit::NodeUnit(const GraphViewer& graph_viewer, const QDQ::NodeGroup& node_g
}
}
// Builds a NodeUnit from explicitly supplied parts rather than deriving them from a graph.
// The DQ/Q node pointers and the input/output definitions are copied out of the given spans,
// target_node_ is initialized from target_node, and output_edges is taken by value and moved
// into the unit. The constructor body is empty: no validation of the supplied parts happens here.
NodeUnit::NodeUnit(gsl::span<const Node* const> dq_nodes, const Node& target_node,
gsl::span<const Node* const> q_nodes, Type unit_type,
gsl::span<const NodeUnitIODef> inputs, gsl::span<const NodeUnitIODef> outputs,
size_t input_edge_count, Node::EdgeSet output_edges)
: dq_nodes_(dq_nodes.begin(), dq_nodes.end()),
target_node_(target_node),
q_nodes_(q_nodes.begin(), q_nodes.end()),
type_(unit_type),
inputs_(inputs.begin(), inputs.end()),
outputs_(outputs.begin(), outputs.end()),
input_edge_count_(input_edge_count),
output_edges_(std::move(output_edges)) {
}
// Returns the operator domain of the unit's target node.
const std::string& NodeUnit::Domain() const noexcept { return target_node_.Domain(); }
// Returns the operator type of the unit's target node (not of any wrapping DQ/Q nodes).
const std::string& NodeUnit::OpType() const noexcept { return target_node_.OpType(); }
// Returns the name of the unit's target node.
const std::string& NodeUnit::Name() const noexcept { return target_node_.Name(); }

View file

@ -68,6 +68,10 @@ class NodeUnit {
public:
explicit NodeUnit(const Node& node);
explicit NodeUnit(const GraphViewer& graph_viewer, const QDQ::NodeGroup& node_group);
// Constructs a NodeUnit from explicitly provided DQ nodes, target node, Q nodes, unit type,
// input/output definitions, input edge count, and output edges.
// output_edges is taken by value so that the caller can move an edge set into the unit.
NodeUnit(gsl::span<const Node* const> dq_nodes, const Node& target_node,
gsl::span<const Node* const> q_nodes, Type unit_type,
gsl::span<const NodeUnitIODef> inputs, gsl::span<const NodeUnitIODef> outputs,
size_t input_edge_count, Node::EdgeSet output_edges);
Type UnitType() const noexcept { return type_; }

View file

@ -1,294 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/qnn/builder/qnn_fusions.h"
#include <limits>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "core/graph/graph_utils.h"
#include "core/optimizer/qdq_transformer/qdq_util.h"
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
// Evaluates `expr` (which must yield a Status-like object) exactly once. If the result is not OK, the
// error message is logged at VERBOSE level through `logger` and the *enclosing function* returns
// Status::OK(). This deliberately swallows the error: it is used for "try to fuse" checks where a
// failure only means the fusion does not apply. Wrapped in do/while(0) so it behaves as one statement.
#define QNN_RETURN_OK_IF_ERROR(expr, logger) \
do { \
auto _status = (expr); \
if ((!_status.IsOK())) { \
LOGS((logger), VERBOSE) << _status.ErrorMessage(); \
return Status::OK(); \
} \
} while (0)
namespace onnxruntime {
namespace qnn {
/**
* Tries to merge a DQ -> Q sequence into a QNN Convert operator. The DQ -> Q must be converting from
* one quantization type (e.g., uint8_t) to another (e.g., uint16_t).
*
* The function works in two phases. It first checks the graph pattern and validates the prospective
* Convert node without modifying qnn_model_wrapper; if the pattern does not match or validation fails,
* it returns Status::OK() and leaves fused_nodes untouched so the caller can use the normal OpBuilder path.
* Only after validation passes does it add tensors and the Convert node to qnn_model_wrapper.
*
* \param fused_nodes Output list of node units that were fused. Remains empty if fusion is not applied.
* On success the DQ node unit and the Q node unit are appended (in that order).
* \param qnn_model_wrapper The QNN model that is being built.
* \param start_node_unit The node unit that could potentially start the DQ -> Q sequence.
* \param node_unit_map Maps a node to its node unit.
* \param handled_node_units Set of node units that have already been processed. Fusion will not fuse nodes
* in this set.
* \param logger The logger.
* \param do_op_validation True if should call QNN operator validation APIs.
* \return An onnxruntime::Status. A non-OK status is returned if the Q node has no entry in node_unit_map,
* or if adding the tensors/node to qnn_model_wrapper fails after validation has passed.
*/
static Status TryHandleConvertSequence(std::vector<const NodeUnit*>& fused_nodes,
QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& start_node_unit,
const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
const std::unordered_set<const NodeUnit*>& handled_node_units,
const logging::Logger& logger,
bool do_op_validation) {
const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer();
// Looking for a standalone DQ to start the sequence.
if (start_node_unit.OpType() != QDQ::DQOpName || start_node_unit.UnitType() != NodeUnit::Type::SingleNode) {
return Status::OK();
}
const Node& dq_node = start_node_unit.GetNode();
// DQ must have a single Q child. DQ must not produce a graph output.
auto children = graph_utils::FindChildrenByType(dq_node, QDQ::QOpName);
if (children.size() != 1 || dq_node.GetOutputEdgesCount() != 1 || graph_viewer.NodeProducesGraphOutput(dq_node)) {
return Status::OK();
}
const Node& q_node = *children[0];
// Unlike the pattern checks above, a missing NodeUnit is treated as a real error, not "no fusion".
const auto q_node_unit_it = node_unit_map.find(&q_node);
ORT_RETURN_IF(q_node_unit_it == node_unit_map.end(), "Node does not have a corresponding NodeUnit");
const NodeUnit* q_node_unit = q_node_unit_it->second;
// Check if Q node has already been handled. Should not be the case if this
// fusion function has been called in topological order, but check to be safe.
if (handled_node_units.count(q_node_unit) != 0) {
return Status::OK();
}
// Q child must not already be part of a QDQ NodeUnit (i.e., be standalone).
if (q_node_unit->UnitType() != NodeUnit::Type::SingleNode) {
return Status::OK();
}
auto get_const_initializer = [&graph_viewer](const std::string& initializer_name) {
return graph_viewer.GetConstantInitializer(initializer_name, true);
};
// DQ and Q must have equal scale type and different zp type.
if (!QDQ::IsDQQConversion(dq_node, q_node, get_const_initializer, graph_viewer.ModelPath())) {
return Status::OK();
}
// NOTE(review): validation below is run under the DQ node unit's name, while the node that is actually
// created further down is named after the Q node unit — confirm this difference is intended.
const auto& node_name = utils::GetNodeName(start_node_unit);
// The fused Convert reads the DQ's (quantized) input and writes the Q's (quantized) output.
const NodeUnitIODef& input_def = start_node_unit.Inputs()[0];
const NodeUnitIODef& output_def = q_node_unit->Outputs()[0];
QnnTensorWrapper input_tensor;
QnnTensorWrapper output_tensor;
// Run QNN validation on the final fused node before committing to doing a fusion.
// Importantly, this validation process does not modify the qnn_model_wrapper.
// If validation fails here, we return Status::OK() to allow QNN EP to use the normal OpBuilder workflow.
QNN_RETURN_OK_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(input_def, input_tensor), logger);
QNN_RETURN_OK_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(output_def, output_tensor), logger);
QNN_RETURN_OK_IF_ERROR(qnn_model_wrapper.ValidateQnnNode(node_name,
QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT,
{input_tensor.GetQnnTensor()},
{output_tensor.GetQnnTensor()},
{}),
logger);
// Validation passed, so we're now committed to doing a fusion. The following statements modify qnn_model_wrapper.
// If we encounter an error, we return it directly to caller.
LOGS(logger, VERBOSE) << " Adding QNN Convert via fusion. dq_node name: [" << dq_node.Name()
<< "] dq_node optype: [" << dq_node.OpType()
<< "] q_node name: [" << q_node_unit->Name()
<< "] q_node optype: [" << q_node_unit->OpType()
<< "]";
// Add a QNN Convert to the model. Get the input from the DQ node, and the output from the Q node.
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensor)), "Failed to add input");
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensor)), "Failed to add output");
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(*q_node_unit),
QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_CONVERT,
{input_def.node_arg.Name()},
{output_def.node_arg.Name()},
{},
do_op_validation),
"Failed to add fused Convert node.");
// Report both node units as consumed by the fusion so the caller can mark them handled.
fused_nodes.push_back(&start_node_unit);
fused_nodes.push_back(q_node_unit);
return Status::OK();
}
/**
* Tries to fuse the sequence `x * HardSigmoid<alpha=1/6, beta=0.5>(x)` into a single HardSwish(x) operator.
* Should be called in a topologically ordered iteration of node units.
*
* The function first checks the graph pattern and validates the prospective HardSwish node without
* modifying qnn_model_wrapper; if the pattern does not match or validation fails, it returns Status::OK()
* and leaves fused_nodes untouched. Only after validation passes does it modify qnn_model_wrapper.
*
* \param fused_nodes Output list of node units that were fused. Remains empty if fusion was not applied.
* On success the HardSigmoid node unit and the Mul node unit are appended (in that order).
* \param qnn_model_wrapper The QNN model that is being built.
* \param start_node_unit The node unit that could potentially start the sequence.
* \param node_unit_map Maps a node to its node unit.
* \param handled_node_units Set of node units that have already been processed. Fusion will not fuse nodes
* in this set.
* \param logger The logger.
* \param do_op_validation True if should call QNN operator validation APIs.
* \return A Status indicating a potential failure. A non-OK status is returned if the Mul node has no entry
* in node_unit_map, or if adding the tensors/node to qnn_model_wrapper fails after validation has passed.
*/
static Status TryHandleHardSigmoidSequence(std::vector<const NodeUnit*>& fused_nodes,
QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& start_node_unit,
const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
const std::unordered_set<const NodeUnit*>& handled_node_units,
const logging::Logger& logger,
bool do_op_validation) {
// Looking for a standalone HardSigmoid to start the sequence.
if (start_node_unit.OpType() != "HardSigmoid" || start_node_unit.UnitType() != NodeUnit::Type::SingleNode) {
return Status::OK();
}
// 0.2f and 0.5f are passed as the fallback values for the attribute lookups.
NodeAttrHelper hs_attr_helper(start_node_unit);
float alpha = hs_attr_helper.Get("alpha", 0.2f);
float beta = hs_attr_helper.Get("beta", 0.5f);
// HardSwish(x) == x * HardSigmoid(x) only for alpha = 1/6 and beta = 0.5.
// The tolerance is relative: machine epsilon scaled by the required value.
constexpr float req_alpha = 1.0f / 6.0f;
constexpr float req_beta = 0.5f;
constexpr float alpha_eps = std::numeric_limits<float>::epsilon() * req_alpha;
constexpr float beta_eps = std::numeric_limits<float>::epsilon() * req_beta;
// Check for explicit values of alpha and beta.
// NOTE(review): std::abs on float needs <cmath>, which is not among this file's visible includes —
// confirm it is pulled in transitively.
if (std::abs(alpha - req_alpha) > alpha_eps || std::abs(beta - req_beta) > beta_eps) {
return Status::OK();
}
const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer();
const Node& hs_node = start_node_unit.GetNode();
// HardSigmoid must have a single Mul child. HardSigmoid must not produce a graph output.
auto children = graph_utils::FindChildrenByType(hs_node, "Mul");
if (children.size() != 1 || hs_node.GetOutputEdgesCount() != 1 || graph_viewer.NodeProducesGraphOutput(hs_node)) {
return Status::OK();
}
const Node& mul_node = *children[0];
// Unlike the pattern checks above, a missing NodeUnit is treated as a real error, not "no fusion".
const auto mul_node_unit_it = node_unit_map.find(&mul_node);
ORT_RETURN_IF(mul_node_unit_it == node_unit_map.end(), "Node does not have a corresponding NodeUnit");
const NodeUnit* mul_node_unit = mul_node_unit_it->second;
// Check if Mul node has already been handled. Should not be the case if this
// fusion function has been called in topological order, but check to be safe.
if (handled_node_units.count(mul_node_unit) != 0) {
return Status::OK();
}
// Mul child must not already be part of a QDQ NodeUnit (i.e., be standalone).
if (mul_node_unit->UnitType() != NodeUnit::Type::SingleNode) {
return Status::OK();
}
// Input to HardSigmoid must also be the other input to the Mul.
// Either Mul operand position is accepted for x.
auto& hs_input_name = start_node_unit.Inputs()[0].node_arg.Name();
const bool same_root_input = mul_node.InputDefs()[0]->Name() == hs_input_name ||
mul_node.InputDefs()[1]->Name() == hs_input_name;
if (!same_root_input) {
return Status::OK();
}
// The fused HardSwish reads the HardSigmoid's input and writes the Mul's output.
const auto& node_name = utils::GetNodeName(start_node_unit);
const NodeUnitIODef& input_def = start_node_unit.Inputs()[0];
const NodeUnitIODef& output_def = mul_node_unit->Outputs()[0];
QnnTensorWrapper input_tensor;
QnnTensorWrapper output_tensor;
// Run QNN validation on the final fused node before committing to doing a fusion.
// Importantly, this validation process does not modify the qnn_model_wrapper.
// If validation fails here, we return Status::OK() to allow QNN EP to use the normal OpBuilder workflow.
QNN_RETURN_OK_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(input_def, input_tensor), logger);
QNN_RETURN_OK_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(output_def, output_tensor), logger);
QNN_RETURN_OK_IF_ERROR(qnn_model_wrapper.ValidateQnnNode(node_name,
QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_HARD_SWISH,
{input_tensor.GetQnnTensor()},
{output_tensor.GetQnnTensor()},
{}),
logger);
// Validation passed, so we're now committed to doing a fusion. The following statements modify qnn_model_wrapper.
// If we encounter an error, we return it directly to caller.
LOGS(logger, VERBOSE) << " Adding QNN HardSwish via fusion. HardSigmoid name: [" << start_node_unit.Name()
<< "] Mul name: [" << mul_node_unit->Name() << "]";
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensor)), "Failed to add input");
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensor)), "Failed to add output");
ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(node_name,
QNN_OP_PACKAGE_NAME_QTI_AISW,
QNN_OP_HARD_SWISH,
{input_def.node_arg.Name()},
{output_def.node_arg.Name()},
{},
do_op_validation),
"Failed to add fused HardSwish node.");
// Report both node units as consumed by the fusion so the caller can mark them handled.
fused_nodes.push_back(&start_node_unit);
fused_nodes.push_back(mul_node_unit);
return Status::OK();
}
// Pointer to a fusion function. The arguments are, in order: the output list of fused node units, the QNN
// model wrapper, the starting node unit, the node -> node unit map, the set of already-handled node units,
// the logger, and the flag that controls QNN operator validation.
using FusionFunc = Status (*)(std::vector<const NodeUnit*>&,
QnnModelWrapper&,
const NodeUnit&,
const std::unordered_map<const Node*, const NodeUnit*>&,
const std::unordered_set<const NodeUnit*>&,
const logging::Logger&,
bool);
/**
 * Tries to fuse a node sequence that begins at `starting_node` by dispatching to the fusion function
 * registered for the node unit's operator type.
 *
 * \param fused_nodes Output list of node units that were fused. It is cleared on entry, so it is empty on
 *                    return whenever no fusion was applied (non-standalone node unit, no fusion registered
 *                    for the operator type, or the registered fusion did not match).
 * \param qnn_model_wrapper The QNN model that is being built.
 * \param starting_node The node unit that could potentially start the sequence.
 * \param node_unit_map Maps a node to its node unit.
 * \param handled_node_units Set of node units that have already been processed.
 * \param logger The logger.
 * \param validate Forwarded to the fusion function as its QNN operator validation flag.
 * \return The status of the selected fusion function, or OK if no fusion function was attempted.
 */
Status TryFusions(/*out*/ std::vector<const NodeUnit*>& fused_nodes,
                  QnnModelWrapper& qnn_model_wrapper,
                  const NodeUnit& starting_node,
                  const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
                  const std::unordered_set<const NodeUnit*>& handled_node_units,
                  const logging::Logger& logger,
                  bool validate) {
  // Maps a starting operator type to the fusion function.
  // The table is never modified after construction, so it is declared const.
  static const std::unordered_map<std::string, FusionFunc> fusions = {
      {"DequantizeLinear", TryHandleConvertSequence},
      {"HardSigmoid", TryHandleHardSigmoidSequence},
  };

  // Callers detect a fusion by checking whether fused_nodes is non-empty, so stale entries must be
  // dropped on every path, including the early returns below.
  fused_nodes.clear();

  // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes).
  if (starting_node.UnitType() != NodeUnit::Type::SingleNode) {
    return Status::OK();
  }

  const auto iter = fusions.find(starting_node.OpType());
  if (iter == fusions.end()) {
    return Status::OK();  // No fusion starts with this operator type.
  }

  const FusionFunc fusion_func = iter->second;
  ORT_RETURN_IF_ERROR(fusion_func(fused_nodes, qnn_model_wrapper, starting_node, node_unit_map,
                                  handled_node_units, logger, validate));
  return Status::OK();
}
} // namespace qnn
} // namespace onnxruntime

View file

@ -1,38 +0,0 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
namespace onnxruntime {
namespace qnn {
/**
* Tries to fuse a node sequence starting from the given starting node. Should be called in a topologically ordered
* walk of node units.
*
* Only standalone node units (NodeUnit::Type::SingleNode) are considered as starting points; for any other
* node unit the function returns an OK status without attempting a fusion.
*
* \param fused_nodes Output list of node units that were fused. Remains empty if fusion was not applied.
* \param qnn_model_wrapper The QNN model that is being built.
* \param starting_node The node unit that could potentially start the sequence.
* \param node_unit_map Maps a node to its node unit.
* \param handled_node_units Set of node units that have already been processed. Fusion will not fuse nodes
* in this set.
* \param logger The logger.
* \param do_op_validation True if should call QNN operator validation APIs.
* \return A Status indicating a potential failure.
*/
Status TryFusions(/*out*/ std::vector<const NodeUnit*>& fused_nodes,
QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& starting_node,
const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
const std::unordered_set<const NodeUnit*>& handled_node_units,
const logging::Logger& logger,
bool do_op_validation);
} // namespace qnn
} // namespace onnxruntime

View file

@ -7,7 +7,7 @@
#include "QnnOpDef.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/providers/qnn/builder/qnn_fusions.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
#include "core/providers/shared/utils/utils.h"
#include "core/framework/utils.h"
#include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h"
@ -117,49 +117,20 @@ Status QnnModel::ComposeGraph(const GraphViewer& graph_viewer,
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to initialize qnn_model_wrapper.");
}
std::unordered_set<const NodeUnit*> handled_node_units;
std::vector<std::unique_ptr<qnn::IQnnNodeGroup>> qnn_node_groups;
qnn_node_groups.reserve(node_unit_holder.size());
// Op builder
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
for (size_t i = 0; i < node_indices.size(); i++) {
const auto* node(graph_viewer.GetNode(node_indices[i]));
ORT_RETURN_IF_ERROR(qnn::GetQnnNodeGroups(qnn_node_groups, qnn_model_wrapper, node_unit_map,
node_unit_holder.size(), logger_));
// Check whether it's part of NodeUnit
const NodeUnit& node_unit = GetNodeUnit(node, node_unit_map);
// Q, DQ nodes in the node unit only carry the quantization parameters
// Add the QNN node when it is the target node (It's a normal node or a single Q/DQ node)
const std::string& op_type = node_unit.OpType();
for (const std::unique_ptr<qnn::IQnnNodeGroup>& qnn_node_group : qnn_node_groups) {
Status status = qnn_node_group->AddToModelBuilder(qnn_model_wrapper, logger_);
if (node != &node_unit.GetNode()) {
continue;
if (!status.IsOK()) {
LOGS(logger_, ERROR) << "[QNN EP] Failed to add supported node to QNN graph during EP's compile call: "
<< status.ErrorMessage() << std::endl;
return status;
}
if (handled_node_units.count(&node_unit) != 0) {
continue; // Already handled.
}
// Try to see if this node unit can be fused.
std::vector<const NodeUnit*> fused_nodes;
ORT_RETURN_IF_ERROR(TryFusions(fused_nodes, qnn_model_wrapper, node_unit, node_unit_map,
handled_node_units, logger_, false /*do_op_validation*/));
if (!fused_nodes.empty()) {
for (auto fused_node_unit : fused_nodes) {
handled_node_units.insert(fused_node_unit);
}
continue;
}
LOGS(logger_, VERBOSE) << " node name: [" << node->Name()
<< "] node optype: [" << op_type
<< "] as part of the NodeUnit type: [" << node_unit.OpType()
<< "] name: [" << node_unit.Name()
<< "]";
if (const auto* op_builder = GetOpBuilder(op_type)) {
ORT_RETURN_IF_ERROR(op_builder->AddToModelBuilder(qnn_model_wrapper, node_unit, logger_));
}
handled_node_units.insert(&node_unit);
}
ORT_RETURN_IF_NOT(qnn_model_wrapper.ComposeQnnGraph(), "Failed to compose Qnn graph.");

View file

@ -239,6 +239,8 @@ bool QnnModelWrapper::CreateQnnNode(const std::string& qnn_node_name,
std::string error_msg;
bool rt = op_config_wrapper.QnnGraphOpValidation(qnn_interface_, backend_handle_, error_msg);
if (!rt) {
// TODO(adrianlizarraga): Return a Status with the error message so that aggregated logs show a more
// specific validation error (instead of "failed to add node").
LOGS(logger_, WARNING) << error_msg;
}
return rt;
@ -617,6 +619,12 @@ Status QnnModelWrapper::UnpackInitializerData(const ONNX_NAMESPACE::TensorProto&
auto dst = gsl::make_span(reinterpret_cast<int8_t*>(unpacked_tensor.data()), unpacked_tensor.size());
auto src = gsl::make_span(reinterpret_cast<const Int4x2*>(packed_int4_bytes.data()), packed_int4_bytes.size());
ORT_RETURN_IF_NOT(Int4x2::Unpack(dst, src), "Failed to unpack Tensor<Int4x2> for QNN");
// NOTE: Masking off top 4 bits to workaround a QNN INT4 accuracy bug.
// Docs explicitly state that masking off top 4 bits should not be required.
for (size_t i = 0; i < dst.size(); i++) {
dst[i] &= 0x0F; // -3 (0b1111_1101) becomes 13 (0b0000_1101)
}
} else if (onnx_data_type == ONNX_NAMESPACE::TensorProto_DataType_UINT4) {
TensorShape shape = onnxruntime::utils::GetTensorShapeFromTensorProto(initializer);
const size_t num_elems = shape.Size();

View file

@ -0,0 +1,68 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <gsl/gsl>
#include <memory>
#include <unordered_map>
#include <vector>
#include "core/common/logging/logging.h"
#include "core/framework/node_unit.h"
namespace onnxruntime {
namespace qnn {
class QnnModelWrapper;
/// <summary>
/// Represents a group of NodeUnits that QNN EP translates into a core QNN operator. Can represent a single NodeUnit
/// or a fusion of multiple NodeUnits (e.g., DQ* -> Conv -> Relu -> Q).
/// This is a pure interface: every member other than the destructor is pure virtual.
/// </summary>
class IQnnNodeGroup {
public:
// Virtual so that implementations can be destroyed through a pointer to IQnnNodeGroup.
virtual ~IQnnNodeGroup() = default;
// Returns an OK status if this IQnnNodeGroup is supported by QNN.
virtual Status IsSupported(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const = 0;
// Adds this IQnnNodeGroup to the QNN model wrapper.
virtual Status AddToModelBuilder(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const = 0;
// Returns a list of NodeUnits contained by this IQnnNodeGroup.
virtual gsl::span<const NodeUnit* const> GetNodeUnits() const = 0;
/// <summary>
/// Returns the "target" NodeUnit of the group. This is important for topological ordering of IQnnNodeGroups.
/// The target should be the first NodeUnit where all input paths (of the IQnnNodeGroup) converge.
/// For example, "Conv" should be the target NodeUnit for the following IQnnNodeGroup with 5 NodeUnits
/// (DQ, DQ, Conv, Relu, Q).
///   input0 -> DQ -> Conv -> Relu -> Q
///                    ^
///                    |
///   input1 -> DQ ----+
/// </summary>
/// <returns>Target NodeUnit in IQnnNodeGroup</returns>
virtual const NodeUnit* GetTargetNodeUnit() const = 0;
// Returns a string representation of the IQnnNodeGroup's type.
virtual std::string_view Type() const = 0;
};
/// <summary>
/// Traverses the ONNX graph to create IQnnNodeGroup objects, each containing one or more NodeUnits.
/// The returned IQnnNodeGroup objects are sorted in topological order.
/// </summary>
/// <param name="qnn_node_groups">Output vector into which the resulting IQnnNodeGroup objects are stored.</param>
/// <param name="qnn_model_wrapper">Contains a reference to the ONNX GraphViewer and is used for validation on QNN.</param>
/// <param name="node_to_node_unit">Maps a Node* to a NodeUnit*</param>
/// <param name="num_node_units">The number of NodeUnits in the ONNX graph.</param>
/// <param name="logger">Logger</param>
/// <returns>Status with potential error</returns>
Status GetQnnNodeGroups(/*out*/ std::vector<std::unique_ptr<IQnnNodeGroup>>& qnn_node_groups,
QnnModelWrapper& qnn_model_wrapper,
const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
size_t num_node_units,
const logging::Logger& logger);
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,480 @@
#include "core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h"
#include <gsl/gsl>
#include <algorithm>
#include <cassert>
#include <limits>
#include <optional>
#include <string>
#include "core/graph/graph_utils.h"
#include "core/framework/node_unit.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/providers/qnn/builder/qnn_node_group/utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
namespace onnxruntime {
namespace qnn {
// Gets the scale, zero-point, and zero-point type for a QuantizeLinear node that uses per-tensor quantization.
static bool GetQScalarScaleZeroPoint(const QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& q_node_unit,
/*out*/ float& scale,
/*out*/ int32_t& zero_point,
/*out*/ int32_t& zp_data_type) {
assert(q_node_unit.OpType() == QUANTIZE_LINEAR);
const auto& q_inputs = q_node_unit.GetNode().InputDefs();
// Require an explicit zero-point input for now.
if (q_inputs.size() != 3 || !q_inputs[QDQ_ZERO_POINT_INPUT_IDX]->Exists()) {
return false;
}
std::vector<int32_t> zero_points;
Status status = qnn_model_wrapper.UnpackZeroPoints(q_inputs[QDQ_ZERO_POINT_INPUT_IDX]->Name(),
zero_points, zp_data_type);
// Should only have one zero-point (per-tensor).
if (!status.IsOK() || zero_points.size() != 1) {
return false;
}
zero_point = -zero_points[0]; // QNN zero-points are negated.
std::vector<float> scales;
status = qnn_model_wrapper.UnpackScales(q_inputs[QDQ_SCALE_INPUT_IDX]->Name(), scales);
// Should only have one scale (per-tensor).
if (!status.IsOK() || scales.size() != 1) {
return false;
}
scale = scales[0];
return true;
}
// Derives the representable floating-point range [rmin, rmax] of a QuantizeLinear node from its
// per-tensor scale and zero-point: r = scale * (q - zero_point), evaluated at the smallest and largest
// quantized values of the zero-point's data type. Returns false (leaving rmin/rmax untouched) if the
// quantization parameters cannot be read or the data type is not int8/uint8/int16/uint16.
static bool GetQRminRmax(const QnnModelWrapper& qnn_model_wrapper,
                         const NodeUnit& q_node_unit,
                         /*out*/ float& rmin,
                         /*out*/ float& rmax) {
  int32_t zp_type = ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UNDEFINED;
  int32_t zp = 0;
  float q_scale = 0.0f;

  if (!GetQScalarScaleZeroPoint(qnn_model_wrapper, q_node_unit, q_scale, zp, zp_type)) {
    return false;
  }

  // Pick the quantized limits for the data type first, then apply the dequantization formula once.
  int32_t q_lowest = 0;
  int32_t q_highest = 0;

  switch (zp_type) {
    case ONNX_NAMESPACE::TensorProto_DataType_INT8:
      q_lowest = std::numeric_limits<int8_t>::lowest();
      q_highest = std::numeric_limits<int8_t>::max();
      break;
    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:
      q_lowest = std::numeric_limits<uint8_t>::lowest();
      q_highest = std::numeric_limits<uint8_t>::max();
      break;
    case ONNX_NAMESPACE::TensorProto_DataType_INT16:
      q_lowest = std::numeric_limits<int16_t>::lowest();
      q_highest = std::numeric_limits<int16_t>::max();
      break;
    case ONNX_NAMESPACE::TensorProto_DataType_UINT16:
      q_lowest = std::numeric_limits<uint16_t>::lowest();
      q_highest = std::numeric_limits<uint16_t>::max();
      break;
    default:
      return false;
  }

  rmin = q_scale * (q_lowest - zp);
  rmax = q_scale * (q_highest - zp);
  return true;
}
// Decides whether the Clip in a (Clip -> Q) sequence is redundant, i.e., whether quantization alone
// already restricts values to a range that lies within the Clip's [min, max].
// Returns false if either the quantization range or the Clip bounds cannot be determined.
static bool CanClipBeRemoved(const QnnModelWrapper& qnn_model_wrapper,
                             const NodeUnit& clip_node_unit,
                             const NodeUnit& q_node_unit,
                             const logging::Logger& logger) {
  assert(clip_node_unit.OpType() == "Clip" && q_node_unit.OpType() == QUANTIZE_LINEAR);

  float quant_lo = 0.0f;
  float quant_hi = 0.0f;
  if (!GetQRminRmax(qnn_model_wrapper, q_node_unit, quant_lo, quant_hi)) {
    return false;
  }

  float clip_lo = std::numeric_limits<float>::lowest();
  float clip_hi = std::numeric_limits<float>::max();
  const bool have_clip_bounds = onnxruntime::GetClipMinMax(qnn_model_wrapper.GetGraphViewer(),
                                                           clip_node_unit.GetNode(),
                                                           clip_lo, clip_hi, logger);
  if (!have_clip_bounds) {
    return false;
  }

  // The clip range must entirely overlap the quantization range (quantization can be smaller).
  //   Clip range:  [------------------]
  //   Quant range:    [-------------]
  constexpr float epsilon = std::numeric_limits<float>::epsilon();
  const bool clip_min_cuts_into_quant_range = epsilon < clip_lo - quant_lo;
  const bool clip_max_cuts_into_quant_range = epsilon < quant_hi - clip_hi;
  return !(clip_min_cuts_into_quant_range || clip_max_cuts_into_quant_range);
}
// Decides whether the Relu in a (Relu -> Q) sequence is redundant. That is the case when the Q's
// zero-point equals the smallest value of its quantized data type, because then no quantized value
// can represent a negative real number. Returns false if the quantization parameters cannot be read
// or the data type is not int8/uint8/int16/uint16.
static bool CanQRelaceRelu(const QnnModelWrapper& qnn_model_wrapper, const NodeUnit& q_node_unit) {
  assert(q_node_unit.OpType() == QUANTIZE_LINEAR);

  int32_t zp_type = ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UNDEFINED;
  int32_t zp = 0;
  float q_scale = 0.0f;
  if (!GetQScalarScaleZeroPoint(qnn_model_wrapper, q_node_unit, q_scale, zp, zp_type)) {
    return false;
  }

  // Look up the smallest quantized value for the data type, then compare once.
  int32_t smallest_quantized = 0;
  switch (zp_type) {
    case ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_INT8:
      smallest_quantized = static_cast<int32_t>(std::numeric_limits<int8_t>::lowest());
      break;
    case ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UINT8:
      smallest_quantized = static_cast<int32_t>(std::numeric_limits<uint8_t>::lowest());
      break;
    case ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_INT16:
      smallest_quantized = static_cast<int32_t>(std::numeric_limits<int16_t>::lowest());
      break;
    case ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UINT16:
      smallest_quantized = static_cast<int32_t>(std::numeric_limits<uint16_t>::lowest());
      break;
    default:
      return false;
  }

  return zp == smallest_quantized;
}
// Decides whether the activation in an (activation -> Q) sequence is made redundant by the Q.
// Only Relu and Clip activations are recognized; any other operator type yields false.
static bool CanActivationBeRemoved(const QnnModelWrapper& qnn_model_wrapper,
                                   const NodeUnit& activation_node_unit,
                                   const NodeUnit& q_node_unit,
                                   const logging::Logger& logger) {
  const std::string& op_type = activation_node_unit.OpType();
  const bool is_relu = op_type == "Relu";

  if (!is_relu && op_type != "Clip") {
    return false;
  }

  return is_relu ? CanQRelaceRelu(qnn_model_wrapper, q_node_unit)
                 : CanClipBeRemoved(qnn_model_wrapper, activation_node_unit, q_node_unit, logger);
}
// Returns the parent DQ nodes for a given node.
static std::vector<const Node*> FindParentDQNodes(const GraphViewer& graph_viewer, const Node& node) {
// Get all parent DQ nodes sorted by destination argument index.
std::vector<const Node*> parents(node.InputDefs().size(), nullptr);
for (auto it = node.InputEdgesBegin(); it != node.InputEdgesEnd(); it++) {
if (it->GetNode().OpType().compare(DEQUANTIZE_LINEAR) == 0) {
parents[it->GetDstArgIndex()] = &(it->GetNode());
}
}
// Remove all the nodes which are not in the graph_viewer
parents.erase(std::remove_if(parents.begin(), parents.end(),
[&graph_viewer](const Node* _node) {
return _node == nullptr || graph_viewer.GetNode(_node->Index()) == nullptr;
}),
parents.end());
return parents;
}
// Gets the parent DQ node units for the given Conv/ConvTranspose node. This function checks that every
// existing Conv input comes from a parent DQ, that each DQ feeds only the Conv and does not produce a
// graph output, and that each DQ is a standalone NodeUnit that is not already part of an IQnnNodeGroup.
// On success the DQ node units are stored in dq_node_units by position (unused trailing entries are null).
// dq_node_units may be partially written when false is returned.
static bool GetConvDQs(
    const GraphViewer& graph_viewer,
    const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
    const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
    const Node& conv_node,
    /*out*/ std::array<const NodeUnit*, 3>& dq_node_units) {
  const std::string& conv_op_type = conv_node.OpType();
  const bool is_conv_like = conv_op_type == "Conv" || conv_op_type == "ConvTranspose";
  if (!is_conv_like) {
    return false;
  }

  // Count the Conv inputs that are actually present.
  size_t num_present_inputs = 0;
  for (const NodeArg* input : conv_node.InputDefs()) {
    if (input && input->Exists()) {
      ++num_present_inputs;
    }
  }

  // Get the Conv's parent DQ nodes.
  const std::vector<const Node*> parent_dqs = FindParentDQNodes(graph_viewer, conv_node);
  const size_t dq_count = parent_dqs.size();

  // Within a QDQ node group, a target node input is the only consumer of each DQ.
  if (dq_count != num_present_inputs || dq_count > dq_node_units.size()) {
    return false;
  }

  dq_node_units.fill(nullptr);

  for (size_t idx = 0; idx < dq_count; ++idx) {
    const Node* dq = parent_dqs[idx];

    // DQ must not produce a graph output.
    if (dq == nullptr || graph_viewer.NodeProducesGraphOutput(*dq)) {
      return false;
    }

    // Conv should be the only consumer of a parent DQ.
    if (dq->GetOutputEdgesCount() != 1 ||
        dq->OutputEdgesBegin()->GetNode().Index() != conv_node.Index()) {
      return false;
    }

    // DQ node must be part of a "standalone" NodeUnit.
    const auto found = node_to_node_unit.find(dq);
    if (found == node_to_node_unit.end()) {
      return false;
    }

    const NodeUnit* dq_unit = found->second;
    if (dq_unit == nullptr || node_unit_to_qnn_node_group.count(dq_unit) != 0) {
      return false;
    }

    if (dq_unit->UnitType() != NodeUnit::Type::SingleNode) {
      return false;
    }

    dq_node_units[idx] = dq_unit;
  }

  return true;
}
// Checks that the quantized data types around a QDQ Conv are a supported combination:
//   - the activation (DQ 0) input type must equal the Q output type,
//   - if the activation is int8, the weight (DQ 1) must be int8 as well,
//   - if a bias DQ (DQ 2) is present, its input must be int32.
// Expects dq_node_units[0] and dq_node_units[1] to be non-null.
static bool CheckQDQConvDataTypes(std::array<const NodeUnit*, 3>& dq_node_units,
                                  gsl::not_null<const NodeUnit*> q_node_unit) {
  assert(q_node_unit->OpType() == QUANTIZE_LINEAR);

  // Element type of the quantized tensor that feeds a DQ node unit.
  const auto quantized_input_type = [](const NodeUnit* dq_unit) -> int32_t {
    return dq_unit->GetNode().InputDefs()[0]->TypeAsProto()->tensor_type().elem_type();
  };

  const int32_t activation_type = quantized_input_type(dq_node_units[0]);
  const int32_t weight_type = quantized_input_type(dq_node_units[1]);
  const int32_t output_type = q_node_unit->GetNode().OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type();

  // input and output types need to be same
  if (activation_type != output_type) {
    return false;
  }

  const bool activation_is_int8 =
      activation_type == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8;
  if (activation_is_int8 && weight_type != activation_type) {
    return false;
  }

  const NodeUnit* bias_dq_unit = dq_node_units[2];
  if (bias_dq_unit != nullptr &&  // has bias
      quantized_input_type(bias_dq_unit) != ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32) {
    return false;
  }

  return true;
}
// Utility function to either validate or create a quantized QNN Conv node. The function creates a temporary
// custom NodeUnit that excludes the Clip/Relu because it is redundant. This custom NodeUnit is passed to our
// existing Conv OpBuilder for creation or validation via QNN APIs.
//
// qnn_model_wrapper: Model wrapper forwarded to the op builder.
// dq_node_units:     The 2 or 3 standalone DQ NodeUnits that feed the Conv (input, weight, optional bias).
// conv_node_unit:    Standalone Conv or ConvTranspose NodeUnit (must not be null).
// q_node_unit:       Standalone QuantizeLinear NodeUnit that follows the removed activation (must not be null).
// logger:            Logger forwarded to the op builder.
// validate:          true to only validate the fused node, false to add it to the QNN model.
//
// Returns an error Status if the arguments do not describe a DQ* -> Conv/ConvTranspose -> Q sequence, if no
// op builder is registered for the Conv op type, or if the op builder itself reports a failure.
#define ValidateOnQnn(qnn_model_wrapper, dq_node_units, conv_node_unit, q_node_unit, logger) \
  CreateOrValidateOnQnn((qnn_model_wrapper), (dq_node_units), (conv_node_unit), (q_node_unit), (logger), true)
#define CreateOnQnn(qnn_model_wrapper, dq_node_units, conv_node_unit, q_node_unit, logger) \
  CreateOrValidateOnQnn((qnn_model_wrapper), (dq_node_units), (conv_node_unit), (q_node_unit), (logger), false)
static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper,
                                    gsl::span<const NodeUnit* const> dq_node_units,
                                    const NodeUnit* conv_node_unit,
                                    const NodeUnit* q_node_unit,
                                    const logging::Logger& logger,
                                    bool validate) {
  const size_t num_dqs = dq_node_units.size();
  constexpr size_t max_num_dqs = 3;
  ORT_RETURN_IF_NOT(num_dqs == 2 || num_dqs == max_num_dqs, "QDQ Conv should have 2 or 3 DQs");

  // TryFusion() starts this fusion from either Conv or ConvTranspose, so both must be accepted here.
  // Accepting only "Conv" would make every fused ConvTranspose group fail validation.
  const std::string& conv_op_type = conv_node_unit->OpType();
  ORT_RETURN_IF_NOT((conv_op_type == "Conv" || conv_op_type == "ConvTranspose") &&
                        q_node_unit->OpType() == QUANTIZE_LINEAR,
                    "Expected Conv/ConvTranspose and QuantizeLinear but got ", conv_op_type, " and ",
                    q_node_unit->OpType());

  std::array<const Node*, max_num_dqs> dq_nodes_buf = {};
  for (size_t i = 0; i < num_dqs; i++) {
    dq_nodes_buf[i] = &dq_node_units[i]->GetNode();
  }
  gsl::span<const Node*> dq_nodes(dq_nodes_buf.data(), num_dqs);

  std::array<const Node*, 1> q_nodes = {&q_node_unit->GetNode()};
  const Node& target_node = conv_node_unit->GetNode();

  // Populate NodeUnit inputs: one quantized input per DQ node.
  std::vector<NodeUnitIODef> inputs;
  inputs.reserve(num_dqs);
  for (const Node* dq_node : dq_nodes) {
    const auto dq_inputs = dq_node->InputDefs();
    const auto& dq_attrs = dq_node->GetAttributes();

    std::optional<int64_t> axis;
    if (auto entry = dq_attrs.find("axis"); entry != dq_attrs.end()) {
      axis = entry->second.i();
    }

    // quantization scale and zp are always the input[1, 2]
    NodeUnitIODef::QuantParam quant_param{*dq_inputs[1], dq_inputs.size() == 3 ? dq_inputs[2] : nullptr, axis};
    inputs.push_back(NodeUnitIODef{*dq_inputs[0], quant_param});
  }

  // Populate NodeUnit outputs and output edges
  std::vector<NodeUnitIODef> outputs;
  Node::EdgeSet output_edges;
  for (const Node* q_node : q_nodes) {
    const auto q_inputs = q_node->InputDefs();
    const auto& q_attrs = q_node->GetAttributes();
    const auto q_outputs = q_node->OutputDefs();

    std::optional<int64_t> axis;
    if (auto entry = q_attrs.find("axis"); entry != q_attrs.end()) {
      axis = entry->second.i();
    }

    // quantization scale and zp are always the input[1, 2]
    NodeUnitIODef::QuantParam quant_param{*q_inputs[1], q_inputs.size() == 3 ? q_inputs[2] : nullptr, axis};
    outputs.push_back(NodeUnitIODef{*q_outputs[0], quant_param});

    // Gather output edges out of the Q node.
    auto q_cur_edge = q_node->OutputEdgesBegin();
    auto q_end_edge = q_node->OutputEdgesEnd();
    for (; q_cur_edge != q_end_edge; ++q_cur_edge) {
      output_edges.insert(Node::EdgeEnd{q_cur_edge->GetNode(), 0, q_cur_edge->GetDstArgIndex()});
    }
  }

  NodeUnit custom_node_unit(dq_nodes, target_node, q_nodes, NodeUnit::Type::QDQGroup,
                            inputs, outputs, num_dqs, output_edges);

  // A missing op builder means the fused node can be neither validated nor created.
  // Report it as an error rather than silently claiming success.
  const auto* conv_op_builder = qnn::GetOpBuilder(custom_node_unit.OpType());
  ORT_RETURN_IF_NOT(conv_op_builder != nullptr, "[QNN EP]: Missing OpBuilder for OpType ",
                    custom_node_unit.OpType());

  if (validate) {
    return conv_op_builder->IsOpSupported(qnn_model_wrapper, custom_node_unit, logger);
  }

  return conv_op_builder->AddToModelBuilder(qnn_model_wrapper, custom_node_unit, logger, validate);
}
// Starting from a standalone Conv/ConvTranspose NodeUnit, looks for the pattern
// DQ* -> Conv -> Relu/Clip -> Q in which the activation is made redundant by the Q node.
// Returns an IQnnNodeGroup holding all NodeUnits of the pattern, or an empty pointer if the
// pattern does not match.
std::unique_ptr<IQnnNodeGroup> ConvActivationFusion::TryFusion(
    QnnModelWrapper& qnn_model_wrapper,
    const NodeUnit& conv_node_unit,
    const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
    const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
    const logging::Logger& logger) {
  // The starting point has to be a Conv or ConvTranspose that is not already inside a QDQ group.
  const auto& op_type = conv_node_unit.OpType();
  const bool is_conv_like = op_type == "Conv" || op_type == "ConvTranspose";
  const bool is_standalone = conv_node_unit.UnitType() == NodeUnit::Type::SingleNode;
  if (!is_conv_like || !is_standalone) {
    return nullptr;
  }

  const GraphViewer& viewer = qnn_model_wrapper.GetGraphViewer();

  // Step 1: the Conv's only child must be a Relu or a Clip.
  const std::array<std::string_view, 2> activation_op_types = {"Relu", "Clip"};
  const NodeUnit* activation_unit = GetOnlyChildOfType(viewer, conv_node_unit, activation_op_types,
                                                       node_to_node_unit, node_unit_to_qnn_node_group);
  if (!activation_unit) {
    return nullptr;
  }

  // Step 2: the activation's only child must be a QuantizeLinear.
  const std::array<std::string_view, 1> q_op_types = {QUANTIZE_LINEAR};
  const NodeUnit* q_unit = GetOnlyChildOfType(viewer, *activation_unit, q_op_types,
                                              node_to_node_unit, node_unit_to_qnn_node_group);
  if (!q_unit) {
    return nullptr;
  }

  // Step 3: the activation may only be dropped if the Q node already provides an equivalent effect.
  if (!CanActivationBeRemoved(qnn_model_wrapper, *activation_unit, *q_unit, logger)) {
    return nullptr;
  }

  // Step 4: collect the DQ producers of the Conv (bias DQ is optional and left as nullptr if absent).
  std::array<const NodeUnit*, 3> dq_units = {};
  const bool found_dqs = GetConvDQs(viewer,
                                    node_to_node_unit,
                                    node_unit_to_qnn_node_group,
                                    conv_node_unit.GetNode(),
                                    dq_units);
  if (!found_dqs) {
    return nullptr;
  }

  // Step 5: the quantized data types of the DQs and the Q must be compatible.
  if (!CheckQDQConvDataTypes(dq_units, q_unit)) {
    return nullptr;
  }

  return std::make_unique<ConvActivationFusion>(*dq_units[0],
                                                *dq_units[1],
                                                dq_units[2],
                                                conv_node_unit,
                                                *activation_unit,
                                                *q_unit);
}
// Stores the NodeUnits of the fused sequence in the order
// [DQ input, DQ weight, (DQ bias), Conv, activation, Q].
// When dq_node_unit_2 (bias DQ) is null, only five slots are filled and the last slot stays nullptr.
ConvActivationFusion::ConvActivationFusion(const NodeUnit& dq_node_unit_0,
                                           const NodeUnit& dq_node_unit_1,
                                           const NodeUnit* dq_node_unit_2,
                                           const NodeUnit& conv_node_unit,
                                           const NodeUnit& activation_node_unit,
                                           const NodeUnit& q_node_unit)
    : node_units_{} {
  const bool has_bias_dq = dq_node_unit_2 != nullptr;

  auto slot = node_units_.begin();
  *slot++ = &dq_node_unit_0;
  *slot++ = &dq_node_unit_1;
  if (has_bias_dq) {
    *slot++ = dq_node_unit_2;
  }
  *slot++ = &conv_node_unit;
  *slot++ = &activation_node_unit;
  *slot++ = &q_node_unit;

  assert(static_cast<size_t>(slot - node_units_.begin()) == (has_bias_dq ? size_t{6} : size_t{5}));
}
// Validates the fused DQ* -> Conv -> Q node (activation excluded) without adding it to the QNN model.
Status ConvActivationFusion::IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const {
  // The last slot is only populated when the optional bias DQ is present.
  const bool has_bias_dq = node_units_.back() != nullptr;
  const size_t dq_count = has_bias_dq ? 3 : 2;

  // Layout: [DQs..., Conv, activation, Q] -> Conv follows the DQs, Q is two slots after the Conv.
  const NodeUnit* conv_unit = node_units_[dq_count];
  const NodeUnit* q_unit = node_units_[dq_count + 2];
  gsl::span<const NodeUnit* const> dq_units(node_units_.data(), dq_count);

  return CreateOrValidateOnQnn(qmw, dq_units, conv_unit, q_unit, logger, /*validate*/ true);
}
// Adds the fused DQ* -> Conv -> Q node (activation excluded) to the QNN model.
Status ConvActivationFusion::AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const {
  // The last slot is only populated when the optional bias DQ is present.
  const bool has_bias_dq = node_units_.back() != nullptr;
  const size_t dq_count = has_bias_dq ? 3 : 2;

  // Layout: [DQs..., Conv, activation, Q] -> Conv follows the DQs, Q is two slots after the Conv.
  const NodeUnit* conv_unit = node_units_[dq_count];
  const NodeUnit* q_unit = node_units_[dq_count + 2];
  gsl::span<const NodeUnit* const> dq_units(node_units_.data(), dq_count);

  return CreateOrValidateOnQnn(qmw, dq_units, conv_unit, q_unit, logger, /*validate*/ false);
}
// Returns a view over the populated NodeUnit slots: 6 entries with a bias DQ, 5 without.
gsl::span<const NodeUnit* const> ConvActivationFusion::GetNodeUnits() const {
  size_t populated = 5;
  if (node_units_.back() != nullptr) {
    populated = 6;  // The optional bias DQ is present, so every slot is in use.
  }
  return gsl::make_span<const NodeUnit* const>(node_units_.data(), populated);
}
// Returns the Conv NodeUnit, which is stored right after the 2 or 3 DQ NodeUnits.
const NodeUnit* ConvActivationFusion::GetTargetNodeUnit() const {
  const bool has_bias_dq = node_units_.back() != nullptr;
  if (has_bias_dq) {
    return node_units_[3];
  }
  return node_units_[2];
}
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,63 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <gsl/gsl>
#include <array>
#include <memory>
#include <unordered_map>
#include <vector>
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
namespace onnxruntime {
namespace qnn {
class QnnModelWrapper;
/// <summary>
/// Represents a fusion of a DQ* -> Conv -> Relu/Clip -> Q sequence where the Relu (or Clip) is redundant
/// due to the quantization effects of the Q. This sequence is translated to a quantized QNN Conv.
/// All contained NodeUnits are of type SingleNode since they are not a part of an existing QDQ node unit.
/// </summary>
class ConvActivationFusion : public IQnnNodeGroup {
public:
/// <summary>
/// Stores pointers to the given NodeUnits in the order: DQ 0, DQ 1, optional DQ 2, Conv, activation, Q.
/// The NodeUnits are not copied, so they must outlive this object.
/// </summary>
/// <param name="dq_node_unit_0">First DQ node unit of the sequence.</param>
/// <param name="dq_node_unit_1">Second DQ node unit of the sequence.</param>
/// <param name="dq_node_unit_2">Optional third (bias) DQ node unit. May be nullptr.</param>
/// <param name="conv_node_unit">The Conv node unit.</param>
/// <param name="activation_node_unit">The Relu or Clip node unit that follows the Conv.</param>
/// <param name="q_node_unit">The Q node unit that follows the activation.</param>
ConvActivationFusion(const NodeUnit& dq_node_unit_0,
const NodeUnit& dq_node_unit_1,
const NodeUnit* dq_node_unit_2,
const NodeUnit& conv_node_unit,
const NodeUnit& activation_node_unit,
const NodeUnit& q_node_unit);
ORT_DISALLOW_COPY_AND_ASSIGNMENT(ConvActivationFusion);
// IQnnNodeGroup overrides.
// IsSupported validates the fused node and AddToModelBuilder creates it. Both use the DQs, the Conv and the Q
// (the activation is left out).
Status IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const override;
Status AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const override;
// Returns the 5 (no bias DQ) or 6 (with bias DQ) stored node units.
gsl::span<const NodeUnit* const> GetNodeUnits() const override;
// Returns the Conv node unit.
const NodeUnit* GetTargetNodeUnit() const override;
std::string_view Type() const override { return "ConvActivationFusion"; }
/// <summary>
/// Traverses graph to check if the given NodeUnit is part of a valid DQ* -> Conv -> Relu/Clip -> Q sequence.
/// If so, returns a IQnnNodeGroup that contains the constituent NodeUnits.
/// </summary>
/// <param name="qnn_model_wrapper">Used for validation and to traverse/query the graph</param>
/// <param name="conv_node_unit">Conv or ConvTranspose node unit (type SingleNode) that could be part of
/// the sequence.</param>
/// <param name="node_to_node_unit">Maps a Node to a NodeUnit.</param>
/// <param name="node_unit_to_qnn_node_group">Maps a NodeUnit to a IQnnNodeGroup.</param>
/// <param name="logger">Logger used while checking whether the activation can be removed.</param>
/// <returns>A valid IQnnNodeGroup on success or an empty std::unique_ptr otherwise</returns>
static std::unique_ptr<IQnnNodeGroup> TryFusion(
QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& conv_node_unit,
const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
const logging::Logger& logger);
private:
// Layout: [DQ 0, DQ 1, (DQ 2), Conv, activation, Q]. Entries are packed from the front.
std::array<const NodeUnit*, 6> node_units_; // Last elem is nullptr if the optional bias DQ is missing.
};
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,179 @@
#include "core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h"
#include <gsl/gsl>
#include <algorithm>
#include <cassert>
#include <limits>
#include <optional>
#include <utility>
#include "core/graph/graph_utils.h"
#include "core/framework/node_unit.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/providers/qnn/builder/qnn_node_group/utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
namespace onnxruntime {
namespace qnn {
// Forward declarations.
// ValidateOnQnn and CreateOnQnn are shorthands for CreateOrValidateOnQnn with the trailing `validate`
// argument fixed to true and false respectively.
#define ValidateOnQnn(qnn_model_wrapper, dq_node_unit, q_node_unit) \
CreateOrValidateOnQnn((qnn_model_wrapper), (dq_node_unit), (q_node_unit), true)
#define CreateOnQnn(qnn_model_wrapper, dq_node_unit, q_node_unit) \
CreateOrValidateOnQnn((qnn_model_wrapper), (dq_node_unit), (q_node_unit), false)
// Validates (validate == true) or creates (validate == false) the QNN Convert node that replaces DQ -> Q.
static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& dq_node_unit,
const NodeUnit& q_node_unit, bool validate);
// Returns true if the DQ -> Q pair has constant scalar scales/zero points with the same scale type and
// different zero-point types.
static bool IsDQQConversion(const GraphViewer& graph_viewer, const Node& dq_node, const Node& q_node);
// Starting from a standalone DequantizeLinear NodeUnit, checks for a DQ -> Q pair that converts between
// quantization types. On success returns a DQQFusion containing both NodeUnits, otherwise an empty pointer.
std::unique_ptr<IQnnNodeGroup> DQQFusion::TryFusion(
    QnnModelWrapper& qnn_model_wrapper,
    const NodeUnit& dq_node_unit,
    const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
    const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
    const logging::Logger& logger) {
  ORT_UNUSED_PARAMETER(logger);

  // Only a DQ that is not already part of a QDQ group can start this fusion.
  const bool is_standalone_dq = dq_node_unit.OpType() == DEQUANTIZE_LINEAR &&
                                dq_node_unit.UnitType() == NodeUnit::Type::SingleNode;
  if (!is_standalone_dq) {
    return nullptr;
  }

  const GraphViewer& viewer = qnn_model_wrapper.GetGraphViewer();

  // DQ must have a single Q child (1 output edge) and must not produce a graph output.
  const std::array<std::string_view, 1> allowed_children = {QUANTIZE_LINEAR};
  const NodeUnit* q_unit = GetOnlyChildOfType(viewer, dq_node_unit, allowed_children,
                                              node_to_node_unit, node_unit_to_qnn_node_group);
  if (!q_unit) {
    return nullptr;
  }

  // DQ and Q must have equal scale type and different zp type.
  if (!IsDQQConversion(viewer, dq_node_unit.GetNode(), q_unit->GetNode())) {
    return nullptr;
  }

  // The resulting QNN node has to pass QNN validation before the fusion is accepted.
  const Status qnn_status = ValidateOnQnn(qnn_model_wrapper, dq_node_unit, *q_unit);
  if (!qnn_status.IsOK()) {
    return nullptr;
  }

  return std::make_unique<DQQFusion>(dq_node_unit, *q_unit);
}
// Records the DQ (slot 0) and Q (slot 1) NodeUnits. The NodeUnits are referenced, not copied.
DQQFusion::DQQFusion(const NodeUnit& dq_node_unit, const NodeUnit& q_node_unit)
    : node_units_{} {
  node_units_[0] = &dq_node_unit;
  node_units_[1] = &q_node_unit;
}
// Validates the QNN Convert node that replaces the DQ -> Q pair. Nothing is added to the model.
Status DQQFusion::IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const {
  ORT_UNUSED_PARAMETER(logger);
  const NodeUnit& dq_unit = *node_units_[0];
  const NodeUnit& q_unit = *node_units_[1];
  return CreateOrValidateOnQnn(qmw, dq_unit, q_unit, /*validate*/ true);
}
// Adds the QNN Convert node that replaces the DQ -> Q pair to the QNN model.
Status DQQFusion::AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const {
  ORT_UNUSED_PARAMETER(logger);
  const NodeUnit& dq_unit = *node_units_[0];
  const NodeUnit& q_unit = *node_units_[1];
  return CreateOrValidateOnQnn(qmw, dq_unit, q_unit, /*validate*/ false);
}
// Returns a view over the two stored NodeUnits: [DQ, Q].
gsl::span<const NodeUnit* const> DQQFusion::GetNodeUnits() const {
  gsl::span<const NodeUnit* const> units_view = node_units_;
  return units_view;
}
// The DQ NodeUnit (first slot) serves as the target of this group.
const NodeUnit* DQQFusion::GetTargetNodeUnit() const {
  const NodeUnit* dq_unit = node_units_.front();
  return dq_unit;
}
// Validates or creates the single QNN Convert node that replaces a DQ -> Q pair.
// The Convert node reads the DQ's input and writes the Q's output.
//   validate == true : only asks QNN to validate the node (named after the DQ NodeUnit).
//   validate == false: registers both tensors and creates the node (named after the Q NodeUnit).
static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper,
                                    const NodeUnit& dq_node_unit,
                                    const NodeUnit& q_node_unit,
                                    bool validate) {
  assert(dq_node_unit.OpType() == DEQUANTIZE_LINEAR && q_node_unit.OpType() == QUANTIZE_LINEAR);

  const auto& dq_name = utils::GetNodeName(dq_node_unit);
  const NodeUnitIODef& convert_input = dq_node_unit.Inputs()[0];
  const NodeUnitIODef& convert_output = q_node_unit.Outputs()[0];

  QnnTensorWrapper input_wrapper;
  QnnTensorWrapper output_wrapper;
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(convert_input, input_wrapper));
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(convert_output, output_wrapper));

  if (validate) {
    ORT_RETURN_IF_ERROR(qnn_model_wrapper.ValidateQnnNode(dq_name,
                                                          QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                          QNN_OP_CONVERT,
                                                          {input_wrapper.GetQnnTensor()},
                                                          {output_wrapper.GetQnnTensor()},
                                                          {}));
    return Status::OK();
  }

  // Creation path: the tensors must be registered with the model before the node that uses them.
  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_wrapper)), "Failed to add input");
  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_wrapper)), "Failed to add output");
  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetNodeName(q_node_unit),
                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                    QNN_OP_CONVERT,
                                                    {convert_input.node_arg.Name()},
                                                    {convert_output.node_arg.Name()},
                                                    {},
                                                    validate),
                    "Failed to add fused Convert node.");
  return Status::OK();
}
static bool IsDQQConversion(const GraphViewer& graph_viewer, const Node& dq_node, const Node& q_node) {
ConstPointerContainer<std::vector<NodeArg*>> dq_input_defs = dq_node.InputDefs();
ConstPointerContainer<std::vector<NodeArg*>> q_input_defs = q_node.InputDefs();
auto is_scalar_shape = [](const NodeArg& input_arg) -> bool {
auto shape = input_arg.Shape();
if (shape == nullptr) {
return false;
}
auto dim_size = shape->dim_size();
return dim_size == 0 || (dim_size == 1 && shape->dim(0).has_dim_value() && shape->dim(0).dim_value() == 1);
};
// Q/DQ contains optional input is not supported
// non-scalar Q/DQ scale and zero point needs are not supported
if (dq_input_defs.size() != QDQ_MAX_NUM_INPUTS ||
q_input_defs.size() != QDQ_MAX_NUM_INPUTS ||
!is_scalar_shape(*q_input_defs[QDQ_SCALE_INPUT_IDX]) ||
!is_scalar_shape(*q_input_defs[QDQ_ZERO_POINT_INPUT_IDX]) ||
!is_scalar_shape(*dq_input_defs[QDQ_SCALE_INPUT_IDX]) ||
!is_scalar_shape(*dq_input_defs[QDQ_ZERO_POINT_INPUT_IDX])) {
return false;
}
// if Q/DQ scale and zero point are not constant, return false
const ONNX_NAMESPACE::TensorProto* dq_scale_tensor_proto =
graph_viewer.GetConstantInitializer(dq_input_defs[QDQ_SCALE_INPUT_IDX]->Name());
const ONNX_NAMESPACE::TensorProto* q_scale_tensor_proto =
graph_viewer.GetConstantInitializer(q_input_defs[QDQ_SCALE_INPUT_IDX]->Name());
const ONNX_NAMESPACE::TensorProto* dq_zp_tensor_proto =
graph_viewer.GetConstantInitializer(dq_input_defs[QDQ_ZERO_POINT_INPUT_IDX]->Name());
const ONNX_NAMESPACE::TensorProto* q_zp_tensor_proto =
graph_viewer.GetConstantInitializer(q_input_defs[QDQ_ZERO_POINT_INPUT_IDX]->Name());
if (nullptr == q_zp_tensor_proto ||
nullptr == dq_zp_tensor_proto ||
nullptr == q_scale_tensor_proto ||
nullptr == dq_scale_tensor_proto) {
return false;
}
// All TensorProtos must have a data type
if (!q_zp_tensor_proto->has_data_type() || !dq_zp_tensor_proto->has_data_type() ||
!q_scale_tensor_proto->has_data_type() || !dq_scale_tensor_proto->has_data_type()) {
return false;
}
// check Q/DQ have same scale type and different zero point type
return (dq_zp_tensor_proto->data_type() != q_zp_tensor_proto->data_type()) &&
(dq_scale_tensor_proto->data_type() == q_scale_tensor_proto->data_type());
}
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,57 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "core/common/common.h"
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
namespace onnxruntime {
namespace qnn {
class QnnModelWrapper;
/// <summary>
/// Represents a fusion of a DQ -> Q sequence that converts from one quantization type (e.g., uint8_t) to
/// another (e.g., uint16_t). This is translated into a QNN Convert operator, which is much faster than individual
/// ops. The DQ and Q are standalone NodeUnits that are not part of a QDQ node unit.
/// </summary>
class DQQFusion : public IQnnNodeGroup {
public:
/// <summary>
/// Stores pointers to the DQ and Q node units (in that order). The NodeUnits are not copied, so they must
/// outlive this object.
/// </summary>
DQQFusion(const NodeUnit& dq_node_unit, const NodeUnit& q_node_unit);
ORT_DISALLOW_COPY_AND_ASSIGNMENT(DQQFusion);
// IQnnNodeGroup overrides.
// IsSupported validates the QNN Convert node for the DQ -> Q pair and AddToModelBuilder creates it.
// The logger argument is unused by both.
Status IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const override;
Status AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const override;
// Returns the two stored node units: [DQ, Q].
gsl::span<const NodeUnit* const> GetNodeUnits() const override;
// Returns the DQ node unit.
const NodeUnit* GetTargetNodeUnit() const override;
std::string_view Type() const override { return "DQQFusion"; }
/// <summary>
/// Traverses graph to check if the given starting NodeUnit is part of a valid DQ -> Q sequence.
/// If so, returns a IQnnNodeGroup that contains the DQ and Q NodeUnits.
/// </summary>
/// <param name="qnn_model_wrapper">Used for validation and to traverse/query the graph</param>
/// <param name="dq_node_unit">DQ node unit that could start the DQ -> Q sequence</param>
/// <param name="node_to_node_unit">Maps a Node to a NodeUnit.</param>
/// <param name="node_unit_to_qnn_node_group">Maps a NodeUnit to a IQnnNodeGroup.</param>
/// <param name="logger">Currently unused.</param>
/// <returns>A valid IQnnNodeGroup on success or an empty std::unique_ptr otherwise</returns>
static std::unique_ptr<IQnnNodeGroup> TryFusion(
QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& dq_node_unit,
const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
const logging::Logger& logger);
private:
// Layout: [DQ, Q].
std::array<const NodeUnit*, 2> node_units_;
};
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,144 @@
#include "core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h"
#include <gsl/gsl>
#include <algorithm>
#include <cassert>
#include <limits>
#include <optional>
#include <utility>
#include "core/graph/graph_utils.h"
#include "core/framework/node_unit.h"
#include "core/providers/shared/utils/utils.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/qnn_node_group/utils.h"
namespace onnxruntime {
namespace qnn {
// Forward declarations.
// ValidateOnQnn and CreateOnQnn are shorthands for CreateOrValidateOnQnn with the trailing `validate`
// argument fixed to true and false respectively.
#define ValidateOnQnn(qnn_model_wrapper, hardsigmoid_node_unit, mul_node_unit) \
CreateOrValidateOnQnn((qnn_model_wrapper), (hardsigmoid_node_unit), (mul_node_unit), true)
#define CreateOnQnn(qnn_model_wrapper, hardsigmoid_node_unit, mul_node_unit) \
CreateOrValidateOnQnn((qnn_model_wrapper), (hardsigmoid_node_unit), (mul_node_unit), false)
// Validates (validate == true) or creates (validate == false) the QNN HardSwish node that replaces
// HardSigmoid -> Mul.
static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& hardsigmoid_node_unit,
const NodeUnit& mul_node_unit, bool validate);
// Starting from a standalone HardSigmoid NodeUnit, checks for the pattern x * HardSigmoid(x) with
// alpha = 1/6 and beta = 0.5. On success returns a HardSigmoidMulFusion containing the HardSigmoid and
// Mul NodeUnits, otherwise an empty pointer.
std::unique_ptr<IQnnNodeGroup> HardSigmoidMulFusion::TryFusion(
    QnnModelWrapper& qnn_model_wrapper,
    const NodeUnit& hardsigmoid_node_unit,
    const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
    const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
    const logging::Logger& logger) {
  ORT_UNUSED_PARAMETER(logger);

  // The sequence has to start at a HardSigmoid that is not wrapped in a QDQ group.
  const bool is_standalone_hardsigmoid = hardsigmoid_node_unit.OpType() == "HardSigmoid" &&
                                         hardsigmoid_node_unit.UnitType() == NodeUnit::Type::SingleNode;
  if (!is_standalone_hardsigmoid) {
    return nullptr;
  }

  // Values alpha and beta must have for the fusion to apply.
  constexpr float req_alpha = 1.0f / 6.0f;
  constexpr float req_beta = 0.5f;
  constexpr float alpha_eps = std::numeric_limits<float>::epsilon() * req_alpha;
  constexpr float beta_eps = std::numeric_limits<float>::epsilon() * req_beta;

  NodeAttrHelper attrs(hardsigmoid_node_unit);
  const float alpha = attrs.Get("alpha", 0.2f);
  const float beta = attrs.Get("beta", 0.5f);

  const bool alpha_mismatch = std::abs(alpha - req_alpha) > alpha_eps;
  const bool beta_mismatch = std::abs(beta - req_beta) > beta_eps;
  if (alpha_mismatch || beta_mismatch) {
    return nullptr;
  }

  // HardSigmoid must have a single Mul child (1 output edge) and must not produce a graph output.
  const GraphViewer& viewer = qnn_model_wrapper.GetGraphViewer();
  const std::array<std::string_view, 1> allowed_children = {"Mul"};
  const NodeUnit* mul_unit = GetOnlyChildOfType(viewer, hardsigmoid_node_unit, allowed_children,
                                                node_to_node_unit, node_unit_to_qnn_node_group);
  if (!mul_unit) {
    return nullptr;
  }

  // The tensor fed into HardSigmoid must also be one of the two Mul operands.
  const auto& root_input_name = hardsigmoid_node_unit.Inputs()[0].node_arg.Name();
  const auto mul_inputs = mul_unit->GetNode().InputDefs();
  const bool lhs_is_root = mul_inputs[0]->Name() == root_input_name;
  if (!lhs_is_root && !(mul_inputs[1]->Name() == root_input_name)) {
    return nullptr;
  }

  // The resulting QNN node has to pass QNN validation before the fusion is accepted.
  const Status qnn_status = ValidateOnQnn(qnn_model_wrapper, hardsigmoid_node_unit, *mul_unit);
  if (!qnn_status.IsOK()) {
    return nullptr;
  }

  return std::make_unique<HardSigmoidMulFusion>(hardsigmoid_node_unit, *mul_unit);
}
// Records the HardSigmoid (slot 0) and Mul (slot 1) NodeUnits. The NodeUnits are referenced, not copied.
HardSigmoidMulFusion::HardSigmoidMulFusion(const NodeUnit& hardsigmoid_node_unit, const NodeUnit& mul_node_unit)
    : node_units_{} {
  node_units_[0] = &hardsigmoid_node_unit;
  node_units_[1] = &mul_node_unit;
}
// Validates the QNN HardSwish node that replaces the HardSigmoid -> Mul pair. Nothing is added to the model.
Status HardSigmoidMulFusion::IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const {
  ORT_UNUSED_PARAMETER(logger);
  const NodeUnit& hardsigmoid_unit = *node_units_[0];
  const NodeUnit& mul_unit = *node_units_[1];
  return CreateOrValidateOnQnn(qmw, hardsigmoid_unit, mul_unit, /*validate*/ true);
}
// Adds the QNN HardSwish node that replaces the HardSigmoid -> Mul pair to the QNN model.
Status HardSigmoidMulFusion::AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const {
  ORT_UNUSED_PARAMETER(logger);
  const NodeUnit& hardsigmoid_unit = *node_units_[0];
  const NodeUnit& mul_unit = *node_units_[1];
  return CreateOrValidateOnQnn(qmw, hardsigmoid_unit, mul_unit, /*validate*/ false);
}
// Returns a view over the two stored NodeUnits: [HardSigmoid, Mul].
gsl::span<const NodeUnit* const> HardSigmoidMulFusion::GetNodeUnits() const {
  gsl::span<const NodeUnit* const> units_view = node_units_;
  return units_view;
}
// The HardSigmoid NodeUnit (first slot) serves as the target of this group.
const NodeUnit* HardSigmoidMulFusion::GetTargetNodeUnit() const {
  const NodeUnit* hardsigmoid_unit = node_units_.front();
  return hardsigmoid_unit;
}
// Validates or creates the single QNN HardSwish node that replaces a HardSigmoid -> Mul pair.
// The HardSwish node reads the HardSigmoid's input and writes the Mul's output, and is named after the
// HardSigmoid NodeUnit.
//   validate == true : only asks QNN to validate the node.
//   validate == false: registers both tensors and creates the node.
static Status CreateOrValidateOnQnn(QnnModelWrapper& qnn_model_wrapper,
                                    const NodeUnit& hardsigmoid_node_unit,
                                    const NodeUnit& mul_node_unit,
                                    bool validate) {
  assert(hardsigmoid_node_unit.OpType() == "HardSigmoid" && mul_node_unit.OpType() == "Mul");

  const auto& fused_name = utils::GetNodeName(hardsigmoid_node_unit);
  const NodeUnitIODef& swish_input = hardsigmoid_node_unit.Inputs()[0];
  const NodeUnitIODef& swish_output = mul_node_unit.Outputs()[0];

  QnnTensorWrapper input_wrapper;
  QnnTensorWrapper output_wrapper;
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(swish_input, input_wrapper));
  ORT_RETURN_IF_ERROR(qnn_model_wrapper.MakeTensorWrapper(swish_output, output_wrapper));

  if (validate) {
    ORT_RETURN_IF_ERROR(qnn_model_wrapper.ValidateQnnNode(fused_name,
                                                          QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                          QNN_OP_HARD_SWISH,
                                                          {input_wrapper.GetQnnTensor()},
                                                          {output_wrapper.GetQnnTensor()},
                                                          {}));
    return Status::OK();
  }

  // Creation path: the tensors must be registered with the model before the node that uses them.
  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_wrapper)), "Failed to add input");
  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_wrapper)), "Failed to add output");
  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(fused_name,
                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
                                                    QNN_OP_HARD_SWISH,
                                                    {swish_input.node_arg.Name()},
                                                    {swish_output.node_arg.Name()},
                                                    {},
                                                    validate),
                    "Failed to add fused HardSwish node.");
  return Status::OK();
}
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,57 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include "core/common/common.h"
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
namespace onnxruntime {
namespace qnn {
class QnnModelWrapper;
/// <summary>
/// Represents a fusion of a HardSigmoid -> Mul sequence that computes `x * HardSigmoid<alpha=1/6, beta=0.5>(x)`.
/// This is translated into a QNN HardSwish operator.
/// The contained NodeUnits are of type SingleNode since they are not a part of a QDQ node unit.
/// </summary>
class HardSigmoidMulFusion : public IQnnNodeGroup {
public:
/// <summary>
/// Stores pointers to the HardSigmoid and Mul node units (in that order). The NodeUnits are not copied, so
/// they must outlive this object.
/// </summary>
HardSigmoidMulFusion(const NodeUnit& hardsigmoid_node_unit, const NodeUnit& mul_node_unit);
ORT_DISALLOW_COPY_AND_ASSIGNMENT(HardSigmoidMulFusion);
// IQnnNodeGroup overrides.
// IsSupported validates the QNN HardSwish node for the HardSigmoid -> Mul pair and AddToModelBuilder
// creates it. The logger argument is unused by both.
Status IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const override;
Status AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const override;
// Returns the two stored node units: [HardSigmoid, Mul].
gsl::span<const NodeUnit* const> GetNodeUnits() const override;
// Returns the HardSigmoid node unit.
const NodeUnit* GetTargetNodeUnit() const override;
std::string_view Type() const override { return "HardSigmoidMulFusion"; }
/// <summary>
/// Traverses graph to check if the given starting NodeUnit is part of a valid HardSigmoid -> Mul sequence.
/// If so, returns a IQnnNodeGroup that contains the HardSigmoid and Mul NodeUnits.
/// </summary>
/// <param name="qnn_model_wrapper">Used for validation and to traverse/query the graph</param>
/// <param name="hardsigmoid_node_unit">HardSigmoid node unit that could start the sequence</param>
/// <param name="node_to_node_unit">Maps a Node to a NodeUnit.</param>
/// <param name="node_unit_to_qnn_node_group">Maps a NodeUnit to a IQnnNodeGroup.</param>
/// <param name="logger">Currently unused.</param>
/// <returns>A valid IQnnNodeGroup on success or an empty std::unique_ptr otherwise</returns>
static std::unique_ptr<IQnnNodeGroup> TryFusion(
QnnModelWrapper& qnn_model_wrapper,
const NodeUnit& hardsigmoid_node_unit,
const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
const logging::Logger& logger);
private:
// Layout: [HardSigmoid, Mul].
std::array<const NodeUnit*, 2> node_units_;
};
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,221 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/providers/qnn/builder/qnn_node_group.h"
#include <gsl/gsl>
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "core/graph/graph_utils.h"
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/providers/qnn/builder/qnn_node_group/conv_activation_fusion.h"
#include "core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h"
#include "core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h"
namespace onnxruntime {
namespace qnn {
/// <summary>
/// A IQnnNodeGroup class that wraps a single NodeUnit. Most NodeUnits in the ONNX graph will
/// be wrapped by this class. The wrapped NodeUnit is referenced, not copied, so it must outlive the wrapper.
/// </summary>
class QnnNodeUnitWrapper : public IQnnNodeGroup {
 public:
  explicit QnnNodeUnitWrapper(const NodeUnit& node_unit) : node_unit_(&node_unit) {}
  ORT_DISALLOW_COPY_AND_ASSIGNMENT(QnnNodeUnitWrapper);

  /// <summary>
  /// Checks whether the wrapped NodeUnit can run on QNN by delegating to the op builder registered for its
  /// operator type. Returns an error Status if no op builder exists for that type.
  /// </summary>
  Status IsSupported(QnnModelWrapper& qmw, const logging::Logger& logger) const override {
    const std::string& op_type = node_unit_->OpType();
    const auto* op_builder = qnn::GetOpBuilder(op_type);
    // Note the space after "QNN EP." so the two sentences of the message do not run together.
    ORT_RETURN_IF_NOT(op_builder != nullptr, "Operators of type `", op_type,
                      "` are not supported by QNN EP. ", op_type, " node `",
                      node_unit_->Name(), "` will not be assigned to QNN EP.");

    return op_builder->IsOpSupported(qmw, *node_unit_, logger);
  }

  /// <summary>
  /// Adds the wrapped NodeUnit to the QNN model via its op builder, without re-running op validation.
  /// Returns an error Status if no op builder exists for the operator type.
  /// </summary>
  Status AddToModelBuilder(QnnModelWrapper& qmw, const logging::Logger& logger) const override {
    const std::string& op_type = node_unit_->OpType();
    const auto* op_builder = qnn::GetOpBuilder(op_type);
    ORT_RETURN_IF_NOT(op_builder != nullptr, "[QNN EP]: Missing OpBuilder for OpType ", op_type);

    return op_builder->AddToModelBuilder(qmw, *node_unit_, logger, /*do_op_validation*/ false);
  }

  // Returns a one-element view that refers to the node_unit_ member itself.
  gsl::span<const NodeUnit* const> GetNodeUnits() const override {
    return gsl::span<const NodeUnit* const>{&node_unit_, 1ULL};
  }

  const NodeUnit* GetTargetNodeUnit() const override { return node_unit_; }
  std::string_view Type() const override { return "NodeUnit"; }

 private:
  const NodeUnit* node_unit_;  // Never null: always initialized from a reference.
};
/// <summary>
/// The type of a function that tries to fuse NodeUnits into a IQnnNodeGroup.
/// The parameters are, in order: the QnnModelWrapper, the starting NodeUnit, the Node -> NodeUnit map,
/// the NodeUnit -> IQnnNodeGroup map, and a logger. This matches the signature of the static TryFusion
/// functions of the fusion classes. The fusions in this file return an empty std::unique_ptr when they do
/// not apply.
/// </summary>
using FusionFunc = std::unique_ptr<IQnnNodeGroup> (*)(
QnnModelWrapper&,
const NodeUnit&,
const std::unordered_map<const Node*, const NodeUnit*>&,
const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>&,
const logging::Logger&);
/// <summary>
/// Looks up the fusion registered for the starting NodeUnit's operator type and, if there is one, runs it.
/// Only standalone NodeUnits (type SingleNode) are considered as starting points for now.
/// </summary>
/// <param name="qnn_model_wrapper">QnnModelWrapper that contains the ONNX GraphViewer. Used for validation.</param>
/// <param name="starting_node_unit">NodeUnit that potentially starts a fusion.</param>
/// <param name="node_to_node_unit">Maps a Node* to a NodeUnit*</param>
/// <param name="node_unit_to_qnn_node_group">Maps a NodeUnit* to a IQnnNodeGroup*</param>
/// <param name="logger">Logger forwarded to the fusion function.</param>
/// <returns>IQnnNodeGroup representing the fusion or an empty std::unique_ptr</returns>
static std::unique_ptr<IQnnNodeGroup> TryQnnFusions(
    QnnModelWrapper& qnn_model_wrapper,
    const NodeUnit& starting_node_unit,
    const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
    const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
    const logging::Logger& logger) {
  // Dispatch table: operator type of the starting NodeUnit -> function that attempts the fusion.
  static std::unordered_map<std::string, FusionFunc> fusion_table = {
      {"DequantizeLinear", DQQFusion::TryFusion},
      {"HardSigmoid", HardSigmoidMulFusion::TryFusion},
      {"Conv", ConvActivationFusion::TryFusion},
      {"ConvTranspose", ConvActivationFusion::TryFusion},
  };

  // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes).
  const bool is_standalone = starting_node_unit.UnitType() == NodeUnit::Type::SingleNode;
  if (!is_standalone) {
    return nullptr;
  }

  const auto entry = fusion_table.find(starting_node_unit.OpType());
  if (entry == fusion_table.end()) {
    return nullptr;  // No fusion starts with this operator type.
  }

  const FusionFunc try_fusion = entry->second;
  return try_fusion(qnn_model_wrapper, starting_node_unit, node_to_node_unit,
                    node_unit_to_qnn_node_group, logger);
}
// Traverses the ONNX Graph and groups NodeUnits into IQnnNodeGroup objects. Some IQnnNodeGroup objects
// represent a fusion of various NodeUnits. This function generates a vector of indices that
// represent the topological order of the qnn_node_groups.
//
// qnn_node_groups receives the groups in creation order (fused groups first, then the wrappers created for
// the NodeUnits that were not fused). sorted_qnn_node_group_indices receives indices into qnn_node_groups
// listed in topological order. Returns an error Status if a Node has no entry in node_to_node_unit.
static Status GetQnnNodeGroupsImpl(/*out*/ std::vector<std::unique_ptr<IQnnNodeGroup>>& qnn_node_groups,
                                   /*out*/ std::vector<size_t>& sorted_qnn_node_group_indices,
                                   QnnModelWrapper& qnn_model_wrapper,
                                   const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
                                   const size_t num_node_units,
                                   const logging::Logger& logger) {
  const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer();
  // Bind by reference: the node order is only read here, so copying the whole index vector is wasted work.
  const std::vector<NodeIndex>& sorted_node_indices = graph_viewer.GetNodesInTopologicalOrder();

  sorted_qnn_node_group_indices.reserve(num_node_units);
  qnn_node_groups.reserve(num_node_units);

  std::unordered_map<const NodeUnit*, const IQnnNodeGroup*> node_unit_to_qnn_node_group;
  std::unordered_map<const IQnnNodeGroup*, size_t> fused_qnn_node_group_indices;
  std::vector<gsl::not_null<const NodeUnit*>> sorted_node_units;
  sorted_node_units.reserve(num_node_units);

  // Process just the fusions of NodeUnits first to ensure a correct topological order of all IQnnNodeGroups.
  // This is the same approach taken by ORT utilities for grouping Nodes into NodeUnits.
  for (NodeIndex node_index : sorted_node_indices) {
    gsl::not_null<const Node*> node = graph_viewer.GetNode(node_index);

    // Get the NodeUnit associated with the node.
    const auto node_unit_it = node_to_node_unit.find(node);
    ORT_RETURN_IF_NOT(node_unit_it != node_to_node_unit.end(), "Could not find NodeUnit for Node ", node->Name());
    gsl::not_null<const NodeUnit*> node_unit = node_unit_it->second;

    // Skip this node if it is not the NodeUnit's target node to ensure NodeUnits are visited in topological order.
    if (node != &node_unit->GetNode()) {
      continue;
    }

    sorted_node_units.push_back(node_unit);

    if (node_unit_to_qnn_node_group.count(node_unit) != 0) {
      continue;  // Already handled this node unit
    }

    std::unique_ptr<IQnnNodeGroup> fused_node_group = TryQnnFusions(qnn_model_wrapper, *node_unit,
                                                                    node_to_node_unit, node_unit_to_qnn_node_group,
                                                                    logger);

    if (fused_node_group) {
      // Remember where the fused group lives so the second pass can emit its index at the right position.
      const size_t index = qnn_node_groups.size();
      fused_qnn_node_group_indices[fused_node_group.get()] = index;

      for (const NodeUnit* fused_node_unit : fused_node_group->GetNodeUnits()) {
        assert(fused_node_unit != nullptr);
        node_unit_to_qnn_node_group.insert({fused_node_unit, fused_node_group.get()});
      }

      qnn_node_groups.push_back(std::move(fused_node_group));
    }
  }

  // Create IQnnNodeGroups for the leftover NodeUnits that were not fused.
  for (gsl::not_null<const NodeUnit*> node_unit : sorted_node_units) {
    const auto it = node_unit_to_qnn_node_group.find(node_unit);

    if (it != node_unit_to_qnn_node_group.end()) {
      // Already added this NodeUnit to a IQnnNodeGroup, so we'll skip it.
      // However, if this NodeUnit is the "target" for the IQnnNodeGroup, then add its index to
      // the sorted list of indices.
      gsl::not_null<const IQnnNodeGroup*> fused_qnn_node_group = it->second;
      if (node_unit == fused_qnn_node_group->GetTargetNodeUnit()) {
        sorted_qnn_node_group_indices.push_back(fused_qnn_node_group_indices[fused_qnn_node_group]);
      }
      continue;
    }

    const size_t index = qnn_node_groups.size();
    auto qnn_node_group = std::make_unique<QnnNodeUnitWrapper>(*node_unit);

    node_unit_to_qnn_node_group.insert({node_unit, qnn_node_group.get()});
    qnn_node_groups.push_back(std::move(qnn_node_group));
    sorted_qnn_node_group_indices.push_back(index);
  }

  assert(qnn_node_groups.size() == sorted_qnn_node_group_indices.size());

  return Status::OK();
}
// Builds the IQnnNodeGroups for the graph held by qnn_model_wrapper and stores them in qnn_node_groups
// in topological order. Any previous contents of qnn_node_groups are discarded. Errors reported by
// GetQnnNodeGroupsImpl are returned unchanged.
Status GetQnnNodeGroups(/*out*/ std::vector<std::unique_ptr<IQnnNodeGroup>>& qnn_node_groups,
                        QnnModelWrapper& qnn_model_wrapper,
                        const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
                        const size_t num_node_units,
                        const logging::Logger& logger) {
  std::vector<size_t> topological_order;
  std::vector<std::unique_ptr<IQnnNodeGroup>> unsorted_groups;
  ORT_RETURN_IF_ERROR(GetQnnNodeGroupsImpl(unsorted_groups, topological_order, qnn_model_wrapper,
                                           node_to_node_unit, num_node_units, logger));

  // Transfer ownership of each group to the output vector, following the topological order.
  qnn_node_groups.clear();
  qnn_node_groups.reserve(unsorted_groups.size());
  for (const size_t group_index : topological_order) {
    assert(group_index < unsorted_groups.size());
    qnn_node_groups.emplace_back(std::move(unsorted_groups[group_index]));
  }

  assert(qnn_node_groups.size() == topological_order.size());

  return Status::OK();
}
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,66 @@
#include "core/providers/qnn/builder/qnn_node_group/utils.h"
#include <gsl/gsl>
#include <string_view>
#include <unordered_map>
#include "core/graph/graph_viewer.h"
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
namespace onnxruntime {
namespace qnn {
// Returns the NodeUnit of the parent's single child when that child has one of the op types in
// child_op_types, is a standalone (SingleNode) NodeUnit, and is not yet part of an IQnnNodeGroup.
// Returns nullptr in every other case.
const NodeUnit* GetOnlyChildOfType(const GraphViewer& graph_viewer,
                                   const NodeUnit& parent_node_unit,
                                   gsl::span<const std::string_view> child_op_types,
                                   const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
                                   const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& qnn_node_group_map) {
  const Node& parent = parent_node_unit.GetNode();

  // The parent needs exactly one output edge ...
  if (parent.GetOutputEdgesCount() != 1) {
    return nullptr;
  }

  // ... and its output must not also be a graph output.
  if (graph_viewer.NodeProducesGraphOutput(parent)) {
    return nullptr;
  }

  const Node& child = parent.OutputEdgesBegin()->GetNode();
  if (graph_viewer.GetNode(child.Index()) == nullptr) {
    return nullptr;  // Node is not in this GraphViewer
  }

  // Scan the accepted op types for the child's op type.
  const std::string& child_op_type = child.OpType();
  auto type_it = child_op_types.begin();
  while (type_it != child_op_types.end() && !(*type_it == child_op_type)) {
    ++type_it;
  }

  if (type_it == child_op_types.end()) {
    return nullptr;
  }

  const auto unit_it = node_unit_map.find(&child);
  if (unit_it == node_unit_map.end()) {
    return nullptr;
  }

  const NodeUnit* child_unit = unit_it->second;

  // A child that already belongs to an IQnnNodeGroup is not available. This is not expected when the
  // calling fusion function runs in topological order, but it is checked to be safe.
  if (qnn_node_group_map.find(child_unit) != qnn_node_group_map.end()) {
    return nullptr;
  }

  // Only a standalone child qualifies (i.e., one that is not part of a QDQ NodeUnit).
  if (child_unit->UnitType() != NodeUnit::Type::SingleNode) {
    return nullptr;
  }

  return child_unit;
}
} // namespace qnn
} // namespace onnxruntime

View file

@ -0,0 +1,40 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include <gsl/gsl>
#include <string_view>
#include <unordered_map>
#include "core/graph/graph_viewer.h"
#include "core/framework/node_unit.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
namespace onnxruntime {
namespace qnn {
// Operator type names of the quantize and dequantize operators.
constexpr const char* QUANTIZE_LINEAR = "QuantizeLinear";
constexpr const char* DEQUANTIZE_LINEAR = "DequantizeLinear";
// NOTE(review): presumably the maximum number of inputs of a Q/DQ node and the positions of its
// scale and zero-point inputs -- confirm against the code that uses these constants.
constexpr size_t QDQ_MAX_NUM_INPUTS = 3;
constexpr size_t QDQ_SCALE_INPUT_IDX = 1;
constexpr size_t QDQ_ZERO_POINT_INPUT_IDX = 2;
/// <summary>
/// Utility function to get a child NodeUnit. The returned NodeUnit must be the parent's only child, must be
/// of the expected type, and must not be a part of another IQnnNodeGroup.
/// The parent must also not produce a graph output, and the child must be a standalone NodeUnit
/// (NodeUnit::Type::SingleNode) whose node is contained in the GraphViewer.
/// </summary>
/// <param name="graph_viewer">GraphViewer containing all Nodes</param>
/// <param name="parent_node_unit">Parent NodeUnit</param>
/// <param name="child_op_types">Valid child types</param>
/// <param name="node_unit_map">Maps a Node to its NodeUnit</param>
/// <param name="node_unit_to_qnn_node_group">Maps a NodeUnit to its IQnnNodeGroup.
/// Used to check that the child has not already been added to another IQnnNodeGroup.</param>
/// <returns>The child NodeUnit, or nullptr if any of the requirements above is not met.</returns>
const NodeUnit* GetOnlyChildOfType(const GraphViewer& graph_viewer,
const NodeUnit& parent_node_unit,
gsl::span<const std::string_view> child_op_types,
const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group);
} // namespace qnn
} // namespace onnxruntime

View file

@ -16,10 +16,10 @@
#include "core/platform/env.h"
#include "core/providers/common.h"
#include "core/providers/partitioning_utils.h"
#include "core/providers/qnn/builder/qnn_fusions.h"
#include "core/providers/partitioning_utils.h"
#include "core/providers/qnn/builder/qnn_model_wrapper.h"
#include "core/providers/qnn/builder/op_builder_factory.h"
#include "core/providers/qnn/builder/qnn_node_group.h"
#include "core/providers/qnn/builder/qnn_def.h"
#include "core/providers/qnn/builder/onnx_ctx_model_helper.h"
#include "core/framework/run_options.h"
@ -412,25 +412,35 @@ QNNExecutionProvider::~QNNExecutionProvider() {
#endif
}
bool QNNExecutionProvider::IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
const logging::Logger& logger) const {
const std::string& op_type = node_unit.OpType();
bool supported = false;
const auto* op_builder = qnn::GetOpBuilder(op_type);
if (op_builder == nullptr) {
LOGS(logger, WARNING) << "Operators of type `" << node_unit.OpType() << "` are not supported by QNN EP."
<< node_unit.OpType() << " node `" << node_unit.Name()
<< "` will not be assigned to QNN EP.";
} else {
auto status = op_builder->IsOpSupported(qnn_model_wrapper,
node_unit, logger);
if (Status::OK() != status) {
LOGS(logger, WARNING) << node_unit.OpType() << " node `" << node_unit.Name()
<< "` is not supported: " << status.ErrorMessage();
}
supported = (Status::OK() == status);
// Logs information about the supported/unsupported nodes.
static void LogNodeSupport(const logging::Logger& logger,
logging::Severity log_severity,
logging::DataType log_data_type,
const onnxruntime::CodeLocation& call_site,
const qnn::IQnnNodeGroup& qnn_node_group,
Status support_status) {
if (!logger.OutputIsEnabled(log_severity, log_data_type)) {
return;
}
return supported;
std::ostringstream oss;
oss << (support_status.IsOK() ? "Validation PASSED " : "Validation FAILED ") << "for nodes ("
<< qnn_node_group.Type() << "):" << std::endl;
for (const NodeUnit* node_unit : qnn_node_group.GetNodeUnits()) {
for (const Node* node : node_unit->GetAllNodesInGroup()) {
oss << "\tOperator type: " << node->OpType()
<< " Node name: " << node->Name()
<< " Node index: " << node->Index() << std::endl;
}
}
if (!support_status.IsOK()) {
oss << "\tREASON : " << support_status.ErrorMessage() << std::endl;
}
logging::Capture(logger, log_severity, logging::Category::onnxruntime,
log_data_type, call_site)
.Stream()
<< oss.str();
}
std::unordered_set<const Node*>
@ -469,68 +479,33 @@ QNNExecutionProvider::GetSupportedNodes(const GraphViewer& graph_viewer,
initializer_input_lookup,
qnn_backend_manager_->GetQnnBackendType());
std::unordered_set<const NodeUnit*> handled_node_units;
handled_node_units.reserve(node_unit_size);
std::vector<std::unique_ptr<qnn::IQnnNodeGroup>> qnn_node_groups;
qnn_node_groups.reserve(node_unit_size);
auto add_supported_nodes = [](std::unordered_set<const Node*>& supported_nodes, const NodeUnit* node_unit) {
for (const auto* node_in_group : node_unit->GetAllNodesInGroup()) {
supported_nodes.insert(node_in_group);
if (Status status = qnn::GetQnnNodeGroups(qnn_node_groups, qnn_model_wrapper,
node_unit_map, node_unit_size, logger);
!status.IsOK()) {
LOGS(logger, ERROR) << status.ErrorMessage();
return {};
}
for (const std::unique_ptr<qnn::IQnnNodeGroup>& qnn_node_group : qnn_node_groups) {
Status status = qnn_node_group->IsSupported(qnn_model_wrapper, logger);
const bool supported = status.IsOK();
constexpr auto log_severity = logging::Severity::kVERBOSE;
constexpr auto log_data_type = logging::DataType::SYSTEM;
if (logger.OutputIsEnabled(log_severity, log_data_type)) {
LogNodeSupport(logger, log_severity, log_data_type, ORT_WHERE, *qnn_node_group, status);
}
};
const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder();
for (size_t i = 0; i < node_indices.size(); i++) {
gsl::not_null<const onnxruntime::Node*> node(graph_viewer.GetNode(node_indices[i]));
// Get the node_unit associated with the node. Note that the node may not be the node_unit's target node.
const NodeUnit* node_unit = node_unit_map.at(node);
// Visiting 'nodes' in topological order does not guarantee that 'node_units' are
// also visited in topological order. Skip this node if it is not the node_unit's target node
// to ensure 'node_units' are visited in topological order.
if (node != &node_unit->GetNode()) {
continue;
}
if (handled_node_units.count(node_unit) != 0) {
continue; // Already handled this node unit
}
// Try to see if this node unit can be fused.
std::vector<const NodeUnit*> fused_nodes;
Status fusion_status = TryFusions(fused_nodes, qnn_model_wrapper, *node_unit, node_unit_map,
handled_node_units, logger, true /*do_op_validation*/);
if (!fusion_status.IsOK()) {
LOGS(logger, WARNING) << "Failed to apply fusion: " << fusion_status.ErrorMessage();
handled_node_units.insert(node_unit);
continue;
}
if (!fused_nodes.empty()) {
for (auto fused_node_unit : fused_nodes) {
handled_node_units.insert(fused_node_unit);
add_supported_nodes(supported_nodes, fused_node_unit);
}
continue;
}
// Couldn't fuse the node unit. See if it is supported by itself.
const bool supported = IsNodeSupported(qnn_model_wrapper, *node_unit, logger);
LOGS(logger, VERBOSE) << "Node supported: [" << supported
<< "] index: [" << node->Index()
<< "] name: [" << node->Name()
<< "] Operator type: [" << node->OpType()
<< "] as part of the NodeUnit type: [" << node_unit->OpType()
<< "] index: [" << node_unit->Index()
<< "] name: [" << node_unit->Name()
<< "]";
if (supported) {
add_supported_nodes(supported_nodes, node_unit);
for (const NodeUnit* node_unit : qnn_node_group->GetNodeUnits()) {
for (const Node* node : node_unit->GetAllNodesInGroup()) {
supported_nodes.insert(node);
}
}
}
handled_node_units.insert(node_unit);
}
return supported_nodes;

View file

@ -53,9 +53,6 @@ class QNNExecutionProvider : public IExecutionProvider {
Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override;
private:
bool IsNodeSupported(qnn::QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit,
const logging::Logger& logger) const;
std::unordered_set<const Node*> GetSupportedNodes(const GraphViewer& graph_viewer,
const std::unordered_map<const Node*, const NodeUnit*>& node_unit_map,
const size_t node_unit_size,

View file

@ -15,6 +15,12 @@
namespace onnxruntime {
namespace test {
// Information for activation node placed between the Conv and Q.
// The Conv test-case builders in this file take it as an optional argument; when it is set they add one
// extra node of type `op_type` that consumes the Conv output.
struct OutputActivationInfo {
std::string op_type; // Relu or Clip
// Values added, in order, as scalar initializer inputs of the activation node after the Conv output.
std::vector<float> const_inputs;
};
// Creates a graph with a single float32 Conv operator. Used for testing CPU backend.
static GetTestModelFn BuildF32ConvTestCase(const std::string& conv_op_type, const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
@ -23,9 +29,10 @@ static GetTestModelFn BuildF32ConvTestCase(const std::string& conv_op_type, cons
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
std::optional<int64_t> group,
const std::string& auto_pad = "NOTSET") {
const std::string& auto_pad = "NOTSET",
std::optional<OutputActivationInfo> output_activation = std::nullopt) {
return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
dilations, group, auto_pad](ModelTestBuilder& builder) {
dilations, group, auto_pad, output_activation](ModelTestBuilder& builder) {
std::vector<NodeArg*> conv_inputs = {
MakeTestInput(builder, input_def),
MakeTestInput(builder, weights_def)};
@ -34,9 +41,9 @@ static GetTestModelFn BuildF32ConvTestCase(const std::string& conv_op_type, cons
conv_inputs.push_back(MakeTestInput(builder, bias_def));
}
auto* output = builder.MakeOutput();
auto* conv_output = output_activation.has_value() ? builder.MakeIntermediate() : builder.MakeOutput();
Node& conv_node = builder.AddNode(conv_op_type, conv_inputs, {output});
Node& conv_node = builder.AddNode(conv_op_type, conv_inputs, {conv_output});
conv_node.AddAttribute("auto_pad", auto_pad);
if (group.has_value()) {
@ -54,6 +61,15 @@ static GetTestModelFn BuildF32ConvTestCase(const std::string& conv_op_type, cons
if (!dilations.empty()) {
conv_node.AddAttribute("dilations", dilations);
}
if (output_activation.has_value()) {
NodeArg* output = builder.MakeOutput();
std::vector<NodeArg*> activation_inputs = {conv_output};
for (auto val : output_activation->const_inputs) {
activation_inputs.push_back(builder.MakeScalarInitializer(val));
}
builder.AddNode(output_activation->op_type, activation_inputs, {output});
}
};
}
@ -88,19 +104,22 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef
// Creates a graph with a single Q/DQ Conv operator. Used for testing HTP backend.
template <typename ActivationQType, typename WeightQType>
static GetTestQDQModelFn<ActivationQType> BuildQDQConvTestCase(const std::string& conv_op_type,
const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
const TestInputDef<float>& bias_def,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
std::optional<int64_t> group,
const std::string& auto_pad = "NOTSET",
bool use_contrib_qdq = false) {
static GetTestQDQModelFn<ActivationQType> BuildQDQConvTestCase(
const std::string& conv_op_type,
const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
const TestInputDef<float>& bias_def,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
std::optional<int64_t> group,
const std::string& auto_pad = "NOTSET",
bool use_contrib_qdq = false,
std::optional<OutputActivationInfo> output_activation = std::nullopt) {
return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
dilations, group, auto_pad, use_contrib_qdq](ModelTestBuilder& builder,
std::vector<QuantParams<ActivationQType>>& output_qparams) {
dilations, group, auto_pad,
use_contrib_qdq, output_activation](ModelTestBuilder& builder,
std::vector<QuantParams<ActivationQType>>& output_qparams) {
std::vector<NodeArg*> conv_inputs;
// input -> Q/DQ ->
@ -144,27 +163,39 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQConvTestCase(const std::string
conv_node.AddAttribute("dilations", dilations);
}
AddQDQNodePairWithOutputAsGraphOutput<ActivationQType>(builder, conv_output, output_qparams[0].scale,
NodeArg* q_input = conv_output;
if (output_activation.has_value()) {
q_input = builder.MakeIntermediate();
std::vector<NodeArg*> activation_inputs = {conv_output};
for (auto val : output_activation->const_inputs) {
activation_inputs.push_back(builder.MakeScalarInitializer(val));
}
builder.AddNode(output_activation->op_type, activation_inputs, {q_input});
}
AddQDQNodePairWithOutputAsGraphOutput<ActivationQType>(builder, q_input, output_qparams[0].scale,
output_qparams[0].zero_point, use_contrib_qdq);
};
}
template <typename ActivationQType, typename WeightQType>
static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const std::string& conv_op_type,
const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
const TestInputDef<float>& bias_def,
int64_t weight_quant_axis,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
std::optional<int64_t> group,
const std::string& auto_pad = "NOTSET",
bool use_contrib_qdq = false) {
static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(
const std::string& conv_op_type,
const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
const TestInputDef<float>& bias_def,
int64_t weight_quant_axis,
const std::vector<int64_t>& strides,
const std::vector<int64_t>& pads,
const std::vector<int64_t>& dilations,
std::optional<int64_t> group,
const std::string& auto_pad = "NOTSET",
bool use_contrib_qdq = false,
std::optional<OutputActivationInfo> output_activation = std::nullopt) {
return [conv_op_type, input_def, weights_def, bias_def, strides, pads,
dilations, group, auto_pad, use_contrib_qdq,
weight_quant_axis](ModelTestBuilder& builder,
std::vector<QuantParams<ActivationQType>>& output_qparams) {
weight_quant_axis, output_activation](ModelTestBuilder& builder,
std::vector<QuantParams<ActivationQType>>& output_qparams) {
std::vector<NodeArg*> conv_inputs;
// input -> Q/DQ ->
@ -248,7 +279,17 @@ static GetTestQDQModelFn<ActivationQType> BuildQDQPerChannelConvTestCase(const s
conv_node.AddAttribute("dilations", dilations);
}
AddQDQNodePairWithOutputAsGraphOutput<ActivationQType>(builder, conv_output, output_qparams[0].scale,
NodeArg* q_input = conv_output;
if (output_activation.has_value()) {
q_input = builder.MakeIntermediate();
std::vector<NodeArg*> activation_inputs = {conv_output};
for (auto val : output_activation->const_inputs) {
activation_inputs.push_back(builder.MakeScalarInitializer(val));
}
builder.AddNode(output_activation->op_type, activation_inputs, {q_input});
}
AddQDQNodePairWithOutputAsGraphOutput<ActivationQType>(builder, q_input, output_qparams[0].scale,
output_qparams[0].zero_point, use_contrib_qdq);
};
}
@ -267,7 +308,8 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
ExpectedEPNodeAssignment expected_ep_assignment,
bool use_contrib_qdq = false,
int opset = 13,
QDQTolerance tolerance = QDQTolerance()) {
QDQTolerance tolerance = QDQTolerance(),
std::optional<OutputActivationInfo> output_activation = std::nullopt) {
ProviderOptions provider_options;
#if defined(_WIN32)
@ -277,10 +319,11 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
#endif
TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
group, auto_pad),
group, auto_pad, output_activation),
BuildQDQConvTestCase<ActivationQType, WeightQType>(conv_op_type, input_def, weights_def,
bias_def, strides, pads, dilations,
group, auto_pad, use_contrib_qdq),
group, auto_pad, use_contrib_qdq,
output_activation),
provider_options,
opset,
expected_ep_assignment,
@ -302,7 +345,8 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
ExpectedEPNodeAssignment expected_ep_assignment,
bool use_contrib_qdq = false,
int opset = 13,
QDQTolerance tolerance = QDQTolerance()) {
QDQTolerance tolerance = QDQTolerance(),
std::optional<OutputActivationInfo> output_activation = std::nullopt) {
ProviderOptions provider_options;
#if defined(_WIN32)
@ -312,11 +356,11 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
#endif
auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
group, auto_pad);
group, auto_pad, output_activation);
auto qdq_fn = BuildQDQPerChannelConvTestCase<ActivationQType, WeightQType>(conv_op_type, input_def, weights_def,
bias_def, weight_quant_axis, strides,
pads, dilations, group, auto_pad,
use_contrib_qdq);
use_contrib_qdq, output_activation);
TestQDQModelAccuracy(f32_fn, qdq_fn, provider_options, opset, expected_ep_assignment, tolerance);
}
@ -764,6 +808,140 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) {
21); // opset
}
// Test fusion of DQs -> Conv -> Relu/Clip -> Q.
// Uses per-tensor quantization.
TEST_F(QnnHTPBackendTests, ConvU8U8S32_ReluClipFusion) {
  std::vector<int64_t> input_shape = {1, 2, 4, 4};
  std::vector<int64_t> weight_shape = {3, 2, 2, 2};
  std::vector<int64_t> bias_shape = {3};

  TestInputDef<float> input_def(input_shape, false,
                                GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
  TestInputDef<float> weight_def(weight_shape, true,
                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
  TestInputDef<float> bias_def(bias_shape, true,
                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));
  const auto no_bias_def = TestInputDef<float>();

  // Conv settings shared by every case below.
  const std::vector<int64_t> strides = {1, 1};
  const std::vector<int64_t> pads = {0, 0, 0, 0};
  const std::vector<int64_t> dilations = {1, 1};
  const int64_t group = 1;  // default group
  const std::string auto_pad = "NOTSET";
  const bool use_contrib_qdq = false;

  // Relu takes no constant inputs.
  OutputActivationInfo relu_info = {"Relu", {}};

  // DQs -> Conv (w/ bias) -> Relu -> Q
  RunHTPConvOpTest<uint8_t, uint8_t>("Conv", input_def, weight_def, bias_def,
                                     strides, pads, dilations, group, auto_pad,
                                     ExpectedEPNodeAssignment::All,
                                     use_contrib_qdq,
                                     21,  // opset
                                     QDQTolerance(),
                                     relu_info);

  // DQs -> Conv (NO bias) -> Relu -> Q
  RunHTPConvOpTest<uint8_t, uint8_t>("Conv", input_def, weight_def, no_bias_def,
                                     strides, pads, dilations, group, auto_pad,
                                     ExpectedEPNodeAssignment::All,
                                     use_contrib_qdq,
                                     21,  // opset
                                     QDQTolerance(),
                                     relu_info);

  // DQs -> Conv (w/ bias) -> Clip -> Q
  // The two constant inputs are handed to the Clip node as scalar inputs; this case runs with opset 19.
  OutputActivationInfo clip_info = {"Clip", {0.0f, 2.0f}};
  RunHTPConvOpTest<uint8_t, uint8_t>("Conv", input_def, weight_def, bias_def,
                                     strides, pads, dilations, group, auto_pad,
                                     ExpectedEPNodeAssignment::All,
                                     use_contrib_qdq,
                                     19,  // opset
                                     QDQTolerance(),
                                     clip_info);

  // DQs -> Conv (NO bias) -> Clip -> Q
  // This case uses 16-bit activations with 8-bit weights.
  OutputActivationInfo clip_info_2 = {"Clip", {-6.0f, 6.0f}};
  RunHTPConvOpTest<uint16_t, uint8_t>("Conv", input_def, weight_def, no_bias_def,
                                      strides, pads, dilations, group, auto_pad,
                                      ExpectedEPNodeAssignment::All,
                                      use_contrib_qdq,
                                      21,  // opset
                                      QDQTolerance(),
                                      clip_info_2);
}
// Test fusion of DQs -> Conv -> Relu/Clip -> Q.
// Uses per-channel quantization for the weights.
TEST_F(QnnHTPBackendTests, ConvS8S8S32_PerChannel_ReluClipFusion) {
  std::vector<int64_t> input_shape = {1, 2, 4, 4};
  std::vector<int64_t> weight_shape = {3, 2, 2, 2};
  std::vector<int64_t> bias_shape = {3};

  TestInputDef<float> input_def(input_shape, false,
                                GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size()));
  TestInputDef<float> weight_def(weight_shape, true,
                                 GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size()));
  TestInputDef<float> bias_def(bias_shape, true,
                               GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size()));

  // Settings shared by both cases below.
  const int64_t weight_quant_axis = 0;
  const std::vector<int64_t> strides = {1, 1};
  const std::vector<int64_t> pads = {0, 0, 0, 0};
  const std::vector<int64_t> dilations = {1, 1};
  const int64_t group = 1;  // default group
  const std::string auto_pad = "NOTSET";
  const bool use_contrib_qdq = false;
  const int opset = 21;

  // DQs -> Conv (w/ bias) -> Relu -> Q
  OutputActivationInfo relu_info = {"Relu", {}};
  RunHTPConvOpPerChannelTest<int8_t, int8_t>("Conv", input_def, weight_def, bias_def,
                                             weight_quant_axis,
                                             strides, pads, dilations, group, auto_pad,
                                             ExpectedEPNodeAssignment::All,
                                             use_contrib_qdq,
                                             opset,
                                             QDQTolerance(),
                                             relu_info);

  // DQs -> Conv (w/ bias) -> Clip -> Q
  OutputActivationInfo clip_info = {"Clip", {0.0f, 6.0f}};
  RunHTPConvOpPerChannelTest<int8_t, int8_t>("Conv", input_def, weight_def, bias_def,
                                             weight_quant_axis,
                                             strides, pads, dilations, group, auto_pad,
                                             ExpectedEPNodeAssignment::All,
                                             use_contrib_qdq,
                                             opset,
                                             QDQTolerance(),
                                             clip_info);
}
// Test per-channel QDQ Conv with INT4 weights and a negative weight quantization axis that still points to dimension 0.
TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
@ -799,7 +977,7 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) {
// CPU EP (f32 model): 25.143 21.554 17.964 10.785 7.195 3.605 -3.574 -7.164 -10.753
// CPU EP (qdq model): 24.670 21.103 17.536 10.254 6.689 2.972 -4.161 -7.728 -10.700
// QNN EP (qdq model): 27.186 27.186 27.186 21.541 6.685 -8.022 -10.548 -10.548 -10.548
TEST_F(QnnHTPBackendTests, DISABLED_ConvU16S4S32_PerChannel_AccuracyIssue) {
TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_AccuracyIssue) {
std::vector<int64_t> input_shape = {1, 2, 4, 4};
std::vector<int64_t> weight_shape = {3, 2, 2, 2};
std::vector<int64_t> bias_shape = {3};
@ -835,7 +1013,8 @@ TEST_F(QnnHTPBackendTests, DISABLED_ConvU16S4S32_PerChannel_AccuracyIssue) {
"NOTSET",
ExpectedEPNodeAssignment::All,
false, // use_qdq_contrib_ops
21); // opset
21, // opset
QDQTolerance(0.005f));
}
// Test per-channel QDQ Conv is rejected with weight axis != 0

View file

@ -28,26 +28,25 @@ static GetTestModelFn BuildMatMulOpTestCase(const TestInputDef<float>& input1_de
// Returns a function that creates a graph with a QDQ MatMul operator.
template <typename Input0QType, typename Input1QType, typename OutputQType>
static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDef<float>& input1_def,
const TestInputDef<float>& input2_def,
static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDef<float>& input0_def,
const TestInputDef<float>& input1_def,
bool use_contrib_qdq) {
return [input1_def, input2_def, use_contrib_qdq](ModelTestBuilder& builder,
return [input0_def, input1_def, use_contrib_qdq](ModelTestBuilder& builder,
std::vector<QuantParams<OutputQType>>& output_qparams) {
// input1 -> Q -> DQ ->
NodeArg* input1 = MakeTestInput(builder, input1_def);
QuantParams<Input0QType> input1_qparams = GetTestInputQuantParams<Input0QType>(input1_def);
auto* input1_qdq = AddQDQNodePair<Input0QType>(builder, input1, input1_qparams.scale, input1_qparams.zero_point,
NodeArg* input0 = MakeTestInput(builder, input0_def);
QuantParams<Input0QType> input0_qparams = GetTestInputQuantParams<Input0QType>(input0_def);
auto* input0_qdq = AddQDQNodePair<Input0QType>(builder, input0, input0_qparams.scale, input0_qparams.zero_point,
use_contrib_qdq);
// input2 -> Q -> DQ ->
NodeArg* input2 = MakeTestInput(builder, input2_def);
QuantParams<Input1QType> input2_qparams = GetTestInputQuantParams<Input1QType>(input2_def);
auto* input2_qdq = AddQDQNodePair<Input1QType>(builder, input2, input2_qparams.scale, input2_qparams.zero_point,
// input1 -> Q -> DQ ->
NodeArg* input1 = MakeTestInput(builder, input1_def);
QuantParams<Input1QType> input1_qparams = GetTestInputQuantParams<Input1QType>(input1_def);
auto* input1_qdq = AddQDQNodePair<Input1QType>(builder, input1, input1_qparams.scale, input1_qparams.zero_point,
use_contrib_qdq);
// MatMul
auto* op_output = builder.MakeIntermediate();
builder.AddNode("MatMul", {input1_qdq, input2_qdq}, {op_output});
builder.AddNode("MatMul", {input0_qdq, input1_qdq}, {op_output});
// op_output -> Q -> DQ -> output
AddQDQNodePairWithOutputAsGraphOutput<OutputQType>(builder, op_output, output_qparams[0].scale,
@ -55,6 +54,88 @@ static GetTestQDQModelFn<OutputQType> BuildMatMulOpQDQTestCase(const TestInputDe
};
}
// Returns a function that creates a graph with a QDQ MatMul operator whose second input is a statically
// quantized weight initializer that uses per-channel quantization along `weight_quant_axis`.
template <typename Input0QType, typename WeightQType, typename OutputQType>
static GetTestQDQModelFn<OutputQType> BuildQDQPerChannelMatMulTestCase(const TestInputDef<float>& input_def,
                                                                       const TestInputDef<float>& weights_def,
                                                                       int64_t weight_quant_axis,
                                                                       bool use_contrib_qdq = false) {
  return [input_def, weights_def, weight_quant_axis,
          use_contrib_qdq](ModelTestBuilder& builder,
                           std::vector<QuantParams<OutputQType>>& output_qparams) {
    // input -> Q/DQ ->
    NodeArg* activation = MakeTestInput(builder, input_def);
    QuantParams<Input0QType> activation_qparams = GetTestInputQuantParams<Input0QType>(input_def);
    auto* activation_qdq = AddQDQNodePair<Input0QType>(builder, activation, activation_qparams.scale,
                                                       activation_qparams.zero_point, use_contrib_qdq);

    // Quantized(weights) -> DQ ->
    ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData());
    TensorShape weights_shape = weights_def.GetTensorShape();

    // A negative axis counts from the last dimension; convert it to a non-negative axis.
    const int64_t normalized_axis =
        weight_quant_axis < 0 ? weight_quant_axis + static_cast<int64_t>(weights_shape.NumDimensions())
                              : weight_quant_axis;

    std::vector<float> weight_scales;
    std::vector<WeightQType> weight_zero_points;
    GetTestInputQuantParamsPerChannel<WeightQType>(weights_def, weight_scales, weight_zero_points,
                                                   static_cast<size_t>(normalized_axis), true);

    // The 4-bit types need one storage element per pair of values; other types need one per value.
    constexpr bool is_int4_weight = std::is_same_v<WeightQType, Int4x2> || std::is_same_v<WeightQType, UInt4x2>;
    const size_t num_storage_elems = is_int4_weight ? Int4x2::CalcNumInt4Pairs(weights_shape.Size())
                                                    : static_cast<size_t>(weights_shape.Size());

    std::vector<WeightQType> quantized_weights(num_storage_elems);
    QuantizeValues<float, WeightQType>(weights_def.GetRawData(), quantized_weights, weights_shape,
                                       weight_scales, weight_zero_points, normalized_axis);

    NodeArg* weights_initializer = builder.MakeInitializer<WeightQType>(weights_def.GetShape(), quantized_weights);
    NodeArg* weights_dq = builder.MakeIntermediate();
    Node& weights_dq_node = builder.AddDequantizeLinearNode<WeightQType>(weights_initializer, weight_scales,
                                                                         weight_zero_points, weights_dq,
                                                                         nullptr, use_contrib_qdq);
    // The attribute keeps the axis value exactly as the caller passed it.
    weights_dq_node.AddAttribute("axis", weight_quant_axis);

    // MatMul
    const std::vector<NodeArg*> matmul_inputs = {activation_qdq, weights_dq};
    auto* matmul_output = builder.MakeIntermediate();
    builder.AddNode("MatMul", matmul_inputs, {matmul_output});

    // matmul_output -> Q -> DQ -> output
    AddQDQNodePairWithOutputAsGraphOutput<OutputQType>(builder, matmul_output, output_qparams[0].scale,
                                                       output_qparams[0].zero_point, use_contrib_qdq);
  };
}
// Runs a QDQ per-channel MatMul model on the QNN HTP backend. Checks the graph node assignment, and that the
// QDQ model is accurate on QNN EP (compared to CPU EP).
template <typename Input0QType, typename WeightQType, typename OutputQType>
static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef<float>& input_def,
const TestInputDef<float>& weights_def,
int64_t weight_quant_axis,
ExpectedEPNodeAssignment expected_ep_assignment,
int opset = 21,
bool use_contrib_qdq = false,
QDQTolerance tolerance = QDQTolerance()) {
ProviderOptions provider_options;
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
TestQDQModelAccuracy(BuildMatMulOpTestCase(input_def, weights_def),
BuildQDQPerChannelMatMulTestCase<Input0QType, WeightQType, OutputQType>(input_def,
weights_def,
weight_quant_axis,
use_contrib_qdq),
provider_options,
opset,
expected_ep_assignment,
tolerance);
}
// Runs a MatMul model on the QNN CPU backend. Checks the graph node assignment, and that inference
// outputs for QNN and CPU match.
static void RunMatMulOpOpTest(const TestInputDef<float>& input1_def,
@ -160,6 +241,55 @@ TEST_F(QnnHTPBackendTests, MatMulOp_HTP_A16_W8Static) {
true); // Use com.microsoft Q/DQ ops
}
// QDQ per-channel MatMul on HTP: uint16 activations and output, signed 4-bit (Int4x2) static weights.
// Every node is expected to be assigned to the QNN EP.
TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightInt4) {
  constexpr int64_t kWeightQuantAxis = 1;
  constexpr int kOpset = 21;
  constexpr bool kUseContribQDQ = false;

  const std::vector<float> activation_values = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
  const std::vector<float> weight_values = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};

  const TestInputDef<float> activation_def({1, 1, 2, 3}, false, activation_values);
  const TestInputDef<float> weights_def({1, 1, 3, 2}, true, weight_values);

  RunQDQPerChannelMatMulOpOpTest<uint16_t, Int4x2, uint16_t>(activation_def,
                                                             weights_def,
                                                             kWeightQuantAxis,
                                                             ExpectedEPNodeAssignment::All,
                                                             kOpset,
                                                             kUseContribQDQ);
}
// QDQ per-channel MatMul on HTP: uint16 activations and output, unsigned 4-bit (UInt4x2) static weights.
// Every node is expected to be assigned to the QNN EP.
TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightUInt4) {
  constexpr int64_t kWeightQuantAxis = 1;
  constexpr int kOpset = 21;
  constexpr bool kUseContribQDQ = false;

  const std::vector<float> activation_values = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
  const std::vector<float> weight_values = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};

  const TestInputDef<float> activation_def({1, 1, 2, 3}, false, activation_values);
  const TestInputDef<float> weights_def({1, 1, 3, 2}, true, weight_values);

  RunQDQPerChannelMatMulOpOpTest<uint16_t, UInt4x2, uint16_t>(activation_def,
                                                              weights_def,
                                                              kWeightQuantAxis,
                                                              ExpectedEPNodeAssignment::All,
                                                              kOpset,
                                                              kUseContribQDQ);
}
// QDQ per-channel MatMul on HTP: int8 activations and output, signed 4-bit (Int4x2) static weights.
// Runs with an explicit QDQTolerance of 0.007 instead of the default tolerance.
TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_AS8_WeightInt4) {
  constexpr int64_t kWeightQuantAxis = 1;
  constexpr int kOpset = 21;
  constexpr bool kUseContribQDQ = false;

  const std::vector<float> activation_values = GetFloatDataInRange(-5.0f, 5.0f, 6);
  const std::vector<float> weight_values = {-2.0f, -1.0f, -0.5f, 0.0f, 1.0f, 2.0f};

  const TestInputDef<float> activation_def({1, 1, 2, 3}, false, activation_values);
  const TestInputDef<float> weights_def({1, 1, 3, 2}, true, weight_values);

  RunQDQPerChannelMatMulOpOpTest<int8_t, Int4x2, int8_t>(activation_def,
                                                         weights_def,
                                                         kWeightQuantAxis,
                                                         ExpectedEPNodeAssignment::All,
                                                         kOpset,
                                                         kUseContribQDQ,
                                                         QDQTolerance(0.007f));
}
// QDQ per-channel MatMul on HTP: uint16 activations and output, int8 static weights.
// Every node is expected to be assigned to the QNN EP.
TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightInt8) {
  constexpr int64_t kWeightQuantAxis = 1;
  constexpr int kOpset = 21;
  constexpr bool kUseContribQDQ = false;

  const std::vector<float> activation_values = {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f};
  const std::vector<float> weight_values = {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f};

  const TestInputDef<float> activation_def({1, 1, 2, 3}, false, activation_values);
  const TestInputDef<float> weights_def({1, 1, 3, 2}, true, weight_values);

  RunQDQPerChannelMatMulOpOpTest<uint16_t, int8_t, uint16_t>(activation_def,
                                                             weights_def,
                                                             kWeightQuantAxis,
                                                             ExpectedEPNodeAssignment::All,
                                                             kOpset,
                                                             kUseContribQDQ);
}
// Test QDQ MatMul with uint16 activation uint16 weights, both dynamic
// Inaccuracy detected for output 'output_0', element 1.
// Output quant params: scale=0.0015259021893143654, zero_point=0.