mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Enable ClipQuantFusion exclusively on CPU EP (#20627)
### Motivation and Context

The Intel NPU does not support 16-bit integer quantized operators. Consequently, the Intel execution provider removes the QuantizeLinear/DequantizeLinear (Q/DQ) operators from node units and executes the operation as FP16 in the backend. However, if a Clip operator has been fused into a Q operator in the node unit, removing the Q/DQ operators produces inaccurate results, because the effect of the original Clip operator is lost.

Consider the following example:

- FP32 model: -> Op_FP32 -> Clip ->
- QDQ model: -> (DQ -> Op_FP32 -> Q) -> (DQ' -> Clip -> Q') ->
- After ClipQuantFusion: -> (DQ -> Op_FP32 -> Q) -> (DQ' -> Q') ->
- Intel execution provider strips Q/DQ: -> Op_FP16 ->

To solve this issue, ClipQuantFusion is now enabled exclusively on the CPU execution provider.
This commit is contained in:
parent
4fe565a62a
commit
49d197a8e6
3 changed files with 5 additions and 4 deletions
|
|
@ -132,14 +132,13 @@ InlinedVector<std::unique_ptr<RewriteRule>> GenerateRewriteRules(
|
|||
rules.push_back(std::make_unique<ConvBNFusion>());
|
||||
rules.push_back(std::make_unique<PadFusion>());
|
||||
rules.push_back(std::make_unique<MatmulBNFusion>());
|
||||
rules.push_back(std::make_unique<ClipQuantFusion>());
|
||||
rules.push_back(std::make_unique<ReluQuantFusion>());
|
||||
rules.push_back(std::make_unique<LabelEncoderFusion>());
|
||||
break;
|
||||
|
||||
case TransformerLevel::Level2:
|
||||
rules.push_back(std::make_unique<ClipQuantFusion>());
|
||||
rules.push_back(std::make_unique<GemmTransposeFusion>());
|
||||
// No level2 rules available today
|
||||
break;
|
||||
|
||||
case TransformerLevel::Level3:
|
||||
|
|
|
|||
|
|
@ -83,13 +83,15 @@ static bool GetQConstantLowerUpper(const Graph& graph, const Node& node, float&
|
|||
|
||||
bool ClipQuantFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& /*logger*/) const {
|
||||
if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Clip", {1, 6, 11, 12, 13}) ||
|
||||
!graph_utils::IsSupportedProvider(node, {kCpuExecutionProvider}) ||
|
||||
!optimizer_utils::CheckOutputEdges(graph, node, 1)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if Clip is followed by QuantizeLinear, it can be fused into QuantizeLinear potentially
|
||||
const auto& next_node = *node.OutputNodesBegin();
|
||||
if (!QDQ::MatchQNode(next_node)) {
|
||||
if (!graph_utils::IsSupportedProvider(next_node, {kCpuExecutionProvider}) ||
|
||||
!QDQ::MatchQNode(next_node)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -2565,7 +2565,7 @@ TEST(QDQTransformerTests, Clip) {
|
|||
|
||||
TransformerTester(build_test_case, check_clip_graph,
|
||||
TransformerLevel::Default,
|
||||
TransformerLevel::Level1,
|
||||
TransformerLevel::Level2,
|
||||
opset_version,
|
||||
epsilon,
|
||||
epsilon);
|
||||
|
|
|
|||
Loading…
Reference in a new issue