[QNN EP] Make offloading graph input/output quantization (to CPU) the default (#23368)

Makes the QNN provider option `offload_graph_io_quantization` enabled by
default. It was previously disabled by default.

Enabling this option significantly decreases inference latency for many
models.
This commit is contained in:
Adrian Lizarraga 2025-02-04 11:42:46 -08:00 committed by adrianlizarraga
parent 378714bfcb
commit 9b3df90ee8
35 changed files with 102 additions and 9 deletions

View file

@ -3660,8 +3660,8 @@ struct OrtApi {
* - "1": Enabled.
* "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
* execution provider (typically CPU EP).
* - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
* - "1": Enabled.
* - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
* - "1": Enabled. This is the default value.
*
* SNPE supported keys:
* "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",

View file

@ -377,13 +377,15 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_weight_sharing: " << enable_htp_weight_sharing_;
}
model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", false,
model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", true,
provider_options_map);
if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
LOGS_DEFAULT(WARNING) << "Fallback to CPU EP is disabled, but user configured QNN EP to offload graph I/O "
<< "quantization/dequantization to another EP. Session creation will fail if the CPU EP "
<< "handles the graph I/O quantization/dequantization.";
LOGS_DEFAULT(INFO) << "Fallback to CPU EP is disabled, but user tried to configure QNN EP to offload graph I/O "
<< "quantization/dequantization to another EP. These are conflicting options. Fallback to CPU "
<< "EP will remain disabled and graph I/O quantization/dequantization will not be offloaded "
<< "to another EP.";
model_settings_.offload_graph_io_quantization = false;
}
qnn_backend_manager_ = std::make_unique<qnn::QnnBackendManager>(

View file

@ -70,6 +70,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs), // baseline float32 model
BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs), // QDQ model

View file

@ -31,6 +31,7 @@ static void RunAveragePoolOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
provider_options,
@ -53,6 +54,7 @@ static void RunQDQAveragePoolOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
BuildQDQOpTestCase<QuantType>(op_type, input_defs, {}, attrs),

View file

@ -160,6 +160,7 @@ static void RunBatchNormQDQTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Runs model with DQ-> BatchNormalization -> Q and compares the outputs of the CPU and QNN EPs.
TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def),
@ -180,6 +181,7 @@ static void RunBatchNormFP16Test(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestInputDef<MLFloat16> input_fp16_def = ConvertToFP16InputDef(input_def);
TestInputDef<MLFloat16> scale_fp16_def = ConvertToFP16InputDef(scale_def);

View file

@ -57,6 +57,7 @@ static void RunCastOpTest(const std::vector<int64_t>& shape, ONNX_NAMESPACE::Ten
#else
provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
if (use_htp && enable_fp16_precision) {
provider_options["enable_htp_fp16_precision"] = "1";

View file

@ -117,6 +117,7 @@ static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {input_def}, {min_max_defs}, {});
auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {},
@ -205,6 +206,7 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(model_fn,
provider_options,

View file

@ -93,6 +93,8 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto build_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads,
dilations, group, auto_pad);
RunQnnModelTest(build_fn,
@ -317,6 +319,7 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
group, auto_pad, output_activation),
@ -354,6 +357,7 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
group, auto_pad, output_activation);
@ -665,6 +669,7 @@ TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto BuildConvMulGraph = [](ModelTestBuilder& builder) {
// DQ node for Conv input

View file

@ -101,6 +101,7 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs);
auto qdq_model_builder = BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs, kOnnxDomain, use_contrib_qdq);
@ -172,6 +173,7 @@ TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(model_fn,
provider_options,

View file

@ -67,6 +67,7 @@ static void RunCPUGatherElemsOpTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
provider_options,
@ -91,6 +92,7 @@ static void RunHTPQDQGatherElemsOpTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float, IndexType>("GatherElements", {input_def}, {indices_def}, attrs);
auto qdq_model_builder = BuildQDQGatherElemsTestCase<QuantType, IndexType>(input_def, indices_def, attrs,
@ -119,6 +121,7 @@ static void RunHTPGatherElemsOpTest(const TestInputDef<DataType>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
provider_options,

View file

@ -63,6 +63,7 @@ static void RunQDQGatherOpTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float, IndicesType>("Gather", {input_def}, {indices_def}, attrs);
auto qdq_model_builder = BuildQDQGatherTestCase<QuantType, IndicesType>(input_def, indices_def, attrs,
@ -152,4 +153,4 @@ TEST_F(QnnHTPBackendTests, DISABLED_GatherOp_IndicesStaticInt32_Axis1) {
} // namespace test
} // namespace onnxruntime
#endif
#endif

View file

@ -29,6 +29,7 @@ static void RunGemmTestOnCPU(const std::vector<TestInputDef<DataType>>& input_de
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<float>("Gemm", input_defs, {}, attrs),
provider_options,
@ -246,6 +247,8 @@ static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_de
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float>("Gemm", input_defs, {}, attrs);
auto qdq_model_builder = BuildQDQGemmTestCase<InputAQType, InputBQType>(input_defs, attrs, use_contrib_qdq);
TestQDQModelAccuracy<InputAQType>(f32_model_builder,

View file

@ -79,6 +79,7 @@ static void RunInstanceNormQDQTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
TestQDQModelAccuracy(BuildOpTestCase<float>("InstanceNormalization", {input_def, scale_def, bias_def}, {}, attrs),

View file

@ -28,6 +28,7 @@ static void RunLayerNormCpuTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, {}, attrs),
provider_options,
@ -152,6 +153,7 @@ static void RunLayerNormQDQTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, {}, attrs),
BuildQDQLayerNormTestCase<InputQType, ScaleQType>(input_def, scale_def, bias_def, attrs,

View file

@ -28,6 +28,7 @@ static void RunLeakyReluOpQDQTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>("LeakyRelu", {input_def}, {}, attrs),
BuildQDQOpTestCase<QuantType>("LeakyRelu", {input_def}, {}, attrs),
@ -66,6 +67,7 @@ TEST_F(QnnHTPBackendTests, LeakyReluFP16OpSet16) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto input_def = TestInputDef<float>({1, 2, 3}, false, {-40.0f, -20.0f, 1.0f, 10.0f, 30.0f, 40.0f});
TestInputDef<MLFloat16> input_fp16_def = ConvertToFP16InputDef(input_def);

View file

@ -73,6 +73,7 @@ static void RunCPULogicalOpTest(const std::string& op_type, const std::vector<in
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildLogicalOpTestCase(op_type, shape),
provider_options,
@ -92,6 +93,7 @@ static void RunQDQLogicalOpTest(const std::string& op_type, const std::vector<in
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildQDQLogicalOpTestCase<QuantType>(op_type, shape),
provider_options,
@ -157,6 +159,7 @@ TEST_F(QnnHTPBackendTests, EqualToCast4D) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Model building function that creates a QDQ graph with an Equal node followed by
// a Cast to float32.

View file

@ -70,6 +70,7 @@ static void RunCPULRNOpTest(const TestInputDef<float>& input_def, int64_t size,
provider_options["backend_path"] = "libQnnCpu.so";
fp32_abs_err = 1.5e-5f; // On linux we need slightly larger tolerance.
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildLRNTestCase(input_def, size, alpha, beta, bias),
provider_options,
@ -91,6 +92,7 @@ static void RunQDQLRNOpTest(const TestInputDef<float>& input_def, int64_t size,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildLRNTestCase(input_def, size, alpha, beta, bias),
BuildQDQLRNTestCase<QuantType>(input_def, size, alpha, beta, bias),

View file

@ -125,6 +125,7 @@ static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
if (enable_fp16_precision) {
provider_options["enable_htp_fp16_precision"] = "1";
@ -178,6 +179,7 @@ static void RunQDQMatMulOpOpTest(const TestInputDef<float>& input1_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildMatMulOpTestCase(input1_def, input2_def),
BuildMatMulOpQDQTestCase<Input0QType, Input1QType, OutputQType>(input1_def, input2_def,

View file

@ -26,6 +26,7 @@ static void RunCPUMinOrMaxOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, {}, kOnnxDomain),
provider_options,
@ -47,6 +48,7 @@ static void RunQDQMinOrMaxOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, {}, kOnnxDomain), // baseline float32 model
BuildQDQOpTestCase<QType>(op_type, input_defs, {}, {}, kOnnxDomain), // QDQ model

View file

@ -116,6 +116,7 @@ static void RunPadOpTest(const TestInputDef<float>& data_def,
provider_options["backend_path"] = "libQnnCpu.so";
#endif
}
provider_options["offload_graph_io_quantization"] = "0";
if (enable_fp16_precision) {
provider_options["enable_htp_fp16_precision"] = "1";
@ -144,6 +145,7 @@ static void RunQDQPadOpTest(const TestInputDef<float>& data_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs),
BuildPadQDQTestCase<QuantType>(data_def, pads_def, constant_value_def, attrs,

View file

@ -60,6 +60,7 @@ static void RunPoolOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
provider_options,
@ -83,6 +84,7 @@ static void RunQDQPoolOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
BuildPoolQDQTestCase<QuantType>(op_type, input_def, attrs, use_contrib_qdq_ops),

View file

@ -117,6 +117,7 @@ TEST(QnnEP, TestDisableCPUFallback_ModelNotFullySupported) {
#else
options["backend_path"] = "libQnnCpu.so";
#endif
options["offload_graph_io_quantization"] = "0";
so.AppendExecutionProvider("QNN", options);
@ -148,6 +149,7 @@ TEST(QnnEP, TestDisableCPUFallback_TryingToRunOnQnnCPU) {
#else
options["backend_path"] = "libQnnCpu.so";
#endif
options["offload_graph_io_quantization"] = "0";
auto input_defs = {TestInputDef<float>({1, 2, 2, 2}, false, -10.0f, 10.0f),
TestInputDef<float>({1, 2, 2, 2}, false, -10.0f, 10.0f)};
@ -196,6 +198,7 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) {
#else
options["backend_path"] = "libQnnCpu.so";
#endif
options["offload_graph_io_quantization"] = "0";
so.AppendExecutionProvider("QNN", options);
@ -226,6 +229,7 @@ TEST_F(QnnHTPBackendTests, TestConvWithExternalData) {
#else
options["backend_path"] = "libQnnHtp.so";
#endif
options["offload_graph_io_quantization"] = "0";
so.AppendExecutionProvider("QNN", options);
@ -301,6 +305,7 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo
so.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
onnxruntime::ProviderOptions options;
options["offload_graph_io_quantization"] = "0";
#if defined(_WIN32)
options["backend_path"] = use_htp ? "QnnHtp.dll" : "QnnCpu.dll";
@ -591,6 +596,7 @@ TEST_F(QnnHTPBackendTests, MultithreadSessionRun) {
#else
options["backend_path"] = "libQnnHtp.so";
#endif
options["offload_graph_io_quantization"] = "0";
auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts);
EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK());
@ -640,6 +646,7 @@ TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgSessionRunOption) {
#else
options["backend_path"] = "libQnnHtp.so";
#endif
options["offload_graph_io_quantization"] = "0";
auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts);
EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK());
@ -705,6 +712,7 @@ TEST_F(QnnHTPBackendTests, MultithreadDefaultHtpPowerCfgFromEpOption) {
#else
options["backend_path"] = "libQnnHtp.so";
#endif
options["offload_graph_io_quantization"] = "0";
options["htp_performance_mode"] = "burst";
auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts);
@ -756,6 +764,7 @@ TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgDefaultAndRunOption) {
#else
options["backend_path"] = "libQnnHtp.so";
#endif
options["offload_graph_io_quantization"] = "0";
options["htp_performance_mode"] = "burst";
auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts);
@ -920,6 +929,7 @@ TEST_F(QnnHTPBackendTests, ProfilingTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
provider_options["enable_htp_fp16_precision"] = "1";
provider_options["profiling_level"] = "detailed";
provider_options["profiling_file_path"] = "detailed_profile.csv";
@ -940,6 +950,7 @@ TEST_F(QnnHTPBackendTests, CastAddHTPAccuracyTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildCastAddTestCase(),
provider_options,
@ -1010,6 +1021,7 @@ TEST_F(QnnHTPBackendTests, EPRejectsDynamicShapesF32) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
provider_options["enable_htp_fp16_precision"] = "1"; // QNN EP will use fp16 precision.
// CPU EP will use fp32, so we can relax accuracy requirements.

View file

@ -83,6 +83,7 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};
@ -225,6 +226,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};
@ -275,6 +277,7 @@ TEST_F(QnnHTPBackendTests, QnnContextGeneration2InputsOrderIssue) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Add kMSDomain to cover contrib op like Gelu
const std::unordered_map<std::string, int> domain_to_version = {{"", 13}, {kMSDomain, 1}};
@ -311,6 +314,7 @@ TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
std::string node_name_prefix = "node_name_prefix_test";
// Add kMSDomain to cover contrib op like Gelu
@ -353,6 +357,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::string context_binary_file = "./qnn_context_binary_test.onnx";
std::remove(context_binary_file.c_str());
@ -401,6 +406,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheNonEmbedModeTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::string context_binary_file = "./testdata/qnn_context_cache_non_embed.onnx";
std::string qnn_ctx_bin = "./testdata/qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin";
@ -482,6 +488,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_InvalidGraph) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx";
std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin";
std::remove(context_binary_file.c_str());
@ -579,6 +586,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryRelativePathTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
@ -609,6 +617,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryAbsolutePathTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
@ -634,6 +643,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileNotExistTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
@ -659,6 +669,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileEmptyStringTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options)));
ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(model_data.size())));
@ -676,6 +687,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::string context_binary_file = "./qnn_context_binary_2inputs_test.onnx";
std::remove(context_binary_file.c_str());
@ -727,6 +739,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx";
std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin";
std::remove(context_binary_file.c_str());
@ -804,6 +817,7 @@ TEST_F(QnnHTPBackendTests, QnnMultiContextEmbeded) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
Ort::SessionOptions so;
so.AppendExecutionProvider("QNN", provider_options);
@ -819,6 +833,7 @@ TEST_F(QnnHTPBackendTests, QnnMultiContextExternal) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
Ort::SessionOptions so;
so.AppendExecutionProvider("QNN", provider_options);
@ -950,6 +965,7 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions1) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Create QDQ models
std::vector<std::string> onnx_model_paths{"./weight_share1.onnx", "./weight_share2.onnx"};
@ -1047,6 +1063,7 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions2) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Create QDQ models
std::vector<std::string> onnx_model_paths{"./weight_share21.onnx", "./weight_share22.onnx"};

View file

@ -278,7 +278,7 @@ static BackendSupport GetHTPSupport(const onnxruntime::logging::Logger& logger)
MockKernelLookup kernel_lookup;
onnxruntime::GraphViewer graph_viewer(graph);
std::unique_ptr<onnxruntime::IExecutionProvider> qnn_ep = QnnExecutionProviderWithOptions(
{{"backend_path", "QnnHtp.dll"}});
{{"backend_path", "QnnHtp.dll"}, {"offload_graph_io_quantization", "0"}});
qnn_ep->SetLogger(&logger);
auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup);
@ -341,7 +341,7 @@ static BackendSupport GetCPUSupport(const onnxruntime::logging::Logger& logger)
MockKernelLookup kernel_lookup;
onnxruntime::GraphViewer graph_viewer(graph);
std::unique_ptr<onnxruntime::IExecutionProvider> qnn_ep = QnnExecutionProviderWithOptions(
{{"backend_path", "QnnCpu.dll"}});
{{"backend_path", "QnnCpu.dll"}, {"offload_graph_io_quantization", "0"}});
qnn_ep->SetLogger(&logger);
auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup);

View file

@ -82,6 +82,7 @@ static void RunReduceTest(const std::string& op_type,
float fp32_abs_err = 1e-5f,
bool enable_fp16 = false) {
ProviderOptions provider_options;
provider_options["offload_graph_io_quantization"] = "0";
if (enable_fp16) {
#if defined(_WIN32)
provider_options["backend_path"] = "QnnHtp.dll";
@ -401,6 +402,7 @@ static void RunReduceOpQDQTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
constexpr bool noop_with_empty_axes = false;
const bool axes_as_input = ReduceOpHasAxesInput(op_type, opset); // Later opsets have "axes" as an input.

View file

@ -30,6 +30,7 @@ static void RunReshapeExpandTestOnCPU(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<DataType, int64_t>(op_type, {input_def}, {shape_def}, attrs),
provider_options,
@ -161,6 +162,7 @@ static void RunReshapeExpandTestOnHTP(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<DataType, int64_t>(op_type, {input_def}, {shape_def}, attrs),
provider_options,
@ -185,6 +187,7 @@ static void RunQDQReshapeExpandTestOnHTP(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float, int64_t>(op_type, {input_def}, {shape_def}, attrs);
auto qdq_model_builder = BuildQDQReshapeExpandTestCase<QType>(op_type, input_def, shape_def, attrs, use_contrib_qdq);

View file

@ -127,6 +127,7 @@ static void RunCPUResizeOpTest(const TestInputDef<float>& input_def, const std::
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(GetResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode),
provider_options,
@ -145,6 +146,7 @@ static void RunCPUResizeOpTestWithScales(const TestInputDef<float>& input_def, c
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(GetResizeModelBuilderWithScales(input_def, scales_data, mode, coordinate_transformation_mode, nearest_mode),
provider_options,
@ -166,6 +168,7 @@ static void RunQDQResizeOpTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(GetResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode),
GetQDQResizeModelBuilder<QuantType>(input_def, sizes_data, mode, coordinate_transformation_mode,

View file

@ -32,6 +32,7 @@ static void RunOpTestOnCPU(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnCpu.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(BuildOpTestCase<InputType>(op_type, input_defs, {}, attrs, op_domain),
provider_options,
@ -129,6 +130,7 @@ static void RunQDQOpTest(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs, op_domain),
BuildQDQOpTestCase<InputQType>(op_type, input_defs, {}, attrs, op_domain, use_contrib_qdq),
@ -780,6 +782,7 @@ TEST_F(QnnHTPBackendTests, QuantAccuracyTest) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Note: a graph input -> Q -> DQ -> is optimized by Qnn to have a perfectly accurate output.
// ORT's CPU EP, on the other hand, actually quantizes and dequantizes the input, which leads to different outputs.
@ -1206,6 +1209,7 @@ TEST_F(QnnHTPBackendTests, Add_U8_U16_Convert) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
TestQDQModelAccuracy(BuildOpTestCase<float>("Add", {input0_def, input1_def}, {}, {}, kOnnxDomain),
BuildQDQConvertAddTestCase(input0_def, input1_def),
@ -1271,6 +1275,7 @@ TEST_F(QnnHTPBackendTests, DQ_Q_ConvertFusion_SameType) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
QuantParams<uint8_t> out_qparams_u8 = {1.0f, 128};
QuantParams<uint16_t> out_qparams_u16 = {1.0f, 32768};

View file

@ -84,6 +84,7 @@ static void RunSliceQDQTest(const TestInputDef<float>& data_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const std::vector<TestInputDef<float>> f32_inputs = {data_def};
const std::vector<TestInputDef<int64_t>> int64_inputs = {starts_def, ends_def, axes_def, steps_def};

View file

@ -276,6 +276,7 @@ static void RunQDQSplitOpTestOnHTP(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
const bool split_is_input = opset >= 13;
auto f32_model_builder = BuildSplitTestCase<float>(input_def, split, split_is_input, axis, num_outputs);

View file

@ -161,6 +161,7 @@ static void RunQDQSqueezeTestOnHTP(const std::string& op_type,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float, int64_t>(op_type, {input_def}, {axes_def}, {});
auto qdq_model_builder = BuildQDQSqueezeTestCase<QType>(op_type, input_def, axes_def, use_contrib_qdq);
@ -219,6 +220,7 @@ TEST_F(QnnHTPBackendTests, Squeeze_Rank5_Rank2_f32) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(model_fn,
provider_options,
@ -275,6 +277,7 @@ TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) {
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
RunQnnModelTest(model_fn,
provider_options,

View file

@ -98,6 +98,7 @@ static void RunQDQTileTestOnHTP(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildOpTestCase<float, int64_t>("Tile", {input_def}, {repeats_def}, {});
auto qdq_model_builder = BuildQDQTileTestCase<QType>(input_def, repeats_def, use_contrib_qdq);

View file

@ -154,6 +154,7 @@ static void RunQDQTopKTestOnHTP(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
auto f32_model_builder = BuildTopKTestCase<float>(input_def, k_def, attrs);
auto qdq_model_builder = BuildQDQTopKTestCase<QType>(input_def, k_def, attrs, use_contrib_qdq);

View file

@ -70,6 +70,7 @@ static void RunTransposeQDQTest(const TestInputDef<float>& input_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Runs model with DQ-> Transpose -> Q and compares the outputs of the CPU and QNN EPs.
TestQDQModelAccuracy(BuildTransposeTestCase<float>(input_def, attrs),

View file

@ -79,6 +79,7 @@ static void RunWhereQDQTest(const TestInputDef<bool>& condition_def,
#else
provider_options["backend_path"] = "libQnnHtp.so";
#endif
provider_options["offload_graph_io_quantization"] = "0";
// Runs model with DQ-> Where -> Q and compares the outputs of the CPU and QNN EPs.
TestQDQModelAccuracy(BuildWhereTestCase(condition_def, x_def, y_def),