diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index bde27df94e..ab344d367a 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -3660,8 +3660,8 @@ struct OrtApi { * - "1": Enabled. * "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another * execution provider (typically CPU EP). - * - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O. - * - "1": Enabled. + * - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O. + * - "1": Enabled. This is the default value. * * SNPE supported keys: * "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16", diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index ed193904fe..a7a91d107b 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -377,13 +377,15 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio LOGS_DEFAULT(VERBOSE) << "User specified enable_htp_weight_sharing: " << enable_htp_weight_sharing_; } - model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", false, + model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", true, provider_options_map); if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) { - LOGS_DEFAULT(WARNING) << "Fallback to CPU EP is disabled, but user configured QNN EP to offload graph I/O " - << "quantization/dequantization to another EP. Session creation will fail if the CPU EP " - << "handles the graph I/O quantization/dequantization."; + LOGS_DEFAULT(INFO) << "Fallback to CPU EP is disabled, but user tried to configure QNN EP to offload graph I/O " + << "quantization/dequantization to another EP. These are conflicting options. Fallback to CPU " + << "EP will remain disabled and graph I/O quantization/dequantization will not be offloaded " + << "to another EP."; + model_settings_.offload_graph_io_quantization = false; } qnn_backend_manager_ = std::make_unique( diff --git a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc index c514cf16b2..41de81fd47 100644 --- a/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc +++ b/onnxruntime/test/providers/qnn/argmaxmin_op_test.cc @@ -70,6 +70,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef i #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, attrs), // baseline float32 model BuildQDQArgMxxTestCase(op_type, input_def, attrs), // QDQ model diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc index 1a0f9bfcba..45d9a4fd2d 100644 --- a/onnxruntime/test/providers/qnn/average_pool_test.cc +++ b/onnxruntime/test/providers/qnn/average_pool_test.cc @@ -31,6 +31,7 @@ static void RunAveragePoolOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase(op_type, input_defs, {}, attrs), provider_options, @@ -53,6 +54,7 @@ static void RunQDQAveragePoolOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, {}, attrs), BuildQDQOpTestCase(op_type, input_defs, {}, attrs), diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc index 0a39413a4e..7471b44faf 100644 --- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc @@ -160,6 +160,7 @@ static void RunBatchNormQDQTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def), @@ -180,6 +181,7 @@ static void RunBatchNormFP16Test(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestInputDef input_fp16_def = ConvertToFP16InputDef(input_def); TestInputDef scale_fp16_def = ConvertToFP16InputDef(scale_def); diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc index 9b83dd281a..713baab0b7 100644 --- a/onnxruntime/test/providers/qnn/cast_test.cc +++ b/onnxruntime/test/providers/qnn/cast_test.cc @@ -57,6 +57,7 @@ static void RunCastOpTest(const std::vector& shape, ONNX_NAMESPACE::Ten #else provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; if (use_htp && enable_fp16_precision) { provider_options["enable_htp_fp16_precision"] = "1"; diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc index cfa77a4621..ae169f7b25 100644 --- a/onnxruntime/test/providers/qnn/clip_op_test.cc +++ b/onnxruntime/test/providers/qnn/clip_op_test.cc @@ -117,6 +117,7 @@ static void RunQDQClipTestOnHTP(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase("Clip", {input_def}, {min_max_defs}, {}); auto qdq_model_builder = BuildQDQOpTestCase("Clip", {input_def}, {min_max_defs}, {}, @@ -205,6 +206,7 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(model_fn, provider_options, diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index cf37fc0033..91677781e8 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -93,6 +93,8 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; + auto build_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad); RunQnnModelTest(build_fn, @@ -317,6 +319,7 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad, output_activation), @@ -354,6 +357,7 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations, group, auto_pad, output_activation); @@ -665,6 +669,7 @@ TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto BuildConvMulGraph = [](ModelTestBuilder& builder) { // DQ node for Conv input diff --git a/onnxruntime/test/providers/qnn/flatten_op_test.cc b/onnxruntime/test/providers/qnn/flatten_op_test.cc index 637d3257dd..6370d4a4f1 100644 --- a/onnxruntime/test/providers/qnn/flatten_op_test.cc +++ b/onnxruntime/test/providers/qnn/flatten_op_test.cc @@ -101,6 +101,7 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase("Flatten", {input_def}, {}, attrs); auto qdq_model_builder = BuildQDQOpTestCase("Flatten", {input_def}, {}, attrs, kOnnxDomain, use_contrib_qdq); @@ -172,6 +173,7 @@ TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(model_fn, provider_options, diff --git a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc index 81c0887306..68c9867617 100644 --- a/onnxruntime/test/providers/qnn/gather_elems_op_test.cc +++ b/onnxruntime/test/providers/qnn/gather_elems_op_test.cc @@ -67,6 +67,7 @@ static void RunCPUGatherElemsOpTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase("GatherElements", {input_def}, {indices_def}, attrs), provider_options, @@ -91,6 +92,7 @@ static void RunHTPQDQGatherElemsOpTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase("GatherElements", {input_def}, {indices_def}, attrs); auto qdq_model_builder = BuildQDQGatherElemsTestCase(input_def, indices_def, attrs, @@ -119,6 +121,7 @@ static void RunHTPGatherElemsOpTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase("GatherElements", {input_def}, {indices_def}, attrs), provider_options, diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc index 55177cc7ed..4478d36ebf 100644 --- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc @@ -63,6 +63,7 @@ static void RunQDQGatherOpTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase("Gather", {input_def}, {indices_def}, attrs); auto qdq_model_builder = BuildQDQGatherTestCase(input_def, indices_def, attrs, @@ -152,4 +153,4 @@ TEST_F(QnnHTPBackendTests, DISABLED_GatherOp_IndicesStaticInt32_Axis1) { } // namespace test } // namespace onnxruntime -#endif \ No newline at end of file +#endif diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc index 33c868694c..b2aa6280ef 100644 --- a/onnxruntime/test/providers/qnn/gemm_op_test.cc +++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc @@ -29,6 +29,7 @@ static void RunGemmTestOnCPU(const std::vector>& input_de #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase("Gemm", input_defs, {}, attrs), provider_options, @@ -246,6 +247,8 @@ static void RunQDQGemmTestOnHTP(const std::vector>& input_de #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; + auto f32_model_builder = BuildOpTestCase("Gemm", input_defs, {}, attrs); auto qdq_model_builder = BuildQDQGemmTestCase(input_defs, attrs, use_contrib_qdq); TestQDQModelAccuracy(f32_model_builder, diff --git a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc index 3598ba1ac8..d4f66b72e0 100644 --- a/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/instance_norm_htp_test.cc @@ -79,6 +79,7 @@ static void RunInstanceNormQDQTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. TestQDQModelAccuracy(BuildOpTestCase("InstanceNormalization", {input_def, scale_def, bias_def}, {}, attrs), diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 947ac19be4..b2997c6278 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -28,6 +28,7 @@ static void RunLayerNormCpuTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase("LayerNormalization", {input_def, scale_def}, {}, attrs), provider_options, @@ -152,6 +153,7 @@ static void RunLayerNormQDQTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase("LayerNormalization", {input_def, scale_def}, {}, attrs), BuildQDQLayerNormTestCase(input_def, scale_def, bias_def, attrs, diff --git a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc index ece8d91d53..77d96b56d2 100644 --- a/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc @@ -28,6 +28,7 @@ static void RunLeakyReluOpQDQTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase("LeakyRelu", {input_def}, {}, attrs), BuildQDQOpTestCase("LeakyRelu", {input_def}, {}, attrs), @@ -66,6 +67,7 @@ TEST_F(QnnHTPBackendTests, LeakyReluFP16OpSet16) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto input_def = TestInputDef({1, 2, 3}, false, {-40.0f, -20.0f, 1.0f, 10.0f, 30.0f, 40.0f}); TestInputDef input_fp16_def = ConvertToFP16InputDef(input_def); diff --git a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc index 5910513678..a49fa7c5fc 100644 --- a/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc +++ b/onnxruntime/test/providers/qnn/logical_comp_ops_test.cc @@ -73,6 +73,7 @@ static void RunCPULogicalOpTest(const std::string& op_type, const std::vector(op_type, shape), provider_options, @@ -157,6 +159,7 @@ TEST_F(QnnHTPBackendTests, EqualToCast4D) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Model building function that creates a QDQ graph with an Equal node followed by // a Cast to float32. diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc index a99cba66bf..8327849a17 100644 --- a/onnxruntime/test/providers/qnn/lrn_op_test.cc +++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc @@ -70,6 +70,7 @@ static void RunCPULRNOpTest(const TestInputDef& input_def, int64_t size, provider_options["backend_path"] = "libQnnCpu.so"; fp32_abs_err = 1.5e-5f; // On linux we need slightly larger tolerance. #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildLRNTestCase(input_def, size, alpha, beta, bias), provider_options, @@ -91,6 +92,7 @@ static void RunQDQLRNOpTest(const TestInputDef& input_def, int64_t size, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildLRNTestCase(input_def, size, alpha, beta, bias), BuildQDQLRNTestCase(input_def, size, alpha, beta, bias), diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index 5c6967761b..24a74b9c4c 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -125,6 +125,7 @@ static void RunQDQPerChannelMatMulOpOpTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; if (enable_fp16_precision) { provider_options["enable_htp_fp16_precision"] = "1"; @@ -178,6 +179,7 @@ static void RunQDQMatMulOpOpTest(const TestInputDef& input1_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildMatMulOpTestCase(input1_def, input2_def), BuildMatMulOpQDQTestCase(input1_def, input2_def, diff --git a/onnxruntime/test/providers/qnn/max_min_op_test.cc b/onnxruntime/test/providers/qnn/max_min_op_test.cc index 3deff121f3..4db1fdcec4 100644 --- a/onnxruntime/test/providers/qnn/max_min_op_test.cc +++ b/onnxruntime/test/providers/qnn/max_min_op_test.cc @@ -26,6 +26,7 @@ static void RunCPUMinOrMaxOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase(op_type, input_defs, {}, {}, kOnnxDomain), provider_options, @@ -47,6 +48,7 @@ static void RunQDQMinOrMaxOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, {}, {}, kOnnxDomain), // baseline float32 model BuildQDQOpTestCase(op_type, input_defs, {}, {}, kOnnxDomain), // QDQ model diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp index a6b8664c6c..17629b9218 100644 --- a/onnxruntime/test/providers/qnn/pad_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -116,6 +116,7 @@ static void RunPadOpTest(const TestInputDef& data_def, provider_options["backend_path"] = "libQnnCpu.so"; #endif } + provider_options["offload_graph_io_quantization"] = "0"; if (enable_fp16_precision) { provider_options["enable_htp_fp16_precision"] = "1"; @@ -144,6 +145,7 @@ static void RunQDQPadOpTest(const TestInputDef& data_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs), BuildPadQDQTestCase(data_def, pads_def, constant_value_def, attrs, diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp index 5dd3a6aaa3..8905a64ab4 100644 --- a/onnxruntime/test/providers/qnn/pool_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp @@ -60,6 +60,7 @@ static void RunPoolOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase(op_type, {input_def}, {}, attrs), provider_options, @@ -83,6 +84,7 @@ static void RunQDQPoolOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase(op_type, {input_def}, {}, attrs), BuildPoolQDQTestCase(op_type, input_def, attrs, use_contrib_qdq_ops), diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index e8282dbad9..8335bf0b15 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -117,6 +117,7 @@ TEST(QnnEP, TestDisableCPUFallback_ModelNotFullySupported) { #else options["backend_path"] = "libQnnCpu.so"; #endif + options["offload_graph_io_quantization"] = "0"; so.AppendExecutionProvider("QNN", options); @@ -148,6 +149,7 @@ TEST(QnnEP, TestDisableCPUFallback_TryingToRunOnQnnCPU) { #else options["backend_path"] = "libQnnCpu.so"; #endif + options["offload_graph_io_quantization"] = "0"; auto input_defs = {TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f), TestInputDef({1, 2, 2, 2}, false, -10.0f, 10.0f)}; @@ -196,6 +198,7 @@ TEST(QnnEP, TestDisableCPUFallback_ConflictingConfig) { #else options["backend_path"] = "libQnnCpu.so"; #endif + options["offload_graph_io_quantization"] = "0"; so.AppendExecutionProvider("QNN", options); @@ -226,6 +229,7 @@ TEST_F(QnnHTPBackendTests, TestConvWithExternalData) { #else options["backend_path"] = "libQnnHtp.so"; #endif + options["offload_graph_io_quantization"] = "0"; so.AppendExecutionProvider("QNN", options); @@ -301,6 +305,7 @@ static void RunNHWCResizeModel(const ORTCHAR_T* ort_model_path, bool use_htp, bo so.SetGraphOptimizationLevel(ORT_ENABLE_ALL); onnxruntime::ProviderOptions options; + options["offload_graph_io_quantization"] = "0"; #if defined(_WIN32) options["backend_path"] = use_htp ? "QnnHtp.dll" : "QnnCpu.dll"; @@ -591,6 +596,7 @@ TEST_F(QnnHTPBackendTests, MultithreadSessionRun) { #else options["backend_path"] = "libQnnHtp.so"; #endif + options["offload_graph_io_quantization"] = "0"; auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); @@ -640,6 +646,7 @@ TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgSessionRunOption) { #else options["backend_path"] = "libQnnHtp.so"; #endif + options["offload_graph_io_quantization"] = "0"; auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK()); @@ -705,6 +712,7 @@ TEST_F(QnnHTPBackendTests, MultithreadDefaultHtpPowerCfgFromEpOption) { #else options["backend_path"] = "libQnnHtp.so"; #endif + options["offload_graph_io_quantization"] = "0"; options["htp_performance_mode"] = "burst"; auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); @@ -756,6 +764,7 @@ TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgDefaultAndRunOption) { #else options["backend_path"] = "libQnnHtp.so"; #endif + options["offload_graph_io_quantization"] = "0"; options["htp_performance_mode"] = "burst"; auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts); @@ -920,6 +929,7 @@ TEST_F(QnnHTPBackendTests, ProfilingTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; provider_options["enable_htp_fp16_precision"] = "1"; provider_options["profiling_level"] = "detailed"; provider_options["profiling_file_path"] = "detailed_profile.csv"; @@ -940,6 +950,7 @@ TEST_F(QnnHTPBackendTests, CastAddHTPAccuracyTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildCastAddTestCase(), provider_options, @@ -1010,6 +1021,7 @@ TEST_F(QnnHTPBackendTests, EPRejectsDynamicShapesF32) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; provider_options["enable_htp_fp16_precision"] = "1"; // QNN EP will use fp16 precision. // CPU EP will use fp32, so we can relax accuracy requirements. diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index a3f0ed55b8..dda03829dc 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -83,6 +83,7 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::unordered_map domain_to_version = {{"", 13}, {kMSDomain, 1}}; @@ -225,6 +226,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryGeneration2InputTypes) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::unordered_map domain_to_version = {{"", 13}, {kMSDomain, 1}}; @@ -275,6 +277,7 @@ TEST_F(QnnHTPBackendTests, QnnContextGeneration2InputsOrderIssue) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Add kMSDomain to cover contrib op like Gelu const std::unordered_map domain_to_version = {{"", 13}, {kMSDomain, 1}}; @@ -311,6 +314,7 @@ TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; std::string node_name_prefix = "node_name_prefix_test"; // Add kMSDomain to cover contrib op like Gelu @@ -353,6 +357,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheEmbedModeTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./qnn_context_binary_test.onnx"; std::remove(context_binary_file.c_str()); @@ -401,6 +406,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCacheNonEmbedModeTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./testdata/qnn_context_cache_non_embed.onnx"; std::string qnn_ctx_bin = "./testdata/qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; @@ -482,6 +488,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_InvalidGraph) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx"; std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; std::remove(context_binary_file.c_str()); @@ -579,6 +586,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryRelativePathTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); @@ -609,6 +617,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryAbsolutePathTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); @@ -634,6 +643,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileNotExistTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); @@ -659,6 +669,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryFileEmptyStringTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(QnnExecutionProviderWithOptions(provider_options))); ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast(model_data.size()))); @@ -676,6 +687,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinary2InputsTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./qnn_context_binary_2inputs_test.onnx"; std::remove(context_binary_file.c_str()); @@ -727,6 +739,7 @@ TEST_F(QnnHTPBackendTests, QnnContextBinaryCache_SingleNodeNameNotMatchGraphName #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::string context_binary_file = "./qnn_context_cache_non_embed.onnx"; std::filesystem::path context_bin = "qnn_context_cache_non_embed.onnx_QNNExecutionProvider_QNN_8283143575221199085_1_0.bin"; std::remove(context_binary_file.c_str()); @@ -804,6 +817,7 @@ TEST_F(QnnHTPBackendTests, QnnMultiContextEmbeded) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; Ort::SessionOptions so; so.AppendExecutionProvider("QNN", provider_options); @@ -819,6 +833,7 @@ TEST_F(QnnHTPBackendTests, QnnMultiContextExternal) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; Ort::SessionOptions so; so.AppendExecutionProvider("QNN", provider_options); @@ -950,6 +965,7 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions1) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Create QDQ models std::vector onnx_model_paths{"./weight_share1.onnx", "./weight_share2.onnx"}; @@ -1047,6 +1063,7 @@ TEST_F(QnnHTPBackendTests, QnnContextShareAcrossSessions2) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Create QDQ models std::vector onnx_model_paths{"./weight_share21.onnx", "./weight_share22.onnx"}; diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.cc b/onnxruntime/test/providers/qnn/qnn_test_utils.cc index 79e7d39e85..3f6efc453c 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.cc +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.cc @@ -278,7 +278,7 @@ static BackendSupport GetHTPSupport(const onnxruntime::logging::Logger& logger) MockKernelLookup kernel_lookup; onnxruntime::GraphViewer graph_viewer(graph); std::unique_ptr qnn_ep = QnnExecutionProviderWithOptions( - {{"backend_path", "QnnHtp.dll"}}); + {{"backend_path", "QnnHtp.dll"}, {"offload_graph_io_quantization", "0"}}); qnn_ep->SetLogger(&logger); auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup); @@ -341,7 +341,7 @@ static BackendSupport GetCPUSupport(const onnxruntime::logging::Logger& logger) MockKernelLookup kernel_lookup; onnxruntime::GraphViewer graph_viewer(graph); std::unique_ptr qnn_ep = QnnExecutionProviderWithOptions( - {{"backend_path", "QnnCpu.dll"}}); + {{"backend_path", "QnnCpu.dll"}, {"offload_graph_io_quantization", "0"}}); qnn_ep->SetLogger(&logger); auto result = qnn_ep->GetCapability(graph_viewer, kernel_lookup); diff --git a/onnxruntime/test/providers/qnn/reduce_op_test.cc b/onnxruntime/test/providers/qnn/reduce_op_test.cc index 13173d9a87..69df89ebcf 100644 --- a/onnxruntime/test/providers/qnn/reduce_op_test.cc +++ b/onnxruntime/test/providers/qnn/reduce_op_test.cc @@ -82,6 +82,7 @@ static void RunReduceTest(const std::string& op_type, float fp32_abs_err = 1e-5f, bool enable_fp16 = false) { ProviderOptions provider_options; + provider_options["offload_graph_io_quantization"] = "0"; if (enable_fp16) { #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -401,6 +402,7 @@ static void RunReduceOpQDQTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; constexpr bool noop_with_empty_axes = false; const bool axes_as_input = ReduceOpHasAxesInput(op_type, opset); // Later opsets have "axes" as an input. diff --git a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc index 3964edc114..21abc66e67 100644 --- a/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc +++ b/onnxruntime/test/providers/qnn/reshape_expand_op_test.cc @@ -30,6 +30,7 @@ static void RunReshapeExpandTestOnCPU(const std::string& op_type, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase(op_type, {input_def}, {shape_def}, attrs), provider_options, @@ -161,6 +162,7 @@ static void RunReshapeExpandTestOnHTP(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase(op_type, {input_def}, {shape_def}, attrs), provider_options, @@ -185,6 +187,7 @@ static void RunQDQReshapeExpandTestOnHTP(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase(op_type, {input_def}, {shape_def}, attrs); auto qdq_model_builder = BuildQDQReshapeExpandTestCase(op_type, input_def, shape_def, attrs, use_contrib_qdq); diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index 15612e3267..15fcd039a0 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -127,6 +127,7 @@ static void RunCPUResizeOpTest(const TestInputDef& input_def, const std:: #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(GetResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode), provider_options, @@ -145,6 +146,7 @@ static void RunCPUResizeOpTestWithScales(const TestInputDef& input_def, c #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(GetResizeModelBuilderWithScales(input_def, scales_data, mode, coordinate_transformation_mode, nearest_mode), provider_options, @@ -166,6 +168,7 @@ static void RunQDQResizeOpTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(GetResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, nearest_mode), GetQDQResizeModelBuilder(input_def, sizes_data, mode, coordinate_transformation_mode, diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 7541d94bac..c0e567b07d 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -32,6 +32,7 @@ static void RunOpTestOnCPU(const std::string& op_type, #else provider_options["backend_path"] = "libQnnCpu.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(BuildOpTestCase(op_type, input_defs, {}, attrs, op_domain), provider_options, @@ -129,6 +130,7 @@ static void RunQDQOpTest(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase(op_type, input_defs, {}, attrs, op_domain), BuildQDQOpTestCase(op_type, input_defs, {}, attrs, op_domain, use_contrib_qdq), @@ -780,6 +782,7 @@ TEST_F(QnnHTPBackendTests, QuantAccuracyTest) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Note: a graph input -> Q -> DQ -> is optimized by Qnn to have a perfectly accurate output. // ORT's CPU EP, on the otherhand, actually quantizes and dequantizes the input, which leads to different outputs. @@ -1206,6 +1209,7 @@ TEST_F(QnnHTPBackendTests, Add_U8_U16_Convert) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; TestQDQModelAccuracy(BuildOpTestCase("Add", {input0_def, input1_def}, {}, {}, kOnnxDomain), BuildQDQConvertAddTestCase(input0_def, input1_def), @@ -1271,6 +1275,7 @@ TEST_F(QnnHTPBackendTests, DQ_Q_ConvertFusion_SameType) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; QuantParams out_qparams_u8 = {1.0f, 128}; QuantParams out_qparams_u16 = {1.0f, 32768}; diff --git a/onnxruntime/test/providers/qnn/slice_htp_test.cc b/onnxruntime/test/providers/qnn/slice_htp_test.cc index 07c97d2d7b..dc16192188 100644 --- a/onnxruntime/test/providers/qnn/slice_htp_test.cc +++ b/onnxruntime/test/providers/qnn/slice_htp_test.cc @@ -84,6 +84,7 @@ static void RunSliceQDQTest(const TestInputDef& data_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const std::vector> f32_inputs = {data_def}; const std::vector> int64_inputs = {starts_def, ends_def, axes_def, steps_def}; diff --git a/onnxruntime/test/providers/qnn/split_op_test.cc b/onnxruntime/test/providers/qnn/split_op_test.cc index 6dc721edb4..2b8a913ba4 100644 --- a/onnxruntime/test/providers/qnn/split_op_test.cc +++ b/onnxruntime/test/providers/qnn/split_op_test.cc @@ -276,6 +276,7 @@ static void RunQDQSplitOpTestOnHTP(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; const bool split_is_input = opset >= 13; auto f32_model_builder = BuildSplitTestCase(input_def, split, split_is_input, axis, num_outputs); diff --git a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc index 33d2f64c03..249a5da733 100644 --- a/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc +++ b/onnxruntime/test/providers/qnn/squeeze_unsqueeze_op_test.cc @@ -161,6 +161,7 @@ static void RunQDQSqueezeTestOnHTP(const std::string& op_type, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase(op_type, {input_def}, {axes_def}, {}); auto qdq_model_builder = BuildQDQSqueezeTestCase(op_type, input_def, axes_def, use_contrib_qdq); @@ -219,6 +220,7 @@ TEST_F(QnnHTPBackendTests, Squeeze_Rank5_Rank2_f32) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(model_fn, provider_options, @@ -275,6 +277,7 @@ TEST_F(QnnHTPBackendTests, Unsqueeze_Rank3_Rank5_f32) { #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; RunQnnModelTest(model_fn, provider_options, diff --git a/onnxruntime/test/providers/qnn/tile_op_test.cc b/onnxruntime/test/providers/qnn/tile_op_test.cc index 2b35c730ee..20fb280117 100644 --- a/onnxruntime/test/providers/qnn/tile_op_test.cc +++ b/onnxruntime/test/providers/qnn/tile_op_test.cc @@ -98,6 +98,7 @@ static void RunQDQTileTestOnHTP(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildOpTestCase("Tile", {input_def}, {repeats_def}, {}); auto qdq_model_builder = BuildQDQTileTestCase(input_def, repeats_def, use_contrib_qdq); diff --git a/onnxruntime/test/providers/qnn/topk_op_test.cc b/onnxruntime/test/providers/qnn/topk_op_test.cc index 5a9351b936..aee86428b1 100644 --- a/onnxruntime/test/providers/qnn/topk_op_test.cc +++ b/onnxruntime/test/providers/qnn/topk_op_test.cc @@ -154,6 +154,7 @@ static void RunQDQTopKTestOnHTP(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; auto f32_model_builder = BuildTopKTestCase(input_def, k_def, attrs); auto qdq_model_builder = BuildQDQTopKTestCase(input_def, k_def, attrs, use_contrib_qdq); diff --git a/onnxruntime/test/providers/qnn/transpose_htp_test.cc b/onnxruntime/test/providers/qnn/transpose_htp_test.cc index 63746e22d2..b7bec34f7d 100644 --- a/onnxruntime/test/providers/qnn/transpose_htp_test.cc +++ b/onnxruntime/test/providers/qnn/transpose_htp_test.cc @@ -70,6 +70,7 @@ static void RunTransposeQDQTest(const TestInputDef& input_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Runs model with DQ-> Transpose -> Q and compares the outputs of the CPU and QNN EPs. TestQDQModelAccuracy(BuildTransposeTestCase(input_def, attrs), diff --git a/onnxruntime/test/providers/qnn/where_htp_test.cc b/onnxruntime/test/providers/qnn/where_htp_test.cc index ec525ef4eb..e1b0604b31 100644 --- a/onnxruntime/test/providers/qnn/where_htp_test.cc +++ b/onnxruntime/test/providers/qnn/where_htp_test.cc @@ -79,6 +79,7 @@ static void RunWhereQDQTest(const TestInputDef& condition_def, #else provider_options["backend_path"] = "libQnnHtp.so"; #endif + provider_options["offload_graph_io_quantization"] = "0"; // Runs model with DQ-> Where -> Q and compares the outputs of the CPU and QNN EPs. TestQDQModelAccuracy(BuildWhereTestCase(condition_def, x_def, y_def),