From e8b327d6573845654700c048c330c9c722f3d3f8 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Tue, 3 Dec 2019 16:28:44 +1000 Subject: [PATCH] Fix constant folding of node assigned to CUDA (#2510) * Constant folding bug fix/improvements - Handle constant folding for node that is assigned to a non cpu EP - Check for errors in optimizer execution frame setup - Improve CUDA partitioning to look for initializers in parent graphs - Add unit test Fixes #2474 --- .../core/optimizer/constant_folding.cc | 25 +++++++++++- onnxruntime/core/optimizer/constant_folding.h | 9 +---- .../optimizer/optimizer_execution_frame.cc | 13 +++--- .../providers/cuda/cuda_execution_provider.cc | 40 +++++++++---------- .../test/optimizer/graph_transform_test.cc | 28 ++++++++++++- 5 files changed, 79 insertions(+), 36 deletions(-) diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index 3c4fdb71bb..d00c1759ce 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -25,6 +25,17 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, InitializedTensorSet constant_inputs; + // we currently constant fold using the CPU EP only. + // if the node is assigned to a different EP we can run it if it's an ONNX op as we have CPU based implementations + // for all ONNX ops. if it's from a different domain we can't. + // NOTE: This is in addition to the IsSupportedProvider check below which will optionally do further filtering + // on the EPs we constant fold for. + auto ep_type = node->GetExecutionProviderType(); + bool cpu_ep = ep_type == kCpuExecutionProvider; + if (!cpu_ep && node->Domain() != kOnnxDomain) { + continue; + } + // Check if constant folding can be applied on this node. if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) || excluded_op_types_.find(node->OpType()) != excluded_op_types_.end() || @@ -36,9 +47,19 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, continue; } + // override the EP while setting up OptimizerExecutionFrame::Info so that it will use the CPU kernel for Compute. + if (!cpu_ep) { + node->SetExecutionProviderType(kCpuExecutionProvider); + } + // Create execution frame for executing constant nodes. OptimizerExecutionFrame::Info info({node}, constant_inputs); + // undo the EP change in case something fails prior to node removal + if (!cpu_ep) { + node->SetExecutionProviderType(ep_type); + } + std::vector fetch_mlvalue_idxs; for (const auto* node_out : node->OutputDefs()) { fetch_mlvalue_idxs.push_back(info.GetMLValueIndex(node_out->Name())); @@ -62,8 +83,8 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, OrtValue& ort_value = fetches[fetch_idx]; if (!ort_value.IsTensor()) { - LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type() - << ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'"; + LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type() + << ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'"; unsupported_output_type = true; break; } diff --git a/onnxruntime/core/optimizer/constant_folding.h b/onnxruntime/core/optimizer/constant_folding.h index 15cc065b75..2be65a2670 100644 --- a/onnxruntime/core/optimizer/constant_folding.h +++ b/onnxruntime/core/optimizer/constant_folding.h @@ -16,8 +16,8 @@ it statically computes parts of the graph that rely only on constant initializer */ class ConstantFolding : public GraphTransformer { public: - ConstantFolding(const std::unordered_set& compatible_execution_providers = {}) noexcept : - GraphTransformer("ConstantFolding", compatible_execution_providers) {} + ConstantFolding(const std::unordered_set& compatible_execution_providers = {}) noexcept + : GraphTransformer("ConstantFolding", compatible_execution_providers) {} private: /** Constant folding will not be applied to nodes whose op_type is included in this set. @@ -26,11 +26,6 @@ class ConstantFolding : public GraphTransformer { {"RandomUniform", "RandomNormal", "RandomUniformLike", "RandomNormalLike", "Multinomial"}; Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override; - - /** Create a TensorProto that has the same value as the given OrtValue - and the same type and dimensions as the given NodeArg. */ - void BuildTensorProtoForInitializer(const OrtValue& ort_value, const NodeArg& constant_node_arg, - ONNX_NAMESPACE::TensorProto& tensorproto) const; }; } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/optimizer_execution_frame.cc b/onnxruntime/core/optimizer/optimizer_execution_frame.cc index a9ba230cf3..b876521ca6 100644 --- a/onnxruntime/core/optimizer/optimizer_execution_frame.cc +++ b/onnxruntime/core/optimizer/optimizer_execution_frame.cc @@ -57,8 +57,8 @@ OptimizerExecutionFrame::Info::Info(const std::vector& nodes, // TODO: node->ImplicitInputDefs() need to be added here for control flow nodes. for (auto* node : nodes) { - onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps); - onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps); + ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps)); + ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps)); } node_index_info_ = onnxruntime::make_unique(nodes, ort_value_name_idx_map_); @@ -67,8 +67,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector& nodes, for (auto* node : nodes) { std::unique_ptr op_kernel; std::shared_ptr kernel_registry = cpu_execution_provider_->GetKernelRegistry(); - auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_, - ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel); + ORT_THROW_IF_ERROR(kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_, + ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, + op_kernel)); kernels_[node->Index()] = std::move(op_kernel); } } @@ -118,8 +119,8 @@ Status OptimizerExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value, auto element_type = static_cast(ml_type)->GetElementType(); AllocatorPtr allocator_ptr = info_.GetAllocator(); std::unique_ptr p_tensor = onnxruntime::make_unique(element_type, - *shape, - allocator_ptr); + *shape, + allocator_ptr); auto ml_tensor = DataTypeImpl::GetType(); ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc()); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d9b8205e0c..1f900d3a7c 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -3,11 +3,12 @@ #include "cuda_common.h" #include "cuda_execution_provider.h" -#include "core/framework/memcpy.h" #include "cuda_fence.h" #include "cuda_allocator.h" #include "core/framework/kernel_registry.h" #include "core/framework/compute_capability.h" +#include "core/framework/memcpy.h" +#include "core/graph/graph_utils.h" #include "core/providers/cuda/gpu_data_transfer.h" #ifndef DISABLE_CONTRIB_OPS @@ -1303,28 +1304,27 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph, // Note that nodes with only inputs from initializer would not be place on CUDA // Ideally, those nodes should be eliminated in constant folding bool should_force_outside = true; - bool all_input_are_initializer = true; - node.ForEachWithIndex( - node.InputDefs(), - [&](const NodeArg& def, size_t index) { - const ONNX_NAMESPACE::TensorProto* initializer = nullptr; - // The input is not a initializer and the input is from CPU - // or the input declared as CPU memory and is from CPU - // in that case we should still keep the node on CUDA - bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer); - bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0; - if ((!initializer_input && !input_is_on_cpu) || - (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) - should_force_outside = false; + bool all_inputs_are_initializers = true; + node.ForEachWithIndex(node.InputDefs(), + [&](const NodeArg& def, size_t index) { + // The input is not a initializer and the input is from CPU + // or the input declared as CPU memory and is from CPU + // in that case we should still keep the node on CUDA + bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true); + bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0; + if ((!initializer_input && !input_is_on_cpu) || + (input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) { + should_force_outside = false; + } - if (!initializer_input) { - all_input_are_initializer = false; - } - return Status::OK(); - }); + if (!initializer_input) { + all_inputs_are_initializers = false; + } + return Status::OK(); + }); // If all the inputs are initializers, we shouldn't force it to CPU - if (should_force_outside && !all_input_are_initializer) { + if (should_force_outside && !all_inputs_are_initializers) { force_outside = true; } } diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 00220740a6..ce24d9e964 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -131,6 +131,33 @@ TEST(GraphTransformationTests, ConstantFolding) { ASSERT_TRUE(op_to_count["Unsqueeze"] == 0); } +TEST(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) { + auto model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; + std::shared_ptr model; + ASSERT_TRUE(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK()); + Graph& graph = model->MainGraph(); + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Unsqueeze"] == 2); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level1); + + // assign all nodes to CUDA. the constant folding should override this to perform the constant folding on cpu + for (auto& node : graph.Nodes()) { + node.SetExecutionProviderType(kCudaExecutionProvider); + } + + ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, DefaultLoggingManager().DefaultLogger()).IsOK()); + + op_to_count = CountOpsInGraph(graph); + ASSERT_TRUE(op_to_count["Unsqueeze"] == 0); + + // all remaining nodes should still be on CUDA + for (auto& node : graph.Nodes()) { + EXPECT_STREQ(node.GetExecutionProviderType().c_str(), kCudaExecutionProvider); + } +} + TEST(GraphTransformationTests, ConstantFoldingSubgraph) { TensorProto value_tensor; value_tensor.add_dims(1); @@ -1010,7 +1037,6 @@ static void ValidateAttention(Graph& graph) { for (size_t i = 0; i < expected_value2.size(); i++) { EXPECT_EQ(data2[i], static_cast(expected_value2[i])); } - } } }