Fix constant folding of node assigned to CUDA (#2510)

* Constant folding bug fix/improvements
  - Handle constant folding for a node that is assigned to a non-CPU EP
  - Check for errors in optimizer execution frame setup
  - Improve CUDA partitioning to look for initializers in parent graphs
  - Add unit test

Fixes #2474
This commit is contained in:
Scott McKay 2019-12-03 16:28:44 +10:00 committed by GitHub
parent 4354023913
commit e8b327d657
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 79 additions and 36 deletions

View file

@ -25,6 +25,17 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
InitializedTensorSet constant_inputs;
// we currently constant fold using the CPU EP only.
// if the node is assigned to a different EP we can run it if it's an ONNX op as we have CPU based implementations
// for all ONNX ops. if it's from a different domain we can't.
// NOTE: This is in addition to the IsSupportedProvider check below which will optionally do further filtering
// on the EPs we constant fold for.
auto ep_type = node->GetExecutionProviderType();
bool cpu_ep = ep_type == kCpuExecutionProvider;
if (!cpu_ep && node->Domain() != kOnnxDomain) {
continue;
}
// Check if constant folding can be applied on this node.
if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) ||
excluded_op_types_.find(node->OpType()) != excluded_op_types_.end() ||
@ -36,9 +47,19 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
continue;
}
// override the EP while setting up OptimizerExecutionFrame::Info so that it will use the CPU kernel for Compute.
if (!cpu_ep) {
node->SetExecutionProviderType(kCpuExecutionProvider);
}
// Create execution frame for executing constant nodes.
OptimizerExecutionFrame::Info info({node}, constant_inputs);
// undo the EP change in case something fails prior to node removal
if (!cpu_ep) {
node->SetExecutionProviderType(ep_type);
}
std::vector<int> fetch_mlvalue_idxs;
for (const auto* node_out : node->OutputDefs()) {
fetch_mlvalue_idxs.push_back(info.GetMLValueIndex(node_out->Name()));
@ -62,8 +83,8 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
OrtValue& ort_value = fetches[fetch_idx];
if (!ort_value.IsTensor()) {
LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
<< ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
<< ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
unsupported_output_type = true;
break;
}

View file

@ -16,8 +16,8 @@ it statically computes parts of the graph that rely only on constant initializer
*/
class ConstantFolding : public GraphTransformer {
public:
ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept :
GraphTransformer("ConstantFolding", compatible_execution_providers) {}
ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
: GraphTransformer("ConstantFolding", compatible_execution_providers) {}
private:
/** Constant folding will not be applied to nodes whose op_type is included in this set.
@ -26,11 +26,6 @@ class ConstantFolding : public GraphTransformer {
{"RandomUniform", "RandomNormal", "RandomUniformLike", "RandomNormalLike", "Multinomial"};
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
/** Create a TensorProto that has the same value as the given OrtValue
and the same type and dimensions as the given NodeArg. */
void BuildTensorProtoForInitializer(const OrtValue& ort_value, const NodeArg& constant_node_arg,
ONNX_NAMESPACE::TensorProto& tensorproto) const;
};
} // namespace onnxruntime

View file

@ -57,8 +57,8 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
// TODO: node->ImplicitInputDefs() need to be added here for control flow nodes.
for (auto* node : nodes) {
onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps);
onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps);
ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps));
ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps));
}
node_index_info_ = onnxruntime::make_unique<NodeIndexInfo>(nodes, ort_value_name_idx_map_);
@ -67,8 +67,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
for (auto* node : nodes) {
std::unique_ptr<OpKernel> op_kernel;
std::shared_ptr<KernelRegistry> kernel_registry = cpu_execution_provider_->GetKernelRegistry();
auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel);
ORT_THROW_IF_ERROR(kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_,
op_kernel));
kernels_[node->Index()] = std::move(op_kernel);
}
}
@ -118,8 +119,8 @@ Status OptimizerExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value,
auto element_type = static_cast<const TensorTypeBase*>(ml_type)->GetElementType();
AllocatorPtr allocator_ptr = info_.GetAllocator();
std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type,
*shape,
allocator_ptr);
*shape,
allocator_ptr);
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());

View file

@ -3,11 +3,12 @@
#include "cuda_common.h"
#include "cuda_execution_provider.h"
#include "core/framework/memcpy.h"
#include "cuda_fence.h"
#include "cuda_allocator.h"
#include "core/framework/kernel_registry.h"
#include "core/framework/compute_capability.h"
#include "core/framework/memcpy.h"
#include "core/graph/graph_utils.h"
#include "core/providers/cuda/gpu_data_transfer.h"
#ifndef DISABLE_CONTRIB_OPS
@ -1303,28 +1304,27 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
// Note that nodes whose inputs all come from initializers would not be placed on CUDA.
// Ideally, those nodes should be eliminated by constant folding.
bool should_force_outside = true;
bool all_input_are_initializer = true;
node.ForEachWithIndex(
node.InputDefs(),
[&](const NodeArg& def, size_t index) {
const ONNX_NAMESPACE::TensorProto* initializer = nullptr;
// Keep the node on CUDA if the input is neither an initializer nor in defs_outside_cuda,
// or if the input is in defs_outside_cuda and the CUDA kernel declares that input
// as being in CPU memory.
bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer);
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
if ((!initializer_input && !input_is_on_cpu) ||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index)))
should_force_outside = false;
bool all_inputs_are_initializers = true;
node.ForEachWithIndex(node.InputDefs(),
[&](const NodeArg& def, size_t index) {
// Keep the node on CUDA if the input is neither a constant initializer nor in defs_outside_cuda,
// or if the input is in defs_outside_cuda and the CUDA kernel declares that input
// as being in CPU memory.
bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
if ((!initializer_input && !input_is_on_cpu) ||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
should_force_outside = false;
}
if (!initializer_input) {
all_input_are_initializer = false;
}
return Status::OK();
});
if (!initializer_input) {
all_inputs_are_initializers = false;
}
return Status::OK();
});
// If all the inputs are initializers, we shouldn't force it to CPU
if (should_force_outside && !all_input_are_initializer) {
if (should_force_outside && !all_inputs_are_initializers) {
force_outside = true;
}
}

View file

@ -131,6 +131,33 @@ TEST(GraphTransformationTests, ConstantFolding) {
ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
}
// Checks that ConstantFolding still folds nodes whose execution provider was set to CUDA:
// the two Unsqueeze nodes in the model must be gone after the Level1 transformers run,
// and every node left in the graph must still report the CUDA execution provider.
TEST(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
  auto model_path = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";

  std::shared_ptr<Model> loaded_model;
  auto load_status = Model::Load(model_path, loaded_model, nullptr, DefaultLoggingManager().DefaultLogger());
  ASSERT_TRUE(load_status.IsOK());

  Graph& main_graph = loaded_model->MainGraph();

  // Precondition: the model starts out with exactly two Unsqueeze nodes.
  std::map<std::string, int> op_counts = CountOpsInGraph(main_graph);
  ASSERT_TRUE(op_counts["Unsqueeze"] == 2);

  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<ConstantFolding>(), TransformerLevel::Level1);

  // Mark every node as a CUDA node before running the transformer. Constant folding is
  // expected to override this and evaluate the foldable nodes on CPU.
  for (auto& graph_node : main_graph.Nodes()) {
    graph_node.SetExecutionProviderType(kCudaExecutionProvider);
  }

  auto apply_status = transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level1,
                                                        DefaultLoggingManager().DefaultLogger());
  ASSERT_TRUE(apply_status.IsOK());

  // Both Unsqueeze nodes should have been folded away.
  op_counts = CountOpsInGraph(main_graph);
  ASSERT_TRUE(op_counts["Unsqueeze"] == 0);

  // The nodes that survived must not have had their EP assignment changed.
  for (auto& graph_node : main_graph.Nodes()) {
    EXPECT_STREQ(graph_node.GetExecutionProviderType().c_str(), kCudaExecutionProvider);
  }
}
TEST(GraphTransformationTests, ConstantFoldingSubgraph) {
TensorProto value_tensor;
value_tensor.add_dims(1);
@ -1010,7 +1037,6 @@ static void ValidateAttention(Graph& graph) {
for (size_t i = 0; i < expected_value2.size(); i++) {
EXPECT_EQ(data2[i], static_cast<float>(expected_value2[i]));
}
}
}
}