mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-27 22:45:57 +00:00
Fix constant folding of node assigned to CUDA (#2510)
* Constant folding bug fix/improvements
  - Handle constant folding for a node that is assigned to a non-CPU EP
  - Check for errors in optimizer execution frame setup
  - Improve CUDA partitioning to look for initializers in parent graphs
  - Add unit test

Fixes #2474
This commit is contained in:
parent
4354023913
commit
e8b327d657
5 changed files with 79 additions and 36 deletions
|
|
@ -25,6 +25,17 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
|
|||
|
||||
InitializedTensorSet constant_inputs;
|
||||
|
||||
// we currently constant fold using the CPU EP only.
|
||||
// if the node is assigned to a different EP we can run it if it's an ONNX op as we have CPU based implementations
|
||||
// for all ONNX ops. if it's from a different domain we can't.
|
||||
// NOTE: This is in addition to the IsSupportedProvider check below which will optionally do further filtering
|
||||
// on the EPs we constant fold for.
|
||||
auto ep_type = node->GetExecutionProviderType();
|
||||
bool cpu_ep = ep_type == kCpuExecutionProvider;
|
||||
if (!cpu_ep && node->Domain() != kOnnxDomain) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Check if constant folding can be applied on this node.
|
||||
if (!graph_utils::IsSupportedProvider(*node, GetCompatibleExecutionProviders()) ||
|
||||
excluded_op_types_.find(node->OpType()) != excluded_op_types_.end() ||
|
||||
|
|
@ -36,9 +47,19 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
|
|||
continue;
|
||||
}
|
||||
|
||||
// override the EP while setting up OptimizerExecutionFrame::Info so that it will use the CPU kernel for Compute.
|
||||
if (!cpu_ep) {
|
||||
node->SetExecutionProviderType(kCpuExecutionProvider);
|
||||
}
|
||||
|
||||
// Create execution frame for executing constant nodes.
|
||||
OptimizerExecutionFrame::Info info({node}, constant_inputs);
|
||||
|
||||
// undo the EP change in case something fails prior to node removal
|
||||
if (!cpu_ep) {
|
||||
node->SetExecutionProviderType(ep_type);
|
||||
}
|
||||
|
||||
std::vector<int> fetch_mlvalue_idxs;
|
||||
for (const auto* node_out : node->OutputDefs()) {
|
||||
fetch_mlvalue_idxs.push_back(info.GetMLValueIndex(node_out->Name()));
|
||||
|
|
@ -62,8 +83,8 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
|
|||
OrtValue& ort_value = fetches[fetch_idx];
|
||||
|
||||
if (!ort_value.IsTensor()) {
|
||||
LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
|
||||
<< ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
|
||||
LOGS(logger, WARNING) << "Unsupported output type of " << ort_value.Type()
|
||||
<< ". Can't constant fold " << node->OpType() << " node '" << node->Name() << "'";
|
||||
unsupported_output_type = true;
|
||||
break;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ it statically computes parts of the graph that rely only on constant initializer
|
|||
*/
|
||||
class ConstantFolding : public GraphTransformer {
|
||||
public:
|
||||
ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept :
|
||||
GraphTransformer("ConstantFolding", compatible_execution_providers) {}
|
||||
ConstantFolding(const std::unordered_set<std::string>& compatible_execution_providers = {}) noexcept
|
||||
: GraphTransformer("ConstantFolding", compatible_execution_providers) {}
|
||||
|
||||
private:
|
||||
/** Constant folding will not be applied to nodes whose op_type is included in this set.
|
||||
|
|
@ -26,11 +26,6 @@ class ConstantFolding : public GraphTransformer {
|
|||
{"RandomUniform", "RandomNormal", "RandomUniformLike", "RandomNormalLike", "Multinomial"};
|
||||
|
||||
Status ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const override;
|
||||
|
||||
/** Create a TensorProto that has the same value as the given OrtValue
|
||||
and the same type and dimensions as the given NodeArg. */
|
||||
void BuildTensorProtoForInitializer(const OrtValue& ort_value, const NodeArg& constant_node_arg,
|
||||
ONNX_NAMESPACE::TensorProto& tensorproto) const;
|
||||
};
|
||||
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -57,8 +57,8 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
|
|||
|
||||
// TODO: node->ImplicitInputDefs() need to be added here for control flow nodes.
|
||||
for (auto* node : nodes) {
|
||||
onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps);
|
||||
onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps);
|
||||
ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->InputDefs(), initialize_maps));
|
||||
ORT_THROW_IF_ERROR(onnxruntime::Node::ForEachWithIndex(node->OutputDefs(), initialize_maps));
|
||||
}
|
||||
|
||||
node_index_info_ = onnxruntime::make_unique<NodeIndexInfo>(nodes, ort_value_name_idx_map_);
|
||||
|
|
@ -67,8 +67,9 @@ OptimizerExecutionFrame::Info::Info(const std::vector<const Node*>& nodes,
|
|||
for (auto* node : nodes) {
|
||||
std::unique_ptr<OpKernel> op_kernel;
|
||||
std::shared_ptr<KernelRegistry> kernel_registry = cpu_execution_provider_->GetKernelRegistry();
|
||||
auto status = kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
|
||||
ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_, op_kernel);
|
||||
ORT_THROW_IF_ERROR(kernel_registry->TryCreateKernel(*node, *cpu_execution_provider_, initializers_,
|
||||
ort_value_name_idx_map_, FuncManager(), data_transfer_mgr_,
|
||||
op_kernel));
|
||||
kernels_[node->Index()] = std::move(op_kernel);
|
||||
}
|
||||
}
|
||||
|
|
@ -118,8 +119,8 @@ Status OptimizerExecutionFrame::CreateNodeOutputMLValueImpl(OrtValue& ort_value,
|
|||
auto element_type = static_cast<const TensorTypeBase*>(ml_type)->GetElementType();
|
||||
AllocatorPtr allocator_ptr = info_.GetAllocator();
|
||||
std::unique_ptr<Tensor> p_tensor = onnxruntime::make_unique<Tensor>(element_type,
|
||||
*shape,
|
||||
allocator_ptr);
|
||||
*shape,
|
||||
allocator_ptr);
|
||||
|
||||
auto ml_tensor = DataTypeImpl::GetType<Tensor>();
|
||||
ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
|
||||
|
|
|
|||
|
|
@ -3,11 +3,12 @@
|
|||
|
||||
#include "cuda_common.h"
|
||||
#include "cuda_execution_provider.h"
|
||||
#include "core/framework/memcpy.h"
|
||||
#include "cuda_fence.h"
|
||||
#include "cuda_allocator.h"
|
||||
#include "core/framework/kernel_registry.h"
|
||||
#include "core/framework/compute_capability.h"
|
||||
#include "core/framework/memcpy.h"
|
||||
#include "core/graph/graph_utils.h"
|
||||
#include "core/providers/cuda/gpu_data_transfer.h"
|
||||
|
||||
#ifndef DISABLE_CONTRIB_OPS
|
||||
|
|
@ -1303,28 +1304,27 @@ CUDAExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph,
|
|||
// Note that nodes with only inputs from initializers would not be placed on CUDA
|
||||
// Ideally, those nodes should be eliminated in constant folding
|
||||
bool should_force_outside = true;
|
||||
bool all_input_are_initializer = true;
|
||||
node.ForEachWithIndex(
|
||||
node.InputDefs(),
|
||||
[&](const NodeArg& def, size_t index) {
|
||||
const ONNX_NAMESPACE::TensorProto* initializer = nullptr;
|
||||
// The input is not an initializer and the input is from CPU
|
||||
// or the input declared as CPU memory and is from CPU
|
||||
// in that case we should still keep the node on CUDA
|
||||
bool initializer_input = graph.GetInitializedTensor(def.Name(), initializer);
|
||||
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
|
||||
if ((!initializer_input && !input_is_on_cpu) ||
|
||||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index)))
|
||||
should_force_outside = false;
|
||||
bool all_inputs_are_initializers = true;
|
||||
node.ForEachWithIndex(node.InputDefs(),
|
||||
[&](const NodeArg& def, size_t index) {
|
||||
// The input is not an initializer and the input is from CPU
|
||||
// or the input declared as CPU memory and is from CPU
|
||||
// in that case we should still keep the node on CUDA
|
||||
bool initializer_input = graph.IsConstantInitializer(def.Name(), /*check_outer_scope*/ true);
|
||||
bool input_is_on_cpu = defs_outside_cuda.count(&def) > 0;
|
||||
if ((!initializer_input && !input_is_on_cpu) ||
|
||||
(input_is_on_cpu && cuda_kernel_def->kernel_def->IsInputOnCpu(index))) {
|
||||
should_force_outside = false;
|
||||
}
|
||||
|
||||
if (!initializer_input) {
|
||||
all_input_are_initializer = false;
|
||||
}
|
||||
return Status::OK();
|
||||
});
|
||||
if (!initializer_input) {
|
||||
all_inputs_are_initializers = false;
|
||||
}
|
||||
return Status::OK();
|
||||
});
|
||||
|
||||
// If all the inputs are initializers, we shouldn't force it to CPU
|
||||
if (should_force_outside && !all_input_are_initializer) {
|
||||
if (should_force_outside && !all_inputs_are_initializers) {
|
||||
force_outside = true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -131,6 +131,33 @@ TEST(GraphTransformationTests, ConstantFolding) {
|
|||
ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
|
||||
}
|
||||
|
||||
TEST(GraphTransformationTests, ConstantFoldingNodesOnDifferentEP) {
|
||||
auto model_uri = MODEL_FOLDER "fusion/fuse-conv-bn-mul-add-unsqueeze.onnx";
|
||||
std::shared_ptr<Model> model;
|
||||
ASSERT_TRUE(Model::Load(model_uri, model, nullptr, DefaultLoggingManager().DefaultLogger()).IsOK());
|
||||
Graph& graph = model->MainGraph();
|
||||
std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
|
||||
ASSERT_TRUE(op_to_count["Unsqueeze"] == 2);
|
||||
|
||||
onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
|
||||
graph_transformation_mgr.Register(onnxruntime::make_unique<ConstantFolding>(), TransformerLevel::Level1);
|
||||
|
||||
// assign all nodes to CUDA. the constant folding should override this to perform the constant folding on cpu
|
||||
for (auto& node : graph.Nodes()) {
|
||||
node.SetExecutionProviderType(kCudaExecutionProvider);
|
||||
}
|
||||
|
||||
ASSERT_TRUE(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, DefaultLoggingManager().DefaultLogger()).IsOK());
|
||||
|
||||
op_to_count = CountOpsInGraph(graph);
|
||||
ASSERT_TRUE(op_to_count["Unsqueeze"] == 0);
|
||||
|
||||
// all remaining nodes should still be on CUDA
|
||||
for (auto& node : graph.Nodes()) {
|
||||
EXPECT_STREQ(node.GetExecutionProviderType().c_str(), kCudaExecutionProvider);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(GraphTransformationTests, ConstantFoldingSubgraph) {
|
||||
TensorProto value_tensor;
|
||||
value_tensor.add_dims(1);
|
||||
|
|
@ -1010,7 +1037,6 @@ static void ValidateAttention(Graph& graph) {
|
|||
for (size_t i = 0; i < expected_value2.size(); i++) {
|
||||
EXPECT_EQ(data2[i], static_cast<float>(expected_value2[i]));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue