mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-14 20:48:00 +00:00
Allow upstream for Slice on single axis (#16410)
### Allow upstream for Slice on single axis #### Benchmark on 8x32GB V100 + DeepSpeed On Bloom560M model, there is 1.5% throughput gains on the same max batch size 6. ``` torchrun --nproc_per_node=8 examples/onnxruntime/training/language-modeling/run_clm.py --model_name_or_path bigscience/bloom-560m --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --num_train_epochs 10 --per_device_train_batch_size 6 --per_device_eval_batch_size 1 --do_train --overwrite_output_dir --output_dir ./outputs/ --seed 1137 --fp16 --report_to none --optim adamw_ort_fused --max_steps 200 --logging_steps 1 --use_module_with_loss --deepspeed aml_ds_config_zero_1.json ``` ##### Main branch ``` Total overhead: 38957ms where export takes 35493ms. ***** train metrics ***** epoch = 4.08 train_loss = 2.6841 train_runtime = 0:03:10.67 train_samples = 2318 train_samples_per_second = 50.348 train_steps_per_second = 1.049 throughput per gpu=4.08 * 2318 / (190.67 - 38.957) / 8(gpu) = 7.792 samples/second ``` ##### This PR ``` Total overhead: 38649ms where export takes 34946ms. ***** train metrics ***** epoch = 4.08 train_loss = 2.6757 train_runtime = 0:03:08.08 train_samples = 2318 train_samples_per_second = 51.04 train_steps_per_second = 1.063 throughput per gpu=4.08 * 2318 / (188.08 - 38.649) / 8(gpu) = 7.911 samples/second ``` #### Benchmark on 4x16GB V100 + AutoCast On Bloom560M model, there is 1.8% throughput gains on the same batch size, 24% gains with corresponding maximum batch size. Also it allow ORT run bigger batch size (from 3 to 4) on following recipe. 
``` torchrun --nproc_per_node=4 examples/onnxruntime/training/language-modeling/run_clm.py --model_name_or_path bigscience/bloom-560m --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --num_train_epochs 10 --per_device_train_batch_size 3 --per_device_eval_batch_size 1 --do_train --overwrite_output_dir --output_dir ./outputs/ --seed 1137 --fp16 --report_to none --optim adamw_ort_fused --max_steps 200 --logging_steps 1 --use_module_with_loss ``` ##### Main branch ``` Total overhead: 4789ms where export takes 3798ms. ***** train metrics ***** epoch = 1.02 train_loss = 20.3338 train_runtime = 0:01:42.78 train_samples = 2343 train_samples_per_second = 23.349 train_steps_per_second = 1.946 throughput per gpu=1.02 * 2343 / (102.78 - 4.789) / 4(gpu) = 6.097 samples/second ``` ##### This PR ``` Total overhead: 4608ms where export takes 3555ms. ***** train metrics ***** epoch = 1.02 train_loss = 20.3364 train_runtime = 0:01:40.87 train_samples = 2343 train_samples_per_second = 23.792 throughput per gpu=1.02 * 2343 / (100.87 - 4.608) / 4(gpu) = 6.207 samples/second ``` With this PR, also can run batch size 4 (main branch fails), ``` Total overhead: 4743ms where export takes 3698ms. ***** train metrics ***** epoch = 1.36 train_loss = 20.2096 train_runtime = 0:01:50.42 train_samples = 2343 train_samples_per_second = 28.979 train_steps_per_second = 1.811 throughput per gpu= 1.36 * 2343 / (110 - 4.743) / 4(gpu) =7.57 sample/second ``` #### Benchmark on 8x32GB V100 + AutoCast On Bloom560M model, there is 0.9% throughput gains on the same batch size, 8.6% gains with corresponding maximum batch size. Also it allow ORT run bigger batch size (from 3 to 4) on following recipe. 
``` torchrun --nproc_per_node=8 examples/onnxruntime/training/language-modeling/run_clm.py --model_name_or_path bigscience/bloom-560m --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --num_train_epochs 10 --per_device_train_batch_size 3 --per_device_eval_batch_size 1 --do_train --overwrite_output_dir --output_dir ./outputs/ --seed 1137 --fp16 --report_to none --optim adamw_ort_fused --max_steps 200 --logging_steps 1 --use_module_with_loss ``` ##### Main branch ``` Total overhead: 55259ms where export takes 51140ms. ***** train metrics ***** epoch = 2.06 train_loss = 2.8788 train_runtime = 0:02:36.65 train_samples = 2318 train_samples_per_second = 30.64 train_steps_per_second = 1.277 throughput per gpu=2.06 * 2318 / (156.65 - 55.259) / 8(gpu) = 5.887 samples/second ``` ##### This PR ``` Total overhead: 55712ms where export takes 51418ms. ***** train metrics ***** epoch = 2.06 train_loss = 2.8696 train_runtime = 0:02:36.19 train_samples = 2318 train_samples_per_second = 30.731 train_steps_per_second = 1.28 throughput per gpu=2.06 * 2318/ (156.19 - 55.712) / 8(gpu) = 5.940 samples/second ``` With this PR, also can run batch size 4 (main branch fails), ``` Total overhead: 54238ms where export takes 49899ms. ***** train metrics ***** epoch = 2.74 train_loss = 2.7692 train_runtime = 0:02:58.47 train_samples = 2318 train_samples_per_second = 35.859 train_steps_per_second = 1.121 throughput per gpu= 2.74 * 2318 / (178.47 - 54.238) / 8(gpu) =6.391sample/second ```
This commit is contained in:
parent
67f4cd54fa
commit
bcebd3b1ca
2 changed files with 238 additions and 0 deletions
|
|
@ -345,6 +345,65 @@ std::optional<SliceInfo> IsSupportedShrunkenGather(Graph& graph, Node& node,
|
|||
return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check if the Slice node can be up-streamed to the previous node.
|
||||
*
|
||||
* If "Slice" node is operating on one single axis, then it is supported.
|
||||
* @return std::optional<SliceInfo>
|
||||
*/
|
||||
std::optional<SliceInfo> IsSupportedSlice(Graph& graph, Node& node,
|
||||
const InlinedHashSet<std::string_view>&
|
||||
compatible_execution_providers,
|
||||
const logging::Logger& logger) {
|
||||
if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {10, 11, 13}) ||
|
||||
!graph_utils::IsSupportedProvider(node, compatible_execution_providers)) {
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const NodeArg* data_input = node.InputDefs()[0];
|
||||
const NodeArg* starts_input = node.InputDefs()[1];
|
||||
const NodeArg* ends_input = node.InputDefs()[2];
|
||||
const NodeArg* axes_input = node.InputDefs().size() > 3 ? node.InputDefs()[3] : nullptr;
|
||||
|
||||
if (data_input->Shape() == nullptr || starts_input->Shape() == nullptr || ends_input->Shape() == nullptr ||
|
||||
(axes_input && axes_input->Shape() == nullptr)) {
|
||||
LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to undefined shape.");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Make sure starts/ends/axes/steps are all 1D tensors, since we only support single-dimension slicing.
|
||||
if (starts_input->Shape()->dim_size() != 1 || ends_input->Shape()->dim_size() != 1 ||
|
||||
(axes_input && axes_input->Shape()->dim_size() != 1)) {
|
||||
LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to unsupported dim size: " +
|
||||
std::to_string(starts_input->Shape()->dim_size()) + ", " +
|
||||
std::to_string(ends_input->Shape()->dim_size()) + ", " +
|
||||
std::to_string(axes_input ? axes_input->Shape()->dim_size() : 0));
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Try to parse the 'axes' value.
|
||||
int axis = 0;
|
||||
if (axes_input) {
|
||||
InlinedVector<int64_t> axes_values;
|
||||
if (!graph_utils::IsConstantInitializer(graph, axes_input->Name()) ||
|
||||
!optimizer_utils::AppendTensorFromInitializer(graph, *axes_input, axes_values, true) ||
|
||||
axes_values.size() != 1) {
|
||||
return std::nullopt;
|
||||
}
|
||||
axis = static_cast<int>(axes_values[0]);
|
||||
} else {
|
||||
// If 'axes' is not specified, then it is [0, .., r-1], so we force data rank to be 1.
|
||||
if (data_input->Shape()->dim_size() != 1) {
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
if (axis < 0)
|
||||
axis += data_input->Shape()->dim_size();
|
||||
|
||||
return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, true);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::optional<SliceInfo> UpStreamGatherGraphTransformer::IsSupportedForUpstream(
|
||||
|
|
@ -358,6 +417,9 @@ std::optional<SliceInfo> UpStreamGatherGraphTransformer::IsSupportedForUpstream(
|
|||
if (!gather_info.has_value()) {
|
||||
gather_info = IsSupportedShrunkenGather(graph, node, GetCompatibleExecutionProviders(), logger);
|
||||
}
|
||||
if (!gather_info.has_value()) {
|
||||
gather_info = IsSupportedSlice(graph, node, GetCompatibleExecutionProviders(), logger);
|
||||
}
|
||||
return gather_info;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1576,6 +1576,182 @@ TEST(ComputeOptimizerTests, ShrunkenGatherElementwiseOps_PropagationOnTwoBranche
|
|||
1, pre_graph_checker, post_graph_checker));
|
||||
}
|
||||
|
||||
/*
Test graph as below.
         graph input [4, 32, 256] (int64_t)    graph input [4, 32, 256] (int64_t)
                        |                          |
                        \_____________  ______________/
                                      \/
                 Add    starts:(0)  ends: (-1)  axes: (1)  steps: (1)
                  \       \           |         /          /
                   \       \          |        /          /
                    \       \         |       /          /
                     \       \        |      /          /
                      \       \       |     /          /
                                  Slice
                                    |
                                 Identity
                                    |
                     graph output [4, 31, 256] (int64_t)

Add an Identity node because currently we don't allow Slice to generate a graph output.
*/
TEST(ComputeOptimizerTests, SliceElementwiseOps_PropagationOnTwoBranches) {
  const logging::Logger* logger = &logging::LoggingManager::DefaultLogger();
  // Captures the original Slice 'starts' initializer values in the pre checker,
  // so the post checker can verify the propagated Slices reuse the same values.
  InlinedVector<int64_t> starts_indices;
  auto pre_graph_checker = [&starts_indices](Graph& graph) -> Status {
    // Before the transform: exactly one Add, one Slice, one Identity.
    auto op_count_pre = CountOpsInGraph(graph);
    TEST_RETURN_IF_NOT(op_count_pre.size() == 3U);
    TEST_RETURN_IF_NOT(op_count_pre["Add"] == 1);
    TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1);
    TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1);

    for (Node& node : graph.Nodes()) {
      if (node.OpType() == "Slice") {
        TEST_RETURN_IF_NOT(starts_indices.empty());
        constexpr bool require_constant = true;
        // Input 1 of Slice is 'starts'; read its constant-initializer values.
        NodeArg* initializer_node_arg = graph.GetNodeArg(node.InputDefs()[1]->Name());
        TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, starts_indices,
                                                                        require_constant));
      }
    }
    return Status::OK();
  };

  auto post_graph_checker = [&starts_indices](Graph& graph) {
    // After the transform: the single Slice has been pushed above Add, one
    // copy on each of Add's two input branches.
    auto op_count_post = CountOpsInGraph(graph);
    TEST_RETURN_IF_NOT(op_count_post.size() == 3U);
    TEST_RETURN_IF_NOT(op_count_post["Add"] == 1);
    TEST_RETURN_IF_NOT(op_count_post["Slice"] == 2);
    TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1);

    for (Node& node : graph.Nodes()) {
      if (node.OpType() == "Add") {
        const auto& input_defs = node.InputDefs();

        {
          // First Add input must now be produced by a Slice whose 'starts'
          // matches the original Slice's 'starts'.
          auto producer_node = graph.GetProducerNode(input_defs[0]->Name());
          TEST_RETURN_IF_NOT(producer_node != nullptr);
          TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice");

          InlinedVector<int64_t> values;
          constexpr bool require_constant = true;
          NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name());
          TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values,
                                                                          require_constant));
          for (size_t i = 0; i < values.size(); i++) {
            TEST_RETURN_IF_NOT(values[i] == starts_indices[i]);
          }
        }

        {
          // Same check for the second Add input branch.
          auto producer_node = graph.GetProducerNode(input_defs[1]->Name());
          TEST_RETURN_IF_NOT(producer_node != nullptr);
          TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice");

          InlinedVector<int64_t> values;
          constexpr bool require_constant = true;
          NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name());
          TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, require_constant));
          for (size_t i = 0; i < values.size(); i++) {
            TEST_RETURN_IF_NOT(values[i] == starts_indices[i]);
          }
        }
      }
    }
    return Status::OK();
  };

  auto build_test_case = [](ModelTestBuilder& builder) {
    auto* input1_arg = builder.MakeInput<int64_t>({{4, 32, 256}});
    auto* input2_arg = builder.MakeInput<int64_t>({{4, 32, 256}});
    auto* add_out = builder.MakeIntermediate();
    builder.AddNode("Add", {input1_arg, input2_arg}, {add_out});

    // Single-axis slice (axis 1, drop the last element): eligible for upstreaming.
    auto* starts_initializer = builder.MakeInitializer<int64_t>({1}, {0});
    auto* ends_initializer = builder.MakeInitializer<int64_t>({1}, {-1});
    auto* axes_initializer = builder.MakeInitializer<int64_t>({1}, {1});
    auto* steps_initializer = builder.MakeInitializer<int64_t>({1}, {1});
    auto* slice_out = builder.MakeIntermediate();
    builder.AddNode("Slice", {add_out, starts_initializer, ends_initializer, axes_initializer, steps_initializer},
                    {slice_out});

    auto* identity_out = builder.MakeOutput();
    builder.AddNode("Identity", {slice_out}, {identity_out});
  };

  std::unique_ptr<GraphTransformer> transformer = std::make_unique<UpStreamGatherGraphTransformer>();
  ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer),
                                        TransformerLevel::Level1,
                                        1, pre_graph_checker, post_graph_checker));
}
|
||||
|
||||
/*
Test graph as below.
         graph input [4, 32, 256] (int64_t)    graph input [4, 32, 256] (int64_t)
                        |                          |
                        \_____________  ______________/
                                      \/
            Add    starts:(0,0)  ends: (-1,-1)  axes: (0,1)  steps: (1,1)
             \       \             |            /            /
              \       \            |           /            /
               \       \           |          /            /
                \       \          |         /            /
                 \       \         |        /            /
                              Slice
                                |
                             Identity
                                |
                 graph output [3, 31, 256] (int64_t)

Add an Identity node because currently we don't allow Slice to generate a graph output.
This Slice operates on two axes, so the transformer must leave the graph unchanged.
*/
TEST(ComputeOptimizerTests, SliceElementwiseOps_NoPropagationForMutipleAxesSlice) {
  const logging::Logger* logger = &logging::LoggingManager::DefaultLogger();
  auto pre_graph_checker = [](Graph& graph) -> Status {
    // Before the transform: exactly one Add, one Slice, one Identity.
    auto op_count_pre = CountOpsInGraph(graph);
    TEST_RETURN_IF_NOT(op_count_pre.size() == 3U);
    TEST_RETURN_IF_NOT(op_count_pre["Add"] == 1);
    TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1);
    TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1);

    return Status::OK();
  };

  auto post_graph_checker = [](Graph& graph) {
    // After the transform: identical op counts — the multi-axes Slice must NOT
    // have been propagated above the Add.
    auto op_count_post = CountOpsInGraph(graph);
    TEST_RETURN_IF_NOT(op_count_post.size() == 3U);
    TEST_RETURN_IF_NOT(op_count_post["Add"] == 1);
    TEST_RETURN_IF_NOT(op_count_post["Slice"] == 1);
    TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1);

    return Status::OK();
  };

  auto build_test_case = [](ModelTestBuilder& builder) {
    auto* input1_arg = builder.MakeInput<int64_t>({{4, 32, 256}});
    auto* input2_arg = builder.MakeInput<int64_t>({{4, 32, 256}});
    auto* add_out = builder.MakeIntermediate();
    builder.AddNode("Add", {input1_arg, input2_arg}, {add_out});

    // Two-axes slice (axes 0 and 1): not supported for upstreaming.
    auto* starts_initializer = builder.MakeInitializer<int64_t>({2}, {0, 0});
    auto* ends_initializer = builder.MakeInitializer<int64_t>({2}, {-1, -1});
    auto* axes_initializer = builder.MakeInitializer<int64_t>({2}, {0, 1});
    auto* steps_initializer = builder.MakeInitializer<int64_t>({2}, {1, 1});
    auto* slice_out = builder.MakeIntermediate();
    builder.AddNode("Slice", {add_out, starts_initializer, ends_initializer, axes_initializer, steps_initializer},
                    {slice_out});

    auto* identity_out = builder.MakeOutput();
    builder.AddNode("Identity", {slice_out}, {identity_out});
  };

  std::unique_ptr<GraphTransformer> transformer = std::make_unique<UpStreamGatherGraphTransformer>();
  ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer),
                                        TransformerLevel::Level1,
                                        1, pre_graph_checker, post_graph_checker));
}
|
||||
|
||||
/*
|
||||
Test graph include multiple equivalent subgraphs as below.
|
||||
graph input [4, 32, 256] (int64_t) graph input [4, 32, 256] (int64_t)
|
||||
|
|
|
|||
Loading…
Reference in a new issue