From bcebd3b1ca415e2b22b033c17a50ec0c4ecbc14c Mon Sep 17 00:00:00 2001
From: pengwa
Date: Mon, 10 Jul 2023 08:36:11 +0800
Subject: [PATCH] Allow upstream for Slice on single axis (#16410)

### Allow upstream for Slice on single axis

#### Benchmark on 8x32GB V100 + DeepSpeed

On the Bloom560M model, there is a 1.5% throughput gain at the same max batch size of 6.

```
torchrun --nproc_per_node=8 examples/onnxruntime/training/language-modeling/run_clm.py --model_name_or_path bigscience/bloom-560m --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --num_train_epochs 10 --per_device_train_batch_size 6 --per_device_eval_batch_size 1 --do_train --overwrite_output_dir --output_dir ./outputs/ --seed 1137 --fp16 --report_to none --optim adamw_ort_fused --max_steps 200 --logging_steps 1 --use_module_with_loss --deepspeed aml_ds_config_zero_1.json
```

##### Main branch

```
Total overhead: 38957ms where export takes 35493ms.

***** train metrics *****
  epoch                    = 4.08
  train_loss               = 2.6841
  train_runtime            = 0:03:10.67
  train_samples            = 2318
  train_samples_per_second = 50.348
  train_steps_per_second   = 1.049

throughput per gpu = 4.08 * 2318 / (190.67 - 38.957) / 8(gpu) = 7.792 samples/second
```

##### This PR

```
Total overhead: 38649ms where export takes 34946ms.

***** train metrics *****
  epoch                    = 4.08
  train_loss               = 2.6757
  train_runtime            = 0:03:08.08
  train_samples            = 2318
  train_samples_per_second = 51.04
  train_steps_per_second   = 1.063

throughput per gpu = 4.08 * 2318 / (188.08 - 38.649) / 8(gpu) = 7.911 samples/second
```
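All "throughput per gpu" numbers in this description are derived the same way: total samples processed (epoch count times train_samples), divided by the train runtime minus the one-time overhead (mostly export), divided by the GPU count. A minimal sketch of that arithmetic (the helper below is illustrative only, not part of this PR):

```
#include <cstdio>

// throughput per gpu = epochs * samples / (runtime - one-time overhead) / gpus
double ThroughputPerGpu(double epochs, double samples, double runtime_sec,
                        double overhead_sec, int num_gpus) {
  return epochs * samples / (runtime_sec - overhead_sec) / num_gpus;
}

int main() {
  // Main branch: 4.08 * 2318 / (190.67 - 38.957) / 8 ~= 7.792 samples/second
  std::printf("%.3f\n", ThroughputPerGpu(4.08, 2318, 190.67, 38.957, 8));
  // This PR:     4.08 * 2318 / (188.08 - 38.649) / 8 ~= 7.911 samples/second
  std::printf("%.3f\n", ThroughputPerGpu(4.08, 2318, 188.08, 38.649, 8));
  return 0;
}
```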
#### Benchmark on 4x16GB V100 + AutoCast

On the Bloom560M model, there is a 1.8% throughput gain at the same batch size, and a 24% gain at the corresponding maximum batch size. This PR also allows ORT to run a bigger batch size (from 3 to 4) with the following recipe.

```
torchrun --nproc_per_node=4 examples/onnxruntime/training/language-modeling/run_clm.py --model_name_or_path bigscience/bloom-560m --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --num_train_epochs 10 --per_device_train_batch_size 3 --per_device_eval_batch_size 1 --do_train --overwrite_output_dir --output_dir ./outputs/ --seed 1137 --fp16 --report_to none --optim adamw_ort_fused --max_steps 200 --logging_steps 1 --use_module_with_loss
```

##### Main branch

```
Total overhead: 4789ms where export takes 3798ms.

***** train metrics *****
  epoch                    = 1.02
  train_loss               = 20.3338
  train_runtime            = 0:01:42.78
  train_samples            = 2343
  train_samples_per_second = 23.349
  train_steps_per_second   = 1.946

throughput per gpu = 1.02 * 2343 / (102.78 - 4.789) / 4(gpu) = 6.097 samples/second
```

##### This PR

```
Total overhead: 4608ms where export takes 3555ms.

***** train metrics *****
  epoch                    = 1.02
  train_loss               = 20.3364
  train_runtime            = 0:01:40.87
  train_samples            = 2343
  train_samples_per_second = 23.792

throughput per gpu = 1.02 * 2343 / (100.87 - 4.608) / 4(gpu) = 6.207 samples/second
```

With this PR, batch size 4 also runs (the main branch fails with it):

```
Total overhead: 4743ms where export takes 3698ms.

***** train metrics *****
  epoch                    = 1.36
  train_loss               = 20.2096
  train_runtime            = 0:01:50.42
  train_samples            = 2343
  train_samples_per_second = 28.979
  train_steps_per_second   = 1.811

throughput per gpu = 1.36 * 2343 / (110 - 4.743) / 4(gpu) = 7.57 samples/second
```

#### Benchmark on 8x32GB V100 + AutoCast

On the Bloom560M model, there is a 0.9% throughput gain at the same batch size, and an 8.6% gain at the corresponding maximum batch size. This PR also allows ORT to run a bigger batch size (from 3 to 4) with the following recipe.

```
torchrun --nproc_per_node=8 examples/onnxruntime/training/language-modeling/run_clm.py --model_name_or_path bigscience/bloom-560m --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --num_train_epochs 10 --per_device_train_batch_size 3 --per_device_eval_batch_size 1 --do_train --overwrite_output_dir --output_dir ./outputs/ --seed 1137 --fp16 --report_to none --optim adamw_ort_fused --max_steps 200 --logging_steps 1 --use_module_with_loss
```

##### Main branch

```
Total overhead: 55259ms where export takes 51140ms.

***** train metrics *****
  epoch                    = 2.06
  train_loss               = 2.8788
  train_runtime            = 0:02:36.65
  train_samples            = 2318
  train_samples_per_second = 30.64
  train_steps_per_second   = 1.277

throughput per gpu = 2.06 * 2318 / (156.65 - 55.259) / 8(gpu) = 5.887 samples/second
```

##### This PR

```
Total overhead: 55712ms where export takes 51418ms.

***** train metrics *****
  epoch                    = 2.06
  train_loss               = 2.8696
  train_runtime            = 0:02:36.19
  train_samples            = 2318
  train_samples_per_second = 30.731
  train_steps_per_second   = 1.28

throughput per gpu = 2.06 * 2318 / (156.19 - 55.712) / 8(gpu) = 5.940 samples/second
```

With this PR, batch size 4 also runs (the main branch fails with it):

```
Total overhead: 54238ms where export takes 49899ms.

***** train metrics *****
  epoch                    = 2.74
  train_loss               = 2.7692
  train_runtime            = 0:02:58.47
  train_samples            = 2318
  train_samples_per_second = 35.859
  train_steps_per_second   = 1.121

throughput per gpu = 2.74 * 2318 / (178.47 - 54.238) / 8(gpu) = 6.391 samples/second
```
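In short, the transformer now moves a single-axis Slice upstream through elementwise ops such as Add, so the elementwise op computes on the already-shrunken tensors. The rewrite is value-preserving because slicing commutes with elementwise ops: Slice(Add(a, b)) == Add(Slice(a), Slice(b)). A minimal 1-D sketch of that equivalence (plain stand-ins, not ORT APIs):

```
#include <cassert>
#include <cstddef>
#include <vector>

// Elementwise add, standing in for the ONNX Add node.
std::vector<float> Add(const std::vector<float>& a, const std::vector<float>& b) {
  std::vector<float> out(a.size());
  for (size_t i = 0; i < a.size(); ++i) out[i] = a[i] + b[i];
  return out;
}

// 1-D slice [start, end) with step 1, mirroring the single-axis case this PR handles.
std::vector<float> Slice(const std::vector<float>& x, size_t start, size_t end) {
  return {x.begin() + start, x.begin() + end};
}

int main() {
  const std::vector<float> a{1, 2, 3, 4}, b{5, 6, 7, 8};
  // Slicing the sum equals summing the slices, so the Slice can be upstreamed
  // and the Add then runs on less data.
  assert(Slice(Add(a, b), 0, 3) == Add(Slice(a, 0, 3), Slice(b, 0, 3)));
  return 0;
}
```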
---
 .../compute_optimizer/upstream_gather.cc      |  62 ++++++
 .../test/optimizer/compute_optimizer_test.cc  | 176 ++++++++++++++++++
 2 files changed, 238 insertions(+)

diff --git a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc
index 319b61fa32..9ad5edf4f2 100644
--- a/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc
+++ b/onnxruntime/core/optimizer/compute_optimizer/upstream_gather.cc
@@ -345,6 +345,65 @@ std::optional<SliceInfo> IsSupportedShrunkenGather(Graph& graph, Node& node,
   return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, true);
 }
 
+/**
+ * @brief Check if the Slice node can be up-streamed to the previous node.
+ *
+ * The Slice node is supported if it operates on one single axis.
+ * @return std::optional<SliceInfo>
+ */
+std::optional<SliceInfo> IsSupportedSlice(Graph& graph, Node& node,
+                                          const InlinedHashSet<std::string_view>&
+                                              compatible_execution_providers,
+                                          const logging::Logger& logger) {
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {10, 11, 13}) ||
+      !graph_utils::IsSupportedProvider(node, compatible_execution_providers)) {
+    return std::nullopt;
+  }
+
+  const NodeArg* data_input = node.InputDefs()[0];
+  const NodeArg* starts_input = node.InputDefs()[1];
+  const NodeArg* ends_input = node.InputDefs()[2];
+  const NodeArg* axes_input = node.InputDefs().size() > 3 ? node.InputDefs()[3] : nullptr;
+
+  if (data_input->Shape() == nullptr || starts_input->Shape() == nullptr || ends_input->Shape() == nullptr ||
+      (axes_input && axes_input->Shape() == nullptr)) {
+    LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to undefined shape.");
+    return std::nullopt;
+  }
+
+  // Make sure starts/ends/axes/steps are all 1-D tensors, since we only support single-dimension slicing.
+  if (starts_input->Shape()->dim_size() != 1 || ends_input->Shape()->dim_size() != 1 ||
+      (axes_input && axes_input->Shape()->dim_size() != 1)) {
+    LOG_DEBUG_INFO(logger, "Skip Slice node " + node.Name() + " due to unsupported dim size: " +
+                               std::to_string(starts_input->Shape()->dim_size()) + ", " +
+                               std::to_string(ends_input->Shape()->dim_size()) + ", " +
+                               std::to_string(axes_input ? axes_input->Shape()->dim_size() : 0));
+    return std::nullopt;
+  }
+
+  // Try to parse the 'axes' value.
+  int axis = 0;
+  if (axes_input) {
+    InlinedVector<int64_t> axes_values;
+    if (!graph_utils::IsConstantInitializer(graph, axes_input->Name()) ||
+        !optimizer_utils::AppendTensorFromInitializer(graph, *axes_input, axes_values, true) ||
+        axes_values.size() != 1) {
+      return std::nullopt;
+    }
+    axis = static_cast<int>(axes_values[0]);
+  } else {
+    // If 'axes' is not specified, it defaults to [0, ..., r-1], so we require the data rank to be 1.
+    if (data_input->Shape()->dim_size() != 1) {
+      return std::nullopt;
+    }
+  }
+
+  if (axis < 0)
+    axis += data_input->Shape()->dim_size();
+
+  return SliceInfo(graph, &node, false /*is_slice_scalar*/, "axis", axis, true);
+}
+
 }  // namespace
 
 std::optional<SliceInfo> UpStreamGatherGraphTransformer::IsSupportedForUpstream(
@@ -358,6 +417,9 @@ std::optional<SliceInfo> UpStreamGatherGraphTransformer::IsSupportedForUpstream(
   if (!gather_info.has_value()) {
     gather_info = IsSupportedShrunkenGather(graph, node, GetCompatibleExecutionProviders(), logger);
   }
+  if (!gather_info.has_value()) {
+    gather_info = IsSupportedSlice(graph, node, GetCompatibleExecutionProviders(), logger);
+  }
 
   return gather_info;
 }
diff --git a/onnxruntime/test/optimizer/compute_optimizer_test.cc b/onnxruntime/test/optimizer/compute_optimizer_test.cc
index d374492057..55a7864820 100644
--- a/onnxruntime/test/optimizer/compute_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/compute_optimizer_test.cc
@@ -1576,6 +1576,182 @@ TEST(ComputeOptimizerTests, ShrunkenGatherElementwiseOps_PropagationOnTwoBranche
                                          1, pre_graph_checker, post_graph_checker));
 }
 
+/*
+Test graph includes multiple equivalent subgraphs as below.
+      graph input [4, 32, 256] (float)    graph input [4, 32, 256] (float)
+                      |                               |
+                       \_____________   ______________/
+                                     \ /
+                                     Add    starts:(0)  ends: (-1)  axes: (1)  steps: (1)
+                                       \      \           |          /         /
+                                        \      \          |         /         /
+                                         \      \         |        /         /
+                                          \      \        |       /         /
+                                           \      \       |      /         /
+                                                      Slice
+                                                        |
+                                                    Identity
+                                                        |
+                                     graph output [4, 31, 256] (float)
+
+Add an Identity node because currently we don't allow Slice to generate a graph output.
+*/
+TEST(ComputeOptimizerTests, SliceElementwiseOps_PropagationOnTwoBranches) {
+  const logging::Logger* logger = &logging::LoggingManager::DefaultLogger();
+  InlinedVector<int64_t> starts_indices;
+  auto pre_graph_checker = [&starts_indices](Graph& graph) -> Status {
+    auto op_count_pre = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_count_pre.size() == 3U);
+    TEST_RETURN_IF_NOT(op_count_pre["Add"] == 1);
+    TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1);
+    TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1);
+
+    for (Node& node : graph.Nodes()) {
+      if (node.OpType() == "Slice") {
+        TEST_RETURN_IF_NOT(starts_indices.empty());
+        constexpr bool require_constant = true;
+        NodeArg* initializer_node_arg = graph.GetNodeArg(node.InputDefs()[1]->Name());
+        TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, starts_indices,
+                                                                        require_constant));
+      }
+    }
+    return Status::OK();
+  };
+
+  auto post_graph_checker = [&starts_indices](Graph& graph) {
+    auto op_count_post = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_count_post.size() == 3U);
+    TEST_RETURN_IF_NOT(op_count_post["Add"] == 1);
+    TEST_RETURN_IF_NOT(op_count_post["Slice"] == 2);
+    TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1);
+
+    for (Node& node : graph.Nodes()) {
+      if (node.OpType() == "Add") {
+        const auto& input_defs = node.InputDefs();
+
+        {
+          auto producer_node = graph.GetProducerNode(input_defs[0]->Name());
+          TEST_RETURN_IF_NOT(producer_node != nullptr);
+          TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice");
+
+          InlinedVector<int64_t> values;
+          constexpr bool require_constant = true;
+          NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name());
+          TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values,
+                                                                          require_constant));
+          for (size_t i = 0; i < values.size(); i++) {
+            TEST_RETURN_IF_NOT(values[i] == starts_indices[i]);
+          }
+        }
+
+        {
+          auto producer_node = graph.GetProducerNode(input_defs[1]->Name());
+          TEST_RETURN_IF_NOT(producer_node != nullptr);
+          TEST_RETURN_IF_NOT(producer_node->OpType() == "Slice");
+
+          InlinedVector<int64_t> values;
+          constexpr bool require_constant = true;
+          NodeArg* initializer_node_arg = graph.GetNodeArg(producer_node->InputDefs()[1]->Name());
+          TEST_RETURN_IF_NOT(optimizer_utils::AppendTensorFromInitializer(graph, *initializer_node_arg, values, require_constant));
+          for (size_t i = 0; i < values.size(); i++) {
+            TEST_RETURN_IF_NOT(values[i] == starts_indices[i]);
+          }
+        }
+      }
+    }
+    return Status::OK();
+  };
+
+  auto build_test_case = [](ModelTestBuilder& builder) {
+    auto* input1_arg = builder.MakeInput<float>({{4, 32, 256}});
+    auto* input2_arg = builder.MakeInput<float>({{4, 32, 256}});
+    auto* add_out = builder.MakeIntermediate();
+    builder.AddNode("Add", {input1_arg, input2_arg}, {add_out});
+
+    auto* starts_initializer = builder.MakeInitializer<int64_t>({1}, {0});
+    auto* ends_initializer = builder.MakeInitializer<int64_t>({1}, {-1});
+    auto* axes_initializer = builder.MakeInitializer<int64_t>({1}, {1});
+    auto* steps_initializer = builder.MakeInitializer<int64_t>({1}, {1});
+    auto* slice_out = builder.MakeIntermediate();
+    builder.AddNode("Slice", {add_out, starts_initializer, ends_initializer, axes_initializer, steps_initializer},
+                    {slice_out});
+
+    auto* identity_out = builder.MakeOutput();
+    builder.AddNode("Identity", {slice_out}, {identity_out});
+  };
+
+  std::unique_ptr<GraphTransformer> transformer = std::make_unique<UpStreamGatherGraphTransformer>();
+  ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer),
+                                        TransformerLevel::Level1,
+                                        1,
+                                        pre_graph_checker, post_graph_checker));
+}
+
+/*
+Test graph includes multiple equivalent subgraphs as below.
+      graph input [4, 32, 256] (float)    graph input [4, 32, 256] (float)
+                      |                               |
+                       \_____________   ______________/
+                                     \ /
+                                     Add    starts:(0,0)  ends: (-1,-1)  axes: (0,1)  steps: (1,1)
+                                       \      \             |              /           /
+                                        \      \            |             /           /
+                                         \      \           |            /           /
+                                          \      \          |           /           /
+                                           \      \         |          /           /
+                                                        Slice
+                                                          |
+                                                      Identity
+                                                          |
+                                     graph output [3, 31, 256] (float)
+
+Add an Identity node because currently we don't allow Slice to generate a graph output.
+*/
+TEST(ComputeOptimizerTests, SliceElementwiseOps_NoPropagationForMutipleAxesSlice) {
+  const logging::Logger* logger = &logging::LoggingManager::DefaultLogger();
+  auto pre_graph_checker = [](Graph& graph) -> Status {
+    auto op_count_pre = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_count_pre.size() == 3U);
+    TEST_RETURN_IF_NOT(op_count_pre["Add"] == 1);
+    TEST_RETURN_IF_NOT(op_count_pre["Slice"] == 1);
+    TEST_RETURN_IF_NOT(op_count_pre["Identity"] == 1);
+
+    return Status::OK();
+  };
+
+  auto post_graph_checker = [](Graph& graph) {
+    auto op_count_post = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_count_post.size() == 3U);
+    TEST_RETURN_IF_NOT(op_count_post["Add"] == 1);
+    TEST_RETURN_IF_NOT(op_count_post["Slice"] == 1);
+    TEST_RETURN_IF_NOT(op_count_post["Identity"] == 1);
+
+    return Status::OK();
+  };
+
+  auto build_test_case = [](ModelTestBuilder& builder) {
+    auto* input1_arg = builder.MakeInput<float>({{4, 32, 256}});
+    auto* input2_arg = builder.MakeInput<float>({{4, 32, 256}});
+    auto* add_out = builder.MakeIntermediate();
+    builder.AddNode("Add", {input1_arg, input2_arg}, {add_out});
+
+    auto* starts_initializer = builder.MakeInitializer<int64_t>({2}, {0, 0});
+    auto* ends_initializer = builder.MakeInitializer<int64_t>({2}, {-1, -1});
+    auto* axes_initializer = builder.MakeInitializer<int64_t>({2}, {0, 1});
+    auto* steps_initializer = builder.MakeInitializer<int64_t>({2}, {1, 1});
+    auto* slice_out = builder.MakeIntermediate();
+    builder.AddNode("Slice", {add_out, starts_initializer, ends_initializer, axes_initializer, steps_initializer},
+                    {slice_out});
+
+    auto* identity_out = builder.MakeOutput();
+    builder.AddNode("Identity", {slice_out}, {identity_out});
+  };
+
+  std::unique_ptr<GraphTransformer> transformer = std::make_unique<UpStreamGatherGraphTransformer>();
+  ASSERT_STATUS_OK(TestGraphTransformer(build_test_case, 14, *logger, std::move(transformer),
+                                        TransformerLevel::Level1,
+                                        1, pre_graph_checker, post_graph_checker));
+}
+
 /*
 Test graph include multiple equivalent subgraphs as below.
        graph input [4, 32, 256] (int64_t)    graph input [4, 32, 256] (int64_t)