Add nightly pipeline for MI100 to run convergence and batch size test similar to V100. (#6611)

* Partial updating of ROCM reduction code. * Update reduction_all.cu * Add reduce template parameters. * miopen common * Reuse CUDA's reduction_functions.cc * Reduction ops. * Update remaining reduction ops to use MIOpen. double datatype is not supported, so disable those typed kernels. * Disable a couple more unsupported tests. * Code formatting. * Delete ROCM-specific reduction code that is identical to CUDA reduction code. * Fix scratch buffer early free. * Fix merge conflict. * first attempt nightly amd ci pipeline * try fix bad yaml file * try again with corrected model directory * add convergence test as well * update reference loss for amd mi100 * include mi100 test results csv * update the mi100 convergence test reference values * update batch sizes for mi100 32g * fix gpu sku for run_convergence_test.py * undo unrelated changes to master * pr comments * pr comment Co-authored-by: Jesse Benson <jesseb@microsoft.com>
2026-07-20 19:12:24 +00:00 · 2021-02-12 16:22:06 -05:00 · 2021-02-12 16:22:06 -05:00 · e6de0eb813
commit e6de0eb813
parent f11b5d3072
5 changed files with 132 additions and 5 deletions
--- a/orttraining/orttraining/models/bert/main.cc
+++ b/orttraining/orttraining/models/bert/main.cc
@ -33,6 +33,29 @@ using namespace onnxruntime::training;
 using namespace onnxruntime::training::tensorboard;
 using namespace std;

+static SessionOptions session_options = {  // file-scope options handed to every TrainingRunner built in this file; use_deterministic_compute is overwritten from the command line
+    ExecutionMode::ORT_SEQUENTIAL,     // execution_mode
+    ExecutionOrder::PRIORITY_BASED,    // execution_order
+    false,                             // enable_profiling
+    ORT_TSTR(""),                      // optimized_model_filepath
+    true,                              // enable_mem_pattern
+    true,                              // enable_cpu_mem_arena
+    ORT_TSTR("onnxruntime_profile_"),  // profile_file_prefix
+    "",                                // session_logid
+    -1,                                // session_log_severity_level
+    0,                                 // session_log_verbosity_level
+    5,                                 // max_num_graph_transformation_steps
+    TransformerLevel::Level1,          // graph_optimization_level
+    {},                                // intra_op_param
+    {},                                // inter_op_param
+    {},                                // free_dimension_overrides
+    true,                              // use_per_session_threads
+    true,                              // thread_pool_allow_spinning
+    false,                             // use_deterministic_compute (default; replaced by the --use_deterministic_compute flag)
+    {},                                // session_configurations
+    {},                                // initializers_to_share_map
+}; 
+
 struct BertParameters : public TrainingRunner::Parameters {
  int max_sequence_length = 512;
  int max_predictions_per_sequence = 80;
@ -109,6 +132,7 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
      ("iterations_per_loop", "How many steps to make in each estimator call.", cxxopts::value<int>()->default_value("1000"))
      ("max_eval_steps", "Maximum number of eval steps.", cxxopts::value<int>()->default_value("100"))
      ("seed", "Random seed.", cxxopts::value<int64_t>()->default_value("-1"))
+      ("use_deterministic_compute", "Whether to enable deterministic compute.", cxxopts::value<bool>()->default_value("false"))
      ("use_mixed_precision", "Whether to use a mix of fp32 and fp16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
      ("use_bfloat16", "Whether to use BFloat16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
      ("enable_adasum", "Whether to use Adasum for allreduction.", cxxopts::value<bool>()->default_value("false"))
@ -469,6 +493,8 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
      std::cout << "Random seed is set to: " << seed << std::endl;
    }

+    session_options.use_deterministic_compute = flags["use_deterministic_compute"].as<bool>();
+
    params.enable_gelu_approximation = flags["enable_gelu_approximation"].as<bool>();
    params.attn_dropout_recompute = flags["attn_dropout_recompute"].as<bool>();
    params.gelu_recompute = flags["gelu_recompute"].as<bool>();
@ -746,7 +772,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
  auto random_perf_data = std::make_shared<RandomDataSet>(num_of_perf_samples, tensor_names, tensor_shapes, tensor_types);
  auto random_perf_data_loader = onnxruntime::make_unique<SingleDataLoader>(random_perf_data, tensor_names);

-  TrainingRunner runner{params, env};
+  TrainingRunner runner{params, env, session_options};
  ORT_RETURN_IF_ERROR(runner.Initialize());
  ORT_RETURN_IF_ERROR(runner.Run(random_perf_data_loader.get(), random_perf_data_loader.get()));

@ -756,7 +782,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
 static Status RunTraining(const BertParameters& params, const Environment& env) {
  const size_t max_num_files_preload = 2;

-  auto runner = onnxruntime::make_unique<TrainingRunner>(params, env);
+  auto runner = onnxruntime::make_unique<TrainingRunner>(params, env, session_options);
  ORT_RETURN_IF_ERROR(runner->Initialize());

  BertParameters params_for_phase;
--- a/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
+++ b/orttraining/tools/ci_test/results/bert_base.convergence.baseline.mi100.csv
@ -0,0 +1,11 @@
+step,total_loss,mlm_loss,nsp_loss
+0,11.217,10.5178,0.699256
+5,9.67644,7.52047,2.15598
+10,8.31964,7.54136,0.778281
+15,8.22823,7.54625,0.681978
+20,8.17299,7.49675,0.676236
+25,8.2415,7.5356,0.705902
+30,8.0874,7.39312,0.694279
+35,7.99095,7.25612,0.734829
+40,7.92988,7.25608,0.673804
+45,7.94762,7.27291,0.674713
--- a/orttraining/tools/ci_test/run_batch_size_test.py
+++ b/orttraining/tools/ci_test/run_batch_size_test.py
@ -13,6 +13,8 @@ def parse_args():
    parser = argparse.ArgumentParser(description="Runs a BERT batch size test.")
    parser.add_argument("--binary_dir", required=True, help="Path to the ORT binary directory.")
    parser.add_argument("--model_root", required=True, help="Path to the model root directory.")
+    parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False, 
+            help="GPU model (e.g. V100_16G, MI100_32G).")
    return parser.parse_args()


@ -24,7 +26,9 @@ def main():
                                               "max_batch_size", 
                                               "max_predictions_per_seq", 
                                               "additional_options"])
-    configs = [
+
+    configs = {}
+    configs['V100_16G'] = [
        Config(True, 128, 76, 20, ""),
        Config(True, 512, 11, 80, ""),
        Config(False, 128, 39, 20, ""),
@ -41,8 +45,15 @@ def main():
        Config(True, 512, 50, 80, "--transformer_layer_recompute"),
    ]

+    configs['MI100_32G'] = [
+        Config(True, 128, 201, 20, ""),
+        Config(True, 512, 31, 80, ""),
+        Config(False, 128, 109, 20, ""),
+        Config(False, 512, 16, 80, ""),
+    ]
+ 
    # run BERT training
-    for config in configs:
+    for config in configs[args.gpu_sku]:
        print("##### testing name - {}-{} #####".format("fp16" if config.enable_mixed_precision else "fp32",
                                                        config.sequence_length))
        cmds = [
--- a/orttraining/tools/ci_test/run_convergence_test.py
+++ b/orttraining/tools/ci_test/run_convergence_test.py
@ -20,6 +20,8 @@ def parse_args():
                      help="Path to the training data root directory.")
  parser.add_argument("--model_root", required=True,
                      help="Path to the model root directory.")
+  parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False, 
+                      help="GPU model (e.g. V100_16G, MI100_32G).")
  return parser.parse_args()

 def main():
@ -49,6 +51,7 @@ def main():
        "--gradient_accumulation_steps", "16",
        "--max_predictions_per_seq=20",
        "--use_mixed_precision",
+        "--use_deterministic_compute",
        "--allreduce_in_fp16",
        "--lambda", "0",
        "--use_nccl",
@ -57,10 +60,18 @@ def main():
        "--enable_grad_norm_clip=false",
    ]).check_returncode()

+    # reference data
+    if args.gpu_sku == 'MI100_32G':
+        reference_csv = "bert_base.convergence.baseline.mi100.csv"
+    elif args.gpu_sku == 'V100_16G':
+        reference_csv = "bert_base.convergence.baseline.csv"
+    else:
+        raise ValueError('Unrecognized gpu_sku {}'.format(args.gpu_sku))
+
    # verify output
    comparison_result = compare_results_files(
        expected_results_path=os.path.join(
-            SCRIPT_DIR, "results", "bert_base.convergence.baseline.csv"),
+            SCRIPT_DIR, "results", reference_csv),
        actual_results_path=convergence_test_output_path,
        field_comparisons={
            "step": Comparisons.eq(),
--- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-amd-e2e-test-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-amd-e2e-test-ci-pipeline.yml
@ -0,0 +1,68 @@
+trigger: none  # CI triggers are disabled; presumably this pipeline is scheduled or queued elsewhere (not shown here)
+
+name: 'orttraining_amd_nightly_$(Date:yyyyMMdd)_$(Rev:r)'  # run name: date plus revision counter
+pool: 'AMD-GPU'  # agent pool the job runs on
+
+jobs:
+- job: Onnxruntime_Linux_GPU_AMD_Training_E2E_Test
+
+  timeoutInMinutes: 60  # whole-job time limit
+
+  steps:
+  - checkout: self
+    clean: true
+    submodules: recursive  # also check out git submodules, recursively
+
+  - script: |-
+      echo "##vso[task.prependpath]/home/ciagent/conda/bin/"
+      echo "##vso[task.prependpath]/home/ciagent/pkg/openmpi-4.0.5/bin/"
+      echo '##vso[task.setvariable variable=LD_LIBRARY_PATH]/home/ciagent/pkg/openmpi-4.0.5/lib/'
+      eval "$('/home/ciagent/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
+      echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
+    displayName: 'Initialize environment'
+  
+  # Update the blob URL (snapshot) and the SHA-256 digest below together if the E2E test data changes. NOTE(review): the URL contains an unquoted '?', which the shell treats as a glob character; consider quoting it.
+  - script: |-
+      python orttraining/tools/ci_test/download_azure_blob_archive.py \
+        --azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
+        --target_dir training_e2e_test_data \
+        --archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
+    displayName: 'Download onnxruntime_training_data.zip data'
+
+  - script: |-
+      python tools/ci_build/build.py \
+        --config RelWithDebInfo \
+        --enable_training \
+        --mpi_home /home/ciagent/pkg/openmpi-4.0.5 \
+        --use_rocm \
+        --rocm_home /opt/rocm \
+        --nccl_home /opt/rocm \
+        --update \
+        --build_dir ./build \
+        --build \
+        --parallel 8 \
+        --build_wheel \
+        --skip_tests
+    displayName: 'Build onnxruntime'
+
+  - script: |-
+      cd ./build/RelWithDebInfo &&\
+      ../../tools/ci_build/github/pai/pai_test_launcher.sh
+    displayName: 'Run unit tests'
+
+  - script: |-
+     python orttraining/tools/ci_test/run_batch_size_test.py \
+       --binary_dir build/RelWithDebInfo \
+       --model_root training_e2e_test_data/models \
+       --gpu_sku MI100_32G
+    displayName: 'Run batch size test'
+    condition: succeededOrFailed() # run even if an earlier step failed, so that all tests are run
+
+  - script: |-
+     python orttraining/tools/ci_test/run_convergence_test.py \
+       --binary_dir build/RelWithDebInfo \
+       --model_root training_e2e_test_data/models \
+       --training_data_root training_e2e_test_data/data \
+       --gpu_sku MI100_32G
+    displayName: 'Run convergence test'
+    condition: succeededOrFailed() # run even if an earlier step failed, so that all tests are run