onnxruntime/orttraining/tools/ci_test/run_convergence_test.py
Suffian Khan e6de0eb813
Add nightly pipeline for MI100 to run convergence and batch size test similar to V100. (#6611)
* Partial updating of ROCM reduction code.

* Update reduction_all.cu

* Add reduce template parameters.

* miopen common

* Reuse CUDA's reduction_functions.cc

* Reduction ops.

* Update remaining reduction ops to use MIOpen. The double datatype is not supported, so disable those typed kernels.

* Disable a couple more unsupported tests.

* Code formatting.

* Delete ROCM-specific reduction code that is identical to CUDA reduction code.

* Fix scratch buffer early free.

* Fix merge conflict.

* first attempt nightly amd ci pipeline

* try fix bad yaml file

* try again with corrected model directory

* add convergence test as well

* update reference loss for amd mi100

* include mi100 test results csv

* update the mi100 convergence test reference values

* update batch sizes for mi100 32g

* fix gpu sku for run_convergence_test.py

* undo unrelated changes to master

* pr comments

* pr comment

Co-authored-by: Jesse Benson <jesseb@microsoft.com>
2021-02-12 13:22:06 -08:00

86 lines
3.1 KiB
Python
Executable file

#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import subprocess
import sys
import tempfile
import os
from compare_results import compare_results_files, Comparisons
# Directory containing this script (made canonical with os.path.realpath);
# main() joins it with "results" to locate the reference baseline CSV files.
SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__))
def parse_args(argv=None):
    """Parse the command-line options of the BERT convergence test.

    Args:
        argv: Optional list of argument strings to parse. When None (the
            default) argparse reads the arguments from ``sys.argv[1:]``,
            which is what existing zero-argument callers get.

    Returns:
        An ``argparse.Namespace`` with the attributes ``binary_dir``,
        ``training_data_root``, ``model_root`` and ``gpu_sku``. ``gpu_sku``
        is 'V100_16G' when the option is not supplied.

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            ``--gpu_sku`` is not one of the listed choices.
    """
    parser = argparse.ArgumentParser(description="Runs a BERT convergence test.")
    parser.add_argument("--binary_dir", required=True,
                        help="Path to the ORT binary directory.")
    parser.add_argument("--training_data_root", required=True,
                        help="Path to the training data root directory.")
    parser.add_argument("--model_root", required=True,
                        help="Path to the model root directory.")
    parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
                        help="GPU model (e.g. V100_16G, MI100_32G).")
    # Forwarding argv lets callers (and tests) supply arguments explicitly;
    # None keeps the historical behaviour of parsing the process arguments.
    return parser.parse_args(argv)
def main():
    """Run a fixed BERT training job and compare its output with a baseline.

    The function parses the command line, launches the
    ``onnxruntime_training_bert`` binary from ``--binary_dir`` with a fixed
    set of training options, and then passes the CSV written by that run
    together with the reference CSV for the selected ``--gpu_sku`` to
    ``compare_results_files``.

    Returns:
        0 if ``compare_results_files`` returns a truthy value, otherwise 1.

    Raises:
        ValueError: if the selected gpu_sku has no reference CSV.
        subprocess.CalledProcessError: if the training binary exits with a
            non-zero return code.
    """
    args = parse_args()

    # Resolve the reference data before training so that an unsupported SKU
    # is reported immediately instead of after the training run.
    reference_csv_by_sku = {
        'V100_16G': "bert_base.convergence.baseline.csv",
        'MI100_32G': "bert_base.convergence.baseline.mi100.csv",
    }
    if args.gpu_sku not in reference_csv_by_sku:
        raise ValueError('Unrecognized gpu_sku {}'.format(args.gpu_sku))
    reference_csv = reference_csv_by_sku[args.gpu_sku]

    # The training output lives in a temporary directory, so the comparison
    # has to happen before the `with` block exits and removes it.
    with tempfile.TemporaryDirectory() as output_dir:
        convergence_test_output_path = os.path.join(
            output_dir, "convergence_test_out.csv")

        # run BERT training; check=True raises CalledProcessError on failure
        subprocess.run([
            os.path.join(args.binary_dir, "onnxruntime_training_bert"),
            "--model_name", os.path.join(
                args.model_root, "nv/bert-base/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12"),
            "--train_data_dir", os.path.join(
                args.training_data_root, "128/books_wiki_en_corpus/train"),
            "--test_data_dir", os.path.join(
                args.training_data_root, "128/books_wiki_en_corpus/test"),
            "--train_batch_size", "64",
            "--mode", "train",
            "--num_train_steps", "800",
            "--display_loss_steps", "5",
            "--optimizer", "adam",
            "--learning_rate", "5e-4",
            "--warmup_ratio", "0.1",
            "--warmup_mode", "Linear",
            "--gradient_accumulation_steps", "16",
            "--max_predictions_per_seq=20",
            "--use_mixed_precision",
            "--use_deterministic_compute",
            "--allreduce_in_fp16",
            "--lambda", "0",
            "--use_nccl",
            "--convergence_test_output_file", convergence_test_output_path,
            "--seed", "42",
            "--enable_grad_norm_clip=false",
        ], check=True)

        # verify output
        # NOTE(review): the 1e-3 passed to Comparisons.float_le is presumably
        # a tolerance on the loss columns - confirm in compare_results.
        comparison_result = compare_results_files(
            expected_results_path=os.path.join(
                SCRIPT_DIR, "results", reference_csv),
            actual_results_path=convergence_test_output_path,
            field_comparisons={
                "step": Comparisons.eq(),
                "total_loss": Comparisons.float_le(1e-3),
                "mlm_loss": Comparisons.float_le(1e-3),
                "nsp_loss": Comparisons.float_le(1e-3),
            })

    return 0 if comparison_result else 1
if __name__ == "__main__":
    # Use main()'s return value (0 or 1) as the process exit status.
    exit_status = main()
    sys.exit(exit_status)