Add nightly pipeline for MI100 to run convergence and batch size test similar to V100. (#6611)

* Partial updating of ROCM reduction code.

* Update reduction_all.cu

* Add reduce template parameters.

* miopen common

* Reuse CUDA's reduction_functions.cc

* Reduction ops.

* Update remaining reduction ops to use MIOpen. The double datatype is not supported, so those typed kernels are disabled.

* Disable a couple more unsupported tests.

* Code formatting.

* Delete ROCM-specific reduction code that is identical to CUDA reduction code.

* Fix scratch buffer early free.

* Fix merge conflict.

* first attempt nightly amd ci pipeline

* try fix bad yaml file

* try again with corrected model directory

* add convergence test as well

* update reference loss for amd mi100

* include mi100 test results csv

* update the mi100 convergence test reference values

* update batch sizes for mi100 32g

* fix gpu sku for run_convergence_test.py

* undo unrelated changes to master

* pr comments

* pr comment

Co-authored-by: Jesse Benson <jesseb@microsoft.com>
This commit is contained in:
Suffian Khan 2021-02-12 16:22:06 -05:00 committed by GitHub
parent f11b5d3072
commit e6de0eb813
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 132 additions and 5 deletions

View file

@ -33,6 +33,29 @@ using namespace onnxruntime::training;
using namespace onnxruntime::training::tensorboard;
using namespace std;
// File-level session options that RunPerformanceTest and RunTraining pass to
// the TrainingRunner they construct.
//
// The values are supplied positionally; the trailing comment on each line names
// the SessionOptions member the value is meant for.
// NOTE(review): this relies on the order below matching the member declaration
// order of SessionOptions, which is not visible here - confirm whenever
// SessionOptions gains, loses or reorders a member.
//
// use_deterministic_compute starts out as false and is overwritten in
// ParseArguments from the --use_deterministic_compute command-line flag.
static SessionOptions session_options = {
ExecutionMode::ORT_SEQUENTIAL, //execution_mode
ExecutionOrder::PRIORITY_BASED, //execution_order
false, //enable_profiling
ORT_TSTR(""), //optimized_model_filepath
true, //enable_mem_pattern
true, //enable_cpu_mem_arena
ORT_TSTR("onnxruntime_profile_"), //profile_file_prefix
"", //session_logid
-1, //session_log_severity_level
0, //session_log_verbosity_level
5, //max_num_graph_transformation_steps
TransformerLevel::Level1, //graph_optimization_level
{}, //intra_op_param
{}, //inter_op_param
{}, //free_dimension_overrides
true, //use_per_session_threads
true, //thread_pool_allow_spinning
false, //use_deterministic_compute
{}, //session_configurations
{}, // initializers_to_share_map
};
struct BertParameters : public TrainingRunner::Parameters {
int max_sequence_length = 512;
int max_predictions_per_sequence = 80;
@ -109,6 +132,7 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
("iterations_per_loop", "How many steps to make in each estimator call.", cxxopts::value<int>()->default_value("1000"))
("max_eval_steps", "Maximum number of eval steps.", cxxopts::value<int>()->default_value("100"))
("seed", "Random seed.", cxxopts::value<int64_t>()->default_value("-1"))
("use_deterministic_compute", "Whether to enable deterministic compute.", cxxopts::value<bool>()->default_value("false"))
("use_mixed_precision", "Whether to use a mix of fp32 and fp16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
("use_bfloat16", "Whether to use BFloat16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
("enable_adasum", "Whether to use Adasum for allreduction.", cxxopts::value<bool>()->default_value("false"))
@ -469,6 +493,8 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
std::cout << "Random seed is set to: " << seed << std::endl;
}
session_options.use_deterministic_compute = flags["use_deterministic_compute"].as<bool>();
params.enable_gelu_approximation = flags["enable_gelu_approximation"].as<bool>();
params.attn_dropout_recompute = flags["attn_dropout_recompute"].as<bool>();
params.gelu_recompute = flags["gelu_recompute"].as<bool>();
@ -746,7 +772,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
auto random_perf_data = std::make_shared<RandomDataSet>(num_of_perf_samples, tensor_names, tensor_shapes, tensor_types);
auto random_perf_data_loader = onnxruntime::make_unique<SingleDataLoader>(random_perf_data, tensor_names);
TrainingRunner runner{params, env};
TrainingRunner runner{params, env, session_options};
ORT_RETURN_IF_ERROR(runner.Initialize());
ORT_RETURN_IF_ERROR(runner.Run(random_perf_data_loader.get(), random_perf_data_loader.get()));
@ -756,7 +782,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
static Status RunTraining(const BertParameters& params, const Environment& env) {
const size_t max_num_files_preload = 2;
auto runner = onnxruntime::make_unique<TrainingRunner>(params, env);
auto runner = onnxruntime::make_unique<TrainingRunner>(params, env, session_options);
ORT_RETURN_IF_ERROR(runner->Initialize());
BertParameters params_for_phase;

View file

@ -0,0 +1,11 @@
step,total_loss,mlm_loss,nsp_loss
0,11.217,10.5178,0.699256
5,9.67644,7.52047,2.15598
10,8.31964,7.54136,0.778281
15,8.22823,7.54625,0.681978
20,8.17299,7.49675,0.676236
25,8.2415,7.5356,0.705902
30,8.0874,7.39312,0.694279
35,7.99095,7.25612,0.734829
40,7.92988,7.25608,0.673804
45,7.94762,7.27291,0.674713
1 step total_loss mlm_loss nsp_loss
2 0 11.217 10.5178 0.699256
3 5 9.67644 7.52047 2.15598
4 10 8.31964 7.54136 0.778281
5 15 8.22823 7.54625 0.681978
6 20 8.17299 7.49675 0.676236
7 25 8.2415 7.5356 0.705902
8 30 8.0874 7.39312 0.694279
9 35 7.99095 7.25612 0.734829
10 40 7.92988 7.25608 0.673804
11 45 7.94762 7.27291 0.674713

View file

@ -13,6 +13,8 @@ def parse_args():
parser = argparse.ArgumentParser(description="Runs a BERT batch size test.")
parser.add_argument("--binary_dir", required=True, help="Path to the ORT binary directory.")
parser.add_argument("--model_root", required=True, help="Path to the model root directory.")
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
help="GPU model (e.g. V100_16G, MI100_32G).")
return parser.parse_args()
@ -24,7 +26,9 @@ def main():
"max_batch_size",
"max_predictions_per_seq",
"additional_options"])
configs = [
configs = {}
configs['V100_16G'] = [
Config(True, 128, 76, 20, ""),
Config(True, 512, 11, 80, ""),
Config(False, 128, 39, 20, ""),
@ -41,8 +45,15 @@ def main():
Config(True, 512, 50, 80, "--transformer_layer_recompute"),
]
configs['MI100_32G'] = [
Config(True, 128, 201, 20, ""),
Config(True, 512, 31, 80, ""),
Config(False, 128, 109, 20, ""),
Config(False, 512, 16, 80, ""),
]
# run BERT training
for config in configs:
for config in configs[args.gpu_sku]:
print("##### testing name - {}-{} #####".format("fp16" if config.enable_mixed_precision else "fp32",
config.sequence_length))
cmds = [

View file

@ -20,6 +20,8 @@ def parse_args():
help="Path to the training data root directory.")
parser.add_argument("--model_root", required=True,
help="Path to the model root directory.")
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
help="GPU model (e.g. V100_16G, MI100_32G).")
return parser.parse_args()
def main():
@ -49,6 +51,7 @@ def main():
"--gradient_accumulation_steps", "16",
"--max_predictions_per_seq=20",
"--use_mixed_precision",
"--use_deterministic_compute",
"--allreduce_in_fp16",
"--lambda", "0",
"--use_nccl",
@ -57,10 +60,18 @@ def main():
"--enable_grad_norm_clip=false",
]).check_returncode()
# reference data
if args.gpu_sku == 'MI100_32G':
reference_csv = "bert_base.convergence.baseline.mi100.csv"
elif args.gpu_sku == 'V100_16G':
reference_csv = "bert_base.convergence.baseline.csv"
else:
raise ValueError('Unrecognized gpu_sku {}'.format(args.gpu_sku))
# verify output
comparison_result = compare_results_files(
expected_results_path=os.path.join(
SCRIPT_DIR, "results", "bert_base.convergence.baseline.csv"),
SCRIPT_DIR, "results", reference_csv),
actual_results_path=convergence_test_output_path,
field_comparisons={
"step": Comparisons.eq(),

View file

@ -0,0 +1,68 @@
# AMD GPU training pipeline: builds onnxruntime with ROCm and training enabled,
# then runs the unit tests, the BERT batch size test and the BERT convergence
# test with the MI100_32G GPU SKU settings.
# No CI trigger is configured in this file (trigger: none).
# NOTE(review): presumably the nightly schedule is configured outside this file - confirm.
trigger: none
name: 'orttraining_amd_nightly_$(Date:yyyyMMdd)_$(Rev:r)'
pool: 'AMD-GPU'
jobs:
- job: Onnxruntime_Linux_GPU_AMD_Training_E2E_Test
timeoutInMinutes: 60
steps:
- checkout: self
clean: true
submodules: recursive
# Expose the agent's conda and OpenMPI installs (PATH / LD_LIBRARY_PATH) through
# pipeline logging commands, and log which GPU HIP_VISIBLE_DEVICES selects.
- script: |-
echo "##vso[task.prependpath]/home/ciagent/conda/bin/"
echo "##vso[task.prependpath]/home/ciagent/pkg/openmpi-4.0.5/bin/"
echo '##vso[task.setvariable variable=LD_LIBRARY_PATH]/home/ciagent/pkg/openmpi-4.0.5/lib/'
eval "$('/home/ciagent/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
displayName: 'Initialize environment'
# update these if the E2E test data changes
# (the blob snapshot in the URL and the SHA-256 digest below are pinned together)
- script: |-
python orttraining/tools/ci_test/download_azure_blob_archive.py \
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
--target_dir training_e2e_test_data \
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
displayName: 'Download onnxruntime_training_data.zip data'
# Build only: --skip_tests is passed here; the tests run in the steps that follow.
- script: |-
python tools/ci_build/build.py \
--config RelWithDebInfo \
--enable_training \
--mpi_home /home/ciagent/pkg/openmpi-4.0.5 \
--use_rocm \
--rocm_home /opt/rocm \
--nccl_home /opt/rocm \
--update \
--build_dir ./build \
--build \
--parallel 8 \
--build_wheel \
--skip_tests
displayName: 'Build onnxruntime'
# Unit tests are launched from the build output directory via the PAI test launcher script.
- script: |-
cd ./build/RelWithDebInfo &&\
../../tools/ci_build/github/pai/pai_test_launcher.sh
displayName: 'Run unit tests'
# Batch size test against the downloaded models, using the MI100_32G configuration.
- script: |-
python orttraining/tools/ci_test/run_batch_size_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--gpu_sku MI100_32G
displayName: 'Run batch size test'
condition: succeededOrFailed() # ensure all tests are run
# Convergence test against the downloaded models and data, using the MI100_32G reference values.
- script: |-
python orttraining/tools/ci_test/run_convergence_test.py \
--binary_dir build/RelWithDebInfo \
--model_root training_e2e_test_data/models \
--training_data_root training_e2e_test_data/data \
--gpu_sku MI100_32G
displayName: 'Run convergence test'
condition: succeededOrFailed() # ensure all tests are run