mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-02 03:55:34 +00:00
Add nightly pipeline for MI100 to run convergence and batch size tests, similar to V100. (#6611)
* Partial updating of ROCM reduction code.
* Update reduction_all.cu
* Add reduce template parameters.
* miopen common
* Reuse CUDA's reduction_functions.cc
* Reduction ops.
* Update remaining reduction ops to use MIOpen. double datatype is not supported, so disable those typed kernels.
* Disable a couple more unsupported tests.
* Code formatting.
* Delete ROCM-specific reduction code that is identical to CUDA reduction code.
* Fix scratch buffer early free.
* Fix merge conflict.
* first attempt nightly amd ci pipeline
* try fix bad yaml file
* try again with corrected model directory
* add convergence test as well
* update reference loss for amd mi100
* include mi100 test results csv
* update the mi100 convergence test reference values
* update batch sizes for mi100 32g
* fix gpu sku for run_convergence_test.py
* undo unrelated changes to master
* pr comments
* pr comment

Co-authored-by: Jesse Benson <jesseb@microsoft.com>
This commit is contained in:
parent
f11b5d3072
commit
e6de0eb813
5 changed files with 132 additions and 5 deletions
|
|
@ -33,6 +33,29 @@ using namespace onnxruntime::training;
|
|||
using namespace onnxruntime::training::tensorboard;
|
||||
using namespace std;
|
||||
|
||||
static SessionOptions session_options = {
|
||||
ExecutionMode::ORT_SEQUENTIAL, //execution_mode
|
||||
ExecutionOrder::PRIORITY_BASED, //execution_order
|
||||
false, //enable_profiling
|
||||
ORT_TSTR(""), //optimized_model_filepath
|
||||
true, //enable_mem_pattern
|
||||
true, //enable_cpu_mem_arena
|
||||
ORT_TSTR("onnxruntime_profile_"), //profile_file_prefix
|
||||
"", //session_logid
|
||||
-1, //session_log_severity_level
|
||||
0, //session_log_verbosity_level
|
||||
5, //max_num_graph_transformation_steps
|
||||
TransformerLevel::Level1, //graph_optimization_level
|
||||
{}, //intra_op_param
|
||||
{}, //inter_op_param
|
||||
{}, //free_dimension_overrides
|
||||
true, //use_per_session_threads
|
||||
true, //thread_pool_allow_spinning
|
||||
false, //use_deterministic_compute
|
||||
{}, //session_configurations
|
||||
{}, // initializers_to_share_map
|
||||
};
|
||||
|
||||
struct BertParameters : public TrainingRunner::Parameters {
|
||||
int max_sequence_length = 512;
|
||||
int max_predictions_per_sequence = 80;
|
||||
|
|
@ -109,6 +132,7 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
|
|||
("iterations_per_loop", "How many steps to make in each estimator call.", cxxopts::value<int>()->default_value("1000"))
|
||||
("max_eval_steps", "Maximum number of eval steps.", cxxopts::value<int>()->default_value("100"))
|
||||
("seed", "Random seed.", cxxopts::value<int64_t>()->default_value("-1"))
|
||||
("use_deterministic_compute", "Whether to enable deterministic compute.", cxxopts::value<bool>()->default_value("false"))
|
||||
("use_mixed_precision", "Whether to use a mix of fp32 and fp16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
|
||||
("use_bfloat16", "Whether to use BFloat16 arithmetic on GPU.", cxxopts::value<bool>()->default_value("false"))
|
||||
("enable_adasum", "Whether to use Adasum for allreduction.", cxxopts::value<bool>()->default_value("false"))
|
||||
|
|
@ -469,6 +493,8 @@ Status ParseArguments(int argc, char* argv[], BertParameters& params, OrtParamet
|
|||
std::cout << "Random seed is set to: " << seed << std::endl;
|
||||
}
|
||||
|
||||
session_options.use_deterministic_compute = flags["use_deterministic_compute"].as<bool>();
|
||||
|
||||
params.enable_gelu_approximation = flags["enable_gelu_approximation"].as<bool>();
|
||||
params.attn_dropout_recompute = flags["attn_dropout_recompute"].as<bool>();
|
||||
params.gelu_recompute = flags["gelu_recompute"].as<bool>();
|
||||
|
|
@ -746,7 +772,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
|
|||
auto random_perf_data = std::make_shared<RandomDataSet>(num_of_perf_samples, tensor_names, tensor_shapes, tensor_types);
|
||||
auto random_perf_data_loader = onnxruntime::make_unique<SingleDataLoader>(random_perf_data, tensor_names);
|
||||
|
||||
TrainingRunner runner{params, env};
|
||||
TrainingRunner runner{params, env, session_options};
|
||||
ORT_RETURN_IF_ERROR(runner.Initialize());
|
||||
ORT_RETURN_IF_ERROR(runner.Run(random_perf_data_loader.get(), random_perf_data_loader.get()));
|
||||
|
||||
|
|
@ -756,7 +782,7 @@ static Status RunPerformanceTest(const BertParameters& params, const Environment
|
|||
static Status RunTraining(const BertParameters& params, const Environment& env) {
|
||||
const size_t max_num_files_preload = 2;
|
||||
|
||||
auto runner = onnxruntime::make_unique<TrainingRunner>(params, env);
|
||||
auto runner = onnxruntime::make_unique<TrainingRunner>(params, env, session_options);
|
||||
ORT_RETURN_IF_ERROR(runner->Initialize());
|
||||
|
||||
BertParameters params_for_phase;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,11 @@
|
|||
step,total_loss,mlm_loss,nsp_loss
|
||||
0,11.217,10.5178,0.699256
|
||||
5,9.67644,7.52047,2.15598
|
||||
10,8.31964,7.54136,0.778281
|
||||
15,8.22823,7.54625,0.681978
|
||||
20,8.17299,7.49675,0.676236
|
||||
25,8.2415,7.5356,0.705902
|
||||
30,8.0874,7.39312,0.694279
|
||||
35,7.99095,7.25612,0.734829
|
||||
40,7.92988,7.25608,0.673804
|
||||
45,7.94762,7.27291,0.674713
|
||||
|
|
|
@ -13,6 +13,8 @@ def parse_args():
|
|||
parser = argparse.ArgumentParser(description="Runs a BERT batch size test.")
|
||||
parser.add_argument("--binary_dir", required=True, help="Path to the ORT binary directory.")
|
||||
parser.add_argument("--model_root", required=True, help="Path to the model root directory.")
|
||||
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
|
||||
help="GPU model (e.g. V100_16G, MI100_32G).")
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
|
|
@ -24,7 +26,9 @@ def main():
|
|||
"max_batch_size",
|
||||
"max_predictions_per_seq",
|
||||
"additional_options"])
|
||||
configs = [
|
||||
|
||||
configs = {}
|
||||
configs['V100_16G'] = [
|
||||
Config(True, 128, 76, 20, ""),
|
||||
Config(True, 512, 11, 80, ""),
|
||||
Config(False, 128, 39, 20, ""),
|
||||
|
|
@ -41,8 +45,15 @@ def main():
|
|||
Config(True, 512, 50, 80, "--transformer_layer_recompute"),
|
||||
]
|
||||
|
||||
configs['MI100_32G'] = [
|
||||
Config(True, 128, 201, 20, ""),
|
||||
Config(True, 512, 31, 80, ""),
|
||||
Config(False, 128, 109, 20, ""),
|
||||
Config(False, 512, 16, 80, ""),
|
||||
]
|
||||
|
||||
# run BERT training
|
||||
for config in configs:
|
||||
for config in configs[args.gpu_sku]:
|
||||
print("##### testing name - {}-{} #####".format("fp16" if config.enable_mixed_precision else "fp32",
|
||||
config.sequence_length))
|
||||
cmds = [
|
||||
|
|
|
|||
|
|
@ -20,6 +20,8 @@ def parse_args():
|
|||
help="Path to the training data root directory.")
|
||||
parser.add_argument("--model_root", required=True,
|
||||
help="Path to the model root directory.")
|
||||
parser.add_argument("--gpu_sku", choices=['V100_16G', 'MI100_32G'], default='V100_16G', required=False,
|
||||
help="GPU model (e.g. V100_16G, MI100_32G).")
|
||||
return parser.parse_args()
|
||||
|
||||
def main():
|
||||
|
|
@ -49,6 +51,7 @@ def main():
|
|||
"--gradient_accumulation_steps", "16",
|
||||
"--max_predictions_per_seq=20",
|
||||
"--use_mixed_precision",
|
||||
"--use_deterministic_compute",
|
||||
"--allreduce_in_fp16",
|
||||
"--lambda", "0",
|
||||
"--use_nccl",
|
||||
|
|
@ -57,10 +60,18 @@ def main():
|
|||
"--enable_grad_norm_clip=false",
|
||||
]).check_returncode()
|
||||
|
||||
# select the reference results CSV that matches the requested GPU SKU
|
||||
if args.gpu_sku == 'MI100_32G':
|
||||
reference_csv = "bert_base.convergence.baseline.mi100.csv"
|
||||
elif args.gpu_sku == 'V100_16G':
|
||||
reference_csv = "bert_base.convergence.baseline.csv"
|
||||
else:
|
||||
raise ValueError('Unrecognized gpu_sku {}'.format(args.gpu_sku))
|
||||
|
||||
# verify output
|
||||
comparison_result = compare_results_files(
|
||||
expected_results_path=os.path.join(
|
||||
SCRIPT_DIR, "results", "bert_base.convergence.baseline.csv"),
|
||||
SCRIPT_DIR, "results", reference_csv),
|
||||
actual_results_path=convergence_test_output_path,
|
||||
field_comparisons={
|
||||
"step": Comparisons.eq(),
|
||||
|
|
|
|||
|
|
@ -0,0 +1,68 @@
|
|||
trigger: none
|
||||
|
||||
name: 'orttraining_amd_nightly_$(Date:yyyyMMdd)_$(Rev:r)'
|
||||
pool: 'AMD-GPU'
|
||||
|
||||
jobs:
|
||||
- job: Onnxruntime_Linux_GPU_AMD_Training_E2E_Test
|
||||
|
||||
timeoutInMinutes: 60
|
||||
|
||||
steps:
|
||||
- checkout: self
|
||||
clean: true
|
||||
submodules: recursive
|
||||
|
||||
- script: |-
|
||||
echo "##vso[task.prependpath]/home/ciagent/conda/bin/"
|
||||
echo "##vso[task.prependpath]/home/ciagent/pkg/openmpi-4.0.5/bin/"
|
||||
echo '##vso[task.setvariable variable=LD_LIBRARY_PATH]/home/ciagent/pkg/openmpi-4.0.5/lib/'
|
||||
eval "$('/home/ciagent/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
|
||||
echo "Selecting GPU based on HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES"
|
||||
displayName: 'Initialize environment'
|
||||
|
||||
# update the blob snapshot URL and the archive SHA-256 digest below if the E2E test data changes
|
||||
- script: |-
|
||||
python orttraining/tools/ci_test/download_azure_blob_archive.py \
|
||||
--azure_blob_url https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z \
|
||||
--target_dir training_e2e_test_data \
|
||||
--archive_sha256_digest B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9
|
||||
displayName: 'Download onnxruntime_training_data.zip data'
|
||||
|
||||
- script: |-
|
||||
python tools/ci_build/build.py \
|
||||
--config RelWithDebInfo \
|
||||
--enable_training \
|
||||
--mpi_home /home/ciagent/pkg/openmpi-4.0.5 \
|
||||
--use_rocm \
|
||||
--rocm_home /opt/rocm \
|
||||
--nccl_home /opt/rocm \
|
||||
--update \
|
||||
--build_dir ./build \
|
||||
--build \
|
||||
--parallel 8 \
|
||||
--build_wheel \
|
||||
--skip_tests
|
||||
displayName: 'Build onnxruntime'
|
||||
|
||||
- script: |-
|
||||
cd ./build/RelWithDebInfo &&\
|
||||
../../tools/ci_build/github/pai/pai_test_launcher.sh
|
||||
displayName: 'Run unit tests'
|
||||
|
||||
- script: |-
|
||||
python orttraining/tools/ci_test/run_batch_size_test.py \
|
||||
--binary_dir build/RelWithDebInfo \
|
||||
--model_root training_e2e_test_data/models \
|
||||
--gpu_sku MI100_32G
|
||||
displayName: 'Run batch size test'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
|
||||
- script: |-
|
||||
python orttraining/tools/ci_test/run_convergence_test.py \
|
||||
--binary_dir build/RelWithDebInfo \
|
||||
--model_root training_e2e_test_data/models \
|
||||
--training_data_root training_e2e_test_data/data \
|
||||
--gpu_sku MI100_32G
|
||||
displayName: 'Run convergence test'
|
||||
condition: succeededOrFailed() # ensure all tests are run
|
||||
Loading…
Reference in a new issue