From fb4efafc8e2e5cf4361b11538aa80bc12eed439f Mon Sep 17 00:00:00 2001 From: ytaous <4484531+ytaous@users.noreply.github.com> Date: Tue, 19 May 2020 10:21:40 -0700 Subject: [PATCH] GPT-2 training perf scripts (#3974) * gpt2 training perf * gpt2 training perf * debug * debug * debug * fix bug * minor * on comments * dynamic sql * fix build * minor * linked hash * on comments * minor * mem * minor Co-authored-by: Ethan Tao --- orttraining/orttraining/models/bert/main.cc | 23 ++-- orttraining/orttraining/models/gpt2/main.cc | 32 ++++- .../models/runner/training_runner.cc | 2 +- .../tools/ci_test/run_gpt2_perf_test.py | 60 ++++++++++ ...aining-linux-gpu-perf-test-ci-pipeline.yml | 13 ++ .../java/com/msft/send_perf_metrics/App.java | 113 +++++++++--------- 6 files changed, 173 insertions(+), 70 deletions(-) create mode 100644 orttraining/tools/ci_test/run_gpt2_perf_test.py diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc index 9d48b920c8..1e5a7b3e08 100644 --- a/orttraining/orttraining/models/bert/main.cc +++ b/orttraining/orttraining/models/bert/main.cc @@ -432,8 +432,16 @@ float GetLossValue(const Tensor& loss_tensor) { return loss; } -// mapping of max_sequence_length and max_predictions_per_sequence position derived from training data -std::map> input_to_dimension_mapping; +// use this table mapping to define what to be stored in mapped_dimensions, and ultimately in json structure +// Be mindful on the position, if it's invalid or out of bound, the property population process will be +// either incorrect or aborted. 
Also make sure to subtract 1 from the index position to get the valid corresponding value +// namely, in the graph, sequence is at position 1, but in initial tensor shape vector loaded from training data is at position 0, +// batch is not part of the initial tensor shape vector till later +// see GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details +const std::map> input_to_dimension_mapping = { + {"input1", {"SeqLen", 0}}, // int64[batch,sequence] "sequence" -> "SeqLen", 0 + {"masked_lm_ids", {"PredictionsPerSeq", 0}} // int64[batch,dynamic_prediction_count] +}; // generic properties for storing perf metrics MapStringToString mapped_dimensions; @@ -516,17 +524,6 @@ void setup_training_params(BertParameters& params) { {"masked_lm_weights", "masked_lm_weights"}, {"next_sentence_label", "next_sentence_labels"}}; - // use this table mapping to define what to be stored in mapped_dimensions, and ultimately in json structure - // Be mindful on the position, if it's invalid or out of bound, the property population process will be - // either incorrect or aborted. 
Also make sure to substract the index position by 1 to get valid correspondent value - // namely, in the graph, sequence is at position 1, but in initial tensor shape vector loaded from training data is at position 0, - // batch is not part of the initial tensor shape vector till later - // see GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details - input_to_dimension_mapping = { - {"input1", {"SeqLen", 0}}, // int64[batch,sequence] "sequence" -> "SeqLen", 0 - {"masked_lm_ids", {"PredictionsPerSeq", 0}} // int64[batch,dynamic_prediction_count] - }; - params.model_type = "bert"; params.skip_evaluation = params.is_perf_test; diff --git a/orttraining/orttraining/models/gpt2/main.cc b/orttraining/orttraining/models/gpt2/main.cc index 47e34e7c81..1e74f7cead 100644 --- a/orttraining/orttraining/models/gpt2/main.cc +++ b/orttraining/orttraining/models/gpt2/main.cc @@ -49,9 +49,12 @@ Status ParseArguments(int argc, char* argv[], GPT2Parameters& params, OrtParamet cxxopts::value()->default_value("data/1024/books_wiki_en_corpus/test")) ("output_dir", "The output directory where the trained model files will be written.", cxxopts::value()->default_value("")) + ("perf_output_dir", "The output directory where the trained perf metrics files will be written.", + cxxopts::value()->default_value("")) ("log_dir", "The directory to write tensorboard events.", cxxopts::value()->default_value("")) ("train_batch_size", "Total batch size for training.", cxxopts::value()) + ("eval_batch_size", "Total batch size for eval.", cxxopts::value()) ("learning_rate", "The initial learning rate for the optimizer.", cxxopts::value()->default_value("5e-5")) ("num_train_steps", "Total number of training steps to perform.", cxxopts::value()->default_value("100")) ("warmup_ratio", "Fraction of training steps for learning rate warmup.", cxxopts::value()->default_value("0")) @@ -119,7 +122,11 @@ Status ParseArguments(int argc, char* argv[], GPT2Parameters& params, 
OrtParamet params.num_train_steps = flags["num_train_steps"].as(); params.batch_size = flags["train_batch_size"].as(); - + if (flags.count("eval_batch_size")) { + params.eval_batch_size = flags["eval_batch_size"].as(); + } else { + params.eval_batch_size = params.batch_size; + } params.max_sequence_length = flags["max_seq_length"].as(); params.gradient_accumulation_steps = flags["gradient_accumulation_steps"].as(); @@ -136,6 +143,10 @@ Status ParseArguments(int argc, char* argv[], GPT2Parameters& params, OrtParamet if (params.output_dir.empty()) { printf("No output directory specified. Trained model files will not be saved.\n"); } + params.perf_output_dir = ToPathString(flags["perf_output_dir"].as()); + if (params.perf_output_dir.empty()) { + printf("No perf output directory specified. Trained perf metrics will not be saved.\n"); + } params.use_mixed_precision = flags["use_mixed_precision"].as(); params.allreduce_in_fp16 = flags["allreduce_in_fp16"].as() && params.use_mixed_precision; @@ -260,6 +271,15 @@ float GetLossValue(const Tensor& loss_tensor) { return loss; } +// mapping to define what to be stored in mapped_dimensions +// see GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details +const std::map> input_to_dimension_mapping = { + {"input_ids", {"SeqLen", 0}}, // int64[batch,seqlen] "seqlen" -> "SeqLen", 0 +}; + +// generic properties for storing perf metrics +MapStringToString mapped_dimensions; + void setup_training_params(GPT2Parameters& params) { params.model_path = ToPathString(params.model_name) + ORT_TSTR(".onnx"); params.model_with_loss_func_path = ToPathString(params.model_name) + ORT_TSTR("_with_cost.onnx"); @@ -316,6 +336,8 @@ void setup_training_params(GPT2Parameters& params) { {"attention_mask", "attention_mask"}, {"labels", "labels"}}; + params.model_type = "gpt2"; + #ifdef USE_CUDA OrtDevice::DeviceId device_id = static_cast(params.mpi_context.local_rank); params.providers.emplace(kCudaExecutionProvider, 
CreateExecutionProviderFactory_CUDA(device_id)); @@ -408,7 +430,13 @@ static Status RunTraining(const GPT2Parameters& params, const Environment& env) max_num_files_preload); } - ORT_RETURN_IF_ERROR(runner->Run(training_data_loader.get(), test_data_loader.get())); + if (!params.perf_output_dir.empty()) { + // collecting GPT2 related params from training data + auto training_data = training_data_loader->CurrentDataSet(); + ORT_RETURN_IF_ERROR(training_data->GetTensorDimensionsFromInputs(input_to_dimension_mapping, mapped_dimensions)); + } + + ORT_RETURN_IF_ERROR(runner->Run(training_data_loader.get(), test_data_loader.get(), mapped_dimensions)); // only test and save trained model on device #0 if (params.mpi_context.world_rank == 0) { diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc index 5eebbb87a5..0d0f5131ec 100644 --- a/orttraining/orttraining/models/runner/training_runner.cc +++ b/orttraining/orttraining/models/runner/training_runner.cc @@ -923,7 +923,7 @@ Status TrainingRunner::SavePerfMetrics(const size_t number_of_batches, const siz (seq_len.empty() ? "" : "_" + seq_len) + "_" + optimizer; perf_metrics["DisplayName"] = display_name; - perf_metrics["Memory"] = peak_workingset_size; + perf_metrics["Memory"] = peak_workingset_size >> 20; // mb perf_metrics["AvgCPU"] = average_cpu_usage; // diff --git a/orttraining/tools/ci_test/run_gpt2_perf_test.py b/orttraining/tools/ci_test/run_gpt2_perf_test.py new file mode 100644 index 0000000000..75de8a83a5 --- /dev/null +++ b/orttraining/tools/ci_test/run_gpt2_perf_test.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +import argparse +import subprocess +import sys +import os +from collections import namedtuple + +SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__)) + +def parse_args(): + parser = argparse.ArgumentParser(description="Runs GPT-2 performance tests.") + parser.add_argument("--binary_dir", required=True, + help="Path to the ORT binary directory.") + parser.add_argument("--training_data_root", required=True, + help="Path to the training data root directory.") + parser.add_argument("--model_root", required=True, + help="Path to the model root directory.") + return parser.parse_args() + +# TODO - review to finalize params +def main(): + args = parse_args() + + Config = namedtuple('Config', ['use_mixed_precision', 'max_seq_length', 'batch_size']) + configs = [ + Config(True, 1024, 1), + Config(False, 1024, 1) + ] + + # run GPT-2 training + for c in configs: + print("######## testing name - " + ('fp16-' if c.use_mixed_precision else 'fp32-') + str(c.max_seq_length) + " ##############") + cmds = [ + os.path.join(args.binary_dir, "onnxruntime_training_gpt2"), + "--model_name", os.path.join( + args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized"), + "--train_data_dir", os.path.join( + args.training_data_root, "train"), + "--test_data_dir", os.path.join( + args.training_data_root, "test"), + "--train_batch_size", str(c.batch_size), + "--mode", "train", + "--max_seq_length", str(c.max_seq_length), + "--num_train_steps", "200", + "--gradient_accumulation_steps", "1", + "--perf_output_dir", os.path.join(SCRIPT_DIR, "results"), + ] + + if c.use_mixed_precision: + cmds.append("--use_mixed_precision"), + + subprocess.run(cmds).check_returncode() + + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml 
b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml index 523bd0b363..644ea20a1b 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-linux-gpu-perf-test-ci-pipeline.yml @@ -36,6 +36,19 @@ jobs: --model_root /build/bert_models displayName: 'Run bert performance tests' + - script: > + docker run --gpus all --rm --name onnxruntime-gpu-perf + --volume $(Build.SourcesDirectory):/onnxruntime_src + --volume $(Build.BinariesDirectory):/build + --volume /bert_ort/gpt2_models:/build/gpt2_models:ro + --volume /bert_data/gpt2_data:/build/gpt2_data:ro + -e NIGHTLY_BUILD onnxruntime-ubuntu16.04-cuda10.1-cudnn7.6 + /usr/bin/python3.6 /onnxruntime_src/orttraining/tools/ci_test/run_gpt2_perf_test.py + --binary_dir /build/RelWithDebInfo + --training_data_root /build/gpt2_data + --model_root /build/gpt2_models + displayName: 'Run gpt-2 performance tests' + # generate jdbc.properties - script: > mkdir -p $(Build.SourcesDirectory)/tools/perf_util/src/main/resources && diff --git a/tools/perf_util/src/main/java/com/msft/send_perf_metrics/App.java b/tools/perf_util/src/main/java/com/msft/send_perf_metrics/App.java index c71d772727..665501bc56 100644 --- a/tools/perf_util/src/main/java/com/msft/send_perf_metrics/App.java +++ b/tools/perf_util/src/main/java/com/msft/send_perf_metrics/App.java @@ -1,6 +1,5 @@ package com.msft.send_perf_metrics; -import org.json.simple.JSONArray; import org.json.simple.JSONObject; import org.json.simple.parser.JSONParser; @@ -13,11 +12,9 @@ import java.nio.file.SimpleFileVisitor; import java.nio.file.attribute.BasicFileAttributes; import java.sql.Connection; -import java.sql.Types; +import java.sql.PreparedStatement; import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.List; +import java.util.*; public class App { @@ -94,63 +91,57 @@ public class App { } } + static 
private void loadMetricsIntoMySQL(java.sql.Connection conn, String commit_id, String batch_id, JSONObject json_object) throws Exception { - try (java.sql.PreparedStatement st = conn.prepareStatement( - "INSERT INTO perf_test_training_data (BatchId,CommitId,Model,ModelName,DisplayName,UseMixedPrecision,Optimizer,BatchSize,SeqLen,PredictionsPerSeq," + - "NumOfBatches,WeightUpdateSteps,Round,GradAccSteps,AvgTimePerBatch,Throughput,StabilizedThroughput,TotalTime,AvgCPU,Memory,RunConfig,Time) " + - "values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,Now())" - + " ON DUPLICATE KEY UPDATE AvgTimePerBatch=?,Throughput=?,StabilizedThroughput=?,TotalTime=?,AvgCPU=?,Memory=?")) { + // field name -> json value + Map field_mapping = new LinkedHashMap(); + Set update_on_duplicate_fields = + new LinkedHashSet<> (Arrays.asList("AvgTimePerBatch", "Throughput", "StabilizedThroughput", "TotalTime", "AvgCPU", "Memory")); - int i = 0; - - // unique key section - st.setString(++i, batch_id); - st.setString(++i, commit_id.substring(0, 8)); - st.setString(++i, (String) json_object.get("Model")); - st.setString(++i, (String) json_object.get("ModelName")); - st.setString(++i, (String) json_object.get("DisplayName")); - st.setBoolean(++i, (Boolean) json_object.get("UseMixedPrecision")); - st.setString(++i, (String) json_object.get("Optimizer")); - st.setInt(++i, (int)(long) json_object.get("BatchSize")); - - // non-key section - JSONObject properties = (JSONObject) json_object.get("DerivedProperties"); - if (properties != null) { - if (properties.get("SeqLen") == null) // mysql allows null value in unique key column - st.setNull(++i, Types.INTEGER); - else - st.setInt(++i, Integer.parseInt((String) properties.get("SeqLen"))); - - if (properties.get("PredictionsPerSeq") == null) // mysql allows null value in unique key column - st.setNull(++i, Types.INTEGER); - else - st.setInt(++i, Integer.parseInt((String) properties.get("PredictionsPerSeq"))); + field_mapping.put("BatchId", batch_id); + 
field_mapping.put("CommitId", commit_id.substring(0, 8)); + json_object.forEach((key, value) -> { + if (key.equals("DerivedProperties")) { + JSONObject properties = (JSONObject) json_object.get("DerivedProperties"); + properties.forEach((sub_key, sub_value) -> { + field_mapping.put((String)sub_key, sub_value); + }); } else { - st.setNull(++i, Types.INTEGER); - st.setNull(++i, Types.INTEGER); + field_mapping.put((String)key, value); + } + }); + + // building sql statement + StringBuilder sb = new StringBuilder("INSERT INTO perf_test_training_data ("); + field_mapping.forEach((key, value) -> { + sb.append(key).append(","); + }); + sb.append("Time) values ("); + for(int i = 0; i < field_mapping.size(); i++) { + sb.append("?,"); + } + sb.append("Now()) ON DUPLICATE KEY UPDATE "); + update_on_duplicate_fields.forEach((key) -> { + if(field_mapping.get(key) != null) { + sb.append(key).append("=?,"); + } + }); + + try (java.sql.PreparedStatement st = conn.prepareStatement(sb.substring(0, sb.length() - 1))) { + int i = 0; // param index + for (Map.Entry entry : field_mapping.entrySet()) { + setSqlParam(++i, st, entry.getValue()); } - st.setInt(++i, (int)(long) json_object.get("NumOfBatches")); - st.setInt(++i, (int)(long) json_object.get("WeightUpdateSteps")); - st.setInt(++i, (int)(long) json_object.get("Round")); - st.setInt(++i, (int)(long) json_object.get("GradAccSteps")); - st.setFloat(++i, (float)(double) json_object.get("AvgTimePerBatch")); // ms - st.setFloat(++i, (float)(double) json_object.get("Throughput")); // examples/sec - st.setFloat(++i, (float)(double) json_object.get("StabilizedThroughput")); // examples/sec - st.setFloat(++i, (float)(double) json_object.get("TotalTime")); // secs - st.setInt(++i, (int)(long) json_object.get("AvgCPU")); - st.setInt(++i, (int)((long) json_object.get("Memory") >> 20)); // mb - st.setString(++i, (String) json_object.get("RunConfig")); - // update section - st.setFloat(++i, (float)(double) json_object.get("AvgTimePerBatch")); 
// ms - st.setFloat(++i, (float)(double) json_object.get("Throughput")); // examples/sec - st.setFloat(++i, (float)(double) json_object.get("StabilizedThroughput")); // examples/sec - st.setFloat(++i, (float)(double) json_object.get("TotalTime")); // secs - st.setInt(++i, (int)((long) json_object.get("Memory") >> 20)); // mb - st.setString(++i, (String) json_object.get("RunConfig")); + for(String key : update_on_duplicate_fields) { + Object value = field_mapping.get(key); + if(value != null) { + setSqlParam(++i, st, value); + } + } st.executeUpdate(); } catch (Exception e) { @@ -160,4 +151,18 @@ public class App { } + static void setSqlParam(int param_index, PreparedStatement st, Object value) throws Exception { + if (value instanceof String) { + st.setString(param_index, (String) value); + } else if (value instanceof Long) { + st.setInt(param_index, (int) (long) value); + } else if (value instanceof Double) { + st.setFloat(param_index, (float) (double) value); + } else if (value instanceof Boolean) { + st.setBoolean(param_index, (Boolean) value); + } else { + throw new Exception("Unsupported data type:" + value.getClass().getName()); + } + } + }