GPT-2 training perf scripts (#3974)

* gpt2 training perf

* gpt2 training perf

* debug

* debug

* debug

* fix bug

* minor

* on comments

* dynamic sql

* fix build

* minor

* linked hash

* on comments

* minor

* mem

* minor

Co-authored-by: Ethan Tao <ettao@microsoft.com>
This commit is contained in:
ytaous 2020-05-19 10:21:40 -07:00 committed by GitHub
parent 36bcb28238
commit fb4efafc8e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 173 additions and 70 deletions

View file

@ -432,8 +432,16 @@ float GetLossValue(const Tensor& loss_tensor) {
return loss;
}
// mapping of max_sequence_length and max_predictions_per_sequence position derived from training data
std::map<std::string, std::pair<std::string, size_t>> input_to_dimension_mapping;
// Table that defines which input dimensions are stored in mapped_dimensions, and ultimately in the json structure.
// Each entry maps an input name to {property name, position in the input's tensor shape vector}.
// Be mindful of the position: if it is invalid or out of bounds, the property population process will be
// either incorrect or aborted. Also make sure to subtract 1 from the position used in the graph to get the
// corresponding position here: in the graph, sequence is at position 1, but in the initial tensor shape vector
// loaded from training data it is at position 0, because batch is not part of that vector until later.
// See GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details.
const std::map<std::string, std::pair<std::string, size_t>> input_to_dimension_mapping = {
{"input1", {"SeqLen", 0}}, // int64[batch,sequence] "sequence" -> "SeqLen", 0
{"masked_lm_ids", {"PredictionsPerSeq", 0}} // int64[batch,dynamic_prediction_count]
};
// generic properties for storing perf metrics; holds the dimension values selected through
// input_to_dimension_mapping
MapStringToString mapped_dimensions;
@ -516,17 +524,6 @@ void setup_training_params(BertParameters& params) {
{"masked_lm_weights", "masked_lm_weights"},
{"next_sentence_label", "next_sentence_labels"}};
// use this table mapping to define what to be stored in mapped_dimensions, and ultimately in json structure
// Be mindful on the position, if it's invalid or out of bound, the property population process will be
// either incorrect or aborted. Also make sure to subtract the index position by 1 to get valid correspondent value
// namely, in the graph, sequence is at position 1, but in initial tensor shape vector loaded from training data is at position 0,
// batch is not part of the initial tensor shape vector till later
// see GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details
input_to_dimension_mapping = {
{"input1", {"SeqLen", 0}}, // int64[batch,sequence] "sequence" -> "SeqLen", 0
{"masked_lm_ids", {"PredictionsPerSeq", 0}} // int64[batch,dynamic_prediction_count]
};
params.model_type = "bert";
params.skip_evaluation = params.is_perf_test;

View file

@ -49,9 +49,12 @@ Status ParseArguments(int argc, char* argv[], GPT2Parameters& params, OrtParamet
cxxopts::value<std::string>()->default_value("data/1024/books_wiki_en_corpus/test"))
("output_dir", "The output directory where the trained model files will be written.",
cxxopts::value<std::string>()->default_value(""))
("perf_output_dir", "The output directory where the trained perf metrics files will be written.",
cxxopts::value<std::string>()->default_value(""))
("log_dir", "The directory to write tensorboard events.",
cxxopts::value<std::string>()->default_value(""))
("train_batch_size", "Total batch size for training.", cxxopts::value<int>())
("eval_batch_size", "Total batch size for eval.", cxxopts::value<int>())
("learning_rate", "The initial learning rate for the optimizer.", cxxopts::value<float>()->default_value("5e-5"))
("num_train_steps", "Total number of training steps to perform.", cxxopts::value<int>()->default_value("100"))
("warmup_ratio", "Fraction of training steps for learning rate warmup.", cxxopts::value<float>()->default_value("0"))
@ -119,7 +122,11 @@ Status ParseArguments(int argc, char* argv[], GPT2Parameters& params, OrtParamet
params.num_train_steps = flags["num_train_steps"].as<int>();
params.batch_size = flags["train_batch_size"].as<int>();
if (flags.count("eval_batch_size")) {
params.eval_batch_size = flags["eval_batch_size"].as<int>();
} else {
params.eval_batch_size = params.batch_size;
}
params.max_sequence_length = flags["max_seq_length"].as<int>();
params.gradient_accumulation_steps = flags["gradient_accumulation_steps"].as<int>();
@ -136,6 +143,10 @@ Status ParseArguments(int argc, char* argv[], GPT2Parameters& params, OrtParamet
if (params.output_dir.empty()) {
printf("No output directory specified. Trained model files will not be saved.\n");
}
params.perf_output_dir = ToPathString(flags["perf_output_dir"].as<std::string>());
if (params.perf_output_dir.empty()) {
printf("No perf output directory specified. Trained perf metrics will not be saved.\n");
}
params.use_mixed_precision = flags["use_mixed_precision"].as<bool>();
params.allreduce_in_fp16 = flags["allreduce_in_fp16"].as<bool>() && params.use_mixed_precision;
@ -260,6 +271,15 @@ float GetLossValue(const Tensor& loss_tensor) {
return loss;
}
// Mapping that defines what is stored in mapped_dimensions:
// input name -> {property name, position in the input's tensor shape vector}.
// See GetTensorDimensionsFromInputs() in training_util.h and training_runner.cc for more details.
const std::map<std::string, std::pair<std::string, size_t>> input_to_dimension_mapping = {
{"input_ids", {"SeqLen", 0}}, // int64[batch,seqlen] "seqlen" -> "SeqLen", 0
};
// generic properties for storing perf metrics; filled by GetTensorDimensionsFromInputs() in RunTraining()
// when a perf output directory is set, then handed to the training runner
MapStringToString mapped_dimensions;
void setup_training_params(GPT2Parameters& params) {
params.model_path = ToPathString(params.model_name) + ORT_TSTR(".onnx");
params.model_with_loss_func_path = ToPathString(params.model_name) + ORT_TSTR("_with_cost.onnx");
@ -316,6 +336,8 @@ void setup_training_params(GPT2Parameters& params) {
{"attention_mask", "attention_mask"},
{"labels", "labels"}};
params.model_type = "gpt2";
#ifdef USE_CUDA
OrtDevice::DeviceId device_id = static_cast<OrtDevice::DeviceId>(params.mpi_context.local_rank);
params.providers.emplace(kCudaExecutionProvider, CreateExecutionProviderFactory_CUDA(device_id));
@ -408,7 +430,13 @@ static Status RunTraining(const GPT2Parameters& params, const Environment& env)
max_num_files_preload);
}
ORT_RETURN_IF_ERROR(runner->Run(training_data_loader.get(), test_data_loader.get()));
if (!params.perf_output_dir.empty()) {
// collecting GPT2 related params from training data
auto training_data = training_data_loader->CurrentDataSet();
ORT_RETURN_IF_ERROR(training_data->GetTensorDimensionsFromInputs(input_to_dimension_mapping, mapped_dimensions));
}
ORT_RETURN_IF_ERROR(runner->Run(training_data_loader.get(), test_data_loader.get(), mapped_dimensions));
// only test and save trained model on device #0
if (params.mpi_context.world_rank == 0) {

View file

@ -923,7 +923,7 @@ Status TrainingRunner::SavePerfMetrics(const size_t number_of_batches, const siz
(seq_len.empty() ? "" : "_" + seq_len) + "_" + optimizer;
perf_metrics["DisplayName"] = display_name;
perf_metrics["Memory"] = peak_workingset_size;
perf_metrics["Memory"] = peak_workingset_size >> 20; // mb
perf_metrics["AvgCPU"] = average_cpu_usage;
//

View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import subprocess
import sys
import os
from collections import namedtuple
# Resolved directory of this script; its "results" subdirectory is passed as --perf_output_dir.
SCRIPT_DIR = os.path.realpath(os.path.dirname(__file__))
def parse_args():
    """Parse the command-line options for the GPT-2 performance test run.

    Returns:
        argparse.Namespace with ``binary_dir``, ``training_data_root`` and
        ``model_root``; all three options are required.
    """
    required_paths = (
        ("--binary_dir", "Path to the ORT binary directory."),
        ("--training_data_root", "Path to the training data root directory."),
        ("--model_root", "Path to the model root directory."),
    )
    arg_parser = argparse.ArgumentParser(description="Runs GPT-2 performance tests.")
    for flag, help_text in required_paths:
        arg_parser.add_argument(flag, required=True, help=help_text)
    return arg_parser.parse_args()
# TODO - review to finalize params
def main():
    """Run the GPT-2 training binary once per perf configuration.

    For every configuration (mixed precision on/off, sequence length, batch
    size) the ``onnxruntime_training_gpt2`` binary from ``--binary_dir`` is
    launched with the model from ``--model_root`` and the train/test data from
    ``--training_data_root``; perf metrics are directed to the ``results``
    directory next to this script.

    Returns:
        0 once every run has finished successfully.

    Raises:
        subprocess.CalledProcessError: if a training run exits with a
            non-zero status.
    """
    args = parse_args()

    Config = namedtuple('Config', ['use_mixed_precision', 'max_seq_length', 'batch_size'])
    configs = [
        Config(True, 1024, 1),
        Config(False, 1024, 1)
    ]

    # run GPT-2 training
    for c in configs:
        # flush so the banner is emitted before the child process writes its own output
        print("######## testing name - " + ('fp16-' if c.use_mixed_precision else 'fp32-') + str(c.max_seq_length) + " ##############",
              flush=True)
        cmds = [
            os.path.join(args.binary_dir, "onnxruntime_training_gpt2"),
            "--model_name", os.path.join(
                args.model_root, "megatron-gpt2_hidden-size-1024_num-layers-24_vocab-size-50257_num-attention-heads-16_max-position-embeddings-1024_optimized"),
            "--train_data_dir", os.path.join(
                args.training_data_root, "train"),
            "--test_data_dir", os.path.join(
                args.training_data_root, "test"),
            "--train_batch_size", str(c.batch_size),
            "--mode", "train",
            "--max_seq_length", str(c.max_seq_length),
            "--num_train_steps", "200",
            "--gradient_accumulation_steps", "1",
            "--perf_output_dir", os.path.join(SCRIPT_DIR, "results"),
        ]

        if c.use_mixed_precision:
            cmds.append("--use_mixed_precision")

        # check=True raises CalledProcessError on a non-zero exit, stopping the remaining runs
        subprocess.run(cmds, check=True)

    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())

View file

@ -36,6 +36,19 @@ jobs:
--model_root /build/bert_models
displayName: 'Run bert performance tests'
# run the GPT-2 performance test script inside the GPU docker image;
# the gpt2 model and data directories are mounted read-only under /build
- script: >
docker run --gpus all --rm --name onnxruntime-gpu-perf
--volume $(Build.SourcesDirectory):/onnxruntime_src
--volume $(Build.BinariesDirectory):/build
--volume /bert_ort/gpt2_models:/build/gpt2_models:ro
--volume /bert_data/gpt2_data:/build/gpt2_data:ro
-e NIGHTLY_BUILD onnxruntime-ubuntu16.04-cuda10.1-cudnn7.6
/usr/bin/python3.6 /onnxruntime_src/orttraining/tools/ci_test/run_gpt2_perf_test.py
--binary_dir /build/RelWithDebInfo
--training_data_root /build/gpt2_data
--model_root /build/gpt2_models
displayName: 'Run gpt-2 performance tests'
# generate jdbc.properties
- script: >
mkdir -p $(Build.SourcesDirectory)/tools/perf_util/src/main/resources &&

View file

@ -1,6 +1,5 @@
package com.msft.send_perf_metrics;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
@ -13,11 +12,9 @@ import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.sql.Connection;
import java.sql.Types;
import java.sql.PreparedStatement;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.*;
public class App {
@ -94,63 +91,57 @@ public class App {
}
}
static private void loadMetricsIntoMySQL(java.sql.Connection conn, String commit_id, String batch_id,
JSONObject json_object) throws Exception {
try (java.sql.PreparedStatement st = conn.prepareStatement(
"INSERT INTO perf_test_training_data (BatchId,CommitId,Model,ModelName,DisplayName,UseMixedPrecision,Optimizer,BatchSize,SeqLen,PredictionsPerSeq," +
"NumOfBatches,WeightUpdateSteps,Round,GradAccSteps,AvgTimePerBatch,Throughput,StabilizedThroughput,TotalTime,AvgCPU,Memory,RunConfig,Time) " +
"values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,Now())"
+ " ON DUPLICATE KEY UPDATE AvgTimePerBatch=?,Throughput=?,StabilizedThroughput=?,TotalTime=?,AvgCPU=?,Memory=?")) {
// field name -> json value
Map<String, Object> field_mapping = new LinkedHashMap();
Set<String> update_on_duplicate_fields =
new LinkedHashSet<> (Arrays.asList("AvgTimePerBatch", "Throughput", "StabilizedThroughput", "TotalTime", "AvgCPU", "Memory"));
int i = 0;
// unique key section
st.setString(++i, batch_id);
st.setString(++i, commit_id.substring(0, 8));
st.setString(++i, (String) json_object.get("Model"));
st.setString(++i, (String) json_object.get("ModelName"));
st.setString(++i, (String) json_object.get("DisplayName"));
st.setBoolean(++i, (Boolean) json_object.get("UseMixedPrecision"));
st.setString(++i, (String) json_object.get("Optimizer"));
st.setInt(++i, (int)(long) json_object.get("BatchSize"));
// non-key section
JSONObject properties = (JSONObject) json_object.get("DerivedProperties");
if (properties != null) {
if (properties.get("SeqLen") == null) // mysql allows null value in unique key column
st.setNull(++i, Types.INTEGER);
else
st.setInt(++i, Integer.parseInt((String) properties.get("SeqLen")));
if (properties.get("PredictionsPerSeq") == null) // mysql allows null value in unique key column
st.setNull(++i, Types.INTEGER);
else
st.setInt(++i, Integer.parseInt((String) properties.get("PredictionsPerSeq")));
field_mapping.put("BatchId", batch_id);
field_mapping.put("CommitId", commit_id.substring(0, 8));
json_object.forEach((key, value) -> {
if (key.equals("DerivedProperties")) {
JSONObject properties = (JSONObject) json_object.get("DerivedProperties");
properties.forEach((sub_key, sub_value) -> {
field_mapping.put((String)sub_key, sub_value);
});
} else {
st.setNull(++i, Types.INTEGER);
st.setNull(++i, Types.INTEGER);
field_mapping.put((String)key, value);
}
});
// building sql statement
StringBuilder sb = new StringBuilder("INSERT INTO perf_test_training_data (");
field_mapping.forEach((key, value) -> {
sb.append(key).append(",");
});
sb.append("Time) values (");
for(int i = 0; i < field_mapping.size(); i++) {
sb.append("?,");
}
sb.append("Now()) ON DUPLICATE KEY UPDATE ");
update_on_duplicate_fields.forEach((key) -> {
if(field_mapping.get(key) != null) {
sb.append(key).append("=?,");
}
});
try (java.sql.PreparedStatement st = conn.prepareStatement(sb.substring(0, sb.length() - 1))) {
int i = 0; // param index
for (Map.Entry<String, Object> entry : field_mapping.entrySet()) {
setSqlParam(++i, st, entry.getValue());
}
st.setInt(++i, (int)(long) json_object.get("NumOfBatches"));
st.setInt(++i, (int)(long) json_object.get("WeightUpdateSteps"));
st.setInt(++i, (int)(long) json_object.get("Round"));
st.setInt(++i, (int)(long) json_object.get("GradAccSteps"));
st.setFloat(++i, (float)(double) json_object.get("AvgTimePerBatch")); // ms
st.setFloat(++i, (float)(double) json_object.get("Throughput")); // examples/sec
st.setFloat(++i, (float)(double) json_object.get("StabilizedThroughput")); // examples/sec
st.setFloat(++i, (float)(double) json_object.get("TotalTime")); // secs
st.setInt(++i, (int)(long) json_object.get("AvgCPU"));
st.setInt(++i, (int)((long) json_object.get("Memory") >> 20)); // mb
st.setString(++i, (String) json_object.get("RunConfig"));
// update section
st.setFloat(++i, (float)(double) json_object.get("AvgTimePerBatch")); // ms
st.setFloat(++i, (float)(double) json_object.get("Throughput")); // examples/sec
st.setFloat(++i, (float)(double) json_object.get("StabilizedThroughput")); // examples/sec
st.setFloat(++i, (float)(double) json_object.get("TotalTime")); // secs
st.setInt(++i, (int)((long) json_object.get("Memory") >> 20)); // mb
st.setString(++i, (String) json_object.get("RunConfig"));
for(String key : update_on_duplicate_fields) {
Object value = field_mapping.get(key);
if(value != null) {
setSqlParam(++i, st, value);
}
}
st.executeUpdate();
} catch (Exception e) {
@ -160,4 +151,18 @@ public class App {
}
/**
 * Binds one value to a prepared-statement parameter, choosing the setter from the value's runtime type.
 *
 * @param param_index index of the statement parameter to bind (callers count from 1)
 * @param st          statement the value is bound on
 * @param value       value to bind; String, Long, Double and Boolean are supported, null is bound as SQL NULL
 * @throws Exception if the value has an unsupported type, or if the statement setter fails
 */
static void setSqlParam(int param_index, PreparedStatement st, Object value) throws Exception {
    if (value == null) {
        // without this guard a null value reaches value.getClass() below and fails with a NullPointerException
        st.setNull(param_index, java.sql.Types.NULL);
    } else if (value instanceof String) {
        st.setString(param_index, (String) value);
    } else if (value instanceof Long) {
        // narrowed to int, matching the setInt binding used for integer values
        st.setInt(param_index, (int) (long) value);
    } else if (value instanceof Double) {
        // narrowed to float, matching the setFloat binding used for floating-point values
        st.setFloat(param_index, (float) (double) value);
    } else if (value instanceof Boolean) {
        st.setBoolean(param_index, (Boolean) value);
    } else {
        throw new Exception("Unsupported data type:" + value.getClass().getName());
    }
}
}