diff --git a/onnxruntime/python/tools/transformers/convert_to_onnx.py b/onnxruntime/python/tools/transformers/convert_to_onnx.py index cd856afd53..500a3f3b90 100644 --- a/onnxruntime/python/tools/transformers/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/convert_to_onnx.py @@ -92,14 +92,22 @@ def parse_arguments(argv=None): type=Precision, default=Precision.FLOAT32, choices=list(Precision), - help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization") + help= + "Precision of model to run. fp32 for full precision, fp16 for half or mixed precision, and int8 for quantization" + ) parser.add_argument("-t", "--test_cases", required=False, type=int, default=1000, - help="Number of test cases for parity") + help="Number of test cases per run for parity") + parser.add_argument("-r", + "--test_runs", + required=False, + type=int, + default=10, + help="Number of runs for parity. It is used for significance test.") parser.add_argument('--verbose', required=False, action='store_true') parser.set_defaults(verbose=False) @@ -180,6 +188,17 @@ def parse_arguments(argv=None): return args +def get_onnx_model_size(onnx_path: str, use_external_data_format: bool): + if not use_external_data_format: + return os.path.getsize(onnx_path) + else: + return sum([f.stat().st_size for f in Path(onnx_path).parent.rglob('*')]) + + +def get_latency_name(): + return "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)" + + def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"): result = {} from transformers import __version__ as transformers_version @@ -216,6 +235,8 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu assert not args.output.endswith('.onnx'), "output shall be a directory for --use_external_data_format" model_class = MODEL_CLASSES[args.model_class][0] + use_padding = MODEL_CLASSES[args.model_class][2] + if args.model_class == "GPT2LMHeadModel_BeamSearchStep": model_type = "beam_search_step" elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch": @@ -255,23 +276,26 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu if (not args.use_external_data_format) and (config.n_layer > 24): logger.info(f"Try --use_external_data_format when model size > 2GB") - onnx_model_paths = gpt2helper.get_onnx_paths(output_dir, - args.model_name_or_path, - args.model_class, - new_folder=args.use_external_data_format) + onnx_model_paths = gpt2helper.get_onnx_paths( + output_dir, + args.model_name_or_path, + args.model_class, + new_folder=args.use_external_data_format, + remove_existing=["fp32", "fp16", "int8"]) # Do not remove raw model to save time in parity test raw_onnx_model = onnx_model_paths["raw"] - logger.info(f"Exporting ONNX model to {raw_onnx_model}") - use_padding = MODEL_CLASSES[args.model_class][2] - - gpt2helper.export_onnx(model, - device, - raw_onnx_model, - args.verbose, - args.use_external_data_format, - has_position_ids=use_padding, - has_attention_mask=use_padding) + if os.path.exists(raw_onnx_model): + logger.warning(f"Skip exporting ONNX model since it existed: {raw_onnx_model}") + else: + logger.info(f"Exporting ONNX model to {raw_onnx_model}") + gpt2helper.export_onnx(model, + device, + raw_onnx_model, + args.verbose, + args.use_external_data_format, + has_position_ids=use_padding, + has_attention_mask=use_padding) fp16_params = {"keep_io_types": args.keep_io_types} if args.io_block_list: @@ -308,6 +332,7 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu output_path = args.output logger.info(f"Output path: {output_path}") + model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024) session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose) if args.model_class == "GPT2LMHeadModel" and session is not None: @@ -320,7 +345,8 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu model_class=args.model_class, has_position_ids=use_padding, has_attention_mask=use_padding, - total_test_cases=args.test_cases, + test_cases_per_run=args.test_cases, + total_runs=args.test_runs, verbose=args.verbose) latency = gpt2helper.test_performance(session, @@ -342,15 +368,15 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu # Write results to file import csv from onnxruntime import __version__ as ort_version - latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)" + latency_name = get_latency_name() csv_file_existed = os.path.exists(csv_filename) with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases", - "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers", - "ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "diff_50_percentile", - "diff_90_percentile", "diff_95_percentile", "diff_99_percentile", "diff_pass_rate", "nan_rate", - "top1_match_rate", "onnx_size_in_MB" + "runs", "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers", + "ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "top1_match_rate", + "onnx_size_in_MB", "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", + "diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate_per_run" ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) if not csv_file_existed: @@ -364,6 +390,7 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu "precision": args.precision, "optimizer": args.optimize_onnx, "test_cases": args.test_cases, + "runs": args.test_runs, "keep_io_types": args.keep_io_types, "io_block_list": args.io_block_list, "op_block_list": args.op_block_list, @@ -380,7 +407,8 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu "diff_pass_rate": parity_result["diff_pass_rate"], "nan_rate": parity_result["nan_rate"], "top1_match_rate": parity_result["top1_match_rate"], - "onnx_size_in_MB": "{}".format(int(os.path.getsize(output_path) / 1024 / 1024)) + "top1_match_rate_per_run": parity_result["top1_match_rate_per_run"], + "onnx_size_in_MB": "{}".format(model_size_in_MB), } logger.info(f"result: {row}") result.update(row) diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py index b15340892d..06b4bf5de9 100644 --- a/onnxruntime/python/tools/transformers/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_helper.py @@ -611,7 +611,8 @@ class Gpt2Helper: is_float16=False, rtol=5e-4, atol=5e-4, - total_test_cases=100, + test_cases_per_run=10000, + total_runs=1, use_io_binding=True, model_class="GPT2LMHeadModel", has_position_ids=True, @@ -624,7 +625,7 @@ class Gpt2Helper: config: GPT2Config = model.config logger.info( - f"Running parity test (atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..." + f"Running parity test (atol={atol}, test_cases={test_cases_per_run}, runs={total_runs}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..." ) max_batch_size = 8 @@ -641,7 +642,10 @@ class Gpt2Helper: top1_matched_cases = 0 max_abs_diff_list = [] + top1_matched_cases_per_run = [0] * total_runs + total_test_cases = test_cases_per_run * total_runs for i in range(total_test_cases): + run_id = int(i / test_cases_per_run) sequence_length = random.randint(1, max_seq_len) past_sequence_length = random.randint(0, max_past_seq_len) batch_size = random.randint(1, max_batch_size) @@ -669,6 +673,7 @@ class Gpt2Helper: passed_test_cases += 1 if is_top1_matched: top1_matched_cases += 1 + top1_matched_cases_per_run[run_id] += 1 if verbose and not is_all_close: logger.info( @@ -691,6 +696,7 @@ class Gpt2Helper: result = {f"max_diff_percentile_{p}": "nan" for p in [50, 90, 95, 99]} result["top1_match_rate"] = top1_matched_cases * 1.0 / total_test_cases + result["top1_match_rate_per_run"] = [x * 1.0 / test_cases_per_run for x in top1_matched_cases_per_run] result["diff_pass_rate"] = passed_test_cases * 1.0 / total_test_cases result["nan_rate"] = (total_test_cases - len(max_abs_diff_list)) * 1.0 / total_test_cases @@ -762,7 +768,8 @@ class Gpt2Helper: model_name_or_path, model_class: str = 'GPT2LMHeadModel', has_past=True, - new_folder=False): + new_folder=False, + remove_existing=["raw", "fp32", "fp16", "int8"]): """ Build a path name for given model based on given attributes. """ model_name = model_name_or_path @@ -777,15 +784,19 @@ class Gpt2Helper: model_name += "_past" if new_folder: + suffix = {"raw": "", "fp32": "_fp32", "fp16": "_fp16", "int8": "_int8"} # Remove the directories if existed. - for suffix in ["", "_fp32", "_fp16", "_int8"]: - new_dir = os.path.join(output_dir, model_name + suffix) + for model_type in ["raw", "fp32", "fp16", "int8"]: + new_dir = os.path.join(output_dir, model_name + suffix[model_type]) if os.path.exists(new_dir): - try: - shutil.rmtree(new_dir) - logger.info(f"Removed the existed directory: {new_dir}") - except OSError as e: - logger.info(f"Failed to remove the directory {new_dir}: {e.strerror}") + if (model_type in remove_existing): + try: + shutil.rmtree(new_dir) + logger.info(f"Removed the existed directory: {new_dir}") + except OSError as e: + logger.info(f"Failed to remove the directory {new_dir}: {e.strerror}") + else: + logger.info(f"Directory for {model_type} existed: {new_dir}") # store each model to its own directory (for external data format). return { diff --git a/onnxruntime/python/tools/transformers/gpt2_parity.py b/onnxruntime/python/tools/transformers/gpt2_parity.py index ea280b41ce..d74544772c 100644 --- a/onnxruntime/python/tools/transformers/gpt2_parity.py +++ b/onnxruntime/python/tools/transformers/gpt2_parity.py @@ -4,12 +4,24 @@ # license information. # -------------------------------------------------------------------------- -from convert_to_onnx import main +# This script uses different configurations in mixed precision conversion for GPT-2 model, and +# measures the inference latency, top 1 match rate (compared to PyTorch FP32 model) and ONNX model size. +# It outputs a csv file with Mann-Whitney U test and T-Test on each pair of experiments, where +# pvalue < 0.05 means two experiments have significant difference on top 1 match rate. +# User could use this script to select the best mixed precision model according to these metrics. + +from convert_to_onnx import main, get_latency_name import os import argparse import logging -from gpt2_helper import PRETRAINED_GPT2_MODELS +from gpt2_helper import PRETRAINED_GPT2_MODELS, Gpt2Helper from benchmark_helper import setup_logger +from onnx_model import OnnxModel +import onnx +import csv +import datetime +import scipy.stats +import torch logger = logging.getLogger('') @@ -29,11 +41,9 @@ def parse_arguments(argv=None): default='gpt2_parity_results.csv', help='path of csv file to save the result') - parser.add_argument('--runs', - required=False, - type=int, - default=5, - help="number of repeated runs to get median value of each metric") + parser.add_argument('--test_cases', required=False, type=int, default=500, help="number of test cases per run") + + parser.add_argument('--runs', required=False, type=int, default=40, help="number of repeated runs") parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU for inference") parser.set_defaults(use_gpu=False) @@ -47,131 +57,350 @@ def parse_arguments(argv=None): parser.add_argument('--verbose', required=False, action='store_true') parser.set_defaults(verbose=False) + parser.add_argument('--skip_test', + required=False, + action='store_true', + help="do not run test, and only rank experiments based on existing csv file") + parser.set_defaults(skip_test=False) + args = parser.parse_args(argv) return args class ParityTask: - def __init__(self, total_runs, csv_path): + def __init__(self, test_cases, total_runs, csv_path): self.total_runs = total_runs + self.test_cases = test_cases self.csv_path = csv_path - self.latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)" - self.metric_names = [ - self.latency_name, "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", "diff_99_percentile", - "diff_pass_rate", "nan_rate", "top1_match_rate", "onnx_size_in_MB" + self.results = [] + self.run_id = 0 + + def run(self, argv, experiment_name): + start_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S') + run_id = f"{start_time}_{self.run_id}" + self.run_id += 1 + + try: + result = main(argv + ["-t", f"{self.test_cases}", "-r", f"{self.total_runs}"], + experiment_name=experiment_name, + run_id=run_id, + csv_filename=self.csv_path) + except: + logger.exception(f"Failed to run experiment {experiment_name}") + + if result: + self.results.append(result) + + +def load_results_from_csv(csv_path): + rows = [] + import csv + with open(csv_path, newline='') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + rows.append(row) + return rows + + +def score(row): + """Scoring function based on 3 metrics. The larger score is better.""" + latency_in_ms = float(row[get_latency_name()]) + top1_match_rate = float(row["top1_match_rate"]) + onnx_size_in_MB = float(row["onnx_size_in_MB"]) + # A simple scoring function: cost of 0.1ms latency ~ 0.1% match rate ~ 100MB size + return (top1_match_rate * 1000 - latency_in_ms * 10 - onnx_size_in_MB / 100) + + +def print_wins(wins, rows, test_name): + print() + print("*" * 10) + + row_map = {} + for row in rows: + row_map[row["run_id"]] = row + + sorted_wins = dict(sorted(wins.items(), key=lambda item: (item[1], score(row_map[item[0]])), reverse=True)) + logger.debug(f"{test_name} Wins:{sorted_wins}") + logger.info(f"Based on {test_name} wins and a scoring function, the ranking:") + + rank = 0 + previous_value = -1 + count = 0 + for key, value in sorted_wins.items(): + if value != previous_value: + rank = count + previous_value = value + count += 1 + + for row in rows: + if row["run_id"] == key: + logger.info( + "{:02d}: WINs={:02d}, run_id={}, latency={:5.2f} top1_match={:.4f} size={}_MB experiment={} {}". + format( + rank, value, key, float(row[get_latency_name()]), float(row["top1_match_rate"]), + row["onnx_size_in_MB"], row["experiment"], " (Half2 Disabled)" if + (row['ORT_CUDA_GEMM_OPTIONS'] == "4" and "Half2" not in row["experiment"]) else "")) + break + + +def run_significance_test(rows, output_csv_path): + """Run U test and T test. + """ + utest_wins = {} + ttest_wins = {} + for row in rows: + run_id = row["run_id"] + utest_wins[run_id] = 0 + ttest_wins[run_id] = 0 + + with open(output_csv_path, 'w', newline='') as csvfile: + column_names = [ + 'model_name', 'run_id_1', 'experiment_1', 'top1_match_rate_1', 'run_id_2', 'experiment_2', + 'top1_match_rate_2', 'U_statistic', 'U_pvalue', "T_statistic", "T_pvalue" ] - def run(self, argv, name): - results = [] - experiment_name = name - for i in range(self.total_runs): - try: - result = main(argv, experiment_name=experiment_name, run_id=i, csv_filename=self.csv_path) - except: - logger.error(f"Failed to run experiment{experiment_name}") - continue - if result: - results.append(result) + writer = csv.DictWriter(csvfile, fieldnames=column_names) + writer.writeheader() - if len(results) == 0: - return + required_match_columns = ["model_name", "test_cases", "runs"] + num_results = len(rows) + for i in range(num_results - 1): + result1 = rows[i] - # Calculate median value per metric - all_results = {} - for name in self.metric_names: - all_results[name] = [] + for j in range(i + 1, num_results, 1): + result2 = rows[j] - for result in results: - for name in self.metric_names: - if name in result: - all_results[name].append(result[name]) + all_matched = True + for column in required_match_columns: + if (result1[column] != result2[column]): + all_matched = False + break + if not all_matched: + continue - import statistics - median_result = results[0] - for name in self.metric_names: - median_result[name] = statistics.median(all_results[name]) + if isinstance(result1["top1_match_rate_per_run"], str): + import json + a = json.loads(result1["top1_match_rate_per_run"]) + b = json.loads(result2["top1_match_rate_per_run"]) + else: + a = result1["top1_match_rate_per_run"] + b = result2["top1_match_rate_per_run"] - self.save_result(median_result) + try: + utest_statistic, utest_pvalue = scipy.stats.mannwhitneyu( + a, b, use_continuity=True, alternative="two-sided" + ) #TODO: shall we use one-sided: less or greater according to "top1_match_rate" + except ValueError: #ValueError: All numbers are identical in mannwhitneyu + utest_statistic = None + utest_pvalue = None + ttest_statistic, ttest_pvalue = scipy.stats.ttest_ind(a, b, axis=None, equal_var=True) - def save_result(self, result): - import csv - csv_filename = self.csv_path + if utest_pvalue < 0.05: + if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]): + utest_wins[result1["run_id"]] += 1 + else: + utest_wins[result2["run_id"]] += 1 - csv_file_existed = os.path.exists(csv_filename) - with open(csv_filename, mode="a", newline='') as csv_file: - column_names = [ - "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases", - "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers", - "ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime" - ] + self.metric_names + if ttest_pvalue < 0.05: + if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]): + ttest_wins[result1["run_id"]] += 1 + else: + ttest_wins[result2["run_id"]] += 1 - csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) - if not csv_file_existed: - csv_writer.writeheader() + row = { + 'model_name': result1["model_name"], + 'run_id_1': result1["run_id"], + 'experiment_1': result1["experiment"], + 'top1_match_rate_1': float(result1["top1_match_rate"]), + "run_id_2": result2["run_id"], + "experiment_2": result2["experiment"], + 'top1_match_rate_2': float(result2["top1_match_rate"]), + 'U_statistic': utest_statistic, + 'U_pvalue': utest_pvalue, + 'T_statistic': ttest_statistic, + 'T_pvalue': ttest_pvalue + } - row = {} - for name in column_names: - row[name] = result[name] - - row["run_id"] = "median" - - csv_writer.writerow(row) - logger.info(f"result saved to {csv_filename}: {row}") + writer.writerow(row) + logger.info(f"U-Test and T-Test results are output to {output_csv_path}") + print_wins(utest_wins, rows, "U-Test") + print_wins(ttest_wins, rows, "T-Test") -def run_parity(args): - task = ParityTask(args.runs, args.csv) +def get_last_matmul_node_name(raw_onnx_model: str): + model = onnx.load(raw_onnx_model) + onnx_model = OnnxModel(model) + output_name_to_node = onnx_model.output_name_to_node() + assert model.graph.output[0].name in output_name_to_node + node = output_name_to_node[model.graph.output[0].name] + if node.op_type == "MatMul": + logger.info(f"Found last MatMul node for logits: {node.name}") + return node.name + + logger.warning(f"Failed to find MatMul node for logits. Found {node.op_type} of node {node.name}") + return None + + +def get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list): + model = args.model_name_or_path + parameters = f"-m {model} -o --use_gpu -p fp16".split() + if args.use_external_data_format: + parameters.append("--use_external_data_format") + parameters += ["--io_block_list", "logits", "--node_block_list", last_matmul_node_name] + + if op_block_list: + parameters.extend(["--op_block_list"] + op_block_list) + + return parameters + + +def run_candidate(task: ParityTask, args, last_matmul_node_name, op_block_list=["FastGelu", "LayerNormalization"]): + parameters = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list) + op_block_list_str = ','.join(sorted(op_block_list)) + name_suffix = " (Half2 Disabled)" if os.getenv('ORT_CUDA_GEMM_OPTIONS') == "4" else "" + if op_block_list: + name = f"Mixed precision baseline + {op_block_list_str} in FP32{name_suffix}" + else: + name = f"Mixed precision baseline (logits output and last MatMul node {last_matmul_node_name} in FP32){name_suffix}" + task.run(parameters, name) + + +def get_baselines(args): model = args.model_name_or_path fp32_baseline = f"-m {model} -o -p fp32".split() if args.use_gpu: fp32_baseline.append("--use_gpu") - if args.use_external_data_format: fp32_baseline.append("--use_external_data_format") - task.run(fp32_baseline, "fp32 baseline") + fp16_baseline = f"-m {model} -o --use_gpu -p fp16".split() + if args.use_external_data_format: + fp16_baseline.append("--use_external_data_format") + + return fp32_baseline, fp16_baseline + + +def get_all_operators(): + """All operators in the optimized model""" + return "Attention Gather Add LayerNormalization FastGelu MatMul".split() + + +def run_tuning_step0(task, fp16_baseline): + """Step 0 is to check which operator in FP16 causes most loss""" + fp32_logits = ["--io_block_list", "logits"] + task.run(fp16_baseline + fp32_logits, "FP16 except logits") + + fp32_io = ["--keep_io_types"] + task.run(fp16_baseline + fp32_io, "Graph I/O FP32, Other FP16") + + op_list = get_all_operators() + #task.run(fp16_baseline + fp32_io + ["--op_block_list"] + [o for o in op_list], "Everthing in FP32") + + # Only weights in FP16 + task.run(fp16_baseline + fp32_io + ["--op_block_list"] + [o for o in op_list] + ['--force_fp16_initializers'], + "FP32 except weights in FP16") + + for op in op_list: + op_block_list = ["--op_block_list"] + [o for o in op_list if o != op] + task.run(fp16_baseline + fp32_io + op_block_list, f"FP32 except {op} in FP16") + + +def run_tuning_step1(task, mixed_precision_baseline): + """Step 1 is to figure out which operator in FP32 could benefit most""" + for op in get_all_operators(): + op_block_list = ["--op_block_list", op] + task.run(mixed_precision_baseline + op_block_list, f"Mixed precision baseline + {op} in FP32") + + +def run_tuning_step2(task, mixed_precision_baseline): + """Assumed that you have run step 1 to figure out that Logits FP32 and Add FP32 is important, + Step 2 is to figure out a combination of two operators (one is Add from step one) to get better result + """ + for op in get_all_operators(): + if op not in ['Add']: + op_block_list = ["--op_block_list", 'Add', op] + task.run(mixed_precision_baseline + op_block_list, f"Mixed precision baseline + Add,{op} in FP32") + + +def run_parity_disable_half2(task: ParityTask, args): + onnx_model_paths = Gpt2Helper.get_onnx_paths('onnx_models', + args.model_name_or_path, + new_folder=args.use_external_data_format, + remove_existing=[]) + last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"]) + run_candidate(task, args, last_matmul_node_name, op_block_list=[]) + run_candidate(task, args, last_matmul_node_name, op_block_list=["Add"]) + run_candidate(task, args, last_matmul_node_name, op_block_list=["LayerNormalization", "Add"]) + + +def run_parity(task: ParityTask, args): + onnx_model_paths = Gpt2Helper.get_onnx_paths('onnx_models', + args.model_name_or_path, + new_folder=args.use_external_data_format, + remove_existing=[]) + + fp32_baseline, fp16_baseline = get_baselines(args) + + task.run(fp32_baseline, "FP32 baseline") # The following tests for fp16 requires GPU if not args.use_gpu: logger.info("skip mixed precision since --use_gpu is not specified") return - baseline = f"-m {model} -o --use_gpu -p fp16".split() - if args.use_external_data_format: - baseline.append("--use_external_data_format") - task.run(baseline, "fp16 baseline") + task.run(fp16_baseline, "FP16 baseline") - if not args.all: - logger.info("skip remaining combinations since --all is not specified") - return + last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"]) - fp32_logits = ["--io_block_list", "logits"] - task.run(baseline + fp32_logits, "fp16 except logits") + # Mixed precision baseline + run_candidate(task, args, last_matmul_node_name, op_block_list=[]) - fp32_io = ["--keep_io_types"] - task.run(baseline + fp32_io, "Graph I/O FP32, Other FP16") + # Result from tuning step 1 + run_candidate(task, args, last_matmul_node_name, op_block_list=["Add"]) - op_list = "Attention Gather Add LayerNormalization FastGelu MatMul".split() - task.run(baseline + fp32_io + ["--op_block_list"] + [o for o in op_list], "Everthing in FP32") + if args.all: + run_tuning_step0(task, fp16_baseline) + mixed_precision_baseline = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list=[]) + run_tuning_step1(task, mixed_precision_baseline) + run_tuning_step2(task, mixed_precision_baseline) + else: + run_candidate(task, args, last_matmul_node_name, op_block_list=["LayerNormalization", "Add"]) + run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu", "Add"]) - for op in op_list: - op_block_list = ["--op_block_list"] + [o for o in op_list if o != op] - task.run(baseline + fp32_io + op_block_list, f"FP32 except {op} in fp16") - - for op in op_list: - op_block_list = ["--op_block_list", op] - task.run(baseline + op_block_list, f"FP16 except {op} in fp32") - - op_block_list = ["--op_block_list", "LayerNormalization", "FastGelu"] - task.run(baseline + op_block_list, f"FP16 except LayerNormalization and FastGelu in fp32") - - task.run(baseline + op_block_list + fp32_logits, f"FP16 except logits, LayerNormalization and FastGelu in fp32") + # Run a few good candidates + run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu", "LayerNormalization", "Add"]) + run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu", "LayerNormalization", "Add", "Gather"]) + run_candidate(task, args, last_matmul_node_name, \ + op_block_list=["FastGelu", "LayerNormalization", "Add", "Gather", "MatMul"]) if __name__ == '__main__': args = parse_arguments() setup_logger(args.verbose) - run_parity(args) + if args.test_cases < 100 or args.runs < 20 or args.test_cases * args.runs < 10000: + logger.warning( + "Not enough test cases or runs to get stable results or test significance. Recommend test_cases >= 100, runs >= 20, test_cases * runs >= 10000." + ) + + task = ParityTask(args.test_cases, args.runs, args.csv) + + if not args.skip_test: + if (os.getenv('ORT_CUDA_GEMM_OPTIONS') == "4" and args.use_gpu): + assert torch.cuda.get_device_capability( + )[0] >= 7, "half2 kernel is not avaiable in current GPU device. Please set environment variable ORT_CUDA_GEMM_OPTIONS=0 or use supported GPU like V100 or T4" + run_parity_disable_half2(task, args) + else: + run_parity(task, args) + + try: + rows = load_results_from_csv(task.csv_path) + except: + logger.exception(f"Failed to load csv {task.csv_path}") + rows = task.results + + logger.info("Start running significance tests...") + summary_csv = task.csv_path.replace('.csv', ".stats.csv") + run_significance_test(rows, summary_csv) diff --git a/onnxruntime/python/tools/transformers/requirements.txt b/onnxruntime/python/tools/transformers/requirements.txt index 6f58a38d4a..b1908acdd3 100644 --- a/onnxruntime/python/tools/transformers/requirements.txt +++ b/onnxruntime/python/tools/transformers/requirements.txt @@ -6,6 +6,7 @@ py-cpuinfo py3nvml packaging transformers >= 4.0 +scipy # please follow https://pytorch.org/ to install PyTorch for your OS torch >= 1.8 \ No newline at end of file