mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-03 23:49:44 +00:00
Add t-test to compare experiments in GPT-2 mixed precision conversion (#9042)
* Add t-test to compare two experiments * Ranking based on pair-wise T-test results and a custom scoring function
This commit is contained in:
parent
7d28b596f4
commit
3ec3e9f705
4 changed files with 391 additions and 122 deletions
|
|
@ -92,14 +92,22 @@ def parse_arguments(argv=None):
|
|||
type=Precision,
|
||||
default=Precision.FLOAT32,
|
||||
choices=list(Precision),
|
||||
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization")
|
||||
help=
|
||||
"Precision of model to run. fp32 for full precision, fp16 for half or mixed precision, and int8 for quantization"
|
||||
)
|
||||
|
||||
parser.add_argument("-t",
|
||||
"--test_cases",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of test cases for parity")
|
||||
help="Number of test cases per run for parity")
|
||||
parser.add_argument("-r",
|
||||
"--test_runs",
|
||||
required=False,
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of runs for parity. It is used for significance test.")
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true')
|
||||
parser.set_defaults(verbose=False)
|
||||
|
|
@ -180,6 +188,17 @@ def parse_arguments(argv=None):
|
|||
return args
|
||||
|
||||
|
||||
def get_onnx_model_size(onnx_path: str, use_external_data_format: bool):
    """Return the on-disk size of an ONNX model in bytes.

    Args:
        onnx_path: Path of the ``.onnx`` file.
        use_external_data_format: When False, only the ``.onnx`` file itself is measured. When True,
            the sizes of all regular files found recursively under the directory that contains
            ``onnx_path`` are added up.

    Returns:
        The size in bytes.
    """
    if not use_external_data_format:
        return os.path.getsize(onnx_path)

    # rglob('*') also yields sub-directories; their own st_size is not model data, so count files only.
    return sum(f.stat().st_size for f in Path(onnx_path).parent.rglob('*') if f.is_file())
|
||||
|
||||
|
||||
def get_latency_name():
    """Return the name of the latency column used in the parity result csv file."""
    return "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
|
||||
|
||||
|
||||
def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"):
|
||||
result = {}
|
||||
from transformers import __version__ as transformers_version
|
||||
|
|
@ -216,6 +235,8 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
assert not args.output.endswith('.onnx'), "output shall be a directory for --use_external_data_format"
|
||||
|
||||
model_class = MODEL_CLASSES[args.model_class][0]
|
||||
use_padding = MODEL_CLASSES[args.model_class][2]
|
||||
|
||||
if args.model_class == "GPT2LMHeadModel_BeamSearchStep":
|
||||
model_type = "beam_search_step"
|
||||
elif args.model_class == "GPT2LMHeadModel_ConfigurableOneStepSearch":
|
||||
|
|
@ -255,23 +276,26 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
if (not args.use_external_data_format) and (config.n_layer > 24):
|
||||
logger.info(f"Try --use_external_data_format when model size > 2GB")
|
||||
|
||||
onnx_model_paths = gpt2helper.get_onnx_paths(output_dir,
|
||||
args.model_name_or_path,
|
||||
args.model_class,
|
||||
new_folder=args.use_external_data_format)
|
||||
onnx_model_paths = gpt2helper.get_onnx_paths(
|
||||
output_dir,
|
||||
args.model_name_or_path,
|
||||
args.model_class,
|
||||
new_folder=args.use_external_data_format,
|
||||
remove_existing=["fp32", "fp16", "int8"]) # Do not remove raw model to save time in parity test
|
||||
|
||||
raw_onnx_model = onnx_model_paths["raw"]
|
||||
|
||||
logger.info(f"Exporting ONNX model to {raw_onnx_model}")
|
||||
use_padding = MODEL_CLASSES[args.model_class][2]
|
||||
|
||||
gpt2helper.export_onnx(model,
|
||||
device,
|
||||
raw_onnx_model,
|
||||
args.verbose,
|
||||
args.use_external_data_format,
|
||||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding)
|
||||
if os.path.exists(raw_onnx_model):
|
||||
logger.warning(f"Skip exporting ONNX model since it existed: {raw_onnx_model}")
|
||||
else:
|
||||
logger.info(f"Exporting ONNX model to {raw_onnx_model}")
|
||||
gpt2helper.export_onnx(model,
|
||||
device,
|
||||
raw_onnx_model,
|
||||
args.verbose,
|
||||
args.use_external_data_format,
|
||||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding)
|
||||
|
||||
fp16_params = {"keep_io_types": args.keep_io_types}
|
||||
if args.io_block_list:
|
||||
|
|
@ -308,6 +332,7 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
output_path = args.output
|
||||
|
||||
logger.info(f"Output path: {output_path}")
|
||||
model_size_in_MB = int(get_onnx_model_size(output_path, args.use_external_data_format) / 1024 / 1024)
|
||||
|
||||
session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose)
|
||||
if args.model_class == "GPT2LMHeadModel" and session is not None:
|
||||
|
|
@ -320,7 +345,8 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
model_class=args.model_class,
|
||||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding,
|
||||
total_test_cases=args.test_cases,
|
||||
test_cases_per_run=args.test_cases,
|
||||
total_runs=args.test_runs,
|
||||
verbose=args.verbose)
|
||||
|
||||
latency = gpt2helper.test_performance(session,
|
||||
|
|
@ -342,15 +368,15 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
# Write results to file
|
||||
import csv
|
||||
from onnxruntime import __version__ as ort_version
|
||||
latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
|
||||
latency_name = get_latency_name()
|
||||
csv_file_existed = os.path.exists(csv_filename)
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
column_names = [
|
||||
"experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases",
|
||||
"keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers",
|
||||
"ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "diff_50_percentile",
|
||||
"diff_90_percentile", "diff_95_percentile", "diff_99_percentile", "diff_pass_rate", "nan_rate",
|
||||
"top1_match_rate", "onnx_size_in_MB"
|
||||
"runs", "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "force_fp16_initializers",
|
||||
"ORT_TRANSFORMER_OPTIONS", "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "top1_match_rate",
|
||||
"onnx_size_in_MB", "diff_50_percentile", "diff_90_percentile", "diff_95_percentile",
|
||||
"diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate_per_run"
|
||||
]
|
||||
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
||||
if not csv_file_existed:
|
||||
|
|
@ -364,6 +390,7 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
"precision": args.precision,
|
||||
"optimizer": args.optimize_onnx,
|
||||
"test_cases": args.test_cases,
|
||||
"runs": args.test_runs,
|
||||
"keep_io_types": args.keep_io_types,
|
||||
"io_block_list": args.io_block_list,
|
||||
"op_block_list": args.op_block_list,
|
||||
|
|
@ -380,7 +407,8 @@ def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_resu
|
|||
"diff_pass_rate": parity_result["diff_pass_rate"],
|
||||
"nan_rate": parity_result["nan_rate"],
|
||||
"top1_match_rate": parity_result["top1_match_rate"],
|
||||
"onnx_size_in_MB": "{}".format(int(os.path.getsize(output_path) / 1024 / 1024))
|
||||
"top1_match_rate_per_run": parity_result["top1_match_rate_per_run"],
|
||||
"onnx_size_in_MB": "{}".format(model_size_in_MB),
|
||||
}
|
||||
logger.info(f"result: {row}")
|
||||
result.update(row)
|
||||
|
|
|
|||
|
|
@ -611,7 +611,8 @@ class Gpt2Helper:
|
|||
is_float16=False,
|
||||
rtol=5e-4,
|
||||
atol=5e-4,
|
||||
total_test_cases=100,
|
||||
test_cases_per_run=10000,
|
||||
total_runs=1,
|
||||
use_io_binding=True,
|
||||
model_class="GPT2LMHeadModel",
|
||||
has_position_ids=True,
|
||||
|
|
@ -624,7 +625,7 @@ class Gpt2Helper:
|
|||
config: GPT2Config = model.config
|
||||
|
||||
logger.info(
|
||||
f"Running parity test (atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..."
|
||||
f"Running parity test (atol={atol}, test_cases={test_cases_per_run}, runs={total_runs}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..."
|
||||
)
|
||||
|
||||
max_batch_size = 8
|
||||
|
|
@ -641,7 +642,10 @@ class Gpt2Helper:
|
|||
top1_matched_cases = 0
|
||||
|
||||
max_abs_diff_list = []
|
||||
top1_matched_cases_per_run = [0] * total_runs
|
||||
total_test_cases = test_cases_per_run * total_runs
|
||||
for i in range(total_test_cases):
|
||||
run_id = int(i / test_cases_per_run)
|
||||
sequence_length = random.randint(1, max_seq_len)
|
||||
past_sequence_length = random.randint(0, max_past_seq_len)
|
||||
batch_size = random.randint(1, max_batch_size)
|
||||
|
|
@ -669,6 +673,7 @@ class Gpt2Helper:
|
|||
passed_test_cases += 1
|
||||
if is_top1_matched:
|
||||
top1_matched_cases += 1
|
||||
top1_matched_cases_per_run[run_id] += 1
|
||||
|
||||
if verbose and not is_all_close:
|
||||
logger.info(
|
||||
|
|
@ -691,6 +696,7 @@ class Gpt2Helper:
|
|||
result = {f"max_diff_percentile_{p}": "nan" for p in [50, 90, 95, 99]}
|
||||
|
||||
result["top1_match_rate"] = top1_matched_cases * 1.0 / total_test_cases
|
||||
result["top1_match_rate_per_run"] = [x * 1.0 / test_cases_per_run for x in top1_matched_cases_per_run]
|
||||
result["diff_pass_rate"] = passed_test_cases * 1.0 / total_test_cases
|
||||
result["nan_rate"] = (total_test_cases - len(max_abs_diff_list)) * 1.0 / total_test_cases
|
||||
|
||||
|
|
@ -762,7 +768,8 @@ class Gpt2Helper:
|
|||
model_name_or_path,
|
||||
model_class: str = 'GPT2LMHeadModel',
|
||||
has_past=True,
|
||||
new_folder=False):
|
||||
new_folder=False,
|
||||
remove_existing=["raw", "fp32", "fp16", "int8"]):
|
||||
""" Build a path name for given model based on given attributes.
|
||||
"""
|
||||
model_name = model_name_or_path
|
||||
|
|
@ -777,15 +784,19 @@ class Gpt2Helper:
|
|||
model_name += "_past"
|
||||
|
||||
if new_folder:
|
||||
suffix = {"raw": "", "fp32": "_fp32", "fp16": "_fp16", "int8": "_int8"}
|
||||
# Remove the directories if existed.
|
||||
for suffix in ["", "_fp32", "_fp16", "_int8"]:
|
||||
new_dir = os.path.join(output_dir, model_name + suffix)
|
||||
for model_type in ["raw", "fp32", "fp16", "int8"]:
|
||||
new_dir = os.path.join(output_dir, model_name + suffix[model_type])
|
||||
if os.path.exists(new_dir):
|
||||
try:
|
||||
shutil.rmtree(new_dir)
|
||||
logger.info(f"Removed the existed directory: {new_dir}")
|
||||
except OSError as e:
|
||||
logger.info(f"Failed to remove the directory {new_dir}: {e.strerror}")
|
||||
if (model_type in remove_existing):
|
||||
try:
|
||||
shutil.rmtree(new_dir)
|
||||
logger.info(f"Removed the existed directory: {new_dir}")
|
||||
except OSError as e:
|
||||
logger.info(f"Failed to remove the directory {new_dir}: {e.strerror}")
|
||||
else:
|
||||
logger.info(f"Directory for {model_type} existed: {new_dir}")
|
||||
|
||||
# store each model to its own directory (for external data format).
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -4,12 +4,24 @@
|
|||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
from convert_to_onnx import main
|
||||
# This script uses different configurations in mixed precision conversion for GPT-2 model, and
|
||||
# measures the inference latency, top 1 match rate (compared to PyTorch FP32 model) and ONNX model size.
|
||||
# It outputs a csv file with Mann-Whitney U test and T-Test on each pair of experiments, where
|
||||
# pvalue < 0.05 means two experiments have significant difference on top 1 match rate.
|
||||
# Users can use this script to select the best mixed precision model according to these metrics.
|
||||
|
||||
from convert_to_onnx import main, get_latency_name
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
from gpt2_helper import PRETRAINED_GPT2_MODELS
|
||||
from gpt2_helper import PRETRAINED_GPT2_MODELS, Gpt2Helper
|
||||
from benchmark_helper import setup_logger
|
||||
from onnx_model import OnnxModel
|
||||
import onnx
|
||||
import csv
|
||||
import datetime
|
||||
import scipy.stats
|
||||
import torch
|
||||
|
||||
logger = logging.getLogger('')
|
||||
|
||||
|
|
@ -29,11 +41,9 @@ def parse_arguments(argv=None):
|
|||
default='gpt2_parity_results.csv',
|
||||
help='path of csv file to save the result')
|
||||
|
||||
parser.add_argument('--runs',
|
||||
required=False,
|
||||
type=int,
|
||||
default=5,
|
||||
help="number of repeated runs to get median value of each metric")
|
||||
parser.add_argument('--test_cases', required=False, type=int, default=500, help="number of test cases per run")
|
||||
|
||||
parser.add_argument('--runs', required=False, type=int, default=40, help="number of repeated runs")
|
||||
|
||||
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU for inference")
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
|
@ -47,131 +57,350 @@ def parse_arguments(argv=None):
|
|||
parser.add_argument('--verbose', required=False, action='store_true')
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
parser.add_argument('--skip_test',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help="do not run test, and only rank experiments based on existing csv file")
|
||||
parser.set_defaults(skip_test=False)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
class ParityTask:
    """Run experiments through ``main`` and keep the result of every successful one."""
    def __init__(self, test_cases, total_runs, csv_path):
        # Forwarded to main as the "-r" argument.
        self.total_runs = total_runs
        # Forwarded to main as the "-t" argument.
        self.test_cases = test_cases
        # csv file that main is asked to write to.
        self.csv_path = csv_path
        # Results returned by main, one per successful experiment.
        self.results = []
        # Counter appended to the timestamp so that every experiment gets a distinct run id.
        self.run_id = 0

    def run(self, argv, experiment_name):
        """Run one experiment and record its result.

        Args:
            argv: Argument list for ``main``; "-t" and "-r" are appended to it.
            experiment_name: Name of the experiment, passed to ``main`` and used in the failure log.

        A failure inside ``main`` is logged and the experiment is skipped, so that the remaining
        experiments can still run.
        """
        start_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        run_id = f"{start_time}_{self.run_id}"
        self.run_id += 1

        try:
            result = main(argv + ["-t", f"{self.test_cases}", "-r", f"{self.total_runs}"],
                          experiment_name=experiment_name,
                          run_id=run_id,
                          csv_filename=self.csv_path)
        except Exception:
            logger.exception(f"Failed to run experiment {experiment_name}")
            # There is no result to record; returning avoids reading an unbound variable below.
            return

        if result:
            self.results.append(result)
|
||||
|
||||
|
||||
def load_results_from_csv(csv_path):
    """Load all data rows of a csv file.

    Args:
        csv_path: Path of a csv file whose first line is the header.

    Returns:
        A list with one mapping (column name -> string value) per data row.
    """
    import csv

    with open(csv_path, newline='') as csvfile:
        return list(csv.DictReader(csvfile))
|
||||
|
||||
|
||||
def score(row):
    """Scoring function based on 3 metrics. The larger score is better.

    Args:
        row: Mapping that has the latency column (see get_latency_name), "top1_match_rate" and
            "onnx_size_in_MB". Values may be numbers or numeric strings.

    Returns:
        A float that grows with the top 1 match rate and shrinks with latency and model size.
    """
    latency_in_ms = float(row[get_latency_name()])
    top1_match_rate = float(row["top1_match_rate"])
    onnx_size_in_MB = float(row["onnx_size_in_MB"])

    # A simple scoring function: cost of 0.1ms latency ~ 0.1% match rate ~ 100MB size
    return top1_match_rate * 1000 - latency_in_ms * 10 - onnx_size_in_MB / 100
|
||||
|
||||
|
||||
def print_wins(wins, rows, test_name):
    """Log the ranking of experiments by number of wins, using score() to order equal win counts.

    Args:
        wins: Mapping of run_id -> number of wins. Every key must be the "run_id" of a row in ``rows``.
        rows: Result rows (mappings) that have "run_id", "experiment", "top1_match_rate",
            "onnx_size_in_MB", "ORT_CUDA_GEMM_OPTIONS" and the latency column.
        test_name: Name of the significance test, used in the log messages.
    """
    print()
    print("*" * 10)

    # When a run_id appears more than once, the last row is used for both scoring and display.
    row_map = {row["run_id"]: row for row in rows}

    sorted_wins = dict(sorted(wins.items(), key=lambda item: (item[1], score(row_map[item[0]])), reverse=True))
    logger.debug(f"{test_name} Wins:{sorted_wins}")
    logger.info(f"Based on {test_name} wins and a scoring function, the ranking:")

    rank = 0
    previous_value = -1
    for count, (run_id, win_count) in enumerate(sorted_wins.items()):
        # Experiments with the same number of wins share the rank of the first one in that group.
        if win_count != previous_value:
            rank = count
            previous_value = win_count

        row = row_map[run_id]
        half2_note = " (Half2 Disabled)" if (row['ORT_CUDA_GEMM_OPTIONS'] == "4"
                                             and "Half2" not in row["experiment"]) else ""
        logger.info(
            "{:02d}: WINs={:02d}, run_id={}, latency={:5.2f} top1_match={:.4f} size={}_MB experiment={} {}".format(
                rank, win_count, run_id, float(row[get_latency_name()]), float(row["top1_match_rate"]),
                row["onnx_size_in_MB"], row["experiment"], half2_note))
|
||||
|
||||
|
||||
def run_significance_test(rows, output_csv_path):
    """Run U test and T test on every comparable pair of experiments.

    Two rows are compared only when their "model_name", "test_cases" and "runs" values are equal.
    For each compared pair the "top1_match_rate_per_run" samples go through a two-sided Mann-Whitney
    U test and a T test, and one csv line with the statistics is written. When a test reports
    pvalue < 0.05, the row with the larger "top1_match_rate" (the second row on a tie) gets one win
    for that test. Finally the rankings of both tests are logged with print_wins.

    Args:
        rows: Result rows (mappings). "top1_match_rate_per_run" may be a list, or its text form as
            read back from a csv file.
        output_csv_path: Path of the csv file to write the pair-wise statistics to (overwritten).
    """
    import json
    from itertools import combinations

    def per_run_rates(result):
        # Rows loaded from csv hold the list as text; rows kept in memory hold the list itself.
        rates = result["top1_match_rate_per_run"]
        return json.loads(rates) if isinstance(rates, str) else rates

    def winner_run_id(result1, result2):
        if float(result1["top1_match_rate"]) > float(result2["top1_match_rate"]):
            return result1["run_id"]
        return result2["run_id"]

    utest_wins = {row["run_id"]: 0 for row in rows}
    ttest_wins = {row["run_id"]: 0 for row in rows}

    required_match_columns = ["model_name", "test_cases", "runs"]
    column_names = [
        'model_name', 'run_id_1', 'experiment_1', 'top1_match_rate_1', 'run_id_2', 'experiment_2',
        'top1_match_rate_2', 'U_statistic', 'U_pvalue', "T_statistic", "T_pvalue"
    ]

    with open(output_csv_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=column_names)
        writer.writeheader()

        for result1, result2 in combinations(rows, 2):
            if any(result1[column] != result2[column] for column in required_match_columns):
                continue

            a = per_run_rates(result1)
            b = per_run_rates(result2)

            try:
                utest_statistic, utest_pvalue = scipy.stats.mannwhitneyu(
                    a, b, use_continuity=True, alternative="two-sided"
                )  #TODO: shall we use one-sided: less or greater according to "top1_match_rate"
            except ValueError:  #ValueError: All numbers are identical in mannwhitneyu
                utest_statistic = None
                utest_pvalue = None
            ttest_statistic, ttest_pvalue = scipy.stats.ttest_ind(a, b, axis=None, equal_var=True)

            # The U test may have produced no p-value (see above); such a pair gives no U-test win.
            if utest_pvalue is not None and utest_pvalue < 0.05:
                utest_wins[winner_run_id(result1, result2)] += 1

            if ttest_pvalue < 0.05:
                ttest_wins[winner_run_id(result1, result2)] += 1

            row = {
                'model_name': result1["model_name"],
                'run_id_1': result1["run_id"],
                'experiment_1': result1["experiment"],
                'top1_match_rate_1': float(result1["top1_match_rate"]),
                "run_id_2": result2["run_id"],
                "experiment_2": result2["experiment"],
                'top1_match_rate_2': float(result2["top1_match_rate"]),
                'U_statistic': utest_statistic,
                'U_pvalue': utest_pvalue,
                'T_statistic': ttest_statistic,
                'T_pvalue': ttest_pvalue
            }

            writer.writerow(row)

    logger.info(f"U-Test and T-Test results are output to {output_csv_path}")
    print_wins(utest_wins, rows, "U-Test")
    print_wins(ttest_wins, rows, "T-Test")
|
||||
|
||||
|
||||
def run_parity(args):
|
||||
task = ParityTask(args.runs, args.csv)
|
||||
def get_last_matmul_node_name(raw_onnx_model: str):
    """Find the MatMul node that produces the first graph output of an ONNX model.

    Args:
        raw_onnx_model: Path of the ONNX model file to load.

    Returns:
        The name of the node that produces the first graph output when that node is a MatMul,
        otherwise None (a warning is logged).
    """
    model = onnx.load(raw_onnx_model)
    onnx_model = OnnxModel(model)
    output_name_to_node = onnx_model.output_name_to_node()

    first_output_name = model.graph.output[0].name
    assert first_output_name in output_name_to_node
    node = output_name_to_node[first_output_name]
    if node.op_type == "MatMul":
        logger.info(f"Found last MatMul node for logits: {node.name}")
        return node.name

    logger.warning(f"Failed to find MatMul node for logits. Found {node.op_type} of node {node.name}")
    return None
|
||||
|
||||
|
||||
def get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list):
    """Build the argument list of a mixed precision conversion.

    Args:
        args: Parsed arguments; ``model_name_or_path`` and ``use_external_data_format`` are read.
        last_matmul_node_name: Node name passed to --node_block_list.
        op_block_list: Operator names passed to --op_block_list; the option is omitted when empty.

    Returns:
        A new list of command line arguments.
    """
    # Built as a list (not by splitting a formatted string) so that a model path that contains
    # whitespace stays a single argument.
    parameters = ["-m", args.model_name_or_path, "-o", "--use_gpu", "-p", "fp16"]
    if args.use_external_data_format:
        parameters.append("--use_external_data_format")
    parameters += ["--io_block_list", "logits", "--node_block_list", last_matmul_node_name]

    if op_block_list:
        parameters += ["--op_block_list", *op_block_list]

    return parameters
|
||||
|
||||
|
||||
def run_candidate(task: ParityTask, args, last_matmul_node_name, op_block_list=None):
    """Run one mixed precision experiment with the given operators blocked from FP16 conversion.

    Args:
        task: Task used to run the experiment.
        args: Parsed arguments, forwarded to get_mixed_precision_parameters.
        last_matmul_node_name: Node name forwarded to get_mixed_precision_parameters.
        op_block_list: Operator names to put in --op_block_list. When omitted,
            ["FastGelu", "LayerNormalization"] is used.
    """
    if op_block_list is None:
        # A fresh list per call instead of a shared mutable default value.
        op_block_list = ["FastGelu", "LayerNormalization"]

    parameters = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list)
    name_suffix = " (Half2 Disabled)" if os.getenv('ORT_CUDA_GEMM_OPTIONS') == "4" else ""
    if op_block_list:
        op_block_list_str = ','.join(sorted(op_block_list))
        name = f"Mixed precision baseline + {op_block_list_str} in FP32{name_suffix}"
    else:
        name = f"Mixed precision baseline (logits output and last MatMul node {last_matmul_node_name} in FP32){name_suffix}"
    task.run(parameters, name)
|
||||
|
||||
|
||||
def get_baselines(args):
    """Build the argument lists of the FP32 and FP16 baseline experiments.

    Args:
        args: Parsed arguments; ``model_name_or_path``, ``use_gpu`` and
            ``use_external_data_format`` are read.

    Returns:
        A tuple (fp32_baseline, fp16_baseline) of argument lists. The FP16 baseline always has
        --use_gpu; the FP32 baseline has it only when ``args.use_gpu`` is set.
    """
    model = args.model_name_or_path

    # Built as lists (not by splitting a formatted string) so that a model path that contains
    # whitespace stays a single argument.
    fp32_baseline = ["-m", model, "-o", "-p", "fp32"]
    if args.use_gpu:
        fp32_baseline.append("--use_gpu")
    if args.use_external_data_format:
        fp32_baseline.append("--use_external_data_format")

    fp16_baseline = ["-m", model, "-o", "--use_gpu", "-p", "fp16"]
    if args.use_external_data_format:
        fp16_baseline.append("--use_external_data_format")

    return fp32_baseline, fp16_baseline
|
||||
|
||||
|
||||
def get_all_operators():
    """All operators in the optimized model.

    Returns:
        A new list of operator names on every call.
    """
    return ["Attention", "Gather", "Add", "LayerNormalization", "FastGelu", "MatMul"]
|
||||
|
||||
|
||||
def run_tuning_step0(task, fp16_baseline):
    """Step 0 is to check which operator in FP16 causes most loss.

    Args:
        task: Task used to run each experiment.
        fp16_baseline: Argument list of the FP16 baseline; it is extended per experiment, not modified.
    """
    fp32_logits = ["--io_block_list", "logits"]
    task.run(fp16_baseline + fp32_logits, "FP16 except logits")

    fp32_io = ["--keep_io_types"]
    task.run(fp16_baseline + fp32_io, "Graph I/O FP32, Other FP16")

    op_list = get_all_operators()
    # Not run:
    #task.run(fp16_baseline + fp32_io + ["--op_block_list"] + [o for o in op_list], "Everthing in FP32")

    # Only weights in FP16
    task.run(fp16_baseline + fp32_io + ["--op_block_list"] + op_list + ['--force_fp16_initializers'],
             "FP32 except weights in FP16")

    # Block every operator except one, so that only that operator is converted to FP16.
    for op in op_list:
        op_block_list = ["--op_block_list"] + [o for o in op_list if o != op]
        task.run(fp16_baseline + fp32_io + op_block_list, f"FP32 except {op} in FP16")
|
||||
|
||||
|
||||
def run_tuning_step1(task, mixed_precision_baseline):
    """Step 1 is to figure out which operator in FP32 could benefit most.

    Args:
        task: Task used to run each experiment.
        mixed_precision_baseline: Argument list of the mixed precision baseline; one experiment is
            run per operator with that single operator added to --op_block_list.
    """
    for op in get_all_operators():
        op_block_list = ["--op_block_list", op]
        task.run(mixed_precision_baseline + op_block_list, f"Mixed precision baseline + {op} in FP32")
|
||||
|
||||
|
||||
def run_tuning_step2(task, mixed_precision_baseline):
    """Assumed that you have run step 1 to figure out that Logits FP32 and Add FP32 is important,
    Step 2 is to figure out a combination of two operators (one is Add from step one) to get better result

    Args:
        task: Task used to run each experiment.
        mixed_precision_baseline: Argument list of the mixed precision baseline; one experiment is
            run per operator other than Add, with Add and that operator in --op_block_list.
    """
    for op in get_all_operators():
        if op != 'Add':
            op_block_list = ["--op_block_list", 'Add', op]
            task.run(mixed_precision_baseline + op_block_list, f"Mixed precision baseline + Add,{op} in FP32")
|
||||
|
||||
|
||||
def run_parity_disable_half2(task: ParityTask, args):
    """Run the mixed precision baseline and two candidates; used when ORT_CUDA_GEMM_OPTIONS is "4".

    Args:
        task: Task used to run each experiment.
        args: Parsed arguments; ``model_name_or_path`` and ``use_external_data_format`` are read here.
    """
    # remove_existing=[] so that no existing model directory is removed while looking up the paths.
    onnx_model_paths = Gpt2Helper.get_onnx_paths('onnx_models',
                                                 args.model_name_or_path,
                                                 new_folder=args.use_external_data_format,
                                                 remove_existing=[])
    last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"])

    for op_block_list in ([], ["Add"], ["LayerNormalization", "Add"]):
        run_candidate(task, args, last_matmul_node_name, op_block_list=op_block_list)
|
||||
|
||||
|
||||
def run_parity(task: ParityTask, args):
    """Run the baselines and the mixed precision experiments.

    The FP32 baseline always runs. Everything else requires ``args.use_gpu``. With ``args.all`` the
    tuning steps 0, 1 and 2 run; otherwise two fixed candidates run instead. A few further
    candidates run in both cases.

    Args:
        task: Task used to run each experiment.
        args: Parsed arguments; ``model_name_or_path``, ``use_external_data_format``, ``use_gpu``
            and ``all`` are read here.
    """
    # remove_existing=[] so that no existing model directory is removed while looking up the paths.
    onnx_model_paths = Gpt2Helper.get_onnx_paths('onnx_models',
                                                 args.model_name_or_path,
                                                 new_folder=args.use_external_data_format,
                                                 remove_existing=[])

    fp32_baseline, fp16_baseline = get_baselines(args)

    task.run(fp32_baseline, "FP32 baseline")

    # The following tests for fp16 requires GPU
    if not args.use_gpu:
        logger.info("skip mixed precision since --use_gpu is not specified")
        return

    task.run(fp16_baseline, "FP16 baseline")

    last_matmul_node_name = get_last_matmul_node_name(onnx_model_paths["raw"])

    # Mixed precision baseline
    run_candidate(task, args, last_matmul_node_name, op_block_list=[])

    # Result from tuning step 1
    run_candidate(task, args, last_matmul_node_name, op_block_list=["Add"])

    if args.all:
        run_tuning_step0(task, fp16_baseline)
        mixed_precision_baseline = get_mixed_precision_parameters(args, last_matmul_node_name, op_block_list=[])
        run_tuning_step1(task, mixed_precision_baseline)
        run_tuning_step2(task, mixed_precision_baseline)
    else:
        run_candidate(task, args, last_matmul_node_name, op_block_list=["LayerNormalization", "Add"])
        run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu", "Add"])

    # Run a few good candidates
    run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu", "LayerNormalization", "Add"])
    run_candidate(task, args, last_matmul_node_name, op_block_list=["FastGelu", "LayerNormalization", "Add", "Gather"])
    run_candidate(task,
                  args,
                  last_matmul_node_name,
                  op_block_list=["FastGelu", "LayerNormalization", "Add", "Gather", "MatMul"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Script entry: run the experiments (unless --skip_test), then rank them with significance tests.
    args = parse_arguments()
    setup_logger(args.verbose)

    if args.test_cases < 100 or args.runs < 20 or args.test_cases * args.runs < 10000:
        logger.warning(
            "Not enough test cases or runs to get stable results or test significance. Recommend test_cases >= 100, runs >= 20, test_cases * runs >= 10000."
        )

    task = ParityTask(args.test_cases, args.runs, args.csv)

    if not args.skip_test:
        if (os.getenv('ORT_CUDA_GEMM_OPTIONS') == "4" and args.use_gpu):
            assert torch.cuda.get_device_capability(
            )[0] >= 7, "half2 kernel is not avaiable in current GPU device. Please set environment variable ORT_CUDA_GEMM_OPTIONS=0 or use supported GPU like V100 or T4"
            run_parity_disable_half2(task, args)
        else:
            run_parity(task, args)

    try:
        rows = load_results_from_csv(task.csv_path)
    except Exception:
        logger.exception(f"Failed to load csv {task.csv_path}")
        # Fall back to the results collected in memory during this session.
        rows = task.results

    logger.info("Start running significance tests...")
    # Insert ".stats" before the extension. A plain str.replace('.csv', ...) returns the same path
    # when the name has no ".csv", and the statistics would then overwrite the results file.
    csv_root, csv_ext = os.path.splitext(task.csv_path)
    summary_csv = f"{csv_root}.stats{csv_ext or '.csv'}"
    run_significance_test(rows, summary_csv)
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ py-cpuinfo
|
|||
py3nvml
|
||||
packaging
|
||||
transformers >= 4.0
|
||||
scipy
|
||||
|
||||
# please follow https://pytorch.org/ to install PyTorch for your OS
|
||||
torch >= 1.8
|
||||
Loading…
Reference in a new issue