diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 9aa8cc85ff..905f7bca1d 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -939,6 +939,9 @@ def run_onnxruntime(args, models): model_to_fail_ep = {} # model -> failing ep model_to_session = {} # models -> session creation time + if args.running_mode == "benchmark": + model_to_session = read_map_from_file(SESSION_FILE) + ep_list = [] if args.ep: ep_list.append(args.ep) @@ -995,9 +998,9 @@ def run_onnxruntime(args, models): # Set environment variables for ort-trt benchmarking if "ORT-TRT" in ep: os.environ["ORT_TENSORRT_FP16_ENABLE"] = "1" if "Fp16" in ep else "0" - os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1" os.environ["ORT_TENSORRT_MAX_WORKSPACE_SIZE"] = "4294967296" - + if args.enable_cache: + os.environ["ORT_TENSORRT_ENGINE_CACHE_ENABLE"] = "1" fp16 = False # use float16.py for cuda fp16 only @@ -1054,6 +1057,7 @@ def run_onnxruntime(args, models): # resolve providers to create session providers = ep_to_provider_list[ep] options = onnxruntime.SessionOptions() + enablement = args.graph_enablement if enablement == enable_all: options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL @@ -1066,12 +1070,15 @@ def run_onnxruntime(args, models): # create onnxruntime inference session try: - sess, _ = create_session(model_path, providers, options) + sess, second_creation_time = create_session(model_path, providers, options) except Exception as e: logger.error(e) update_fail_model_map(model_to_fail_ep, name, ep, 'runtime error', e) continue + + if second_creation_time: + model_to_session[name] = copy.deepcopy({ep + second: second_creation_time}) logger.info("start to inference {} with {} ...".format(name, ep)) logger.info(sess.get_providers()) @@ -1091,6 +1098,8 @@ def run_onnxruntime(args, models): "device": ep, "fp16": fp16, "io_binding": args.io_binding, + "graph_optimizations": args.graph_enablement, + "enable_cache": args.enable_cache, "model_name": name, "inputs": len(sess.get_inputs()), "batch_size": batch_size, @@ -1110,6 +1119,7 @@ def run_onnxruntime(args, models): continue if result: + latency_result[ep] = {} latency_result[ep]["average_latency_ms"] = result["average_latency_ms"] latency_result[ep]["latency_90_percentile"] = result["latency_90_percentile"] @@ -1241,9 +1251,7 @@ def output_details(results, csv_filename): with open(csv_filename, mode="a", newline='') as csv_file: column_names = [ - "engine", "version", "device", "fp16", "io_binding", "model_name", "inputs", "batch_size", - "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance", - "latency_90_percentile", "latency_95_percentile", "latency_99_percentile" + "engine", "version", "device", "fp16", "io_binding", "graph_optimizations", "enable_cache", "model_name", "inputs", "batch_size", "sequence_length", "datetime", "test_times", "QPS", "average_latency_ms", "latency_variance", "latency_90_percentile", "latency_95_percentile", "latency_99_percentile" ] csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) @@ -1374,10 +1382,9 @@ def output_session_creation(results, csv_filename): need_write_header = False with open(csv_filename, mode="a", newline='') as csv_file: - column_names = [model_title] - for provider in ort_provider_list: - column_names.append(provider + session_ending) - + session_1 = [p + session_ending for p in ort_provider_list] + session_2 = [p + second_session_ending for p in ort_provider_list] + column_names = [model_title] + session_1 + session_2 csv_writer = csv.writer(csv_file) @@ -1391,6 +1398,11 @@ def output_session_creation(results, csv_filename): trt_fp32_time = "" cuda_fp16_time = "" trt_fp16_time = "" + cpu_time_2 = "" + cuda_fp32_time_2 = "" + trt_fp32_time_2 = "" + cuda_fp16_time_2 = "" + trt_fp16_time_2 = "" for model_name, ep_dict in results.items(): for ep, time in ep_dict.items(): @@ -1404,6 +1416,16 @@ def output_session_creation(results, csv_filename): cuda_fp16_time = time elif ep == trt_fp16: trt_fp16_time = time + if ep == cpu + second: + cpu_time_2 = time + elif ep == cuda + second: + cuda_fp32_time_2 = time + elif ep == trt + second: + trt_fp32_time_2 = time + elif ep == cuda_fp16 + second: + cuda_fp16_time_2 = time + elif ep == trt_fp16 + second: + trt_fp16_time_2 = time else: continue @@ -1412,7 +1434,12 @@ def output_session_creation(results, csv_filename): cuda_fp32_time, trt_fp32_time, cuda_fp16_time, - trt_fp16_time] + trt_fp16_time, + cpu_time_2, + cuda_fp32_time_2, + trt_fp32_time_2, + cuda_fp16_time_2, + trt_fp16_time_2] csv_writer.writerow(row) @@ -1659,15 +1686,17 @@ def parse_arguments(): parser.add_argument("-w", "--workspace", required=False, default="/", help="Workspace to find tensorrt and perf script (with models if parsing with model file)") - parser.add_argument("--track_memory", required=False, default=True, help="Track CUDA and TRT Memory Usage") - - parser.add_argument("--io_binding", required=False, default=False, help="Bind Inputs") + parser.add_argument("-e", "--ep_list", nargs="+", required=False, default=None, help="Specify ORT Execution Providers list.") - parser.add_argument("--graph_enablement", required=False, default=enable_all, choices=[disable, basic, extended, enable_all], help="Choose graph optimization enablement.") + parser.add_argument("-z", "--track_memory", required=False, default=True, help="Track CUDA and TRT Memory Usage") - parser.add_argument("--ep", required=False, default=None, help="Specify ORT Execution Provider.") + parser.add_argument("-b", "--io_binding", required=False, default=False, help="Bind Inputs") - parser.add_argument("--ep_list", nargs="+", required=False, default=None, help="Specify ORT Execution Providers list.") + parser.add_argument("-g", "--graph_enablement", required=False, default=enable_all, choices=[disable, basic, extended, enable_all], help="Choose graph optimization enablement.") + + parser.add_argument("-n", "--enable_cache", required=False, default=True, help="Enable ORT-TRT Caching") + + parser.add_argument("--ep", required=False, default=None, help="Specify ORT Execution Provider.") parser.add_argument("--fp16", required=False, default=True, action="store_true", help="Inlcude Float16 into benchmarking.") @@ -1703,12 +1732,13 @@ def setup_logger(verbose): logging.getLogger("transformers").setLevel(logging.WARNING) def parse_models_helper(args, models): - if ".json" in args.model_source: + model_source = os.path.join(args.workspace, args.model_source) + if ".json" in model_source: logger.info("Parsing model information from file ...") - parse_models_info_from_file(args.workspace, args.model_source, models) + parse_models_info_from_file(args.workspace, model_source, models) else: logger.info("Parsing model information from directory ...") - parse_models_info_from_directory(args.model_source, models) + parse_models_info_from_directory(model_source, models) def main(): args = parse_arguments() diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py index 86186a8037..4995d82a7a 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark_wrapper.py @@ -64,7 +64,7 @@ def main(): model_list_file = os.path.join(os.getcwd(), model +'.json') write_model_info_to_file([model_info], model_list_file) - + for ep in ep_list: command = ["python3", @@ -120,6 +120,7 @@ def main(): output_metrics(model_to_metrics, os.path.join(path, benchmark_metrics_csv)) logger.info("\nSaved model metrics results to {}".format(benchmark_metrics_csv)) + elif args.running_mode == "benchmark": logger.info("\n=========================================") logger.info("======= Models/EPs session creation =======") logger.info("=========================================") @@ -129,8 +130,7 @@ def main(): pretty_print(pp, model_to_session) output_session_creation(model_to_session, os.path.join(path, benchmark_session_csv)) logger.info("\nSaved session creation results to {}".format(benchmark_session_csv)) - - elif args.running_mode == "benchmark": + logger.info("\n=========================================================") logger.info("========== Failing Models/EPs (accumulated) ==============") logger.info("==========================================================") diff --git a/onnxruntime/python/tools/tensorrt/perf/perf.sh b/onnxruntime/python/tools/tensorrt/perf/perf.sh index 90f46a305a..78d9bebc14 100755 --- a/onnxruntime/python/tools/tensorrt/perf/perf.sh +++ b/onnxruntime/python/tools/tensorrt/perf/perf.sh @@ -1,13 +1,13 @@ #!/bin/bash -while getopts d:o:m:w:e: parameter +while getopts d:o:m:e:a: parameter do case "${parameter}" in d) PERF_DIR=${OPTARG};; o) OPTION=${OPTARG};; m) MODEL_PATH=${OPTARG};; -w) WORKSPACE=${OPTARG};; e) EP_LIST=${OPTARG};; +a) OPTIONAL_ARGS=${OPTARG};; esac done @@ -15,7 +15,8 @@ done RUN_EPS="" if [ ! -z "$EP_LIST" ] then - RUN_EPS="--ep_list $EP_LIST" + RUN_EPS=" -e $EP_LIST" + OPTIONAL_ARGS=$OPTIONAL_ARGS$RUN_EPS fi # change dir if docker @@ -54,5 +55,5 @@ setup() { } setup -python3 benchmark_wrapper.py -r validate -m $MODEL_PATH -o result/$OPTION -w $WORKSPACE $RUN_EPS -python3 benchmark_wrapper.py -r benchmark -t 1200 -m $MODEL_PATH -o result/$OPTION -w $WORKSPACE $RUN_EPS +python3 benchmark_wrapper.py -r validate -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS +python3 benchmark_wrapper.py -r benchmark -t 1200 -m $MODEL_PATH -o result/$OPTION $OPTIONAL_ARGS diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 508e02780d..62e10220ba 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -41,11 +41,13 @@ model_title = 'Model' group_title = 'Group' # endings +second = "_second" csv_ending = '.csv' avg_ending = ' \nmean (ms)' percentile_ending = ' \n90th percentile (ms)' memory_ending = ' \npeak memory usage (MiB)' session_ending = ' \n session creation time (s)' +second_session_ending = ' \n second session creation time (s)' ort_provider_list = [cpu, cuda, trt, cuda_fp16, trt_fp16] provider_list = [cpu, cuda, trt, standalone_trt, cuda_fp16, trt_fp16, standalone_trt_fp16] table_headers = [model_title] + provider_list diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index 6d6c3667df..bc1f7f590c 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -4,10 +4,10 @@ import os import pandas as pd import time from azure.kusto.data import KustoConnectionStringBuilder +from azure.kusto.data.data_format import DataFormat from azure.kusto.data.helpers import dataframe_from_result_table from azure.kusto.ingest import ( IngestionProperties, - DataFormat, ReportLevel, QueuedIngestClient, ) @@ -91,7 +91,7 @@ def get_specs(specs, branch, commit_id, date_time): def get_session(session, model_group): session_columns = session.keys() - session_db_columns = [model_title] + ort_provider_list + session_db_columns = [model_title] + ort_provider_list + [p + second for p in ort_provider_list] session = adjust_columns(session, session_columns, session_db_columns, model_group) return session diff --git a/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh b/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh index d0b46153ec..dd32892590 100755 --- a/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh +++ b/onnxruntime/python/tools/tensorrt/perf/run_perf_docker.sh @@ -1,7 +1,7 @@ #!/bin/bash # Parse Arguments -while getopts d:o:m:p:e:v: parameter +while getopts d:o:m:p:e:v:a: parameter do case "${parameter}" in d) DOCKER_IMAGE=${OPTARG};; @@ -10,12 +10,11 @@ m) MODEL_PATH=${OPTARG};; p) PERF_DIR=${OPTARG};; e) EP_LIST=${OPTARG};; v) MODEL_VOLUME=${OPTARG};; +a) PERF_ARGUMENTS=${OPTARG};; esac done # Variables DOCKER_PERF_DIR='/perf/' -WORKSPACE='/' -MODEL_PATH=$WORKSPACE$MODEL_PATH -docker run --gpus all -v $PERF_DIR:$DOCKER_PERF_DIR -v $MODEL_VOLUME/$OPTION:$DOCKER_PERF_DIR$OPTION $DOCKER_IMAGE /bin/bash $DOCKER_PERF_DIR'perf.sh' -d $DOCKER_PERF_DIR -o $OPTION -m $MODEL_PATH -w $WORKSPACE -e "$EP_LIST" +docker run --gpus all -v $PERF_DIR:$DOCKER_PERF_DIR -v $MODEL_VOLUME/$OPTION:$DOCKER_PERF_DIR$OPTION $DOCKER_IMAGE /bin/bash $DOCKER_PERF_DIR'perf.sh' -d $DOCKER_PERF_DIR -o $OPTION -m $MODEL_PATH -e "$EP_LIST" "$PERF_ARGUMENTS" diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 6887e1733a..2270061e75 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -90,7 +90,10 @@ jobs: value: machine.sh - name: with_arguments - value: $(environment) -e "$(epList)" + value: $(environment) -e "$(epList)" + + - name: optional_arguments + value: -a "-a -g $(optimizeGraph) -b $(bindInputs) -n $(enableCache)" steps: @@ -127,7 +130,7 @@ jobs: displayName: 'Start Anubis Job' - ${{ each option in parameters.ModelGroups }}: - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_$(with_arguments) -o ${{option}} -m $(${{option}})' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/run_perf_$(with_arguments) -o ${{option}} -m $(${{option}}) $(optional_arguments)' displayName: '${{option}} perf' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/'