mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-04 23:59:56 +00:00
Separating result processor out from profiler.py (#23251)
### Description Separating result processor out from profiler.py without changing the behaviors of current profile.py ### Motivation and Context Less dependency and smaller code for processing profile from other scenarios. --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
This commit is contained in:
parent
704523c2d8
commit
655b3efee4
2 changed files with 359 additions and 292 deletions
|
|
@ -0,0 +1,358 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
"""This profiler result processor print out the kernel time spent on each Node of the model.
|
||||
Example of importing profile result file from onnxruntime_perf_test:
|
||||
python profile_result_processor.py --input profile_2021-10-25_12-02-41.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
|
||||
_NODES_TYPE_CONTAINING_SUBGRAPH = frozenset(("Scan", "Loop", "If"))
|
||||
|
||||
|
||||
def parse_arguments(argv=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--input",
|
||||
required=False,
|
||||
type=str,
|
||||
help="Set the input file for reading the profile results",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--threshold",
|
||||
required=False,
|
||||
type=float,
|
||||
default=0.01,
|
||||
help="Threshold of run time ratio among all nodes. Nodes with larger ratio will show in top expensive nodes.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--provider",
|
||||
required=False,
|
||||
type=str,
|
||||
default="cuda",
|
||||
help="Execution provider to use",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--kernel_time_only",
|
||||
required=False,
|
||||
action="store_true",
|
||||
help="Only include the kernel time and no fence time",
|
||||
)
|
||||
|
||||
parser.set_defaults(kernel_time_only=False)
|
||||
|
||||
parser.add_argument("-v", "--verbose", required=False, action="store_true")
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
return parser.parse_args(argv)
|
||||
|
||||
|
||||
def load_profile_json(profile_file):
|
||||
print(f"loading profile output {profile_file} ...")
|
||||
|
||||
with open(profile_file) as opened_file:
|
||||
sess_time = json.load(opened_file)
|
||||
|
||||
assert isinstance(sess_time, list)
|
||||
return sess_time
|
||||
|
||||
|
||||
def parse_kernel_results(sess_time, threshold=0):
|
||||
"""Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
|
||||
|
||||
Args:
|
||||
sess_time (List[Dict]): profile data
|
||||
threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
|
||||
|
||||
Returns:
|
||||
List[str]: lines of string for output.
|
||||
"""
|
||||
kernel_name_to_op_name = {}
|
||||
kernel_time = {}
|
||||
kernel_freq = {}
|
||||
total = 0
|
||||
session_init = False
|
||||
for item in sess_time:
|
||||
# Skip all MemcpyHostToDevice before session_initialization
|
||||
if item["cat"] == "Session" and item["name"] == "session_initialization":
|
||||
session_init = True
|
||||
if not session_init:
|
||||
continue
|
||||
|
||||
if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
|
||||
kernel_name = item["name"]
|
||||
|
||||
op_name = item["args"]["op_name"]
|
||||
if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
|
||||
continue
|
||||
|
||||
# Handle MemcpyHostToDevice and MemcpyDeviceToHost here
|
||||
if not op_name:
|
||||
op_name = f"({kernel_name})"
|
||||
|
||||
if kernel_name in kernel_time:
|
||||
kernel_time[kernel_name] += item["dur"]
|
||||
kernel_freq[kernel_name] += 1
|
||||
else:
|
||||
kernel_time[kernel_name] = item["dur"]
|
||||
kernel_freq[kernel_name] = 1
|
||||
kernel_name_to_op_name[kernel_name] = op_name
|
||||
|
||||
total += item["dur"]
|
||||
|
||||
if not kernel_time:
|
||||
return ["No kernel record found!"]
|
||||
|
||||
# Output items with run time ratio > thresholds, and sorted by duration in the descending order.
|
||||
lines = []
|
||||
lines.append(f"\nTop expensive kernels with Time% >= {threshold * 100:.2f}:")
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
|
||||
for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
|
||||
ratio = duration / total
|
||||
if ratio < threshold:
|
||||
continue
|
||||
|
||||
calls = kernel_freq[kernel_name]
|
||||
avg_time = duration / float(calls)
|
||||
lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
|
||||
|
||||
# Group by operator
|
||||
op_time = {}
|
||||
for kernel_name, op_name in kernel_name_to_op_name.items():
|
||||
duration = kernel_time[kernel_name]
|
||||
if op_name in op_time:
|
||||
op_time[op_name] += duration
|
||||
else:
|
||||
op_time[op_name] = duration
|
||||
|
||||
lines.append("\nGroup kernel time by operator:")
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tOperator")
|
||||
for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
|
||||
ratio = duration / total
|
||||
lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
|
||||
"""Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
|
||||
|
||||
Args:
|
||||
sess_time (List[Dict]): profile data
|
||||
kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
|
||||
threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
|
||||
|
||||
Returns:
|
||||
List[str]: lines of string for output.
|
||||
"""
|
||||
node_name_list = []
|
||||
node_time = {}
|
||||
node_freq = {}
|
||||
node_provider = {}
|
||||
total = 0
|
||||
for item in sess_time:
|
||||
if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
|
||||
node_name = (
|
||||
item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
|
||||
)
|
||||
|
||||
if "provider" in item["args"]:
|
||||
if item["args"]["provider"] == "CPUExecutionProvider":
|
||||
device = "CPU"
|
||||
elif item["args"]["provider"] == "CUDAExecutionProvider":
|
||||
device = "CUDA"
|
||||
elif item["args"]["provider"] == "DmlExecutionProvider":
|
||||
device = "DML"
|
||||
|
||||
if node_name not in node_provider:
|
||||
node_provider[node_name] = device
|
||||
else:
|
||||
assert node_provider[node_name] == device
|
||||
elif kernel_time_only:
|
||||
continue
|
||||
|
||||
op_name = item["args"]["op_name"]
|
||||
if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
|
||||
continue
|
||||
|
||||
if node_name in node_time:
|
||||
node_time[node_name] += item["dur"]
|
||||
node_freq[node_name] += 1
|
||||
else:
|
||||
node_time[node_name] = item["dur"]
|
||||
node_freq[node_name] = 1
|
||||
node_name_list.append(node_name)
|
||||
|
||||
total += item["dur"]
|
||||
|
||||
# Output items in the original order.
|
||||
lines = [
|
||||
"\nNodes in the original order:",
|
||||
"-" * 64,
|
||||
"Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
|
||||
]
|
||||
before_percentage = 0.0
|
||||
for node_name in node_name_list:
|
||||
duration = node_time[node_name]
|
||||
calls = node_freq[node_name]
|
||||
avg_time = duration / float(calls)
|
||||
percentage = (duration / total) * 100.0
|
||||
provider = node_provider.get(node_name, "")
|
||||
before_percentage += percentage
|
||||
lines.append(
|
||||
f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
|
||||
)
|
||||
|
||||
# Output items with run time ratio > thresholds, and sorted by duration in the descending order.
|
||||
lines.append(f"\nTop expensive nodes with Time% >= {threshold * 100:.2f}:")
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
|
||||
for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
|
||||
ratio = duration / total
|
||||
if ratio < threshold:
|
||||
continue
|
||||
|
||||
calls = node_freq[node_name]
|
||||
avg_time = duration / float(calls)
|
||||
percentage = (duration / total) * 100.0
|
||||
provider = node_provider.get(node_name, "")
|
||||
lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def group_node_results(sess_time):
|
||||
"""Group results by operator name.
|
||||
|
||||
Args:
|
||||
sess_time (List[Dict]): profile data
|
||||
|
||||
Returns:
|
||||
List[str]: lines of string for output.
|
||||
"""
|
||||
op_kernel_time = {}
|
||||
op_kernel_records = {}
|
||||
total_kernel_time = 0
|
||||
|
||||
provider_op_kernel_time = {}
|
||||
provider_op_kernel_records = {}
|
||||
provider_kernel_time = {}
|
||||
|
||||
op_fence_time = {}
|
||||
total_fence_time = 0
|
||||
|
||||
provider_counter = {}
|
||||
for item in sess_time:
|
||||
if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
|
||||
op_name = item["args"]["op_name"]
|
||||
|
||||
# TODO: shall we have a separated group for nodes with subgraph?
|
||||
if op_name in _NODES_TYPE_CONTAINING_SUBGRAPH:
|
||||
continue
|
||||
|
||||
if "provider" not in item["args"]:
|
||||
if "fence" in item["name"]:
|
||||
if op_name in op_fence_time:
|
||||
op_fence_time[op_name] += item["dur"]
|
||||
else:
|
||||
op_fence_time[op_name] = item["dur"]
|
||||
total_fence_time += item["dur"]
|
||||
continue
|
||||
|
||||
provider = item["args"].get("provider", "")
|
||||
if provider in provider_counter:
|
||||
provider_counter[provider] += 1
|
||||
else:
|
||||
provider_counter[provider] = 1
|
||||
|
||||
key = f"{provider}:{op_name}"
|
||||
if key in provider_op_kernel_time:
|
||||
provider_op_kernel_time[key] += item["dur"]
|
||||
provider_op_kernel_records[key] += 1
|
||||
else:
|
||||
provider_op_kernel_time[key] = item["dur"]
|
||||
provider_op_kernel_records[key] = 1
|
||||
|
||||
if provider in provider_kernel_time:
|
||||
provider_kernel_time[provider] += item["dur"]
|
||||
else:
|
||||
provider_kernel_time[provider] = item["dur"]
|
||||
|
||||
if op_name in op_kernel_time:
|
||||
op_kernel_time[op_name] += item["dur"]
|
||||
op_kernel_records[op_name] += 1
|
||||
else:
|
||||
op_kernel_time[op_name] = item["dur"]
|
||||
op_kernel_records[op_name] = 1
|
||||
|
||||
total_kernel_time += item["dur"]
|
||||
|
||||
lines = ["", "Grouped by operator"]
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
|
||||
for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
|
||||
fence_time = op_fence_time.get(op_name, 0)
|
||||
kernel_time_ratio = kernel_time / total_kernel_time
|
||||
total_time = kernel_time + fence_time
|
||||
time_ratio = total_time / (total_kernel_time + total_fence_time)
|
||||
kernel_calls = op_kernel_records[op_name]
|
||||
avg_kernel_time = kernel_time / kernel_calls
|
||||
lines.append(
|
||||
f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
|
||||
)
|
||||
|
||||
lines += ["", "Grouped by provider + operator"]
|
||||
lines.append("-" * 64)
|
||||
lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
|
||||
for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
|
||||
parts = key.split(":")
|
||||
provider = parts[0]
|
||||
op_name = parts[1]
|
||||
short_ep = provider.replace("ExecutionProvider", "")
|
||||
calls = provider_op_kernel_records[key]
|
||||
avg_kernel_time = kernel_time / calls
|
||||
provider_time_ratio = kernel_time / provider_kernel_time[provider]
|
||||
lines.append(
|
||||
f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
|
||||
)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def process_results(profile_file, args):
|
||||
profile_records = load_profile_json(profile_file)
|
||||
|
||||
lines = parse_kernel_results(profile_records, args.threshold)
|
||||
|
||||
lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
|
||||
|
||||
lines += group_node_results(profile_records)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
arguments = parse_arguments()
|
||||
print("Arguments", arguments)
|
||||
|
||||
from benchmark_helper import setup_logger
|
||||
|
||||
setup_logger(arguments.verbose)
|
||||
|
||||
profile_file = arguments.input
|
||||
|
||||
results = process_results(profile_file, arguments)
|
||||
|
||||
for line in results:
|
||||
print(line)
|
||||
|
|
@ -1,5 +1,4 @@
|
|||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
import numpy
|
||||
|
|
@ -14,8 +13,6 @@ Example of importing profile result file from onnxruntime_perf_test:
|
|||
python profiler.py --input profile_2021-10-25_12-02-41.json
|
||||
"""
|
||||
|
||||
NODES_TYPE_CONTAINING_SUBGRAPH = ["Scan", "Loop", "If"]
|
||||
|
||||
|
||||
def parse_arguments(argv=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
|
|
@ -176,283 +173,6 @@ def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_n
|
|||
return profile_file
|
||||
|
||||
|
||||
def load_profile_json(profile_file):
|
||||
print(f"loading profile output {profile_file} ...")
|
||||
|
||||
with open(profile_file) as opened_file:
|
||||
sess_time = json.load(opened_file)
|
||||
|
||||
assert isinstance(sess_time, list)
|
||||
return sess_time
|
||||
|
||||
|
||||
def parse_kernel_results(sess_time, threshold=0):
|
||||
"""Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
|
||||
|
||||
Args:
|
||||
sess_time (List[Dict]): profile data
|
||||
kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
|
||||
threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
|
||||
|
||||
Returns:
|
||||
List[str]: lines of string for output.
|
||||
"""
|
||||
kernel_name_to_op_name = {}
|
||||
kernel_time = {}
|
||||
kernel_freq = {}
|
||||
total = 0
|
||||
session_init = False
|
||||
for item in sess_time:
|
||||
# Skip all MemcpyHostToDevice before session_initialization
|
||||
if item["cat"] == "Session" and item["name"] == "session_initialization":
|
||||
session_init = True
|
||||
if not session_init:
|
||||
continue
|
||||
|
||||
if item["cat"] == "Kernel" and "dur" in item and "args" in item and "op_name" in item["args"]:
|
||||
kernel_name = item["name"]
|
||||
|
||||
op_name = item["args"]["op_name"]
|
||||
if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
|
||||
continue
|
||||
|
||||
# Handle MemcpyHostToDevice and MemcpyDeviceToHost here
|
||||
if not op_name:
|
||||
op_name = f"({kernel_name})"
|
||||
|
||||
if kernel_name in kernel_time:
|
||||
kernel_time[kernel_name] += item["dur"]
|
||||
kernel_freq[kernel_name] += 1
|
||||
else:
|
||||
kernel_time[kernel_name] = item["dur"]
|
||||
kernel_freq[kernel_name] = 1
|
||||
kernel_name_to_op_name[kernel_name] = op_name
|
||||
|
||||
total += item["dur"]
|
||||
|
||||
if not kernel_time:
|
||||
return ["No kernel record found!"]
|
||||
|
||||
# Output items with run time ratio > thresholds, and sorted by duration in the descending order.
|
||||
lines = []
|
||||
lines.append(f"\nTop expensive kernels with Time% >= {threshold*100:.2f}:")
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tCalls\tAvg(μs)\tKernel")
|
||||
for kernel_name, duration in sorted(kernel_time.items(), key=lambda x: x[1], reverse=True):
|
||||
ratio = duration / total
|
||||
if ratio < threshold:
|
||||
continue
|
||||
|
||||
calls = kernel_freq[kernel_name]
|
||||
avg_time = duration / float(calls)
|
||||
lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{calls:5d}\t{avg_time:8.1f}\t{kernel_name}")
|
||||
|
||||
# Group by operator
|
||||
op_time = {}
|
||||
for kernel_name, op_name in kernel_name_to_op_name.items():
|
||||
duration = kernel_time[kernel_name]
|
||||
if op_name in op_time:
|
||||
op_time[op_name] += duration
|
||||
else:
|
||||
op_time[op_name] = duration
|
||||
|
||||
lines.append("\nGroup kernel time by operator:")
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tOperator")
|
||||
for op_name, duration in sorted(op_time.items(), key=lambda x: x[1], reverse=True):
|
||||
ratio = duration / total
|
||||
lines.append(f"{duration:10d}\t{ratio * 100.0:5.2f}\t{op_name}")
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def parse_node_results(sess_time, kernel_time_only=False, threshold=0):
|
||||
"""Parse profile data and output nodes in two sections - nodes in the original order, and top expensive nodes.
|
||||
|
||||
Args:
|
||||
sess_time (List[Dict]): profile data
|
||||
kernel_time_only (bool, optional): Only include items for kernel time. Defaults to False.
|
||||
threshold (int, optional): Minimum ratio of duration among all. Defaults to 0.
|
||||
|
||||
Returns:
|
||||
List[str]: lines of string for output.
|
||||
"""
|
||||
node_name_list = []
|
||||
node_time = {}
|
||||
node_freq = {}
|
||||
node_provider = {}
|
||||
total = 0
|
||||
for item in sess_time:
|
||||
if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
|
||||
node_name = (
|
||||
item["name"].replace("_kernel_time", "").replace("_fence_before", "").replace("_fence_after", "")
|
||||
)
|
||||
|
||||
if "provider" in item["args"]:
|
||||
if item["args"]["provider"] == "CPUExecutionProvider":
|
||||
device = "CPU"
|
||||
elif item["args"]["provider"] == "CUDAExecutionProvider":
|
||||
device = "CUDA"
|
||||
elif item["args"]["provider"] == "DmlExecutionProvider":
|
||||
device = "DML"
|
||||
|
||||
if node_name not in node_provider:
|
||||
node_provider[node_name] = device
|
||||
else:
|
||||
assert node_provider[node_name] == device
|
||||
elif kernel_time_only:
|
||||
continue
|
||||
|
||||
op_name = item["args"]["op_name"]
|
||||
if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
|
||||
continue
|
||||
|
||||
if node_name in node_time:
|
||||
node_time[node_name] += item["dur"]
|
||||
node_freq[node_name] += 1
|
||||
else:
|
||||
node_time[node_name] = item["dur"]
|
||||
node_freq[node_name] = 1
|
||||
node_name_list.append(node_name)
|
||||
|
||||
total += item["dur"]
|
||||
|
||||
# Output items in the original order.
|
||||
lines = [
|
||||
"\nNodes in the original order:",
|
||||
"-" * 64,
|
||||
"Total(μs)\tTime%\tAcc %\tAvg(μs)\tCalls\tProvider\tNode",
|
||||
]
|
||||
before_percentage = 0.0
|
||||
for node_name in node_name_list:
|
||||
duration = node_time[node_name]
|
||||
calls = node_freq[node_name]
|
||||
avg_time = duration / float(calls)
|
||||
percentage = (duration / total) * 100.0
|
||||
provider = node_provider.get(node_name, "")
|
||||
before_percentage += percentage
|
||||
lines.append(
|
||||
f"{duration:10d}\t{percentage:5.2f}\t{before_percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}"
|
||||
)
|
||||
|
||||
# Output items with run time ratio > thresholds, and sorted by duration in the descending order.
|
||||
lines.append(f"\nTop expensive nodes with Time% >= {threshold*100:.2f}:")
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tAvg(μs)\tCalls\tProvider\tNode")
|
||||
for node_name, duration in sorted(node_time.items(), key=lambda x: x[1], reverse=True):
|
||||
ratio = duration / total
|
||||
if ratio < threshold:
|
||||
continue
|
||||
|
||||
calls = node_freq[node_name]
|
||||
avg_time = duration / float(calls)
|
||||
percentage = (duration / total) * 100.0
|
||||
provider = node_provider.get(node_name, "")
|
||||
lines.append(f"{duration:10d}\t{percentage:5.2f}\t{avg_time:8.1f}\t{calls:5d}\t{provider:8s}\t{node_name}")
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def group_node_results(sess_time, kernel_time_only, use_gpu):
|
||||
"""Group results by operator name.
|
||||
|
||||
Args:
|
||||
sess_time (List[Dict]): profile data
|
||||
kernel_time_only (bool): Only include items for kernel time.
|
||||
use_gpu (bool): GPU is used in profiling or not.
|
||||
|
||||
Returns:
|
||||
List[str]: lines of string for output.
|
||||
"""
|
||||
op_kernel_time = {}
|
||||
op_kernel_records = {}
|
||||
total_kernel_time = 0
|
||||
|
||||
provider_op_kernel_time = {}
|
||||
provider_op_kernel_records = {}
|
||||
provider_kernel_time = {}
|
||||
|
||||
op_fence_time = {}
|
||||
total_fence_time = 0
|
||||
|
||||
provider_counter = {}
|
||||
for item in sess_time:
|
||||
if item["cat"] == "Node" and "dur" in item and "args" in item and "op_name" in item["args"]:
|
||||
op_name = item["args"]["op_name"]
|
||||
|
||||
# TODO: shall we have a separated group for nodes with subgraph?
|
||||
if op_name in NODES_TYPE_CONTAINING_SUBGRAPH:
|
||||
continue
|
||||
|
||||
if "provider" not in item["args"]:
|
||||
if "fence" in item["name"]:
|
||||
if op_name in op_fence_time:
|
||||
op_fence_time[op_name] += item["dur"]
|
||||
else:
|
||||
op_fence_time[op_name] = item["dur"]
|
||||
total_fence_time += item["dur"]
|
||||
continue
|
||||
|
||||
provider = item["args"].get("provider", "")
|
||||
if provider in provider_counter:
|
||||
provider_counter[provider] += 1
|
||||
else:
|
||||
provider_counter[provider] = 1
|
||||
|
||||
key = f"{provider}:{op_name}"
|
||||
if key in provider_op_kernel_time:
|
||||
provider_op_kernel_time[key] += item["dur"]
|
||||
provider_op_kernel_records[key] += 1
|
||||
else:
|
||||
provider_op_kernel_time[key] = item["dur"]
|
||||
provider_op_kernel_records[key] = 1
|
||||
|
||||
if provider in provider_kernel_time:
|
||||
provider_kernel_time[provider] += item["dur"]
|
||||
else:
|
||||
provider_kernel_time[provider] = item["dur"]
|
||||
|
||||
if op_name in op_kernel_time:
|
||||
op_kernel_time[op_name] += item["dur"]
|
||||
op_kernel_records[op_name] += 1
|
||||
else:
|
||||
op_kernel_time[op_name] = item["dur"]
|
||||
op_kernel_records[op_name] = 1
|
||||
|
||||
total_kernel_time += item["dur"]
|
||||
|
||||
lines = ["", "Grouped by operator"]
|
||||
lines.append("-" * 64)
|
||||
lines.append("Total(μs)\tTime%\tKernel(μs)\tKernel%\tCalls\tAvgKernel(μs)\tFence(μs)\tOperator")
|
||||
for op_name, kernel_time in sorted(op_kernel_time.items(), key=lambda x: x[1], reverse=True):
|
||||
fence_time = op_fence_time.get(op_name, 0)
|
||||
kernel_time_ratio = kernel_time / total_kernel_time
|
||||
total_time = kernel_time + fence_time
|
||||
time_ratio = total_time / (total_kernel_time + total_fence_time)
|
||||
kernel_calls = op_kernel_records[op_name]
|
||||
avg_kernel_time = kernel_time / kernel_calls
|
||||
lines.append(
|
||||
f"{total_time:10d}\t{time_ratio * 100.0:5.2f}\t{kernel_time:11d}\t{kernel_time_ratio * 100.0:5.2f}\t{kernel_calls:5d}\t{avg_kernel_time:14.1f}\t{fence_time:10d}\t{op_name}"
|
||||
)
|
||||
|
||||
lines += ["", "Grouped by provider + operator"]
|
||||
lines.append("-" * 64)
|
||||
lines.append("Kernel(μs)\tProvider%\tCalls\tAvgKernel(μs)\tProvider\tOperator")
|
||||
for key, kernel_time in sorted(provider_op_kernel_time.items(), key=lambda x: x[1], reverse=True):
|
||||
parts = key.split(":")
|
||||
provider = parts[0]
|
||||
op_name = parts[1]
|
||||
short_ep = provider.replace("ExecutionProvider", "")
|
||||
calls = provider_op_kernel_records[key]
|
||||
avg_kernel_time = kernel_time / calls
|
||||
provider_time_ratio = kernel_time / provider_kernel_time[provider]
|
||||
lines.append(
|
||||
f"{kernel_time:10d}\t{provider_time_ratio * 100.0:9.2f}\t{calls:5d}\t{avg_kernel_time:14.1f}\t{short_ep:8s}\t{op_name}"
|
||||
)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def get_dim_from_type_proto(dim):
|
||||
return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) == str else None # noqa: E721
|
||||
|
||||
|
|
@ -639,18 +359,6 @@ def create_longformer_inputs(onnx_model, batch_size, sequence_length, global_len
|
|||
return all_inputs
|
||||
|
||||
|
||||
def process_results(profile_file, args):
|
||||
profile_records = load_profile_json(profile_file)
|
||||
|
||||
lines = parse_kernel_results(profile_records, args.threshold)
|
||||
|
||||
lines += parse_node_results(profile_records, args.kernel_time_only, args.threshold)
|
||||
|
||||
lines += group_node_results(profile_records, args.kernel_time_only, args.use_gpu)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def run(args):
|
||||
num_threads = args.thread_num if args.thread_num > 0 else psutil.cpu_count(logical=False)
|
||||
|
||||
|
|
@ -718,6 +426,7 @@ if __name__ == "__main__":
|
|||
profile_file = run(arguments)
|
||||
else:
|
||||
profile_file = arguments.input
|
||||
from profile_result_processor import process_results
|
||||
|
||||
results = process_results(profile_file, arguments)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue