mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-28 22:56:32 +00:00
Add a script for quick profile analysis (#13423)
### Description Implements a Python script for quick analysis of a generated JSON profile from ORT. ### Motivation and Context This PR implements a script that lists kernels that take up the most time in a JSON profile, from both the CPU and GPU points-of-view. The script also supports various options for CSV output, grouping of kernels wrt shape of input tensors and wrt kernel dimensions. Co-authored-by: Abhishek Udupa <abhishek.udupa@microsoft.com>
This commit is contained in:
parent
a0cc289be6
commit
8fbdc6cc46
2 changed files with 275 additions and 0 deletions
72
onnxruntime/python/tools/profile_explorer/README.md
Normal file
72
onnxruntime/python/tools/profile_explorer/README.md
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
# Profile Explorer
|
||||
|
||||
Profile Explorer is a script that makes analyzing profile outputs generated by ONNX Runtime a breeze. Simply point the script at a generated .json file and gain visibility into the aspects of the model that are taking the most time.
|
||||
|
||||
## Usage
|
||||
|
||||
For the most up-to-date options, please run `python ./profile_explorer.py --help` to see a list of all options. Here are some of the common use cases covered:
|
||||
|
||||
Find the top-5 operators that are taking the most amount of time:
|
||||
```bash
|
||||
python ./profile_explorer.py --count 5 <JSON file containing profiling data>
|
||||
```
|
||||
|
||||
Perform a shape sensitive version of the previous command:
|
||||
```bash
|
||||
python ./profile_explorer.py --count 5 --shape-sensitive <JSON file containing profiling data>
|
||||
```
|
||||
|
||||
Be sensitive to GPU kernel dimensions while performing the top-5 analysis, in addition to kernel input shapes:
|
||||
```bash
|
||||
python ./profile_explorer.py --count 5 --shape-sensitive --dimension-sensitive <JSON file containing profiling data>
|
||||
```
|
||||
|
||||
Use a custom C++ name demangler to produce readable outputs for GPU kernel names:
|
||||
```bash
|
||||
python ./profile_explorer.py --count 5 --demangler </path/to/your/favorite/demangler> <JSON file containing profiling data>
|
||||
```
|
||||
|
||||
Save output to CSV files with names prefixes `profile_output`
|
||||
```bash
|
||||
python ./profile_explorer.py --count 5 --csv profile_output <JSON file containing profiling data>
|
||||
```
|
||||
|
||||
An example run and output from the run:
|
||||
```
|
||||
python3 ./profile_explorer.py onnxruntime_profile__2022-10-23_07-01-30.json --filter '*Mul*'
|
||||
|
||||
------ Top CPU Kernel Times ------
|
||||
name duration pct count cumulative_pct cumulative_dur
|
||||
MatMul 321596 98.16 924 98.16 321596
|
||||
FusedMatMul 5670 1.73 132 99.89 327266
|
||||
Mul 369 0.11 11 100.00 327635
|
||||
|
||||
------ Top GPU Kernel Times ------
|
||||
name duration pct count cumulative_pct cumulative_dur
|
||||
Cijk_Ailk_Bljk_HHS_BH_MT256x128x16_MI32x32x8x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUASB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB8_LDL1_LRVW4_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS1_SU32_SUM3_SUS64_SCIUI1_SPO0_SRVW0_SSO0_SVW2_SNLL0_TT4_64_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16 491799 32.53 660 32.53 491799
|
||||
Cijk_Ailk_Bljk_HHS_BH_MT128x256x32_MI32x32x8x1_SE_1LDSB1_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB8_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR5_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT2_128_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM1 230208 15.23 132 47.76 722007
|
||||
void onnxruntime::contrib::rocm::BiasSoftmaxWarpForward<__half, __half, float, 9, true>(__half*, __half const*, __half const*, int, int, onnxruntime::rocm::fast_divmod) 218718 14.47 132 62.23 940725
|
||||
void onnxruntime::contrib::rocm::FastGeluKernelVec<__half, 256u, 1>(int, int, __half const*, __half const*, __half*) 137577 9.10 132 71.33 1078302
|
||||
Cijk_Ailk_Bljk_HHS_BH_MT64x128x32_MI32x32x8x1_SE_1LDSB0_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB8_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR5_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT1_64_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM1 103875 6.87 132 78.20 1182177
|
||||
void onnxruntime::rocm::_BinaryElementWise<__half, __half, __half, onnxruntime::rocm::OP_Add<__half, __half, __half>, true, false, 512, 2>(int, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<onnxruntime::rocm::fast_divmod, 8>, __half*, onnxruntime::rocm::OP_Add<__half, __half, __half> const&, int) 93086 6.16 396 84.36 1275263
|
||||
Cijk_Ailk_Bljk_HHS_BH_MT128x144x32_MI16x16x16x1_SE_1LDSB1_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB16_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR2_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT2_144_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM1 74188 4.91 132 89.27 1349451
|
||||
void onnxruntime::contrib::rocm::SkipLayerNormKernelSmall<__half, 192u, 4>(int, __half const*, __half const*, __half const*, __half const*, __half const*, __half, __half*, bool) 62288 4.12 264 93.39 1411739
|
||||
void onnxruntime::rocm::Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<2>(onnxruntime::rocm::TArray<long, 8>, void const*, onnxruntime::rocm::TArray<long, 8>, void*, long, int) 47324 3.13 396 96.52 1459063
|
||||
void onnxruntime::rocm::Transpose3DKernel<short, 32u>(onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<long, 8>, short const*, short*) 19311 1.28 132 97.80 1478374
|
||||
CopyHostToDevice 14215 0.94 206 98.74 1492589
|
||||
void onnxruntime::rocm::_GatherKernel<short>(long, long, onnxruntime::rocm::fast_divmod, onnxruntime::rocm::fast_divmod, void const*, unsigned long, short const*, short*, int) 6816 0.45 44 99.19 1499405
|
||||
void onnxruntime::rocm::cuApplyLayerNorm<__half, float, __half, false>(__half*, float*, float*, __half const*, int, int, float, __half const*, __half const*) 3477 0.23 11 99.42 1502882
|
||||
void onnxruntime::rocm::_BinaryElementWise<__half, __half, __half, onnxruntime::rocm::OP_Add<__half, __half, __half>, false, true, 512, 2>(int, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<onnxruntime::rocm::fast_divmod, 8>, __half*, onnxruntime::rocm::OP_Add<__half, __half, __half> const&, int) 2599 0.17 11 99.59 1505481
|
||||
void onnxruntime::rocm::_UnaryElementWise<__half, float, onnxruntime::rocm::OP_Cast<__half, float>, 256, 4>(__half const*, float*, onnxruntime::rocm::OP_Cast<__half, float>, int) 2161 0.14 22 99.74 1507642
|
||||
void onnxruntime::rocm::_BinaryElementWiseSimple<true, true, __half, __half, __half, onnxruntime::rocm::OP_Add<__half, __half, __half>, 512, 2>(__half const*, __half const*, __half*, onnxruntime::rocm::OP_Add<__half, __half, __half> const&, int) 2037 0.13 11 99.87 1509679
|
||||
CopyDeviceToHost 1329 0.09 2 99.96 1511008
|
||||
Cijk_Alik_Bljk_HHS_BH_MT64x32x64_MI16x16x16x1_SE_1LDSB0_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA16_LPB16_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR5_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT2_16_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM15 152 0.01 11 99.97 1511160
|
||||
void onnxruntime::rocm::_SliceKernel<false, 2, 256, 4, long>(onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<onnxruntime::rocm::fast_divmod, 8>, long const*, long*, int) 109 0.01 22 99.98 1511269
|
||||
void onnxruntime::rocm::ExpandKernel2D<long>(int, long const*, long*, onnxruntime::rocm::fast_divmod, int, int) 89 0.01 11 99.98 1511358
|
||||
Cijk_Ailk_Bljk_HHS_BH_MT64x16x32_MI16x16x16x1_SE_1LDSB0_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB16_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR2_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT1_16_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM15 83 0.01 11 99.99 1511441
|
||||
void onnxruntime::rocm::_Fill<short, 256, 4>(short*, short, int) 52 0.00 11 99.99 1511493
|
||||
void onnxruntime::rocm::_UnaryElementWise<__half, __half, onnxruntime::rocm::OP_Tanh<__half>, 256, 4>(__half const*, __half*, onnxruntime::rocm::OP_Tanh<__half>, int) 51 0.00 11 99.99 1511544
|
||||
void onnxruntime::rocm::_BinaryElementWiseSimple<true, false, __half, __half, __half, onnxruntime::rocm::OP_Mul<__half, __half, __half>, 512, 2>(__half const*, __half const*, __half*, onnxruntime::rocm::OP_Mul<__half, __half, __half> const&, int) 46 0.00 11 100.00 1511590
|
||||
void onnxruntime::rocm::_BinaryElementWiseSimple<false, true, __half, __half, __half, onnxruntime::rocm::OP_Sub<__half, __half, __half>, 512, 2>(__half const*, __half const*, __half*, onnxruntime::rocm::OP_Sub<__half, __half, __half> const&, int) 45 0.00 11 100.00 1511635
|
||||
void onnxruntime::rocm::_Fill<__half, 256, 4>(__half*, __half, int) 4 0.00 1 100.00 1511639
|
||||
```
|
||||
203
onnxruntime/python/tools/profile_explorer/profile_explorer.py
Normal file
203
onnxruntime/python/tools/profile_explorer/profile_explorer.py
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import argparse
|
||||
import fnmatch
|
||||
import json
|
||||
import subprocess as sp
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def _demangle(name, demangler="c++filt"):
|
||||
try:
|
||||
with sp.Popen([demangler, name], stdin=sp.PIPE, stdout=sp.PIPE) as proc:
|
||||
out, _ = proc.communicate()
|
||||
return out.decode("utf-8").strip()
|
||||
except:
|
||||
return name
|
||||
|
||||
|
||||
def _get_args():
|
||||
parser = argparse.ArgumentParser(description="onnxruntime bench tool")
|
||||
parser.add_argument("input", type=str, help="Trace input file, formatted as JSON")
|
||||
parser.add_argument(
|
||||
"--demangler",
|
||||
required=False,
|
||||
type=str,
|
||||
default="c++filt",
|
||||
help="The command to use to demangle C++ identifiers",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--shape-sensitive", action="store_true", help="Perform a shape sensitive analysis of kernel execution times"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--dimension-sensitive",
|
||||
action="store_true",
|
||||
help="Perform a kernel launch dimension sensitive analysis of kernel execution times",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--filter",
|
||||
type=str,
|
||||
nargs="+",
|
||||
action="extend",
|
||||
help="Restrict analysis to the specified identifiers, i.e., specify a filter list. Also supports UNIX-style wildcards.",
|
||||
)
|
||||
parser.add_argument("--csv", help="save data to csv")
|
||||
parser.add_argument("-c", "--count", type=int, default=40, help="list top N items")
|
||||
parser.add_argument("-v", "--verbose", action="store_true", help="verbose")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def _shape_to_string(shape):
|
||||
res = ""
|
||||
for dict_obj in shape:
|
||||
if len(dict_obj) > 1:
|
||||
raise ValueError("Unhandled type in _shape_to_string()")
|
||||
key = list(dict_obj.keys())[0]
|
||||
value = list(dict_obj.values())[0]
|
||||
if len(res) != 0:
|
||||
res += "__"
|
||||
res += f'{key}_{"x".join(str(v) for v in value)}'
|
||||
return res
|
||||
|
||||
|
||||
def _json_to_df(profile_path, filter_matcher):
|
||||
cpu_entries = []
|
||||
gpu_entries = []
|
||||
|
||||
with open(profile_path, "r", encoding="utf-8") as file_obj:
|
||||
data = json.load(file_obj)
|
||||
if isinstance(data, dict):
|
||||
data = data["traceEvents"]
|
||||
|
||||
for item in data:
|
||||
cat = item.get("cat")
|
||||
if cat is None:
|
||||
continue
|
||||
dur = item.get("dur")
|
||||
if dur is None:
|
||||
continue
|
||||
arg = item.get("args")
|
||||
if arg is None:
|
||||
continue
|
||||
op_name = arg.get("op_name")
|
||||
|
||||
name = item["name"]
|
||||
|
||||
if not filter_matcher(name) and op_name is not None and not filter_matcher(op_name):
|
||||
continue
|
||||
|
||||
if cat != "Kernel" and not name.endswith("kernel_time"):
|
||||
continue
|
||||
|
||||
block_x = arg.get("block_x", -1)
|
||||
block_y = arg.get("block_y", -1)
|
||||
block_z = arg.get("block_z", -1)
|
||||
grid_x = arg.get("grid_x", -1)
|
||||
grid_y = arg.get("grid_y", -1)
|
||||
grid_z = arg.get("grid_z", -1)
|
||||
|
||||
if cat == "Kernel":
|
||||
gpu_entries.append(
|
||||
{
|
||||
"name": name,
|
||||
"duration": dur,
|
||||
"dimensions": f"{block_x}_{block_y}_{block_z}_{grid_x}_{grid_y}_{grid_z}",
|
||||
"op_name": op_name,
|
||||
}
|
||||
)
|
||||
else:
|
||||
cpu_entries.append(
|
||||
{
|
||||
"name": item["args"]["op_name"],
|
||||
"duration": dur,
|
||||
"input_shape": _shape_to_string(item["args"]["input_type_shape"]),
|
||||
"output_shape": _shape_to_string(item["args"]["output_type_shape"]),
|
||||
}
|
||||
)
|
||||
|
||||
cpu_df = pd.DataFrame(cpu_entries)
|
||||
gpu_df = pd.DataFrame(gpu_entries)
|
||||
cpu_df["count"] = 1
|
||||
gpu_df["count"] = 1
|
||||
return cpu_df, gpu_df
|
||||
|
||||
|
||||
def _print_cpu_top_hitters(frame, args):
|
||||
if len(frame) == 0:
|
||||
print("No CPU entries found!")
|
||||
return
|
||||
top = args.count
|
||||
group_key = ["name", "input_shape"] if args.shape_sensitive else ["name"]
|
||||
frame2 = frame[["duration", "count"]].sum()
|
||||
frame["pct"] = 100 * (frame["duration"] / frame2["duration"])
|
||||
fields = group_key + ["duration", "pct", "count"]
|
||||
frame1 = frame[fields].groupby(group_key).sum().reset_index()
|
||||
frame1 = frame1.sort_values(by="duration", ascending=False)[:top]
|
||||
frame1["cumulative_pct"] = frame1["pct"].cumsum()
|
||||
frame1["cumulative_dur"] = frame1["duration"].cumsum()
|
||||
print("\n------ Top CPU Kernel Times ------")
|
||||
print(frame1.round(2).to_string(index=False))
|
||||
if args.csv:
|
||||
frame1.to_csv(f"{args.csv}_cpu_kernel_times.csv", index=False)
|
||||
|
||||
|
||||
def _print_gpu_top_hitters(frame, args):
|
||||
if len(frame) == 0:
|
||||
print("No GPU entries found!")
|
||||
return
|
||||
top = args.count
|
||||
group_key = ["name", "dimensions"] if args.dimension_sensitive else ["name"]
|
||||
frame2 = frame[["duration", "count"]].sum()
|
||||
frame["pct"] = 100 * (frame["duration"] / frame2["duration"])
|
||||
fields = group_key + ["duration", "pct", "count"]
|
||||
frame1 = frame[fields].groupby(group_key).sum().reset_index()
|
||||
frame1 = frame1.sort_values(by="duration", ascending=False)[:top]
|
||||
frame1["cumulative_pct"] = frame1["pct"].cumsum()
|
||||
frame1["cumulative_dur"] = frame1["duration"].cumsum()
|
||||
frame1["name"] = frame1["name"].apply(lambda x: _demangle(x, args.demangler))
|
||||
print("\n------ Top GPU Kernel Times ------")
|
||||
print(frame1.round(2).to_string(index=False))
|
||||
if args.csv:
|
||||
frame1.to_csv(f"{args.csv}_gpu_kernel_times.csv", index=False)
|
||||
|
||||
|
||||
def _construct_filter_matcher(args):
|
||||
if args.filter is None or len(args.filter) == 0:
|
||||
return lambda x: True
|
||||
filter_list = args.filter
|
||||
concrete_filter_set = set()
|
||||
fnmatch_filter_set = set()
|
||||
for pattern in filter_list:
|
||||
if "*" in pattern or "?" in pattern or "[" in pattern or "]" in pattern:
|
||||
fnmatch_filter_set.add(pattern)
|
||||
else:
|
||||
concrete_filter_set.add(pattern)
|
||||
|
||||
def _match_item(item):
|
||||
if item in concrete_filter_set:
|
||||
return True
|
||||
for pattern in fnmatch_filter_set:
|
||||
if fnmatch.fnmatch(item, pattern):
|
||||
return True
|
||||
return False
|
||||
|
||||
return _match_item
|
||||
|
||||
|
||||
def main():
|
||||
args = _get_args()
|
||||
filter_matcher = _construct_filter_matcher(args)
|
||||
|
||||
cpu_df, gpu_df = _json_to_df(args.input, filter_matcher)
|
||||
|
||||
pd.set_option("display.max_colwidth", 120)
|
||||
_print_cpu_top_hitters(cpu_df, args)
|
||||
_print_gpu_top_hitters(gpu_df, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Reference in a new issue