Add a script for quick profile analysis (#13423)

### Description Implements a Python script for quick analysis of a generated JSON profile from ORT. ### Motivation and Context This PR implements a script that lists kernels that take up the most time in a JSON profile, from both the CPU and GPU points-of-view. The script also supports various options for CSV output, grouping of kernels wrt shape of input tensors and wrt kernel dimensions. Co-authored-by: Abhishek Udupa <abhishek.udupa@microsoft.com>
2026-07-13 18:08:13 +00:00 · 2022-10-26 07:43:03 -07:00 · 2022-10-26 07:43:03 -07:00 · 8fbdc6cc46
commit 8fbdc6cc46
parent a0cc289be6
2 changed files with 275 additions and 0 deletions
--- a/onnxruntime/python/tools/profile_explorer/README.md
+++ b/onnxruntime/python/tools/profile_explorer/README.md
@ -0,0 +1,72 @@
+# Profile Explorer
+
+Profile Explorer is a script that makes analyzing profile outputs generated by ONNX Runtime a breeze. Simply point the script at a generated .json file and gain visibility into the aspects of the model that are taking the most time.
+
+## Usage
+
+For the most up-to-date options, please run `python ./profile_explorer.py --help` to see a list of all options. Here are some of the common use cases covered:
+
+Find the top-5 operators that are taking the most amount of time:
+```bash
+python ./profile_explorer.py --count 5 <JSON file containing profiling data>
+```
+
+Perform a shape sensitive version of the previous command:
+```bash
+python ./profile_explorer.py --count 5 --shape-sensitive <JSON file containing profiling data>
+```
+
+Be sensitive to GPU kernel dimensions while performing the top-5 analysis, in addition to kernel input shapes:
+```bash
+python ./profile_explorer.py --count 5 --shape-sensitive --dimension-sensitive <JSON file containing profiling data>
+```
+
+Use a custom C++ name demangler to produce readable outputs for GPU kernel names:
+```bash
+python ./profile_explorer.py --count 5 --demangler </path/to/your/favorite/demangler> <JSON file containing profiling data>
+```
+
+Save output to CSV files with names prefixes `profile_output`
+```bash
+python ./profile_explorer.py --count 5 --csv profile_output <JSON file containing profiling data>
+```
+
+An example run and output from the run:
+```
+python3 ./profile_explorer.py onnxruntime_profile__2022-10-23_07-01-30.json --filter '*Mul*'
+
+------ Top CPU Kernel Times ------
+       name  duration   pct  count  cumulative_pct  cumulative_dur
+     MatMul    321596 98.16    924           98.16          321596
+FusedMatMul      5670  1.73    132           99.89          327266
+        Mul       369  0.11     11          100.00          327635
+
+------ Top GPU Kernel Times ------
+                                                                                                                                                                                                                                                                                                                                                                                                                                                              name  duration   pct  count  cumulative_pct  cumulative_dur
+   Cijk_Ailk_Bljk_HHS_BH_MT256x128x16_MI32x32x8x1_SN_1LDSB0_APM1_ABV0_ACED0_AF0EM1_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUASB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB8_LDL1_LRVW4_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR1_RK0_SIA3_SS1_SU32_SUM3_SUS64_SCIUI1_SPO0_SRVW0_SSO0_SVW2_SNLL0_TT4_64_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG64_4_1_WGM16    491799 32.53    660           32.53          491799
+     Cijk_Ailk_Bljk_HHS_BH_MT128x256x32_MI32x32x8x1_SE_1LDSB1_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB8_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR5_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT2_128_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM1    230208 15.23    132           47.76          722007
+                                                                                                                                                                                                                                                                                          void onnxruntime::contrib::rocm::BiasSoftmaxWarpForward<__half, __half, float, 9, true>(__half*, __half const*, __half const*, int, int, onnxruntime::rocm::fast_divmod)    218718 14.47    132           62.23          940725
+                                                                                                                                                                                                                                                                                                                                              void onnxruntime::contrib::rocm::FastGeluKernelVec<__half, 256u, 1>(int, int, __half const*, __half const*, __half*)    137577  9.10    132           71.33         1078302
+       Cijk_Ailk_Bljk_HHS_BH_MT64x128x32_MI32x32x8x1_SE_1LDSB0_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB8_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR5_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT1_64_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM1    103875  6.87    132           78.20         1182177
+                                                                       void onnxruntime::rocm::_BinaryElementWise<__half, __half, __half, onnxruntime::rocm::OP_Add<__half, __half, __half>, true, false, 512, 2>(int, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<onnxruntime::rocm::fast_divmod, 8>, __half*, onnxruntime::rocm::OP_Add<__half, __half, __half> const&, int)     93086  6.16    396           84.36         1275263
+Cijk_Ailk_Bljk_HHS_BH_MT128x144x32_MI16x16x16x1_SE_1LDSB1_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB16_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR2_RK0_SIA3_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT2_144_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM1     74188  4.91    132           89.27         1349451
+                                                                                                                                                                                                                                                                                 void onnxruntime::contrib::rocm::SkipLayerNormKernelSmall<__half, 192u, 4>(int, __half const*, __half const*, __half const*, __half const*, __half const*, __half, __half*, bool)     62288  4.12    264           93.39         1411739
+                                                                                                                                                                                                                                                             void onnxruntime::rocm::Transpose4DKernelParallelizeMultipleElementsPerThreadInInnermostDim<2>(onnxruntime::rocm::TArray<long, 8>, void const*, onnxruntime::rocm::TArray<long, 8>, void*, long, int)     47324  3.13    396           96.52         1459063
+                                                                                                                                                                                                                                                                                                               void onnxruntime::rocm::Transpose3DKernel<short, 32u>(onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<long, 8>, short const*, short*)     19311  1.28    132           97.80         1478374
+                                                                                                                                                                                                                                                                                                                                                                                                                                                  CopyHostToDevice     14215  0.94    206           98.74         1492589
+                                                                                                                                                                                                                                                                                   void onnxruntime::rocm::_GatherKernel<short>(long, long, onnxruntime::rocm::fast_divmod, onnxruntime::rocm::fast_divmod, void const*, unsigned long, short const*, short*, int)      6816  0.45     44           99.19         1499405
+                                                                                                                                                                                                                                                                                                     void onnxruntime::rocm::cuApplyLayerNorm<__half, float, __half, false>(__half*, float*, float*, __half const*, int, int, float, __half const*, __half const*)      3477  0.23     11           99.42         1502882
+                                                                       void onnxruntime::rocm::_BinaryElementWise<__half, __half, __half, onnxruntime::rocm::OP_Add<__half, __half, __half>, false, true, 512, 2>(int, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<long, 8>, __half const*, onnxruntime::rocm::TArray<onnxruntime::rocm::fast_divmod, 8>, __half*, onnxruntime::rocm::OP_Add<__half, __half, __half> const&, int)      2599  0.17     11           99.59         1505481
+                                                                                                                                                                                                                                                                                void onnxruntime::rocm::_UnaryElementWise<__half, float, onnxruntime::rocm::OP_Cast<__half, float>, 256, 4>(__half const*, float*, onnxruntime::rocm::OP_Cast<__half, float>, int)      2161  0.14     22           99.74         1507642
+                                                                                                                                                                                                             void onnxruntime::rocm::_BinaryElementWiseSimple<true, true, __half, __half, __half, onnxruntime::rocm::OP_Add<__half, __half, __half>, 512, 2>(__half const*, __half const*, __half*, onnxruntime::rocm::OP_Add<__half, __half, __half> const&, int)      2037  0.13     11           99.87         1509679
+                                                                                                                                                                                                                                                                                                                                                                                                                                                  CopyDeviceToHost      1329  0.09      2           99.96         1511008
+    Cijk_Alik_Bljk_HHS_BH_MT64x32x64_MI16x16x16x1_SE_1LDSB0_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA16_LPB16_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR5_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW4_SSO0_SVW4_SNLL0_TT2_16_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG32_8_1_WGM15       152  0.01     11           99.97         1511160
+                                                                                                                                                                                                   void onnxruntime::rocm::_SliceKernel<false, 2, 256, 4, long>(onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<long, 8>, onnxruntime::rocm::TArray<onnxruntime::rocm::fast_divmod, 8>, long const*, long*, int)       109  0.01     22           99.98         1511269
+                                                                                                                                                                                                                                                                                                                                                   void onnxruntime::rocm::ExpandKernel2D<long>(int, long const*, long*, onnxruntime::rocm::fast_divmod, int, int)        89  0.01     11           99.98         1511358
+     Cijk_Ailk_Bljk_HHS_BH_MT64x16x32_MI16x16x16x1_SE_1LDSB0_APM1_ABV0_ACED0_AF0EM8_AF1EM1_AMAS3_ASGT_ASAE01_ASCE01_ASEM1_AAC0_BL1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW8_GSU1_GSUAMB_GLS0_ISA90a_IU1_K1_KLA_LBSPP128_LPA0_LPB16_LDL1_LRVW8_LWPMn1_LDW0_MAC_MIAV0_MDA2_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR2_PLR2_RK0_SIA3_SS0_SU0_SUM0_SUS0_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT1_16_TLDS1_USFGROn1_VAW2_VSn1_VW2_WSGRA1_WSGRB1_WS64_WG64_4_1_WGM15        83  0.01     11           99.99         1511441
+                                                                                                                                                                                                                                                                                                                                                                                                  void onnxruntime::rocm::_Fill<short, 256, 4>(short*, short, int)        52  0.00     11           99.99         1511493
+                                                                                                                                                                                                                                                                                            void onnxruntime::rocm::_UnaryElementWise<__half, __half, onnxruntime::rocm::OP_Tanh<__half>, 256, 4>(__half const*, __half*, onnxruntime::rocm::OP_Tanh<__half>, int)        51  0.00     11           99.99         1511544
+                                                                                                                                                                                                            void onnxruntime::rocm::_BinaryElementWiseSimple<true, false, __half, __half, __half, onnxruntime::rocm::OP_Mul<__half, __half, __half>, 512, 2>(__half const*, __half const*, __half*, onnxruntime::rocm::OP_Mul<__half, __half, __half> const&, int)        46  0.00     11          100.00         1511590
+                                                                                                                                                                                                            void onnxruntime::rocm::_BinaryElementWiseSimple<false, true, __half, __half, __half, onnxruntime::rocm::OP_Sub<__half, __half, __half>, 512, 2>(__half const*, __half const*, __half*, onnxruntime::rocm::OP_Sub<__half, __half, __half> const&, int)        45  0.00     11          100.00         1511635
+                                                                                                                                                                                                                                                                                                                                                                                               void onnxruntime::rocm::_Fill<__half, 256, 4>(__half*, __half, int)         4  0.00      1          100.00         1511639
+```
--- a/onnxruntime/python/tools/profile_explorer/profile_explorer.py
+++ b/onnxruntime/python/tools/profile_explorer/profile_explorer.py
@ -0,0 +1,203 @@
+#!/usr/bin/python
+
+import argparse
+import fnmatch
+import json
+import subprocess as sp
+
+import pandas as pd
+
+
+def _demangle(name, demangler="c++filt"):
+    try:
+        with sp.Popen([demangler, name], stdin=sp.PIPE, stdout=sp.PIPE) as proc:
+            out, _ = proc.communicate()
+            return out.decode("utf-8").strip()
+    except:
+        return name
+
+
+def _get_args():
+    parser = argparse.ArgumentParser(description="onnxruntime bench tool")
+    parser.add_argument("input", type=str, help="Trace input file, formatted as JSON")
+    parser.add_argument(
+        "--demangler",
+        required=False,
+        type=str,
+        default="c++filt",
+        help="The command to use to demangle C++ identifiers",
+    )
+    parser.add_argument(
+        "--shape-sensitive", action="store_true", help="Perform a shape sensitive analysis of kernel execution times"
+    )
+
+    parser.add_argument(
+        "--dimension-sensitive",
+        action="store_true",
+        help="Perform a kernel launch dimension sensitive analysis of kernel execution times",
+    )
+
+    parser.add_argument(
+        "--filter",
+        type=str,
+        nargs="+",
+        action="extend",
+        help="Restrict analysis to the specified identifiers, i.e., specify a filter list. Also supports UNIX-style wildcards.",
+    )
+    parser.add_argument("--csv", help="save data to csv")
+    parser.add_argument("-c", "--count", type=int, default=40, help="list top N items")
+    parser.add_argument("-v", "--verbose", action="store_true", help="verbose")
+    args = parser.parse_args()
+    return args
+
+
+def _shape_to_string(shape):
+    res = ""
+    for dict_obj in shape:
+        if len(dict_obj) > 1:
+            raise ValueError("Unhandled type in _shape_to_string()")
+        key = list(dict_obj.keys())[0]
+        value = list(dict_obj.values())[0]
+        if len(res) != 0:
+            res += "__"
+        res += f'{key}_{"x".join(str(v) for v in value)}'
+    return res
+
+
+def _json_to_df(profile_path, filter_matcher):
+    cpu_entries = []
+    gpu_entries = []
+
+    with open(profile_path, "r", encoding="utf-8") as file_obj:
+        data = json.load(file_obj)
+    if isinstance(data, dict):
+        data = data["traceEvents"]
+
+    for item in data:
+        cat = item.get("cat")
+        if cat is None:
+            continue
+        dur = item.get("dur")
+        if dur is None:
+            continue
+        arg = item.get("args")
+        if arg is None:
+            continue
+        op_name = arg.get("op_name")
+
+        name = item["name"]
+
+        if not filter_matcher(name) and op_name is not None and not filter_matcher(op_name):
+            continue
+
+        if cat != "Kernel" and not name.endswith("kernel_time"):
+            continue
+
+        block_x = arg.get("block_x", -1)
+        block_y = arg.get("block_y", -1)
+        block_z = arg.get("block_z", -1)
+        grid_x = arg.get("grid_x", -1)
+        grid_y = arg.get("grid_y", -1)
+        grid_z = arg.get("grid_z", -1)
+
+        if cat == "Kernel":
+            gpu_entries.append(
+                {
+                    "name": name,
+                    "duration": dur,
+                    "dimensions": f"{block_x}_{block_y}_{block_z}_{grid_x}_{grid_y}_{grid_z}",
+                    "op_name": op_name,
+                }
+            )
+        else:
+            cpu_entries.append(
+                {
+                    "name": item["args"]["op_name"],
+                    "duration": dur,
+                    "input_shape": _shape_to_string(item["args"]["input_type_shape"]),
+                    "output_shape": _shape_to_string(item["args"]["output_type_shape"]),
+                }
+            )
+
+    cpu_df = pd.DataFrame(cpu_entries)
+    gpu_df = pd.DataFrame(gpu_entries)
+    cpu_df["count"] = 1
+    gpu_df["count"] = 1
+    return cpu_df, gpu_df
+
+
+def _print_cpu_top_hitters(frame, args):
+    if len(frame) == 0:
+        print("No CPU entries found!")
+        return
+    top = args.count
+    group_key = ["name", "input_shape"] if args.shape_sensitive else ["name"]
+    frame2 = frame[["duration", "count"]].sum()
+    frame["pct"] = 100 * (frame["duration"] / frame2["duration"])
+    fields = group_key + ["duration", "pct", "count"]
+    frame1 = frame[fields].groupby(group_key).sum().reset_index()
+    frame1 = frame1.sort_values(by="duration", ascending=False)[:top]
+    frame1["cumulative_pct"] = frame1["pct"].cumsum()
+    frame1["cumulative_dur"] = frame1["duration"].cumsum()
+    print("\n------ Top CPU Kernel Times ------")
+    print(frame1.round(2).to_string(index=False))
+    if args.csv:
+        frame1.to_csv(f"{args.csv}_cpu_kernel_times.csv", index=False)
+
+
+def _print_gpu_top_hitters(frame, args):
+    if len(frame) == 0:
+        print("No GPU entries found!")
+        return
+    top = args.count
+    group_key = ["name", "dimensions"] if args.dimension_sensitive else ["name"]
+    frame2 = frame[["duration", "count"]].sum()
+    frame["pct"] = 100 * (frame["duration"] / frame2["duration"])
+    fields = group_key + ["duration", "pct", "count"]
+    frame1 = frame[fields].groupby(group_key).sum().reset_index()
+    frame1 = frame1.sort_values(by="duration", ascending=False)[:top]
+    frame1["cumulative_pct"] = frame1["pct"].cumsum()
+    frame1["cumulative_dur"] = frame1["duration"].cumsum()
+    frame1["name"] = frame1["name"].apply(lambda x: _demangle(x, args.demangler))
+    print("\n------ Top GPU Kernel Times ------")
+    print(frame1.round(2).to_string(index=False))
+    if args.csv:
+        frame1.to_csv(f"{args.csv}_gpu_kernel_times.csv", index=False)
+
+
+def _construct_filter_matcher(args):
+    if args.filter is None or len(args.filter) == 0:
+        return lambda x: True
+    filter_list = args.filter
+    concrete_filter_set = set()
+    fnmatch_filter_set = set()
+    for pattern in filter_list:
+        if "*" in pattern or "?" in pattern or "[" in pattern or "]" in pattern:
+            fnmatch_filter_set.add(pattern)
+        else:
+            concrete_filter_set.add(pattern)
+
+    def _match_item(item):
+        if item in concrete_filter_set:
+            return True
+        for pattern in fnmatch_filter_set:
+            if fnmatch.fnmatch(item, pattern):
+                return True
+        return False
+
+    return _match_item
+
+
+def main():
+    args = _get_args()
+    filter_matcher = _construct_filter_matcher(args)
+
+    cpu_df, gpu_df = _json_to_df(args.input, filter_matcher)
+
+    pd.set_option("display.max_colwidth", 120)
+    _print_cpu_top_hitters(cpu_df, args)
+    _print_gpu_top_hitters(gpu_df, args)
+
+
+if __name__ == "__main__":
+    main()