The ROCm EP is designed and implemented on top of AMD's GPU software stack, ROCm. Details about ROCm are at https://rocmdocs.amd.com/en/latest/. The ROCm EP is built on the following components:

1. AMD GPU programming language: HIP
2. AMD GPU HIP language runtime: amdhip64
3. BLAS: rocBLAS, hipBLAS
4. DNN: MIOpen
5. Collective communication library: RCCL
6. CUB: hipCUB
7. …

Current status: BERT-L and GPT-2 training can be run on AMD GPUs with data parallelism.

Next:

1. Make more GPU code sharable between the ROCm EP and the CUDA EP, since the HIP language and the HIP runtime API are very close to CUDA.
2. Continue improving the implementation.
3. Continue GPU kernel optimization.
4. Support model parallelism on the ROCm EP.

……

The ROCm kernels have been removed from this commit and will be submitted in a separate PR. Since the original PR was too big (~180 files), it was suggested to split it into two parts: one with the ROCm kernels and one with everything else.

Co-authored-by: Weixing Zhang <wezhan@microsoft.com>
Co-authored-by: sabreshao <sabre.shao@amd.com>
Co-authored-by: anghostcici <11013544+anghostcici@users.noreply.github.com>
Co-authored-by: Suffian Khan <sukha@microsoft.com>
Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com>
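Selecting the ROCm EP from Python is a one-line change at session creation. A minimal sketch, assuming an onnxruntime build with ROCm enabled ('model.onnx' is a placeholder path, not a file from this change):

import onnxruntime as ort

# A ROCm-enabled build should report 'ROCMExecutionProvider' here.
print(ort.get_available_providers())

# Prefer the ROCm EP; fall back to CPU for anything it does not cover.
sess = ort.InferenceSession(
    'model.onnx',
    providers=['ROCMExecutionProvider', 'CPUExecutionProvider'],
)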
# Summarize GPU time from a profiler stats CSV (columns: kernel name,
# call count, total duration in ns, average duration, percentage of
# total GPU time). Kernels are bucketed into coarse activity groups so
# hot spots are easy to compare.
import argparse
import csv

parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, required=True,
                    help='path to the profiler stats CSV')
args = parser.parse_args()


def get_gpu_lines(path):
    # Collect every data row, skipping blank lines and the header row
    # (the row whose third column is the literal 'TotalDurationNs').
    lines = []
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            if len(row) > 2 and row[2].find('TotalDurationNs') < 0:
                lines.append(row)
    return lines


# Kernel-name predicates, checked in order: the first match wins, and the
# catch-all 'misc' bucket collects everything else.
activities = [
    ('nccl', lambda x: x.find('nccl') >= 0),
    ('gemm', lambda x: x.find('Cijk_') >= 0),  # Tensile/rocBLAS GEMM kernels
    ('memcpy', lambda x: x.find('CUDA mem') >= 0),
    ('adam', lambda x: x.lower().find('adam') >= 0),
    ('lamb', lambda x: x.lower().find('lamb') >= 0 or x.lower().find('multi_tensor_apply') >= 0),
    ('dropout', lambda x: x.lower().find('dropout') >= 0 or x.find('curand') >= 0),
    ('layernorm', lambda x: x.find('LayerNorm') >= 0 or x.find('cuCompute') >= 0),
    ('reduce', lambda x: x.find('reduce') >= 0),
    ('softmax', lambda x: x.lower().find('softmax') >= 0),
    ('transpose', lambda x: x.lower().find('transpose') >= 0),
    ('element-wise', lambda x: x.lower().find('elementwise') >= 0 or x.find('DivGrad') >= 0),
    ('jit', lambda x: x.startswith('kernel_')),
    ('misc', lambda x: True),
]


def group_gpu_activity(lines):
    # Assign each row to the first activity group whose predicate matches
    # its kernel name (column 0).
    groups = {name: [] for name, _ in activities}
    for line in lines:
        for name, check in activities:
            if check(line[0]):
                groups[name].append(line)
                break
    return groups


def get_seconds(duration):
    # Column 2 is the total duration in nanoseconds; strip a stray unit
    # suffix if present and convert to seconds.
    return float(duration.replace('us', '')) / (1000.0 * 1000.0 * 1000.0)


def gpu_percent_time(entries):
    return sum(float(e[4].replace('%', '')) for e in entries)


def gpu_absolute_time(entries):
    return sum(get_seconds(e[2]) for e in entries)


def gpu_kernel_calls(entries):
    return sum(int(e[1]) for e in entries)


lines = get_gpu_lines(args.input)
groups = group_gpu_activity(lines)

# Per-group summary: distinct kernels (N), total launches, total GPU time,
# and share of overall GPU time.
for name, entries in groups.items():
    print('{}: N={}, calls={}, absolute={:.3f}s, percent={:.2f}%'.format(
        name, len(entries), gpu_kernel_calls(entries),
        gpu_absolute_time(entries), gpu_percent_time(entries)))

total = [item for group in groups.values() for item in group]
print('Total: N={}, calls={}, absolute={:.3f}s, percent={:.2f}%'.format(
    len(total), gpu_kernel_calls(total), gpu_absolute_time(total),
    gpu_percent_time(total)))
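# Example invocation (the script and CSV names below are placeholders;
# any stats CSV with name/calls/TotalDurationNs/average/percentage
# columns in that order should work):
#
#   python gpu_group_stats.py --input results.stats.csv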