mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-17 21:10:43 +00:00
* Add kernels for AMD GPU. This PR is mostly about GPU kernels for the ROCm EP. Because the GPU programming languages (CUDA and HIP) and their math library calls are similar, one principle of the ROCm EP design is to share CUDA kernels with ROCm as much as possible. Thus, the script amd_hipify.py has been created to convert CUDA kernels to ROCm HIP kernels automatically during the compilation phase. However, for reasons such as performance issues and syntax differences, some converted kernels need manual intervention. These kernels will be checked into the repo physically for now. To avoid manual intervention, the plan is to refactor the CUDA kernels to make them portable between the CUDA EP and the ROCm EP as much as possible. Please refer to the "HIP Porting Guide" for details. * Like LAMB, multi-tensor-apply needs to be disabled for IsAllFiniteOp and ReduceAllL2, because the current AMD GPU compiler has a performance issue with kernel parameters that are structures passed by value. * Use hipMemsetAsync and add checks on HIP calls. * Move the generated files to the build folder. Co-authored-by: Jesse Benson <jesseb@microsoft.com>
84 lines
3.1 KiB
Bash
84 lines
3.1 KiB
Bash
# Launch an onnxruntime_training_bert run in the background and follow its log.
#
# Requires bash (uses [[ ]], arrays and substring expansion).
#
# Usage:
#   <script> ngpu batch_size seq_len num_train_steps optimizer model_size \
#            training_mode[fp32|fp16] display_loss_steps \
#            gradient_accumulation_steps loss_scale gpu_name profile
#
# Exactly 12 positional arguments are required. An argument passed as an empty
# string ("") falls back to the default shown next to its assignment below.
#
# Environment:
#   OPENMPI_DIR  ${OPENMPI_DIR}/bin/mpirun is used as the launcher when ngpu
#                is not 1.
#
# Diagnostics are written to stderr; the script exits 1 on invalid arguments
# or when the log directory/file cannot be created.

if [[ "$#" -ne 12 ]]; then
  echo "Usage: $0 ngpu batch_size seq_len num_train_steps optimizer model_size training_mode[fp32|fp16] display_loss_steps gradient_accumulation_steps loss_scale gpu_name profile" >&2
  exit 1
fi

# ---- Positional arguments (default applies when the argument is empty) ----
ngpu=${1:-1}
batch_size=${2:-64}
seq_len=${3:-128}
num_train_steps=${4:-400}
optimizer=${5:-"adam"}
model_size=${6:-"large"}
training_mode=${7:-"fp32"}
display_loss_steps=${8:-1}
grad_acc=${9:-1}
loss_scale=${10:-1024}
gpu_name=${11:-"mi100"}
profile=${12:-0}

# Only two sequence lengths are supported; each one fixes the value passed
# as --max_predictions_per_seq.
case "${seq_len}" in
  128) max_predictions_per_seq=20 ;;
  512) max_predictions_per_seq=80 ;;
  *)
    echo "seq_len is not 128 or 512" >&2
    exit 1
    ;;
esac

# ---- Fixed training settings ----
lr=5e-5
warmup_ratio=0.2843
warmup_mode=Poly
effective_batch_size=$((ngpu * batch_size * grad_acc))

# Take one timestamp and derive the per-day directory name from it, so the
# run name and the log directory cannot disagree when a run starts at midnight.
time_now=$(date +%m%d%H%M)
date_now=${time_now:0:4}

# ---- Locations ----
HOME_DIR=/workspace
ORT_DIR=${HOME_DIR}/github/onnxruntime

# First 8 characters of the checked-out commit; used only to label the run.
commit=$(git -C "${ORT_DIR}" rev-parse HEAD | cut -c1-8)
if [[ -z "${commit}" ]]; then
  echo "warning: could not determine the git commit of ${ORT_DIR}" >&2
fi

case "${model_size}" in
  large)
    model_dir=${HOME_DIR}/model/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12
    ;;
  base)
    model_dir=${HOME_DIR}/model/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12
    ;;
  tiny)
    model_dir=${HOME_DIR}/model/bert-tiny-uncased_L_3_H_128_A_2_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12
    ;;
  *)
    echo "model_size is not large, base or tiny" >&2
    exit 1
    ;;
esac

data_dir=/data/wezhan/bert/${seq_len}/train
training_bert_dir=${ORT_DIR}/build/RelWithDebInfo

log_dir=${HOME_DIR}/logs/bert_${model_size}/${date_now}
run_name=bert_${model_size}_${commit}_g${ngpu}_bs${batch_size}_sl${seq_len}_steps${num_train_steps}_${optimizer}_${training_mode}_acc${grad_acc}_efbs${effective_batch_size}_${time_now}_${gpu_name}
run_dir=${log_dir}/${run_name}
log_file=${log_dir}/${run_name}.log

# mkdir -p is a no-op for existing directories and also creates ${log_dir}.
if ! mkdir -p "${run_dir}"; then
  echo "cannot create ${run_dir}" >&2
  exit 1
fi

# ---- Optional command prefixes/arguments, kept as arrays so every element
# ---- stays a single word regardless of the characters it contains.

# Multi-GPU runs are launched through mpirun with one process per GPU.
mpi_cmd=()
if [[ "${ngpu}" != 1 ]]; then
  mpi_cmd=(
    "${OPENMPI_DIR}/bin/mpirun" --allow-run-as-root -n "${ngpu}"
    -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=INIT,COLL -x NCCL_MIN_NCHANNELS=4
  )
fi

fp16_args=()
if [[ "${training_mode}" == "fp16" ]]; then
  fp16_args=(--use_mixed_precision --allreduce_in_fp16 --loss_scale "${loss_scale}")
fi

# The profiler, when requested, wraps the whole launch command.
profile_cmd=()
if [[ "${profile}" == 1 ]]; then
  case "${gpu_name}" in
    mi100)
      profile_cmd=(/opt/rocm/bin/rocprof --obj-tracking on --stats)
      ;;
    v100)
      profile_cmd=(nvprof --print-gpu-summary --log-file "${log_dir}/${run_name}-trace.log")
      ;;
    *)
      echo "warning: no profiler configured for gpu_name '${gpu_name}'; running without profiling" >&2
      ;;
  esac
fi

train_cmd=(
  "${profile_cmd[@]}"
  "${mpi_cmd[@]}"
  "${training_bert_dir}/onnxruntime_training_bert"
  --model_name "${model_dir}"
  --train_data_dir "${data_dir}"
  --test_data_dir "${data_dir}"
  --train_batch_size "${batch_size}"
  --mode train
  --num_train_steps "${num_train_steps}"
  --optimizer "${optimizer}"
  --learning_rate "${lr}"
  --warmup_ratio "${warmup_ratio}"
  --warmup_mode "${warmup_mode}"
  --gradient_accumulation_steps "${grad_acc}"
  --max_seq_length "${seq_len}"
  "--max_predictions_per_seq=${max_predictions_per_seq}"
  --use_nccl
  --lambda 0
  "${fp16_args[@]}"
  --display_loss_steps "${display_loss_steps}"
  --log_dir "${run_dir}"
)

# Create the log before launching so that tail below cannot start before the
# background job has opened the file.
if ! : > "${log_file}"; then
  echo "cannot create ${log_file}" >&2
  exit 1
fi

# Start the run in the background under nohup, with stdout and stderr
# redirected to the log file.
nohup "${train_cmd[@]}" > "${log_file}" 2>&1 &

# Follow the log in the foreground until tail is interrupted.
tail -f "${log_file}"

exit 0