# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. import concurrent.futures import functools import os import shutil import subprocess from logger import get_logger log = get_logger("amd_hipify") contrib_ops_path = 'onnxruntime/contrib_ops' providers_path = 'onnxruntime/core/providers' training_ops_path = 'orttraining/orttraining/training_ops' contrib_ops_excluded_files = [ 'bert/attention.cc', 'bert/attention_impl.cu', 'bert/attention_softmax.h', 'bert/decoder_attention.h', 'bert/decoder_attention.cc', 'bert/embed_layer_norm.cc', 'bert/embed_layer_norm.h', 'bert/embed_layer_norm_impl.cu', 'bert/embed_layer_norm_impl.h', 'bert/fast_gelu_impl.cu', 'bert/layer_norm.cuh', 'bert/longformer_attention.cc', 'bert/longformer_attention.h', 'bert/longformer_attention_softmax.cu', 'bert/longformer_attention_softmax.h', 'bert/longformer_attention_impl.cu', 'bert/longformer_attention_impl.h', 'bert/longformer_global_impl.cu', 'bert/longformer_global_impl.h', 'bert/transformer_cuda_common.h', 'math/bias_softmax.cc', 'math/bias_softmax.h', 'math/bias_softmax_impl.cu', 'math/complex_mul.cc', 'math/complex_mul.h', 'math/complex_mul_impl.cu', 'math/complex_mul_impl.h', 'math/cufft_plan_cache.h', 'math/fft_ops.cc', 'math/fft_ops.h', 'math/fft_ops_impl.cu', 'math/fft_ops_impl.h', 'quantization/attention_quantization.cc', 'quantization/attention_quantization.h', 'quantization/attention_quantization_impl.cu', 'quantization/attention_quantization_impl.cuh', 'quantization/quantize_dequantize_linear.cc', 'tensor/crop.cc', 'tensor/crop.h', 'tensor/crop_impl.cu', 'tensor/crop_impl.h', 'tensor/dynamicslice.cc', 'tensor/image_scaler.cc', 'tensor/image_scaler.h', 'tensor/image_scaler_impl.cu', 'tensor/image_scaler_impl.h', 'transformers/beam_search.cc', 'transformers/beam_search.h', 'transformers/beam_search_device_helper.cc', 'transformers/beam_search_device_helper.h', 'transformers/beam_search_impl.cu', 'transformers/beam_search_impl.h', 'transformers/dump_cuda_tensor.cc', 'transformers/dump_cuda_tensor.h', 'conv_transpose_with_dynamic_pads.cc', 'conv_transpose_with_dynamic_pads.h', 'cuda_contrib_kernels.cc', 'cuda_contrib_kernels.h', 'inverse.cc', 'fused_conv.cc' ] provider_excluded_files = [ 'atomic/common.cuh', 'controlflow/if.cc', 'controlflow/if.h', 'controlflow/loop.cc', 'controlflow/loop.h', 'controlflow/scan.cc', 'controlflow/scan.h', 'cu_inc/common.cuh', 'math/einsum_utils/einsum_auxiliary_ops.cc', 'math/einsum_utils/einsum_auxiliary_ops.h', 'math/einsum_utils/einsum_auxiliary_ops_diagonal.cu', 'math/einsum_utils/einsum_auxiliary_ops_diagonal.h', 'math/einsum.cc', 'math/einsum.h', 'math/gemm.cc', 'math/matmul.cc', 'math/matmul_integer.cc', 'math/matmul_integer.cu', 'math/matmul_integer.cuh', 'math/matmul_integer.h', 'math/softmax_impl.cu', 'math/softmax_warpwise_impl.cuh', 'math/softmax.cc', 'nn/batch_norm.cc', 'nn/batch_norm.h', 'nn/conv.cc', 'nn/conv.h', 'nn/conv_transpose.cc', 'nn/conv_transpose.h', 'nn/instance_norm.cc', 'nn/instance_norm.h', 'nn/instance_norm_impl.cu', 'nn/instance_norm_impl.h', 'nn/lrn.cc', 'nn/lrn.h', 'nn/max_pool_with_index.cu', 'nn/max_pool_with_index.h', 'nn/pool.cc', 'nn/pool.h', 'reduction/reduction_ops.cc', 'reduction/reduction_ops.h', 'rnn/cudnn_rnn_base.cc', 'rnn/cudnn_rnn_base.h', 'rnn/gru.cc', 'rnn/gru.h', 'rnn/lstm.cc', 'rnn/lstm.h', 'rnn/rnn.cc', 'rnn/rnn.h', 'rnn/rnn_impl.cu', 'rnn/rnn_impl.h', 'shared_inc/cuda_call.h', 'shared_inc/fpgeneric.h', 'shared_inc/integer_gemm.h', 'cuda_allocator.cc', 'cuda_allocator.h', 'cuda_call.cc', 'cuda_common.cc', 'cuda_common.h', 'cuda_execution_provider_info.cc', 'cuda_execution_provider_info.h', 'cuda_execution_provider.cc', 'cuda_execution_provider.h', 'cuda_memory_check.cc', 'cuda_memory_check.h', 'cuda_fence.cc', 'cuda_fence.h', 'cuda_fwd.h', 'cuda_kernel.h', 'cuda_pch.cc', 'cuda_pch.h', 'cuda_provider_factory.cc', 'cuda_provider_factory.h', 'cuda_utils.cu', 'cudnn_common.cc', 'cudnn_common.h', 'fpgeneric.cu', 'gpu_data_transfer.cc', 'gpu_data_transfer.h', 'integer_gemm.cc', 'symbols.txt', ] training_ops_excluded_files = [ 'activation/gelu_grad_impl_common.cuh', # uses custom tanh 'collective/adasum_kernels.cc', 'collective/adasum_kernels.h', 'math/div_grad.cc', # miopen API differs from cudnn, no double type support 'math/softmax_grad_impl.cu', # warp size differences 'math/softmax_grad.cc', # miopen API differs from cudnn, no double type support 'nn/batch_norm_grad.cc', # no double type support 'nn/batch_norm_grad.h', # miopen API differs from cudnn 'nn/batch_norm_internal.cc', # miopen API differs from cudnn, no double type support 'nn/batch_norm_internal.h', # miopen API differs from cudnn, no double type support 'nn/conv_grad.cc', 'nn/conv_grad.h', 'reduction/reduction_all.cc', # deterministic = true, ignore ctx setting 'reduction/reduction_ops.cc', # no double type support 'cuda_training_kernels.cc', 'cuda_training_kernels.h', ] @functools.lru_cache(maxsize=1) def get_hipify_path(): # prefer the hipify-perl in PATH HIPIFY_PERL = shutil.which('hipify-perl') # if not found, attempt hard-coded location 1 if HIPIFY_PERL is None: print('hipify-perl not found, trying default location 1') hipify_path = '/opt/rocm/hip/bin/hipify-perl' HIPIFY_PERL = hipify_path if os.access(hipify_path, os.X_OK) else None # if not found, attempt hard-coded location 2 if HIPIFY_PERL is None: print('hipify-perl not found, trying default location 2') hipify_path = '/opt/rocm/bin/hipify-perl' HIPIFY_PERL = hipify_path if os.access(hipify_path, os.X_OK) else None # fail if HIPIFY_PERL is None: raise RuntimeError('Could not locate hipify-perl script') return HIPIFY_PERL def hipify(src_file_path, dst_file_path): dst_file_path = dst_file_path.replace('cuda', 'rocm') dir_name = os.path.dirname(dst_file_path) if not os.path.exists(dir_name): os.makedirs(dir_name, exist_ok=True) # Run hipify-perl first, capture output s = subprocess.run([get_hipify_path(), src_file_path], stdout=subprocess.PIPE, universal_newlines=True).stdout # Additional exact-match replacements. # Order matters for all of the following replacements, reglardless of appearing in logical sections. s = s.replace('kCudaExecutionProvider', 'kRocmExecutionProvider') s = s.replace('CUDAStreamType', 'HIPStreamType') s = s.replace('kCudaStreamDefault', 'kHipStreamDefault') s = s.replace('kCudaStreamCopyIn', 'kHipStreamCopyIn') s = s.replace('kCudaStreamCopyOut', 'kHipStreamCopyOut') s = s.replace('kTotalCudaStreams', 'kTotalHipStreams') # We want rocblas interfaces, not hipblas. Also force some hipify replacements back to rocblas from hipblas. s = s.replace('CublasHandle', 'RocblasHandle') s = s.replace('cublas_handle', 'rocblas_handle') s = s.replace('hipblasHandle_t', 'rocblas_handle') s = s.replace('hipblasDatatype_t', 'rocblas_datatype') s = s.replace('HIPBLAS_STATUS_SUCCESS', 'rocblas_status_success') s = s.replace('hipblasStatus_t', 'rocblas_status') s = s.replace('hipblasCreate', 'rocblas_create_handle') s = s.replace('hipblasDestroy', 'rocblas_destroy_handle') s = s.replace('hipblasSetStream', 'rocblas_set_stream') s = s.replace('HIPBLAS_OP_T', 'rocblas_operation_transpose') s = s.replace('RegisterCudaContribKernels', 'RegisterRocmContribKernels') s = s.replace('cudaEvent', 'hipEvent') s = s.replace('CreateCudaAllocator', 'CreateRocmAllocator') s = s.replace('CudaErrString', 'RocmErrString') s = s.replace('CudaAsyncBuffer', 'RocmAsyncBuffer') s = s.replace('CudaKernel', 'RocmKernel') s = s.replace('ToCudaType', 'ToHipType') s = s.replace('CudaT', 'HipT') s = s.replace('CUDA_LONG', 'HIP_LONG') s = s.replace('CUDA_RETURN_IF_ERROR', 'HIP_RETURN_IF_ERROR') s = s.replace('CUDA_KERNEL_ASSERT', 'HIP_KERNEL_ASSERT') s = s.replace('CUDA_CALL', 'HIP_CALL') s = s.replace('SliceCuda', 'SliceRocm') s = s.replace('thrust::cuda', 'thrust::hip') s = s.replace('CudaCall', 'RocmCall') s = s.replace('cuda', 'rocm') # s = s.replace('Cuda', 'Rocm') s = s.replace('CUDA', 'ROCM') s = s.replace('GPU_WARP_SIZE = 32', 'GPU_WARP_SIZE = 64') s = s.replace('std::exp', 'expf') s = s.replace('std::log', 'logf') s = s.replace('#include ', '#include \n#include ') s = s.replace('#include "cub/device/device_radix_sort.cuh"', '#include \n#include ') s = s.replace('#include ', '#include ') s = s.replace('#include ', '#include ') s = s.replace('#include ', '#include ') s = s.replace('#include ', '#include ') s = s.replace('#include ', '#include ') s = s.replace('#include ', '#include ') s = s.replace('#include "cub/util_allocator.cuh"', '#include ') s = s.replace('#include ', '#include ') s = s.replace('#include "cub/util_type.cuh"', '#include ') s = s.replace('typedef half MappedType', 'typedef __half MappedType') # CUBLAS -> HIPBLAS # Note: We do not use the hipblas marshalling interfaces; use rocblas instead. # s = s.replace('CUBLAS', 'HIPBLAS') # s = s.replace('Cublas', 'Hipblas') # s = s.replace('cublas', 'hipblas') # CUBLAS -> ROCBLAS s = s.replace('CUBLAS', 'ROCBLAS') s = s.replace('Cublas', 'Rocblas') s = s.replace('cublas', 'rocblas') # CURAND -> HIPRAND s = s.replace('CURAND', 'HIPRAND') s = s.replace('Curand', 'Hiprand') s = s.replace('curand', 'hiprand') # NCCL -> RCCL # s = s.replace('NCCL_CALL', 'RCCL_CALL') s = s.replace('#include ', '#include ') # CUDNN -> MIOpen s = s.replace('CUDNN', 'MIOPEN') s = s.replace('Cudnn', 'Miopen') s = s.replace('cudnn', 'miopen') # hipify seems to have a bug for MIOpen, cudnn.h -> hipDNN.h, cudnn -> hipdnn s = s.replace('#include ', '#include ') s = s.replace('hipdnn', 'miopen') s = s.replace('HIPDNN_STATUS_SUCCESS', 'miopenStatusSuccess') s = s.replace('HIPDNN', 'MIOPEN') # CUSPARSE -> HIPSPARSE s = s.replace('CUSPARSE', 'HIPSPARSE') # CUFFT -> HIPFFT s = s.replace('CUFFT', 'HIPFFT') # Undo where above hipify steps went too far. s = s.replace('id, ROCM', 'id, CUDA') # cuda_execution_provider.cc s = s.replace('ROCM error executing', 'HIP error executing') s = s.replace('ROCM_PINNED', 'CUDA_PINNED') s = s.replace('rocm_err', 'hip_err') s = s.replace('RegisterHipTrainingKernels', 'RegisterRocmTrainingKernels') s = s.replace('ROCM_VERSION', 'CUDA_VERSION') # semantically different meanings, cannot hipify s = s.replace('__ROCM_ARCH__', '__CUDA_ARCH__') # semantically different meanings, cannot hipify # "std::log" above incorrectly changed "std::logic_error" to "logfic_error" s = s.replace('logfic_error', 'std::logic_error') # Deletions s = s.replace('#include "device_atomic_functions.h"', '') # HIP atomics in main hip header already do_write = True if os.path.exists(dst_file_path): with open(dst_file_path, 'r', encoding='utf-8') as fout_old: do_write = fout_old.read() != s if do_write: with open(dst_file_path, 'w') as f: f.write(s) return 'Hipified: "{}" -> "{}"'.format(src_file_path, dst_file_path) else: return 'Repeated: "{}" -> "{}"'.format(src_file_path, dst_file_path) def list_files(prefix, path): all_files = [] curr_path = os.path.join(prefix, path) for root, dirs, files in os.walk(curr_path): for file in files: full_path = os.path.join(root, file) all_files.append(os.path.relpath(full_path, curr_path)) return all_files def amd_hipify(config_build_dir): # determine hipify script path now to avoid doing so concurrently in the thread pool print('Using %s' % get_hipify_path()) with concurrent.futures.ThreadPoolExecutor() as executor: cuda_path = os.path.join(contrib_ops_path, 'cuda') rocm_path = os.path.join(config_build_dir, 'amdgpu', contrib_ops_path, 'rocm') contrib_files = list_files(cuda_path, '') contrib_results = [executor.submit(hipify, os.path.join(cuda_path, f), os.path.join(rocm_path, f)) for f in contrib_files if f not in contrib_ops_excluded_files] cuda_path = os.path.join(providers_path, 'cuda') rocm_path = os.path.join(config_build_dir, 'amdgpu', providers_path, 'rocm') provider_files = list_files(cuda_path, '') provider_results = [executor.submit(hipify, os.path.join(cuda_path, f), os.path.join(rocm_path, f)) for f in provider_files if f not in provider_excluded_files] cuda_path = os.path.join(training_ops_path, 'cuda') rocm_path = os.path.join(config_build_dir, 'amdgpu', training_ops_path, 'rocm') training_files = list_files(cuda_path, '') training_results = [executor.submit(hipify, os.path.join(cuda_path, f), os.path.join(rocm_path, f)) for f in training_files if f not in training_ops_excluded_files] # explicitly wait so that hipify warnings finish printing before logging the hipify statements concurrent.futures.wait(contrib_results) concurrent.futures.wait(provider_results) concurrent.futures.wait(training_results) for result in contrib_results: log.debug(result.result()) for result in provider_results: log.debug(result.result()) for result in training_results: log.debug(result.result()) if __name__ == '__main__': import sys amd_hipify(sys.argv[1])