# Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. import concurrent.futures import functools import os import shutil import subprocess from logger import get_logger log = get_logger("amd_hipify") contrib_ops_path = "onnxruntime/contrib_ops" providers_path = "onnxruntime/core/providers" training_ops_path = "orttraining/orttraining/training_ops" contrib_ops_excluded_files = [ "bert/attention.cc", "bert/attention_impl.cu", "bert/attention_softmax.h", "bert/decoder_attention.h", "bert/decoder_attention.cc", "bert/embed_layer_norm.cc", "bert/embed_layer_norm.h", "bert/embed_layer_norm_impl.cu", "bert/embed_layer_norm_impl.h", "bert/fast_gelu_impl.cu", "bert/fast_gelu_impl.h", "bert/fast_gelu.cc", "bert/fast_gelu.h", "bert/layer_norm.cuh", "bert/skip_layer_norm.cc", "bert/skip_layer_norm.h", "bert/skip_layer_norm_impl.cu", "bert/skip_layer_norm_impl.h", "bert/longformer_attention.cc", "bert/longformer_attention.h", "bert/longformer_attention_softmax.cu", "bert/longformer_attention_softmax.h", "bert/longformer_attention_impl.cu", "bert/longformer_attention_impl.h", "bert/longformer_global_impl.cu", "bert/longformer_global_impl.h", "bert/transformer_cuda_common.h", "bert/transformer_common.h", "bert/transformer_common.cc", "math/bias_softmax.cc", "math/bias_softmax.h", "math/bias_softmax_impl.cu", "math/complex_mul.cc", "math/complex_mul.h", "math/complex_mul_impl.cu", "math/complex_mul_impl.h", "math/cufft_plan_cache.h", "math/fft_ops.cc", "math/fft_ops.h", "math/fft_ops_impl.cu", "math/fft_ops_impl.h", "quantization/attention_quantization.cc", "quantization/attention_quantization.h", "quantization/attention_quantization_impl.cu", "quantization/attention_quantization_impl.cuh", "quantization/quantize_dequantize_linear.cc", "tensor/crop.cc", "tensor/crop.h", "tensor/crop_impl.cu", "tensor/crop_impl.h", "tensor/dynamicslice.cc", "tensor/image_scaler.cc", "tensor/image_scaler.h", "tensor/image_scaler_impl.cu", "tensor/image_scaler_impl.h", "transformers/beam_search.cc", "transformers/beam_search.h", "transformers/generation_device_helper.cc", "transformers/generation_device_helper.h", "transformers/beam_search_impl.cu", "transformers/beam_search_impl.h", "transformers/greedy_search.cc", "transformers/greedy_search.h", "transformers/dump_cuda_tensor.cc", "transformers/dump_cuda_tensor.h", "conv_transpose_with_dynamic_pads.cc", "conv_transpose_with_dynamic_pads.h", "cuda_contrib_kernels.cc", "cuda_contrib_kernels.h", "inverse.cc", "fused_conv.cc", ] provider_excluded_files = [ "atomic/common.cuh", "controlflow/if.cc", "controlflow/if.h", "controlflow/loop.cc", "controlflow/loop.h", "controlflow/scan.cc", "controlflow/scan.h", "cu_inc/common.cuh", "math/einsum_utils/einsum_auxiliary_ops.cc", "math/einsum_utils/einsum_auxiliary_ops.h", "math/einsum_utils/einsum_auxiliary_ops_diagonal.cu", "math/einsum_utils/einsum_auxiliary_ops_diagonal.h", "math/einsum.cc", "math/einsum.h", "math/gemm.cc", "math/matmul.cc", "math/softmax_impl.cu", "math/softmax_warpwise_impl.cuh", "math/softmax.cc", "nn/batch_norm.cc", "nn/batch_norm.h", "nn/conv.cc", "nn/conv.h", "nn/conv_transpose.cc", "nn/conv_transpose.h", "nn/instance_norm.cc", "nn/instance_norm.h", "nn/instance_norm_impl.cu", "nn/instance_norm_impl.h", "nn/lrn.cc", "nn/lrn.h", "nn/max_pool_with_index.cu", "nn/max_pool_with_index.h", "nn/pool.cc", "nn/pool.h", "reduction/reduction_ops.cc", "rnn/cudnn_rnn_base.cc", "rnn/cudnn_rnn_base.h", "rnn/gru.cc", "rnn/gru.h", "rnn/lstm.cc", "rnn/lstm.h", "rnn/rnn.cc", "rnn/rnn.h", "rnn/rnn_impl.cu", "rnn/rnn_impl.h", "shared_inc/cuda_call.h", "shared_inc/fpgeneric.h", "cuda_allocator.cc", "cuda_allocator.h", "cuda_call.cc", "cuda_common.cc", "cuda_common.h", "cuda_execution_provider_info.cc", "cuda_execution_provider_info.h", "cuda_execution_provider.cc", "cuda_execution_provider.h", "cuda_memory_check.cc", "cuda_memory_check.h", "cuda_fence.cc", "cuda_fence.h", "cuda_fwd.h", "cuda_kernel.h", "cuda_pch.cc", "cuda_pch.h", "cuda_provider_factory.cc", "cuda_provider_factory.h", "cuda_utils.cu", "cudnn_common.cc", "cudnn_common.h", "fpgeneric.cu", "gpu_data_transfer.cc", "gpu_data_transfer.h", "integer_gemm.cc", "symbols.txt", ] training_ops_excluded_files = [ "activation/gelu_grad_impl_common.cuh", # uses custom tanh "collective/adasum_kernels.cc", "collective/adasum_kernels.h", "math/div_grad.cc", # miopen API differs from cudnn, no double type support "math/softmax_grad_impl.cu", # warp size differences "math/softmax_grad.cc", # miopen API differs from cudnn, no double type support "nn/batch_norm_grad.cc", # no double type support "nn/batch_norm_grad.h", # miopen API differs from cudnn "nn/batch_norm_internal.cc", # miopen API differs from cudnn, no double type support "nn/batch_norm_internal.h", # miopen API differs from cudnn, no double type support "nn/conv_grad.cc", "nn/conv_grad.h", "reduction/reduction_all.cc", # deterministic = true, ignore ctx setting "reduction/reduction_ops.cc", # no double type support "cuda_training_kernels.cc", "cuda_training_kernels.h", ] @functools.lru_cache(maxsize=1) def get_hipify_path(): # prefer the hipify-perl in PATH HIPIFY_PERL = shutil.which("hipify-perl") # if not found, attempt hard-coded location 1 if HIPIFY_PERL is None: print("hipify-perl not found, trying default location 1") hipify_path = "/opt/rocm/hip/bin/hipify-perl" HIPIFY_PERL = hipify_path if os.access(hipify_path, os.X_OK) else None # if not found, attempt hard-coded location 2 if HIPIFY_PERL is None: print("hipify-perl not found, trying default location 2") hipify_path = "/opt/rocm/bin/hipify-perl" HIPIFY_PERL = hipify_path if os.access(hipify_path, os.X_OK) else None # fail if HIPIFY_PERL is None: raise RuntimeError("Could not locate hipify-perl script") return HIPIFY_PERL def hipify(src_file_path, dst_file_path): dst_file_path = dst_file_path.replace("cuda", "rocm") dir_name = os.path.dirname(dst_file_path) if not os.path.exists(dir_name): os.makedirs(dir_name, exist_ok=True) # Run hipify-perl first, capture output s = subprocess.run([get_hipify_path(), src_file_path], stdout=subprocess.PIPE, universal_newlines=True).stdout # Additional exact-match replacements. # Order matters for all of the following replacements, reglardless of appearing in logical sections. s = s.replace("kCudaExecutionProvider", "kRocmExecutionProvider") s = s.replace("CUDAStreamType", "HIPStreamType") s = s.replace("kCudaStreamDefault", "kHipStreamDefault") s = s.replace("kCudaStreamCopyIn", "kHipStreamCopyIn") s = s.replace("kCudaStreamCopyOut", "kHipStreamCopyOut") s = s.replace("kTotalCudaStreams", "kTotalHipStreams") # We want rocblas interfaces, not hipblas. Also force some hipify replacements back to rocblas from hipblas. s = s.replace("CublasHandle", "RocblasHandle") s = s.replace("cublas_handle", "rocblas_handle") s = s.replace("hipblasHandle_t", "rocblas_handle") s = s.replace("hipblasDatatype_t", "rocblas_datatype") s = s.replace("HIPBLAS_STATUS_SUCCESS", "rocblas_status_success") s = s.replace("hipblasStatus_t", "rocblas_status") s = s.replace("hipblasCreate", "rocblas_create_handle") s = s.replace("hipblasDestroy", "rocblas_destroy_handle") s = s.replace("hipblasSetStream", "rocblas_set_stream") s = s.replace("HIPBLAS_OP_T", "rocblas_operation_transpose") s = s.replace("RegisterCudaContribKernels", "RegisterRocmContribKernels") s = s.replace("cudaEvent", "hipEvent") s = s.replace("CreateCudaAllocator", "CreateRocmAllocator") s = s.replace("CudaErrString", "RocmErrString") s = s.replace("CudaAsyncBuffer", "RocmAsyncBuffer") s = s.replace("CudaKernel", "RocmKernel") s = s.replace("ToCudaType", "ToHipType") s = s.replace("CudaT", "HipT") s = s.replace("CUDA_LONG", "HIP_LONG") s = s.replace("CUDA_RETURN_IF_ERROR", "HIP_RETURN_IF_ERROR") s = s.replace("CUDA_KERNEL_ASSERT", "HIP_KERNEL_ASSERT") s = s.replace("CUDA_CALL", "HIP_CALL") s = s.replace("SliceCuda", "SliceRocm") s = s.replace("thrust::cuda", "thrust::hip") s = s.replace("CudaCall", "RocmCall") s = s.replace("cuda", "rocm") # s = s.replace('Cuda', 'Rocm') s = s.replace("CUDA", "ROCM") s = s.replace("GPU_WARP_SIZE = 32", "GPU_WARP_SIZE = 64") s = s.replace("std::exp", "expf") s = s.replace("std::log", "logf") s = s.replace( "#include ", "#include \n#include ", ) s = s.replace( '#include "cub/device/device_radix_sort.cuh"', "#include \n#include ", ) s = s.replace( "#include ", "#include " ) s = s.replace( "#include ", "#include ", ) s = s.replace("#include ", "#include ") s = s.replace( "#include ", "#include ", ) s = s.replace( "#include ", "#include ", ) s = s.replace("#include ", "#include ") s = s.replace('#include "cub/util_allocator.cuh"', "#include ") s = s.replace("#include ", "#include ") s = s.replace('#include "cub/util_type.cuh"', "#include ") s = s.replace("typedef half MappedType", "typedef __half MappedType") # CUBLAS -> HIPBLAS # Note: We do not use the hipblas marshalling interfaces; use rocblas instead. # s = s.replace('CUBLAS', 'HIPBLAS') # s = s.replace('Cublas', 'Hipblas') # s = s.replace('cublas', 'hipblas') # CUBLAS -> ROCBLAS s = s.replace("CUBLAS", "ROCBLAS") s = s.replace("Cublas", "Rocblas") s = s.replace("cublas", "rocblas") # CURAND -> HIPRAND s = s.replace("CURAND", "HIPRAND") s = s.replace("Curand", "Hiprand") s = s.replace("curand", "hiprand") # NCCL -> RCCL # s = s.replace('NCCL_CALL', 'RCCL_CALL') s = s.replace("#include ", "#include ") # CUDNN -> MIOpen s = s.replace("CUDNN", "MIOPEN") s = s.replace("Cudnn", "Miopen") s = s.replace("cudnn", "miopen") # hipify seems to have a bug for MIOpen, cudnn.h -> hipDNN.h, cudnn -> hipdnn s = s.replace("#include ", "#include ") s = s.replace("hipdnn", "miopen") s = s.replace("HIPDNN_STATUS_SUCCESS", "miopenStatusSuccess") s = s.replace("HIPDNN", "MIOPEN") # CUSPARSE -> HIPSPARSE s = s.replace("CUSPARSE", "HIPSPARSE") # CUFFT -> HIPFFT s = s.replace("CUFFT", "HIPFFT") # Undo where above hipify steps went too far. s = s.replace("id, ROCM", "id, CUDA") # cuda_execution_provider.cc s = s.replace("ROCM error executing", "HIP error executing") s = s.replace("ROCM_PINNED", "CUDA_PINNED") s = s.replace("rocm_err", "hip_err") s = s.replace("RegisterHipTrainingKernels", "RegisterRocmTrainingKernels") s = s.replace("ROCM_VERSION", "CUDA_VERSION") # semantically different meanings, cannot hipify s = s.replace("__ROCM_ARCH__", "__CUDA_ARCH__") # semantically different meanings, cannot hipify # "std::log" above incorrectly changed "std::logic_error" to "logfic_error" s = s.replace("logfic_error", "std::logic_error") # Deletions s = s.replace('#include "device_atomic_functions.h"', "") # HIP atomics in main hip header already do_write = True if os.path.exists(dst_file_path): with open(dst_file_path, "r", encoding="utf-8") as fout_old: do_write = fout_old.read() != s if do_write: with open(dst_file_path, "w") as f: f.write(s) return 'Hipified: "{}" -> "{}"'.format(src_file_path, dst_file_path) else: return 'Repeated: "{}" -> "{}"'.format(src_file_path, dst_file_path) def list_files(prefix, path): all_files = [] curr_path = os.path.join(prefix, path) for root, dirs, files in os.walk(curr_path): for file in files: full_path = os.path.join(root, file) all_files.append(os.path.relpath(full_path, curr_path)) return all_files def amd_hipify(config_build_dir): # determine hipify script path now to avoid doing so concurrently in the thread pool print("Using %s" % get_hipify_path()) with concurrent.futures.ThreadPoolExecutor() as executor: cuda_path = os.path.join(contrib_ops_path, "cuda") rocm_path = os.path.join(config_build_dir, "amdgpu", contrib_ops_path, "rocm") contrib_files = list_files(cuda_path, "") contrib_results = [ executor.submit(hipify, os.path.join(cuda_path, f), os.path.join(rocm_path, f)) for f in contrib_files if f not in contrib_ops_excluded_files ] cuda_path = os.path.join(providers_path, "cuda") rocm_path = os.path.join(config_build_dir, "amdgpu", providers_path, "rocm") provider_files = list_files(cuda_path, "") provider_results = [ executor.submit(hipify, os.path.join(cuda_path, f), os.path.join(rocm_path, f)) for f in provider_files if f not in provider_excluded_files ] cuda_path = os.path.join(training_ops_path, "cuda") rocm_path = os.path.join(config_build_dir, "amdgpu", training_ops_path, "rocm") training_files = list_files(cuda_path, "") training_results = [ executor.submit(hipify, os.path.join(cuda_path, f), os.path.join(rocm_path, f)) for f in training_files if f not in training_ops_excluded_files ] # explicitly wait so that hipify warnings finish printing before logging the hipify statements concurrent.futures.wait(contrib_results) concurrent.futures.wait(provider_results) concurrent.futures.wait(training_results) for result in contrib_results: log.debug(result.result()) for result in provider_results: log.debug(result.result()) for result in training_results: log.debug(result.result()) if __name__ == "__main__": import sys amd_hipify(sys.argv[1])