# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

find_package(Python3 COMPONENTS Interpreter REQUIRED)

# GLOB patterns of files to be excluded.
# hipify() (defined below) resolves such patterns relative to
# "${REPO_ROOT}/<cuda_dir>/cuda/".
set(contrib_ops_excluded_files
  "bert/cudnn_fmha/*"
  "bert/cutlass_fmha/*"
  "bert/fastertransformer_decoder_attention/*"
  "bert/flash_attention/*"
  "bert/tensorrt_fused_multihead_attention/*"
  "bert/attention.cc"
  "bert/attention.h"
  "bert/attention_impl.cu"
  "bert/attention_softmax.h"
  "bert/attention_softmax.cu"
  "bert/attention_prepare_qkv.cu"
  "bert/attention_kernel_options.h"
  "bert/attention_kernel_options.cc"
  "bert/decoder_attention_impl.h"
  "bert/decoder_attention_impl.cu"
  "bert/decoder_masked_multihead_attention.h"
  "bert/decoder_masked_multihead_attention.cc"
  "bert/decoder_masked_self_attention.h"
  "bert/decoder_masked_self_attention.cc"
  "bert/multihead_attention.cc"
  "bert/multihead_attention.h"
  "bert/relative_attn_bias.cc"
  "bert/relative_attn_bias.h"
  "bert/relative_attn_bias_impl.cu"
  "bert/relative_attn_bias_impl.h"
  "bert/skip_layer_norm.cc"
  "bert/skip_layer_norm.h"
  "bert/skip_layer_norm_impl.cu"
  "bert/skip_layer_norm_impl.h"
  "bert/transformer_common.h"
  "bert/transformer_common.cc"
  "bert/packed_attention.h"
  "bert/packed_attention.cc"
  "bert/packed_attention_impl.h"
  "bert/packed_attention_impl.cu"
  "bert/packed_multihead_attention.h"
  "bert/packed_multihead_attention.cc"
  "bert/packed_multihead_attention_impl.h"
  "bert/packed_multihead_attention_impl.cu"
  "diffusion/group_norm_impl.cu"
  "diffusion/nhwc_conv.cc"
  "math/gemm_float8.cc"
  "math/gemm_float8.cu"
  "math/gemm_float8.h"
  "moe/*"
  "sparse/*"
  "quantization/attention_quantization.cc"
  "quantization/attention_quantization.h"
  "quantization/attention_quantization_impl.cu"
  "quantization/attention_quantization_impl.cuh"
  "quantization/dequantize_blockwise_bnb4.cuh"
  "quantization/dequantize_blockwise_bnb4.cu"
  "quantization/matmul_bnb4.cc"
  "quantization/matmul_bnb4.cuh"
  "quantization/matmul_bnb4.cu"
  "quantization/moe_quantization.h"
  "quantization/moe_quantization.cc"
  "quantization/quantize_dequantize_linear.cc"
  "quantization/qordered_ops/qordered_attention_impl.cu"
  "quantization/qordered_ops/qordered_attention_impl.h"
  "quantization/qordered_ops/qordered_attention_input_enum.h"
  "quantization/qordered_ops/qordered_attention.cc"
  "quantization/qordered_ops/qordered_attention.h"
  "quantization/qordered_ops/qordered_common.cuh"
  "quantization/qordered_ops/qordered_layer_norm.h"
  "quantization/qordered_ops/qordered_layer_norm.cc"
  "quantization/qordered_ops/qordered_layer_norm_impl.h"
  "quantization/qordered_ops/qordered_layer_norm_impl.cu"
  "quantization/qordered_ops/qordered_longformer_attention.cc"
  "quantization/qordered_ops/qordered_longformer_attention.h"
  "quantization/qordered_ops/qordered_matmul.h"
  "quantization/qordered_ops/qordered_matmul.cc"
  "quantization/qordered_ops/qordered_matmul_utils.h"
  "quantization/qordered_ops/qordered_matmul_utils.cc"
  "quantization/qordered_ops/qordered_qdq_impl.cu"
  "quantization/qordered_ops/qordered_qdq_impl.h"
  "quantization/qordered_ops/qordered_qdq.cc"
  "quantization/qordered_ops/qordered_qdq.h"
  "quantization/qordered_ops/qordered_unary_ops.h"
  "quantization/qordered_ops/qordered_unary_ops.cc"
  "quantization/qordered_ops/qordered_unary_ops_impl.h"
  "quantization/qordered_ops/qordered_unary_ops_impl.cu"
  "cuda_contrib_kernels.cc"
  "cuda_contrib_kernels.h"
  "inverse.cc"
  "fused_conv.cc"
  "bert/group_query_attention.h"
  "bert/group_query_attention.cc"
  "bert/group_query_attention_impl.h"
  "bert/group_query_attention_impl.cu"
  "collective/custom_*"
  "collective/distributed_*"
  "collective/ipc_*"
  "collective/shard*"
)

if(NOT onnxruntime_USE_NCCL)
  # These are string patterns to exclude. Do NOT use stars such as
  # collective/*.cc or *.h.
  list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc")
endif()

if(NOT onnxruntime_ENABLE_ATEN)
  list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc")
endif()

# Exclusion patterns in the same format as contrib_ops_excluded_files.
# Entries are separated by whitespace only: CMake does not treat a comma as a
# separator, so a trailing comma would become a list element of its own.
set(provider_excluded_files
  "atomic/common.cuh"
  "cu_inc/common.cuh"
  "math/einsum_utils/einsum_auxiliary_ops.cc"
  "math/einsum_utils/einsum_auxiliary_ops.h"
  "math/einsum_utils/einsum_auxiliary_ops_diagonal.cu"
  "math/einsum_utils/einsum_auxiliary_ops_diagonal.h"
  "math/einsum.cc"
  "math/einsum.h"
  "math/gemm.cc"
  "math/matmul.cc"
  "math/softmax_impl.cu"
  "math/softmax_warpwise_impl.cuh"
  "math/softmax_common.cc"
  "math/softmax_common.h"
  "math/softmax.cc"
  "math/softmax.h"
  "nn/conv.cc"
  "nn/conv.h"
  "nn/conv_transpose.cc"
  "nn/conv_transpose.h"
  "nn/pool.cc"
  "nn/pool.h"
  "reduction/reduction_ops.cc"
  "rnn/cudnn_rnn_base.cc"
  "rnn/cudnn_rnn_base.h"
  "rnn/gru.cc"
  "rnn/gru.h"
  "rnn/lstm.cc"
  "rnn/lstm.h"
  "rnn/rnn.cc"
  "rnn/rnn.h"
  "rnn/rnn_impl.cu"
  "rnn/rnn_impl.h"
  "shared_inc/cuda_call.h"
  "shared_inc/cudnn_fe_call.h"
  "shared_inc/fpgeneric.h"
  "cuda_allocator.cc"
  "cuda_allocator.h"
  "cuda_call.cc"
  "cuda_common.cc"
  "cuda_common.h"
  "cuda_execution_provider_info.cc"
  "cuda_execution_provider_info.h"
  "cuda_execution_provider.cc"
  "cuda_execution_provider.h"
  "cuda_kernel.h"
  "cuda_pch.cc"
  "cuda_pch.h"
  "cuda_profiler.cc"
  "cuda_profiler.h"
  "cuda_provider_factory.cc"
  "cuda_provider_factory.h"
  "cuda_stream_handle.cc"
  "cuda_stream_handle.h"
  "cuda_utils.cu"
  "cudnn_common.cc"
  "cudnn_common.h"
  "cudnn_fe_call.cc"
  "cupti_manager.cc"
  "cupti_manager.h"
  "fpgeneric.cu"
  "gpu_data_transfer.cc"
  "gpu_data_transfer.h"
  "integer_gemm.cc"
  "tunable/*"
  "cuda_nhwc_kernels.cc"
  "cuda_nhwc_kernels.h"
)

# Exclusion patterns in the same format as contrib_ops_excluded_files.
# Trailing comments record why an entry is excluded.
set(training_ops_excluded_files
  "activation/gelu_grad_impl_common.cuh"  # uses custom tanh
  "collective/adasum_kernels.cc"
  "collective/adasum_kernels.h"
  "math/div_grad.cc"  # miopen API differs from cudnn, no double type support
  "nn/batch_norm_grad.cc"  # no double type support
  "nn/batch_norm_grad.h"  # miopen API differs from cudnn
  "nn/batch_norm_internal.cc"  # miopen API differs from cudnn, no double type support
  "nn/batch_norm_internal.h"  # miopen API differs from cudnn, no double type support
  "nn/conv_grad.cc"
  "nn/conv_grad.h"
  "reduction/reduction_all.cc"  # deterministic = true, ignore ctx setting
  "reduction/reduction_ops.cc"  # no double type support
  "cuda_training_kernels.cc"
  "cuda_training_kernels.h"
  "nn/conv_shared.cc"
  "nn/conv_shared.h"
  "nn/conv_transpose_grad.cc"
  "nn/conv_transpose_grad.h"
)

# auto_set_source_files_hip_language(<file>...)
#
# Sets the LANGUAGE source-file property to HIP on every given file whose name
# ends in ".cu". All other files are left untouched.
function(auto_set_source_files_hip_language)
  foreach(src_file ${ARGN})
    if(src_file MATCHES ".*\\.cu$")
      set_source_files_properties(${src_file} PROPERTIES LANGUAGE HIP)
    endif()
  endforeach()
endfunction()

# hipify(<cuda_dir> <in_excluded_file_patterns>
#        <out_generated_cc_files> <out_generated_cu_files>)
#
# Registers one custom command per source file found under
# "${REPO_ROOT}/<cuda_dir>/cuda/" that runs tools/ci_build/amd_hipify.py on it
# and writes the result below "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/".
#
# Arguments:
#   cuda_dir                   Directory containing the "cuda" sub-directory.
#                              Must be relative to REPO_ROOT.
#   in_excluded_file_patterns  NAME of a list variable holding GLOB patterns,
#                              relative to "<cuda_dir>/cuda/", of files to skip.
#   out_generated_cc_files     NAME of the variable that receives, in the
#                              caller's scope, the generated files that do not
#                              end in ".cu" or ".cuh".
#   out_generated_cu_files     NAME of the variable that receives, in the
#                              caller's scope, the generated ".cu" and ".cuh"
#                              files.
#
# Example:
#   hipify("path/to/providers" provider_excluded_files my_cc_srcs my_cu_srcs)
function(hipify cuda_dir in_excluded_file_patterns out_generated_cc_files out_generated_cu_files)
  set(hipify_tool "${REPO_ROOT}/tools/ci_build/amd_hipify.py")
  set(cuda_src_root "${REPO_ROOT}/${cuda_dir}/cuda")

  # Start from empty lists: function scopes inherit the caller's variables, so
  # a same-named variable in the caller must not leak into the results.
  set(generated_cc_files "")
  set(generated_cu_files "")

  file(GLOB_RECURSE srcs CONFIGURE_DEPENDS
    "${cuda_src_root}/*.h"
    "${cuda_src_root}/*.cc"
    "${cuda_src_root}/*.cuh"
    "${cuda_src_root}/*.cu"
  )

  # Do exclusion: expand the caller's patterns into concrete file paths.
  set(excluded_file_patterns ${${in_excluded_file_patterns}})
  list(TRANSFORM excluded_file_patterns PREPEND "${cuda_src_root}/")
  file(GLOB_RECURSE excluded_srcs CONFIGURE_DEPENDS ${excluded_file_patterns})
  foreach(excluded_file ${excluded_srcs})
    message(STATUS "Excluded from hipify: ${excluded_file}")
  endforeach()
  # list(REMOVE_ITEM) needs at least one value, so skip it when no pattern
  # matched anything.
  if(NOT "${excluded_srcs}" STREQUAL "")
    list(REMOVE_ITEM srcs ${excluded_srcs})
  endif()

  foreach(cuda_file ${srcs})
    file(RELATIVE_PATH cuda_f_rel "${REPO_ROOT}" "${cuda_file}")
    # Every occurrence of "cuda" in the relative path (directory names and
    # file names alike) is renamed to "rocm".
    string(REPLACE "cuda" "rocm" rocm_f_rel "${cuda_f_rel}")
    set(f_out "${CMAKE_CURRENT_BINARY_DIR}/amdgpu/${rocm_f_rel}")

    add_custom_command(
      OUTPUT "${f_out}"
      COMMAND Python3::Interpreter "${hipify_tool}"
              --hipify_perl "${onnxruntime_HIPIFY_PERL}"
              "${cuda_file}" -o "${f_out}"
      DEPENDS "${hipify_tool}" "${cuda_file}"
      COMMENT "Hipify: ${cuda_f_rel} -> amdgpu/${rocm_f_rel}"
      VERBATIM
    )

    # .cu and .cuh files go to the "cu" list; everything else (.h, .cc) goes
    # to the "cc" list.
    if(cuda_file MATCHES ".*\\.cuh?$")
      list(APPEND generated_cu_files "${f_out}")
    else()
      list(APPEND generated_cc_files "${f_out}")
    endif()
  endforeach()

  set_source_files_properties(${generated_cc_files} PROPERTIES GENERATED TRUE)
  set_source_files_properties(${generated_cu_files} PROPERTIES GENERATED TRUE)
  auto_set_source_files_hip_language(${generated_cu_files})

  set(${out_generated_cc_files} ${generated_cc_files} PARENT_SCOPE)
  set(${out_generated_cu_files} ${generated_cu_files} PARENT_SCOPE)
endfunction()