mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-26 03:00:54 +00:00
### Description <!-- Describe your changes. --> This commit introduces a new vectorized AVX512F kernel, MlasReduceMaximumF32KernelAvx512F, which efficiently computes the maximum value of the supplied buffer. Additionally, microbenchmarks have been added for MlasComputeSoftmax (inplace), MlasReduceMaximumF32KernelAvx, MlasComputeSumExpF32KernelAvx512F, and MlasComputeSoftmaxOutputF32KernelAvx. ### Motivation and Context <!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. --> The goal of this commit is to enhance the performance of ReduceMaximumF32Kernel on CPUs with AVX512F instruction support. | AVX | | | AVX512 | | | -- | -- | -- | -- | -- | -- | -- | -- name | iterations | real_time | cpu_time | iterations | real_time | cpu_time | time_unit REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:3/real_time | 271277304 | 2.58095 | 2.58091 | 263338132 | 2.65661 | 2.65661 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:3/real_time | 271220477 | 2.58095 | 2.58095 | 263509929 | 2.65652 | 2.65649 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:3/real_time | 271240587 | 2.58064 | 2.58064 | 263479542 | 2.65671 | 2.65665 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:3/real_time | 271227745 | 2.58083 | 2.58079 | 263402506 | 2.65657 | 2.65657 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:3/real_time | 271255069 | 2.58073 | 2.58071 | 263463858 | 2.65682 | 2.65682 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:3/real_time | 271257174 | 2.58058 | 2.58052 | 263460120 | 2.65682 | 2.65682 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:4/real_time | 174395051 | 4.01401 | 4.01401 | 197330481 | 3.5465 | 3.54636 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:4/real_time | 174645502 | 3.99691 | 3.99691 | 197474831 | 3.54298 | 3.54278 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:4/real_time | 174523308 | 4.01391 | 4.01386 | 197389981 | 3.54518 | 3.54506 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:4/real_time | 174779200 | 
3.99874 | 3.99874 | 197519075 | 3.54227 | 3.54209 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:4/real_time | 174642874 | 4.00645 | 4.00641 | 197642101 | 3.54195 | 3.54188 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:4/real_time | 174546754 | 4.0061 | 4.00608 | 197621033 | 3.54296 | 3.54281 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:5/real_time | 162752651 | 4.30119 | 4.30114 | 215552503 | 3.24767 | 3.24752 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:5/real_time | 162717463 | 4.30123 | 4.30116 | 215541082 | 3.24711 | 3.24695 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:5/real_time | 162718819 | 4.3016 | 4.30153 | 215589239 | 3.24725 | 3.24708 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:5/real_time | 162719596 | 4.30151 | 4.30145 | 215563846 | 3.24956 | 3.24949 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:5/real_time | 162753333 | 4.30125 | 4.30125 | 215537315 | 3.24924 | 3.24908 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:5/real_time | 162752258 | 4.3014 | 4.30141 | 215526482 | 3.24744 | 3.24735 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:7/real_time | 143579660 | 4.87526 | 4.87516 | 100000000 | 5.25767 | 5.25752 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:7/real_time | 143585097 | 4.87476 | 4.87467 | 100000000 | 5.41583 | 5.41567 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:7/real_time | 143571011 | 4.87506 | 4.87503 | 182359467 | 3.83773 | 3.83764 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:7/real_time | 143587142 | 4.87487 | 4.8748 | 182397261 | 3.83807 | 3.8379 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:7/real_time | 143578465 | 4.87525 | 4.87521 | 182428602 | 3.83777 | 3.83768 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:7/real_time | 143588555 | 4.87491 | 4.87488 | 125280452 | 5.59791 | 5.59766 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:9/real_time | 284851058 | 2.43476 | 2.43476 | 156879863 | 4.42895 | 4.42884 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:9/real_time | 270700898 | 2.59031 | 2.59024 | 157953114 | 4.42995 
| 4.42968 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:9/real_time | 282871172 | 2.45385 | 2.45385 | 157801156 | 4.42817 | 4.42804 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:9/real_time | 285307738 | 2.47009 | 2.47005 | 158058507 | 4.4279 | 4.42786 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:9/real_time | 285709536 | 2.45481 | 2.45476 | 158070961 | 4.42809 | 4.42799 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:9/real_time | 285449733 | 2.47495 | 2.47491 | 158069718 | 4.45026 | 4.45017 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:11/real_time | 189213618 | 3.79684 | 3.79676 | 139459497 | 5.01882 | 5.01871 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:11/real_time | 185600468 | 3.76394 | 3.76376 | 139444892 | 5.01922 | 5.01905 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:11/real_time | 184968668 | 3.80636 | 3.80636 | 139470834 | 5.01948 | 5.01936 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:11/real_time | 183867226 | 3.80432 | 3.80427 | 139481986 | 5.01975 | 5.01944 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:11/real_time | 184301650 | 3.81634 | 3.81634 | 139452846 | 5.01983 | 5.01972 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:11/real_time | 186215795 | 3.82659 | 3.82654 | 139497736 | 5.02119 | 5.02113 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:13/real_time | 135622415 | 5.16256 | 5.16252 | 124661337 | 5.61227 | 5.61194 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:13/real_time | 135618907 | 5.15967 | 5.1596 | 124805224 | 5.6088 | 5.60854 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:13/real_time | 135612192 | 5.15506 | 5.15501 | 124803221 | 5.60901 | 5.60869 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:13/real_time | 135906082 | 5.15818 | 5.15818 | 124776601 | 5.60898 | 5.60886 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:13/real_time | 135369523 | 5.15709 | 5.15682 | 124790370 | 5.60927 | 5.60902 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:13/real_time | 135596827 | 5.1603 | 5.1603 | 124792145 | 5.61637 | 5.61614 | ns 
REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:15/real_time | 110947137 | 5.96511 | 5.96495 | 112861522 | 6.20035 | 6.20014 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:15/real_time | 118004792 | 6.22645 | 6.22628 | 112909900 | 6.20073 | 6.20073 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:15/real_time | 112630319 | 6.25564 | 6.25552 | 112874563 | 6.19932 | 6.19924 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:15/real_time | 117403034 | 6.17263 | 6.17258 | 112927318 | 6.19866 | 6.19842 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:15/real_time | 108921863 | 6.48624 | 6.48612 | 112927746 | 6.20057 | 6.20026 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:15/real_time | 110358148 | 6.66805 | 6.66789 | 112907312 | 6.19938 | 6.19908 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:16/real_time | 203419574 | 3.4415 | 3.44137 | 237134525 | 2.95649 | 2.95638 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:16/real_time | 203414035 | 3.4411 | 3.44099 | 237129564 | 2.95178 | 2.95171 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:16/real_time | 203404068 | 3.44157 | 3.44151 | 236981704 | 2.9518 | 2.95167 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:16/real_time | 203391471 | 3.44146 | 3.44137 | 237108807 | 2.95203 | 2.95196 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:16/real_time | 203393801 | 3.44131 | 3.44127 | 237126460 | 2.95278 | 2.95272 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:16/real_time | 203407476 | 3.44181 | 3.44162 | 237154444 | 2.95293 | 2.9528 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:500/real_time | 37551439 | 18.6407 | 18.6407 | 39222534 | 17.858 | 17.8571 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:500/real_time | 37544097 | 18.6404 | 18.6401 | 39174151 | 17.8539 | 17.8536 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:500/real_time | 37549837 | 18.6391 | 18.6391 | 39233956 | 17.8507 | 17.8505 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:500/real_time | 45996345 | 15.2157 | 15.2153 | 39285929 | 17.848 | 17.8474 | ns 
REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:500/real_time | 46012429 | 15.2184 | 15.2179 | 65664865 | 10.7366 | 10.7364 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:500/real_time | 45912375 | 15.2349 | 15.2346 | 65205908 | 10.8498 | 10.8492 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:4/D:2000/real_time | 9493955 | 73.7232 | 73.7203 | 10188090 | 68.7931 | 68.7908 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:8/D:2000/real_time | 9495562 | 73.7173 | 73.7173 | 10180895 | 68.7533 | 68.7511 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:16/D:2000/real_time | 9487371 | 73.7852 | 73.7831 | 10164473 | 68.7279 | 68.725 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:32/D:2000/real_time | 10816047 | 64.7322 | 64.7287 | 10168481 | 68.8109 | 68.8096 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:64/D:2000/real_time | 10808802 | 64.7232 | 64.721 | 19478320 | 36.1471 | 36.1461 | ns REDUCEMAXIMUMF32KERNEL[]/ByteAligned:128/D:2000/real_time | 10818192 | 64.7304 | 64.728 | 19419672 | 35.9635 | 35.9635 | ns
706 lines
29 KiB
CMake
706 lines
29 KiB
CMake
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
# Licensed under the MIT License.
|
|
|
|
# Locations of the MLAS tree, its implementation sources and its headers,
# all derived from ONNXRUNTIME_ROOT.
set(MLAS_ROOT "${ONNXRUNTIME_ROOT}/core/mlas")
set(MLAS_SRC_DIR "${MLAS_ROOT}/lib")
set(MLAS_INC_DIR "${MLAS_ROOT}/inc")
|
|
|
|
#
# Only hardware-agnostic sources are listed here. Hardware-specific files
# would cause trouble in a multi-target build, so they are added separately
# further down.
#
onnxruntime_add_static_library(onnxruntime_mlas
  ${MLAS_SRC_DIR}/mlasi.h
  ${MLAS_SRC_DIR}/platform.cpp
  ${MLAS_SRC_DIR}/threading.cpp
  ${MLAS_SRC_DIR}/sgemm.cpp
  ${MLAS_SRC_DIR}/halfgemm.cpp
  ${MLAS_SRC_DIR}/qgemm.cpp
  ${MLAS_SRC_DIR}/qdwconv.cpp
  ${MLAS_SRC_DIR}/convolve.cpp
  ${MLAS_SRC_DIR}/convsym.cpp
  ${MLAS_SRC_DIR}/pooling.cpp
  ${MLAS_SRC_DIR}/transpose.cpp
  ${MLAS_SRC_DIR}/reorder.cpp
  ${MLAS_SRC_DIR}/snchwc.cpp
  ${MLAS_SRC_DIR}/activate.cpp
  ${MLAS_SRC_DIR}/logistic.cpp
  ${MLAS_SRC_DIR}/tanh.cpp
  ${MLAS_SRC_DIR}/erf.cpp
  ${MLAS_SRC_DIR}/compute.cpp
  ${MLAS_SRC_DIR}/quantize.cpp
  ${MLAS_SRC_DIR}/qgemm_kernel_default.cpp
  ${MLAS_SRC_DIR}/qladd.cpp
  ${MLAS_SRC_DIR}/qlmul.cpp
  ${MLAS_SRC_DIR}/qpostprocessor.cpp
  ${MLAS_SRC_DIR}/qlgavgpool.cpp
  ${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
  ${MLAS_SRC_DIR}/sqnbitgemm.h
  ${MLAS_SRC_DIR}/sqnbitgemm.cpp
)

# Headers from the include directory are attached to the target as private sources.
target_sources(onnxruntime_mlas PRIVATE
  ${MLAS_INC_DIR}/mlas_float16.h
  ${MLAS_INC_DIR}/mlas_gemm_postprocessor.h
  ${MLAS_INC_DIR}/mlas_q4.h
  ${MLAS_INC_DIR}/mlas_qnbit.h
  ${MLAS_INC_DIR}/mlas.h
)

# The q4 sources are only compiled when this is not a minimal build.
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
  target_sources(onnxruntime_mlas PRIVATE
    ${MLAS_SRC_DIR}/q4_dq.cpp
    ${MLAS_SRC_DIR}/q4gemm.cpp
  )
endif()

# List of MLAS library targets; multi-architecture builds append to it below.
set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)
|
|
|
|
#TODO: set MASM flags properly
#
# setup_mlas_source_for_windows()
#
# Adds the MSVC-specific MLAS sources to the onnxruntime_mlas target. The
# source set is chosen from onnxruntime_target_platform:
#   ARM64 / ARM64EC - NEON C++ kernels plus .asm kernels that are run through
#                     the C preprocessor and then assembled with armasm64
#   ARM             - the C++ sgemm fallback
#   x64             - AVX/AVX2 intrinsics plus the amd64 .asm kernels
#   anything else   - the SSE C++ kernels plus the i386 .asm kernels
# Takes no arguments and sets nothing in the caller's scope.
function(setup_mlas_source_for_windows)

  #
  # Sources common for all platforms.
  #
  target_sources(onnxruntime_mlas PRIVATE
    ${MLAS_SRC_DIR}/activate_fp16.cpp
    ${MLAS_SRC_DIR}/dwconv.cpp
    ${MLAS_SRC_DIR}/pooling_fp16.cpp
  )

  #The onnxruntime_target_platform variable was added by Windows AI team in onnxruntime_common.cmake
  #Don't use it for other platforms.
  if((onnxruntime_target_platform STREQUAL "ARM64") OR (onnxruntime_target_platform STREQUAL "ARM64EC"))
    set(PREPROCESS_ARMASM_FLAGS "")
    set(ARMASM_FLAGS "")

    if(onnxruntime_target_platform STREQUAL "ARM64")
      target_sources(onnxruntime_mlas PRIVATE
        ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
        ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
        ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
        ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
        ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
      )

      set(mlas_platform_preprocess_srcs
        ${MLAS_SRC_DIR}/arm64/ConvSymS8KernelDot.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymS8KernelDotLd64.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymU8KernelDot.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymS8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymU8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/DepthwiseQConvSymS8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/DepthwiseQConvSymU8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/DepthwiseQConvKernelSize9Neon.asm
        ${MLAS_SRC_DIR}/arm64/HalfGemmKernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelUdot.asm
        ${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelSdot.asm
        ${MLAS_SRC_DIR}/arm64/SgemmKernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/SgemvKernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDot.asm
        ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
      )
    else()
      target_sources(onnxruntime_mlas PRIVATE
        ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
      )

      set(mlas_platform_preprocess_srcs
        ${MLAS_SRC_DIR}/arm64ec/QgemmU8X8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64ec/SgemmKernelNeon.asm
      )

      string(APPEND PREPROCESS_ARMASM_FLAGS " /arm64EC")
      string(APPEND ARMASM_FLAGS " -machine ARM64EC")
    endif()

    # Split the flag strings into argument lists so that each flag reaches
    # the tool as a separate command-line argument.
    separate_arguments(PREPROCESS_ARMASM_FLAGS NATIVE_COMMAND "${PREPROCESS_ARMASM_FLAGS}")
    separate_arguments(ARMASM_FLAGS NATIVE_COMMAND "${ARMASM_FLAGS}")

    # Run the C preprocessor on each input before the assembler.
    foreach(asm_filename ${mlas_platform_preprocess_srcs})
      get_filename_component(asm_filename_base ${asm_filename} NAME_WLE)
      set(preprocess_filename ${CMAKE_CURRENT_BINARY_DIR}/${asm_filename_base}.i)
      set(obj_filename ${CMAKE_CURRENT_BINARY_DIR}/${asm_filename_base}.obj)
      # "-g" is requested through $<CONFIG:Debug> rather than CMAKE_BUILD_TYPE
      # because CMAKE_BUILD_TYPE is empty under multi-config generators.
      # COMMAND_EXPAND_LISTS drops the argument entirely when the expression
      # evaluates to an empty string.
      add_custom_command(
        OUTPUT ${obj_filename}
        COMMAND
          cl.exe ${PREPROCESS_ARMASM_FLAGS} /P ${asm_filename} /Fi${preprocess_filename}
        COMMAND
          armasm64.exe ${ARMASM_FLAGS} "$<$<CONFIG:Debug>:-g>" ${preprocess_filename} ${obj_filename}
        DEPENDS ${asm_filename}
        BYPRODUCTS ${preprocess_filename}
        COMMAND_EXPAND_LISTS
        VERBATIM
      )
      target_sources(onnxruntime_mlas PRIVATE ${obj_filename})
    endforeach()
  elseif(onnxruntime_target_platform STREQUAL "ARM")
    target_sources(onnxruntime_mlas PRIVATE
      ${MLAS_SRC_DIR}/arm/sgemmc.cpp
    )
  elseif(onnxruntime_target_platform STREQUAL "x64")

    # Intrinsics sources are grouped by directory so that each group can be
    # compiled with the matching /arch switch.
    file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS
      "${MLAS_SRC_DIR}/intrinsics/avx/*.cpp"
    )
    set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "/arch:AVX")

    file(GLOB_RECURSE mlas_platform_srcs_avx2 CONFIGURE_DEPENDS
      "${MLAS_SRC_DIR}/intrinsics/avx2/*.cpp"
    )
    set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "/arch:AVX2")

    target_sources(onnxruntime_mlas PRIVATE
      ${MLAS_SRC_DIR}/dgemm.cpp
      ${mlas_platform_srcs_avx}
      ${mlas_platform_srcs_avx2}
      ${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
      ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
      ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAmx.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx512Core.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Core.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Vnni.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvxVnni.asm
      ${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx512Core.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelM1Avx.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/SpoolKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/SpoolKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SpoolKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/sgemma.asm
      ${MLAS_SRC_DIR}/amd64/cvtfp16a.asm
      ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/TransKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/TransKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/LogisticKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/TanhKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/ErfKernelFma3.asm
    )
    if (NOT onnxruntime_ORT_MINIMAL_BUILD)
      target_sources(onnxruntime_mlas PRIVATE
        ${MLAS_SRC_DIR}/q4gemm_avx512.cpp
      )
    endif()
  else()
    # Any other platform value gets the SSE C++ kernels and the i386 assembly.
    target_sources(onnxruntime_mlas PRIVATE
      ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
      ${MLAS_SRC_DIR}/i386/SgemmKernelSse2.asm
      ${MLAS_SRC_DIR}/i386/SgemmKernelAvx.asm
    )
  endif()
endfunction()
|
|
|
|
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
|
|
# Emscripten builds compile either the wasm_simd sources (plus the wasm SIMD
# qgemm kernel) or the scalar sources. CONFIGURE_DEPENDS makes the build
# re-run the glob so that newly added files are picked up without a manual
# re-configure.
if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
  file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS
    "${MLAS_SRC_DIR}/wasm_simd/*.cpp"
  )
  list(APPEND mlas_platform_srcs
    ${MLAS_SRC_DIR}/qgemm_kernel_wasmsimd.cpp
  )
else()
  file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS
    "${MLAS_SRC_DIR}/scalar/*.cpp"
  )
endif()
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
|
|
elseif(MSVC)
|
|
setup_mlas_source_for_windows()
|
|
else()
|
|
|
|
# Determine the target architecture(s) and record them by setting one or more
# of ARM, ARM64, POWER, X86, X86_64 and LOONGARCH64 to TRUE.
if(APPLE)
  # Apple builds read the architecture list from the target's
  # OSX_ARCHITECTURES property and fall back to the host processor when the
  # property is not set.
  get_target_property(ONNXRUNTIME_MLAS_OSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)

  if(NOT ONNXRUNTIME_MLAS_OSX_ARCH)
    set(ONNXRUNTIME_MLAS_OSX_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
  endif()
  foreach(OSX_ARCH ${ONNXRUNTIME_MLAS_OSX_ARCH})
    if (OSX_ARCH STREQUAL "arm64" OR OSX_ARCH STREQUAL "arm64e")
      set(ARM64 TRUE)
    elseif (OSX_ARCH STREQUAL "arm")
      set(ARM TRUE)
    elseif (OSX_ARCH STREQUAL "x86_64")
      set(X86_64 TRUE)
    elseif (OSX_ARCH STREQUAL "i386")
      set(X86 TRUE)
    endif()
  endforeach()
elseif(ANDROID)
  # Android builds are keyed on the ABI name.
  if (CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
    set(ARM TRUE)
  elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "arm64-v8a")
    set(ARM64 TRUE)
  elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86_64")
    set(X86_64 TRUE)
  elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86")
    set(X86 TRUE)
  endif()
else()
  #Linux/FreeBSD/PowerPC/...
  #The value of CMAKE_SYSTEM_PROCESSOR should be from `uname -m`
  #Example values:
  #arm64v8/ubuntu -> aarch64
  #arm32v6/alpine -> armv7l
  #arm32v7/centos -> armv7l
  #ppc64le/debian -> ppc64le
  #s390x/ubuntu -> s390x
  #ppc64le/busybox -> ppc64le
  #Android: armv7-a aarch64 i686 x86_64
  #chasun: I don't think anyone uses 'arm64'
  # The 64-bit ARM spellings are tested before the generic "arm" prefix so
  # that "arm64..." is not classified as 32-bit ARM.
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm64.*" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
    set(ARM64 TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm.*")
    set(ARM TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc.*|ppc.*)")
    set(POWER TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
    set(X86 TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
    set(X86_64 TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
    set(LOONGARCH64 TRUE)
  endif()
endif()
|
|
|
|
# A build counts as multi-architecture when the target's OSX_ARCHITECTURES
# property lists more than one entry. The property is only read on Apple
# platforms; elsewhere the list variable is left untouched.
if(APPLE)
  get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
endif()
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
  set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
endif()
#If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below
#and split MLAS to multiple static libraries.
#Otherwise, it works like if(...) elseif(...) elseif(...) endif()
set(MLAS_SOURCE_IS_NOT_SET 1)
|
|
# 32-bit ARM: NEON assembly plus the C++ kernels.
if(ARM)
  enable_language(ASM)

  # NEON is enabled for both the assembler and the C++ compiler.
  string(APPEND CMAKE_ASM_FLAGS " -mfpu=neon")
  string(APPEND CMAKE_CXX_FLAGS " -mfpu=neon")

  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/aarch32/QgemmU8X8KernelNeon.S
    ${MLAS_SRC_DIR}/arm/sgemmc.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
  )
  # In a single-architecture build, mark the sources as chosen so the
  # remaining architecture branches are skipped.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
|
|
# 64-bit ARM: aarch64 assembly kernels plus the NEON/dot-product C++ kernels.
if(ARM64 AND MLAS_SOURCE_IS_NOT_SET)
  enable_language(ASM)
  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelDot.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelDotLd64.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymU8KernelDot.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymU8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/DepthwiseQConvSymS8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/DepthwiseQConvSymU8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/DepthwiseQConvKernelSize9Neon.S
    ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUdot.S
    ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSdot.S
    ${MLAS_SRC_DIR}/aarch64/SgemmKernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/SgemvKernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdot.S
    ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdotLd64.S
    ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
  )
  set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
    PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")

  # The fp16, i8mm and bf16 kernels are only added on non-Apple platforms.
  if (NOT APPLE)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
      ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
      ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
      ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
      ${MLAS_SRC_DIR}/activate_fp16.cpp
      ${MLAS_SRC_DIR}/dwconv.cpp
      ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
      ${MLAS_SRC_DIR}/pooling_fp16.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
      ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
    )
    # Per-file -march flags, grouped by the extension each file is built with.
    set_source_files_properties(
      ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
      ${MLAS_SRC_DIR}/activate_fp16.cpp
      ${MLAS_SRC_DIR}/dwconv.cpp
      ${MLAS_SRC_DIR}/pooling_fp16.cpp
      PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
    set_source_files_properties(
      ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
      ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
      PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
    set_source_files_properties(
      ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
      ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
      PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
  endif()

  if(ONNXRUNTIME_MLAS_MULTI_ARCH)
    # Multi-architecture build: the arm64 sources become their own static
    # library and the source list is cleared for the next architecture.
    onnxruntime_add_static_library(onnxruntime_mlas_arm64 ${mlas_platform_srcs})
    set_target_properties(onnxruntime_mlas_arm64 PROPERTIES OSX_ARCHITECTURES "arm64")
    list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_arm64)
    unset(mlas_platform_srcs)
  else()
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
|
|
# POWER: baseline kernels, plus POWER9 and POWER10 kernels when the compiler
# accepts the corresponding -mcpu flag.
if(POWER AND MLAS_SOURCE_IS_NOT_SET)
  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp
    ${MLAS_SRC_DIR}/dgemm.cpp
    ${MLAS_SRC_DIR}/power/DgemmKernelPower.cpp
    ${MLAS_SRC_DIR}/power/QuantizePower.cpp
  )
  set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp PROPERTIES COMPILE_FLAGS "-DSINGLE")

  check_cxx_compiler_flag("-mcpu=power9" HAS_POWER9)
  if (HAS_POWER9)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/power/QuantizePowerVSX.cpp
    )
    set_source_files_properties(${MLAS_SRC_DIR}/power/QuantizePowerVSX.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
  endif()

  check_cxx_compiler_flag("-mcpu=power10" HAS_POWER10)
  if(HAS_POWER10)
    # The probes below must be compiled with -mcpu=power10. The previous value
    # of CMAKE_REQUIRED_FLAGS is saved and restored afterwards so that the
    # flag does not leak into unrelated check_* calls later in the configure.
    set(mlas_saved_required_flags "${CMAKE_REQUIRED_FLAGS}")
    set(CMAKE_REQUIRED_FLAGS "-mcpu=power10")
    # Probe: does the compiler provide the MMA builtins?
    check_cxx_source_compiles("
      #include <altivec.h>
      int main() {
        __vector_quad acc0;
        __builtin_mma_xxsetaccz (&acc0);
        return 0;
      }"
      COMPILES_P10
    )
    if(COMPILES_P10)
      # Probe: are the hwcap2 feature bits used for run-time detection available?
      check_cxx_source_compiles("
        #include <sys/auxv.h>
        int main() {
          unsigned long hwcap2 = getauxval(AT_HWCAP2);
          bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
          return 0;
        }"
        HAS_P10_RUNTIME
      )
      if (HAS_P10_RUNTIME)
        set_source_files_properties(${MLAS_SRC_DIR}/platform.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
        set_source_files_properties(${MLAS_SRC_DIR}/qgemm.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
      endif()
      set(mlas_platform_srcs_power10
        ${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp
        ${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp
        ${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp
      )
      set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10 -DSINGLE")
      set_source_files_properties(${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10")
      set_source_files_properties(${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp PROPERTIES COMPILE_FLAGS "-O3 -mcpu=power10")
      list(APPEND mlas_platform_srcs
        ${mlas_platform_srcs_power10}
      )
    endif()
    set(CMAKE_REQUIRED_FLAGS "${mlas_saved_required_flags}")
    unset(mlas_saved_required_flags)
  endif()
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
|
|
# 32-bit x86: SSE2 and AVX kernels, each group compiled with its own flag.
if(X86 AND MLAS_SOURCE_IS_NOT_SET)
  enable_language(ASM)

  set(mlas_platform_srcs_sse2
    ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
    ${MLAS_SRC_DIR}/x86/SgemmKernelSse2.S
  )
  set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")

  set(mlas_platform_srcs_avx
    ${MLAS_SRC_DIR}/x86/SgemmKernelAvx.S
  )
  set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")

  set(mlas_platform_srcs
    ${mlas_platform_srcs_sse2}
    ${mlas_platform_srcs_avx}
  )

  # In r23, NDK remove __x86.get_pc_thunk.* from libatomic. Add our own
  # implementation to avoid external dependency.
  if(ANDROID)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/x86/x86.get_pc_thunk.S
    )
  endif()

  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
|
|
# 64-bit x86: kernels grouped by instruction-set extension.
if(X86_64 AND MLAS_SOURCE_IS_NOT_SET)
  enable_language(ASM)

  # Forward the flags for the minimum target platform version from the C
  # compiler to the assembler. This works around CMakeASMCompiler.cmake.in
  # not including the logic to set this flag for the assembler.
  set(CMAKE_ASM${ASM_DIALECT}_OSX_DEPLOYMENT_TARGET_FLAG "${CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG}")

  # The LLVM assembler does not support the .arch directive to enable instruction
  # set extensions and also doesn't support AVX-512F instructions without
  # turning on support via command-line option. Group the sources by the
  # instruction set extension and explicitly set the compiler flag as appropriate.

  # SSE2 group.
  set(mlas_platform_srcs_sse2
    ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelSse2.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelSse2.S
    ${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Sse2.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelSse2.S
    ${MLAS_SRC_DIR}/x86_64/SpoolKernelSse2.S
  )
  set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")

  # AVX group.
  set(mlas_platform_srcs_avx
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelM1Avx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelM1TransposeBAvx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Avx.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx.S
    ${MLAS_SRC_DIR}/intrinsics/avx/min_max_elements.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")

  # AVX2 / FMA3 group.
  set(mlas_platform_srcs_avx2
    ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/QgemmU8U8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvxVnni.S
    ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/TransKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/LogisticKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/TanhKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
    ${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
    ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")

  # AVX-512F group.
  set(mlas_platform_srcs_avx512f
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S
    ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")

  # AVX-512 BW/DQ/VL group.
  set(mlas_platform_srcs_avx512core
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Core.S
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S
    ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S
    ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
  )
  set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")

  # Full source list: ungrouped C++ sources followed by every group above.
  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/activate_fp16.cpp
    ${MLAS_SRC_DIR}/dwconv.cpp
    ${MLAS_SRC_DIR}/dgemm.cpp
    ${MLAS_SRC_DIR}/pooling_fp16.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
    ${mlas_platform_srcs_sse2}
    ${mlas_platform_srcs_avx}
    ${mlas_platform_srcs_avx2}
    ${mlas_platform_srcs_avx512f}
    ${mlas_platform_srcs_avx512core}
  )

  if (NOT onnxruntime_ORT_MINIMAL_BUILD)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/q4gemm_avx512.cpp
    )
    set_source_files_properties(${MLAS_SRC_DIR}/q4gemm_avx512.cpp PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
  endif()
  # The AMX kernels are only added on non-Apple platforms.
  if(NOT APPLE)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmxCommon.S
      ${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
      ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S
    )
    set_source_files_properties(
      ${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
      ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S
      PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
  endif()

  if(ONNXRUNTIME_MLAS_MULTI_ARCH)
    # Multi-architecture build: the x86_64 sources become their own static
    # library and the source list is cleared afterwards.
    onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs})
    set_target_properties(onnxruntime_mlas_x86_64 PROPERTIES OSX_ARCHITECTURES "x86_64")
    list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_x86_64)
    unset(mlas_platform_srcs)
  else()
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
|
|
# LoongArch64 source selection, used only while no earlier branch has
# cleared MLAS_SOURCE_IS_NOT_SET.
if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET)
  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp
    ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/SconvKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/SconvKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLSX.S
    ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4LSX.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4Lasx.S
    ${MLAS_SRC_DIR}/loongarch64/SoftmaxKernelLasx.S
  )

  # Note: this appends to the CMAKE_CXX_FLAGS variable in the current scope,
  # so the flags are not limited to the files listed above.
  set(CMAKE_CXX_FLAGS
    "${CMAKE_CXX_FLAGS} -mlsx -mlasx"
  )

  # Outside multi-arch builds, record that a source list has been chosen.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
|
|
# Fallback source selection: if this is not a multi-arch build and no earlier
# branch cleared MLAS_SOURCE_IS_NOT_SET, build every .cpp found (recursively)
# under ${MLAS_SRC_DIR}/scalar.
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
  # CONFIGURE_DEPENDS makes the build system re-check the glob at build time
  # and re-run CMake when the set of matching files changes; without it,
  # newly added or removed scalar sources go unnoticed until someone
  # re-configures by hand.
  file(GLOB_RECURSE mlas_platform_srcs CONFIGURE_DEPENDS
    "${MLAS_SRC_DIR}/scalar/*.cpp"
  )
endif()

# Attach whatever source list was selected to the main MLAS target.
# mlas_platform_srcs may be empty here (the multi-arch x86_64 branch clears it).
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
|
|
endif()
|
|
|
|
# Give every library listed in ONNXRUNTIME_MLAS_LIBS the same private include
# paths, the GSL include directories, and the same IDE folder.
foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
  target_include_directories(${mlas_target}
    PRIVATE
      ${MLAS_INC_DIR}
      ${MLAS_SRC_DIR}
  )
  onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})

  set_target_properties(${mlas_target}
    PROPERTIES
      FOLDER "ONNXRuntime"
  )
endforeach()
|
|
|
|
if(WIN32)
  # Pass /wd6385 and /wd4127 to the compiler for C++ sources only
  # (the generator expression filters on the compile language).
  target_compile_options(onnxruntime_mlas
    PRIVATE
      "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>"
      "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>"
  )

  # With static analysis enabled, also pass the /analyze:stacksize setting.
  if(onnxruntime_ENABLE_STATIC_ANALYSIS)
    target_compile_options(onnxruntime_mlas
      PRIVATE
        "$<$<COMPILE_LANGUAGE:CXX>:/analyze:stacksize 131072>"
    )
  endif()
endif()
|
|
|
|
if(PLATFORM_NAME STREQUAL "macabi")
  # Mac Catalyst C compilation needs the contents of CMAKE_C_FLAGS on this
  # target, e.g. "--target=x86_64-apple-ios14.0-macabi -ffunction-sections
  # -fdata-sections".
  # NOTE(review): CMAKE_C_FLAGS is expanded unquoted; if it is a single
  # space-separated string rather than a list, confirm it reaches the compiler
  # as separate options.
  target_compile_options(onnxruntime_mlas
    PRIVATE
      ${CMAKE_C_FLAGS}
  )
endif()
|
|
|
|
# The MLAS library is installed only when the shared onnxruntime library is
# not being built.
if(NOT onnxruntime_BUILD_SHARED_LIB)
  install(
    TARGETS onnxruntime_mlas
    ARCHIVE   DESTINATION ${CMAKE_INSTALL_LIBDIR}
    LIBRARY   DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME   DESTINATION ${CMAKE_INSTALL_BINDIR}
    FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}
  )
endif()
|
|
|
|
# Build the IDE source group for MLAS: gather, from every MLAS library target,
# the sources that live under MLAS_ROOT and present them as a tree rooted there.
# block() keeps the helper variables below from leaking into the caller's scope.
block()
  set(grouped_srcs)

  foreach(mlas_lib ${ONNXRUNTIME_MLAS_LIBS})
    get_target_property(lib_srcs ${mlas_lib} SOURCES)

    foreach(candidate_src ${lib_srcs})
      # Keep only files for which MLAS_ROOT is a path prefix.
      cmake_path(IS_PREFIX MLAS_ROOT ${candidate_src} src_is_under_root)
      if(src_is_under_root)
        list(APPEND grouped_srcs ${candidate_src})
      endif()
    endforeach()
  endforeach()

  source_group(TREE ${MLAS_ROOT} FILES ${grouped_srcs})
endblock()
|
|
|
|
|
|
if(NOT onnxruntime_ORT_MINIMAL_BUILD)

  #
  # onnxruntime_mlas_q4dq: command line tool for quantization and
  # de-quantization of 2-D fp32 tensors, based on block-wise quantization
  # of int4. Not built in minimal builds.
  #

  onnxruntime_add_executable(onnxruntime_mlas_q4dq
    ${MLAS_SRC_DIR}/q4_dq_cli.cpp
  )
  target_include_directories(onnxruntime_mlas_q4dq
    PRIVATE
      ${MLAS_INC_DIR}
      ${MLAS_SRC_DIR}
  )
  set_target_properties(onnxruntime_mlas_q4dq
    PROPERTIES
      FOLDER "ONNXRuntimeTest"
  )

  # Link dependencies. The calls are kept separate and in their original
  # order so the resulting link line is unchanged.
  target_link_libraries(onnxruntime_mlas_q4dq
    PRIVATE
      ${ONNXRUNTIME_MLAS_LIBS}
      onnxruntime_common
  )

  # cpuinfo is linked when supported, except when targeting Emscripten.
  if(CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE cpuinfo)
  endif()

  if(NOT WIN32)
    target_link_libraries(onnxruntime_mlas_q4dq
      PRIVATE
        nsync::nsync_cpp
        ${CMAKE_DL_LIBS}
    )
  endif()

  if(CMAKE_SYSTEM_NAME STREQUAL "Android")
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${android_shared_libs})
  endif()

  if(WIN32)
    # The "debug" keyword applies only to the item right after it (Dbghelp).
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE debug Dbghelp Advapi32)
  endif()

  if(onnxruntime_LINK_LIBATOMIC)
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE atomic)
  endif()

  target_link_libraries(onnxruntime_mlas_q4dq PRIVATE Threads::Threads)

  # Emscripten link flags; the threaded WebAssembly build adds the
  # PROXY_TO_PTHREAD and EXIT_RUNTIME settings.
  if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    if(onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
      set_target_properties(onnxruntime_mlas_q4dq
        PROPERTIES
          LINK_FLAGS "-s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1"
      )
    else()
      set_target_properties(onnxruntime_mlas_q4dq
        PROPERTIES
          LINK_FLAGS "-s ALLOW_MEMORY_GROWTH=1"
      )
    endif()
  endif()

endif()
|