mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-18 21:21:17 +00:00
### Description model: phi-3-mini-4k-instruct avx2 symmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |49.5|70.0|-29.2%|9.6|10.8|-34.2% 32 |76.8|52.4|9.7%|15.2|14.6|4.1% 64 |78.2|71.4|9.5%|16.6|16.3|1.8% 128 |72.9|70.6|3.2%|17.1|16.8|1.7% 256 |83.7|63.6|31.6%|18.1|17.4|4% avx2 asymmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |50.7|61.5|-17.5%|9.6|9.2|4.3% 32 |77.4|52.4|47.7%|14.6|13.9|5.0% 64 |78.7|63.0|24.9%|16.2|15.9|1.8% 128 |80.0|61.9|29.2%|17.2|16.9|1.7% 256 |81.5|63.3|28.7%|17.9|17.3|3.4% avx2vnni symmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |82.9|117.0|-29.0%|15.9|19.3|-17.6% 32 |133.0|100.4|32.4%|26.1|24.5|6.5% 64 |166.9|118.8|40.4%|28.3|27.1|4.4% 128 |165.9|119.6|38.7%|29.3|28.5|2.8% 256 |165.2|119.6|38.1%|30.2|29.0|4.1% avx2vnni asymmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |80.2|118.9|-32.5%|15.1|16.7|-9.5% 32 |130.7|99.7|31.0%|25.0|23.8|5.0% 64 |168.7|124.9|35.0%|27.3|26.8|1.8% 128 |169.6|123.8|36.9%|29.2|27.9|4.6% 256 |175.0|125.7|39.0%|30.0|29.7|1.0% avx512 symmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |135.2|156.5|-13.6|25.5|23.8|7.1 32 |150.0|159.5|-5.9|34.9|29.6|17.9 64 |167.5|157.5|6.3|39.7|34.4|15.4 128 |177.8|158.0|12.5|40.3|35.4|13.8 256 |182.6|157.3|16.0|41.7|37.7|10.6 avx512 asymmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |136.1|151.4|-10.1%|26.1|19.9|31.1% 32 |150.0|157.8|-4.9%|34.3|29.3|17.0% 64 |165.7|156.6|5.8%|38.7|30.7|26.0% 128 |180.4|156.6|15.1%|40.2|34.7|15.8% 256 |181.3|158.0|14.7%|41.6|36.6|13.6% avx512vnni symmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |143.4|155.4|-7.7%|25.6|23.3|9.8% 32 |159.2|157.0|1.4%|34.1|29.8|14.4% 64 |182.0|159.5|14.1%|38.4|34.8|10.3% 128 |221.2|160.8|37.5%|41.0|36.4|12.6% 256 |250.5|162.4|54.2%|41.6|37.7|10.3% avx512vnni asymmetric blklen|updated prompt tps | baseline prompt tps | prompt tps change%|updated token gen tps | baseline token gen tps | token gen change% -|-|-|-|-|-|- 16 |142.5|152.3|-6.4%|26.3|19.7|33.5% 32 |158.2|155.0|2.0%|34.3|29.2|17.4% 64 |184.1|156.6|17.5%|38.3|30.9|23.9% 128 |215.8|156.1|17.5%|41.3|35.0|17.9% 256 |249.2|155.9|59.8%|41.1|36.3|13.2% 4bit gemm implementation with avx using tile. 1. tile size is 2blk by 4. in case of size less then tile, it reduce to 1blk by 4, 2blk by 1 and lastly 1blk by 1. with internal kernel, weight and activation are loaded based on SIMD register width and blk length: avx2 256bit register, 64 weights and activation are loaded. blklen16: 4 blks are computed by the internal kernel blklen32: 2 blks are computed by the internal kernel blklen64: 1 blk are computed by the internal kernel blklen128: 1 blks are computed 2 times by the internal kernel blklen16: 1 blks are computed 4 times by the internal kernel avx512 512bit register, 128 weights and activation are loaded. blklen16: 8 blks are computed by the internal kernel blklen32: 4 blks are computed by the internal kernel blklen64: 2 blk are computed by the internal kernel blklen128: 1 blks are computed by the internal kernel blklen16: 1 blks are computed 2 times by the internal kernel 2. blksum is precomputed during prepacking. computation is reformed: Sum1(scale_a * scale_b * Sum_blk(a_i * b_i)) + Sum2(blksum_a * blksum_b) Sum_blk is over one blk Sum1 is over all blks for one output Sum2 is over all blks for one output Sum is computed with sgemm with the current implementation. Further improvement is possible. --------- Signed-off-by: Liqun Fu <liqfu@microsoft.com> Signed-off-by: liqunfu <liqun.fu@microsoft.com> Signed-off-by: Liqun Fu <liqun_fu@hotmail.com>
746 lines
31 KiB
CMake
746 lines
31 KiB
CMake
# Copyright (c) Microsoft Corporation. All rights reserved.
|
|
# Licensed under the MIT License.
|
|
|
|
set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
|
|
set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
|
|
set(MLAS_INC_DIR ${MLAS_ROOT}/inc)
|
|
|
|
#
|
|
# All hardware agnostic source files here
|
|
# hardware specific files would cause trouble in
|
|
# multi-target build
|
|
#
|
|
onnxruntime_add_static_library(onnxruntime_mlas
|
|
${MLAS_SRC_DIR}/mlasi.h
|
|
${MLAS_SRC_DIR}/platform.cpp
|
|
${MLAS_SRC_DIR}/threading.cpp
|
|
${MLAS_SRC_DIR}/sgemm.cpp
|
|
${MLAS_SRC_DIR}/halfgemm.cpp
|
|
${MLAS_SRC_DIR}/qgemm.cpp
|
|
${MLAS_SRC_DIR}/qdwconv.cpp
|
|
${MLAS_SRC_DIR}/convolve.cpp
|
|
${MLAS_SRC_DIR}/convsym.cpp
|
|
${MLAS_SRC_DIR}/pooling.cpp
|
|
${MLAS_SRC_DIR}/transpose.cpp
|
|
${MLAS_SRC_DIR}/reorder.cpp
|
|
${MLAS_SRC_DIR}/snchwc.cpp
|
|
${MLAS_SRC_DIR}/activate.cpp
|
|
${MLAS_SRC_DIR}/logistic.cpp
|
|
${MLAS_SRC_DIR}/tanh.cpp
|
|
${MLAS_SRC_DIR}/erf.cpp
|
|
${MLAS_SRC_DIR}/compute.cpp
|
|
${MLAS_SRC_DIR}/quantize.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_default.cpp
|
|
${MLAS_SRC_DIR}/qladd.cpp
|
|
${MLAS_SRC_DIR}/qlmul.cpp
|
|
${MLAS_SRC_DIR}/qpostprocessor.cpp
|
|
${MLAS_SRC_DIR}/qlgavgpool.cpp
|
|
${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm.h
|
|
${MLAS_SRC_DIR}/sqnbitgemm.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
|
|
${MLAS_SRC_DIR}/flashattn.cpp
|
|
)
|
|
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_INC_DIR}/mlas_float16.h
|
|
${MLAS_INC_DIR}/mlas_gemm_postprocessor.h
|
|
${MLAS_INC_DIR}/mlas_q4.h
|
|
${MLAS_INC_DIR}/mlas_qnbit.h
|
|
${MLAS_INC_DIR}/mlas.h
|
|
)
|
|
|
|
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/q4_dq.cpp
|
|
${MLAS_SRC_DIR}/q4gemm.cpp
|
|
)
|
|
endif()
|
|
|
|
set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)
|
|
|
|
#TODO: set MASM flags properly
|
|
function(setup_mlas_source_for_windows)
|
|
|
|
#
|
|
# Sources common for all platforms.
|
|
#
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/activate_fp16.cpp
|
|
${MLAS_SRC_DIR}/dwconv.cpp
|
|
${MLAS_SRC_DIR}/pooling_fp16.cpp
|
|
)
|
|
|
|
#The onnxruntime_target_platform variable was added by Windows AI team in onnxruntime_common.cmake
|
|
#Don't use it for other platforms.
|
|
if((onnxruntime_target_platform STREQUAL "ARM64") OR (onnxruntime_target_platform STREQUAL "ARM64EC"))
|
|
set(PREPROCESS_ARMASM_FLAGS "")
|
|
set(ARMASM_FLAGS "")
|
|
|
|
if(onnxruntime_target_platform STREQUAL "ARM64")
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
|
|
)
|
|
|
|
set(mlas_platform_preprocess_srcs
|
|
${MLAS_SRC_DIR}/arm64/ConvSymS8KernelDot.asm
|
|
${MLAS_SRC_DIR}/arm64/ConvSymS8KernelDotLd64.asm
|
|
${MLAS_SRC_DIR}/arm64/ConvSymU8KernelDot.asm
|
|
${MLAS_SRC_DIR}/arm64/ConvSymS8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/ConvSymU8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/DepthwiseQConvSymS8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/DepthwiseQConvSymU8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/DepthwiseQConvKernelSize9Neon.asm
|
|
${MLAS_SRC_DIR}/arm64/HalfGemmKernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelUdot.asm
|
|
${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelSdot.asm
|
|
${MLAS_SRC_DIR}/arm64/SgemmKernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/SgemvKernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDot.asm
|
|
${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
|
|
)
|
|
else()
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
|
|
)
|
|
|
|
set(mlas_platform_preprocess_srcs
|
|
${MLAS_SRC_DIR}/arm64ec/QgemmU8X8KernelNeon.asm
|
|
${MLAS_SRC_DIR}/arm64ec/SgemmKernelNeon.asm
|
|
)
|
|
|
|
string(APPEND PREPROCESS_ARMASM_FLAGS " /arm64EC")
|
|
string(APPEND ARMASM_FLAGS " -machine ARM64EC")
|
|
endif()
|
|
|
|
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
|
|
string(APPEND ARMASM_FLAGS " -g")
|
|
endif()
|
|
|
|
# Remove double quotes from flag strings.
|
|
separate_arguments(PREPROCESS_ARMASM_FLAGS NATIVE_COMMAND "${PREPROCESS_ARMASM_FLAGS}")
|
|
separate_arguments(ARMASM_FLAGS NATIVE_COMMAND "${ARMASM_FLAGS}")
|
|
|
|
# Run the C precompiler on each input before the assembler.
|
|
foreach(asm_filename ${mlas_platform_preprocess_srcs})
|
|
get_filename_component(asm_filename_base ${asm_filename} NAME_WLE)
|
|
set(preprocess_filename ${CMAKE_CURRENT_BINARY_DIR}/${asm_filename_base}.i)
|
|
set(obj_filename ${CMAKE_CURRENT_BINARY_DIR}/${asm_filename_base}.obj)
|
|
add_custom_command(
|
|
OUTPUT ${obj_filename}
|
|
COMMAND
|
|
cl.exe ${PREPROCESS_ARMASM_FLAGS} /P ${asm_filename} /Fi${preprocess_filename}
|
|
COMMAND
|
|
armasm64.exe ${ARMASM_FLAGS} ${preprocess_filename} ${obj_filename}
|
|
DEPENDS ${asm_filename}
|
|
BYPRODUCTS ${preprocess_filename}
|
|
)
|
|
target_sources(onnxruntime_mlas PRIVATE ${obj_filename})
|
|
endforeach()
|
|
elseif(onnxruntime_target_platform STREQUAL "ARM")
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/arm/sgemmc.cpp
|
|
)
|
|
elseif(onnxruntime_target_platform STREQUAL "x64")
|
|
|
|
file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS
|
|
"${MLAS_SRC_DIR}/intrinsics/avx/*.cpp"
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "/arch:AVX")
|
|
|
|
file(GLOB_RECURSE mlas_platform_srcs_avx2 CONFIGURE_DEPENDS
|
|
"${MLAS_SRC_DIR}/intrinsics/avx2/*.cpp"
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "/arch:AVX2")
|
|
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/dgemm.cpp
|
|
${mlas_platform_srcs_avx}
|
|
${mlas_platform_srcs_avx2}
|
|
${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
|
|
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
|
|
${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAmx.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx2.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx512Core.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx2.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Core.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Vnni.asm
|
|
${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvxVnni.asm
|
|
${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx2.asm
|
|
${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx512Core.asm
|
|
${MLAS_SRC_DIR}/amd64/DgemmKernelSse2.asm
|
|
${MLAS_SRC_DIR}/amd64/DgemmKernelAvx.asm
|
|
${MLAS_SRC_DIR}/amd64/DgemmKernelFma3.asm
|
|
${MLAS_SRC_DIR}/amd64/DgemmKernelAvx512F.asm
|
|
${MLAS_SRC_DIR}/amd64/SgemmKernelSse2.asm
|
|
${MLAS_SRC_DIR}/amd64/SgemmKernelAvx.asm
|
|
${MLAS_SRC_DIR}/amd64/SgemmKernelM1Avx.asm
|
|
${MLAS_SRC_DIR}/amd64/SgemmKernelFma3.asm
|
|
${MLAS_SRC_DIR}/amd64/SgemmKernelAvx512F.asm
|
|
${MLAS_SRC_DIR}/amd64/SconvKernelSse2.asm
|
|
${MLAS_SRC_DIR}/amd64/SconvKernelAvx.asm
|
|
${MLAS_SRC_DIR}/amd64/SconvKernelFma3.asm
|
|
${MLAS_SRC_DIR}/amd64/SconvKernelAvx512F.asm
|
|
${MLAS_SRC_DIR}/amd64/SpoolKernelSse2.asm
|
|
${MLAS_SRC_DIR}/amd64/SpoolKernelAvx.asm
|
|
${MLAS_SRC_DIR}/amd64/SpoolKernelAvx512F.asm
|
|
${MLAS_SRC_DIR}/amd64/sgemma.asm
|
|
${MLAS_SRC_DIR}/amd64/cvtfp16a.asm
|
|
${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx.asm
|
|
${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx512F.asm
|
|
${MLAS_SRC_DIR}/amd64/TransKernelFma3.asm
|
|
${MLAS_SRC_DIR}/amd64/TransKernelAvx512F.asm
|
|
${MLAS_SRC_DIR}/amd64/LogisticKernelFma3.asm
|
|
${MLAS_SRC_DIR}/amd64/TanhKernelFma3.asm
|
|
${MLAS_SRC_DIR}/amd64/ErfKernelFma3.asm
|
|
)
|
|
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/q4gemm_avx512.cpp
|
|
)
|
|
endif()
|
|
else()
|
|
target_sources(onnxruntime_mlas PRIVATE
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
|
|
${MLAS_SRC_DIR}/i386/SgemmKernelSse2.asm
|
|
${MLAS_SRC_DIR}/i386/SgemmKernelAvx.asm
|
|
)
|
|
endif()
|
|
endfunction()
|
|
|
|
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
|
|
if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
|
|
file(GLOB_RECURSE mlas_platform_srcs
|
|
"${MLAS_SRC_DIR}/wasm_simd/*.cpp"
|
|
)
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${MLAS_SRC_DIR}/qgemm_kernel_wasmsimd.cpp
|
|
)
|
|
else()
|
|
file(GLOB_RECURSE mlas_platform_srcs
|
|
"${MLAS_SRC_DIR}/scalar/*.cpp"
|
|
)
|
|
endif()
|
|
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
|
|
elseif(MSVC)
|
|
setup_mlas_source_for_windows()
|
|
else()
|
|
|
|
if(APPLE)
|
|
get_target_property(ONNXRUNTIME_MLAS_OSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
|
|
|
|
if(NOT ONNXRUNTIME_MLAS_OSX_ARCH)
|
|
set(ONNXRUNTIME_MLAS_OSX_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
|
|
endif()
|
|
foreach(OSX_ARCH ${ONNXRUNTIME_MLAS_OSX_ARCH})
|
|
if (OSX_ARCH STREQUAL "arm64")
|
|
set(ARM64 TRUE)
|
|
elseif (OSX_ARCH STREQUAL "arm64e")
|
|
set(ARM64 TRUE)
|
|
elseif (OSX_ARCH STREQUAL "arm")
|
|
set(ARM TRUE)
|
|
elseif (OSX_ARCH STREQUAL "x86_64")
|
|
set(X86_64 TRUE)
|
|
elseif (OSX_ARCH STREQUAL "i386")
|
|
set(X86 TRUE)
|
|
endif()
|
|
endforeach()
|
|
elseif(ANDROID)
|
|
if (CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
|
|
set(ARM TRUE)
|
|
elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "arm64-v8a")
|
|
set(ARM64 TRUE)
|
|
elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86_64")
|
|
set(X86_64 TRUE)
|
|
elseif (CMAKE_ANDROID_ARCH_ABI STREQUAL "x86")
|
|
set(X86 TRUE)
|
|
endif()
|
|
else()
|
|
#Linux/FreeBSD/PowerPC/...
|
|
#The value of CMAKE_SYSTEM_PROCESSOR should be from `uname -m`
|
|
#Example values:
|
|
#arm64v8/ubuntu -> aarch64
|
|
#arm32v6/alpine -> armv7l
|
|
#arm32v7/centos -> armv7l
|
|
#ppc64le/debian -> ppc64le
|
|
#s390x/ubuntu -> s390x
|
|
#ppc64le/busybox -> ppc64le
|
|
#arm64v8/ubuntu -> aarch64
|
|
#Android: armv7-a aarch64 i686 x86_64
|
|
#chasun: I don't think anyone uses 'arm64'
|
|
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm64.*")
|
|
set(ARM64 TRUE)
|
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm.*")
|
|
set(ARM TRUE)
|
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
|
|
set(ARM64 TRUE)
|
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc.*|ppc.*)")
|
|
set(POWER TRUE)
|
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
|
|
set(X86 TRUE)
|
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
|
|
set(X86_64 TRUE)
|
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
|
|
set(LOONGARCH64 TRUE)
|
|
endif()
|
|
endif()
|
|
|
|
if(APPLE)
|
|
get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
|
|
endif()
|
|
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
|
|
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
|
|
set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
|
|
endif()
|
|
#If ONNXRUNTIME_MLAS_MULTI_ARCH is true, we need to go through every if branch below
|
|
#and split MLAS to multiple static libraries.
|
|
#Otherwise, it works like if(...) elseif(...) elseif(...) endif()
|
|
set(MLAS_SOURCE_IS_NOT_SET 1)
|
|
if(ARM)
|
|
enable_language(ASM)
|
|
|
|
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfpu=neon")
|
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon")
|
|
|
|
set(mlas_platform_srcs
|
|
${MLAS_SRC_DIR}/aarch32/QgemmU8X8KernelNeon.S
|
|
${MLAS_SRC_DIR}/arm/sgemmc.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
|
|
)
|
|
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
|
|
set(MLAS_SOURCE_IS_NOT_SET 0)
|
|
endif()
|
|
endif()
|
|
if(ARM64 AND MLAS_SOURCE_IS_NOT_SET )
|
|
enable_language(ASM)
|
|
set(mlas_platform_srcs
|
|
${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelDot.S
|
|
${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelDotLd64.S
|
|
${MLAS_SRC_DIR}/aarch64/ConvSymU8KernelDot.S
|
|
${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/ConvSymU8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/DepthwiseQConvSymS8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/DepthwiseQConvSymU8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/DepthwiseQConvKernelSize9Neon.S
|
|
${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUdot.S
|
|
${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSdot.S
|
|
${MLAS_SRC_DIR}/aarch64/SgemmKernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/SgemvKernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdot.S
|
|
${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdotLd64.S
|
|
${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
|
|
PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod")
|
|
if (NOT APPLE)
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
|
|
${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
|
|
${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
|
|
${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
|
|
${MLAS_SRC_DIR}/activate_fp16.cpp
|
|
${MLAS_SRC_DIR}/dwconv.cpp
|
|
${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
|
|
${MLAS_SRC_DIR}/pooling_fp16.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
|
|
${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/activate_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/dwconv.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/pooling_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 ")
|
|
endif()
|
|
|
|
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
|
|
onnxruntime_add_static_library(onnxruntime_mlas_arm64 ${mlas_platform_srcs})
|
|
set_target_properties(onnxruntime_mlas_arm64 PROPERTIES OSX_ARCHITECTURES "arm64")
|
|
list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_arm64)
|
|
set(mlas_platform_srcs )
|
|
else()
|
|
set(MLAS_SOURCE_IS_NOT_SET 0)
|
|
endif()
|
|
endif()
|
|
if(POWER AND MLAS_SOURCE_IS_NOT_SET)
|
|
set(mlas_platform_srcs
|
|
${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp
|
|
${MLAS_SRC_DIR}/dgemm.cpp
|
|
${MLAS_SRC_DIR}/power/DgemmKernelPower.cpp
|
|
${MLAS_SRC_DIR}/power/QuantizePower.cpp
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp PROPERTIES COMPILE_FLAGS "-DSINGLE")
|
|
|
|
check_cxx_compiler_flag("-mcpu=power9" HAS_POWER9)
|
|
if (HAS_POWER9)
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${MLAS_SRC_DIR}/power/QuantizePowerVSX.cpp
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/power/QuantizePowerVSX.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
|
|
endif()
|
|
|
|
check_cxx_compiler_flag("-mcpu=power10" HAS_POWER10)
|
|
if(HAS_POWER10)
|
|
set(CMAKE_REQUIRED_FLAGS "-mcpu=power10")
|
|
check_cxx_source_compiles("
|
|
#include <altivec.h>
|
|
int main() {
|
|
__vector_quad acc0;
|
|
__builtin_mma_xxsetaccz (&acc0);
|
|
return 0;
|
|
}"
|
|
COMPILES_P10
|
|
)
|
|
if(COMPILES_P10)
|
|
check_cxx_source_compiles("
|
|
#ifdef _AIX
|
|
#define POWER_10 0x40000
|
|
#define POWER_10_ANDUP (POWER_10)
|
|
#include <sys/systemcfg.h>
|
|
#define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
|
|
int main() {
|
|
bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31);
|
|
return 0;
|
|
}
|
|
#else
|
|
#include <sys/auxv.h>
|
|
int main() {
|
|
unsigned long hwcap2 = getauxval(AT_HWCAP2);
|
|
bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
|
|
return 0;
|
|
}
|
|
}
|
|
#endif"
|
|
HAS_P10_RUNTIME
|
|
)
|
|
if (HAS_P10_RUNTIME)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/platform.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/qgemm.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
|
|
endif()
|
|
set(mlas_platform_srcs_power10
|
|
${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp
|
|
${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp
|
|
${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10 -DSINGLE")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp PROPERTIES COMPILE_FLAGS "-O3 -mcpu=power10")
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${mlas_platform_srcs_power10}
|
|
)
|
|
endif()
|
|
endif()
|
|
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
|
|
set(MLAS_SOURCE_IS_NOT_SET 0)
|
|
endif()
|
|
endif()
|
|
if(X86 AND MLAS_SOURCE_IS_NOT_SET)
|
|
enable_language(ASM)
|
|
|
|
set(mlas_platform_srcs_sse2
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
|
|
${MLAS_SRC_DIR}/x86/SgemmKernelSse2.S
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")
|
|
|
|
set(mlas_platform_srcs_avx
|
|
${MLAS_SRC_DIR}/x86/SgemmKernelAvx.S
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
|
|
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs_sse2}
|
|
${mlas_platform_srcs_avx}
|
|
)
|
|
|
|
# In r23, NDK remove __x86.get_pc_thunk.* from libatomic. Add our own
|
|
# implementation to avoid external dependency.
|
|
if(ANDROID)
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${MLAS_SRC_DIR}/x86/x86.get_pc_thunk.S
|
|
)
|
|
endif()
|
|
|
|
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
|
|
set(MLAS_SOURCE_IS_NOT_SET 0)
|
|
endif()
|
|
endif()
|
|
if(X86_64 AND MLAS_SOURCE_IS_NOT_SET)
|
|
enable_language(ASM)
|
|
|
|
# Forward the flags for the minimum target platform version from the C
|
|
# compiler to the assembler. This works around CMakeASMCompiler.cmake.in
|
|
# not including the logic to set this flag for the assembler.
|
|
set(CMAKE_ASM${ASM_DIALECT}_OSX_DEPLOYMENT_TARGET_FLAG "${CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG}")
|
|
|
|
# The LLVM assembler does not support the .arch directive to enable instruction
|
|
# set extensions and also doesn't support AVX-512F instructions without
|
|
# turning on support via command-line option. Group the sources by the
|
|
# instruction set extension and explicitly set the compiler flag as appropriate.
|
|
|
|
set(mlas_platform_srcs_sse2
|
|
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
|
|
${MLAS_SRC_DIR}/x86_64/DgemmKernelSse2.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmKernelSse2.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Sse2.S
|
|
${MLAS_SRC_DIR}/x86_64/SconvKernelSse2.S
|
|
${MLAS_SRC_DIR}/x86_64/SpoolKernelSse2.S
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")
|
|
|
|
set(mlas_platform_srcs_avx
|
|
${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmKernelM1Avx.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmKernelM1TransposeBAvx.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Avx.S
|
|
${MLAS_SRC_DIR}/x86_64/SconvKernelAvx.S
|
|
${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx.S
|
|
${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx.S
|
|
${MLAS_SRC_DIR}/intrinsics/avx/min_max_elements.cpp
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
|
|
|
|
set(mlas_platform_srcs_avx2
|
|
${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAvx2.S
|
|
${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx2.S
|
|
${MLAS_SRC_DIR}/x86_64/QgemmU8U8KernelAvx2.S
|
|
${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvxVnni.S
|
|
${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx2.S
|
|
${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx2.S
|
|
${MLAS_SRC_DIR}/x86_64/DgemmKernelFma3.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmKernelFma3.S
|
|
${MLAS_SRC_DIR}/x86_64/SconvKernelFma3.S
|
|
${MLAS_SRC_DIR}/x86_64/TransKernelFma3.S
|
|
${MLAS_SRC_DIR}/x86_64/LogisticKernelFma3.S
|
|
${MLAS_SRC_DIR}/x86_64/TanhKernelFma3.S
|
|
${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
|
|
${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
|
|
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
|
|
)
|
|
|
|
message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
|
|
message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")
|
|
|
|
if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "10")
|
|
message(STATUS "Using -mavx2 -mfma -mavxvnni flags")
|
|
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mavxvnni")
|
|
else()
|
|
message(STATUS "Using -mavx2 -mfma flags")
|
|
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
|
|
endif()
|
|
set(mlas_platform_srcs_avx512f
|
|
${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
|
|
${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
|
|
${MLAS_SRC_DIR}/x86_64/SconvKernelAvx512F.S
|
|
${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx512F.S
|
|
${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S
|
|
${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S
|
|
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")
|
|
|
|
set(mlas_platform_srcs_avx512core
|
|
${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Core.S
|
|
${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S
|
|
${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S
|
|
${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl")
|
|
|
|
set(mlas_platform_srcs_avx512vnni
|
|
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
|
|
)
|
|
set_source_files_properties(${mlas_platform_srcs_avx512vnni} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
|
|
|
|
set(mlas_platform_srcs
|
|
${MLAS_SRC_DIR}/activate_fp16.cpp
|
|
${MLAS_SRC_DIR}/dwconv.cpp
|
|
${MLAS_SRC_DIR}/dgemm.cpp
|
|
${MLAS_SRC_DIR}/pooling_fp16.cpp
|
|
${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
|
|
${mlas_platform_srcs_sse2}
|
|
${mlas_platform_srcs_avx}
|
|
${mlas_platform_srcs_avx2}
|
|
${mlas_platform_srcs_avx512f}
|
|
${mlas_platform_srcs_avx512core}
|
|
${mlas_platform_srcs_avx512vnni}
|
|
)
|
|
|
|
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${MLAS_SRC_DIR}/q4gemm_avx512.cpp
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/q4gemm_avx512.cpp PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
|
|
endif()
|
|
if(NOT APPLE)
|
|
set(mlas_platform_srcs
|
|
${mlas_platform_srcs}
|
|
${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmxCommon.S
|
|
${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
|
|
${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S
|
|
)
|
|
set_source_files_properties(${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
|
|
set_source_files_properties(${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
|
|
endif()
|
|
|
|
if(ONNXRUNTIME_MLAS_MULTI_ARCH)
|
|
onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs})
|
|
set_target_properties(onnxruntime_mlas_x86_64 PROPERTIES OSX_ARCHITECTURES "x86_64")
|
|
list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_x86_64)
|
|
set(mlas_platform_srcs )
|
|
else()
|
|
set(MLAS_SOURCE_IS_NOT_SET 0)
|
|
endif()
|
|
endif()
|
|
if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET)
|
|
set(mlas_platform_srcs
|
|
${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp
|
|
${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S
|
|
${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S
|
|
${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S
|
|
${MLAS_SRC_DIR}/loongarch64/SgemmKernelLsx.S
|
|
${MLAS_SRC_DIR}/loongarch64/SconvKernelLsx.S
|
|
${MLAS_SRC_DIR}/loongarch64/SconvKernelLasx.S
|
|
${MLAS_SRC_DIR}/loongarch64/SpoolKernelLSX.S
|
|
${MLAS_SRC_DIR}/loongarch64/SpoolKernelLasx.S
|
|
${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4LSX.S
|
|
${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4Lasx.S
|
|
${MLAS_SRC_DIR}/loongarch64/SoftmaxKernelLasx.S
|
|
)
|
|
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mlsx -mlasx")
|
|
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
|
|
set(MLAS_SOURCE_IS_NOT_SET 0)
|
|
endif()
|
|
endif()
|
|
if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH AND MLAS_SOURCE_IS_NOT_SET)
|
|
file(GLOB_RECURSE mlas_platform_srcs
|
|
"${MLAS_SRC_DIR}/scalar/*.cpp")
|
|
endif()
|
|
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
|
|
endif()
|
|
|
|
foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
|
|
target_include_directories(${mlas_target} PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
|
|
onnxruntime_add_include_to_target(${mlas_target} ${GSL_TARGET})
|
|
|
|
set_target_properties(${mlas_target} PROPERTIES FOLDER "ONNXRuntime")
|
|
endforeach()
|
|
|
|
if (WIN32)
|
|
target_compile_options(onnxruntime_mlas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>" "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>")
|
|
if (onnxruntime_ENABLE_STATIC_ANALYSIS)
|
|
target_compile_options(onnxruntime_mlas PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:/analyze:stacksize 131072>")
|
|
endif()
|
|
endif()
|
|
|
|
if (PLATFORM_NAME STREQUAL "macabi")
|
|
# Needed for maccatalyst C compilation
|
|
# i.e. the flags below add "--target=x86_64-apple-ios14.0-macabi -ffunction-sections -fdata-sections"
|
|
target_compile_options(onnxruntime_mlas PRIVATE ${CMAKE_C_FLAGS})
|
|
endif()
|
|
|
|
if (NOT onnxruntime_BUILD_SHARED_LIB)
|
|
install(TARGETS onnxruntime_mlas
|
|
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
|
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
|
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
|
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
|
|
endif()
|
|
|
|
# set up source group for MLAS source files
|
|
block()
|
|
set(source_group_srcs)
|
|
foreach(mlas_target ${ONNXRUNTIME_MLAS_LIBS})
|
|
get_target_property(mlas_target_srcs ${mlas_target} SOURCES)
|
|
foreach(mlas_target_src ${mlas_target_srcs})
|
|
cmake_path(IS_PREFIX MLAS_ROOT ${mlas_target_src} in_mlas_root)
|
|
if(in_mlas_root)
|
|
list(APPEND source_group_srcs ${mlas_target_src})
|
|
endif()
|
|
endforeach()
|
|
endforeach()
|
|
source_group(TREE ${MLAS_ROOT} FILES ${source_group_srcs})
|
|
endblock()
|
|
|
|
|
|
if (NOT onnxruntime_ORT_MINIMAL_BUILD)
|
|
|
|
#
|
|
# Command line tool for quantization and de-quantization of 2-D fp32 tensors
|
|
# based on block-wise quantization of int4
|
|
#
|
|
|
|
onnxruntime_add_executable(onnxruntime_mlas_q4dq
|
|
${MLAS_SRC_DIR}/q4_dq_cli.cpp
|
|
)
|
|
target_include_directories(onnxruntime_mlas_q4dq PRIVATE ${MLAS_INC_DIR} ${MLAS_SRC_DIR})
|
|
set_target_properties(onnxruntime_mlas_q4dq PROPERTIES FOLDER "ONNXRuntimeTest")
|
|
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
|
|
if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE cpuinfo)
|
|
endif()
|
|
if(NOT WIN32)
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS})
|
|
endif()
|
|
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${android_shared_libs})
|
|
endif()
|
|
|
|
if(WIN32)
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE debug Dbghelp Advapi32)
|
|
endif()
|
|
if (onnxruntime_LINK_LIBATOMIC)
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE atomic)
|
|
endif()
|
|
target_link_libraries(onnxruntime_mlas_q4dq PRIVATE Threads::Threads)
|
|
|
|
if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
|
|
if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
|
|
set_target_properties(onnxruntime_mlas_q4dq PROPERTIES LINK_FLAGS "-s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1")
|
|
else()
|
|
set_target_properties(onnxruntime_mlas_q4dq PROPERTIES LINK_FLAGS "-s ALLOW_MEMORY_GROWTH=1")
|
|
endif()
|
|
endif()
|
|
|
|
endif()
|