diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index c6204268b2..3fee9c0cb5 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -201,11 +201,7 @@ else() set(mlas_platform_srcs_avx ${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelAvx.S ) - if (CMAKE_SYSTEM_NAME STREQUAL "Android") - set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx -fno-integrated-as") - else() - set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") - endif() + set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx") set(mlas_platform_srcs ${mlas_platform_srcs_sse2} diff --git a/onnxruntime/core/mlas/lib/i386/SgemmKernelAvx.asm b/onnxruntime/core/mlas/lib/i386/SgemmKernelAvx.asm index 6db5a10302..cbc88c3c8d 100644 --- a/onnxruntime/core/mlas/lib/i386/SgemmKernelAvx.asm +++ b/onnxruntime/core/mlas/lib/i386/SgemmKernelAvx.asm @@ -27,7 +27,7 @@ INCLUDE SgemmKernelCommon.inc ASSUME DS:FLAT,ES:FLAT,SS:NOTHING,FS:NOTHING,GS:NOTHING - EXTERN _MlasMaskMoveAvx:NEAR + EXTERN _MlasMaskMoveTableAvx:NEAR _TEXT SEGMENT DWORD PUBLIC 'CODE' @@ -319,11 +319,8 @@ SkipAccumulateMasked16x2Block: add ebp,8 ; correct for over-subtract above OutputMasked8x2Block: - mov SgemmKernelFrame.CountN[esp],ebp - vbroadcastss xmm0,SgemmKernelFrame.CountN[esp] - vpcmpgtd xmm1,xmm0,XMMWORD PTR [_MlasMaskMoveAvx+16] - vpcmpgtd xmm0,xmm0,XMMWORD PTR [_MlasMaskMoveAvx] - vinsertf128 ymm0,ymm0,xmm1,1 + neg ebp + vmovdqu ymm0,YMMWORD PTR [_MlasMaskMoveTableAvx+ebp*4+8*4] cmp BYTE PTR SgemmKernelFrame.ZeroMode[esp],0 jnz SkipAccumulateMasked8x2Block vmaskmovps ymm4,ymm0,YMMWORD PTR [esi] @@ -398,11 +395,8 @@ SkipAccumulateMasked16x1Block: add ebp,8 ; correct for over-subtract above OutputMasked8x1Block: - mov SgemmKernelFrame.CountN[esp],ebp - vbroadcastss xmm0,SgemmKernelFrame.CountN[esp] - vpcmpgtd xmm1,xmm0,XMMWORD PTR [_MlasMaskMoveAvx+16] - vpcmpgtd xmm0,xmm0,XMMWORD PTR 
[_MlasMaskMoveAvx] - vinsertf128 ymm0,ymm0,xmm1,1 + neg ebp + vmovdqu ymm0,YMMWORD PTR [_MlasMaskMoveTableAvx+ebp*4+8*4] cmp BYTE PTR SgemmKernelFrame.ZeroMode[esp],0 jnz SkipAccumulateMasked8x1Block vmaskmovps ymm4,ymm0,YMMWORD PTR [esi] diff --git a/onnxruntime/core/mlas/lib/x86/SgemmKernelAvx.S b/onnxruntime/core/mlas/lib/x86/SgemmKernelAvx.S index f874710c1a..7af2a9e118 100644 --- a/onnxruntime/core/mlas/lib/x86/SgemmKernelAvx.S +++ b/onnxruntime/core/mlas/lib/x86/SgemmKernelAvx.S @@ -238,8 +238,7 @@ Return Value: --*/ - .globl C_UNDERSCORE(MlasGemmFloatKernelAvx) -C_UNDERSCORE(MlasGemmFloatKernelAvx): + FUNCTION_ENTRY MlasGemmFloatKernelAvx push ebp push ebx @@ -342,14 +341,10 @@ C_UNDERSCORE(MlasGemmFloatKernelAvx): add ebp,8 # correct for over-subtract above .LOutputMasked8x2Block: - call __x86.get_pc_thunk.bx - add ebx,OFFSET _GLOBAL_OFFSET_TABLE_ - mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx] - mov .LSgemmKernelFrame_CountN[esp],ebp - vbroadcastss xmm0,.LSgemmKernelFrame_CountN[esp] - vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16] - vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx] - vinsertf128 ymm0,ymm0,xmm1,1 + neg ebp + LoadGlobalOffsetTable bx + mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveTableAvx)@GOT[ebx] + vmovdqu ymm0,YMMWORD PTR [ebx+ebp*4+8*4] cmp BYTE PTR .LSgemmKernelFrame_ZeroMode[esp],0 jnz .LSkipAccumulateMasked8x2Block vmaskmovps ymm4,ymm0,YMMWORD PTR [esi] @@ -424,14 +419,10 @@ C_UNDERSCORE(MlasGemmFloatKernelAvx): add ebp,8 # correct for over-subtract above .LOutputMasked8x1Block: - call __x86.get_pc_thunk.bx - add ebx,OFFSET _GLOBAL_OFFSET_TABLE_ - mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx] - mov .LSgemmKernelFrame_CountN[esp],ebp - vbroadcastss xmm0,.LSgemmKernelFrame_CountN[esp] - vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16] - vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx] - vinsertf128 ymm0,ymm0,xmm1,1 + neg ebp + LoadGlobalOffsetTable bx + mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveTableAvx)@GOT[ebx] + vmovdqu ymm0,YMMWORD PTR [ebx+ebp*4+8*4] 
cmp BYTE PTR .LSgemmKernelFrame_ZeroMode[esp],0 jnz .LSkipAccumulateMasked8x1Block vmaskmovps ymm4,ymm0,YMMWORD PTR [esi] diff --git a/onnxruntime/core/mlas/lib/x86/SgemmKernelSse2.S b/onnxruntime/core/mlas/lib/x86/SgemmKernelSse2.S index f7718d3cef..f42175ec6c 100644 --- a/onnxruntime/core/mlas/lib/x86/SgemmKernelSse2.S +++ b/onnxruntime/core/mlas/lib/x86/SgemmKernelSse2.S @@ -225,8 +225,7 @@ Return Value: --*/ - .globl C_UNDERSCORE(MlasGemmFloatKernelSse) -C_UNDERSCORE(MlasGemmFloatKernelSse): + FUNCTION_ENTRY MlasGemmFloatKernelSse push ebp push ebx diff --git a/onnxruntime/core/mlas/lib/x86/asmmacro.h b/onnxruntime/core/mlas/lib/x86/asmmacro.h index 00f11eea3f..4b80eea735 100644 --- a/onnxruntime/core/mlas/lib/x86/asmmacro.h +++ b/onnxruntime/core/mlas/lib/x86/asmmacro.h @@ -19,3 +19,61 @@ Abstract: #else #define C_UNDERSCORE(symbol) symbol #endif + +/*++ + +Macro Description: + + This macro emits the assembler directives to annotate a new function. + +Arguments: + + FunctionName - Supplies the name of the function. + +--*/ + + .macro FUNCTION_ENTRY FunctionName + + .p2align 4 +#if defined(__APPLE__) + .globl _\FunctionName\() +_\FunctionName\(): +#else + .globl \FunctionName\() + .type \FunctionName\(),@function +\FunctionName\(): +#endif + + .endm + +/*++ + +Macro Description: + + This macro emits the code to load the global offset table address into the + supplied register. + +Arguments: + + TargetReg - Specifies the target register. + +--*/ + + .macro LoadGlobalOffsetTable, TargetReg + +// +// The LLVM integrated assembler doesn't support the Intel syntax for OFFSET: +// +// add ebx,OFFSET _GLOBAL_OFFSET_TABLE_ +// +// Work around this by temporarily switching to AT&T syntax. +// + + .att_syntax + + calll __x86.get_pc_thunk.\TargetReg\() + addl $_GLOBAL_OFFSET_TABLE_,%e\TargetReg\() + + .intel_syntax noprefix + + .endm