MLAS: workaround LLVM x86 assembler (#6922)

Implement an alternate workaround for the LLVM x86 problem described in PR #5088. That change made the x86 assembly files build with the GNU assembler by using -fno-integrated-as.
This commit is contained in:
Tracy Sharpe 2021-03-08 14:18:49 -08:00 committed by GitHub
parent b89f52c277
commit bc27652188
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 74 additions and 36 deletions

View file

@ -201,11 +201,7 @@ else()
set(mlas_platform_srcs_avx
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelAvx.S
)
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx -fno-integrated-as")
else()
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
endif()
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
set(mlas_platform_srcs
${mlas_platform_srcs_sse2}

View file

@ -27,7 +27,7 @@ INCLUDE SgemmKernelCommon.inc
ASSUME DS:FLAT,ES:FLAT,SS:NOTHING,FS:NOTHING,GS:NOTHING
EXTERN _MlasMaskMoveAvx:NEAR
EXTERN _MlasMaskMoveTableAvx:NEAR
_TEXT SEGMENT DWORD PUBLIC 'CODE'
@ -319,11 +319,8 @@ SkipAccumulateMasked16x2Block:
add ebp,8 ; correct for over-subtract above
OutputMasked8x2Block:
mov SgemmKernelFrame.CountN[esp],ebp
vbroadcastss xmm0,SgemmKernelFrame.CountN[esp]
vpcmpgtd xmm1,xmm0,XMMWORD PTR [_MlasMaskMoveAvx+16]
vpcmpgtd xmm0,xmm0,XMMWORD PTR [_MlasMaskMoveAvx]
vinsertf128 ymm0,ymm0,xmm1,1
neg ebp
vmovdqu ymm0,YMMWORD PTR [_MlasMaskMoveTableAvx+ebp*4+8*4]
cmp BYTE PTR SgemmKernelFrame.ZeroMode[esp],0
jnz SkipAccumulateMasked8x2Block
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
@ -398,11 +395,8 @@ SkipAccumulateMasked16x1Block:
add ebp,8 ; correct for over-subtract above
OutputMasked8x1Block:
mov SgemmKernelFrame.CountN[esp],ebp
vbroadcastss xmm0,SgemmKernelFrame.CountN[esp]
vpcmpgtd xmm1,xmm0,XMMWORD PTR [_MlasMaskMoveAvx+16]
vpcmpgtd xmm0,xmm0,XMMWORD PTR [_MlasMaskMoveAvx]
vinsertf128 ymm0,ymm0,xmm1,1
neg ebp
vmovdqu ymm0,YMMWORD PTR [_MlasMaskMoveTableAvx+ebp*4+8*4]
cmp BYTE PTR SgemmKernelFrame.ZeroMode[esp],0
jnz SkipAccumulateMasked8x1Block
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]

View file

@ -238,8 +238,7 @@ Return Value:
--*/
.globl C_UNDERSCORE(MlasGemmFloatKernelAvx)
C_UNDERSCORE(MlasGemmFloatKernelAvx):
FUNCTION_ENTRY MlasGemmFloatKernelAvx
push ebp
push ebx
@ -342,14 +341,10 @@ C_UNDERSCORE(MlasGemmFloatKernelAvx):
add ebp,8 # correct for over-subtract above
.LOutputMasked8x2Block:
call __x86.get_pc_thunk.bx
add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx]
mov .LSgemmKernelFrame_CountN[esp],ebp
vbroadcastss xmm0,.LSgemmKernelFrame_CountN[esp]
vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16]
vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx]
vinsertf128 ymm0,ymm0,xmm1,1
neg ebp
LoadGlobalOffsetTable bx
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveTableAvx)@GOT[ebx]
vmovdqu ymm0,YMMWORD PTR [ebx+ebp*4+8*4]
cmp BYTE PTR .LSgemmKernelFrame_ZeroMode[esp],0
jnz .LSkipAccumulateMasked8x2Block
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
@ -424,14 +419,10 @@ C_UNDERSCORE(MlasGemmFloatKernelAvx):
add ebp,8 # correct for over-subtract above
.LOutputMasked8x1Block:
call __x86.get_pc_thunk.bx
add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx]
mov .LSgemmKernelFrame_CountN[esp],ebp
vbroadcastss xmm0,.LSgemmKernelFrame_CountN[esp]
vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16]
vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx]
vinsertf128 ymm0,ymm0,xmm1,1
neg ebp
LoadGlobalOffsetTable bx
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveTableAvx)@GOT[ebx]
vmovdqu ymm0,YMMWORD PTR [ebx+ebp*4+8*4]
cmp BYTE PTR .LSgemmKernelFrame_ZeroMode[esp],0
jnz .LSkipAccumulateMasked8x1Block
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]

View file

@ -225,8 +225,7 @@ Return Value:
--*/
.globl C_UNDERSCORE(MlasGemmFloatKernelSse)
C_UNDERSCORE(MlasGemmFloatKernelSse):
FUNCTION_ENTRY MlasGemmFloatKernelSse
push ebp
push ebx

View file

@ -19,3 +19,61 @@ Abstract:
#else
#define C_UNDERSCORE(symbol) symbol
#endif
/*++
Macro Description:
This macro emits the assembler directives to annotate a new function:
an alignment directive, a global symbol declaration, and the entry label.
When __APPLE__ is defined the symbol is emitted with a leading underscore;
otherwise the symbol is emitted unchanged and is also given the function
symbol type.
Arguments:
FunctionName - Supplies the name of the function. The macro itself adds the
leading underscore when __APPLE__ is defined.
--*/
.macro FUNCTION_ENTRY FunctionName
// Align the entry point on a 2^4 = 16 byte boundary.
.p2align 4
#if defined(__APPLE__)
// __APPLE__ builds: declare and define the underscore-prefixed symbol.
.globl _\FunctionName\()
_\FunctionName\():
#else
// Other builds: declare the symbol global, mark it as a function, and
// define the entry label.
.globl \FunctionName\()
.type \FunctionName\(),@function
\FunctionName\():
#endif
.endm
/*++
Macro Description:
This macro emits the code to load the global offset table address into the
supplied register.
The instruction sequence is written in AT&T syntax, and the macro switches the
assembler back to Intel syntax (noprefix) before it ends, so it is intended to
be invoked from Intel syntax code.
Arguments:
TargetReg - Specifies the target register, named without the leading "e"
(for example, bx selects ebx). The same name is used as the suffix of the
__x86.get_pc_thunk.* routine that is called.
--*/
.macro LoadGlobalOffsetTable, TargetReg
//
// The LLVM integrated assembler doesn't support the Intel syntax for OFFSET:
//
// add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
//
// Work around this by temporarily switching to AT&T syntax.
//
.att_syntax
// NOTE(review): the thunk is defined outside this file; presumably it returns
// the address of the instruction after the call in the target register.
// Confirm against the toolchain documentation.
calll __x86.get_pc_thunk.\TargetReg\()
addl $_GLOBAL_OFFSET_TABLE_,%e\TargetReg\()
// Switch back to Intel syntax without register prefixes for the code that
// follows the macro invocation.
.intel_syntax noprefix
.endm