mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-28 22:56:32 +00:00
MLAS: workaround LLVM x86 assembler (#6922)
Implement an alternate workaround for the LLVM x86 problem described in PR #5088. That change made the x86 assembly files build with the GNU assembler by using -fno-integrated-as.
This commit is contained in:
parent
b89f52c277
commit
bc27652188
5 changed files with 74 additions and 36 deletions
|
|
@ -201,11 +201,7 @@ else()
|
|||
set(mlas_platform_srcs_avx
|
||||
${ONNXRUNTIME_ROOT}/core/mlas/lib/x86/SgemmKernelAvx.S
|
||||
)
|
||||
if (CMAKE_SYSTEM_NAME STREQUAL "Android")
|
||||
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx -fno-integrated-as")
|
||||
else()
|
||||
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
|
||||
endif()
|
||||
set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")
|
||||
|
||||
set(mlas_platform_srcs
|
||||
${mlas_platform_srcs_sse2}
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ INCLUDE SgemmKernelCommon.inc
|
|||
|
||||
ASSUME DS:FLAT,ES:FLAT,SS:NOTHING,FS:NOTHING,GS:NOTHING
|
||||
|
||||
EXTERN _MlasMaskMoveAvx:NEAR
|
||||
EXTERN _MlasMaskMoveTableAvx:NEAR
|
||||
|
||||
_TEXT SEGMENT DWORD PUBLIC 'CODE'
|
||||
|
||||
|
|
@ -319,11 +319,8 @@ SkipAccumulateMasked16x2Block:
|
|||
add ebp,8 ; correct for over-subtract above
|
||||
|
||||
OutputMasked8x2Block:
|
||||
mov SgemmKernelFrame.CountN[esp],ebp
|
||||
vbroadcastss xmm0,SgemmKernelFrame.CountN[esp]
|
||||
vpcmpgtd xmm1,xmm0,XMMWORD PTR [_MlasMaskMoveAvx+16]
|
||||
vpcmpgtd xmm0,xmm0,XMMWORD PTR [_MlasMaskMoveAvx]
|
||||
vinsertf128 ymm0,ymm0,xmm1,1
|
||||
neg ebp
|
||||
vmovdqu ymm0,YMMWORD PTR [_MlasMaskMoveTableAvx+ebp*4+8*4]
|
||||
cmp BYTE PTR SgemmKernelFrame.ZeroMode[esp],0
|
||||
jnz SkipAccumulateMasked8x2Block
|
||||
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
|
||||
|
|
@ -398,11 +395,8 @@ SkipAccumulateMasked16x1Block:
|
|||
add ebp,8 ; correct for over-subtract above
|
||||
|
||||
OutputMasked8x1Block:
|
||||
mov SgemmKernelFrame.CountN[esp],ebp
|
||||
vbroadcastss xmm0,SgemmKernelFrame.CountN[esp]
|
||||
vpcmpgtd xmm1,xmm0,XMMWORD PTR [_MlasMaskMoveAvx+16]
|
||||
vpcmpgtd xmm0,xmm0,XMMWORD PTR [_MlasMaskMoveAvx]
|
||||
vinsertf128 ymm0,ymm0,xmm1,1
|
||||
neg ebp
|
||||
vmovdqu ymm0,YMMWORD PTR [_MlasMaskMoveTableAvx+ebp*4+8*4]
|
||||
cmp BYTE PTR SgemmKernelFrame.ZeroMode[esp],0
|
||||
jnz SkipAccumulateMasked8x1Block
|
||||
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
|
||||
|
|
|
|||
|
|
@ -238,8 +238,7 @@ Return Value:
|
|||
|
||||
--*/
|
||||
|
||||
.globl C_UNDERSCORE(MlasGemmFloatKernelAvx)
|
||||
C_UNDERSCORE(MlasGemmFloatKernelAvx):
|
||||
FUNCTION_ENTRY MlasGemmFloatKernelAvx
|
||||
|
||||
push ebp
|
||||
push ebx
|
||||
|
|
@ -342,14 +341,10 @@ C_UNDERSCORE(MlasGemmFloatKernelAvx):
|
|||
add ebp,8 # correct for over-subtract above
|
||||
|
||||
.LOutputMasked8x2Block:
|
||||
call __x86.get_pc_thunk.bx
|
||||
add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
|
||||
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx]
|
||||
mov .LSgemmKernelFrame_CountN[esp],ebp
|
||||
vbroadcastss xmm0,.LSgemmKernelFrame_CountN[esp]
|
||||
vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16]
|
||||
vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx]
|
||||
vinsertf128 ymm0,ymm0,xmm1,1
|
||||
neg ebp
|
||||
LoadGlobalOffsetTable bx
|
||||
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveTableAvx)@GOT[ebx]
|
||||
vmovdqu ymm0,YMMWORD PTR [ebx+ebp*4+8*4]
|
||||
cmp BYTE PTR .LSgemmKernelFrame_ZeroMode[esp],0
|
||||
jnz .LSkipAccumulateMasked8x2Block
|
||||
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
|
||||
|
|
@ -424,14 +419,10 @@ C_UNDERSCORE(MlasGemmFloatKernelAvx):
|
|||
add ebp,8 # correct for over-subtract above
|
||||
|
||||
.LOutputMasked8x1Block:
|
||||
call __x86.get_pc_thunk.bx
|
||||
add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
|
||||
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveAvx)@GOT[ebx]
|
||||
mov .LSgemmKernelFrame_CountN[esp],ebp
|
||||
vbroadcastss xmm0,.LSgemmKernelFrame_CountN[esp]
|
||||
vpcmpgtd xmm1,xmm0,XMMWORD PTR [ebx+16]
|
||||
vpcmpgtd xmm0,xmm0,XMMWORD PTR [ebx]
|
||||
vinsertf128 ymm0,ymm0,xmm1,1
|
||||
neg ebp
|
||||
LoadGlobalOffsetTable bx
|
||||
mov ebx,DWORD PTR C_UNDERSCORE(MlasMaskMoveTableAvx)@GOT[ebx]
|
||||
vmovdqu ymm0,YMMWORD PTR [ebx+ebp*4+8*4]
|
||||
cmp BYTE PTR .LSgemmKernelFrame_ZeroMode[esp],0
|
||||
jnz .LSkipAccumulateMasked8x1Block
|
||||
vmaskmovps ymm4,ymm0,YMMWORD PTR [esi]
|
||||
|
|
|
|||
|
|
@ -225,8 +225,7 @@ Return Value:
|
|||
|
||||
--*/
|
||||
|
||||
.globl C_UNDERSCORE(MlasGemmFloatKernelSse)
|
||||
C_UNDERSCORE(MlasGemmFloatKernelSse):
|
||||
FUNCTION_ENTRY MlasGemmFloatKernelSse
|
||||
|
||||
push ebp
|
||||
push ebx
|
||||
|
|
|
|||
|
|
@ -19,3 +19,61 @@ Abstract:
|
|||
#else
|
||||
#define C_UNDERSCORE(symbol) symbol
|
||||
#endif
|
||||
|
||||
/*++
|
||||
|
||||
Macro Description:
|
||||
|
||||
This macro emits the assembler directives to annotate a new function.
|
||||
|
||||
Arguments:
|
||||
|
||||
FunctionName - Supplies the name of the function.
|
||||
|
||||
--*/
|
||||
|
||||
.macro FUNCTION_ENTRY FunctionName
|
||||
|
||||
// Align the entry label to a 16-byte (2^4) boundary.
.p2align 4
|
||||
#if defined(__APPLE__)
|
||||
// Apple targets: export and define the label with a leading underscore.
.globl _\FunctionName\()
|
||||
_\FunctionName\():
|
||||
#else
|
||||
// Other targets: export the unprefixed name and tag it as a function symbol.
.globl \FunctionName\()
|
||||
.type \FunctionName\(),@function
|
||||
\FunctionName\():
|
||||
#endif
|
||||
|
||||
|
||||
.endm
|
||||
|
||||
/*++
|
||||
|
||||
Macro Description:
|
||||
|
||||
This macro emits the code to load the global offset table address into the
|
||||
supplied register.
|
||||
|
||||
Arguments:
|
||||
|
||||
TargetReg - Specifies the target register.
|
||||
|
||||
--*/
|
||||
|
||||
.macro LoadGlobalOffsetTable, TargetReg
|
||||
|
||||
//
|
||||
// The LLVM integrated assembler doesn't support the Intel syntax for OFFSET:
|
||||
//
|
||||
// add ebx,OFFSET _GLOBAL_OFFSET_TABLE_
|
||||
//
|
||||
// Work around this by temporarily switching to AT&T syntax.
|
||||
//
|
||||
// TargetReg is the register name without its "e" prefix (e.g. "bx"): it is
// used both as the suffix of the __x86.get_pc_thunk.* helper and, with "%e"
// prepended, as the destination register of the add.
//
|
||||
|
||||
.att_syntax
|
||||
|
||||
|
||||
calll __x86.get_pc_thunk.\TargetReg\()
|
||||
addl $_GLOBAL_OFFSET_TABLE_,%e\TargetReg\()
|
||||
|
||||
|
||||
// Always leaves the assembler in Intel/noprefix mode, so the macro must only
// be used from code that is already assembled in that mode.
.intel_syntax noprefix
|
||||
|
||||
|
||||
.endm
|
||||
|
|
|
|||
Loading…
Reference in a new issue